| """Call API providers.""" | |
| import json | |
| import os | |
| import random | |
| import time | |
| import requests | |
| from fastchat.utils import build_logger | |
| logger = build_logger("gradio_web_server", "gradio_web_server.log") | |


def get_api_provider_stream_iter(
    conv,
    model_name,
    model_api_dict,
    temperature,
    top_p,
    max_new_tokens,
):
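    """Dispatch to the streaming iterator matching model_api_dict["api_type"].

    Each iterator yields dicts of the form {"text": ..., "error_code": ...}.
    """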
    if model_api_dict["api_type"] == "openai":
        prompt = conv.to_openai_api_messages()
        stream_iter = openai_api_stream_iter(
            model_api_dict["model_name"],
            prompt,
            temperature,
            top_p,
            max_new_tokens,
            api_base=model_api_dict["api_base"],
            api_key=model_api_dict["api_key"],
        )
    elif model_api_dict["api_type"] == "anthropic":
        prompt = conv.get_prompt()
        stream_iter = anthropic_api_stream_iter(
            model_name, prompt, temperature, top_p, max_new_tokens
        )
    elif model_api_dict["api_type"] == "gemini":
        stream_iter = gemini_api_stream_iter(
            model_api_dict["model_name"],
            conv,
            temperature,
            top_p,
            max_new_tokens,
            api_key=model_api_dict["api_key"],
        )
    elif model_api_dict["api_type"] == "bard":
        prompt = conv.to_openai_api_messages()
        stream_iter = bard_api_stream_iter(
            model_api_dict["model_name"],
            prompt,
            temperature,
            top_p,
            api_key=model_api_dict["api_key"],
        )
    elif model_api_dict["api_type"] == "mistral":
        prompt = conv.to_openai_api_messages()
        stream_iter = mistral_api_stream_iter(
            model_name, prompt, temperature, top_p, max_new_tokens
        )
    elif model_api_dict["api_type"] == "nvidia":
        prompt = conv.to_openai_api_messages()
        stream_iter = nvidia_api_stream_iter(
            model_name,
            prompt,
            temperature,
            top_p,
            max_new_tokens,
            model_api_dict["api_base"],
        )
    elif model_api_dict["api_type"] == "ai2":
        prompt = conv.to_openai_api_messages()
        stream_iter = ai2_api_stream_iter(
            model_name,
            model_api_dict["model_name"],
            prompt,
            temperature,
            top_p,
            max_new_tokens,
            api_base=model_api_dict["api_base"],
            api_key=model_api_dict["api_key"],
        )
    else:
        raise NotImplementedError()

    return stream_iter


def openai_api_stream_iter(
    model_name,
    messages,
    temperature,
    top_p,
    max_new_tokens,
    api_base=None,
    api_key=None,
):
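    """Stream chat completions from the OpenAI (or Azure OpenAI) API."""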
    import openai

    api_key = api_key or os.environ["OPENAI_API_KEY"]

    if "azure" in model_name:
        client = openai.AzureOpenAI(
            api_version="2023-07-01-preview",
            azure_endpoint=api_base or "https://api.openai.com/v1",
            api_key=api_key,
        )
    else:
        client = openai.OpenAI(
            base_url=api_base or "https://api.openai.com/v1", api_key=api_key
        )

    if model_name == "gpt-4-turbo":
        model_name = "gpt-4-1106-preview"

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    res = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temperature,
        max_tokens=max_new_tokens,
        stream=True,
    )
    text = ""
    for chunk in res:
        if len(chunk.choices) > 0:
            text += chunk.choices[0].delta.content or ""
            data = {
                "text": text,
                "error_code": 0,
            }
            yield data


def anthropic_api_stream_iter(model_name, prompt, temperature, top_p, max_new_tokens):
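    """Stream completions from the Anthropic text-completions API."""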
    import anthropic

    c = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    res = c.completions.create(
        prompt=prompt,
        stop_sequences=[anthropic.HUMAN_PROMPT],
        max_tokens_to_sample=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        model=model_name,
        stream=True,
    )
    text = ""
    for chunk in res:
        text += chunk.completion
        data = {
            "text": text,
            "error_code": 0,
        }
        yield data


def gemini_api_stream_iter(
    model_name, conv, temperature, top_p, max_new_tokens, api_key=None
):
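    """Stream a Gemini chat response via the google-generativeai client."""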
    import google.generativeai as genai  # pip install google-generativeai

    if api_key is None:
        api_key = os.environ["GEMINI_API_KEY"]
    genai.configure(api_key=api_key)

    generation_config = {
        "temperature": temperature,
        "max_output_tokens": max_new_tokens,
        "top_p": top_p,
    }
    params = {
        "model": model_name,
        "prompt": conv,
    }
    params.update(generation_config)
    logger.info(f"==== request ====\n{params}")

    safety_settings = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]
    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    history = []
    for role, message in conv.messages[:-2]:
        history.append({"role": role, "parts": message})
    convo = model.start_chat(history=history)
    response = convo.send_message(conv.messages[-2][1], stream=True)

    try:
        text = ""
        for chunk in response:
            text += chunk.text
            data = {
                "text": text,
                "error_code": 0,
            }
            yield data
    except Exception as e:
        logger.error(f"==== error ====\n{e}")
        reason = chunk.candidates
        yield {
            "text": f"**API REQUEST ERROR** Reason: {reason}.",
            "error_code": 1,
        }


def bard_api_stream_iter(model_name, conv, temperature, top_p, api_key=None):
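    """Call the PaLM generateMessage endpoint and simulate token streaming of the reply."""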
    del top_p  # not supported
    del temperature  # not supported

    if api_key is None:
        api_key = os.environ["BARD_API_KEY"]

    # convert conv to conv_bard
    conv_bard = []
    for turn in conv:
        if turn["role"] == "user":
            conv_bard.append({"author": "0", "content": turn["content"]})
        elif turn["role"] == "assistant":
            conv_bard.append({"author": "1", "content": turn["content"]})
        else:
            raise ValueError(f"Unsupported role: {turn['role']}")

    params = {
        "model": model_name,
        "prompt": conv_bard,
    }
    logger.info(f"==== request ====\n{params}")

    try:
        res = requests.post(
            f"https://generativelanguage.googleapis.com/v1beta2/models/{model_name}:generateMessage?key={api_key}",
            json={
                "prompt": {
                    "messages": conv_bard,
                },
            },
            timeout=30,
        )
    except Exception as e:
        logger.error(f"==== error ====\n{e}")
        yield {
            "text": f"**API REQUEST ERROR** Reason: {e}.",
            "error_code": 1,
        }
        return

    if res.status_code != 200:
        logger.error(f"==== error ==== ({res.status_code}): {res.text}")
        yield {
            "text": f"**API REQUEST ERROR** Reason: status code {res.status_code}.",
            "error_code": 1,
        }
        return

    response_json = res.json()
    if "candidates" not in response_json:
        logger.error(f"==== error ==== response blocked: {response_json}")
        reason = response_json["filters"][0]["reason"]
        yield {
            "text": f"**API REQUEST ERROR** Reason: {reason}.",
            "error_code": 1,
        }
        return

    response = response_json["candidates"][0]["content"]
    pos = 0
    while pos < len(response):
        # simulate token streaming
        pos += random.randint(3, 6)
        time.sleep(0.002)
        data = {
            "text": response[:pos],
            "error_code": 0,
        }
        yield data


def ai2_api_stream_iter(
    model_name,
    model_id,
    messages,
    temperature,
    top_p,
    max_new_tokens,
    api_key=None,
    api_base=None,
):
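    """Stream a response from the AI2 InferD endpoint (Tulu 2 input schema)."""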
    # get keys and needed values
    ai2_key = api_key or os.environ.get("AI2_API_KEY")
    api_base = api_base or "https://inferd.allen.ai/api/v1/infer"

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    # AI2 uses vLLM, which requires that `top_p` be 1.0 for greedy sampling:
    # https://github.com/vllm-project/vllm/blob/v0.1.7/vllm/sampling_params.py#L156-L157
    if temperature == 0.0 and top_p < 1.0:
        raise ValueError("top_p must be 1 when temperature is 0.0")

    res = requests.post(
        api_base,
        stream=True,
        headers={"Authorization": f"Bearer {ai2_key}"},
        json={
            "model_id": model_id,
            # This input format is specific to the Tulu2 model. Other models
            # may require different input formats. See the model's schema
            # documentation on InferD for more information.
            "input": {
                "messages": messages,
                "opts": {
                    "max_tokens": max_new_tokens,
                    "temperature": temperature,
                    "top_p": top_p,
                    "logprobs": 1,  # increase for more choices
                },
            },
        },
        timeout=5,
    )

    if res.status_code != 200:
        logger.error(f"unexpected response ({res.status_code}): {res.text}")
        raise ValueError("unexpected response from InferD", res)

    text = ""
    for line in res.iter_lines():
        if line:
            part = json.loads(line)
            if "result" in part and "output" in part["result"]:
                for t in part["result"]["output"]["text"]:
                    text += t
            else:
                logger.error(f"unexpected part: {part}")
                raise ValueError("empty result in InferD response")

            data = {
                "text": text,
                "error_code": 0,
            }
            yield data


def mistral_api_stream_iter(model_name, messages, temperature, top_p, max_new_tokens):
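    """Stream chat completions from the Mistral AI API."""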
    from mistralai.client import MistralClient
    from mistralai.models.chat_completion import ChatMessage

    api_key = os.environ["MISTRAL_API_KEY"]
    client = MistralClient(api_key=api_key)

    # Make requests
    gen_params = {
        "model": model_name,
        "prompt": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_new_tokens": max_new_tokens,
    }
    logger.info(f"==== request ====\n{gen_params}")

    new_messages = [
        ChatMessage(role=message["role"], content=message["content"])
        for message in messages
    ]

    res = client.chat_stream(
        model=model_name,
        temperature=temperature,
        messages=new_messages,
        max_tokens=max_new_tokens,
        top_p=top_p,
    )

    text = ""
    for chunk in res:
        if chunk.choices[0].delta.content is not None:
            text += chunk.choices[0].delta.content
            data = {
                "text": text,
                "error_code": 0,
            }
            yield data


def nvidia_api_stream_iter(model_name, messages, temp, top_p, max_tokens, api_base):
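    """Stream chat completions from the NVIDIA endpoint given by api_base."""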
    assert model_name in ["llama2-70b-steerlm-chat", "yi-34b-chat"]

    api_key = os.environ["NVIDIA_API_KEY"]
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "text/event-stream",
        "content-type": "application/json",
    }

    # nvidia api does not accept 0 temperature
    if temp == 0.0:
        temp = 0.0001

    payload = {
        "messages": messages,
        "temperature": temp,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "seed": 42,
        "stream": True,
    }
    logger.info(f"==== request ====\n{payload}")

    response = requests.post(
        api_base, headers=headers, json=payload, stream=True, timeout=1
    )
    text = ""
    for line in response.iter_lines():
        if line:
            data = line.decode("utf-8")
            if data.endswith("[DONE]"):
                break
            # each SSE line starts with the 6-character prefix "data: "; strip it before parsing
            data = json.loads(data[6:])["choices"][0]["delta"]["content"]
            text += data
            yield {"text": text, "error_code": 0}