import random
import requests
from flask import Flask, request, jsonify, Response, stream_with_context, render_template_string
from mistral_common.protocol.instruct.messages import AssistantMessage, UserMessage, SystemMessage
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Tekken-based v3 tokenizer; this matches the vocabulary used by Mistral-Nemo.
mt_v3 = MistralTokenizer.v3(is_tekken=True)

def calc_messages_tokens(json_data):
    """Count the prompt tokens of an OpenAI-style message list with the Mistral tokenizer."""
    messages = json_data["messages"]
    m_messages = []
    for message in messages:
        if message["role"] == "system":
            m_messages.append(SystemMessage(content=message["content"]))
        elif message["role"] == "user":
            m_messages.append(UserMessage(content=message["content"]))
        elif message["role"] == "assistant":
            m_messages.append(AssistantMessage(content=message["content"]))
        else:
            continue
    tokens = mt_v3.encode_chat_completion(ChatCompletionRequest(messages=m_messages)).tokens
    # Exact token count plus a one-token-per-message safety margin.
    return len(tokens) + len(m_messages)
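
# Hedged usage sketch (not part of the app): what the token budget looks like
# for a minimal OpenAI-style payload. The payload below is illustrative only.
#
#   example = {"messages": [
#       {"role": "system", "content": "You are helpful."},
#       {"role": "user", "content": "Hello!"},
#   ]}
#   calc_messages_tokens(example)  # exact prompt tokens + 2 margin tokens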

app = Flask(__name__)

@app.route('/')
def index():
    template = '''
    <html>
    <head>
    <title>Mistral-Nemo Chat API</title>
    </head>
    <body>
    <h1>Mistral-Nemo OpenAI Compatible API</h1>
    <ol>
    <li>Create your token (used as the API key) <a target="_blank" href="https://huggingface.co/settings/tokens/new">[here]</a>, selecting "serverless Inference API".</li>
    <li>Set "https://tastypear-mistral-nemo-chat.hf.space/api" as the API domain in the client configuration.</li>
    </ol>
    <p>If you have multiple keys, you can concatenate them with a semicolon (`;`) to use them randomly, e.g. `hf_aaaa;hf_bbbb;hf_...`</p>
    </body>
    </html>
    '''
    return render_template_string(template)

# Route path inferred from the "/api" base advertised on the index page;
# OpenAI-compatible clients append "/v1/chat/completions" to it.
@app.route('/api/v1/chat/completions', methods=['POST'])
def proxy():
    # Forward the client's headers, dropping the hop-specific ones.
    headers = dict(request.headers)
    headers.pop('Host', None)
    headers.pop('Content-Length', None)
    # The Authorization header may hold several HF tokens joined by ';'; pick one at random.
    keys = request.headers['Authorization'].split(' ')[1].split(';')
    headers['Authorization'] = f'Bearer {random.choice(keys)}'
    json_data = request.get_json()
    # Avoid using cache: prepend a random run of spaces to the last message.
    json_data["messages"][-1]['content'] = ' ' * random.randint(1, 20) + json_data["messages"][-1]['content']
    # Use the largest ctx: grant the completion whatever is left of the 32768-token window.
    json_data['max_tokens'] = 32768 - calc_messages_tokens(json_data)
    json_data['json_mode'] = False
    model = 'mistralai/Mistral-Nemo-Instruct-2407'

    def generate():
        # Stream the upstream response through unchanged. Post the modified
        # json_data (not request.json), or the edits above would be discarded.
        with requests.post(f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions", json=json_data, headers=headers, stream=True) as resp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    yield chunk

    return Response(stream_with_context(generate()), content_type='text/event-stream')

#import gevent.pywsgi
#from gevent import monkey; monkey.patch_all()

if __name__ == "__main__":
    app.run(debug=True)
    # To serve with gevent instead, supply a host/port here (the `args`
    # referenced below is not defined anywhere in this file):
    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
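
# Hedged client-side sketch: assuming the route above and a local run on
# Flask's default http://127.0.0.1:5000, any OpenAI-compatible client works by
# pointing its base URL at this proxy; several HF tokens can be joined by ';'.
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:5000/api/v1/chat/completions",
#       headers={"Authorization": "Bearer hf_aaaa;hf_bbbb"},
#       json={"messages": [{"role": "user", "content": "Hi"}], "stream": True},
#       stream=True,
#   )
#   for line in resp.iter_lines():
#       if line:
#           print(line.decode())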