import random
import requests
from flask import Flask, request, jsonify, Response, stream_with_context, render_template_string
from mistral_common.protocol.instruct.messages import AssistantMessage, UserMessage, SystemMessage
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Tekken-based v3 tokenizer; this matches the vocabulary used by Mistral-Nemo.
mt_v3 = MistralTokenizer.v3(is_tekken=True)

def calc_messages_tokens(json_data):
    """Count the prompt tokens of an OpenAI-style message list with the Mistral tokenizer."""
    messages = json_data["messages"]
    m_messages = []
    for message in messages:
        if message["role"] == "system":
            m_messages.append(SystemMessage(content=message["content"]))
        elif message["role"] == "user":
            m_messages.append(UserMessage(content=message["content"]))
        elif message["role"] == "assistant":
            m_messages.append(AssistantMessage(content=message["content"]))
        else:
            continue
    tokens = mt_v3.encode_chat_completion(ChatCompletionRequest(messages=m_messages)).tokens
    # Exact token count plus a one-token-per-message safety margin.
    return len(tokens) + len(m_messages)
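
# Hedged usage sketch (not part of the app): what the token budget looks like
# for a minimal OpenAI-style payload. The payload below is illustrative only.
#
#   example = {"messages": [
#       {"role": "system", "content": "You are helpful."},
#       {"role": "user", "content": "Hello!"},
#   ]}
#   calc_messages_tokens(example)  # exact prompt tokens + 2 margin tokens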

app = Flask(__name__)

@app.route('/')
def index():
    template = '''
    <html>
    <head>
    <title>Mistral-Nemo Chat API</title>
    </head>
    <body>
    <h1>Mistral-Nemo OpenAI Compatible API</h1>
    <ol>
    <li>Create your token (used as the API key) <a target="_blank" href="https://huggingface.co/settings/tokens/new">[here]</a>, selecting "serverless Inference API".</li>
    <li>Set "https://tastypear-mistral-nemo-chat.hf.space/api" as the API domain in the client configuration.</li>
    </ol>
    <p>If you have multiple keys, you can concatenate them with a semicolon (`;`) to use them randomly, e.g. `hf_aaaa;hf_bbbb;hf_...`</p>
    </body>
    </html>
    '''
    return render_template_string(template)

# Route path inferred from the "/api" base advertised on the index page;
# OpenAI-compatible clients append "/v1/chat/completions" to it.
@app.route('/api/v1/chat/completions', methods=['POST'])
def proxy():
    # Forward the client's headers, dropping the hop-specific ones.
    headers = dict(request.headers)
    headers.pop('Host', None)
    headers.pop('Content-Length', None)
    # The Authorization header may hold several HF tokens joined by ';'; pick one at random.
    keys = request.headers['Authorization'].split(' ')[1].split(';')
    headers['Authorization'] = f'Bearer {random.choice(keys)}'
    json_data = request.get_json()
    # Avoid using cache: prepend a random run of spaces to the last message.
    json_data["messages"][-1]['content'] = ' ' * random.randint(1, 20) + json_data["messages"][-1]['content']
    # Use the largest ctx: grant the completion whatever is left of the 32768-token window.
    json_data['max_tokens'] = 32768 - calc_messages_tokens(json_data)
    json_data['json_mode'] = False
    model = 'mistralai/Mistral-Nemo-Instruct-2407'

    def generate():
        # Stream the upstream response through unchanged. Post the modified
        # json_data (not request.json), or the edits above would be discarded.
        with requests.post(f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions", json=json_data, headers=headers, stream=True) as resp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    yield chunk

    return Response(stream_with_context(generate()), content_type='text/event-stream')

#import gevent.pywsgi
#from gevent import monkey; monkey.patch_all()

if __name__ == "__main__":
    app.run(debug=True)
    # To serve with gevent instead, supply a host/port here (the `args`
    # referenced below is not defined anywhere in this file):
    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
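
# Hedged client-side sketch: assuming the route above and a local run on
# Flask's default http://127.0.0.1:5000, any OpenAI-compatible client works by
# pointing its base URL at this proxy; several HF tokens can be joined by ';'.
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:5000/api/v1/chat/completions",
#       headers={"Authorization": "Bearer hf_aaaa;hf_bbbb"},
#       json={"messages": [{"role": "user", "content": "Hi"}], "stream": True},
#       stream=True,
#   )
#   for line in resp.iter_lines():
#       if line:
#           print(line.decode())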