# Hugging Face Spaces page header (Space status: Paused) - not part of the program.
from flask import Flask, request, Response | |
import logging | |
from llama_cpp import Llama | |
import threading | |
from huggingface_hub import snapshot_download | |
# Default system instruction (Russian) used when building the system message.
SYSTEM_PROMPT = (
    "Ты — русскоязычный автоматический ассистент. "
    "Ты максимально точно и отвечаешь на запросы пользователя, "
    "используя русский язык."
)

# Hard-coded token ids spliced into every prompt message.
# NOTE(review): presumably these are the model-vocabulary ids of the role
# names and of the newline character - confirm against the tokenizer in use.
SYSTEM_TOKEN = 1788
USER_TOKEN = 1404
BOT_TOKEN = 9225
LINEBREAK_TOKEN = 13

# Role name -> role token id, looked up by get_message_tokens().
ROLE_TOKENS = dict(
    user=USER_TOKEN,
    bot=BOT_TOKEN,
    system=SYSTEM_TOKEN,
)
# Module-level lock. No visible code acquires it at present (the only
# `with lock:` in this file is commented out).
lock = threading.Lock()

app = Flask(__name__)
app.logger.setLevel(logging.DEBUG)  # most verbose level for the Flask logger

# Model selection. An alternative left behind by the author:
#   repo "IlyaGusev/saiga2_13b_gguf", file "model-q4_K.gguf".
repo_name = "IlyaGusev/saiga2_70b_gguf"
model_name = "ggml-model-q4_1.gguf"

# Runs at import time: fetch the selected model file from the Hub into the
# current directory, where the request handlers load it by `model_name`.
snapshot_download(
    repo_id=repo_name,
    local_dir=".",
    allow_patterns=model_name,
)
def get_message_tokens(model, role, content):
    """Build the token sequence for one chat message.

    Args:
        model: Object providing ``tokenize(bytes)`` and ``token_eos()``.
        role: Key of ``ROLE_TOKENS`` ("user", "bot" or "system").
        content: Message text; it is UTF-8 encoded before tokenizing.

    Returns:
        The list returned by ``model.tokenize`` with the role token and a
        line-break token placed at positions 1 and 2, and the end-of-sequence
        token appended.

    Raises:
        KeyError: If ``role`` is not a key of ``ROLE_TOKENS``.
    """
    encoded = model.tokenize(content.encode("utf-8"))
    # Splice the role header in right after the first token.
    # NOTE(review): presumably the first token is BOS - confirm with tokenizer.
    encoded[1:1] = [ROLE_TOKENS[role], LINEBREAK_TOKEN]
    encoded.append(model.token_eos())
    return encoded
def get_system_tokens(model):
    """Return the message tokens for the default system prompt.

    Equivalent to ``get_message_tokens`` called with role "system" and
    ``SYSTEM_PROMPT`` as the content.
    """
    return get_message_tokens(model, role="system", content=SYSTEM_PROMPT)
def get_system_tokens_for_preprompt(model, preprompt):
    """Return the message tokens for a caller-supplied system prompt.

    Args:
        model: Tokenizer/model object forwarded to ``get_message_tokens``.
        preprompt: Text used as the content of the "system" message.
    """
    return get_message_tokens(model, role="system", content=preprompt)
# NOTE(review): these two messages used to bracket a system-prompt evaluation
# step that is now disabled, so they are simply logged back to back at import.
app.logger.info("Evaluating system tokens start")
app.logger.info("Evaluating system tokens end")

# Shared flag: set by handler_stop_generation(), polled and cleared by
# generate_tokens(), and cleared again at the start of each generate handler.
stop_generation = False
def generate_tokens(model, generator):
    """Stream detokenized output for every token produced by ``generator``.

    Args:
        model: Object providing ``token_eos()`` and ``detokenize(tokens)``.
        generator: Iterable of token ids.

    Yields:
        The result of ``model.detokenize([token])`` for each token, in order.
        A single empty ``b''`` chunk is yielded when the stream ends early:
        on the end-of-sequence token, when the module-level
        ``stop_generation`` flag is set, or when producing a token raises.
        If ``generator`` is simply exhausted, nothing extra is yielded.
    """
    global stop_generation
    app.logger.info('generate_tokens started')
    try:
        # The end-of-sequence id does not change during the stream, so fetch
        # it once instead of once per token.
        eos_token = model.token_eos()
        for token in generator:
            if token == eos_token or stop_generation:
                # Clear the flag so the next stream is not cut short.
                stop_generation = False
                app.logger.info('Abort generating')
                yield b''  # End of chunk
                break
            yield model.detokenize([token])
    except Exception:
        # Best effort: end the HTTP stream cleanly, but record the traceback
        # instead of silently discarding the error.
        app.logger.exception('generator exception')
        yield b''  # End of chunk
def handler_stop_generation():
    """Ask the running token stream to stop.

    Raises the module-level ``stop_generation`` flag, which
    ``generate_tokens`` checks before emitting each token, and answers with a
    plain-text "Stopped" response.

    NOTE(review): no route registration is visible in this view - confirm how
    this handler is bound to a URL.
    """
    global stop_generation
    stop_generation = True
    reply = Response('Stopped', content_type='text/plain')
    return reply
def generate_unknown_response():
    """Log an unexpected request and answer with a fixed plain-text message.

    The HTTP method and, when present, the JSON payload are written to the
    application log. The response body is always "What do you want?".

    NOTE(review): no route registration is visible in this view - confirm how
    this handler is bound to a URL.
    """
    app.logger.info('unknown method: %s', request.method)
    # silent=True makes get_json() return None for a missing or malformed
    # body instead of raising. The payload is passed as a logging argument
    # because it is a dict/list, not a str, and cannot be concatenated.
    request_payload = request.get_json(silent=True)
    if request_payload is None:
        app.logger.info('payload empty')
    else:
        app.logger.info('payload: %s', request_payload)
    return Response('What do you want?', content_type='text/plain')
def generate_search_request():
    """Rewrite the user's text as a search-engine query and stream the result.

    Reads a JSON request body with these keys:
        query: User text; only its first 200 characters are used.
        preprompt: System instruction for the model; a built-in Russian
            instruction is used when absent.
        parameters: Optional dict. ``repetition_penalty`` (default 1.2) and
            ``max_new_tokens`` (default 1024) are honoured; temperature,
            top_p and top_k are fixed in code.

    Returns:
        A streaming ``text/plain`` response fed by ``generate_tokens``.

    NOTE(review): the model is constructed on every call, and the
    ``stop_generation`` flag is shared by all requests - confirm that both
    are intended before serving concurrent clients.
    """
    global stop_generation
    from itertools import islice

    stop_generation = False
    data = request.get_json()
    app.logger.info(data)
    user_query = data.get("query", "")
    preprompt = data.get("preprompt", "Ты — русскоязычный автоматический ассистент для написании запросов для поисковых систем на русском языке. Отвечай на сообщения пользователя только текстом поискового запроса, релевантным запросу пользователя. Если запрос пользователя уже хорош, используй его в качестве результата.")
    parameters = data.get("parameters", {})

    # Sampling settings: fixed values plus the two request-tunable ones.
    temperature = 0.01
    top_p = 0.8
    top_k = 20
    repetition_penalty = parameters.get("repetition_penalty", 1.2)
    try:
        max_new_tokens = max(0, int(parameters.get("max_new_tokens", 1024)))
    except (TypeError, ValueError):
        # An unusable value falls back to the default rather than failing.
        max_new_tokens = 1024

    model = Llama(
        model_path=model_name,
        n_ctx=2000,
        n_parts=1,
        logits_all=True,
        verbose=True,
        n_gpu_layers=40,
        n_gqa=8,  # must be set for 70b models
    )

    # Prompt layout: system message, line break, user message, then the
    # opening of the bot turn. The user message is appended to the system
    # tokens (previously it replaced them, so the pre-prompt was never sent).
    tokens = get_system_tokens_for_preprompt(model, preprompt)
    tokens.append(LINEBREAK_TOKEN)
    tokens += get_message_tokens(model=model, role="user", content=user_query[:200])
    tokens += [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]

    generator = model.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temperature,
        repeat_penalty=repetition_penalty,
    )
    # Enforce the requested cap on the number of generated tokens.
    limited_generator = islice(generator, max_new_tokens)

    # Use Response to stream tokens
    return Response(generate_tokens(model, limited_generator), content_type='text/plain', status=200, direct_passthrough=True)
def generate_response():
    """Generate the assistant's next chat turn and stream it to the client.

    Reads a JSON request body with these keys:
        messages: List of dicts. A message whose ``from`` is "assistant" is
            encoded with the "bot" role, any other with the "user" role;
            ``content`` holds the text (empty when missing).
        parameters: Optional dict. ``repetition_penalty`` (default 1.2) and
            ``max_new_tokens`` (default 1024) are honoured; temperature,
            top_p and top_k are fixed in code.
    Other keys, including ``preprompt``, are not used.

    Returns:
        A streaming ``text/plain`` response fed by ``generate_tokens``.

    NOTE(review): the model is constructed on every call, and the
    ``stop_generation`` flag is shared by all requests - confirm that both
    are intended before serving concurrent clients.
    """
    global stop_generation
    from itertools import islice

    stop_generation = False
    data = request.get_json()
    app.logger.info(data)
    messages = data.get("messages", [])
    parameters = data.get("parameters", {})

    # Sampling settings: fixed values plus the two request-tunable ones.
    temperature = 0.02
    top_p = 0.8  # was 80, which is outside the valid (0, 1] probability range
    top_k = 25
    repetition_penalty = parameters.get("repetition_penalty", 1.2)
    try:
        max_new_tokens = max(0, int(parameters.get("max_new_tokens", 1024)))
    except (TypeError, ValueError):
        # An unusable value falls back to the default rather than failing.
        max_new_tokens = 1024

    model = Llama(
        model_path=model_name,
        n_ctx=2000,
        n_parts=1,
        logits_all=True,
        verbose=True,
        n_gpu_layers=40,
        n_gqa=8,  # must be set for 70b models
    )

    # Prompt layout: system message, line break, every chat message in
    # order, then the opening of the bot turn. The system tokens are kept
    # (previously the list was reset to [] right after being built).
    tokens = get_system_tokens(model)
    tokens.append(LINEBREAK_TOKEN)
    for message in messages:
        role = "bot" if message.get("from") == "assistant" else "user"
        tokens.extend(
            get_message_tokens(model=model, role=role, content=message.get("content", ""))
        )
    tokens.extend([model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN])

    app.logger.info('Prompt:')
    app.logger.info(model.detokenize(tokens).decode("utf-8", errors="ignore"))
    app.logger.info('Generate started')
    generator = model.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temperature,
        repeat_penalty=repetition_penalty,
    )
    app.logger.info('Generator created')
    # Enforce the requested cap on the number of generated tokens.
    limited_generator = islice(generator, max_new_tokens)

    # Use Response to stream tokens
    return Response(generate_tokens(model, limited_generator), content_type='text/plain', status=200, direct_passthrough=True)
if __name__ == "__main__":
    # Script entry point: serve on every interface, port 7860, with the
    # debugger off and one thread per request.
    app.run(
        host="0.0.0.0",
        port=7860,
        debug=False,
        threaded=True,
    )