Spaces:
Runtime error
Runtime error
File size: 6,363 Bytes
4cf05e2 87928b2 1b83353 dffcd99 87928b2 822516c 87928b2 ec80f26 50f9f62 87928b2 b560d3f 87928b2 b560d3f 87928b2 b560d3f 3964343 de36b22 87928b2 3964343 4e76cb1 87928b2 3964343 95ffb37 3964343 87928b2 3964343 87928b2 16e52aa 822516c 35ae555 87928b2 dffcd99 5847cfe 0591344 5847cfe 3964343 0591344 822516c 0591344 ebc22be 1b83353 90624da 95ffb37 5847cfe 95ffb37 c8e35b7 da3119b 90624da 67ca568 822516c 87928b2 95ffb37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import uvicorn
import re
from spaces import GPU
# ASGI application object; the route and middleware decorators in this module register on it.
app = FastAPI()
# Module-wide shared state.
#   'models' - starts empty; holds the name -> model mapping used by the
#              request handlers.
#   'tokens' - short special-token names mapped to placeholder strings.
global_data = {
    'models': {},
    'tokens': dict(
        eos='eos_token',
        pad='pad_token',
        padding='padding_token',
        unk='unk_token',
        bos='bos_token',
        sep='sep_token',
        cls='cls_token',
        mask='mask_token',
    ),
}
# GGUF models to load at start-up. Each entry gives the repository id, the
# file inside that repository, and the display name used as the key for the
# loaded model. Names must be unique: the loaded models are stored by name,
# so a repeated entry only queues a redundant load of the same file.
model_configs = [
    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
    {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
]
class ModelManager:
    """Load the configured models once and keep them keyed by display name.

    Attributes:
        loaded: True once ``load_all_models`` has run its loading pass.
        models: Mapping of config ``name`` to the loaded model object.
    """

    def __init__(self):
        self.loaded = False
        self.models = {}

    @staticmethod
    def _unique_configs(configs):
        """Return *configs* with later entries that repeat a ``name`` dropped.

        Order is preserved and the first entry for each name wins.
        """
        first_by_name = {}
        for config in configs:
            first_by_name.setdefault(config['name'], config)
        return list(first_by_name.values())

    def load_model(self, model_config):
        """Load one model via ``Llama.from_pretrained`` unless already present.

        Args:
            model_config: Mapping with ``name``, ``repo_id`` and ``filename``.

        Failures are printed and otherwise ignored, so the remaining models
        can still be loaded; the failed name is simply absent from
        ``self.models``.
        """
        name = model_config['name']
        if name in self.models:
            return
        try:
            self.models[name] = Llama.from_pretrained(
                repo_id=model_config['repo_id'],
                filename=model_config['filename'],
            )
        except Exception as e:
            print(f"Error loading model {model_config['name']}: {e}")

    def load_all_models(self):
        """Load every entry of ``model_configs`` in worker threads (first call only).

        Returns:
            The ``self.models`` mapping (the same dict object on every call).
        """
        if not self.loaded:
            # The membership check in load_model is check-then-act: two
            # workers handed the same name would both pass it and load the
            # model twice. De-duplicate before fanning out.
            pending = self._unique_configs(model_configs)
            # Leaving the ``with`` block waits for all submitted loads.
            with ThreadPoolExecutor() as executor:
                for config in pending:
                    executor.submit(self.load_model, config)
            self.loaded = True
        return self.models
# Single manager instance for the process; models are loaded here, at import time.
model_manager = ModelManager()
# Point the shared 'models' slot at the mapping returned by the manager.
global_data['models'] = model_manager.load_all_models()
class ChatRequest(BaseModel):
    """Request body for the ``/generate`` endpoint."""

    # Prompt text forwarded to every loaded model.
    message: str
def normalize_input(input_text: str) -> str:
    """Return *input_text* with leading and trailing whitespace removed."""
    return input_text.strip()
def remove_duplicates(text):
    """Strip ``[/INST]`` markers and repeated lines from generated text.

    Steps, in order:
      1. Collapse back-to-back repeats of two specific prompt echoes
         (each ending in ``[/INST]``) to a single copy.
      2. Remove every ``[/INST]`` marker.
      3. Drop any line that already appeared earlier in the text, keeping
         the first occurrence and the original order. Blank lines count
         as lines, so only the first blank line survives.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, lines joined with ``'\\n'``.
    """
    text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
    text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
    text = text.replace('[/INST]', '')
    # dict.fromkeys keeps the first occurrence of each line, in order.
    return '\n'.join(dict.fromkeys(text.split('\n')))
@GPU(duration=0)
def generate_model_response(model, inputs):
    """Run one model on the prompt and return its cleaned completion text.

    Args:
        model: Callable invoked as ``model(inputs)``; its result is indexed
            as ``result['choices'][0]['text']``.
        inputs: Prompt text, passed to the model unchanged.

    Returns:
        The completion text after ``remove_duplicates``, or ``""`` if the
        call or the result lookup raised.
    """
    try:
        completion = model(inputs)
        return remove_duplicates(completion['choices'][0]['text'])
    except Exception as e:
        # Best-effort: report the failure and hand back an empty answer
        # instead of propagating to the caller.
        print(f"Error generating model response: {e}")
        return ""
@app.post("/generate")
async def generate(request: ChatRequest):
    """Send the request message to every loaded model and collect the replies.

    Args:
        request: Parsed request body; only ``request.message`` is read.

    Returns:
        A dict mapping each model name to that model's response text.

    Raises:
        HTTPException: status 500 if anything in the fan-out fails.
    """
    try:
        inputs = normalize_input(request.message)
        # Snapshot (name, model) pairs once so every future can be matched
        # back to the model that produced it.
        named_models = list(global_data['models'].items())
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(generate_model_response, model, inputs)
                for _, model in named_models
            ]
            # Pair names with futures in submission order. Iterating
            # as_completed() here would yield futures in completion order
            # and attach responses to the wrong model names.
            # NOTE(review): future.result() blocks the event loop while the
            # models run, since this is an async handler.
            responses = [
                {'model': model_name, 'response': future.result()}
                for (model_name, _), future in zip(named_models, futures)
            ]
        unique_responses = remove_repetitive_responses(responses)
        return unique_responses
    except Exception as e:
        print(f"Error generating responses: {e}")
        raise HTTPException(status_code=500, detail="Error generating responses") from e
@app.middleware("http")
async def process_request(request: Request, call_next):
    """Pass the request down the stack and turn unhandled errors into a JSON 500.

    Args:
        request: Incoming HTTP request.
        call_next: Callable that forwards the request to the rest of the app
            and returns its response.

    Returns:
        The downstream response, or a 500 JSON response with body
        ``{"detail": "Internal Server Error"}`` if the downstream call raised.
    """
    # Imported here so the block is self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.responses import JSONResponse

    try:
        return await call_next(request)
    except Exception as e:
        print(f"Request error: {e}")
        # An HTTPException raised from middleware is not seen by the app's
        # exception handlers (they run inside this layer), so build the
        # error response directly instead of raising.
        return JSONResponse(status_code=500, content={"detail": "Internal Server Error"})
def remove_repetitive_responses(responses):
    """Collapse a list of per-model responses into one entry per model.

    Args:
        responses: Iterable of dicts with ``'model'`` and ``'response'`` keys.

    Returns:
        A dict mapping model name to response. When a model name occurs more
        than once, the first occurrence wins; insertion order follows first
        appearance.
    """
    first_by_model = {}
    for entry in responses:
        model_name = entry['model']
        if model_name in first_by_model:
            continue
        first_by_model[model_name] = entry['response']
    return first_by_model
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)