""" A light wrapper around a bunch of chat LLMs. The class should define a method that takes text input and returns a response from the model. | |
""" | |

from abc import ABC, abstractmethod
from typing import Generator, Optional
import os
import random
import glob

import openai
import google.generativeai as genai
from llama_cpp import Llama
from huggingface_hub import InferenceClient

# System prompt shared by every backend that accepts system messages.
SYSTEM_PROMPT = (
    "You are PerfGuru, a helpful assistant for assisting developers in identifying "
    "performance bottlenecks in their code and optimizing them."
)


class ChatModel(ABC):
    """Abstract base class for a streaming chat model wrapper."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    @abstractmethod
    def get_response(self, prompt: str) -> Generator[str, None, None]:
        """Yield the accumulated response text as it streams from the model."""


class DummyModel(ChatModel):
    """Echoes the prompt back character by character; useful for testing without API keys."""

    def __init__(self):
        super().__init__("dummy")

    def get_response(self, prompt: str) -> Generator[str, None, None]:
        response = f"Dummy response to: {prompt}"
        for idx in range(len(response)):
            yield response[:idx + 1]


class OpenAIModel(ChatModel):
    """Wrapper around the OpenAI chat completions API."""

    def __init__(self, model: str, client: openai.OpenAI):
        super().__init__(model)
        self.model = model
        self.client = client

    def get_response(self, prompt: str) -> Generator[str, None, None]:
        stream = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            stream=True,
            max_tokens=4096,
        )
        response = ""
        for chunk in stream:
            response += chunk.choices[0].delta.content or ""
            yield response


class GeminiModel(ChatModel):
    """Wrapper around the Google Gemini API (google-generativeai)."""

    def __init__(self, model: str, api_key: Optional[str] = None):
        super().__init__(model)
        if api_key:
            genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model)
        self.config = genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=4096,
        )

    def get_response(self, prompt: str) -> Generator[str, None, None]:
        stream = self.model.generate_content(prompt, stream=True, generation_config=self.config)
        response = ""
        for chunk in stream:
            response += chunk.text or ""
            yield response


class LocalModel(ChatModel):
    """Wrapper around a local GGUF model served with llama-cpp-python."""

    def __init__(self, model: str, model_path: str):
        super().__init__(model)
        self.llm = Llama(
            model_path=model_path,
            n_ctx=4096,
        )

    def get_response(self, prompt: str) -> Generator[str, None, None]:
        outputs = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            max_tokens=4000,
            stream=True,
        )
        response = ""
        for chunk in outputs:
            response += chunk["choices"][0]["delta"].get("content", "")
            yield response


class InferenceHubModel(ChatModel):
    """Wrapper around hosted models on the Hugging Face Inference API."""

    def __init__(self, model: str, client: InferenceClient, supports_system_messages: bool = True):
        super().__init__(model)
        self.model = model
        self.client = client
        self.supports_system_messages = supports_system_messages

    def get_response(self, prompt: str) -> Generator[str, None, None]:
        messages = []
        if self.supports_system_messages:
            messages.append({"role": "system", "content": SYSTEM_PROMPT})
        messages.append({"role": "user", "content": prompt})
        stream = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            stream=True,
            max_tokens=2048,
        )
        response = ""
        for chunk in stream:
            response += chunk.choices[0].delta.content or ""
            yield response


AVAILABLE_MODELS = []

# Discover local GGUF models in the Hugging Face cache and in ./local_models.
if os.environ.get("USE_LOCAL_MODELS") == "1":
    HF_HOME = os.environ.get("HF_HOME", "/home/user/.cache/huggingface")
    GGUF_WILDCARD = os.path.join(HF_HOME, "hub", "models-*", "**", "*.gguf")
    GGUF_PATHS = [(os.path.basename(p), p) for p in glob.glob(GGUF_WILDCARD, recursive=True)]
    LOCAL_MODEL_PATHS = [(os.path.basename(p), p) for p in glob.glob(os.path.join("local_models", "*.gguf"))]
    ALL_LOCAL_MODELS = GGUF_PATHS + LOCAL_MODEL_PATHS
    AVAILABLE_MODELS.extend([
        LocalModel(model_name, model_path)
        for model_name, model_path in ALL_LOCAL_MODELS
        if os.path.exists(model_path)
    ])

# AVAILABLE_MODELS.append( DummyModel() )
if os.environ.get("OPENAI_API_KEY"): | |
openai_client = openai.OpenAI() | |
AVAILABLE_MODELS.append( OpenAIModel("gpt-4o-mini", openai_client) ) | |
AVAILABLE_MODELS.append( OpenAIModel("gpt-3.5-turbo", openai_client) ) | |
if os.environ.get("GOOGLE_API_KEY"): | |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-flash") ) | |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-pro") ) | |
if os.environ.get("HF_API_KEY"): | |
hf_inference_client = InferenceClient(api_key=os.environ.get("HF_API_KEY"), timeout=60) | |
#AVAILABLE_MODELS.append( InferenceHubModel("google/gemma-2-2b-it", hf_inference_client, supports_system_messages=False) ) | |
#AVAILABLE_MODELS.append( InferenceHubModel("Qwen/Qwen2.5-7B-Instruct", hf_inference_client) ) | |
AVAILABLE_MODELS.append( InferenceHubModel("microsoft/Phi-3-mini-4k-instruct", hf_inference_client) ) | |
AVAILABLE_MODELS.append( InferenceHubModel("meta-llama/Llama-3.2-1B-Instruct", hf_inference_client) ) | |
AVAILABLE_MODELS.append( InferenceHubModel("meta-llama/Llama-3.2-3B-Instruct", hf_inference_client) ) | |
AVAILABLE_MODELS.append( InferenceHubModel("meta-llama/Meta-Llama-3.1-8B-Instruct", hf_inference_client) ) | |

if not AVAILABLE_MODELS:
    raise ValueError(
        "No models available. Set at least one of OPENAI_API_KEY, GOOGLE_API_KEY, or HF_API_KEY, "
        "or set USE_LOCAL_MODELS=1 with a GGUF model on disk."
    )


def select_random_model() -> ChatModel:
    """Pick one of the configured models uniformly at random."""
    return random.choice(AVAILABLE_MODELS)
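

# Minimal usage sketch, not part of the module's public interface: assuming at least one
# backend is configured via the environment variables above, this streams a response for
# a hypothetical prompt. The __main__ guard keeps it from running on import.
if __name__ == "__main__":
    model = select_random_model()
    print(f"Using model: {model}")
    # get_response yields the accumulated text, so only the last value is the full reply.
    final = ""
    for partial in model.get_response("How can I speed up a Python loop over a large list?"):
        final = partial
    print(final)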