import json
from typing import Any, Dict, List

import torch
from llama_cpp import Llama

MAX_TOKENS = 8192
GPU_LAYERS = 99


class EndpointHandler:
    def __init__(self, data):
        # `data` is accepted for compatibility with the handler interface but is not used.
        # Offload all layers to the GPU when one is available; otherwise run on CPU.
        n_gpu_layers = GPU_LAYERS if torch.cuda.is_available() else 0
        self.model = Llama.from_pretrained(
            "lmstudio-ai/gemma-2b-it-GGUF",
            filename="gemma-2b-it-q4_k_m.gguf",
            n_ctx=MAX_TOKENS,
            cache_dir="./",
            n_gpu_layers=n_gpu_layers,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        inputs = data.pop("inputs", "")

        # Accept either "temperature" or the shorthand "temp".
        temperature = data.pop("temperature", None)
        if temperature is None:
            temperature = data.pop("temp", 0.33)
        if temperature > 3 or temperature < 0:
            return json.dumps({
                "status": "error",
                "reason": "invalid temperature ( 0.0 - 3.0 )",
            })

        top_p = data.pop("top-p", 0.85)
        if top_p > 1 or top_p < 0:
            return json.dumps({
                "status": "error",
                "reason": "invalid top percentage ( 0.01 - 1.00 )",
            })

        top_k = data.pop("top-k", 42)
        if top_k > 100 or top_k < 0:
            return json.dumps({
                "status": "error",
                "reason": "invalid top k ( 0 - 100 )",
            })

        system_prompt = data.pop(
            "system-prompt",
            "You are Gemma. Assist user with whatever they require, in a safe and moral manner.",
        )
        # Gemma-style chat template with explicit turn markers; callers may override
        # it via the "format" field as long as both placeholders are kept.
        prompt_format = data.pop(
            "format",
            "<start_of_turn>system\n{system_prompt}<end_of_turn>\n"
            "<start_of_turn>user\n{prompt}<end_of_turn>\n"
            "<start_of_turn>model\n",
        )
        try:
            prompt = prompt_format.format(system_prompt=system_prompt, prompt=inputs)
        except Exception:
            return json.dumps({"status": "error", "reason": "invalid format"})

        # Pass all validated sampling parameters through to the model
        # (top_k was previously hard-coded to 42 here).
        res = self.model(prompt, temperature=temperature, top_p=top_p, top_k=top_k)
        return res
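

# A minimal local usage sketch. Assumption: this handler follows the common
# custom-inference-endpoint convention of being instantiated once and then
# called with a JSON-like payload per request; the payload keys below simply
# mirror the ones popped in __call__ above.
if __name__ == "__main__":
    handler = EndpointHandler(data=None)
    payload = {
        "inputs": "Explain what a GGUF file is in one sentence.",
        "temperature": 0.33,
        "top-p": 0.85,
        "top-k": 42,
    }
    # On success this prints the llama.cpp completion dict; on a validation
    # failure it prints the JSON error string produced above.
    print(handler(payload))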