File size: 1,947 Bytes
caa9e65 5b5a6aa caa9e65 96aa541 93a7cc3 96aa541 caa9e65 9afbac2 5b5a6aa caa9e65 3541e3b 96aa541 caa9e65 96aa541 caa9e65 96aa541 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import json
from typing import Dict, List, Any

import torch
from llama_cpp import Llama
# Token limit constant. Its value equals the n_ctx the handler requests when
# loading the model.
# NOTE(review): presumably meant as the context/generation budget — confirm
# the intended use.
MAX_TOKENS=8192
# Value passed as `n_gpu_layers` to the model loader when CUDA is available;
# the handler passes 0 instead when it is not.
GPU_LAYERS=99
class EndpointHandler():
    """Callable request handler that runs prompts through a GGUF Gemma model.

    The model is loaded once in ``__init__``; each call validates the sampling
    parameters found in the request payload, renders the prompt template and
    returns whatever the model returns. Invalid requests are answered with a
    JSON-encoded ``{"status": "error", "reason": ...}`` string.
    """

    # Inclusive bounds accepted for each sampling parameter.
    # NOTE(review): top-p values above 1 are still accepted to keep the
    # previously enforced range — confirm whether the cap should be 1.
    _TEMPERATURE_BOUNDS = (0, 3)
    _TOP_P_BOUNDS = (0, 3)
    _TOP_K_BOUNDS = (0, 100)

    def __init__(self, data):
        """Load the model, offloading layers to the GPU only when CUDA exists.

        Args:
            data: Accepted for interface compatibility; not used here.
        """
        n_gpu_layers = GPU_LAYERS if torch.cuda.is_available() else 0
        self.model = Llama.from_pretrained(
            "lmstudio-ai/gemma-2b-it-GGUF",
            filename="gemma-2b-it-q4_k_m.gguf",
            n_ctx=8192,
            cache_dir="./",
            n_gpu_layers=n_gpu_layers,
        )

    @staticmethod
    def _error(reason):
        """Return the JSON-encoded error payload for ``reason``."""
        return json.dumps({
            "status": "error",
            "reason": reason
        })

    @staticmethod
    def _in_bounds(value, low, high):
        """Return True when ``value`` is a real number within [low, high].

        Booleans, strings, ``None`` and NaN are rejected so that malformed
        payloads produce an error response instead of a ``TypeError``.
        """
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        return low <= value <= high

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Generate a completion for the request payload ``data``.

        Recognised keys (all removed from ``data`` as they are read):
            inputs: user prompt text (default ``""``).
            temperature / temp: sampling temperature, 0 to 3 (default 0.33).
                ``temp`` is consulted only when ``temperature`` is absent
                or ``None``.
            top-p: 0 to 3 (default 0.85).
            top-k: whole number, 0 to 100 (default 42).
            system-prompt: text substituted for ``{system_prompt}``.
            format: prompt template using ``{system_prompt}`` and ``{prompt}``.

        Returns:
            The model's result on success, or a JSON string describing the
            error when a parameter or the template is invalid.
        """
        inputs = data.pop("inputs", "")

        # Only fall back to "temp" when "temperature" is missing; an explicit
        # temperature of 0 is a legitimate setting and must be honoured.
        temperature = data.pop("temperature", None)
        if temperature is None:
            temperature = data.pop("temp", 0.33)
        if not self._in_bounds(temperature, *self._TEMPERATURE_BOUNDS):
            return self._error("invalid temperature ( 0.00 - 3.00 )")

        top_p = data.pop("top-p", 0.85)
        if not self._in_bounds(top_p, *self._TOP_P_BOUNDS):
            return self._error("invalid top percentage ( 0.00 - 3.00 )")

        top_k = data.pop("top-k", 42)
        # JSON clients may send whole numbers as floats (e.g. 40.0).
        if isinstance(top_k, float) and top_k.is_integer():
            top_k = int(top_k)
        if not isinstance(top_k, int) or not self._in_bounds(top_k, *self._TOP_K_BOUNDS):
            return self._error("invalid top k ( 0 - 100 )")

        system_prompt = data.pop("system-prompt", "You are Gemma. Assist user with whatever they require, in a safe and moral manner.")
        # NOTE(review): the default markers below are kept exactly as they
        # were; verify them against the model's documented chat template.
        template = data.pop("format", "<startofturn>system\n{system_prompt} <endoftext>\n<startofturn>user\n{prompt} <endofturn>\n<startofturn>model")
        # NOTE(security): the template comes from the request, and str.format
        # lets it reach attributes of the substituted values. Only plain
        # strings are exposed here; do not pass richer objects.
        try:
            prompt = template.format(system_prompt=system_prompt, prompt=inputs)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            return self._error("invalid format")

        # Pass the validated top_k through rather than a fixed value.
        return self.model(prompt, temperature=temperature, top_p=top_p, top_k=top_k)