from typing import Dict, List, Any


class EndpointHandler():
    """Custom inference handler wrapping a 4-bit quantized causal language model.

    The model and tokenizer are loaded once in ``__init__``; each call to the
    instance tokenizes the prompt, runs greedy generation and returns only
    the newly generated text.
    """

    # Used when the request does not specify ``max_new_tokens`` anywhere.
    DEFAULT_MAX_NEW_TOKENS = 256

    def __init__(self, path=""):
        """Load the quantized model and its tokenizer from ``path``.

        Args:
            path: Directory or hub identifier passed to ``from_pretrained``
                for both the model and the tokenizer.
        """
        # Imported locally: these are only needed at load time, and keeping
        # them out of module scope leaves the module importable on machines
        # without the GPU stack installed.
        import torch
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            BitsAndBytesConfig,
        )

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        # device_map={"": 0} places the whole model on device index 0.
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=bnb_config,
            device_map={"": 0},
        )
        # NOTE(review): add_eos_token=True is typically a fine-tuning setting;
        # at inference it may append EOS to every prompt. Kept as-is to
        # preserve existing behaviour -- confirm this is intended.
        self.tokenizer = AutoTokenizer.from_pretrained(path, add_eos_token=True)

    def _resolve_max_new_tokens(self, data: Dict[str, Any]) -> int:
        """Return the generation budget requested in ``data``.

        Lookup order: top-level ``max_new_tokens`` (the original contract),
        then ``parameters["max_new_tokens"]``, then ``DEFAULT_MAX_NEW_TOKENS``.
        """
        if "max_new_tokens" in data:
            return int(data["max_new_tokens"])
        parameters = data.get("parameters") or {}
        return int(parameters.get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS))

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Generate a completion for ``data["inputs"]``.

        Args:
            data: Request payload. ``inputs`` (str) is required.
                ``max_new_tokens`` may be given at the top level or inside a
                ``parameters`` dict; it defaults to ``DEFAULT_MAX_NEW_TOKENS``.

        Returns:
            ``{"output": <generated text>}`` with the prompt removed.

        Raises:
            KeyError: If ``inputs`` is missing from ``data``.
        """
        prompt = data["inputs"]
        max_new_tokens = self._resolve_max_new_tokens(data)

        encoded = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=True
        )
        # The tokenizer output is not on the model's device by default;
        # move it so generate() does not mix devices.
        encoded = encoded.to(self.model.device)
        prompt_token_count = len(encoded["input_ids"][0])

        generated_ids = self.model.generate(
            **encoded, max_new_tokens=max_new_tokens, do_sample=False
        )

        # Drop the prompt at the token level. Slicing the decoded string by
        # len(prompt) is only correct when decoding reproduces the prompt
        # character-for-character, which tokenizers do not guarantee.
        completion_ids = generated_ids[0][prompt_token_count:]
        completion = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
        return {"output": completion}