from typing import Any, Dict, List

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


class EndpointHandler:
    """Serve text generation from a PEFT adapter merged into its base model.

    On construction the adapter stored at ``path`` is loaded on top of the
    base model named in the adapter's config, merged into the base weights,
    and wrapped in a ``text-generation`` pipeline. Instances are callable
    with a request dict.
    """

    def __init__(self, path: str = "") -> None:
        """Load the base model, tokenizer and adapter, then build the pipeline.

        Args:
            path: Location of the PEFT adapter (passed to
                ``PeftConfig.from_pretrained`` and
                ``PeftModel.from_pretrained``).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision is only a safe choice on the GPU; on the CPU fallback
        # many float16 kernels are missing or very slow, so use float32 there.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        peft_config = PeftConfig.from_pretrained(path)
        base_name = peft_config.base_model_name_or_path

        # NOTE(security): trust_remote_code=True runs Python code shipped with
        # the base model repository. Only point this at repositories you trust.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_name,
            return_dict=True,
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_name,
            trust_remote_code=True,
        )

        peft_model = PeftModel.from_pretrained(base_model, path)
        # Cast again after attaching the adapter so the adapter weights share
        # the base model's dtype before they are merged.
        peft_model.to(dtype)
        peft_model.to(self.device)

        # Fold the adapter into the base weights and drop the PEFT wrapper.
        self.model = peft_model.merge_and_unload()
        self.model.eval()

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device,
            torch_dtype=dtype,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Any]:
        """Run the text-generation pipeline on one request.

        Args:
            data: Request payload. ``"inputs"`` is handed to the pipeline; if
                the key is absent the payload itself is used as the input.
                ``"parameters"``, when present, must be a mapping and is
                forwarded to the pipeline as keyword arguments. Both keys are
                removed from ``data`` (the dict is mutated).

        Returns:
            The pipeline output unchanged. Per the transformers documentation
            this is a list of ``{"generated_text": ...}`` dicts for a single
            prompt (nested one level deeper for a list of prompts).
        """
        inputs = data.pop("inputs", data)
        # A missing or null "parameters" entry means "use pipeline defaults".
        parameters = data.pop("parameters", None) or {}
        return self.pipeline(inputs, **parameters)