from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    """Callable handler that scores input text with a causal language model.

    On construction the model and tokenizer are loaded; each call tokenizes
    the request's text and returns the model's loss for that text when the
    input ids are also used as the labels.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer.

        Args:
            path: Accepted for interface compatibility with the caller.
                NOTE(review): the value is overwritten below, so whatever the
                caller passes is ignored — confirm this is intentional.
        """
        # The checkpoint id is hard-coded and replaces the ``path`` argument.
        path = "tiiuae/falcon-40b"
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            load_in_8bit=True,
            trust_remote_code=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Device the tokenized inputs are moved to before the forward pass.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data: Dict[str, Any]) -> float:
        """Return the model's loss for the text supplied in ``data``.

        Args:
            data: Request payload. The ``"inputs"`` entry is removed from the
                dict and used as the text to score; if the key is absent the
                whole payload is handed to the tokenizer instead.

        Returns:
            The loss reported by the model, as a Python float, computed with
            the input ids passed as the labels.
        """
        # ``pop`` mutates the caller's dict; kept so existing callers observe
        # the same side effect as before.
        input_text = data.pop("inputs", data)

        inputs = self.tokenizer(input_text, return_tensors="pt")
        input_ids = inputs.input_ids.to(self.device)
        attention_mask = inputs.attention_mask.to(self.device)

        # Only the scalar loss value is needed, so skip autograd bookkeeping:
        # this avoids retaining activations for a backward pass that never
        # happens.
        with torch.no_grad():
            output = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids,
            )

        return output.loss.item()