from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
from unsloth import FastLanguageModel


class EndpointHandler:
    def __init__(self, model_path=""):
        # Initialize the model and tokenizer
        print('Initializing the model...')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_path,  # the model you used for training
            max_seq_length = 2048,
            dtype = None,             # auto-detect (float16 / bfloat16)
            load_in_4bit = True,
        )
        # self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
        # self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False, use_fast=True)

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Attaching LoRA adapters...')
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r = 16,  # any rank > 0; suggested 8, 16, 32, 64, 128
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                              "gate_proj", "up_proj", "down_proj"],
            lora_alpha = 16,
            lora_dropout = 0,     # supports any value, but 0 is optimized
            bias = "none",        # supports any value, but "none" is optimized
            # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes
            use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
            random_state = 3407,
            use_rslora = False,   # rank-stabilized LoRA is supported
            loftq_config = None,  # LoftQ is supported
        )

        # Track the device for input tensors: GPU if available, otherwise CPU.
        # Note: the 4-bit model is already placed on the GPU by from_pretrained,
        # and .to() is not supported for 4-bit bitsandbytes models, so the model
        # itself is not moved here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Initialization completed.')

    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom instruction.
        This method formats the instruction and HTML into an Alpaca-style
        prompt and sends it to the model.
        """
        # Alpaca-style prompt template used during fine-tuning
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the prompt; the response slot is left blank for generation
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors = "pt",
        ).to(self.device)

        # Generate a response using the model; disable gradients to speed up computation
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens = 8000, use_cache = True)
        end_time = time.time()

        # Decode the output tokens to text
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0] - inputs["input_ids"].shape[1]  # newly generated tokens only
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the formatted response along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s",
        }
        return response
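

# --- Usage sketch (assumption, not part of the original handler) ---
# A minimal local smoke test showing how the handler might be invoked.
# The model path and the sample inputs below are hypothetical placeholders;
# a HuggingFace Inference Endpoint would normally construct EndpointHandler
# itself and call it with the request payload.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="your-org/your-finetuned-model")  # hypothetical repo id
    sample_html = "<html><body><h1>Widget</h1><p>Price: $10</p></body></html>"
    result = handler(
        sample_html,
        prompt_text="Extract the product name and price from the HTML.",  # example instruction
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])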