from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
from unsloth import FastLanguageModel
class EndpointHandler:
    def __init__(self, model_path=""):
        # Initialize the model and tokenizer
        print('Testoo.... Initializing the Model....')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,  # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length=2048,
            dtype=None,             # None = auto-detect (float16 / bfloat16)
            load_in_4bit=True,
        )
        #self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
        #self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False, use_fast=True)

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Testoo.... Setting Cudaa....')
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,  # Supports any, but = 0 is optimized
            bias="none",     # Supports any, but = "none" is optimized
            # "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
            random_state=3407,
            use_rslora=False,   # We support rank stabilized LoRA
            loftq_config=None,  # And LoftQ
        )

        # Record the device. The 4-bit quantized model is already placed on the GPU
        # by FastLanguageModel.from_pretrained, so no explicit .to() call is needed
        # (and .to() is not supported for bitsandbytes-quantized models).
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Testoo.... Initialization Completed....')

    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.
        This method formats the input with the provided prompt text and sends it to the model.
        """
        # Alpaca-style prompt template the model was fine-tuned on
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the prompt with appropriate padding and truncation
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(self.device)

        # Generate a response using the model, disable gradients to speed up computation
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        # Decode the output tokens to text
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0] - inputs["input_ids"].shape[1]  # count only newly generated tokens
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the formatted response along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s"
        }
        return response
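

# --- Usage sketch (not part of the handler) ---------------------------------
# A minimal local smoke test, assuming a CUDA GPU and an Unsloth-compatible
# checkpoint. The model path, HTML snippet, and instruction below are
# placeholders for illustration, not values from the original repository.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="your-username/your-finetuned-model")  # hypothetical path
    result = handler(
        html_input="<html><body><h1>Example page</h1></body></html>",
        prompt_text="Extract the main heading from the following HTML.",
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])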