from unsloth import FastLanguageModel
import torch
import time

class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned model and its tokenizer with Unsloth
        print('Initializing the model....')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_path,  # the model you used for training
            max_seq_length = 2048,
            dtype = None,             # auto-detect (float16 / bfloat16)
            load_in_4bit = True,      # 4-bit quantization to reduce VRAM usage
        )

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Configuring LoRA adapters....')
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r = 16,  # choose any number > 0; suggested 8, 16, 32, 64, 128
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                              "gate_proj", "up_proj", "down_proj"],
            lora_alpha = 16,
            lora_dropout = 0,  # supports any value, but 0 is optimized
            bias = "none",     # supports any value, but "none" is optimized
            # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes
            use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
            random_state = 3407,
            use_rslora = False,   # rank-stabilized LoRA is also supported
            loftq_config = None,  # as is LoftQ
        )

        # Track the device for input tensors; the 4-bit quantized model is
        # already placed on the GPU by Unsloth, so it is not moved with .to()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Initialization completed....')

    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.
        The prompt text is used as the instruction of an Alpaca-style prompt
        and the HTML is passed to the model as the input section.
        """
        # Alpaca-style prompt template used during fine-tuning
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the prompt; the response slot is left blank for generation
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors = "pt",
        ).to(self.device)

        # Generate a response; gradients are disabled to speed up inference
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens = 8000, use_cache = True)
        end_time = time.time()

        # Decode only the newly generated tokens to text
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = generated_tokens.shape[0]
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the generated text along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s"
        }

        return response
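
# A minimal local usage sketch, assuming a hypothetical model repository id and
# an illustrative HTML snippet (neither comes from the original handler). It
# shows how the handler is constructed once and then invoked per request.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="your-username/your-finetuned-model")  # hypothetical repo id
    result = handler(
        html_input="<div class='product'><h1>Example Widget</h1><p>A sample description.</p></div>",
        prompt_text="Extract the product name and description from the HTML.",
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])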