Update handler.py
handler.py (CHANGED: +46 -16)
@@ -7,25 +7,39 @@ class EndpointHandler:
     def __init__(self, model_path=""):
         # Initialize the model and tokenizer
         print('Testoo.... Initializing the Model....')
-
-
-
-
-
-        self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)
+        self.model, self.tokenizer = FastLanguageModel.from_pretrained(model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
+            max_seq_length = 2048,
+            dtype = None,
+            load_in_4bit = True,
+        )
+        #self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
+        #self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)

         # Ensure that the tokenizer's pad token is set correctly
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token

         print('Testoo.... Setting Cudaa....')
+        self.model = FastLanguageModel.get_peft_model(
+            model,
+            r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                              "gate_proj", "up_proj", "down_proj",],
+            lora_alpha = 16,
+            lora_dropout = 0, # Supports any, but = 0 is optimized
+            bias = "none",    # Supports any, but = "none" is optimized
+            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+            use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+            random_state = 3407,
+            use_rslora = False,  # We support rank stabilized LoRA
+            loftq_config = None, # And LoftQ
+        )
         # Set the device to GPU if available, otherwise CPU
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         print('Testoo.... Initialization Completed....')

-    def __call__(self, html_input
+    def __call__(self, html_input):
         """
         Generate a response based on HTML input and a custom prompt text.
         This method formats the input with the provided prompt text and sends it to the model.
@@ -38,20 +52,36 @@

         # Generate a response using the model, disable gradients to speed up computation
         start_time = time.time()
-        with torch.no_grad():
-
-
-
-
-
+        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
         end_time = time.time()

+        inputs = tokenizer(
+        [
+            alpaca_prompt.format(
+                alpaca_prompt_text, # instruction
+                html_input, # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors = "pt").to("cuda")
+
+        outputs = self.model.generate(**inputs, max_new_tokens = 8000, use_cache = True)
+        tokenizer.batch_decode(outputs)
+
         # Decode the output tokens to text
-        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        #output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

         # Compute inference time and tokens per second for performance metrics
         inference_time = end_time - start_time
-        tokens_generated = outputs[0].shape[0]
+        #tokens_generated = outputs[0].shape[0]
         tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

         # Return the formatted response along with timing and performance information
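As committed, the new-side code still has a few loose ends: FastLanguageModel.get_peft_model is called on an undefined name `model` (and attaching fresh LoRA adapters is a training-time step, not an inference one), `tokenizer` and `alpaca_prompt_text` are used without `self.` or any definition in the new `__call__(self, html_input)` signature, `end_time` is captured before `generate` runs, the result of `tokenizer.batch_decode(outputs)` is discarded, and `tokens_generated` is commented out but still used for the tokens-per-second metric. The code below is a minimal sketch of how the handler could be tightened for inference only, assuming the endpoint serves an already fine-tuned Unsloth checkpoint and that FastLanguageModel.for_inference is available in the installed Unsloth version; the `instruction` parameter and the keys of the returned dict are illustrative assumptions, not part of the committed code.

import time

import torch
from unsloth import FastLanguageModel

# Prompt template copied from the committed handler; the response slot is left
# blank so the model fills it in at generation time.
ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned checkpoint and its tokenizer in 4-bit via Unsloth.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=2048,
            dtype=None,          # let Unsloth pick float16/bfloat16
            load_in_4bit=True,   # requires a CUDA GPU (bitsandbytes)
        )
        # Serving an already-trained adapter: switch to inference mode instead of
        # attaching fresh LoRA weights with get_peft_model(), which is a training step.
        FastLanguageModel.for_inference(self.model)

        # Ensure the tokenizer has a pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # The 4-bit model is already placed on the GPU at load time, so only the
        # inputs need to be moved; no self.model.to(device) call is needed.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, html_input, instruction=""):
        # `instruction` stands in for the diff's undefined `alpaca_prompt_text` (assumption).
        prompt = ALPACA_PROMPT.format(instruction, html_input, "")
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Time only the generation itself (the diff records end_time before generating).
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=8000,  # value from the commit; exceeds the 2048 max_seq_length above
                use_cache=True,
            )
        end_time = time.time()

        # Decode the output and count only the newly generated tokens.
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        tokens_generated = outputs[0].shape[0] - inputs["input_ids"].shape[1]

        inference_time = end_time - start_time
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return keys are illustrative; the committed return statement is not shown in the diff.
        return {
            "generated_text": output_text,
            "inference_time_s": inference_time,
            "tokens_per_second": tokens_per_second,
        }

Used as EndpointHandler("path/to/your-model")("<html>...</html>", "Extract the relevant fields") with placeholder paths and instruction text, the sketch returns the decoded response together with the timing and tokens-per-second metrics the original handler already tracks.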