Dhanu459 committed
Commit 0ed0366
1 Parent(s): c3560b5

Update handler.py

Files changed (1)
  1. handler.py +46 -16
handler.py CHANGED
@@ -7,25 +7,39 @@ class EndpointHandler:
     def __init__(self, model_path=""):
         # Initialize the model and tokenizer
         print('Testoo.... Initializing the Model....')
-        #self.model, self.tokenizer = FastLanguageModel.from_pretrained(model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
-        #                                                               max_seq_length = 2048,
-        #                                                               dtype = None,
-        #                                                               load_in_4bit = True,
-        #                                                               )
-        self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)
+        self.model, self.tokenizer = FastLanguageModel.from_pretrained(model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
+                                                                       max_seq_length = 2048,
+                                                                       dtype = None,
+                                                                       load_in_4bit = True,
+                                                                       )
+        #self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
+        #self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)
 
         # Ensure that the tokenizer's pad token is set correctly
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         print('Testoo.... Setting Cudaa....')
+        self.model = FastLanguageModel.get_peft_model(
+            model,
+            r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                              "gate_proj", "up_proj", "down_proj",],
+            lora_alpha = 16,
+            lora_dropout = 0, # Supports any, but = 0 is optimized
+            bias = "none", # Supports any, but = "none" is optimized
+            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+            use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+            random_state = 3407,
+            use_rslora = False, # We support rank stabilized LoRA
+            loftq_config = None, # And LoftQ
+        )
         # Set the device to GPU if available, otherwise CPU
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         print('Testoo.... Initialization Completed....')
 
-    def __call__(self, html_input, prompt_text):
+    def __call__(self, html_input):
         """
         Generate a response based on HTML input and a custom prompt text.
         This method formats the input with the provided prompt text and sends it to the model.
@@ -38,20 +52,36 @@ class EndpointHandler:
 
         # Generate a response using the model, disable gradients to speed up computation
         start_time = time.time()
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **encoded_input,
-                max_new_tokens=8000, # Adjust token limit as necessary
-                use_cache=True
-            )
+        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+        ### Instruction:
+        {}
+
+        ### Input:
+        {}
+
+        ### Response:
+        {}"""
         end_time = time.time()
 
+        inputs = tokenizer(
+            [
+                alpaca_prompt.format(
+                    alpaca_prompt_text, # instruction
+                    html_input, # input
+                    "", # output - leave this blank for generation!
+                )
+            ], return_tensors = "pt").to("cuda")
+
+        outputs = self.model.generate(**inputs, max_new_tokens = 8000, use_cache = True)
+        tokenizer.batch_decode(outputs)
+
         # Decode the output tokens to text
-        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        #output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
         # Compute inference time and tokens per second for performance metrics
         inference_time = end_time - start_time
-        tokens_generated = outputs[0].shape[0]
+        #tokens_generated = outputs[0].shape[0]
         tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0
 
         # Return the formatted response along with timing and performance information
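
As committed, the new code path leaves a few names unresolved: `get_peft_model` is passed a bare `model`, `__call__` uses `tokenizer` and `alpaca_prompt_text` without a `self.` prefix or a definition, `end_time` is captured before `generate` runs, and the commented-out `output_text` / `tokens_generated` are still needed by the metrics and the return value. Below is a minimal sketch of how the handler could be wired up end to end. The `ALPACA_INSTRUCTION` constant, the returned dictionary keys, and the use of unsloth's `FastLanguageModel.for_inference` in place of the training-time `get_peft_model` call are assumptions for illustration, not part of this commit.

```python
import time

import torch
from unsloth import FastLanguageModel

# Hypothetical instruction text; the commit references an undefined `alpaca_prompt_text`.
ALPACA_INSTRUCTION = "Extract the requested information from the HTML below."

ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned model and tokenizer in 4-bit via unsloth.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True,
        )
        # Inference-only endpoint: enable unsloth's inference mode rather than
        # re-attaching LoRA adapters with get_peft_model (a training-time call).
        FastLanguageModel.for_inference(self.model)

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # The 4-bit model is already placed on GPU by unsloth; keep the device
        # only for moving input tensors.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, html_input):
        # Format the Alpaca-style prompt, leaving the response slot blank.
        prompt = ALPACA_PROMPT.format(ALPACA_INSTRUCTION, html_input, "")
        encoded = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Time only the generation call.
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**encoded, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        # Decode the output tokens to text
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0]  # prompt + generated tokens, as in the original handler
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Response schema is illustrative; the committed return statement is not shown in this diff.
        return {
            "generated_text": output_text,
            "inference_time_s": inference_time,
            "tokens_per_second": tokens_per_second,
        }
```

Serving a request would then look like `EndpointHandler("<model-path>")(html_string)`, with the instruction text adjusted to whatever task the model was fine-tuned for.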