Update handler.py
handler.py (CHANGED: +46 -16)
@@ -7,25 +7,39 @@ class EndpointHandler:
     def __init__(self, model_path=""):
         # Initialize the model and tokenizer
         print('Testoo.... Initializing the Model....')
-
-
-
-
-
-        self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)
+        self.model, self.tokenizer = FastLanguageModel.from_pretrained(model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
+            max_seq_length = 2048,
+            dtype = None,
+            load_in_4bit = True,
+        )
+        #self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
+        #self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False,use_fast=True)

         # Ensure that the tokenizer's pad token is set correctly
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token

         print('Testoo.... Setting Cudaa....')
+        self.model = FastLanguageModel.get_peft_model(
+            model,
+            r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                              "gate_proj", "up_proj", "down_proj",],
+            lora_alpha = 16,
+            lora_dropout = 0, # Supports any, but = 0 is optimized
+            bias = "none",    # Supports any, but = "none" is optimized
+            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+            use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+            random_state = 3407,
+            use_rslora = False,  # We support rank stabilized LoRA
+            loftq_config = None, # And LoftQ
+        )
         # Set the device to GPU if available, otherwise CPU
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         print('Testoo.... Initialization Completed....')

-    def __call__(self, html_input
+    def __call__(self, html_input):
         """
         Generate a response based on HTML input and a custom prompt text.
         This method formats the input with the provided prompt text and sends it to the model.
@@ -38,20 +52,36 @@

         # Generate a response using the model, disable gradients to speed up computation
         start_time = time.time()
-        with torch.no_grad():
-
-
-
-
-
+        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
         end_time = time.time()

+        inputs = tokenizer(
+        [
+            alpaca_prompt.format(
+                alpaca_prompt_text, # instruction
+                html_input, # input
+                "", # output - leave this blank for generation!
+            )
+        ], return_tensors = "pt").to("cuda")
+
+        outputs = self.model.generate(**inputs, max_new_tokens = 8000, use_cache = True)
+        tokenizer.batch_decode(outputs)
+
         # Decode the output tokens to text
-        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        #output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

         # Compute inference time and tokens per second for performance metrics
         inference_time = end_time - start_time
-        tokens_generated = outputs[0].shape[0]
+        #tokens_generated = outputs[0].shape[0]
         tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

         # Return the formatted response along with timing and performance information
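As committed, the new-side code still has a few loose ends: FastLanguageModel.get_peft_model is called on an undefined name `model` (and attaching fresh LoRA adapters is a training-time step, not an inference one), `tokenizer` and `alpaca_prompt_text` are used without `self.` or any definition in the new `__call__(self, html_input)` signature, `end_time` is captured before `generate` runs, the result of `tokenizer.batch_decode(outputs)` is discarded, and `tokens_generated` is commented out but still used for the tokens-per-second metric. The code below is a minimal sketch of how the handler could be tightened for inference only, assuming the endpoint serves an already fine-tuned Unsloth checkpoint and that FastLanguageModel.for_inference is available in the installed Unsloth version; the `instruction` parameter and the keys of the returned dict are illustrative assumptions, not part of the committed code.

import time

import torch
from unsloth import FastLanguageModel

# Prompt template copied from the committed handler; the response slot is left
# blank so the model fills it in at generation time.
ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned checkpoint and its tokenizer in 4-bit via Unsloth.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=2048,
            dtype=None,          # let Unsloth pick float16/bfloat16
            load_in_4bit=True,   # requires a CUDA GPU (bitsandbytes)
        )
        # Serving an already-trained adapter: switch to inference mode instead of
        # attaching fresh LoRA weights with get_peft_model(), which is a training step.
        FastLanguageModel.for_inference(self.model)

        # Ensure the tokenizer has a pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # The 4-bit model is already placed on the GPU at load time, so only the
        # inputs need to be moved; no self.model.to(device) call is needed.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, html_input, instruction=""):
        # `instruction` stands in for the diff's undefined `alpaca_prompt_text` (assumption).
        prompt = ALPACA_PROMPT.format(instruction, html_input, "")
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Time only the generation itself (the diff records end_time before generating).
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=8000,  # value from the commit; exceeds the 2048 max_seq_length above
                use_cache=True,
            )
        end_time = time.time()

        # Decode the output and count only the newly generated tokens.
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        tokens_generated = outputs[0].shape[0] - inputs["input_ids"].shape[1]

        inference_time = end_time - start_time
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return keys are illustrative; the committed return statement is not shown in the diff.
        return {
            "generated_text": output_text,
            "inference_time_s": inference_time,
            "tokens_per_second": tokens_per_second,
        }

Used as EndpointHandler("path/to/your-model")("<html>...</html>", "Extract the relevant fields") with placeholder paths and instruction text, the sketch returns the decoded response together with the timing and tokens-per-second metrics the original handler already tracks.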