from unsloth import FastLanguageModel
import torch
import time
class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned model and tokenizer in 4-bit via Unsloth
        print('Testoo.... Initializing the Model....')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,   # the model used for training
            max_seq_length=2048,
            dtype=None,              # auto-detect dtype
            load_in_4bit=True,
        )

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attach LoRA adapters to the loaded model
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,                    # LoRA rank; suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,          # supports any, but = 0 is optimized
            bias="none",             # supports any, but = "none" is optimized
            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
            random_state=3407,
            use_rslora=False,        # rank-stabilized LoRA supported
            loftq_config=None,       # and LoftQ
        )

        print('Testoo.... Setting Cudaa....')
        # Keep the device handy for moving inputs. The 4-bit model is already
        # placed on GPU by from_pretrained, and calling .to() on a quantized
        # model raises an error, so the model itself is not moved here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Testoo.... Initialization Completed....')
    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.

        The instruction (prompt_text) and the HTML input are formatted into an
        Alpaca-style prompt and sent to the model.
        """
        # Alpaca-style prompt template: instruction, input, and an empty
        # response slot for the model to complete.
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the formatted prompt and move it to the model's device
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
        ).to(self.device)

        # Generate a response; disable gradients to speed up computation and
        # time the call for performance metrics
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        # Decode only the newly generated tokens to text
        prompt_length = inputs["input_ids"].shape[1]
        output_text = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0] - prompt_length
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the formatted response along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s"
        }
        return response
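

# A minimal local smoke test, assuming the model weights live at "./model"
# (a placeholder path) and that the handler is invoked directly with a raw
# HTML string, the same way the serving toolkit would call it. This is an
# illustrative sketch, not part of the handler API.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="./model")  # hypothetical local path
    result = handler(
        "<html><body><h1>Example page</h1></body></html>",
        prompt_text="Summarize the content of this HTML page.",  # example instruction
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])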