from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
from unsloth import FastLanguageModel
class EndpointHandler:
    def __init__(self, model_path=""):
        # Initialize the model and tokenizer
        print('Testoo.... Initializing the Model....')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,  # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length=2048,
            dtype=None,             # None = auto-detect (float16 / bfloat16)
            load_in_4bit=True,
        )
        #self.model = AutoPeftModelForCausalLM.from_pretrained(model_path)
        #self.tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False, use_fast=True)

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Testoo.... Setting Cudaa....')
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,  # Supports any, but = 0 is optimized
            bias="none",     # Supports any, but = "none" is optimized
            # "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
            random_state=3407,
            use_rslora=False,   # We support rank stabilized LoRA
            loftq_config=None,  # And LoftQ
        )

        # Record the device. The 4-bit quantized model is already placed on the GPU
        # by FastLanguageModel.from_pretrained, so no explicit .to() call is needed
        # (and .to() is not supported for bitsandbytes-quantized models).
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Testoo.... Initialization Completed....')

    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.
        This method formats the input with the provided prompt text and sends it to the model.
        """
        # Alpaca-style prompt template the model was fine-tuned on
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the prompt with appropriate padding and truncation
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(self.device)

        # Generate a response using the model, disable gradients to speed up computation
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        # Decode the output tokens to text
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0] - inputs["input_ids"].shape[1]  # count only newly generated tokens
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the formatted response along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s"
        }
        return response
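

# --- Usage sketch (not part of the handler) ---------------------------------
# A minimal local smoke test, assuming a CUDA GPU and an Unsloth-compatible
# checkpoint. The model path, HTML snippet, and instruction below are
# placeholders for illustration, not values from the original repository.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="your-username/your-finetuned-model")  # hypothetical path
    result = handler(
        html_input="<html><body><h1>Example page</h1></body></html>",
        prompt_text="Extract the main heading from the following HTML.",
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])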