"""Inference endpoint handler: loads a 4-bit Unsloth model with LoRA adapters and serves text generation."""

from unsloth import FastLanguageModel
import torch
import time

class EndpointHandler:
    def __init__(self, model_path=""):
        print('Initializing the model....')
        # Load the base model and tokenizer in 4-bit precision via Unsloth.
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=2048,
            dtype=None,        # None lets Unsloth auto-select float16/bfloat16
            load_in_4bit=True,
        )

        # Fall back to the EOS token when the tokenizer ships without a pad
        # token, so padded batches can still be built.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Attaching LoRA adapters....')
        # Wrap the model with LoRA adapters on the attention and MLP projections.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=False,
            loftq_config=None,
        )

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # bitsandbytes 4-bit models are placed on the GPU at load time and cannot
        # be moved with .to(); the device is recorded here only for input tensors.
        print('Initialization completed....')

    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.

        This method formats the input with the provided prompt text and sends it
        to the model.
        """
        # Alpaca-style prompt template: instruction, input, and an empty
        # response slot for the model to complete.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
{} |
|
|
|
### Input: |
|
{} |
|
|
|
### Response: |
|
{}""" |
|

        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,
                    html_input,
                    "",  # response slot left empty for the model to fill in
                )
            ],
            return_tensors="pt",
        ).to(self.device)

        # Time only the generation step. Note: max_new_tokens=8000 exceeds the
        # max_seq_length of 2048 configured above, so very long generations are
        # effectively bounded by the model's context window.
        start_time = time.time()
        outputs = self.model.generate(**inputs, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        inference_time = end_time - start_time
        # Count only newly generated tokens; the prompt tokens are excluded.
        tokens_generated = outputs.shape[-1] - inputs["input_ids"].shape[-1]
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s",
        }
        return response
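

# Minimal usage sketch, assuming a CUDA machine with unsloth installed; the
# checkpoint path, HTML snippet, and prompt below are illustrative placeholders,
# not values from the handler itself.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="unsloth/llama-3-8b-bnb-4bit")  # hypothetical checkpoint
    result = handler(
        html_input="<html><body><h1>Hello</h1><p>World</p></body></html>",
        prompt_text="Extract the visible text from the following HTML.",
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])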