from unsloth import FastLanguageModel
import torch
import time
class EndpointHandler:
    def __init__(self, model_path=""):
        # Load the fine-tuned model and tokenizer in 4-bit via Unsloth
        print('Testoo.... Initializing the Model....')
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,   # the model used for training
            max_seq_length=2048,
            dtype=None,              # auto-detect dtype
            load_in_4bit=True,
        )

        # Ensure that the tokenizer's pad token is set correctly
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attach LoRA adapters to the loaded model
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,                    # LoRA rank; suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
            lora_dropout=0,          # supports any, but = 0 is optimized
            bias="none",             # supports any, but = "none" is optimized
            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
            random_state=3407,
            use_rslora=False,        # rank-stabilized LoRA supported
            loftq_config=None,       # and LoftQ
        )

        print('Testoo.... Setting Cudaa....')
        # Keep the device handy for moving inputs. The 4-bit model is already
        # placed on GPU by from_pretrained, and calling .to() on a quantized
        # model raises an error, so the model itself is not moved here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Testoo.... Initialization Completed....')
    def __call__(self, html_input, prompt_text=""):
        """
        Generate a response based on HTML input and a custom prompt text.

        The instruction (prompt_text) and the HTML input are formatted into an
        Alpaca-style prompt and sent to the model.
        """
        # Alpaca-style prompt template: instruction, input, and an empty
        # response slot for the model to complete.
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

        # Encode the formatted prompt and move it to the model's device
        inputs = self.tokenizer(
            [
                alpaca_prompt.format(
                    prompt_text,  # instruction
                    html_input,   # input
                    "",           # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
        ).to(self.device)

        # Generate a response; disable gradients to speed up computation and
        # time the call for performance metrics
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=8000, use_cache=True)
        end_time = time.time()

        # Decode only the newly generated tokens to text
        prompt_length = inputs["input_ids"].shape[1]
        output_text = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Compute inference time and tokens per second for performance metrics
        inference_time = end_time - start_time
        tokens_generated = outputs[0].shape[0] - prompt_length
        tokens_per_second = tokens_generated / inference_time if inference_time > 0 else 0

        # Return the formatted response along with timing and performance information
        response = {
            "response": output_text,
            "time": f"{inference_time:.2f} s",
            "tokens_per_second": f"{tokens_per_second:.2f} tokens/s"
        }
        return response
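

# A minimal local smoke test, assuming the model weights live at "./model"
# (a placeholder path) and that the handler is invoked directly with a raw
# HTML string, the same way the serving toolkit would call it. This is an
# illustrative sketch, not part of the handler API.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="./model")  # hypothetical local path
    result = handler(
        "<html><body><h1>Example page</h1></body></html>",
        prompt_text="Summarize the content of this HTML page.",  # example instruction
    )
    print(result["response"])
    print(result["time"], result["tokens_per_second"])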