Spaces:

schuler
/

experimental-kphi-3-micro-4k-instruct-gradio-autoloader

Sleeping

App Files Files Community

experimental-kphi-3-micro-4k-instruct-gradio-autoloader / app.py

schuler

Update app.py

af862b7 verified about 1 month ago

raw

history blame contribute delete

5.73 kB

	import gradio as gr

	import os, sys
	from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
	from transformers import LlamaTokenizer
	import torch
	import spaces
	import psutil

	# Define the model repository
	# REPO_NAME = 'schuler/experimental-JP47D20'
	REPO_NAME = 'schuler/experimental-JP47D21-KPhi-3-micro-4k-instruct'

	# How to cache?
	@spaces.GPU()
	def load_model(local_repo_name):
	# tokenizer = AutoTokenizer.from_pretrained(local_repo_name, trust_remote_code=True)
	tokenizer = LlamaTokenizer.from_pretrained(local_repo_name, trust_remote_code=True)
	generator_conf = GenerationConfig.from_pretrained(local_repo_name)
	model = AutoModelForCausalLM.from_pretrained(local_repo_name, trust_remote_code=True, torch_dtype=torch.bfloat16, attn_implementation="eager")
	# model.to('cuda')
	return tokenizer, generator_conf, model

	# tokenizer, generator_conf, model, generator = False, False, False, False
	# with gr.Blocks() as main_block:

	tokenizer, generator_conf, model = load_model(REPO_NAME)
	global_error = ''
	try:
	generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
	except Exception as e:
	global_error = f"Failed to load model: {str(e)}"

	@spaces.GPU()
	def local_generate(
	prompt,
	generation_config,
	max_new_tokens,
	do_sample=True,
	top_p=0.25,
	repetition_penalty=1.2,
	temperature=1.0
	):
	response_output = generator(
	prompt,
	generation_config=generation_config,
	max_new_tokens=max_new_tokens,
	do_sample=do_sample,
	top_p=top_p,
	repetition_penalty=repetition_penalty,
	temperature=temperature
	)
	generated_text = response_output[0]['generated_text']
	# Extract the assistant's response
	result = generated_text[len(prompt):]
	return result

	def respond(
	message,
	history: list[tuple[str, str]],
	system_message,
	max_tokens,
	temperature,
	top_p,
	):
	result = 'none'
	try:
	# Build the conversation prompt
	prompt = ''
	messages = []
	if (len(system_message)>0):
	prompt = "<\|assistant\|>"+system_message+f"<\|end\|>\n"
	for val in history:
	if val[0]:
	messages.append({"role": "user", "content": val[0]})
	if val[1]:
	messages.append({"role": "assistant", "content": val[1]})

	messages.append({"role": "user", "content": message})

	for hmessage in messages:
	role = "<\|assistant\|>" if hmessage['role'] == 'assistant' else "<\|user\|>"
	prompt += f"{role}{hmessage['content']}\n<\|end\|>"
	prompt += f"<\|assistant\|>"

	tokens_cnt = 0
	tokens_inc = 3
	last_token_len = 1
	full_result = ''
	while ( (tokens_cnt < max_tokens) and (last_token_len > 0) ):
	# Generate the response
	result = local_generate(
	prompt,
	generation_config=generator_conf,
	max_new_tokens=tokens_inc,
	do_sample=True,
	top_p=top_p,
	repetition_penalty=1.2,
	temperature=temperature
	)
	full_result = full_result + result
	prompt = prompt + result
	tokens_cnt = tokens_cnt + tokens_inc
	last_token_len = len(result)
	yield full_result

	except Exception as error:
	exc_type, exc_obj, exc_tb = sys.exc_info()
	fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
	result = str(error) +':'+ exc_type +':'+ fname +':'+ exc_tb.tb_lineno
	yield result

	"""
	For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
	"""
	total_params = sum(p.numel() for p in model.parameters())
	trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
	embed_params = sum(p.numel() for p in model.model.embed_tokens.parameters())*2
	non_embed_params = (trainable_params - embed_params) / 1e6
	cpu_usage = psutil.cpu_percent(interval=1)
	status_text = \
	f"This chat uses the {REPO_NAME} model with {model.get_memory_footprint() / 1e6:.2f} MB memory footprint. " + \
	f"Current CPU usage is {cpu_usage:.2f}% . '" + \
	f"Total number of non embedding trainable parameters: {non_embed_params:.2f} million. " + \
	f"You may ask questions such as 'What is biology?' or 'What is the human body?'"

	# """
	demo = gr.ChatInterface(
	respond,
	additional_inputs=[
	gr.Textbox(value="" + global_error, label="System message"),
	gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
	gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
	gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.25,
	step=0.05,
	label="Top-p (nucleus sampling)",
	),
	],
	description=status_text
	)
	"""
	with gr.Blocks() as demo:
	# Display the status text at the top
	gr.Markdown(status_text)
	# Create the ChatInterface
	chat = gr.ChatInterface(
	respond,
	additional_inputs=[
	gr.Textbox(value="" + global_error, label="System message"),
	gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
	gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
	gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.25,
	step=0.05,
	label="Top-p (nucleus sampling)",
	),
	],
	)
	"""

	if __name__ == "__main__":
	demo.launch()