|
from llama_cpp import Llama |
|
from huggingface_hub import hf_hub_download |
|
import gradio as gr |
|
from typing import Tuple, List |
|
import time |
|
|
|
# Markdown header rendered at the top of the Gradio page.
# (Was an f-string with no placeholders — plain literal is equivalent.)
DESCRIPTION = """

# Chat with Deepthought 8B as GGUF on CPU

"""

# Upper bound of the "Max New Tokens" slider in the UI.
MAX_MAX_NEW_TOKENS = 1024

# Initial slider value: default generation length per reply.
DEFAULT_MAX_NEW_TOKENS = 200
|
|
|
|
|
# Fetch the 4-bit quantized GGUF weights from the Hugging Face Hub.
# The file is cached locally, so only the first run actually downloads.
model_path = hf_hub_download(
    repo_type="model",
    repo_id="bartowski/deepthought-8b-llama-v0.01-alpha-GGUF",
    filename="deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf",
)
|
|
|
# CPU inference engine over the downloaded GGUF file.
# n_ctx must hold BOTH the rendered prompt (system + full chat history)
# and the generated completion. The previous value was exactly
# MAX_MAX_NEW_TOKENS, which left no headroom for the prompt, so any
# non-trivial history silently truncated or overflowed the context.
pipe = Llama(
    model_path=model_path,
    # 4x headroom: prompt/history plus up to MAX_MAX_NEW_TOKENS of output.
    n_ctx=4 * MAX_MAX_NEW_TOKENS,
)
|
|
|
def format_prompt(message: str, history: List[List[str]]) -> str:
    """Render the chat transcript into the model's ChatML-style prompt.

    Builds: BOS, an empty system turn, every prior [user, assistant] pair
    (assistant turns are skipped while still empty), the new user message,
    and an open assistant header for the model to complete.
    """
    parts = [
        "<|begin_of_text|>",
        "<|im_start|>system\n<|im_end|>\n",
    ]

    for user_text, assistant_text in history:
        parts.append(f"<|im_start|>user\n{user_text}<|im_end|>\n")
        # Empty assistant slot (in-progress turn) contributes nothing.
        if assistant_text:
            parts.append(f"<|im_start|>assistant\n{assistant_text}<|im_end|>\n")

    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")

    return "".join(parts)
|
|
|
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    """Stream a chat completion into the Gradio UI.

    Yields (textbox_value, history, status) tuples: the textbox is cleared,
    the last history entry grows as tokens arrive, and status reports an
    approximate tokens/second rate.

    Args:
        message: The new user message (empty input is a no-op).
        history: Gradio chat history as [user, assistant] pairs; mutated
            in place so the Chatbot component updates.
        max_new_tokens: Generation cap, driven by the UI slider.
        progress: Gradio progress tracker (unused; kept for the UI wiring).
    """
    if not message:
        # Bug fix: `return a, b, c` inside a generator is swallowed by
        # StopIteration and never reaches Gradio — yield the unchanged
        # state instead, then stop.
        yield "", history, ""
        return

    # Render the prompt BEFORE appending the in-progress turn, so the
    # empty assistant slot isn't emitted into the prompt.
    prompt = format_prompt(message, history)
    history.append([message, ""])

    reply = ""
    token_count = 0  # approximate: whitespace-split words, not model tokens
    start_time = time.time()

    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        # Bug fix: the ChatML-style template built by format_prompt
        # terminates turns with "<|im_end|>", so stop on it as well;
        # "</s>" is kept as a safety net.
        stop=["</s>", "<|im_end|>"],
        stream=True,
    )

    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())
        history[-1][1] = reply

        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0
        status_message = f"Tokens per second: {tps:.2f}"

        # Yield per chunk so the Chatbot streams live instead of only
        # updating once at the end.
        yield "", history, status_message
|
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    # Layout: header, chat transcript, input row (textbox + slider), status.
    gr.Markdown(DESCRIPTION)
    chat_window = gr.Chatbot()

    with gr.Row():
        user_input = gr.Textbox(placeholder="Type here and press enter")
        token_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )

    status_box = gr.Text(label="Status", interactive=False, visible=True)

    # Enter in the textbox streams predict() back into all three components.
    user_input.submit(
        predict,
        [user_input, chat_window, token_slider],
        [user_input, chat_window, status_box],
    )

demo.queue().launch()
|
|