from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time
DESCRIPTION = """
# Chat with Deepthought 8B as GGUF on CPU
"""
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200
# Download the GGUF file
model_path = hf_hub_download(
    repo_id="bartowski/deepthought-8b-llama-v0.01-alpha-GGUF",
    filename="deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf",
    repo_type="model",
)
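# The returned path points into the local Hugging Face cache (downloaded on first run)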
# Load the GGUF model; n_ctx sets the total context window (prompt + generated tokens)
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,
    # n_threads=4,      # Number of CPU threads; defaults to the number of cores
    # n_gpu_layers=1,   # Enable to offload layers to the GPU; check supported layers and GPU memory
    # n_batch=1,        # Batch size for prompt processing
    # use_mlock=True,   # Set to False to disable locking the model into RAM
    model_path=model_path,
)
def format_prompt(message: str, history: List[List[str]]) -> str:
    prompt = "<|begin_of_text|>"  # Llama begin-of-text token
    prompt += "<|im_start|>system\n<|im_end|>\n"  # Empty system prompt; ChatML role tags only
    for user_msg, assistant_msg in history:
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if assistant_msg:
            prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n"
    prompt += "<|im_start|>assistant\n"  # The model continues from the assistant turn opener
    return prompt
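# For reference, with empty history and the (hypothetical) message "Hi",
# format_prompt renders the following prompt string:
#
#   <|begin_of_text|><|im_start|>system
#   <|im_end|>
#   <|im_start|>user
#   Hi<|im_end|>
#   <|im_start|>assistant
#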
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
    if not message:
        # predict is a generator, so yield (rather than return) the unchanged state
        yield "", history, ""
        return
    prompt = format_prompt(message, history)
    history.append([message, ""])
    # Accumulate the reply for this round
    reply = ""
    # Track token count and start time for the tokens-per-second estimate
    token_count = 0
    start_time = time.time()
    # Request a streamed completion; this returns a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["<|im_end|>", "</s>"],  # The ChatML-style prompt above ends each turn with <|im_end|>
        stream=True,
    )
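    # Each streamed chunk is a dict shaped roughly like (fields abridged):
    #   {'choices': [{'text': ' Hello', 'index': 0, 'finish_reason': None}], ...}
    # so the newly generated text lives at output['choices'][0]['text'].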
    # Stream each chunk back to the UI as it arrives
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += 1  # Each streamed chunk carries one newly generated token
        history[-1][1] = reply  # Update the in-progress reply in history
        # Compute elapsed time and tokens per second
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0
        # Report throughput in the status field
        status_message = f"Tokens per second: {tps:.2f}"
        yield "", history, status_message
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    status_field = gr.Text(label="Status", interactive=False)  # Shows the tokens-per-second estimate
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()
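# To run outside a Hugging Face Space (assumed setup):
#   pip install llama-cpp-python huggingface_hub gradio
#   python app.py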