from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time

DESCRIPTION = """
# Chat with Deepthought 8B as GGUF on CPU
"""

MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200

# Download the GGUF file
model_path = hf_hub_download(
    repo_id="bartowski/deepthought-8b-llama-v0.01-alpha-GGUF",
    filename="deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf",
    repo_type="model"
)

# Load the GGUF model
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,  # Context window; must hold the prompt plus the generated tokens
    # n_threads=4,             # Number of CPU threads to use; defaults to the number of cores
    # n_gpu_layers=1,          # Enable to offload layers to the GPU; check supported layers and GPU memory
    # n_batch=1,               # Prompt processing batch size
    # use_mlock=True,          # Lock the model in RAM; set to False to disable
    model_path=model_path
)


def format_prompt(message: str, history: List[List[str]]) -> str:
    prompt = "<|begin_of_text|>"  # Start with the begin-of-text token
    prompt += "<|im_start|>system\n<|im_end|>\n"  # No system prompt here, just the empty system role tags
    for user_msg, assistant_msg in history:
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if assistant_msg:
            prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n"
    prompt += "<|im_start|>assistant\n"  # Start of the assistant's turn
    return prompt


def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    if not message:
        yield "", history, ""
        return

    prompt = format_prompt(message, history)
    history.append([message, ""])  # Initialize the reply for this round
    reply = ""

    # Initialize token count and start time
    token_count = 0
    start_time = time.time()

    # This produces a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],  # Stop at the end-of-turn tag used in format_prompt
        stream=True
    )

    # Stream each chunk back to the user as it arrives
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())  # Rough token estimate via whitespace splitting
        history[-1][1] = reply  # Update the current reply in history

        # Calculate elapsed time and tokens per second
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0

        # Report the generation speed in the status field
        status_message = f"Tokens per second: {tps:.2f}"
        yield "", history, status_message


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Status field for tokens per second
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()
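
# Usage (a minimal sketch; the install command and the filename below are assumptions, not part of the script itself):
#   pip install llama-cpp-python huggingface_hub gradio
#   python app.py  # replace "app.py" with whatever name this file is saved under
# Gradio serves the chat UI locally, by default at http://127.0.0.1:7860.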