|
from llama_cpp import Llama |
|
from huggingface_hub import hf_hub_download |
|
import gradio as gr |
|
from typing import Tuple, List |
|
import time |
|
|
|
# Markdown header rendered at the top of the Gradio page.
# (Was an f-string with no placeholders — plain literal is equivalent.)
DESCRIPTION = """

# Chat with Deepthought 8B as GGUF on CPU

"""

# Upper bound of the "Max New Tokens" slider in the UI.
MAX_MAX_NEW_TOKENS = 1024

# Initial slider value: default generation length per reply.
DEFAULT_MAX_NEW_TOKENS = 200
|
|
|
|
|
# Fetch the 4-bit quantized GGUF weights from the Hugging Face Hub.
# The file is cached locally, so only the first run actually downloads.
model_path = hf_hub_download(
    repo_type="model",
    repo_id="bartowski/deepthought-8b-llama-v0.01-alpha-GGUF",
    filename="deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf",
)
|
|
|
# CPU inference engine over the downloaded GGUF file.
# n_ctx must hold BOTH the rendered prompt (system + full chat history)
# and the generated completion. The previous value was exactly
# MAX_MAX_NEW_TOKENS, which left no headroom for the prompt, so any
# non-trivial history silently truncated or overflowed the context.
pipe = Llama(
    model_path=model_path,
    # 4x headroom: prompt/history plus up to MAX_MAX_NEW_TOKENS of output.
    n_ctx=4 * MAX_MAX_NEW_TOKENS,
)
|
|
|
def format_prompt(message: str, history: List[List[str]]) -> str:
    """Render the chat transcript into the model's ChatML-style prompt.

    Builds: BOS, an empty system turn, every prior [user, assistant] pair
    (assistant turns are skipped while still empty), the new user message,
    and an open assistant header for the model to complete.
    """
    parts = [
        "<|begin_of_text|>",
        "<|im_start|>system\n<|im_end|>\n",
    ]

    for user_text, assistant_text in history:
        parts.append(f"<|im_start|>user\n{user_text}<|im_end|>\n")
        # Empty assistant slot (in-progress turn) contributes nothing.
        if assistant_text:
            parts.append(f"<|im_start|>assistant\n{assistant_text}<|im_end|>\n")

    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")

    return "".join(parts)
|
|
|
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    """Stream a chat completion into the Gradio UI.

    Yields (textbox_value, history, status) tuples: the textbox is cleared,
    the last history entry grows as tokens arrive, and status reports an
    approximate tokens/second rate.

    Args:
        message: The new user message (empty input is a no-op).
        history: Gradio chat history as [user, assistant] pairs; mutated
            in place so the Chatbot component updates.
        max_new_tokens: Generation cap, driven by the UI slider.
        progress: Gradio progress tracker (unused; kept for the UI wiring).
    """
    if not message:
        # Bug fix: `return a, b, c` inside a generator is swallowed by
        # StopIteration and never reaches Gradio — yield the unchanged
        # state instead, then stop.
        yield "", history, ""
        return

    # Render the prompt BEFORE appending the in-progress turn, so the
    # empty assistant slot isn't emitted into the prompt.
    prompt = format_prompt(message, history)
    history.append([message, ""])

    reply = ""
    token_count = 0  # approximate: whitespace-split words, not model tokens
    start_time = time.time()

    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        # Bug fix: the ChatML-style template built by format_prompt
        # terminates turns with "<|im_end|>", so stop on it as well;
        # "</s>" is kept as a safety net.
        stop=["</s>", "<|im_end|>"],
        stream=True,
    )

    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())
        history[-1][1] = reply

        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0
        status_message = f"Tokens per second: {tps:.2f}"

        # Yield per chunk so the Chatbot streams live instead of only
        # updating once at the end.
        yield "", history, status_message
|
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    # Layout: header, chat transcript, input row (textbox + slider), status.
    gr.Markdown(DESCRIPTION)
    chat_window = gr.Chatbot()

    with gr.Row():
        user_input = gr.Textbox(placeholder="Type here and press enter")
        token_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )

    status_box = gr.Text(label="Status", interactive=False, visible=True)

    # Enter in the textbox streams predict() back into all three components.
    user_input.submit(
        predict,
        [user_input, chat_window, token_slider],
        [user_input, chat_window, status_box],
    )

demo.queue().launch()
|
|