import time
from typing import List

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""

MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200

# Download the quantized GGUF weights from the Hugging Face Hub (cached locally after the first run).
model_path = hf_hub_download(
    repo_id="TobDeBer/arco-Q4_K_M-GGUF",
    filename="arco-q4_k_m.gguf",
    repo_type="model",
)

# Load the GGUF model with llama.cpp on CPU; n_ctx caps the context window at MAX_MAX_NEW_TOKENS tokens.
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,
    model_path=model_path,
)
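
# Optional smoke test (a sketch, not part of the original app): run one short non-streaming
# completion to confirm the weights load before launching the UI.
# print(pipe("Hello", max_tokens=8)["choices"][0]["text"])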


def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    # Ignore empty submissions; yield so Gradio still receives outputs and the textbox is cleared.
    if not message:
        yield "", history
        return

    prompt = message
    history.append([message, ""])

    reply = ""
    token_count = 0
    start_time = time.time()

    # Request a streamed completion so text arrives chunk by chunk.
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True,
    )

    for output in stream:
        new_text = output["choices"][0]["text"]
        reply += new_text
        # Rough throughput estimate: counts whitespace-separated words, not model tokens.
        token_count += len(new_text.split())
        history[-1][1] = reply
        # Yield each chunk so the chatbot updates while generation is still running.
        yield "", history

    elapsed_time = time.time() - start_time
    tps = token_count / elapsed_time if elapsed_time > 0 else 0

    # gr.Progress is called with a fraction and a description; report throughput once generation ends.
    progress(1.0, desc=f"Tokens per second: {tps:.2f}")

    yield "", history
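
# Hypothetical way to exercise predict outside the UI (history is a list of [user, assistant] pairs):
#   for _, updated_history in predict("Hello", []):
#       pass
#   print(updated_history[-1][1])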


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    # Status display; not currently wired to an event output.
    status_field = gr.Text(label="Status", interactive=False, visible=True)
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])

# queue() lets Gradio serve the generator-based (streaming) handler.
demo.queue().launch()
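
# To run this demo locally (assuming the file is saved as app.py):
#   pip install llama-cpp-python huggingface_hub gradio
#   python app.py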