from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time

DESCRIPTION = """
# Chat with Deepthought 8B as GGUF on CPU
"""

MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200

# Download the GGUF file
model_path = hf_hub_download(
    repo_id="bartowski/deepthought-8b-llama-v0.01-alpha-GGUF",
    filename="deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf",
    repo_type="model"
)

# Load the GGUF model
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,  # Context window; must hold the prompt plus the generated tokens
    # n_threads=4,             # Number of CPU threads to use; defaults to the number of cores
    # n_gpu_layers=1,          # Enable to offload layers to the GPU; check supported layers and GPU memory
    # n_batch=1,               # Prompt processing batch size
    # use_mlock=True,          # Lock the model in RAM; set to False to disable
    model_path=model_path
)


def format_prompt(message: str, history: List[List[str]]) -> str:
    prompt = "<|begin_of_text|>"  # Start with the begin-of-text token
    prompt += "<|im_start|>system\n<|im_end|>\n"  # No system prompt here, just the empty system role tags
    for user_msg, assistant_msg in history:
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if assistant_msg:
            prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n"
    prompt += "<|im_start|>assistant\n"  # Start of the assistant's turn
    return prompt


def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    if not message:
        yield "", history, ""
        return

    prompt = format_prompt(message, history)
    history.append([message, ""])  # Initialize the reply for this round
    reply = ""

    # Initialize token count and start time
    token_count = 0
    start_time = time.time()

    # This produces a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],  # Stop at the end-of-turn tag used in format_prompt
        stream=True
    )

    # Stream each chunk back to the user as it arrives
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())  # Rough token estimate via whitespace splitting
        history[-1][1] = reply  # Update the current reply in history

        # Calculate elapsed time and tokens per second
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0

        # Report the generation speed in the status field
        status_message = f"Tokens per second: {tps:.2f}"
        yield "", history, status_message


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Status field for tokens per second
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()
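
# Usage (a minimal sketch; the install command and the filename below are assumptions, not part of the script itself):
#   pip install llama-cpp-python huggingface_hub gradio
#   python app.py  # replace "app.py" with whatever name this file is saved under
# Gradio serves the chat UI locally, by default at http://127.0.0.1:7860.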