from collections.abc import Iterator
from datetime import datetime
from pathlib import Path
from threading import Thread
from huggingface_hub import hf_hub_download
from typing import Iterator, List, Dict

from openai import OpenAI
import subprocess
import gradio as gr

"""
LLAMA_CPP_SERVER = "http://127.0.0.1:8080"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05
"""


# download GGUF into local directory
gguf_path = hf_hub_download(
            repo_id="bartowski/google_gemma-3-1b-it-GGUF",
            filename="google_gemma-3-1b-it-Q6_K.gguf",
            local_dir="."
        )

# start llama-server
subprocess.run(["chmod", "+x", "llama-server"])
command = ["./llama-server", "-m", "google_gemma-3-1b-it-Q6_K.gguf", "-ngl", "0", "-c", "8192", "-t", "8", "--port", "8081"]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")


# when using llamacpp-server, you need to check if the stream chunk is present
# usually the first and the last chunk are empty and will throw an error
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks

example = """
#### Example for Image Generation help
```
I want to create an image with Flux but I need assistance for a good prompt. 
The image should be about '''[userinput]'''. Comic art style.
```
"""
note = """#### 🔹 Gemma 3 1B Instruct
> Gemma 3, a collection of lightweight, state-of-the-art open models built from the same research and technology that powers our Gemini 2.0 models. 
<br>

These are the Google most advanced, portable and responsibly developed open models yet. 
<br>
>They are designed to run fast, directly on devices — from phones and laptops to workstations. 
<br>Gemma 3 comes in a range of sizes (1B, 4B, 12B and 27B).
<br><br>

Starting settings: `Temperature=0.45` `Max_Length=1100`
"""

modelname = 'google_gemma-3-1b-it'
NCTX = 8192
print(f"Starting llamacpp server for {modelname} Context length={NCTX} tokens...")

with gr.Blocks(theme=gr.themes.Citrus()) as demo: #gr.themes.Ocean() #https://www.gradio.app/guides/theming-guide
    gr.Markdown("# Chat with Gemma 3 1b Instruct - running Locally with llama.cpp")
    with gr.Row():
        with gr.Column(scale=1):
            maxlen = gr.Slider(minimum=250, maximum=4096, value=1100, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.45, step=0.1, label="Temperature")          
            APIKey = gr.Textbox(value="not-needed", 
                        label="LlamaCPP API key",
                        type='password',placeholder='Not required',)
            gr.Markdown(note)
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(type="messages",show_copy_button = True,
                    avatar_images=['https://i.ibb.co/m588VrQ6/fabio-Matricardi.png','https://clipartcraft.com/images/transparent-background-google-logo-brand-2.png'],
                    height=480, layout='panel')
            msg = gr.Textbox(lines=3)
            gr.Markdown(example)
            clear = gr.ClearButton([msg, chatbot])

    def user(user_message, history: list):
        return "", history + [{"role": "user", "content": user_message}]    

    def respond(chat_history, api,t,m):
        STOPS = ['<eos>']
        client = OpenAI(base_url="http://127.0.0.1:8081/v1", api_key="not-needed", organization='Gemma3')
        stream = client.chat.completions.create(     
            messages=chat_history,
            model='Gemma 3 1B Instruct',
            max_tokens=m,
            stream=True,
            temperature=t,
            stop=STOPS)
        chat_history.append({"role": "assistant", "content": ""})
        for chunk in stream:
            if chunk.choices[0].delta.content:
                chat_history[-1]['content'] += chunk.choices[0].delta.content

            yield chat_history


    msg.submit(user, [msg, chatbot], [msg, chatbot]).then(respond, [chatbot,APIKey,temperature,maxlen], [chatbot])

if __name__ == "__main__":
    demo.queue().launch()