from collections.abc import Iterator
from datetime import datetime
from pathlib import Path
from threading import Thread
from huggingface_hub import hf_hub_download
from typing import Iterator, List, Dict
from openai import OpenAI
import subprocess
import gradio as gr
"""
LLAMA_CPP_SERVER = "http://127.0.0.1:8080"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05
"""
# download GGUF into local directory
gguf_path = hf_hub_download(
repo_id="bartowski/google_gemma-3-1b-it-GGUF",
filename="google_gemma-3-1b-it-Q6_K.gguf",
local_dir="."
)
# start llama-server
subprocess.run(["chmod", "+x", "llama-server"])
command = ["./llama-server", "-m", "google_gemma-3-1b-it-Q6_K.gguf", "-ngl", "0", "-c", "8192", "-t", "8", "--port", "8081"]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")
# when using llamacpp-server, you need to check if the stream chunk is present
# usually the first and the last chunk are empty and will throw an error
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
example = """
#### Example for Image Generation help
```
I want to create an image with Flux but I need assistance for a good prompt.
The image should be about '''[userinput]'''. Comic art style.
```
"""
note = """#### 🔹 Gemma 3 1B Instruct
> Gemma 3, a collection of lightweight, state-of-the-art open models built from the same research and technology that powers our Gemini 2.0 models.
These are the Google most advanced, portable and responsibly developed open models yet.
>They are designed to run fast, directly on devices — from phones and laptops to workstations.
Gemma 3 comes in a range of sizes (1B, 4B, 12B and 27B).
Starting settings: `Temperature=0.45` `Max_Length=1100`
"""
modelname = 'google_gemma-3-1b-it'
NCTX = 8192
print(f"Starting llamacpp server for {modelname} Context length={NCTX} tokens...")
with gr.Blocks(theme=gr.themes.Citrus()) as demo: #gr.themes.Ocean() #https://www.gradio.app/guides/theming-guide
gr.Markdown("# Chat with Gemma 3 1b Instruct - running Locally with llama.cpp")
with gr.Row():
with gr.Column(scale=1):
maxlen = gr.Slider(minimum=250, maximum=4096, value=1100, step=1, label="Max new tokens")
temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.45, step=0.1, label="Temperature")
APIKey = gr.Textbox(value="not-needed",
label="LlamaCPP API key",
type='password',placeholder='Not required',)
gr.Markdown(note)
with gr.Column(scale=3):
chatbot = gr.Chatbot(type="messages",show_copy_button = True,
avatar_images=['https://i.ibb.co/m588VrQ6/fabio-Matricardi.png','https://clipartcraft.com/images/transparent-background-google-logo-brand-2.png'],
height=480, layout='panel')
msg = gr.Textbox(lines=3)
gr.Markdown(example)
clear = gr.ClearButton([msg, chatbot])
def user(user_message, history: list):
return "", history + [{"role": "user", "content": user_message}]
def respond(chat_history, api,t,m):
STOPS = ['']
client = OpenAI(base_url="http://127.0.0.1:8081/v1", api_key="not-needed", organization='Gemma3')
stream = client.chat.completions.create(
messages=chat_history,
model='Gemma 3 1B Instruct',
max_tokens=m,
stream=True,
temperature=t,
stop=STOPS)
chat_history.append({"role": "assistant", "content": ""})
for chunk in stream:
if chunk.choices[0].delta.content:
chat_history[-1]['content'] += chunk.choices[0].delta.content
yield chat_history
msg.submit(user, [msg, chatbot], [msg, chatbot]).then(respond, [chatbot,APIKey,temperature,maxlen], [chatbot])
if __name__ == "__main__":
demo.queue().launch()