import gradio as gr
#from llama_cpp import Llama
import random
import subprocess
import time

# Initialize model
#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))

"""
def generate_response(user_message):
    encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
    tokens = llm.tokenize(encodeduserm)
    output = b""
    count = 0
    for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
        text = llm.detokenize([token])
        output += text
        count += 1
        if count >= 500 or (token == llm.token_eos()):
            break
    return output.decode()
"""

"""
def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    print("After response")
    return result.stdout
"""

def generate_response(user_message):
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]

    # Start the subprocess
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    start_time = time.time()
    alllines = ""

    # Yield each line of output as it becomes available
    for line in process.stdout:
        alllines += " " + line
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        yield f"{alllines} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")

iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the LLaMA model.",
    flagging_dir="/usr/src/app/flagged",
)

iface.launch(server_name="0.0.0.0")  # share=True