import os

import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the Inference Client
client = InferenceClient(model="RekaAI/reka-flash-3", token=os.getenv("HF_TOKEN"))


# Helper function to format the conversation history into a prompt
def format_history(history):
    prompt = "You are a helpful and harmless assistant.\n\n"
    for item in history:
        if item["role"] == "user":
            prompt += f"Human: {item['content']}\n"
        elif item["role"] == "assistant":
            prompt += f"Assistant: {item['content']}\n"
    prompt += "Assistant:"
    return prompt


# Function to handle message submission and response generation
def submit(message, history, temperature, max_new_tokens, top_p, top_k):
    # Add the user's message to the history
    history = history + [{"role": "user", "content": message}]

    # Add a "Thinking..." message to simulate the model's reasoning phase
    thinking_message = {"role": "assistant", "content": "Thinking..."}
    history = history + [thinking_message]
    yield history, history  # Update chatbot and state

    # Format the prompt, excluding the "Thinking..." placeholder
    prompt = format_history(history[:-1])

    # Stream the response from the Inference API
    response = client.text_generation(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=1.0,
        stop_sequences=["\nHuman:", "\nAssistant:"],
        stream=True,
    )

    # Simulate the "thinking" phase with the first 5 chunks
    thought_chunks = 0
    max_thought_chunks = 5
    accumulated_thought = ""
    for chunk in response:
        if thought_chunks < max_thought_chunks:
            accumulated_thought += chunk
            thinking_message["content"] = "Thinking: " + accumulated_thought
            thought_chunks += 1
            if thought_chunks == max_thought_chunks:
                # Finalize the "Thought" message and start the "Answer" message
                thinking_message["content"] = "Thought: " + accumulated_thought
                answer_message = {"role": "assistant", "content": "Answer:"}
                history = history + [answer_message]
        else:
            # Append subsequent chunks to the "Answer" message
            answer_message["content"] += chunk
        yield history, history  # Update the UI with each chunk

    # Finalize the response
    if 'answer_message' in locals():
        answer_message["content"] += "\n\n[End of response]"
    else:
        thinking_message["content"] += "\n\n[No response generated]"
    yield history, history


# Build the Gradio interface
with gr.Blocks() as demo:
    # State to store the conversation history
    history_state = gr.State([])

    # Chatbot component to display messages
    chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")

    # Layout with settings and input area
    with gr.Row():
        with gr.Column(scale=1):
            # Advanced settings in a collapsible panel
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7)
                max_tokens = gr.Slider(label="Max Tokens", minimum=1, maximum=1024, step=1, value=512)
                top_p = gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
                top_k = gr.Slider(label="Top K", minimum=1, maximum=100, step=1, value=50)
        with gr.Column(scale=4):
            # Textbox for user input and buttons
            textbox = gr.Textbox(label="Your message")
            submit_btn = gr.Button("Submit")
            clear_btn = gr.Button("Clear")

    # Connect the submit button to the submit function
    submit_btn.click(
        submit,
        inputs=[textbox, history_state, temperature, max_tokens, top_p, top_k],
        outputs=[chatbot, history_state],
    )

    # Clear button resets the conversation
    clear_btn.click(lambda: ([], []), outputs=[chatbot, history_state])


# Launch the application
if __name__ == "__main__":
    demo.queue().launch()
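
# A minimal standalone sketch for sanity-checking the same streaming
# `text_generation` call the app relies on. It is kept entirely in comments:
# live code here would run on import, and when executed as a script it would
# only run after `demo.queue().launch()` above returns. It assumes HF_TOKEN is
# set in the environment and that the model is reachable via the Inference
# API; the prompt text is an illustrative example, not part of the app.
#
#   import os
#   from huggingface_hub import InferenceClient
#
#   client = InferenceClient(model="RekaAI/reka-flash-3", token=os.getenv("HF_TOKEN"))
#   for token in client.text_generation(
#       "You are a helpful and harmless assistant.\n\nHuman: Hello!\nAssistant:",
#       max_new_tokens=64,
#       stream=True,
#   ):
#       print(token, end="", flush=True)  # tokens arrive incrementally
#   print()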