import os
import gradio as gr
from huggingface_hub import InferenceClient
# Initialize the Inference Client
client = InferenceClient(model="RekaAI/reka-flash-3", token=os.getenv("HF_TOKEN"))
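# Note: HF_TOKEN must be available in the environment (e.g. as a Space secret or
# an exported variable); if it is unset, os.getenv returns None and the client
# falls back to unauthenticated requests, which may be rate-limited or rejected.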
# Helper function to format the conversation history into a prompt
def format_history(history):
    prompt = "You are a helpful and harmless assistant.\n\n"
    for item in history:
        if item["role"] == "user":
            prompt += f"Human: {item['content']}\n"
        elif item["role"] == "assistant":
            prompt += f"Assistant: {item['content']}\n"
    prompt += "Assistant:"
    return prompt
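# For illustration (hypothetical input), a history of
#   [{"role": "user", "content": "Hello"}]
# is formatted as:
#
#   You are a helpful and harmless assistant.
#
#   Human: Hello
#   Assistant: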
# Function to handle message submission and response generation
def submit(message, history, temperature, max_new_tokens, top_p, top_k):
    # Add the user's message to the history
    history = history + [{"role": "user", "content": message}]
    # Add a "Thinking..." message to simulate the model's reasoning phase
    thinking_message = {"role": "assistant", "content": "Thinking..."}
    history = history + [thinking_message]
    yield history, history  # Update chatbot and state
    # Format the prompt, excluding the "Thinking..." placeholder
    prompt = format_history(history[:-1])
    # Stream the response from the Inference API
    response = client.text_generation(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=1.0,
        stop_sequences=["\nHuman:", "\nAssistant:"],
        stream=True,
    )
    # Treat the first few chunks as the "thinking" phase
    thought_chunks = 0
    max_thought_chunks = 5
    accumulated_thought = ""
    answer_message = None
    for chunk in response:
        if thought_chunks < max_thought_chunks:
            accumulated_thought += chunk
            thinking_message["content"] = "Thinking: " + accumulated_thought
            thought_chunks += 1
            if thought_chunks == max_thought_chunks:
                # Finalize the "Thought" message and start the "Answer" message
                thinking_message["content"] = "Thought: " + accumulated_thought
                answer_message = {"role": "assistant", "content": "Answer:"}
                history = history + [answer_message]
        else:
            # Append subsequent chunks to the "Answer" message
            answer_message["content"] += chunk
        yield history, history  # Update the UI with each chunk
    # Finalize the response
    if answer_message is not None:
        answer_message["content"] += "\n\n[End of response]"
    else:
        thinking_message["content"] += "\n\n[No response generated]"
    yield history, history
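# Because submit() is a generator, Gradio streams each yielded (chatbot, state)
# pair to the UI as it is produced, which is why the app is launched with
# demo.queue() below.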
# Build the Gradio interface
with gr.Blocks() as demo:
    # State to store the conversation history
    history_state = gr.State([])
    # Chatbot component to display messages
    chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")
    # Layout with settings and input area
    with gr.Row():
        with gr.Column(scale=1):
            # Advanced settings in a collapsible panel
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7)
                max_tokens = gr.Slider(label="Max Tokens", minimum=1, maximum=1024, step=1, value=512)
                top_p = gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
                top_k = gr.Slider(label="Top K", minimum=1, maximum=100, step=1, value=50)
        with gr.Column(scale=4):
            # Textbox for user input and buttons
            textbox = gr.Textbox(label="Your message")
            submit_btn = gr.Button("Submit")
            clear_btn = gr.Button("Clear")
    # Connect the submit button to the submit function
    submit_btn.click(
        submit,
        inputs=[textbox, history_state, temperature, max_tokens, top_p, top_k],
        outputs=[chatbot, history_state]
    )
    # Clear button resets the conversation
    clear_btn.click(lambda: ([], []), outputs=[chatbot, history_state])
# Launch the application
if __name__ == "__main__":
    demo.queue().launch()
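# When run locally, Gradio serves the app on http://127.0.0.1:7860 by default;
# on a Hugging Face Space, hosting and launch settings are handled by the platform.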