Spaces:

ZoroaStrella
/

RekaFlash

Running on Zero

App Files Community

ZoroaStrella committed on Mar 11

Commit

646a0c2

1 Parent(s): ce9b3a4

correct the model loading

Browse files

Files changed (2) hide show

app.py +113 -99
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,15 +1,30 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
 DEFAULT_MAX_LENGTH = 1024
 DEFAULT_TEMPERATURE = 0.7
-# System prompt
 SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
 Provide detailed, helpful answers while maintaining safety.
-Format responses clearly using markdown when appropriate."""
 def generate_response(
     message,
@@ -24,69 +39,87 @@ def generate_response(
     frequency_penalty,
     show_reasoning
 ):
-    # Format the prompt
-    formatted_prompt = f"System: {system_prompt}\n\nUser: {message}\n\nAssistant:"
-    # Create client
-    client = InferenceClient()
-    # Generate response
-    response = client.text_generation(
-        MODEL_NAME,
-        prompt=formatted_prompt,
-        max_new_tokens=max_length,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        presence_penalty=presence_penalty,
-        frequency_penalty=frequency_penalty,
-        details=show_reasoning,
-    )
-    # Extract reasoning and final answer if available
-    reasoning = ""
-    final_answer = response
-    if show_reasoning and hasattr(response, 'details'):
-        reasoning = response.details.get('reasoning', '')
-        final_answer = response.generated_text
-    # Update chat history
-    chat_history.append((message, final_answer))
-    # Create full history with reasoning
-    full_history = list(chat_history)
-    if show_reasoning and reasoning:
-        full_history[-1] = (full_history[-1][0], f"{final_answer}\n\nREASONING:\n{reasoning}")
-    return "", chat_history, reasoning if show_reasoning else ""
 # UI Components
 with gr.Blocks(title="Reka Flash-3 Chat Demo", theme=gr.themes.Soft()) as demo:
     # Header Section
-    gr.Markdown(f"""
     # Reka Flash-3 Chat Interface
     *Powered by [Reka Core AI](https://www.reka.ai/)*
     """)
     # Deployment Notice
     with gr.Accordion("Important Deployment Notice", open=True):
-        gr.Markdown(f"""
-        **To deploy this model on Hugging Face Spaces:**
-        1. Request access to Reka Flash-3 from [Hugging Face Hub](https://huggingface.co/{MODEL_NAME})
-        2. Ensure you have Hugging Face PRO subscription
-        3. Add your HF token in Space settings
-        4. Set `GPU_SMALL` or higher in Space hardware settings
-        """)
     # Chat Interface
     with gr.Row():
-        chatbot = gr.Chatbot(height=500)
         reasoning_display = gr.Textbox(
             label="Model Reasoning",
             interactive=False,
             visible=True,
-            lines=20,
             max_lines=20
         )
@@ -100,70 +133,51 @@ with gr.Blocks(title="Reka Flash-3 Chat Demo", theme=gr.themes.Soft()) as demo:
         )
         submit_btn = gr.Button("Send", variant="primary")
-    # Parameters
-    with gr.Accordion("Normal Options", open=False):
         with gr.Row():
-            max_length = gr.Slider(128, 4096, value=DEFAULT_MAX_LENGTH, label="Max Length")
-            temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature")
     with gr.Accordion("Advanced Options", open=False):
         with gr.Row():
-            top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
-            top_k = gr.Slider(1, 100, value=50, label="Top-k")
-            repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty")
         with gr.Row():
-            presence_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Presence Penalty")
-            frequency_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Frequency Penalty")
     # System Prompt
     system_prompt = gr.Textbox(
         label="System Prompt",
         value=SYSTEM_PROMPT,
-        lines=3
     )
     # Debug Options
-    show_reasoning = gr.Checkbox(
-        label="Show Model Reasoning",
-        value=True
-    )
     # Event Handling
-    submit_btn.click(
-        generate_response,
-        inputs=[
-            message,
-            chatbot,
-            system_prompt,
-            max_length,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            presence_penalty,
-            frequency_penalty,
-            show_reasoning
-        ],
-        outputs=[message, chatbot, reasoning_display]
-    )
-    message.submit(
-        generate_response,
-        inputs=[
-            message,
-            chatbot,
-            system_prompt,
-            max_length,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            presence_penalty,
-            frequency_penalty,
-            show_reasoning
-        ],
-        outputs=[message, chatbot, reasoning_display]
-    )
-# Deployment instructions
 demo.launch(debug=True)

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
 DEFAULT_MAX_LENGTH = 1024
 DEFAULT_TEMPERATURE = 0.7
+# System prompt with instructions for reasoning
 SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
 Provide detailed, helpful answers while maintaining safety.
+Format responses clearly using markdown when appropriate.
+When asked a question, think step by step inside <thinking> tags, then provide your final answer after </thinking> tags. For example:
+User: What is 2+2?
+Assistant: <thinking>
+Let me calculate that. 2 plus 2 equals 4.
+</thinking>
+The answer is 4."""
+# Load model and tokenizer (assuming CPU-only for zero GPU)
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu", torch_dtype=torch.float32)
+except Exception as e:
+    raise Exception(f"Failed to load model: {str(e)}. Ensure you have access to {MODEL_NAME} and sufficient CPU memory.")
 def generate_response(
     message,
     frequency_penalty,
     show_reasoning
 ):
+    """
+    Generate a response from Reka Flash-3, parsing reasoning and final answer.
+    """
+    try:
+        # Format the prompt with thinking tags
+        formatted_prompt = f"{system_prompt}\n\nUser: {message}\n\nAssistant: <thinking>\n"
+        # Tokenize input
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
+        # Generate response
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_length,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 0
+        )
+        # Decode the generated text
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        response = response[len(formatted_prompt):]  # Remove the prompt from the output
+        # Parse reasoning and final answer
+        if "</thinking>" in response:
+            reasoning, final_answer = response.split("</thinking>", 1)
+            reasoning = reasoning.strip()
+            final_answer = final_answer.strip()
+        else:
+            reasoning = ""
+            final_answer = response.strip()
+        # Update chat history with final answer
+        chat_history.append((message, final_answer))
+        # Display reasoning if requested
+        reasoning_display = reasoning if show_reasoning and reasoning else ""
+        if reasoning_display:
+            reasoning_display = f"**Reasoning:**\n{reasoning_display}"
+        return "", chat_history, reasoning_display
+    except Exception as e:
+        error_msg = f"Error generating response: {str(e)}"
+        gr.Warning(error_msg)
+        return "", chat_history, error_msg
 # UI Components
 with gr.Blocks(title="Reka Flash-3 Chat Demo", theme=gr.themes.Soft()) as demo:
     # Header Section
+    gr.Markdown("""
     # Reka Flash-3 Chat Interface
     *Powered by [Reka Core AI](https://www.reka.ai/)*
     """)
     # Deployment Notice
     with gr.Accordion("Important Deployment Notice", open=True):
+        gr.Textbox(
+            value="""To deploy this model on Hugging Face Spaces:
+1. Request the Reka Flash-3 OSS model from Reka AI (https://www.reka.ai/).
+2. Use a Hugging Face Pro subscription for deployment.
+3. Configure your Space with zero GPU (CPU-only) hardware.
+4. Ensure sufficient CPU memory for the 3B parameter model.""",
+            label="Deployment Instructions",
+            lines=5,
+            interactive=False
+        )
     # Chat Interface
     with gr.Row():
+        chatbot = gr.Chatbot(height=500, label="Conversation")
         reasoning_display = gr.Textbox(
             label="Model Reasoning",
             interactive=False,
             visible=True,
+            lines=10,
             max_lines=20
         )
         )
         submit_btn = gr.Button("Send", variant="primary")
+    # Normal Options
+    with gr.Accordion("Normal Options", open=True):
         with gr.Row():
+            max_length = gr.Slider(128, 4096, value=DEFAULT_MAX_LENGTH, label="Max Length", step=128)
+            temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature", step=0.1)
+    # Advanced Options
     with gr.Accordion("Advanced Options", open=False):
         with gr.Row():
+            top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p", step=0.05)
+            top_k = gr.Slider(1, 100, value=50, label="Top-k", step=1)
+            repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty", step=0.1)
         with gr.Row():
+            presence_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Presence Penalty", step=0.1)
+            frequency_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Frequency Penalty", step=0.1)
     # System Prompt
     system_prompt = gr.Textbox(
         label="System Prompt",
         value=SYSTEM_PROMPT,
+        lines=5,
+        max_lines=10
     )
     # Debug Options
+    show_reasoning = gr.Checkbox(label="Show Model Reasoning", value=True)
     # Event Handling
+    inputs = [
+        message,
+        chatbot,
+        system_prompt,
+        max_length,
+        temperature,
+        top_p,
+        top_k,
+        repetition_penalty,
+        presence_penalty,
+        frequency_penalty,
+        show_reasoning
+    ]
+    outputs = [message, chatbot, reasoning_display]
+    submit_btn.click(generate_response, inputs=inputs, outputs=outputs)
+    message.submit(generate_response, inputs=inputs, outputs=outputs)
+# Launch the interface
 demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 gradio>=3.50
-huggingface_hub==0.25.2

 gradio>=3.50
+huggingface_hub==0.25.2
+torch