Spaces:

ZoroaStrella
/

RekaFlash

Running on Zero

App Files Files Community

ZoroaStrella committed 15 days ago

Commit

f3f292e

1 Parent(s): e970aef

Add accelerate dependencies

Browse files

Files changed (2) hide show

app.py +49 -123
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,136 +4,62 @@ import torch
 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
-DEFAULT_MAX_LENGTH = 4096  # Reduced for CPU efficiency
 DEFAULT_TEMPERATURE = 0.7
-# System prompt with reasoning instructions
-SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
-When responding, think step-by-step within <thinking> tags and conclude your answer after </thinking>.
-For example:
-User: What is 2+2?
-Assistant: <thinking>Let me calculate that. 2 plus 2 equals 4.</thinking> The answer is 4."""
-# Load model and tokenizer with 4-bit quantization
-try:
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4"
-    )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        quantization_config=quantization_config,
-        device_map="auto",  # Maps to CPU
-        torch_dtype=torch.float16
     )
-    tokenizer.pad_token = tokenizer.eos_token  # Ensure padding works
-except Exception as e:
-    raise Exception(f"Failed to load model: {str(e)}. Ensure access to {MODEL_NAME} and sufficient CPU memory.")
-def generate_response(
-    message,
-    chat_history,
-    system_prompt,
-    max_length,
-    temperature,
-    top_p,
-    top_k,
-    repetition_penalty,
-    show_reasoning
-):
-    """Generate a response from Reka Flash-3 with reasoning tags."""
-    try:
-        # Format chat history and prompt (multi-round conversation)
-        history_str = ""
-        for user_msg, assistant_msg in chat_history:
-            history_str += f"human: {user_msg} <sep> assistant: {assistant_msg} <sep> "
-        prompt = f"{system_prompt} <sep> human: {message} <sep> assistant: <thinking>\n"
-        # Tokenize input
-        inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-        # Generate response with budget forcing
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=max_length,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            do_sample=True,
-            eos_token_id=tokenizer.convert_tokens_to_ids("<sep>"),  # Stop at <sep>
-            pad_token_id=tokenizer.eos_token_id
-        )
-        # Decode and clean response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response = response[len(prompt):].split("<sep>")[0].strip()  # Extract assistant response
-        # Parse reasoning and final answer
-        if "</thinking>" in response:
-            reasoning, final_answer = response.split("</thinking>", 1)
-            reasoning = reasoning.replace("<thinking>", "").strip()
-            final_answer = final_answer.strip()
-        else:
-            reasoning = ""
-            final_answer = response
-        # Update chat history (drop reasoning to save tokens)
-        chat_history.append({"role": "user", "content": message})
-        chat_history.append({"role": "assistant", "content": final_answer})
-        # Display reasoning if requested
-        reasoning_display = f"**Reasoning:**\n{reasoning}" if show_reasoning and reasoning else ""
-        return "", chat_history, reasoning_display
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        gr.Warning(error_msg)
-        return "", chat_history, error_msg
 # Gradio Interface
-with gr.Blocks(title="Reka Flash-3 Chat", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # Reka Flash-3 Chat Interface
-    *Powered by [Reka AI](https://www.reka.ai/)* - A 21B parameter reasoning model optimized for CPU.
-    """)
-    with gr.Accordion("Deployment Instructions", open=True):
-        gr.Textbox(
-            value="""To deploy on Hugging Face Spaces:
-1. Request access to RekaAI/reka-flash-3 from Reka AI.
-2. Use a Pro subscription with zero-GPU (CPU-only) hardware.
-3. Ensure 32GB+ CPU memory for 4-bit quantization.
-4. Install dependencies: gradio, transformers, torch, bitsandbytes.""",
-            label="How to Deploy",
-            interactive=False
-        )
     with gr.Row():
-        chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")
-        reasoning_display = gr.Textbox(label="Model Reasoning", interactive=False, lines=8)
-    with gr.Row():
-        message = gr.Textbox(label="Your Message", placeholder="Ask me anything...", lines=2)
-        submit_btn = gr.Button("Send", variant="primary")
-    with gr.Accordion("Options", open=True):
-        max_length = gr.Slider(128, 512, value=DEFAULT_MAX_LENGTH, label="Max Length", step=64)
-        temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature", step=0.1)
-        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p", step=0.05)
-        top_k = gr.Slider(1, 100, value=50, label="Top-k", step=1)
-        repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty", step=0.1)
     system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT, lines=4)
-    show_reasoning = gr.Checkbox(label="Show Reasoning", value=True)
-    # Event handling
-    inputs = [message, chatbot, system_prompt, max_length, temperature, top_p, top_k, repetition_penalty, show_reasoning]
-    outputs = [message, chatbot, reasoning_display]
     submit_btn.click(generate_response, inputs=inputs, outputs=outputs)
     message.submit(generate_response, inputs=inputs, outputs=outputs)
-demo.launch(debug=True)

 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
+DEFAULT_MAX_LENGTH = 256
 DEFAULT_TEMPERATURE = 0.7
+SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI."""
+# Load model and tokenizer
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    quantization_config=quantization_config,
+    device_map="auto",
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True
+)
+tokenizer.pad_token = tokenizer.eos_token
+def generate_response(message, chat_history, system_prompt, max_length, temperature, top_p, top_k, repetition_penalty):
+    """Generate one assistant reply and append the turn to the chat history.
+
+    Args:
+        message: The user's new message.
+        chat_history: Chatbot history list; mutated in place.
+        system_prompt: Text placed before the user turn in the prompt.
+        max_length: Maximum number of new tokens to generate.
+        temperature, top_p, top_k, repetition_penalty: Sampling settings
+            forwarded to ``model.generate``.
+
+    Returns:
+        A tuple ``("", chat_history)``: an empty string that clears the
+        message box, and the updated history.
+    """
+    prompt = f"{system_prompt} <sep> human: {message} <sep> assistant: "
+    # The model is placed by device_map="auto", so follow its device
+    # instead of assuming CPU.
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(
+        **inputs,
+        # Cast the integer-valued settings: slider values may arrive as floats.
+        max_new_tokens=int(max_length),
+        temperature=temperature,
+        top_p=top_p,
+        top_k=int(top_k),
+        repetition_penalty=repetition_penalty,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
     )
+    # Decode only the newly generated tokens. Indexing the decoded text by
+    # "<sep>" position breaks when the separator is stripped during decoding
+    # or when the user's own text contains "<sep>".
+    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True).split("<sep>")[0].strip()
+    # The Chatbot is created with type="messages", which expects one
+    # {"role": ..., "content": ...} dict per message.
+    chat_history.append({"role": "user", "content": message})
+    chat_history.append({"role": "assistant", "content": response})
+    return "", chat_history
 # Gradio Interface
+with gr.Blocks(title="Reka Flash-3 Chat") as demo:
+    gr.Markdown("# Reka Flash-3 Chat Interface")
+    chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")
     with gr.Row():
+        message = gr.Textbox(label="Your Message", placeholder="Ask me anything...")
+        submit_btn = gr.Button("Send")
+    with gr.Accordion("Options", open=False):
+        max_length = gr.Slider(128, 512, value=DEFAULT_MAX_LENGTH, label="Max Length")
+        temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature")
+        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
+        top_k = gr.Slider(1, 100, value=50, label="Top-k")
+        repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty")
     system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT, lines=4)
+    inputs = [message, chatbot, system_prompt, max_length, temperature, top_p, top_k, repetition_penalty]
+    outputs = [message, chatbot]
     submit_btn.click(generate_response, inputs=inputs, outputs=outputs)
     message.submit(generate_response, inputs=inputs, outputs=outputs)
+demo.launch()

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ gradio>=3.50
 huggingface_hub==0.25.2
 torch
 transformers
-bitsandbytes

 huggingface_hub==0.25.2
 torch
 transformers
+bitsandbytes
+accelerate>=0.26.0