# Gradio demo: solve math questions with the NuminaMath-7B-CoT model.
# Requires: gradio, transformers, accelerate (for device_map), bitsandbytes (for 8-bit loading).

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline


def load_model():
    # The original 72B checkpoint (AI-MO/NuminaMath-72B-CoT) is too large for most
    # single-GPU setups, so the 7B CoT variant is loaded instead.
    model_id = "AI-MO/NuminaMath-7B-CoT"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",  # Automatically map layers to the available GPU/CPU
        # offload_folder="offload",  # Optionally offload unused weights to disk if VRAM is tight
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit precision (needs bitsandbytes)
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


# Initialize the pipeline once at startup
model_pipeline = load_model()


# Process a user question and return the model's answer
def solve_math_question(prompt):
    # Greedy decoding (do_sample=False) keeps the chain-of-thought output reproducible
    outputs = model_pipeline(prompt, max_new_tokens=300, do_sample=False)
    return outputs[0]["generated_text"]


# Define the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("# NuminaMath-7B-CoT Math Question Solver")
    gr.Markdown(
        "Ask a math-related question, and the model will attempt to solve it with reasoning!"
    )
    with gr.Row():
        question = gr.Textbox(
            label="Your Math Question",
            placeholder="What is 2+2?",
        )
        output = gr.Textbox(label="Model Output")
    submit_button = gr.Button("Solve")
    submit_button.click(solve_math_question, inputs=question, outputs=output)

# Launch the app
app.launch()
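
# --- Optional: chat-template prompting (a sketch, not part of the original app) ---
# NuminaMath CoT checkpoints are instruction-tuned, so wrapping the question in the
# tokenizer's chat template usually yields cleaner step-by-step answers than raw
# text completion. This assumes the tokenizer ships a chat template (check the
# model card before relying on it); if so, swap this in for solve_math_question above:
#
# def solve_math_question(prompt):
#     messages = [{"role": "user", "content": prompt}]
#     text = model_pipeline.tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     # return_full_text=False strips the echoed prompt from the pipeline output
#     outputs = model_pipeline(
#         text, max_new_tokens=300, do_sample=False, return_full_text=False
#     )
#     return outputs[0]["generated_text"]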