import gradio as gr
import requests
import re

# -----------------------------
# 1. Configure the open-source LLM API endpoint
#    For demonstration, we use a hosted Inference API on Hugging Face
#    that is free to use (up to a certain rate limit).
# -----------------------------
# Example: We'll use an OpenAssistant model endpoint on HF.
# You can find many such endpoints in the Hugging Face "Spaces" or "Models" section
# that provide a free Inference API.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
# API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"

headers = {}

# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns
    the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)

    if response.status_code == 200:
        model_output = response.json()
        # The output key ("generated_text" or "text") can vary depending on the model.
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"

# -----------------------------
# 3. Define a simple evaluation function
#    This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 0-5 for:
      1) Relevance (R)
      2) Depth (D)
      3) Clarity (C)
      4) References (E)
      5) Overall Quality (Q)
    Returns a dict with the individual scores and their total.
    """
    # We'll take a very simplistic approach:

    # Relevance: presence of 'remote work' plus a mention of 'software engineers'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0

    # Depth: check whether the text is reasonably long (rough word-count thresholds)
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)

    # Clarity: check whether the response has multiple paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0

    # References: look for something like 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0

    # Overall Quality: a naive combination.
    # We penalize the response if it is too shallow or looks obviously incomplete.
    if "..." in response[-10:]:  # If it ends with "..." it may have been truncated
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0

    # Summation
    total_score = relevance + depth + clarity + references + overall

    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score,
    }
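# Illustrative only: a quick sense of what evaluate_response returns for a short,
# two-paragraph answer with no references. The sample text below is made up for
# this sketch and is not model output.
#
#   sample = ("Remote work helps software engineers focus.\n\n"
#             "It also cuts commuting time.")
#   evaluate_response(sample)
#   # -> {'Relevance': 5, 'Depth': 0, 'Clarity': 5, 'References': 0,
#   #     'Overall': 0, 'Total': 10}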
# -----------------------------
# 4. Define the Gradio interface function
#    This is the function that runs when the user clicks "Generate & Evaluate".
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}

    # 1) Get the LLM response
    llm_response = query_model(prompt)

    # 2) Evaluate it
    scores = evaluate_response(llm_response)

    return llm_response, scores

# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt below. "
        "The model will generate a response and our auto-evaluator will score it."
    )

    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")

    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )

    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )

# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
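# Optional (not wired in above): anonymous calls to the free Inference API are
# heavily rate-limited. If you have a Hugging Face access token, you can send it
# as a Bearer token in place of the empty `headers` dict defined near the top.
# The HF_TOKEN environment variable name is just a convention used in this sketch.
#
#   import os
#   headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
#
# Likewise, demo.launch(share=True) exposes the app through a temporary public
# Gradio link if you want to test it outside localhost.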