import gradio as gr
import requests
import re

# -----------------------------
# 1. Configure the open-source LLM API endpoint
#    For demonstration, we use a hosted Inference API on Hugging Face
#    that is free to use (up to a certain rate limit).
# -----------------------------
# Example: We'll use an OpenAssistant model endpoint on HF.
# You can find many such endpoints in the Hugging Face "Spaces" or "Models" section
# that provide a free Inference API.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
# API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"

headers = {}

# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns
    the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)

    if response.status_code == 200:
        model_output = response.json()
        # The output key ("generated_text" or "text") can vary depending on the model.
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"

# -----------------------------
# 3. Define a simple evaluation function
#    This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 0-5 for:
      1) Relevance (R)
      2) Depth (D)
      3) Clarity (C)
      4) References (E)
      5) Overall Quality (Q)
    Returns a dict with the individual scores and their total.
    """
    # We'll take a very simplistic approach:

    # Relevance: presence of 'remote work' plus a mention of 'software engineers'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0

    # Depth: check whether the text is reasonably long (rough word-count thresholds)
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)

    # Clarity: check whether the response has multiple paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0

    # References: look for something like 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0

    # Overall Quality: a naive combination.
    # We penalize the response if it is too shallow or looks obviously incomplete.
    if "..." in response[-10:]:  # If it ends with "..." it may have been truncated
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0

    # Summation
    total_score = relevance + depth + clarity + references + overall

    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score,
    }
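# Illustrative only: a quick sense of what evaluate_response returns for a short,
# two-paragraph answer with no references. The sample text below is made up for
# this sketch and is not model output.
#
#   sample = ("Remote work helps software engineers focus.\n\n"
#             "It also cuts commuting time.")
#   evaluate_response(sample)
#   # -> {'Relevance': 5, 'Depth': 0, 'Clarity': 5, 'References': 0,
#   #     'Overall': 0, 'Total': 10}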
# -----------------------------
# 4. Define the Gradio interface function
#    This is the function that runs when the user clicks "Generate & Evaluate".
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}

    # 1) Get the LLM response
    llm_response = query_model(prompt)

    # 2) Evaluate it
    scores = evaluate_response(llm_response)

    return llm_response, scores

# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt below. "
        "The model will generate a response and our auto-evaluator will score it."
    )

    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")

    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )

    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )

# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
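# Optional (not wired in above): anonymous calls to the free Inference API are
# heavily rate-limited. If you have a Hugging Face access token, you can send it
# as a Bearer token in place of the empty `headers` dict defined near the top.
# The HF_TOKEN environment variable name is just a convention used in this sketch.
#
#   import os
#   headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
#
# Likewise, demo.launch(share=True) exposes the app through a temporary public
# Gradio link if you want to test it outside localhost.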