import gradio as gr
import requests
import re

# -----------------------------
# 1. Configure the open-source LLM API endpoint
# For demonstration, we use a hosted Inference API on Hugging Face
# that is free to use (up to a rate limit).
# -----------------------------
# Example: we'll use an OpenAssistant model endpoint on HF.
# You can find many such endpoints in the Hugging Face "Spaces" or "Models"
# sections that provide a free Inference API.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
# API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"
headers = {}
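# If you hit the anonymous rate limit, requests can be authenticated with a
# Hugging Face access token. A minimal sketch, assuming the token is exported
# in an environment variable named HF_API_TOKEN (the variable name is just a
# convention, not required by the API):
#
#   import os
#   hf_token = os.environ.get("HF_API_TOKEN")
#   if hf_token:
#       headers = {"Authorization": f"Bearer {hf_token}"}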
# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        },
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        model_output = response.json()
        # "generated_text" or "text" can vary depending on the model
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"
# -----------------------------
# 3. Define a simple evaluation function
# This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 1-5 for:
    1) Relevance (R)
    2) Depth (D)
    3) Clarity (C)
    4) References (E)
    5) Overall Quality (Q)
    Returns a dict with the individual scores and the total.
    """
    # We'll use a very simplistic approach:
    # Relevance: the response mentions both 'remote work' and 'software engineer(s)'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0
    # Depth: reward longer responses, using word count as a rough proxy
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)
    # Clarity: check whether the response is split into multiple paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0
    # References: look for something like 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0
    # Overall Quality: a naive combination.
    # Penalize if the text looks obviously incomplete (e.g., it trails off with "...").
    if "..." in response[-10:]:
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0
    # Summation
    total_score = relevance + depth + clarity + references + overall
    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score,
    }
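# Illustrative example of the returned dict for a hypothetical response that
# satisfies every heuristic above (the values shown are the maxima):
#   {"Relevance": 5, "Depth": 5, "Clarity": 5, "References": 5,
#    "Overall": 5, "Total": 25}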
# -----------------------------
# 4. Define the Gradio interface function
# This is the function that runs when the user clicks "Generate & Evaluate".
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}
    # 1) Get the LLM response
    llm_response = query_model(prompt)
    # 2) Evaluate it
    scores = evaluate_response(llm_response)
    return llm_response, scores
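# Example round trip (illustrative, assuming the endpoint is reachable):
#   text, scores = generate_and_evaluate(
#       "Write a short report on the benefits of remote work for software engineers."
#   )
#   # text   -> the raw model output string
#   # scores -> the dict produced by evaluate_response()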
# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt below (the sample evaluator expects a response about remote work for software engineers). "
        "The model will generate a response and our auto-evaluator will score it."
    )
    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")
    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )
    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )
# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
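    # Note: when running locally, Gradio can also create a temporary public link
    # via demo.launch(share=True); inside a Hugging Face Space, the plain
    # launch() above is all that is needed.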