import gradio as gr
import requests
import re
# -----------------------------
# 1. Configure the open-source LLM API endpoint
# For demonstration, we use the hosted Hugging Face Inference API,
# which is free to use up to a certain rate limit.
# -----------------------------
# Example: we'll use an OpenAssistant model endpoint on HF.
# Many models on the Hugging Face Hub expose a free Inference API
# endpoint like this one.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
#API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"
headers = {}  # no token set; anonymous requests work up to a rate limit
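# -----------------------------
# Optional: authenticated requests (illustrative sketch, not required).
# The Inference API accepts a Hugging Face access token via an
# "Authorization: Bearer <token>" header, which raises the free rate limit.
# The environment-variable name HF_API_TOKEN below is just this example's
# choice; set it as a Space secret or in your shell if you want to use it.
# -----------------------------
import os

hf_token = os.environ.get("HF_API_TOKEN")
if hf_token:
    headers = {"Authorization": f"Bearer {hf_token}"}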
# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns the model's response.
    """
    # The payload format for text generation can vary by model; this is a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code == 200:
        model_output = response.json()
        # The output key ("generated_text" or "text") can vary depending on the model
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"
# -----------------------------
# 3. Define a simple evaluation function
# This is a naive keyword- and structure-based scoring scheme for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 0–5 for:
    1) Relevance (R)
    2) Depth (D)
    3) Clarity (C)
    4) References (E)
    5) Overall Quality (Q)
    Returns a dict with the individual scores and the total.
    """
    # A very simplistic approach:
    # Relevance: presence of 'remote work' plus a mention of 'software engineers'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0
    # Depth: based on word count (> 150 words scores 5, > 80 scores 4, otherwise 0)
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)
    # Clarity: check whether the text has at least two paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0
    # References: look for 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0
    # Overall Quality: a naive combination.
    # Penalize if the text is obviously incomplete (ends with "...")
    if "..." in response[-10:]:
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0
    # Summation
    total_score = relevance + depth + clarity + references + overall
    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score
    }
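# -----------------------------
# Illustration of the rubric (hypothetical sample text; left commented out so
# nothing runs at import time). A two-paragraph answer that mentions
# "remote work", "software engineers", and a URL scores 5 on Relevance,
# Clarity, and References; Depth depends only on the word count.
# -----------------------------
# _sample = (
#     "Remote work gives software engineers flexibility and focus time.\n\n"
#     "In conclusion, many teams report productivity gains (source: https://example.com)."
# )
# print(evaluate_response(_sample))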
# -----------------------------
# 4. Define the Gradio interface function
# This is the function that runs when the user clicks "Generate & Evaluate"
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}
    # 1) Get LLM response
    llm_response = query_model(prompt)
    # 2) Evaluate
    scores = evaluate_response(llm_response)
    return llm_response, scores
# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt below. "
        "The model will generate a response and our auto-evaluator will score it."
    )
    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")
    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )
    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )
# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()