import gradio as gr
import requests
import re
# -----------------------------
# 1. Configure the open-source LLM API endpoint
# For demonstration, we use a hosted Inference API on Hugging Face
# that is free to use (up to a certain rate limit).
# -----------------------------
# Example: We'll use an OpenAssistant model endpoint on HF.
# You can find many such endpoints in the Hugging Face "Spaces" or "Models" section
# that provide Inference API for free.
API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
#API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000"
headers = {}
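# Note: with empty headers, requests go through the free anonymous tier of the
# Inference API. If you have a Hugging Face access token, it can be passed as a
# Bearer token instead (the value below is a placeholder, not a real key):
# headers = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxx"}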
# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,  # limit response length
            "temperature": 0.7,     # moderate creativity
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        model_output = response.json()
        # "generated_text" or "text" can vary depending on the model
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"
# -----------------------------
# 3. Define a simple evaluation function
# This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 0–5 for:
      1) Relevance (R)
      2) Depth (D)
      3) Clarity (C)
      4) References (E)
      5) Overall Quality (Q)
    Returns a dict with the individual scores and the total.
    """
    # We'll take a very simplistic approach:
    # Relevance: the text mentions both 'remote work' and 'software engineer(s)'
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0
    # Depth: score by length (more than 150 words scores 5, more than 80 scores 4, otherwise 0)
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)
    # Clarity: check whether the text has at least two paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0
    # References: look for something like 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0
    # Overall Quality: a naive combination.
    # Penalize if the text is obviously incomplete (ends with an ellipsis).
    if "..." in response[-10:]:
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0
    # Summation
    total_score = relevance + depth + clarity + references + overall
    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score
    }
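# Worked example of the scoring above (illustrative, not executed): a two-paragraph
# response of more than 150 words that mentions "remote work" and "software engineers"
# and links to at least one source/URL would score 5 on every criterion, for a Total of 25.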
# -----------------------------
# 4. Define the Gradio interface function
# This is the function that runs when the user clicks "Generate & Evaluate"
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}
    # 1) Get the LLM response
    llm_response = query_model(prompt)
    # 2) Evaluate it
    scores = evaluate_response(llm_response)
    return llm_response, scores
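# The (response_text, scores_dict) tuple returned here maps one-to-one onto the
# two Gradio output components (response_output, score_output) defined below.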
# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt (the evaluator is tuned for prompts about remote work for software engineers). "
        "The model will generate a response and the auto-evaluator will score it."
    )
    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    generate_button = gr.Button("Generate & Evaluate")
    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )
    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )
# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
    demo.launch()
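    # Optional: when running locally, demo.launch(share=True) additionally creates
    # a temporary public share link (a standard Gradio option).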