import gradio as gr
import requests
import re

# -----------------------------
# 1. Configure the open-source LLM API endpoint
#    For demonstration, we can use a hosted inference API on Hugging Face
#    that is free to use (up to a certain rate limit).
# -----------------------------
# Example: We'll use an OpenAssistant model endpoint on HF. 
# You can find many such endpoints in the Hugging Face "Spaces" or "Models" section 
# that provide a free Inference API.

API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5" 
#API_URL = "https://api-inference.huggingface.co/models/OpenAssistant/falcon-7b-sft-mix-2000" 

headers = {}
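# (Optional) To raise the free rate limit, you can authenticate with your own
# Hugging Face token, e.g. headers = {"Authorization": f"Bearer {HF_TOKEN}"},
# where HF_TOKEN is a placeholder for your token.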

# -----------------------------
# 2. Define a function to query the model
# -----------------------------
def query_model(prompt: str) -> str:
    """
    Sends the prompt to the Hugging Face Inference Endpoint and returns the model's response.
    """
    # The payload format for text generation can vary by model. We'll try a general approach:
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,     # limit response length
            "temperature": 0.7,       # moderate creativity
        }
    }
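    # Note: the free hosted Inference API may return a 503 (often with an
    # "estimated_time" field) while the model is still loading; retrying after
    # a short delay usually succeeds.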
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code == 200:
        model_output = response.json()
        # "generated_text" or "text" can vary depending on the model
        if isinstance(model_output, dict) and "generated_text" in model_output:
            return model_output["generated_text"]
        elif isinstance(model_output, list) and len(model_output) > 0:
            # Some endpoints return a list of dicts
            return model_output[0].get("generated_text", "")
        else:
            return "Error: Unexpected model output format."
    else:
        return f"Error {response.status_code}: {response.text}"

# -----------------------------
# 3. Define a simple evaluation function
#    This is a naive "keyword and structure" based scoring for demonstration.
# -----------------------------
def evaluate_response(response: str) -> dict:
    """
    Rates the response on a scale of 0–5 for:
      1) Relevance (R)
      2) Depth (D)
      3) Clarity (C)
      4) References (E)
      5) Overall Quality (Q)
    Returns a dict with individual scores and total.
    """
    # We'll take a very simplistic approach:
    # Relevance: 5 if the response mentions both 'remote work' and 'software engineers', else 0
    relevance = 5 if ("remote work" in response.lower() and "software engineer" in response.lower()) else 0
    
    # Depth: 5 if the text is > 150 words, 4 if > 80 words, else 0
    word_count = len(response.split())
    depth = 5 if word_count > 150 else (4 if word_count > 80 else 0)
    
    # Clarity: check whether the response is split into at least two paragraphs
    paragraphs = response.strip().split("\n\n")
    clarity = 5 if len(paragraphs) >= 2 else 0
    
    # References: look for something like 'reference', 'source', 'citation', or a URL
    if re.search(r"reference|source|citation|http", response, re.IGNORECASE):
        references = 5
    else:
        references = 0
    
    # Overall Quality: a naive combination of the other criteria;
    # penalize responses that look truncated (i.e., end with "...")
    if "..." in response[-10:]:
        # If it ends with ... maybe it's incomplete
        overall = 0
    else:
        overall = 5 if (relevance >= 4 and depth >= 4 and references >= 4) else 0
    
    # Summation
    total_score = relevance + depth + clarity + references + overall
    
    return {
        "Relevance": relevance,
        "Depth": depth,
        "Clarity": clarity,
        "References": references,
        "Overall": overall,
        "Total": total_score
    }
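
# Illustrative sanity check (uses only the function above; the sample text is
# made up). Uncomment to print the scores for a hand-written response:
# sample = ("Remote work lets software engineers focus with fewer interruptions.\n\n"
#           "Source: https://example.com")
# print(evaluate_response(sample))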

# -----------------------------
# 4. Define the Gradio interface function
#    This is the function that runs when the user clicks "Generate & Evaluate"
# -----------------------------
def generate_and_evaluate(prompt: str):
    if not prompt.strip():
        return "Please enter a prompt.", {}
    
    # 1) Get LLM response
    llm_response = query_model(prompt)
    
    # 2) Evaluate
    scores = evaluate_response(llm_response)
    
    return llm_response, scores

# -----------------------------
# 5. Build the Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Prompt Evaluator")
    gr.Markdown(
        "Enter a prompt about something"
        "The model will generate a response and our auto-evaluator will score it."
    )
    
    prompt_input = gr.Textbox(
        label="Enter your prompt here",
        placeholder="E.g., 'Write a short report on the benefits of remote work for software engineers...'",
        lines=3
    )
    
    generate_button = gr.Button("Generate & Evaluate")
    
    response_output = gr.Textbox(
        label="LLM Response",
        lines=10
    )
    
    score_output = gr.JSON(
        label="Evaluation Scores",
        visible=True
    )
    
    generate_button.click(
        fn=generate_and_evaluate,
        inputs=[prompt_input],
        outputs=[response_output, score_output]
    )

# -----------------------------
# 6. Launch
# -----------------------------
if __name__ == "__main__":
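    # Pass share=True to demo.launch() if you want a temporary public link.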
    demo.launch()