import json
import re
import random
from collections import defaultdict
from datetime import datetime, timezone
from dotenv import load_dotenv
load_dotenv()
import gradio as gr
from gen_api_answer import get_model_response, parse_model_response
from db import add_vote, create_db_connection, get_votes
from utils import Vote
from common import (
POLICY_CONTENT,
ACKNOWLEDGEMENTS,
DEFAULT_EVAL_PROMPT,
DEFAULT_INPUT,
DEFAULT_RESPONSE,
CSS_STYLES,
MAIN_TITLE,
HOW_IT_WORKS,
BATTLE_RULES,
EVAL_DESCRIPTION,
VOTING_HEADER,
)
from example_metrics import EXAMPLE_METRICS
# Model and ELO score data
DEFAULT_ELO = 1500 # Starting ELO for new models
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
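# Worked example of the update used throughout this file: with two models both at
# 1500, the expected score is E = 1 / (1 + 10 ** ((1500 - 1500) / 400)) = 0.5, so a
# win moves the winner up by K_FACTOR * (1 - 0.5) = 16 points (and the loser down by 16).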
elo_scores = defaultdict(lambda: DEFAULT_ELO)
vote_counts = defaultdict(int)
db = create_db_connection()
votes_collection = get_votes(db)
current_time = datetime.now()
# Load the model_data from JSONL
def load_model_data():
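    """Load model metadata (organization, license, api_model) from data/models.jsonl, keyed by model name."""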
model_data = {}
try:
with open("data/models.jsonl", "r") as f:
for line in f:
model = json.loads(line)
model_data[model["name"]] = {
"organization": model["organization"],
"license": model["license"],
"api_model": model["api_model"],
}
except FileNotFoundError:
print("Warning: models.jsonl not found")
return {}
return model_data
model_data = load_model_data()
current_session_id = 0
def get_new_session_id():
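    """Return a simple incrementing per-process session identifier (e.g. "user1", "user2")."""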
global current_session_id
current_session_id += 1
return f"user{current_session_id}"
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
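    """Build a Vote record with the current timestamp and persist it to the database."""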
vote = Vote(
timestamp=datetime.now().isoformat(),
prompt=prompt,
response_a=response_a,
response_b=response_b,
model_a=model_a,
model_b=model_b,
winner=winner,
judge_id=judge_id,
)
add_vote(vote, db)
def parse_variables(prompt):
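    """Return the unique {{variable}} names found in the prompt, in order of first appearance."""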
# Extract variables enclosed in double curly braces
variables = re.findall(r"{{(.*?)}}", prompt)
# Remove duplicates while preserving order
seen = set()
variables = [
x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
]
return variables
def get_final_prompt(eval_prompt, variable_values):
# Replace variables in the eval prompt with their values
for var, val in variable_values.items():
eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
return eval_prompt
def submit_prompt(eval_prompt, *variable_values):
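    """Fill in the prompt variables, pick two distinct judge models at random, and fetch a response from each."""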
try:
variables = parse_variables(eval_prompt)
variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
models = list(model_data.keys())
model1, model2 = random.sample(models, 2)
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
return (
response_a,
response_b,
gr.update(visible=True),
gr.update(visible=True),
model_a,
model_b,
final_prompt,
)
except Exception as e:
print(f"Error in submit_prompt: {str(e)}")
return (
"Error generating response",
"Error generating response",
gr.update(visible=False),
gr.update(visible=False),
None,
None,
None,
)
def vote(
choice,
model_a,
model_b,
final_prompt,
score_a,
critique_a,
score_b,
critique_b,
judge_id,
):
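    """Record a user vote: update in-memory ELO scores and vote counts, persist the vote, and reveal the model names."""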
# Update ELO scores based on user choice
elo_a = elo_scores[model_a]
elo_b = elo_scores[model_b]
# Calculate expected scores
Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
# Assign actual scores
if choice == "A":
Sa, Sb = 1, 0
elif choice == "B":
Sa, Sb = 0, 1
else:
Sa, Sb = 0.5, 0.5
# Update scores and vote counts
elo_scores[model_a] += K_FACTOR * (Sa - Ea)
elo_scores[model_b] += K_FACTOR * (Sb - Eb)
vote_counts[model_a] += 1
vote_counts[model_b] += 1
# Format the full responses with score and critique
    response_a = f"{score_a}\n{critique_a}"
    response_b = f"{score_b}\n{critique_b}"
# Store the vote data with the final prompt
store_vote_data(
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
)
# Return updates for UI components
return [
gr.update(visible=False), # action_buttons_row
gr.update(value=f"*Model: {model_a}*"), # model_name_a
gr.update(value=f"*Model: {model_b}*"), # model_name_b
gr.update(interactive=True), # send_btn
gr.update(visible=True, interactive=True), # regenerate_button
]
def get_current_votes():
"""Get current votes from database."""
return get_votes(db)
def get_leaderboard():
"""Generate leaderboard data using fresh votes from MongoDB."""
# Get fresh voting data
voting_data = get_current_votes()
print(f"Fetched {len(voting_data)} votes from database") # Debug log
# Initialize dictionaries for tracking
ratings = defaultdict(lambda: DEFAULT_ELO)
matches = defaultdict(int)
# Process each vote
for vote in voting_data:
try:
model_a = vote.get("model_a")
model_b = vote.get("model_b")
winner = vote.get("winner")
# Skip if models aren't in current model_data
if (
not all([model_a, model_b, winner])
or model_a not in model_data
or model_b not in model_data
):
continue
# Update match counts
matches[model_a] += 1
matches[model_b] += 1
# Calculate ELO changes
elo_a = ratings[model_a]
elo_b = ratings[model_b]
# Expected scores
expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
expected_b = 1 - expected_a
# Actual scores
score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
score_b = 1 - score_a
# Update ratings
ratings[model_a] += K_FACTOR * (score_a - expected_a)
ratings[model_b] += K_FACTOR * (score_b - expected_b)
except Exception as e:
print(f"Error processing vote: {e}")
continue
# Generate leaderboard data
leaderboard = []
for model in model_data.keys():
votes = matches[model]
elo = ratings[model]
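        # Rough 95% interval that shrinks as ~1/sqrt(votes); the 400 here appears to mirror
        # the ELO logistic scale rather than a fitted standard error.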
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
data = {
"Model": model,
"ELO Score": f"{elo:.2f}",
"95% CI": f"±{ci:.2f}",
"# Votes": votes,
"Organization": model_data[model]["organization"],
"License": model_data[model]["license"],
}
leaderboard.append(data)
return leaderboard
def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
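    """Re-run the evaluation with a fresh pair of judge models, avoiding the previous pair when possible."""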
variables = parse_variables(eval_prompt)
variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
# Get available models excluding the previous ones
available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]
# If we have enough models for new pairs
if len(available_models) >= 2:
model1, model2 = random.sample(available_models, 2)
else:
# Fallback to allowing previous models if necessary
model1, model2 = random.sample(list(model_data.keys()), 2)
response_a = get_model_response(model1, model_data.get(model1), final_prompt)
response_b = get_model_response(model2, model_data.get(model2), final_prompt)
# Parse the responses
score_a, critique_a = parse_model_response(response_a)
score_b, critique_b = parse_model_response(response_b)
return (
score_a, # score_a textbox
critique_a, # critique_a textbox
score_b, # score_b textbox
critique_b, # critique_b textbox
gr.update(visible=True), # action_buttons_row
gr.update(value="*Model: Unknown*"), # model_name_a
gr.update(value="*Model: Unknown*"), # model_name_b
model1, # model_a_state
model2, # model_b_state
)
def calculate_elo_change(rating_a, rating_b, winner):
"""Calculate ELO rating changes for both players."""
expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
expected_b = 1 - expected_a
if winner == "A":
score_a, score_b = 1, 0
elif winner == "B":
score_a, score_b = 0, 1
else: # Handle ties
score_a, score_b = 0.5, 0.5
change_a = K_FACTOR * (score_a - expected_a)
change_b = K_FACTOR * (score_b - expected_b)
return change_a, change_b
def update_leaderboard():
    """Build the leaderboard rows (one list per model) using fresh votes from MongoDB."""
    # Get fresh voting data
    voting_data = get_current_votes()
    print(f"Found {len(voting_data)} votes in database")
    ratings = defaultdict(lambda: DEFAULT_ELO)
    matches = defaultdict(int)
    # Process each vote chronologically
    for vote in voting_data:
        # Extract model names from the vote document
        try:
            model_a = vote.get("model_a")
            model_b = vote.get("model_b")
            winner = vote.get("winner")
            print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
            # Skip if any required field is missing or models aren't in current model_data
            if not all([model_a, model_b, winner]):
                print(f"Missing required fields in vote: {vote}")
                continue
            if model_a not in model_data:
                print(f"Model A '{model_a}' not found in model_data")
                continue
            if model_b not in model_data:
                print(f"Model B '{model_b}' not found in model_data")
                continue
            # Update match counts
            matches[model_a] += 1
            matches[model_b] += 1
            print(
                f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
            )
            # Update ratings for this match
            change_a, change_b = calculate_elo_change(
                ratings[model_a], ratings[model_b], winner
            )
            ratings[model_a] += change_a
            ratings[model_b] += change_b
        except Exception as e:
            print(f"Error processing vote: {e}")
            print(f"Problematic vote data: {vote}")
            continue
    # Build one row per model for the leaderboard table
    rows = []
    for model in model_data.keys():
        votes = matches[model]
        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
        rows.append(
            [
                model,
                round(ratings[model], 2),
                f"±{ci:.2f}",
                votes,
                model_data[model]["organization"],
                model_data[model]["license"],
            ]
        )
    return rows
# Render the current leaderboard as a Gradio DataFrame (standalone helper; the UI below builds its own table)
def display_leaderboard():
    df = update_leaderboard()
    return gr.DataFrame(
        value=df,
        headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
        datatype=["str", "number", "str", "number", "str", "str"],
        row_count=(len(df) + 1, "dynamic"),
    )
# Default leaderboard table definition (re-created inside the Blocks UI below)
leaderboard_table = gr.Dataframe(
    headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
    datatype=["str", "number", "str", "number", "str", "str"],
)
def get_leaderboard_stats():
"""Get summary statistics for the leaderboard."""
now = datetime.now(timezone.utc)
total_votes = len(get_current_votes())
total_models = len(model_data)
last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
"%B %d, %Y at %H:00 UTC"
)
return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
def set_example_metric(metric_name):
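    """Return the prompt and variable values for an example metric, padded to match the number of variable inputs."""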
if metric_name == "Custom":
variables = parse_variables(DEFAULT_EVAL_PROMPT)
variable_values = []
for var in variables:
if var == "input":
variable_values.append(DEFAULT_INPUT)
elif var == "response":
variable_values.append(DEFAULT_RESPONSE)
else:
variable_values.append("") # Default empty value
# Pad variable_values to match the length of variable_rows
while len(variable_values) < len(variable_rows):
variable_values.append("")
return [DEFAULT_EVAL_PROMPT] + variable_values
metric_data = EXAMPLE_METRICS[metric_name]
variables = parse_variables(metric_data["prompt"])
variable_values = []
for var in variables:
value = metric_data.get(var, "") # Default to empty string if not found
variable_values.append(value)
# Pad variable_values to match the length of variable_rows
while len(variable_values) < len(variable_rows):
variable_values.append("")
return [metric_data["prompt"]] + variable_values
# Pick a random example metric (not currently wired up; startup defaults to "Custom" below)
def get_random_metric():
metrics = list(EXAMPLE_METRICS.keys())
return set_example_metric(random.choice(metrics))
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
judge_id = gr.State(get_new_session_id())
gr.Markdown(MAIN_TITLE)
gr.Markdown(HOW_IT_WORKS)
with gr.Tabs():
with gr.TabItem("Judge Arena"):
with gr.Row():
with gr.Column():
gr.Markdown(BATTLE_RULES)
gr.Markdown(EVAL_DESCRIPTION)
# Add Example Metrics Section
with gr.Accordion("Evaluator Prompt Templates", open=False):
with gr.Row():
custom_btn = gr.Button("Custom", variant="secondary")
hallucination_btn = gr.Button("Hallucination")
precision_btn = gr.Button("Precision")
recall_btn = gr.Button("Recall")
coherence_btn = gr.Button("Logical coherence")
faithfulness_btn = gr.Button("Faithfulness")
# Eval Prompt and Variables side by side
with gr.Row():
# Left column - Eval Prompt
with gr.Column(scale=1):
eval_prompt = gr.TextArea(
label="Evaluator Prompt",
lines=1,
value=DEFAULT_EVAL_PROMPT,
placeholder="Type your eval prompt here... denote variables in {{curly brackets}} to be populated on the right.",
show_label=True,
)
# Right column - Variable Mapping
with gr.Column(scale=1):
gr.Markdown("### Sample to test the evaluator")
# Create inputs for up to 5 variables, with first two visible by default
variable_rows = []
for i in range(5):
initial_visibility = True if i < 2 else False
with gr.Group(visible=initial_visibility) as var_row:
# Set default labels for the first two inputs
default_label = (
"input" if i == 0 else "response" if i == 1 else ""
)
var_input = gr.Textbox(
container=True,
label=default_label, # Add default label here
)
variable_rows.append((var_row, var_input))
# Send button
with gr.Row(elem_classes="send-button-row"):
send_btn = gr.Button(
value="Test the evaluators", variant="primary", size="lg", scale=1
)
# Add divider heading for model outputs
gr.Markdown(VOTING_HEADER)
# Model Responses side-by-side
with gr.Row():
with gr.Column():
gr.Markdown("### Model A")
score_a = gr.Textbox(label="Score", interactive=False)
critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
model_name_a = gr.Markdown("*Model: Unknown*")
with gr.Column():
gr.Markdown("### Model B")
score_b = gr.Textbox(label="Score", interactive=False)
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
model_name_b = gr.Markdown("*Model: Unknown*")
# Initially hide vote buttons and regenerate button
with gr.Row(visible=False) as action_buttons_row:
vote_a = gr.Button("Choose A", variant="primary")
vote_tie = gr.Button("Tie", variant="secondary")
vote_b = gr.Button("Choose B", variant="primary")
regenerate_button = gr.Button(
"Regenerate with different models", variant="secondary", visible=False
)
# Add spacing and acknowledgements at the bottom
gr.Markdown(ACKNOWLEDGEMENTS)
with gr.TabItem("Leaderboard"):
stats_display = gr.Markdown()
            leaderboard_table = gr.Dataframe(
                headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
                datatype=["str", "number", "str", "number", "str", "str"],
            )
with gr.TabItem("Policy"):
gr.Markdown(POLICY_CONTENT)
# Define state variables for model tracking
model_a_state = gr.State()
model_b_state = gr.State()
final_prompt_state = gr.State()
# Update variable inputs based on the eval prompt
def update_variables(eval_prompt):
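        """Show one input row per {{variable}} in the prompt and hide the unused rows."""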
variables = parse_variables(eval_prompt)
updates = []
for i in range(len(variable_rows)):
var_row, var_input = variable_rows[i]
if i < len(variables):
var_name = variables[i]
# Set the number of lines based on the variable name
if var_name == "response":
lines = 4 # Adjust this number as needed
else:
lines = 1 # Default to single line for other variables
updates.extend(
[
gr.update(visible=True), # Show the variable row
gr.update(
label=var_name, visible=True, lines=lines
), # Update label and lines
]
)
else:
updates.extend(
[
gr.update(visible=False), # Hide the variable row
gr.update(value="", visible=False), # Clear value when hidden
]
)
return updates
eval_prompt.change(
fn=update_variables,
inputs=eval_prompt,
outputs=[item for sublist in variable_rows for item in sublist],
)
# Regenerate button functionality
regenerate_button.click(
fn=regenerate_prompt,
inputs=[model_a_state, model_b_state, eval_prompt]
+ [var_input for _, var_input in variable_rows],
outputs=[
score_a,
critique_a,
score_b,
critique_b,
action_buttons_row,
model_name_a,
model_name_b,
model_a_state,
model_b_state,
],
)
# Update model names after responses are generated
def update_model_names(model_a, model_b):
return gr.update(value=f"*Model: {model_a}*"), gr.update(
value=f"*Model: {model_b}*"
)
# Store the last submitted prompt and variables for comparison
last_submission = gr.State({})
    # Vote button click handlers
vote_a.click(
fn=lambda *args: vote("A", *args),
inputs=[
model_a_state,
model_b_state,
final_prompt_state,
score_a,
critique_a,
score_b,
critique_b,
judge_id,
],
outputs=[
action_buttons_row,
model_name_a,
model_name_b,
send_btn,
regenerate_button,
],
)
vote_b.click(
fn=lambda *args: vote("B", *args),
inputs=[
model_a_state,
model_b_state,
final_prompt_state,
score_a,
critique_a,
score_b,
critique_b,
judge_id,
],
outputs=[
action_buttons_row,
model_name_a,
model_name_b,
send_btn,
regenerate_button,
],
)
vote_tie.click(
fn=lambda *args: vote("Tie", *args),
inputs=[
model_a_state,
model_b_state,
final_prompt_state,
score_a,
critique_a,
score_b,
critique_b,
judge_id,
],
outputs=[
action_buttons_row,
model_name_a,
model_name_b,
send_btn,
regenerate_button,
],
)
    # Send button handler: generate both responses and remember the submitted inputs
def submit_and_store(prompt, *variables):
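        """Run both judge models on the submitted prompt, parse their scores and critiques, and remember the inputs."""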
# Create a copy of the current submission
current_submission = {"prompt": prompt, "variables": variables}
# Get the responses
(
response_a,
response_b,
buttons_visible,
regen_visible,
model_a,
model_b,
final_prompt,
) = submit_prompt(prompt, *variables)
# Parse the responses
score_a, critique_a = parse_model_response(response_a)
score_b, critique_b = parse_model_response(response_b)
        return (
            score_a,
            critique_a,
            score_b,
            critique_b,
            buttons_visible,
            gr.update(
                visible=True, interactive=True
            ),  # Show and enable regenerate button
            model_a,
            model_b,
            final_prompt,  # Stored in final_prompt_state for voting
            gr.update(value="*Model: Unknown*"),
            gr.update(value="*Model: Unknown*"),
            current_submission,  # Stored in last_submission state for change detection
        )
    send_btn.click(
        fn=submit_and_store,
        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
        outputs=[
            score_a,
            critique_a,
            score_b,
            critique_b,
            action_buttons_row,
            regenerate_button,
            model_a_state,
            model_b_state,
            final_prompt_state,
            model_name_a,
            model_name_b,
            last_submission,
        ],
    )
    # Input change handler: keep the send button enabled and disable regenerate once inputs change
    def handle_input_changes(prompt, last_inputs, *variables):
        """Enable the send button and disable regenerate when inputs differ from the last submission."""
        current_inputs = {"prompt": prompt, "variables": variables}
        inputs_changed = last_inputs != current_inputs
        return [
            gr.update(interactive=True),  # send button always enabled
            gr.update(
                interactive=not inputs_changed
            ),  # regenerate button disabled if inputs changed
        ]
    # Wire the change handlers for the prompt and each variable input
    eval_prompt.change(
        fn=handle_input_changes,
        inputs=[eval_prompt, last_submission]
        + [var_input for _, var_input in variable_rows],
        outputs=[send_btn, regenerate_button],
    )
    for _, var_input in variable_rows:
        var_input.change(
            fn=handle_input_changes,
            inputs=[eval_prompt, last_submission]
            + [var_input for _, var_input in variable_rows],
            outputs=[send_btn, regenerate_button],
        )
# Update the leaderboard
def refresh_leaderboard():
"""Refresh the leaderboard data and stats."""
leaderboard = get_leaderboard()
data = [
[
entry["Model"],
float(entry["ELO Score"]),
entry["95% CI"],
entry["# Votes"],
entry["Organization"],
entry["License"],
]
for entry in leaderboard
]
stats = get_leaderboard_stats()
return [gr.update(value=data), gr.update(value=stats)]
    # Populate the leaderboard and stats when the app loads
demo.load(
fn=refresh_leaderboard, inputs=None, outputs=[leaderboard_table, stats_display]
)
# Add click handlers for metric buttons
outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
hallucination_btn.click(
fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
)
precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
coherence_btn.click(
fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
)
faithfulness_btn.click(
fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
)
# Set default metric at startup
demo.load(
fn=lambda: set_example_metric("Custom"),
outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
)
if __name__ == "__main__":
demo.launch()