kaikaidai committed on
Commit cd8d5ec · verified · 1 Parent(s): 7fe530a

Remove Flow judge until ready

Files changed (1)
  1. gen_api_answer.py +1030 -417
gen_api_answer.py CHANGED
@@ -1,448 +1,1061 @@
1
- from openai import OpenAI
2
- import anthropic
3
- from together import Together
4
- import cohere
5
  import json
6
  import re
7
- import os
8
- import requests
9
  from prompts import (
10
- JUDGE_SYSTEM_PROMPT,
11
- PROMETHEUS_PROMPT,
12
- PROMETHEUS_PROMPT_WITH_REFERENCE,
13
- ATLA_PROMPT,
14
- ATLA_PROMPT_WITH_REFERENCE,
15
- FLOW_JUDGE_PROMPT
16
  )
17
- from transformers import AutoTokenizer
18
-
19
- # Initialize clients
20
- anthropic_client = anthropic.Anthropic()
21
- openai_client = OpenAI()
22
- together_client = Together()
23
- hf_api_key = os.getenv("HF_API_KEY")
24
- flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
25
- cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
26
-
27
- def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
28
- """Get response from OpenAI API"""
29
- try:
30
- response = openai_client.chat.completions.create(
31
- model=model_name,
32
- messages=[
33
- {"role": "system", "content": system_prompt},
34
- {"role": "user", "content": prompt},
35
- ],
36
- max_completion_tokens=max_tokens,
37
- temperature=temperature,
38
- )
39
- return response.choices[0].message.content
40
- except Exception as e:
41
- return f"Error with OpenAI model {model_name}: {str(e)}"
42
 
43
- def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
44
- """Get response from Anthropic API"""
45
- try:
46
- response = anthropic_client.messages.create(
47
- model=model_name,
48
- max_tokens=max_tokens,
49
- temperature=temperature,
50
- system=system_prompt,
51
- messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
52
- )
53
- return response.content[0].text
54
- except Exception as e:
55
- return f"Error with Anthropic model {model_name}: {str(e)}"
56
 
57
- def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
58
- """Get response from Together API"""
59
- try:
60
- response = together_client.chat.completions.create(
61
- model=model_name,
62
- messages=[
63
- {"role": "system", "content": system_prompt},
64
- {"role": "user", "content": prompt},
65
- ],
66
- max_tokens=max_tokens,
67
- temperature=temperature,
68
- stream=False,
69
- )
70
- return response.choices[0].message.content
71
- except Exception as e:
72
- return f"Error with Together model {model_name}: {str(e)}"
73
 
74
- def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
75
- """Get response from Hugging Face model"""
76
- try:
77
- headers = {
78
- "Accept": "application/json",
79
- "Authorization": f"Bearer {hf_api_key}",
80
- "Content-Type": "application/json"
81
- }
82
-
83
- # Create messages list for chat template
84
- messages = []
85
- if system_prompt:
86
- messages.append({"role": "system", "content": system_prompt})
87
- messages.append({"role": "user", "content": prompt})
88
-
89
- # Apply chat template
90
- model_id = "prometheus-eval/prometheus-7b-v2.0"
91
- tokenizer = AutoTokenizer.from_pretrained(model_id)
92
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
93
-
94
- payload = {
95
- "inputs": formatted_prompt,
96
- "parameters": {
97
- "max_new_tokens": max_tokens,
98
- "return_full_text": False,
99
- "temperature": temperature
100
- }
101
- }
102
-
103
- response = requests.post(
104
- "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
105
- headers=headers,
106
- json=payload
107
- )
108
- return response.json()[0]["generated_text"]
109
- except Exception as e:
110
- return f"Error with Hugging Face model {model_name}: {str(e)}"
111
 
112
- def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
113
- """Get response from HF endpoint for Atla model"""
114
- try:
115
- headers = {
116
- "Accept": "application/json",
117
- "Authorization": f"Bearer {hf_api_key}",
118
- "Content-Type": "application/json"
119
- }
120
-
121
- # Create messages list for chat template
122
- messages = []
123
- if system_prompt:
124
- messages.append({"role": "system", "content": system_prompt})
125
- messages.append({"role": "user", "content": prompt})
126
-
127
- # Apply chat template
128
- model_id = "meta-llama/Llama-3.1-8B"
129
- tokenizer = AutoTokenizer.from_pretrained(model_id)
130
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
131
-
132
- payload = {
133
- "inputs": formatted_prompt,
134
- "parameters": {
135
- "max_new_tokens": max_tokens,
136
- "return_full_text": False,
137
- "temperature": temperature,
138
- "seed": 42,
139
- "add_generation_prompt": True
140
- }
141
- }
142
-
143
- response = requests.post(
144
- "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
145
- headers=headers,
146
- json=payload
147
- )
148
- return response.json()[0]["generated_text"]
149
- except Exception as e:
150
- return f"Error with Atla model {model_name}: {str(e)}"
151
 
152
- def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
153
- """Get response from Flow Judge"""
154
- try:
155
- response = requests.post(
156
- "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
157
- headers={
158
- "Content-Type": "application/json",
159
- "Authorization": f"Bearer {flow_judge_api_key}"
160
- },
161
- json={
162
- "model": model_name,
163
- "messages": [
164
- {"role": "user", "content": prompt}
165
- ],
166
- "max_tokens": max_tokens,
167
- "temperature": temperature,
168
- "top_p": top_p
169
- }
170
- )
171
- response.raise_for_status()
172
- return response.json()["choices"][0]['message']['content']
173
- except Exception as e:
174
- return f"Error with Flow Judge completions model {model_name}: {str(e)}"
175
 
176
- def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
177
- """Get response from Cohere API"""
 
178
  try:
179
- response = cohere_client.chat(
180
- model=model_name,
181
- messages=[
182
- {"role": "system", "content": system_prompt},
183
- {"role": "user", "content": prompt}
184
- ],
185
- max_tokens=max_tokens,
186
- temperature=temperature
187
- )
188
- # Extract the text from the content items
189
- content_items = response.message.content
190
- if isinstance(content_items, list):
191
- # Get the text from the first content item
192
- return content_items[0].text
193
- return str(content_items) # Fallback if it's not a list
194
- except Exception as e:
195
- return f"Error with Cohere model {model_name}: {str(e)}"
196
-
197
- def get_model_response(
198
- model_name,
199
- model_info,
200
- prompt_data,
201
- use_reference=False,
202
- max_tokens=500,
203
- temperature=0
204
- ):
205
- """Get response from appropriate API based on model organization"""
206
- if not model_info:
207
- return "Model not found or unsupported."
208
-
209
- api_model = model_info["api_model"]
210
- organization = model_info["organization"]
211
-
212
- # Determine if model is Prometheus or Atla or Flow Judge
213
- is_prometheus = (organization == "Prometheus")
214
- is_atla = (organization == "Atla")
215
- is_flow_judge = (organization == "Flow AI")
216
- # For non-Prometheus/Atla models/Flow Judge, use the Judge system prompt
217
- system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
218
-
219
- # Select the appropriate base prompt
220
-
221
- if is_atla:
222
- base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
223
- elif is_flow_judge:
224
- base_prompt = FLOW_JUDGE_PROMPT
225
  else:
226
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
227
 
228
- # For non-Prometheus/non-Atla models, replace the specific instruction
229
- if not (is_prometheus or is_atla or is_flow_judge):
230
- base_prompt = base_prompt.replace(
231
- '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
232
- '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
233
- )
234
 
235
- try:
236
- if not is_flow_judge:
237
- # Format the prompt with the provided data, only using available keys
238
- final_prompt = base_prompt.format(
239
- human_input=prompt_data['human_input'],
240
- ai_response=prompt_data['ai_response'],
241
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
242
- eval_criteria=prompt_data['eval_criteria'],
243
- score1_desc=prompt_data['score1_desc'],
244
- score2_desc=prompt_data['score2_desc'],
245
- score3_desc=prompt_data['score3_desc'],
246
- score4_desc=prompt_data['score4_desc'],
247
- score5_desc=prompt_data['score5_desc']
248
- )
249
 
250
- else:
251
- human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
252
- ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
253
- ground_truth=prompt_data.get('ground_truth_input', '')
254
- if ground_truth:
255
- response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
256
- else:
257
- response_reference = ""
258
- eval_criteria = prompt_data['eval_criteria']
259
- score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
260
- score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
261
- score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
262
- score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
263
- score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
264
- rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
265
- if response_reference:
266
- inputs = human_input + "\n"+ response_reference
267
- else:
268
- inputs = human_input
269
- final_prompt = base_prompt.format(
270
- INPUTS=inputs,
271
- OUTPUT=ai_response,
272
- EVALUATION_CRITERIA=eval_criteria,
273
- RUBRIC=rubric
274
- )
275
 
276
- except KeyError as e:
277
- return f"Error formatting prompt: Missing required field {str(e)}"
278
 
279
- try:
280
- if organization == "OpenAI":
281
- return get_openai_response(
282
- api_model, final_prompt, system_prompt, max_tokens, temperature
283
- )
284
- elif organization == "Anthropic":
285
- return get_anthropic_response(
286
- api_model, final_prompt, system_prompt, max_tokens, temperature
287
- )
288
- elif organization == "Prometheus":
289
- return get_prometheus_response(
290
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
291
- )
292
- elif organization == "Atla":
293
- return get_atla_response(
294
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
295
  )
296
- elif organization == "Cohere":
297
- return get_cohere_response(
298
- api_model, final_prompt, system_prompt, max_tokens, temperature
299
  )
300
- elif organization == "Flow AI":
301
- return get_flow_judge_response(
302
- api_model, final_prompt, max_tokens, temperature
303
  )
304
  else:
305
- # All other organizations use Together API
306
- return get_together_response(
307
- api_model, final_prompt, system_prompt, max_tokens, temperature
308
- )
309
- except Exception as e:
310
- return f"Error with {organization} model {model_name}: {str(e)}"
311
 
312
- def parse_model_response(response):
313
- try:
314
- # Debug print
315
- print(f"Raw model response: {response}")
316
-
317
- # If response is already a dictionary, use it directly
318
- if isinstance(response, dict):
319
- return str(response.get("result", "N/A")), response.get("feedback", "N/A")
320
-
321
- # First try to parse the entire response as JSON
322
- try:
323
- data = json.loads(response)
324
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
325
- except json.JSONDecodeError:
326
- # If that fails (typically for smaller models), try to find JSON within the response
327
- json_match = re.search(r"{.*}", response, re.DOTALL)
328
- if json_match:
329
- data = json.loads(json_match.group(0))
330
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
331
- else:
332
- return "Error", f"Invalid response format returned - here is the raw model response: {response}"
333
 
334
- except Exception as e:
335
- # Debug print for error case
336
- print(f"Failed to parse response: {str(e)}")
337
-
338
- # If the error message itself contains valid JSON, try to parse that
339
- try:
340
- error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
341
- if error_json_match:
342
- data = json.loads(error_json_match.group(0))
343
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
344
- except:
345
- pass
346
-
347
- return "Error", f"Failed to parse response: {response}"
348
-
349
- def prometheus_parse_model_response(output):
350
- try:
351
- print(f"Raw model response: {output}")
352
- output = output.strip()
353
 
354
- # Remove "Feedback:" prefix if present (case insensitive)
355
- output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
356
-
357
- # New pattern to match [RESULT] X at the beginning
358
- begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
359
- begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
360
- if begin_match:
361
- score = int(begin_match.group(1))
362
- feedback = begin_match.group(2).strip()
363
- return str(score), feedback
364
-
365
- # Existing patterns for end-of-string results...
366
- pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
367
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
368
- if match:
369
- feedback = match.group(1).strip()
370
- score = int(match.group(2))
371
- return str(score), feedback
372
-
373
- # If no match, try to match "... Score: X"
374
- pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
375
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
376
- if match:
377
- feedback = match.group(1).strip()
378
- score = int(match.group(2))
379
- return str(score), feedback
380
-
381
- # Pattern to handle [Score X] at the end
382
- pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
383
- match = re.search(pattern, output, re.DOTALL)
384
- if match:
385
- feedback = match.group(1).strip()
386
- score = int(match.group(2))
387
- return str(score), feedback
388
-
389
- # Final fallback attempt
390
- pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
391
- match = re.search(pattern, output)
392
- if match:
393
- score = int(match.group(1))
394
- feedback = output[:match.start()].rstrip()
395
- # Remove any trailing brackets from feedback
396
- feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
397
- return str(score), feedback
398
-
399
- return "Error", f"Failed to parse response: {output}"
400
-
401
- except Exception as e:
402
- print(f"Failed to parse response: {str(e)}")
403
- return "Error", f"Exception during parsing: {str(e)}"
404
-
405
- def atla_parse_model_response(output):
406
- """Parse response from ATLA model"""
407
- try:
408
- print(f"Raw Atla model response: {output}")
409
- output = output.strip()
410
 
411
- # Look for the Reasoning and Result sections
412
- reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
413
- result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
414
 
415
- if reasoning_match and result_match:
416
- feedback = reasoning_match.group(1).strip()
417
- score = result_match.group(1)
418
- return str(score), feedback
419
 
420
- return "Error", f"Failed to parse ATLA response format: {output}"
421
 
422
- except Exception as e:
423
- print(f"Failed to parse ATLA response: {str(e)}")
424
- return "Error", f"Exception during parsing: {str(e)}"
425
-
426
- def flow_judge_parse_model_response(output):
427
- try:
428
- print(f"Raw model response: {output}")
429
- # Convert multiple line breaks to single ones and strip whitespace
430
- output = re.sub(r'\n{2,}', '\n', output.strip())
431
-
432
- # Compile regex patterns
433
- feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
434
- score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
435
 
436
- feedback_match = feedback_pattern.search(output)
437
- score_match = score_pattern.search(output)
438
 
439
- if feedback_match or not score_match:
440
- feedback = feedback_match.group(1).strip()
441
- score = int(score_match.group(1).strip())
442
- return str(score), feedback
443
-
444
- return "Error", f"Failed to parse response: {output}"
445
 
446
- except Exception as e:
447
- print(f"Failed to parse response: {str(e)}")
448
- return "Error", f"Exception during parsing: {str(e)}"
1
  import json
2
  import re
3
+ import random
4
+ from collections import defaultdict
5
+ from datetime import datetime
6
+ import hashlib
7
+ import gradio as gr
8
+
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+
12
+ from gen_api_answer import (
13
+ get_model_response,
14
+ parse_model_response,
15
+ prometheus_parse_model_response,
16
+ atla_parse_model_response,
17
+ flow_judge_parse_model_response
18
+ )
19
+
20
+ from random_sample_generation import (
21
+ get_random_human_ai_pair,
22
+ get_random_human_ai_ground_truth_pair,
23
+ generate_ai_response
24
+ )
25
+ from db import add_vote, create_db_connection, get_votes
26
+
27
+ from utils import Vote
28
+
29
+ from common import (
30
+ POLICY_CONTENT,
31
+ ACKNOWLEDGEMENTS,
32
+ CSS_STYLES,
33
+ MAIN_TITLE,
34
+ HOW_IT_WORKS,
35
+ )
36
  from prompts import (
37
+ DEFAULT_EVAL_PROMPT,
38
+ DEFAULT_EVAL_PROMPT_EDITABLE,
39
+ FIXED_EVAL_SUFFIX,
40
+ DEFAULT_EVAL_CRITERIA,
41
+ DEFAULT_SCORE_1,
42
+ DEFAULT_SCORE_2,
43
+ DEFAULT_SCORE_3,
44
+ DEFAULT_SCORE_4,
45
+ DEFAULT_SCORE_5,
46
+ )
47
+ from leaderboard import (
48
+ get_leaderboard,
49
+ get_leaderboard_stats,
50
+ get_model_rankings,
51
+ DEFAULT_ELO,
52
+ K_FACTOR
53
  )
54
 
55
 
56
+ elo_scores = defaultdict(lambda: DEFAULT_ELO)
57
+ vote_counts = defaultdict(int)
58
 
59
+ db = create_db_connection()
60
+ votes_collection = get_votes(db)
61
 
62
+ current_time = datetime.now()
63
 
64
 
65
+ # Load the model_data from JSONL
66
+ def load_model_data():
67
+ model_data = {}
68
  try:
69
+ with open("data/models.jsonl", "r") as f:
70
+ for line in f:
71
+ model = json.loads(line)
72
+ model_data[model["name"]] = {
73
+ "organization": model["organization"],
74
+ "license": model["license"],
75
+ "api_model": model["api_model"],
76
+ }
77
+ except FileNotFoundError:
78
+ print("Warning: models.jsonl not found")
79
+ return {}
80
+ return model_data
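load_model_data expects one JSON object per line in data/models.jsonl with name, organization, license, and api_model fields; an illustrative record with hypothetical values (the real file's entries may differ):

```python
import json

# Hypothetical example of a single data/models.jsonl line.
example_line = '{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}'

model = json.loads(example_line)
entry = {model["name"]: {
    "organization": model["organization"],
    "license": model["license"],
    "api_model": model["api_model"],
}}
print(entry)
```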
81
+
82
+
83
+ model_data = load_model_data()
84
+
85
+ def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
86
+ prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
87
+
88
+ vote = Vote(
89
+ timestamp=datetime.now().isoformat(),
90
+ prompt=prompt_value,
91
+ response_a=response_a,
92
+ response_b=response_b,
93
+ model_a=model_a,
94
+ model_b=model_b,
95
+ winner=winner,
96
+ judge_id=judge_id,
97
+ )
98
+ add_vote(vote, db)
99
+
100
+
101
+ def parse_variables(prompt):
102
+ # Extract variables enclosed in double curly braces
103
+ variables = re.findall(r"{{(.*?)}}", prompt)
104
+ # Remove duplicates while preserving order
105
+ seen = set()
106
+ variables = [
107
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
108
+ ]
109
+ return variables
110
+
111
+
112
+ def get_final_prompt(eval_prompt, variable_values):
113
+ # Replace variables in the eval prompt with their values
114
+ for var, val in variable_values.items():
115
+ eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
116
+ return eval_prompt
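Together, parse_variables and get_final_prompt implement simple {{variable}} templating; a usage sketch (the two helpers are copied verbatim so the snippet runs standalone):

```python
import re

def parse_variables(prompt):
    variables = re.findall(r"{{(.*?)}}", prompt)
    seen = set()
    return [x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))]

def get_final_prompt(eval_prompt, variable_values):
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt

template = "Evaluate the answer to {{question}} given {{answer}}."
print(parse_variables(template))                                   # ['question', 'answer']
print(get_final_prompt(template, {"question": "2+2", "answer": "4"}))
```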
117
+
118
+
119
+
120
+ def get_ip(request: gr.Request) -> str:
121
+ """Get and hash the IP address from the request."""
122
+ if "cf-connecting-ip" in request.headers:
123
+ ip = request.headers["cf-connecting-ip"]
124
+ elif "x-forwarded-for" in request.headers:
125
+ ip = request.headers["x-forwarded-for"]
126
+ if "," in ip:
127
+ ip = ip.split(",")[0]
128
  else:
129
+ ip = request.client.host
130
 
131
+ # Hash the IP address for privacy
132
+ return hashlib.sha256(ip.encode()).hexdigest()[:16]
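The judge_id is simply the first 16 hex characters of a SHA-256 digest over the raw IP; a quick standalone illustration using a documentation-range address:

```python
import hashlib

ip = "203.0.113.7"  # example address, not a real visitor
judge_id = hashlib.sha256(ip.encode()).hexdigest()[:16]
print(judge_id)  # deterministic 16-character hex string for this IP
```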
133
 
134
 
135
+ def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
136
+ """Generate appropriate message based on vote and model rankings.
137
+ Returns (title, message) tuple."""
138
+ # Get current rankings
139
+ voting_data = get_current_votes()
140
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
141
+ rankings = get_model_rankings(leaderboard)
142
+ pos_a = rankings.get(model_a, 0)
143
+ pos_b = rankings.get(model_b, 0)
144
+
145
+ if choice == "Tie":
146
+ return "It's a tie!", "Keep voting responsibly 🤗"
147
+
148
+ # Check if vote aligns with leaderboard
149
+ if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
150
+ return "The favourite wins!", "Keep voting responsibly 🤗"
151
+ else:
152
+ return "The underdog wins!", "Keep voting responsibly ���"
153
+
154
+
155
+ def vote(
156
+ choice,
157
+ model_a,
158
+ model_b,
159
+ final_prompt,
160
+ score_a,
161
+ critique_a,
162
+ score_b,
163
+ critique_b,
164
+ request: gr.Request,
165
+ ):
166
+ # Get hashed IP as judge_id
167
+ judge_id = get_ip(request)
168
+
169
+ # Update ELO scores based on user choice
170
+ elo_a = elo_scores[model_a]
171
+ elo_b = elo_scores[model_b]
172
+
173
+ # Calculate expected scores
174
+ Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
175
+ Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
176
+
177
+ # Assign actual scores
178
+ if choice == "A":
179
+ Sa, Sb = 1, 0
180
+ elif choice == "B":
181
+ Sa, Sb = 0, 1
182
+ else:
183
+ Sa, Sb = 0.5, 0.5
184
+
185
+ # Update scores and vote counts
186
+ elo_scores[model_a] += K_FACTOR * (Sa - Ea)
187
+ elo_scores[model_b] += K_FACTOR * (Sb - Eb)
188
+ vote_counts[model_a] += 1
189
+ vote_counts[model_b] += 1
190
+
191
+ # Format the full responses with score and critique
192
+ response_a = f"""{score_a}
193
+
194
+ {critique_a}"""
195
+
196
+ response_b = f"""{score_b}
197
+
198
+ {critique_b}"""
199
+
200
+ # Store the vote data with the final prompt
201
+ store_vote_data(
202
+ final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
203
+ )
204
+
205
+ # Get model positions for display
206
+ voting_data = get_current_votes()
207
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
208
+ rankings = get_model_rankings(leaderboard)
209
+ pos_a = rankings.get(model_a, 0)
210
+ pos_b = rankings.get(model_b, 0)
211
+
212
+ # Format model names with positions and win/loss indicators
213
+ if choice == "Tie":
214
+ model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
215
+ model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
216
+ else:
217
+ winner = model_a if choice == "A" else model_b
218
+ loser = model_b if choice == "A" else model_a
219
+ winner_pos = pos_a if choice == "A" else pos_b
220
+ loser_pos = pos_b if choice == "A" else pos_a
221
 
222
+ model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
223
+ model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
224
+
225
+ # Generate vote message
226
+ title, message = get_vote_message(choice, model_a, model_b)
227
+
228
+ return [
229
+ gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
230
+ gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
231
+ gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
232
+ gr.update(value=model_a_display), # model_name_a
233
+ gr.update(value=model_b_display), # model_name_b
234
+ gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
235
+ gr.update(value="🎲 New round", variant="primary"), # random_btn
236
+ gr.Info(message, title=title), # success message
237
+ ]
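The vote handler applies a standard Elo update; a worked example with illustrative numbers only (DEFAULT_ELO and K_FACTOR are actually imported from leaderboard.py):

```python
# Illustrative ratings and K; the real constants live in leaderboard.py.
elo_a, elo_b, k = 1200.0, 1000.0, 32

# Expected scores from the logistic Elo formula used above
Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))   # ~0.76
Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))   # ~0.24

# Suppose the lower-rated model B wins the vote
Sa, Sb = 0, 1
print(round(elo_a + k * (Sa - Ea), 1))  # ~1175.7 (A loses more for an upset)
print(round(elo_b + k * (Sb - Eb), 1))  # ~1024.3
```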
238
 
239
+
240
+ def get_current_votes():
241
+ """Get current votes from database."""
242
+ return get_votes(db)
243
+
244
+
245
+ # Update the refresh_leaderboard function
246
+ def refresh_leaderboard(show_preliminary):
247
+ """Refresh the leaderboard data and stats."""
248
+ voting_data = get_current_votes()
249
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
250
+ data = [
251
+ [
252
+ entry["Model"],
253
+ float(entry["ELO Score"]),
254
+ entry["95% CI"],
255
+ entry["# Votes"],
256
+ entry["Organization"],
257
+ entry["License"],
258
+ ]
259
+ for entry in leaderboard
260
+ ]
261
+ stats = get_leaderboard_stats(model_data, voting_data)
262
+ return [gr.update(value=data), gr.update(value=stats)]
263
+
264
+
265
+ # Update the leaderboard table definition in the UI
266
+ leaderboard_table = gr.Dataframe(
267
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
268
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
269
+ )
270
+
271
+
272
+ def populate_random_example(request: gr.Request, compatible_mode: bool):
273
+ """Generate a random human-AI conversation example and reset judge outputs."""
274
+ if compatible_mode:
275
+ # Generate all three components when compatible mode is enabled
276
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
277
+ else:
278
+ # Generate only human and AI messages when compatible mode is disabled
279
+ human_msg, ai_msg = get_random_human_ai_pair()
280
+ ground_truth_msg = ""
281
+
282
+ return [
283
+ gr.update(value=human_msg),
284
+ gr.update(value=ai_msg),
285
+ gr.update(value="🎲", variant="secondary"), # Reset random button appearance
286
+ gr.update(value=""), # Clear score A
287
+ gr.update(value=""), # Clear critique A
288
+ gr.update(value=""), # Clear score B
289
+ gr.update(value=""), # Clear critique B
290
+ gr.update(interactive=False, variant="primary"), # Reset vote A
291
+ gr.update(interactive=False, variant="primary"), # Reset vote B
292
+ gr.update(interactive=False, variant="primary"), # Reset vote tie
293
+ gr.update(value="*Model: Hidden*"), # Reset model name A
294
+ gr.update(value="*Model: Hidden*"), # Reset model name B
295
+ gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
296
+ ]
297
+
298
+
299
+ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
300
+ gr.Markdown(MAIN_TITLE)
301
+ gr.Markdown(HOW_IT_WORKS)
302
+
303
+ # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
304
+ eval_prompt = gr.Textbox(
305
+ value=DEFAULT_EVAL_PROMPT,
306
+ visible=False
307
+ )
308
+
309
+ with gr.Tabs():
310
+ with gr.TabItem("Judge Arena"):
311
+ with gr.Row():
312
+ # Left side - Input section
313
+ with gr.Column(scale=1):
314
+ with gr.Group():
315
+ human_input = gr.TextArea(
316
+ label="👩 User Input",
317
+ lines=10,
318
+ placeholder="Enter the human message here..."
319
+ )
320
+ with gr.Row():
321
+ generate_btn = gr.Button(
322
+ "Generate AI Response",
323
+ size="sm",
324
+ interactive=False
325
+ )
326
+
327
+ ai_response = gr.TextArea(
328
+ label="🤖 AI Response",
329
+ lines=15,
330
+ placeholder="Enter the AI response here..."
331
+ )
332
+
333
+ # Ground truth response (initially hidden)
334
+ ground_truth = gr.TextArea(
335
+ label="🎯 Ground truth response",
336
+ lines=12,
337
+ placeholder="Enter the ground truth response here...",
338
+ visible=False
339
+ )
340
+
341
+ with gr.Row():
342
+ random_btn = gr.Button("🎲", scale=2)
343
+ send_btn = gr.Button(
344
+ value="Run judges",
345
+ variant="primary",
346
+ size="lg",
347
+ scale=8
348
+ )
349
+
350
+ # Right side - Model outputs
351
+ with gr.Column(scale=1):
352
+ gr.Markdown("### 👩‍⚖️ Judge A")
353
+ with gr.Group():
354
+ model_name_a = gr.Markdown("*Model: Hidden*")
355
+ with gr.Row():
356
+ with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
357
+ score_a = gr.Textbox(label="Score", lines=6, interactive=False)
358
+ vote_a = gr.Button("Vote A", variant="primary", interactive=False)
359
+ with gr.Column(scale=9, min_width=400): # Wider width for critique
360
+ critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
361
+
362
+ # Tie button row
363
+ with gr.Row() as tie_button_row:
364
+ with gr.Column():
365
+ vote_tie = gr.Button("Tie", variant="primary", interactive=False)
366
+
367
+
368
+ gr.Markdown("### 🧑‍⚖️ Judge B")
369
+ with gr.Group():
370
+ model_name_b = gr.Markdown("*Model: Hidden*")
371
+ with gr.Row():
372
+ with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
373
+ score_b = gr.Textbox(label="Score", lines=6, interactive=False)
374
+ vote_b = gr.Button("Vote B", variant="primary", interactive=False)
375
+ with gr.Column(scale=9, min_width=400): # Wider width for critique
376
+ critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
377
+ # Place Vote B button directly under Judge B
378
+
379
+ gr.Markdown("<br>")
380
+
381
+
382
+ # Replace the "Edit Judge Prompt" Accordion section with:
383
+ with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
384
+ gr.Markdown("<br>")
385
+ use_reference_toggle = gr.Checkbox(
386
+ label="Use a reference response",
387
+ value=False
388
+ )
389
+
390
+ # Hide the default prompt editor
391
+ with gr.Column(visible=False) as default_prompt_editor:
392
+ eval_prompt_editable = gr.TextArea(
393
+ value=DEFAULT_EVAL_PROMPT_EDITABLE,
394
+ label="Evaluation Criteria",
395
+ lines=12
396
+ )
397
+
398
+ with gr.Row(visible=False) as edit_buttons_row:
399
+ cancel_prompt_btn = gr.Button("Cancel")
400
+ save_prompt_btn = gr.Button("Save", variant="primary")
401
+ gr.Markdown("*The sample being evaluated is always appended as:*")
402
+ gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
403
+
404
+ # Show the compatible mode editor
405
+ with gr.Column(visible=True) as compatible_prompt_editor:
406
+ with gr.Row():
407
+ # Left column - Evaluation Criteria
408
+ with gr.Column(scale=1):
409
+ eval_criteria_text = gr.TextArea(
410
+ label="Evaluation Criteria",
411
+ lines=12,
412
+ value=DEFAULT_EVAL_CRITERIA,
413
+ placeholder="Enter the evaluation criteria..."
414
+ )
415
+ prometheus_reference = gr.Markdown(
416
+ "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
417
+ visible=True
418
+ )
419
+
420
+ # Right column - Score Descriptions
421
+ with gr.Column(scale=1):
422
+ score1_description = gr.TextArea(
423
+ label="Score 1",
424
+ value=DEFAULT_SCORE_1,
425
+ placeholder="Description for score 1",
426
+ lines=2
427
+ )
428
+ score2_description = gr.TextArea(
429
+ label="Score 2",
430
+ value=DEFAULT_SCORE_2,
431
+ placeholder="Description for score 2",
432
+ lines=2
433
+ )
434
+ score3_description = gr.TextArea(
435
+ label="Score 3",
436
+ value=DEFAULT_SCORE_3,
437
+ placeholder="Description for score 3",
438
+ lines=2
439
+ )
440
+ score4_description = gr.TextArea(
441
+ label="Score 4",
442
+ value=DEFAULT_SCORE_4,
443
+ placeholder="Description for score 4",
444
+ lines=2
445
+ )
446
+ score5_description = gr.TextArea(
447
+ label="Score 5",
448
+ value=DEFAULT_SCORE_5,
449
+ placeholder="Description for score 5",
450
+ lines=2
451
+ )
452
+
453
+ # Add save/cancel buttons for compatible mode
454
+ with gr.Row(visible=False) as compatible_edit_buttons_row:
455
+ compatible_cancel_btn = gr.Button("Cancel")
456
+ compatible_save_btn = gr.Button("Save", variant="primary")
457
+
458
+ with gr.TabItem("Leaderboard"):
459
+ with gr.Row():
460
+ with gr.Column(scale=1):
461
+ show_preliminary = gr.Checkbox(
462
+ label="Reveal preliminary results",
463
+ value=True, # Checked by default
464
+ info="Show all models, including models with less human ratings (< 300 votes)",
465
+ interactive=True
466
+ )
467
+ stats_display = gr.Markdown()
468
+ leaderboard_table = gr.Dataframe(
469
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
470
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
471
  )
472
+
473
+ gr.Markdown("""<br>
474
+ <br>
475
+ Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
476
+
477
+ [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
478
+ """)
479
+
480
+ # Add change handler for checkbox
481
+ show_preliminary.change(
482
+ fn=refresh_leaderboard,
483
+ inputs=[show_preliminary],
484
+ outputs=[leaderboard_table, stats_display]
485
  )
486
+
487
+ # Update the load event
488
+ demo.load(
489
+ fn=refresh_leaderboard,
490
+ inputs=[show_preliminary],
491
+ outputs=[leaderboard_table, stats_display]
492
  )
493
+
494
+ with gr.TabItem("Policy"):
495
+ gr.Markdown(POLICY_CONTENT)
496
+ gr.Markdown(ACKNOWLEDGEMENTS)
497
+
498
+ # Define state variables for model tracking
499
+ model_a_state = gr.State()
500
+ model_b_state = gr.State()
501
+ final_prompt_state = gr.State()
502
+ eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
503
+ is_editing = gr.State(False) # Track editing state
504
+ compatible_mode_state = gr.State(False) # Track compatible mode state
505
+
506
+ # Update model names after responses are generated
507
+ def update_model_names(model_a, model_b):
508
+ return gr.update(value=f"*Model: {model_a}*"), gr.update(
509
+ value=f"*Model: {model_b}*"
510
+ )
511
+
512
+ # Store the last submitted prompt and variables for comparison
513
+ last_submission = gr.State({})
514
+
515
+ # Update the vote button click handlers
516
+ vote_a.click(
517
+ fn=vote,
518
+ inputs=[
519
+ gr.State("A"),
520
+ model_a_state,
521
+ model_b_state,
522
+ final_prompt_state,
523
+ score_a,
524
+ critique_a,
525
+ score_b,
526
+ critique_b,
527
+ ],
528
+ outputs=[
529
+ vote_a,
530
+ vote_b,
531
+ vote_tie,
532
+ model_name_a,
533
+ model_name_b,
534
+ send_btn,
535
+ random_btn,
536
+ gr.State(), # placeholder for success message
537
+ ],
538
+ )
539
+
540
+ vote_b.click(
541
+ fn=vote,
542
+ inputs=[
543
+ gr.State("B"),
544
+ model_a_state,
545
+ model_b_state,
546
+ final_prompt_state,
547
+ score_a,
548
+ critique_a,
549
+ score_b,
550
+ critique_b,
551
+ ],
552
+ outputs=[
553
+ vote_a,
554
+ vote_b,
555
+ vote_tie,
556
+ model_name_a,
557
+ model_name_b,
558
+ send_btn,
559
+ random_btn,
560
+ gr.State(), # placeholder for success message
561
+ ],
562
+ )
563
+
564
+ vote_tie.click(
565
+ fn=vote,
566
+ inputs=[
567
+ gr.State("Tie"),
568
+ model_a_state,
569
+ model_b_state,
570
+ final_prompt_state,
571
+ score_a,
572
+ critique_a,
573
+ score_b,
574
+ critique_b,
575
+ ],
576
+ outputs=[
577
+ vote_a,
578
+ vote_b,
579
+ vote_tie,
580
+ model_name_a,
581
+ model_name_b,
582
+ send_btn,
583
+ random_btn,
584
+ gr.State(), # placeholder for success message
585
+ ],
586
+ )
587
+
588
+ # Add handlers for save/cancel buttons
589
+ def save_prompt(new_prompt, previous_prompt):
590
+ return [
591
+ gr.update(value=new_prompt), # Update the prompt
592
+ new_prompt, # Update the previous prompt state
593
+ gr.update(visible=False) # Hide the buttons
594
+ ]
595
+
596
+ def cancel_prompt(previous_prompt):
597
+ return [
598
+ gr.update(value=previous_prompt), # Revert to previous prompt
599
+ previous_prompt, # Keep the previous prompt state
600
+ gr.update(visible=False) # Hide the buttons
601
+ ]
602
+
603
+ def show_edit_buttons(current_value, previous_value):
604
+ # Show buttons only if the current value differs from the previous value
605
+ return gr.update(visible=current_value != previous_value)
606
+
607
+ # Add handlers for save/cancel buttons and prompt changes
608
+ save_prompt_btn.click(
609
+ fn=save_prompt,
610
+ inputs=[eval_prompt_editable, eval_prompt_previous],
611
+ outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
612
+ )
613
+
614
+ cancel_prompt_btn.click(
615
+ fn=cancel_prompt,
616
+ inputs=[eval_prompt_previous],
617
+ outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
618
+ )
619
+
620
+ eval_prompt_editable.change(
621
+ fn=show_edit_buttons,
622
+ inputs=[eval_prompt_editable, eval_prompt_previous],
623
+ outputs=edit_buttons_row
624
+ )
625
+
626
+ # Function to toggle visibility based on compatible mode
627
+ def toggle_use_reference(checked):
628
+ if checked:
629
+ # Get new random samples with ground truth when enabling reference mode
630
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
631
+ return {
632
+ ground_truth: gr.update(visible=True, value=ground_truth_msg),
633
+ human_input: gr.update(value=human_msg),
634
+ ai_response: gr.update(value=ai_msg),
635
+ # Reset other UI elements
636
+ score_a: gr.update(value=""),
637
+ critique_a: gr.update(value=""),
638
+ score_b: gr.update(value=""),
639
+ critique_b: gr.update(value=""),
640
+ vote_a: gr.update(interactive=False, variant="primary"),
641
+ vote_b: gr.update(interactive=False, variant="primary"),
642
+ vote_tie: gr.update(interactive=False, variant="primary"),
643
+ model_name_a: gr.update(value="*Model: Hidden*"),
644
+ model_name_b: gr.update(value="*Model: Hidden*"),
645
+ random_btn: gr.update(value="🎲", variant="secondary"),
646
+ }
647
  else:
648
+ # Just hide ground truth when disabling reference mode
649
+ return {
650
+ ground_truth: gr.update(visible=False)
651
+ }
 
 
652
 
653
+ # Update the change handler to include all necessary outputs
654
+ use_reference_toggle.change(
655
+ fn=toggle_use_reference,
656
+ inputs=[use_reference_toggle],
657
+ outputs=[
658
+ ground_truth,
659
+ human_input,
660
+ ai_response,
661
+ score_a,
662
+ critique_a,
663
+ score_b,
664
+ critique_b,
665
+ vote_a,
666
+ vote_b,
667
+ vote_tie,
668
+ model_name_a,
669
+ model_name_b,
670
+ random_btn,
671
+ ]
672
+ )
 
673
 
674
+ # Add a new state variable to track first game
675
+ first_game_state = gr.State(True) # Initialize as True
676
 
677
+ # Update the submit function to use the state variable
678
+ def submit_and_store(
679
+ use_reference,
680
+ eval_criteria_text_input,
681
+ human_input,
682
+ ai_response,
683
+ ground_truth_input,
684
+ score1_description,
685
+ score2_description,
686
+ score3_description,
687
+ score4_description,
688
+ score5_description,
689
+ is_first_game, # Add state variable as input
690
+ ):
691
+ # Build prompt data dictionary
692
+ prompt_data = {
693
+ 'human_input': human_input,
694
+ 'ai_response': ai_response,
695
+ 'ground_truth_input': ground_truth_input,
696
+ 'eval_criteria': eval_criteria_text_input,
697
+ 'score1_desc': score1_description,
698
+ 'score2_desc': score2_description,
699
+ 'score3_desc': score3_description,
700
+ 'score4_desc': score4_description,
701
+ 'score5_desc': score5_description,
702
+ }
703
+
704
+ # Get list of active models only for matches
705
+ active_models = [name for name, info in model_data.items()
706
+ if info.get("active", True)]
707
 
708
+ atla_model = "Atla-8B-preview"
 
 
709
 
710
+ if is_first_game:
711
+ # For the first game, ensure new model is one of the models to catch up on votes
712
+ other_models = [m for m in active_models if m != atla_model]
713
+ other_model = random.choice(other_models)
714
 
715
+ # Randomly assign new model to either position A or B
716
+ if random.random() < 0.5:
717
+ model_a, model_b = atla_model, other_model
718
+ else:
719
+ model_a, model_b = other_model, atla_model
720
+ else:
721
+ # For subsequent games, a new model appears 40% of the time
722
+ if random.random() < 0.4:
723
+ # Randomly choose between new models
724
+ new_model = random.choice(["Atla-8B-preview"]) # add "Flow-Judge-1.0" once ready
725
+ other_models = [m for m in active_models if m not in [new_model]]
726
+ other_model = random.choice(other_models)
727
+
728
+ if random.random() < 0.5:
729
+ model_a, model_b = new_model, other_model
730
+ else:
731
+ model_a, model_b = other_model, new_model
732
+ else:
733
+ # For other cases, exclude both Atla and Flow-Judge
734
+ non_special_models = [m for m in active_models if m not in new_model]
735
+ model1, model2 = random.sample(non_special_models, 2)
736
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
737
 
738
+ # Get responses from models
739
+ response_a = get_model_response(
740
+ model_a,
741
+ model_data.get(model_a),
742
+ prompt_data,
743
+ use_reference=use_reference
744
+ )
745
+ response_b = get_model_response(
746
+ model_b,
747
+ model_data.get(model_b),
748
+ prompt_data,
749
+ use_reference=use_reference
750
+ )
751
 
752
+ # Parse the responses based on model, using appropriate parsing for different models
753
+ is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
754
+ is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
755
+ is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
756
+ is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
757
+ is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
758
+ is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
759
 
760
+ if is_prometheus_a:
761
+ score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
762
+ score_a_val = f"{score_a_val} / 5"
763
+ elif is_atla_a:
764
+ score_a_val, critique_a_val = atla_parse_model_response(response_a)
765
+ score_a_val = f"{score_a_val} / 5"
766
+ elif is_flow_judge_a:
767
+ score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
768
+ score_a_val = f"{score_a_val} / 5"
769
+ else:
770
+ score_a_val, critique_a_val = parse_model_response(response_a)
771
+ score_a_val = f"{score_a_val} / 5"
772
+
773
+ if is_prometheus_b:
774
+ score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
775
+ score_b_val = f"{score_b_val} / 5"
776
+ elif is_atla_b:
777
+ score_b_val, critique_b_val = atla_parse_model_response(response_b)
778
+ score_b_val = f"{score_b_val} / 5"
779
+ elif is_flow_judge_b:
780
+ score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
781
+ score_b_val = f"{score_b_val} / 5"
782
+ else:
783
+ score_b_val, critique_b_val = parse_model_response(response_b)
784
+ score_b_val = f"{score_b_val} / 5"
785
+
786
+ return (
787
+ score_a_val,
788
+ critique_a_val,
789
+ score_b_val,
790
+ critique_b_val,
791
+ gr.update(interactive=True, variant="primary"), # vote_a
792
+ gr.update(interactive=True, variant="primary"), # vote_b
793
+ gr.update(interactive=True, variant="primary"), # vote_tie
794
+ model_a,
795
+ model_b,
796
+ eval_prompt,
797
+ gr.update(value="*Model: Hidden*"),
798
+ gr.update(value="*Model: Hidden*"),
799
+ gr.update(value="Regenerate judges", variant="secondary", interactive=True),
800
+ gr.update(value="🎲"), # random_btn
801
+ False, # Set first_game_state to False after first submission
802
+ )
803
+
804
+ # Update the click handler to use False for is_first_game after first submission
805
+ def create_submit_handler():
806
+ first_game = True
807
+
808
+ def handler(*args):
809
+ nonlocal first_game
810
+ result = submit_and_store(*args, first_game)
811
+ first_game = False # Set to False after first submission
812
+ return result
813
 
814
+ return handler
815
+
816
+ # Update the send_btn click handler
817
+ send_btn.click(
818
+ fn=submit_and_store,
819
+ inputs=[
820
+ use_reference_toggle,
821
+ eval_criteria_text,
822
+ human_input,
823
+ ai_response,
824
+ ground_truth,
825
+ score1_description,
826
+ score2_description,
827
+ score3_description,
828
+ score4_description,
829
+ score5_description,
830
+ first_game_state, # Add first_game_state as input
831
+ ],
832
+ outputs=[
833
+ score_a,
834
+ critique_a,
835
+ score_b,
836
+ critique_b,
837
+ vote_a,
838
+ vote_b,
839
+ vote_tie,
840
+ model_a_state,
841
+ model_b_state,
842
+ final_prompt_state,
843
+ model_name_a,
844
+ model_name_b,
845
+ send_btn,
846
+ random_btn,
847
+ first_game_state, # Add first_game_state as output
848
+ ],
849
+ )
850
+
851
+ # Add random button handler
852
+ random_btn.click(
853
+ fn=populate_random_example,
854
+ inputs=[use_reference_toggle], # Use compatible mode toggle to decide behavior
855
+ outputs=[
856
+ human_input,
857
+ ai_response,
858
+ random_btn,
859
+ score_a,
860
+ critique_a,
861
+ score_b,
862
+ critique_b,
863
+ vote_a,
864
+ vote_b,
865
+ vote_tie,
866
+ model_name_a,
867
+ model_name_b,
868
+ ground_truth, # Set ground truth
869
+ ]
870
+ )
871
+
872
+ # Add new input change handlers
873
+ def handle_input_change():
874
+ """Reset UI state when inputs are changed"""
875
+ return [
876
+ gr.update(interactive=False), # vote_a
877
+ gr.update(interactive=False), # vote_b
878
+ gr.update(interactive=False), # vote_tie
879
+ gr.update(value="Run judges", variant="primary"), # send_btn
880
+ gr.update(value="🎲", variant="secondary"), # random_btn
881
+ ]
882
+
883
+ # Update the change handlers for inputs
884
+ human_input.change(
885
+ fn=handle_input_change,
886
+ inputs=[],
887
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
888
+ )
889
+
890
+ ai_response.change(
891
+ fn=handle_input_change,
892
+ inputs=[],
893
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
894
+ )
895
+
896
+ generate_btn.click(
897
+ fn=lambda msg: (
898
+ generate_ai_response(msg)[0], # Only take the response text
899
+ gr.update(
900
+ value="Generate AI Response", # Keep the label
901
+ interactive=False # Disable the button
902
+ )
903
+ ),
904
+ inputs=[human_input],
905
+ outputs=[ai_response, generate_btn]
906
+ )
907
+
908
+ human_input.change(
909
+ fn=lambda x: gr.update(interactive=bool(x.strip())),
910
+ inputs=[human_input],
911
+ outputs=[generate_btn]
912
+ )
913
+
914
+ # Update the demo.load to include the random example population
915
+ demo.load(
916
+ fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
917
+ inputs=[],
918
+ outputs=[
919
+ human_input,
920
+ ai_response,
921
+ random_btn,
922
+ score_a,
923
+ critique_a,
924
+ score_b,
925
+ critique_b,
926
+ vote_a,
927
+ vote_b,
928
+ vote_tie,
929
+ model_name_a,
930
+ model_name_b,
931
+ ground_truth,
932
+ ]
933
+ )
934
+
935
+ # Add new state variables for compatible mode
936
+ eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
937
+ score1_previous = gr.State(value=DEFAULT_SCORE_1)
938
+ score2_previous = gr.State(value=DEFAULT_SCORE_2)
939
+ score3_previous = gr.State(value=DEFAULT_SCORE_3)
940
+ score4_previous = gr.State(value=DEFAULT_SCORE_4)
941
+ score5_previous = gr.State(value=DEFAULT_SCORE_5)
942
+
943
+ # Add new functions to handle compatible mode saves/cancels
944
+ def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
945
+ return [
946
+ gr.update(value=criteria), # Update criteria
947
+ criteria, # Update previous criteria state
948
+ gr.update(value=score1),
949
+ score1,
950
+ gr.update(value=score2),
951
+ score2,
952
+ gr.update(value=score3),
953
+ score3,
954
+ gr.update(value=score4),
955
+ score4,
956
+ gr.update(value=score5),
957
+ score5,
958
+ gr.update(visible=False) # Hide buttons
959
+ ]
960
+
961
+ def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
962
+ return [
963
+ gr.update(value=prev_criteria),
964
+ prev_criteria,
965
+ gr.update(value=prev_score1),
966
+ prev_score1,
967
+ gr.update(value=prev_score2),
968
+ prev_score2,
969
+ gr.update(value=prev_score3),
970
+ prev_score3,
971
+ gr.update(value=prev_score4),
972
+ prev_score4,
973
+ gr.update(value=prev_score5),
974
+ prev_score5,
975
+ gr.update(visible=False)
976
+ ]
977
+
978
+ def show_compatible_edit_buttons(*current_values):
979
+ previous_values = current_values[1::2] # Get previous values
980
+ current_values = current_values[::2] # Get current values
981
+ return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
982
+
983
+ # Add click handlers for compatible mode buttons
984
+ compatible_save_btn.click(
985
+ fn=save_compatible_prompt,
986
+ inputs=[
987
+ eval_criteria_text,
988
+ score1_description,
989
+ score2_description,
990
+ score3_description,
991
+ score4_description,
992
+ score5_description
993
+ ],
994
+ outputs=[
995
+ eval_criteria_text,
996
+ eval_criteria_previous,
997
+ score1_description,
998
+ score1_previous,
999
+ score2_description,
1000
+ score2_previous,
1001
+ score3_description,
1002
+ score3_previous,
1003
+ score4_description,
1004
+ score4_previous,
1005
+ score5_description,
1006
+ score5_previous,
1007
+ compatible_edit_buttons_row
1008
+ ]
1009
+ )
1010
+
1011
+ compatible_cancel_btn.click(
1012
+ fn=cancel_compatible_prompt,
1013
+ inputs=[
1014
+ eval_criteria_previous,
1015
+ score1_previous,
1016
+ score2_previous,
1017
+ score3_previous,
1018
+ score4_previous,
1019
+ score5_previous
1020
+ ],
1021
+ outputs=[
1022
+ eval_criteria_text,
1023
+ eval_criteria_previous,
1024
+ score1_description,
1025
+ score1_previous,
1026
+ score2_description,
1027
+ score2_previous,
1028
+ score3_description,
1029
+ score3_previous,
1030
+ score4_description,
1031
+ score4_previous,
1032
+ score5_description,
1033
+ score5_previous,
1034
+ compatible_edit_buttons_row
1035
+ ]
1036
+ )
1037
+
1038
+ # Add change handlers for all compatible mode inputs
1039
+ for component in [eval_criteria_text, score1_description, score2_description,
1040
+ score3_description, score4_description, score5_description]:
1041
+ component.change(
1042
+ fn=show_compatible_edit_buttons,
1043
+ inputs=[
1044
+ eval_criteria_text,
1045
+ eval_criteria_previous,
1046
+ score1_description,
1047
+ score1_previous,
1048
+ score2_description,
1049
+ score2_previous,
1050
+ score3_description,
1051
+ score3_previous,
1052
+ score4_description,
1053
+ score4_previous,
1054
+ score5_description,
1055
+ score5_previous
1056
+ ],
1057
+ outputs=compatible_edit_buttons_row
1058
+ )
1059
+
1060
+ if __name__ == "__main__":
1061
+ demo.launch()