add-Flow-Judge-v0.1

#8
by bergr7f - opened
Files changed (5)
  1. app.py +71 -42
  2. data/models.jsonl +25 -21
  3. gen_api_answer.py +167 -29
  4. leaderboard.py +4 -2
  5. prompts.py +120 -3
app.py CHANGED
@@ -13,7 +13,9 @@ import gradio as gr
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
- prometheus_parse_model_response
17
  )
18
 
19
  from random_sample_generation import (
@@ -113,40 +115,6 @@ def get_final_prompt(eval_prompt, variable_values):
113
  return eval_prompt
114
 
115
 
116
- def submit_prompt(eval_prompt, *variable_values):
117
- try:
118
- variables = parse_variables(eval_prompt)
119
- variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
120
- final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
121
-
122
- models = list(model_data.keys())
123
- model1, model2 = random.sample(models, 2)
124
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
125
-
126
- response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
127
- response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
128
-
129
- return (
130
- response_a,
131
- response_b,
132
- gr.update(visible=True),
133
- gr.update(visible=True),
134
- model_a,
135
- model_b,
136
- final_prompt,
137
- )
138
- except Exception as e:
139
- print(f"Error in submit_prompt: {str(e)}")
140
- return (
141
- "Error generating response",
142
- "Error generating response",
143
- gr.update(visible=False),
144
- gr.update(visible=False),
145
- None,
146
- None,
147
- None,
148
- )
149
-
150
 
151
  def get_ip(request: gr.Request) -> str:
152
  """Get and hash the IP address from the request."""
@@ -492,7 +460,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
492
  show_preliminary = gr.Checkbox(
493
  label="Reveal preliminary results",
494
  value=True, # Checked by default
495
- info="Show all models, including models with less human ratings (< 500 votes)",
496
  interactive=True
497
  )
498
  stats_display = gr.Markdown()
@@ -714,6 +682,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
714
  score3_description,
715
  score4_description,
716
  score5_description,
 
717
  ):
718
  # Build prompt data dictionary
719
  prompt_data = {
@@ -728,9 +697,40 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
728
  'score5_desc': score5_description,
729
  }
730
 
731
- models = list(model_data.keys())
732
- model1, model2 = random.sample(models, 2)
733
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
734
 
735
  # Get responses from models
736
  response_a = get_model_response(
@@ -746,13 +746,24 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
746
  use_reference=use_reference
747
  )
748
 
749
- # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
750
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
751
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
752
 
753
  if is_prometheus_a:
754
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
755
  score_a_val = f"{score_a_val} / 5"
756
  else:
757
  score_a_val, critique_a_val = parse_model_response(response_a)
758
  score_a_val = f"{score_a_val} / 5"
@@ -760,6 +771,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
760
  if is_prometheus_b:
761
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
762
  score_b_val = f"{score_b_val} / 5"
763
  else:
764
  score_b_val, critique_b_val = parse_model_response(response_b)
765
  score_b_val = f"{score_b_val} / 5"
@@ -781,9 +798,21 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
781
  gr.update(value="🎲"), # random_btn
782
  )
783
 
784
- # Update the click handler to use the editable prompt
785
  send_btn.click(
786
- fn=submit_and_store,
787
  inputs=[
788
  use_reference_toggle,
789
  eval_criteria_text,
 
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
+ prometheus_parse_model_response,
17
+ atla_parse_model_response,
18
+ flow_judge_parse_model_response,
19
  )
20
 
21
  from random_sample_generation import (
 
115
  return eval_prompt
116
 
117
 
118
 
119
  def get_ip(request: gr.Request) -> str:
120
  """Get and hash the IP address from the request."""
 
460
  show_preliminary = gr.Checkbox(
461
  label="Reveal preliminary results",
462
  value=True, # Checked by default
463
+ info="Show all models, including models with fewer human ratings (< 300 votes)",
464
  interactive=True
465
  )
466
  stats_display = gr.Markdown()
 
682
  score3_description,
683
  score4_description,
684
  score5_description,
685
+ is_first_game=False
686
  ):
687
  # Build prompt data dictionary
688
  prompt_data = {
 
697
  'score5_desc': score5_description,
698
  }
699
 
700
+ # Get list of active models only for matches
701
+ active_models = [name for name, info in model_data.items()
702
+ if info.get("active", True)] # Default to True for backward compatibility
703
+
704
+ # Modified model selection logic
705
+ atla_model = "Atla-8B-preview-2024-01-08"
706
+
707
+ if is_first_game:
708
+ # For the first game, ensure Atla is one of the models
709
+ other_models = [m for m in active_models if m != atla_model]
710
+ other_model = random.choice(other_models)
711
+
712
+ # Randomly assign Atla to either position A or B
713
+ if random.random() < 0.5:
714
+ model_a, model_b = atla_model, other_model
715
+ else:
716
+ model_a, model_b = other_model, atla_model
717
+ else:
718
+ # For subsequent games, Atla appears 30% of the time
719
+ if random.random() < 0.3:
720
+ # Include Atla in this battle
721
+ other_models = [m for m in active_models if m != atla_model]
722
+ other_model = random.choice(other_models)
723
+
724
+ # Randomly assign Atla to either position A or B
725
+ if random.random() < 0.5:
726
+ model_a, model_b = atla_model, other_model
727
+ else:
728
+ model_a, model_b = other_model, atla_model
729
+ else:
730
+ # Battle between two non-Atla models
731
+ non_atla_models = [m for m in active_models if m != atla_model]
732
+ model1, model2 = random.sample(non_atla_models, 2)
733
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
734
 
735
  # Get responses from models
736
  response_a = get_model_response(
 
746
  use_reference=use_reference
747
  )
748
 
749
+ # Parse the responses based on model, using appropriate parsing for different models
750
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
751
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
752
+ is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
753
+ is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
754
+
755
+ is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
756
+ is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
757
 
758
  if is_prometheus_a:
759
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
760
  score_a_val = f"{score_a_val} / 5"
761
+ elif is_atla_a:
762
+ score_a_val, critique_a_val = atla_parse_model_response(response_a)
763
+ score_a_val = f"{score_a_val} / 5"
764
+ elif is_flow_judge_a:
765
+ score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
766
+ score_a_val = f"{score_a_val} / 5"
767
  else:
768
  score_a_val, critique_a_val = parse_model_response(response_a)
769
  score_a_val = f"{score_a_val} / 5"
 
771
  if is_prometheus_b:
772
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
773
  score_b_val = f"{score_b_val} / 5"
774
+ elif is_atla_b:
775
+ score_b_val, critique_b_val = atla_parse_model_response(response_b)
776
+ score_b_val = f"{score_b_val} / 5"
777
+ elif is_flow_judge_b:
778
+ score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
779
+ score_b_val = f"{score_b_val} / 5"
780
  else:
781
  score_b_val, critique_b_val = parse_model_response(response_b)
782
  score_b_val = f"{score_b_val} / 5"
 
798
  gr.update(value="🎲"), # random_btn
799
  )
800
 
801
+ # Update the click handler to use False for is_first_game after first submission
802
+ def create_submit_handler():
803
+ first_game = True
804
+
805
+ def handler(*args):
806
+ nonlocal first_game
807
+ result = submit_and_store(*args, first_game)
808
+ first_game = False # Set to False after first submission
809
+ return result
810
+
811
+ return handler
812
+
813
+ # Update the send_btn click handler
814
  send_btn.click(
815
+ fn=create_submit_handler(),
816
  inputs=[
817
  use_reference_toggle,
818
  eval_criteria_text,
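
For reference, a minimal standalone sketch of the model-pairing logic that `submit_and_store` now inlines, assuming `model_data` maps model names to dicts that may carry an `"active"` flag; the helper name `pick_battle_pair` is illustrative only and not part of this PR:

```python
import random

ATLA_MODEL = "Atla-8B-preview-2024-01-08"

def pick_battle_pair(model_data, is_first_game):
    """Sketch of the pairing rules above: Atla is forced into the first game
    and appears ~30% of the time afterwards; inactive models never play."""
    active = [name for name, info in model_data.items() if info.get("active", True)]
    non_atla = [m for m in active if m != ATLA_MODEL]

    if is_first_game or random.random() < 0.3:
        pair = (ATLA_MODEL, random.choice(non_atla))
    else:
        pair = tuple(random.sample(non_atla, 2))

    # Randomize which model is shown as A and which as B.
    return pair if random.random() < 0.5 else pair[::-1]
```

The `create_submit_handler` closure then flips `is_first_game` to `False` after the first submission, so only the first battle is guaranteed to include the Atla model.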
data/models.jsonl CHANGED
@@ -1,21 +1,25 @@
1
- {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
- {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
- {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
- {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
- {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
- {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
- {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
- {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
- {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
- {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
- {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
- {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest"}
13
- {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
- {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
- {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
- {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
- {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest"}
18
- {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
19
- {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2"}
20
- {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r"}
21
- {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus"}
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "active": false}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", "active": true}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it", "active": true}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it", "active": true}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct", "active": true}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3", "active": true}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "active": true}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "active": true}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "active": true}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "active": true}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "active": true}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "active": true}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "active": true}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "active": true}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "active": true}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1", "active": true}
17
+ {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "active": true}
18
+ {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "active": true}
19
+ {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2", "active": true}
20
+ {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r", "active": true}
21
+ {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
22
+ {"name": "Atla-8B-preview-2024-01-08", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview-2024-01-08", "active": true}
23
+ {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
24
+ {"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
25
+ {"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2"}
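
A small sketch of how the new per-model `"active"` flag can be consumed when loading `data/models.jsonl`; the loader below is hypothetical (the app filters `model_data` with a comprehension instead), and entries without the flag, such as the Flow-Judge-v0.1 line, are treated as active for backward compatibility:

```python
import json

def load_active_models(path="data/models.jsonl"):
    """Read the JSONL registry and keep only models eligible for battles."""
    models = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            # A missing "active" key defaults to True, matching app.py.
            if entry.get("active", True):
                models[entry["name"]] = entry
    return models
```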
gen_api_answer.py CHANGED
@@ -10,6 +10,9 @@ from prompts import (
10
  JUDGE_SYSTEM_PROMPT,
11
  PROMETHEUS_PROMPT,
12
  PROMETHEUS_PROMPT_WITH_REFERENCE,
13
  )
14
 
15
  # Initialize clients
@@ -18,10 +21,10 @@ openai_client = OpenAI()
18
  together_client = Together()
19
  hf_api_key = os.getenv("HF_API_KEY")
20
  cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
21
- huggingface_client = OpenAI(
22
- base_url="https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud/v1/",
23
- api_key=hf_api_key
24
- )
25
 
26
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
27
  """Get response from OpenAI API"""
@@ -70,7 +73,7 @@ def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT,
70
  except Exception as e:
71
  return f"Error with Together model {model_name}: {str(e)}"
72
 
73
- def get_hf_response(model_name, prompt, max_tokens=500):
74
  """Get response from Hugging Face model"""
75
  try:
76
  headers = {
@@ -83,7 +86,8 @@ def get_hf_response(model_name, prompt, max_tokens=500):
83
  "inputs": prompt,
84
  "parameters": {
85
  "max_new_tokens": max_tokens,
86
- "return_full_text": False
 
87
  }
88
  }
89
 
@@ -96,6 +100,34 @@ def get_hf_response(model_name, prompt, max_tokens=500):
96
  except Exception as e:
97
  return f"Error with Hugging Face model {model_name}: {str(e)}"
98
 
99
  def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
100
  """Get response from Cohere API"""
101
  try:
@@ -116,6 +148,30 @@ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
116
  return str(content_items) # Fallback if it's not a list
117
  except Exception as e:
118
  return f"Error with Cohere model {model_name}: {str(e)}"
119
 
120
  def get_model_response(
121
  model_name,
@@ -132,38 +188,67 @@ def get_model_response(
132
  api_model = model_info["api_model"]
133
  organization = model_info["organization"]
134
 
135
- # Determine if model is Prometheus
136
  is_prometheus = (organization == "Prometheus")
137
-
138
- # For non-Prometheus models, use the Judge system prompt
139
- system_prompt = None if is_prometheus else JUDGE_SYSTEM_PROMPT
 
140
 
141
  # Select the appropriate base prompt
142
- if use_reference:
143
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
 
 
144
  else:
145
- base_prompt = PROMETHEUS_PROMPT
146
 
147
- # For non-Prometheus models, replace the specific instruction
148
- if not is_prometheus:
149
  base_prompt = base_prompt.replace(
150
  '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
151
  '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
152
  )
153
 
154
  try:
155
- # Format the prompt with the provided data, only using available keys
156
- final_prompt = base_prompt.format(
157
- human_input=prompt_data['human_input'],
158
- ai_response=prompt_data['ai_response'],
159
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
160
- eval_criteria=prompt_data['eval_criteria'],
161
- score1_desc=prompt_data['score1_desc'],
162
- score2_desc=prompt_data['score2_desc'],
163
- score3_desc=prompt_data['score3_desc'],
164
- score4_desc=prompt_data['score4_desc'],
165
- score5_desc=prompt_data['score5_desc']
166
- )
 
167
  except KeyError as e:
168
  return f"Error formatting prompt: Missing required field {str(e)}"
169
 
@@ -177,13 +262,21 @@ def get_model_response(
177
  api_model, final_prompt, system_prompt, max_tokens, temperature
178
  )
179
  elif organization == "Prometheus":
180
- return get_hf_response(
181
- api_model, final_prompt, max_tokens
182
  )
183
  elif organization == "Cohere":
184
  return get_cohere_response(
185
  api_model, final_prompt, system_prompt, max_tokens, temperature
186
  )
187
  else:
188
  # All other organizations use Together API
189
  return get_together_response(
@@ -269,4 +362,49 @@ def prometheus_parse_model_response(output):
269
 
270
  except Exception as e:
271
  print(f"Failed to parse response: {str(e)}")
272
  return "Error", f"Exception during parsing: {str(e)}"
 
10
  JUDGE_SYSTEM_PROMPT,
11
  PROMETHEUS_PROMPT,
12
  PROMETHEUS_PROMPT_WITH_REFERENCE,
13
+ ATLA_PROMPT,
14
+ ATLA_PROMPT_WITH_REFERENCE,
15
+ FLOW_JUDGE_PROMPT
16
  )
17
 
18
  # Initialize clients
 
21
  together_client = Together()
22
  hf_api_key = os.getenv("HF_API_KEY")
23
  cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
24
+
25
+
26
+ flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
27
+
28
 
29
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
30
  """Get response from OpenAI API"""
 
73
  except Exception as e:
74
  return f"Error with Together model {model_name}: {str(e)}"
75
 
76
+ def get_prometheus_response(model_name, prompt, max_tokens=500, temperature=0.01): # temperature needs to be > 0 for hf to work
77
  """Get response from Hugging Face model"""
78
  try:
79
  headers = {
 
86
  "inputs": prompt,
87
  "parameters": {
88
  "max_new_tokens": max_tokens,
89
+ "return_full_text": False,
90
+ "temperature": temperature
91
  }
92
  }
93
 
 
100
  except Exception as e:
101
  return f"Error with Hugging Face model {model_name}: {str(e)}"
102
 
103
+ def get_atla_response(model_name, prompt, max_tokens=500, temperature=0.01):
104
+ """Get response from HF endpoint for Atla model"""
105
+ try:
106
+ headers = {
107
+ "Accept": "application/json",
108
+ "Authorization": f"Bearer {hf_api_key}",
109
+ "Content-Type": "application/json"
110
+ }
111
+
112
+ payload = {
113
+ "inputs": prompt,
114
+ "parameters": {
115
+ "max_new_tokens": max_tokens,
116
+ "return_full_text": False,
117
+ "temperature": temperature,
118
+ "seed": 42
119
+ }
120
+ }
121
+
122
+ response = requests.post(
123
+ "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
124
+ headers=headers,
125
+ json=payload
126
+ )
127
+ return response.json()[0]["generated_text"]
128
+ except Exception as e:
129
+ return f"Error with Atla model {model_name}: {str(e)}"
130
+
131
  def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
132
  """Get response from Cohere API"""
133
  try:
 
148
  return str(content_items) # Fallback if it's not a list
149
  except Exception as e:
150
  return f"Error with Cohere model {model_name}: {str(e)}"
151
+
152
+ def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
153
+ """Get response from Flow Judge"""
154
+ try:
155
+ response = requests.post(
156
+ "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
157
+ headers={
158
+ "Content-Type": "application/json",
159
+ "Authorization": f"Bearer {flow_judge_api_key}"
160
+ },
161
+ json={
162
+ "model": model_name,
163
+ "messages": [
164
+ {"role": "user", "content": prompt}
165
+ ],
166
+ "max_tokens": max_tokens,
167
+ "temperature": temperature,
168
+ "top_p": top_p
169
+ }
170
+ )
171
+ response.raise_for_status()
172
+ return response.json()["choices"][0]['message']['content']
173
+ except Exception as e:
174
+ return f"Error with Flow Judge completions model {model_name}: {str(e)}"
175
 
176
  def get_model_response(
177
  model_name,
 
188
  api_model = model_info["api_model"]
189
  organization = model_info["organization"]
190
 
191
+ # Determine if model is Prometheus or Atla
192
  is_prometheus = (organization == "Prometheus")
193
+ is_atla = (organization == "Atla")
194
+ is_flow_judge = (organization == "Flow AI")
195
+ # For non-Prometheus/Atla models, use the Judge system prompt
196
+ system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
197
 
198
  # Select the appropriate base prompt
199
+ if is_atla:
200
+ base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
201
+ elif is_flow_judge:
202
+ base_prompt = FLOW_JUDGE_PROMPT
203
  else:
204
+ base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
205
 
206
+ # For non-Prometheus/non-Atla models, replace the specific instruction
207
+ if not (is_prometheus or is_atla or is_flow_judge):
208
  base_prompt = base_prompt.replace(
209
  '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
210
  '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
211
  )
212
 
213
  try:
214
+ if not is_flow_judge:
215
+ # Format the prompt with the provided data, only using available keys
216
+ final_prompt = base_prompt.format(
217
+ human_input=prompt_data['human_input'],
218
+ ai_response=prompt_data['ai_response'],
219
+ ground_truth_input=prompt_data.get('ground_truth_input', ''),
220
+ eval_criteria=prompt_data['eval_criteria'],
221
+ score1_desc=prompt_data['score1_desc'],
222
+ score2_desc=prompt_data['score2_desc'],
223
+ score3_desc=prompt_data['score3_desc'],
224
+ score4_desc=prompt_data['score4_desc'],
225
+ score5_desc=prompt_data['score5_desc']
226
+ )
227
+ else:
228
+ human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
229
+ ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
230
+ ground_truth = prompt_data.get('ground_truth_input', '')
231
+ if ground_truth:
232
+ response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
233
+ else:
234
+ response_reference = ""
235
+ eval_criteria = prompt_data['eval_criteria']
236
+ score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
237
+ score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
238
+ score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
239
+ score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
240
+ score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
241
+ rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
242
+ if response_reference:
243
+ inputs = human_input + "\n" + response_reference
244
+ else:
245
+ inputs = human_input
246
+ final_prompt = base_prompt.format(
247
+ INPUTS=inputs,
248
+ OUTPUT=ai_response,
249
+ EVALUATION_CRITERIA=eval_criteria,
250
+ RUBRIC=rubric
251
+ )
252
  except KeyError as e:
253
  return f"Error formatting prompt: Missing required field {str(e)}"
254
 
 
262
  api_model, final_prompt, system_prompt, max_tokens, temperature
263
  )
264
  elif organization == "Prometheus":
265
+ return get_prometheus_response(
266
+ api_model, final_prompt, max_tokens, temperature=0.01
267
+ )
268
+ elif organization == "Atla":
269
+ return get_atla_response(
270
+ api_model, final_prompt, max_tokens, temperature=0.01
271
  )
272
  elif organization == "Cohere":
273
  return get_cohere_response(
274
  api_model, final_prompt, system_prompt, max_tokens, temperature
275
  )
276
+ elif organization == "Flow AI":
277
+ return get_flow_judge_response(
278
+ api_model, final_prompt, max_tokens, temperature
279
+ )
280
  else:
281
  # All other organizations use Together API
282
  return get_together_response(
 
362
 
363
  except Exception as e:
364
  print(f"Failed to parse response: {str(e)}")
365
+ return "Error", f"Exception during parsing: {str(e)}"
366
+
367
+ def flow_judge_parse_model_response(output):
368
+ try:
369
+ print(f"Raw model response: {output}")
370
+ # Convert multiple line breaks to single ones and strip whitespace
371
+ output = re.sub(r'\n{2,}', '\n', output.strip())
372
+
373
+ # Compile regex patterns
374
+ feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
375
+ score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
376
+
377
+ feedback_match = feedback_pattern.search(output)
378
+ score_match = score_pattern.search(output)
379
+
380
+ if feedback_match and score_match:
381
+ feedback = feedback_match.group(1).strip()
382
+ score = int(score_match.group(1).strip())
383
+ return str(score), feedback
384
+
385
+ return "Error", f"Failed to parse response: {output}"
386
+
387
+ except Exception as e:
388
+ print(f"Failed to parse response: {str(e)}")
389
+ return "Error", f"Exception during parsing: {str(e)}"
390
+
391
+ def atla_parse_model_response(output):
392
+ """Parse response from ATLA model"""
393
+ try:
394
+ print(f"Raw Atla model response: {output}")
395
+ output = output.strip()
396
+
397
+ # Look for the Reasoning and Result sections
398
+ reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
399
+ result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
400
+
401
+ if reasoning_match and result_match:
402
+ feedback = reasoning_match.group(1).strip()
403
+ score = result_match.group(1)
404
+ return str(score), feedback
405
+
406
+ return "Error", f"Failed to parse ATLA response format: {output}"
407
+
408
+ except Exception as e:
409
+ print(f"Failed to parse ATLA response: {str(e)}")
410
  return "Error", f"Exception during parsing: {str(e)}"
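
To illustrate the two new parsers, here is a hedged example of the output shapes they expect; the sample strings are invented for demonstration, only the tag and markdown formats come from the prompts added in this PR:

```python
from gen_api_answer import atla_parse_model_response, flow_judge_parse_model_response

# Flow Judge wraps its verdict in XML-style tags.
flow_output = """<feedback>
The response answers the question but misses one edge case.
</feedback>
<score>4</score>"""
score, feedback = flow_judge_parse_model_response(flow_output)
# score == "4", feedback == "The response answers the question but misses one edge case."

# Atla replies in the **Reasoning:** / **Result:** markdown format.
atla_output = "**Reasoning:** Clear and mostly accurate.\n\n**Result:** 4"
score, feedback = atla_parse_model_response(atla_output)
# score == "4", feedback == "Clear and mostly accurate."
```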
leaderboard.py CHANGED
@@ -55,8 +55,8 @@ def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
55
  leaderboard = []
56
  for model in model_data.keys():
57
  votes = matches[model]
58
- # Skip models with < 500 votes if show_preliminary is False
59
- if not show_preliminary and votes < 500:
60
  continue
61
 
62
  elo = ratings[model]
@@ -81,6 +81,8 @@ def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
81
  now = datetime.now(timezone.utc)
82
  total_votes = len(voting_data)
83
  total_models = len(model_data)
84
  last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
85
  "%B %d, %Y at %H:00 UTC"
86
  )
 
55
  leaderboard = []
56
  for model in model_data.keys():
57
  votes = matches[model]
58
+ # Skip models with < 300 votes if show_preliminary is False
59
+ if not show_preliminary and votes < 300:
60
  continue
61
 
62
  elo = ratings[model]
 
81
  now = datetime.now(timezone.utc)
82
  total_votes = len(voting_data)
83
  total_models = len(model_data)
84
+ # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")
85
+
86
  last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
87
  "%B %d, %Y at %H:00 UTC"
88
  )
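
A brief sketch of the updated preliminary-results filter, with the threshold lowered from 500 to 300 votes; `visible_models` is an illustrative helper, the leaderboard itself applies the check inline while building rows:

```python
def visible_models(matches, show_preliminary=True, min_votes=300):
    """Return the model names to display, hiding low-vote models
    unless preliminary results are requested."""
    return [
        model for model, votes in matches.items()
        if show_preliminary or votes >= min_votes
    ]
```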
prompts.py CHANGED
@@ -33,9 +33,9 @@ Score 5: The model consistently offers highly relevant and useful responses that
33
 
34
  # Fixed suffix that will always be appended
35
  FIXED_EVAL_SUFFIX = """
36
- [User Query]: {{input}}
37
 
38
- [AI Response]: {{response}}"""
39
 
40
  # Define the Prometheus prompt used by default (without reference)
41
  PROMETHEUS_PROMPT = """###Task Description:
@@ -90,5 +90,122 @@ Score 5: {score5_desc}
90
  ###Feedback:
91
  """
92
 
93
  # Judge system prompt for non-Prometheus models
94
- JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
 
33
 
34
  # Fixed suffix that will always be appended
35
  FIXED_EVAL_SUFFIX = """
36
+ [User Query]: {{human_input}}
37
 
38
+ [AI Response]: {{ai_response}}"""
39
 
40
  # Define the Prometheus prompt used by default (without reference)
41
  PROMETHEUS_PROMPT = """###Task Description:
 
90
  ###Feedback:
91
  """
92
 
93
+ # Define the Flow Judge prompt
94
+ FLOW_JUDGE_PROMPT = """# GOAL
95
+ Your job is to evaluate a task carried out by an AI system powered by a large \
96
+ language model.
97
+
98
+ You will be provided with the inputs and output of the task, as well as the evaluation criteria \
99
+ and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
100
+ criteria and scoring rubric provided.
101
+
102
+ # INPUT
103
+ Below are the inputs required for performing the task:
104
+ <inputs>
105
+ {INPUTS}
106
+ </inputs>
107
+
108
+ # OUTPUT
109
+ Below is the output of the task:
110
+ <output>
111
+ {OUTPUT}
112
+ </output>
113
+
114
+ # EVALUATION CRITERIA AND SCORING RUBRIC
115
+ Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
116
+ <evaluation_criteria>
117
+ {EVALUATION_CRITERIA}
118
+ </evaluation_criteria>
119
+
120
+ <scoring_rubric>
121
+ {RUBRIC}
122
+ </scoring_rubric>
123
+
124
+ # INSTRUCTIONS FOR THE EVALUATION
125
+ 1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
126
+ Review the evaluation criteria and scoring rubric to understand the different levels of \
127
+ performance and the descriptions for each score.
128
+ 2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
129
+ generated from completing the task.
130
+ 3. Compare output to score descriptions: Compare the output against the criteria and score \
131
+ descriptions in the scoring rubric. For each criterion, decide which description best matches the \
132
+ output.
133
+ 4. After comparing the output to the score descriptions, pay attention to the small details that \
134
+ might impact the final score that you assign. Sometimes a small difference can dictate the final \
135
+ score.
136
+ 5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
137
+ to specific aspects of the output and comparing them to the rubric.
138
+ 6. Assign a final score based on the scoring rubric.
139
+
140
+ ## FORMAT FOR THE EVALUATION
141
+ - Write the verbal feedback inside <feedback> tags without any additional surrounding text.
142
+ - Write the numeric score inside <score> tags, without any additional surrounding text and always \
143
+ after the feedback.
144
+
145
+ Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
146
+
147
  # Judge system prompt for non-Prometheus models
148
+ JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
149
+
150
+ ATLA_PROMPT = """<|begin_of_text|><|start_header_id|>user<|end_header_id|> You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
151
+ Here are some rules of the evaluation:
152
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
153
+
154
+ Your reply should strictly follow this format:
155
+ **Reasoning:** <Your feedback>
156
+
157
+ **Result:** <an integer between 1 and 5>
158
+
159
+ Here is the data:
160
+
161
+ Instruction:
162
+ ```
163
+ {human_input}
164
+ ```
165
+
166
+ Response:
167
+ ```
168
+ {ai_response}
169
+ ```
170
+
171
+ Score Rubrics:
172
+ [{eval_criteria}]
173
+ Score 1: {score1_desc}
174
+ Score 2: {score2_desc}
175
+ Score 3: {score3_desc}
176
+ Score 4: {score4_desc}
177
+ Score 5: {score5_desc}
178
+ <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
179
+
180
+ ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
181
+
182
+ Here are some rules of the evaluation:
183
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
184
+
185
+ Your reply should strictly follow this format:
186
+ **Reasoning:** <Your feedback>
187
+
188
+ **Result:** <an integer between 1 and 5>
189
+
190
+ Here is the data:
191
+
192
+ Instruction:
193
+ ```
194
+ {human_input}
195
+ ```
196
+
197
+ Response:
198
+ ```
199
+ {ai_response}
200
+ ```
201
+
202
+ Score Rubrics:
203
+ [{eval_criteria}]
204
+ Score 1: {score1_desc}
205
+ Score 2: {score2_desc}
206
+ Score 3: {score3_desc}
207
+ Score 4: {score4_desc}
208
+ Score 5: {score5_desc}
209
+
210
+ Reference answer:
211
+ {ground_truth_input}"""
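
For context, a hedged example of how the Flow AI branch of `get_model_response` fills in `FLOW_JUDGE_PROMPT`: the user input and response are wrapped in XML-style tags and the five score descriptions are collapsed into a single rubric string. The literal criteria and descriptions below are made up for illustration:

```python
from prompts import FLOW_JUDGE_PROMPT

rubric = "\n".join(
    f"- Score {i}: {desc}"
    for i, desc in enumerate(
        ["Very poor", "Poor", "Adequate", "Good", "Excellent"], start=1
    )
)

final_prompt = FLOW_JUDGE_PROMPT.format(
    INPUTS="<user_input>\nSummarize the article in two sentences.\n</user_input>",
    OUTPUT="<response>\nThe article argues that ...\n</response>",
    EVALUATION_CRITERIA="Does the summary capture the key points?",
    RUBRIC=rubric,
)
print(final_prompt)
```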