add-Flow-Judge-v0.1

#8
by bergr7f - opened
Files changed (5)
  1. app.py +71 -42
  2. data/models.jsonl +25 -21
  3. gen_api_answer.py +167 -29
  4. leaderboard.py +4 -2
  5. prompts.py +120 -3
app.py CHANGED
@@ -13,7 +13,9 @@ import gradio as gr
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
- prometheus_parse_model_response
17
  )
18
 
19
  from random_sample_generation import (
@@ -113,40 +115,6 @@ def get_final_prompt(eval_prompt, variable_values):
113
  return eval_prompt
114
 
115
 
116
- def submit_prompt(eval_prompt, *variable_values):
117
- try:
118
- variables = parse_variables(eval_prompt)
119
- variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
120
- final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
121
-
122
- models = list(model_data.keys())
123
- model1, model2 = random.sample(models, 2)
124
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
125
-
126
- response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
127
- response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
128
-
129
- return (
130
- response_a,
131
- response_b,
132
- gr.update(visible=True),
133
- gr.update(visible=True),
134
- model_a,
135
- model_b,
136
- final_prompt,
137
- )
138
- except Exception as e:
139
- print(f"Error in submit_prompt: {str(e)}")
140
- return (
141
- "Error generating response",
142
- "Error generating response",
143
- gr.update(visible=False),
144
- gr.update(visible=False),
145
- None,
146
- None,
147
- None,
148
- )
149
-
150
 
151
  def get_ip(request: gr.Request) -> str:
152
  """Get and hash the IP address from the request."""
@@ -492,7 +460,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
492
  show_preliminary = gr.Checkbox(
493
  label="Reveal preliminary results",
494
  value=True, # Checked by default
495
- info="Show all models, including models with less human ratings (< 500 votes)",
496
  interactive=True
497
  )
498
  stats_display = gr.Markdown()
@@ -714,6 +682,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
714
  score3_description,
715
  score4_description,
716
  score5_description,
 
717
  ):
718
  # Build prompt data dictionary
719
  prompt_data = {
@@ -728,9 +697,40 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
728
  'score5_desc': score5_description,
729
  }
730
 
731
- models = list(model_data.keys())
732
- model1, model2 = random.sample(models, 2)
733
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
734
 
735
  # Get responses from models
736
  response_a = get_model_response(
@@ -746,13 +746,24 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
746
  use_reference=use_reference
747
  )
748
 
749
- # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
750
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
751
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
752
 
753
  if is_prometheus_a:
754
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
755
  score_a_val = f"{score_a_val} / 5"
756
  else:
757
  score_a_val, critique_a_val = parse_model_response(response_a)
758
  score_a_val = f"{score_a_val} / 5"
@@ -760,6 +771,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
760
  if is_prometheus_b:
761
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
762
  score_b_val = f"{score_b_val} / 5"
763
  else:
764
  score_b_val, critique_b_val = parse_model_response(response_b)
765
  score_b_val = f"{score_b_val} / 5"
@@ -781,9 +798,21 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
781
  gr.update(value="🎲"), # random_btn
782
  )
783
 
784
- # Update the click handler to use the editable prompt
785
  send_btn.click(
786
- fn=submit_and_store,
787
  inputs=[
788
  use_reference_toggle,
789
  eval_criteria_text,
 
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
+ prometheus_parse_model_response,
17
+ atla_parse_model_response,
18
+ flow_judge_parse_model_response,
19
  )
20
 
21
  from random_sample_generation import (
 
115
  return eval_prompt
116
 
117
 
118
 
119
  def get_ip(request: gr.Request) -> str:
120
  """Get and hash the IP address from the request."""
 
460
  show_preliminary = gr.Checkbox(
461
  label="Reveal preliminary results",
462
  value=True, # Checked by default
463
+ info="Show all models, including models with fewer human ratings (< 300 votes)",
464
  interactive=True
465
  )
466
  stats_display = gr.Markdown()
 
682
  score3_description,
683
  score4_description,
684
  score5_description,
685
+ is_first_game=False
686
  ):
687
  # Build prompt data dictionary
688
  prompt_data = {
 
697
  'score5_desc': score5_description,
698
  }
699
 
700
+ # Get list of active models only for matches
701
+ active_models = [name for name, info in model_data.items()
702
+ if info.get("active", True)] # Default to True for backward compatibility
703
+
704
+ # Modified model selection logic
705
+ atla_model = "Atla-8B-preview-2024-01-08"
706
+
707
+ if is_first_game:
708
+ # For the first game, ensure Atla is one of the models
709
+ other_models = [m for m in active_models if m != atla_model]
710
+ other_model = random.choice(other_models)
711
+
712
+ # Randomly assign Atla to either position A or B
713
+ if random.random() < 0.5:
714
+ model_a, model_b = atla_model, other_model
715
+ else:
716
+ model_a, model_b = other_model, atla_model
717
+ else:
718
+ # For subsequent games, Atla appears 30% of the time
719
+ if random.random() < 0.3:
720
+ # Include Atla in this battle
721
+ other_models = [m for m in active_models if m != atla_model]
722
+ other_model = random.choice(other_models)
723
+
724
+ # Randomly assign Atla to either position A or B
725
+ if random.random() < 0.5:
726
+ model_a, model_b = atla_model, other_model
727
+ else:
728
+ model_a, model_b = other_model, atla_model
729
+ else:
730
+ # Battle between two non-Atla models
731
+ non_atla_models = [m for m in active_models if m != atla_model]
732
+ model1, model2 = random.sample(non_atla_models, 2)
733
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
734
 
735
  # Get responses from models
736
  response_a = get_model_response(
 
746
  use_reference=use_reference
747
  )
748
 
749
+ # Parse the responses based on model, using appropriate parsing for different models
750
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
751
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
752
+ is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
753
+ is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
754
+
755
+ is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
756
+ is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
757
 
758
  if is_prometheus_a:
759
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
760
  score_a_val = f"{score_a_val} / 5"
761
+ elif is_atla_a:
762
+ score_a_val, critique_a_val = atla_parse_model_response(response_a)
763
+ score_a_val = f"{score_a_val} / 5"
764
+ elif is_flow_judge_a:
765
+ score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
766
+ score_a_val = f"{score_a_val} / 5"
767
  else:
768
  score_a_val, critique_a_val = parse_model_response(response_a)
769
  score_a_val = f"{score_a_val} / 5"
 
771
  if is_prometheus_b:
772
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
773
  score_b_val = f"{score_b_val} / 5"
774
+ elif is_atla_b:
775
+ score_b_val, critique_b_val = atla_parse_model_response(response_b)
776
+ score_b_val = f"{score_b_val} / 5"
777
+ elif is_flow_judge_b:
778
+ score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
779
+ score_b_val = f"{score_b_val} / 5"
780
  else:
781
  score_b_val, critique_b_val = parse_model_response(response_b)
782
  score_b_val = f"{score_b_val} / 5"
 
798
  gr.update(value="🎲"), # random_btn
799
  )
800
 
801
+ # Update the click handler to use False for is_first_game after first submission
802
+ def create_submit_handler():
803
+ first_game = True
804
+
805
+ def handler(*args):
806
+ nonlocal first_game
807
+ result = submit_and_store(*args, first_game)
808
+ first_game = False # Set to False after first submission
809
+ return result
810
+
811
+ return handler
812
+
813
+ # Update the send_btn click handler
814
  send_btn.click(
815
+ fn=create_submit_handler(),
816
  inputs=[
817
  use_reference_toggle,
818
  eval_criteria_text,
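
For reference, a minimal standalone sketch of the model-pairing logic that `submit_and_store` now inlines, assuming `model_data` maps model names to dicts that may carry an `"active"` flag; the helper name `pick_battle_pair` is illustrative only and not part of this PR:

```python
import random

ATLA_MODEL = "Atla-8B-preview-2024-01-08"

def pick_battle_pair(model_data, is_first_game):
    """Sketch of the pairing rules above: Atla is forced into the first game
    and appears ~30% of the time afterwards; inactive models never play."""
    active = [name for name, info in model_data.items() if info.get("active", True)]
    non_atla = [m for m in active if m != ATLA_MODEL]

    if is_first_game or random.random() < 0.3:
        pair = (ATLA_MODEL, random.choice(non_atla))
    else:
        pair = tuple(random.sample(non_atla, 2))

    # Randomize which model is shown as A and which as B.
    return pair if random.random() < 0.5 else pair[::-1]
```

The `create_submit_handler` closure then flips `is_first_game` to `False` after the first submission, so only the first battle is guaranteed to include the Atla model.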
data/models.jsonl CHANGED
@@ -1,21 +1,25 @@
1
- {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
- {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
- {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
- {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
- {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
- {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
- {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
- {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
- {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
- {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
- {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
- {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest"}
13
- {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
- {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
- {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
- {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
- {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest"}
18
- {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
19
- {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2"}
20
- {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r"}
21
- {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus"}
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "active": false}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", "active": true}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it", "active": true}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it", "active": true}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct", "active": true}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3", "active": true}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "active": true}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "active": true}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "active": true}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "active": true}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "active": true}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "active": true}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "active": true}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "active": true}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "active": true}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1", "active": true}
17
+ {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "active": true}
18
+ {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "active": true}
19
+ {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2", "active": true}
20
+ {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r", "active": true}
21
+ {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
22
+ {"name": "Atla-8B-preview-2024-01-08", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview-2024-01-08", "active": true}
23
+ {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
24
+ {"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
25
+ {"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2"}
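
A small sketch of how the new per-model `"active"` flag can be consumed when loading `data/models.jsonl`; the loader below is hypothetical (the app filters `model_data` with a comprehension instead), and entries without the flag, such as the Flow-Judge-v0.1 line, are treated as active for backward compatibility:

```python
import json

def load_active_models(path="data/models.jsonl"):
    """Read the JSONL registry and keep only models eligible for battles."""
    models = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            # A missing "active" key defaults to True, matching app.py.
            if entry.get("active", True):
                models[entry["name"]] = entry
    return models
```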
gen_api_answer.py CHANGED
@@ -10,6 +10,9 @@ from prompts import (
10
  JUDGE_SYSTEM_PROMPT,
11
  PROMETHEUS_PROMPT,
12
  PROMETHEUS_PROMPT_WITH_REFERENCE,
13
  )
14
 
15
  # Initialize clients
@@ -18,10 +21,10 @@ openai_client = OpenAI()
18
  together_client = Together()
19
  hf_api_key = os.getenv("HF_API_KEY")
20
  cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
21
- huggingface_client = OpenAI(
22
- base_url="https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud/v1/",
23
- api_key=hf_api_key
24
- )
25
 
26
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
27
  """Get response from OpenAI API"""
@@ -70,7 +73,7 @@ def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT,
70
  except Exception as e:
71
  return f"Error with Together model {model_name}: {str(e)}"
72
 
73
- def get_hf_response(model_name, prompt, max_tokens=500):
74
  """Get response from Hugging Face model"""
75
  try:
76
  headers = {
@@ -83,7 +86,8 @@ def get_hf_response(model_name, prompt, max_tokens=500):
83
  "inputs": prompt,
84
  "parameters": {
85
  "max_new_tokens": max_tokens,
86
- "return_full_text": False
 
87
  }
88
  }
89
 
@@ -96,6 +100,34 @@ def get_hf_response(model_name, prompt, max_tokens=500):
96
  except Exception as e:
97
  return f"Error with Hugging Face model {model_name}: {str(e)}"
98
 
99
  def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
100
  """Get response from Cohere API"""
101
  try:
@@ -116,6 +148,30 @@ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
116
  return str(content_items) # Fallback if it's not a list
117
  except Exception as e:
118
  return f"Error with Cohere model {model_name}: {str(e)}"
119
 
120
  def get_model_response(
121
  model_name,
@@ -132,38 +188,67 @@ def get_model_response(
132
  api_model = model_info["api_model"]
133
  organization = model_info["organization"]
134
 
135
- # Determine if model is Prometheus
136
  is_prometheus = (organization == "Prometheus")
137
-
138
- # For non-Prometheus models, use the Judge system prompt
139
- system_prompt = None if is_prometheus else JUDGE_SYSTEM_PROMPT
 
140
 
141
  # Select the appropriate base prompt
142
- if use_reference:
143
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
 
 
144
  else:
145
- base_prompt = PROMETHEUS_PROMPT
146
 
147
- # For non-Prometheus models, replace the specific instruction
148
- if not is_prometheus:
149
  base_prompt = base_prompt.replace(
150
  '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
151
  '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
152
  )
153
 
154
  try:
155
- # Format the prompt with the provided data, only using available keys
156
- final_prompt = base_prompt.format(
157
- human_input=prompt_data['human_input'],
158
- ai_response=prompt_data['ai_response'],
159
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
160
- eval_criteria=prompt_data['eval_criteria'],
161
- score1_desc=prompt_data['score1_desc'],
162
- score2_desc=prompt_data['score2_desc'],
163
- score3_desc=prompt_data['score3_desc'],
164
- score4_desc=prompt_data['score4_desc'],
165
- score5_desc=prompt_data['score5_desc']
166
- )
 
167
  except KeyError as e:
168
  return f"Error formatting prompt: Missing required field {str(e)}"
169
 
@@ -177,13 +262,21 @@ def get_model_response(
177
  api_model, final_prompt, system_prompt, max_tokens, temperature
178
  )
179
  elif organization == "Prometheus":
180
- return get_hf_response(
181
- api_model, final_prompt, max_tokens
182
  )
183
  elif organization == "Cohere":
184
  return get_cohere_response(
185
  api_model, final_prompt, system_prompt, max_tokens, temperature
186
  )
187
  else:
188
  # All other organizations use Together API
189
  return get_together_response(
@@ -269,4 +362,49 @@ def prometheus_parse_model_response(output):
269
 
270
  except Exception as e:
271
  print(f"Failed to parse response: {str(e)}")
272
  return "Error", f"Exception during parsing: {str(e)}"
 
10
  JUDGE_SYSTEM_PROMPT,
11
  PROMETHEUS_PROMPT,
12
  PROMETHEUS_PROMPT_WITH_REFERENCE,
13
+ ATLA_PROMPT,
14
+ ATLA_PROMPT_WITH_REFERENCE,
15
+ FLOW_JUDGE_PROMPT
16
  )
17
 
18
  # Initialize clients
 
21
  together_client = Together()
22
  hf_api_key = os.getenv("HF_API_KEY")
23
  cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
24
+
25
+
26
+ flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
27
+
28
 
29
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
30
  """Get response from OpenAI API"""
 
73
  except Exception as e:
74
  return f"Error with Together model {model_name}: {str(e)}"
75
 
76
+ def get_prometheus_response(model_name, prompt, max_tokens=500, temperature=0.01): # temperature needs to be > 0 for hf to work
77
  """Get response from Hugging Face model"""
78
  try:
79
  headers = {
 
86
  "inputs": prompt,
87
  "parameters": {
88
  "max_new_tokens": max_tokens,
89
+ "return_full_text": False,
90
+ "temperature": temperature
91
  }
92
  }
93
 
 
100
  except Exception as e:
101
  return f"Error with Hugging Face model {model_name}: {str(e)}"
102
 
103
+ def get_atla_response(model_name, prompt, max_tokens=500, temperature=0.01):
104
+ """Get response from HF endpoint for Atla model"""
105
+ try:
106
+ headers = {
107
+ "Accept": "application/json",
108
+ "Authorization": f"Bearer {hf_api_key}",
109
+ "Content-Type": "application/json"
110
+ }
111
+
112
+ payload = {
113
+ "inputs": prompt,
114
+ "parameters": {
115
+ "max_new_tokens": max_tokens,
116
+ "return_full_text": False,
117
+ "temperature": temperature,
118
+ "seed": 42
119
+ }
120
+ }
121
+
122
+ response = requests.post(
123
+ "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
124
+ headers=headers,
125
+ json=payload
126
+ )
127
+ return response.json()[0]["generated_text"]
128
+ except Exception as e:
129
+ return f"Error with Atla model {model_name}: {str(e)}"
130
+
131
  def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
132
  """Get response from Cohere API"""
133
  try:
 
148
  return str(content_items) # Fallback if it's not a list
149
  except Exception as e:
150
  return f"Error with Cohere model {model_name}: {str(e)}"
151
+
152
+ def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
153
+ """Get response from Flow Judge"""
154
+ try:
155
+ response = requests.post(
156
+ "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
157
+ headers={
158
+ "Content-Type": "application/json",
159
+ "Authorization": f"Bearer {flow_judge_api_key}"
160
+ },
161
+ json={
162
+ "model": model_name,
163
+ "messages": [
164
+ {"role": "user", "content": prompt}
165
+ ],
166
+ "max_tokens": max_tokens,
167
+ "temperature": temperature,
168
+ "top_p": top_p
169
+ }
170
+ )
171
+ response.raise_for_status()
172
+ return response.json()["choices"][0]['message']['content']
173
+ except Exception as e:
174
+ return f"Error with Flow Judge completions model {model_name}: {str(e)}"
175
 
176
  def get_model_response(
177
  model_name,
 
188
  api_model = model_info["api_model"]
189
  organization = model_info["organization"]
190
 
191
+ # Determine if model is Prometheus or Atla
192
  is_prometheus = (organization == "Prometheus")
193
+ is_atla = (organization == "Atla")
194
+ is_flow_judge = (organization == "Flow AI")
195
+ # For non-Prometheus/Atla models, use the Judge system prompt
196
+ system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
197
 
198
  # Select the appropriate base prompt
199
+ if is_atla:
200
+ base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
201
+ elif is_flow_judge:
202
+ base_prompt = FLOW_JUDGE_PROMPT
203
  else:
204
+ base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
205
 
206
+ # For non-Prometheus/non-Atla models, replace the specific instruction
207
+ if not (is_prometheus or is_atla or is_flow_judge):
208
  base_prompt = base_prompt.replace(
209
  '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
210
  '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
211
  )
212
 
213
  try:
214
+ if not is_flow_judge:
215
+ # Format the prompt with the provided data, only using available keys
216
+ final_prompt = base_prompt.format(
217
+ human_input=prompt_data['human_input'],
218
+ ai_response=prompt_data['ai_response'],
219
+ ground_truth_input=prompt_data.get('ground_truth_input', ''),
220
+ eval_criteria=prompt_data['eval_criteria'],
221
+ score1_desc=prompt_data['score1_desc'],
222
+ score2_desc=prompt_data['score2_desc'],
223
+ score3_desc=prompt_data['score3_desc'],
224
+ score4_desc=prompt_data['score4_desc'],
225
+ score5_desc=prompt_data['score5_desc']
226
+ )
227
+ else:
228
+ human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
229
+ ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
230
+ ground_truth = prompt_data.get('ground_truth_input', '')
231
+ if ground_truth:
232
+ response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
233
+ else:
234
+ response_reference = ""
235
+ eval_criteria = prompt_data['eval_criteria']
236
+ score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
237
+ score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
238
+ score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
239
+ score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
240
+ score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
241
+ rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
242
+ if response_reference:
243
+ inputs = human_input + "\n" + response_reference
244
+ else:
245
+ inputs = human_input
246
+ final_prompt = base_prompt.format(
247
+ INPUTS=inputs,
248
+ OUTPUT=ai_response,
249
+ EVALUATION_CRITERIA=eval_criteria,
250
+ RUBRIC=rubric
251
+ )
252
  except KeyError as e:
253
  return f"Error formatting prompt: Missing required field {str(e)}"
254
 
 
262
  api_model, final_prompt, system_prompt, max_tokens, temperature
263
  )
264
  elif organization == "Prometheus":
265
+ return get_prometheus_response(
266
+ api_model, final_prompt, max_tokens, temperature=0.01
267
+ )
268
+ elif organization == "Atla":
269
+ return get_atla_response(
270
+ api_model, final_prompt, max_tokens, temperature=0.01
271
  )
272
  elif organization == "Cohere":
273
  return get_cohere_response(
274
  api_model, final_prompt, system_prompt, max_tokens, temperature
275
  )
276
+ elif organization == "Flow AI":
277
+ return get_flow_judge_response(
278
+ api_model, final_prompt, max_tokens, temperature
279
+ )
280
  else:
281
  # All other organizations use Together API
282
  return get_together_response(
 
362
 
363
  except Exception as e:
364
  print(f"Failed to parse response: {str(e)}")
365
+ return "Error", f"Exception during parsing: {str(e)}"
366
+
367
+ def flow_judge_parse_model_response(output):
368
+ try:
369
+ print(f"Raw model response: {output}")
370
+ # Convert multiple line breaks to single ones and strip whitespace
371
+ output = re.sub(r'\n{2,}', '\n', output.strip())
372
+
373
+ # Compile regex patterns
374
+ feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
375
+ score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
376
+
377
+ feedback_match = feedback_pattern.search(output)
378
+ score_match = score_pattern.search(output)
379
+
380
+ if feedback_match and score_match:
381
+ feedback = feedback_match.group(1).strip()
382
+ score = int(score_match.group(1).strip())
383
+ return str(score), feedback
384
+
385
+ return "Error", f"Failed to parse response: {output}"
386
+
387
+ except Exception as e:
388
+ print(f"Failed to parse response: {str(e)}")
389
+ return "Error", f"Exception during parsing: {str(e)}"
390
+
391
+ def atla_parse_model_response(output):
392
+ """Parse response from ATLA model"""
393
+ try:
394
+ print(f"Raw Atla model response: {output}")
395
+ output = output.strip()
396
+
397
+ # Look for the Reasoning and Result sections
398
+ reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
399
+ result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
400
+
401
+ if reasoning_match and result_match:
402
+ feedback = reasoning_match.group(1).strip()
403
+ score = result_match.group(1)
404
+ return str(score), feedback
405
+
406
+ return "Error", f"Failed to parse ATLA response format: {output}"
407
+
408
+ except Exception as e:
409
+ print(f"Failed to parse ATLA response: {str(e)}")
410
  return "Error", f"Exception during parsing: {str(e)}"
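
To illustrate the two new parsers, here is a hedged example of the output shapes they expect; the sample strings are invented for demonstration, only the tag and markdown formats come from the prompts added in this PR:

```python
from gen_api_answer import atla_parse_model_response, flow_judge_parse_model_response

# Flow Judge wraps its verdict in XML-style tags.
flow_output = """<feedback>
The response answers the question but misses one edge case.
</feedback>
<score>4</score>"""
score, feedback = flow_judge_parse_model_response(flow_output)
# score == "4", feedback == "The response answers the question but misses one edge case."

# Atla replies in the **Reasoning:** / **Result:** markdown format.
atla_output = "**Reasoning:** Clear and mostly accurate.\n\n**Result:** 4"
score, feedback = atla_parse_model_response(atla_output)
# score == "4", feedback == "Clear and mostly accurate."
```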
leaderboard.py CHANGED
@@ -55,8 +55,8 @@ def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
55
  leaderboard = []
56
  for model in model_data.keys():
57
  votes = matches[model]
58
- # Skip models with < 500 votes if show_preliminary is False
59
- if not show_preliminary and votes < 500:
60
  continue
61
 
62
  elo = ratings[model]
@@ -81,6 +81,8 @@ def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
81
  now = datetime.now(timezone.utc)
82
  total_votes = len(voting_data)
83
  total_models = len(model_data)
84
  last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
85
  "%B %d, %Y at %H:00 UTC"
86
  )
 
55
  leaderboard = []
56
  for model in model_data.keys():
57
  votes = matches[model]
58
+ # Skip models with < 300 votes if show_preliminary is False
59
+ if not show_preliminary and votes < 300:
60
  continue
61
 
62
  elo = ratings[model]
 
81
  now = datetime.now(timezone.utc)
82
  total_votes = len(voting_data)
83
  total_models = len(model_data)
84
+ # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")
85
+
86
  last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
87
  "%B %d, %Y at %H:00 UTC"
88
  )
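
A brief sketch of the updated preliminary-results filter, with the threshold lowered from 500 to 300 votes; `visible_models` is an illustrative helper, the leaderboard itself applies the check inline while building rows:

```python
def visible_models(matches, show_preliminary=True, min_votes=300):
    """Return the model names to display, hiding low-vote models
    unless preliminary results are requested."""
    return [
        model for model, votes in matches.items()
        if show_preliminary or votes >= min_votes
    ]
```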
prompts.py CHANGED
@@ -33,9 +33,9 @@ Score 5: The model consistently offers highly relevant and useful responses that
33
 
34
  # Fixed suffix that will always be appended
35
  FIXED_EVAL_SUFFIX = """
36
- [User Query]: {{input}}
37
 
38
- [AI Response]: {{response}}"""
39
 
40
  # Define the Prometheus prompt used by default (without reference)
41
  PROMETHEUS_PROMPT = """###Task Description:
@@ -90,5 +90,122 @@ Score 5: {score5_desc}
90
  ###Feedback:
91
  """
92
 
93
  # Judge system prompt for non-Prometheus models
94
- JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
 
33
 
34
  # Fixed suffix that will always be appended
35
  FIXED_EVAL_SUFFIX = """
36
+ [User Query]: {{human_input}}
37
 
38
+ [AI Response]: {{ai_response}}"""
39
 
40
  # Define the Prometheus prompt used by default (without reference)
41
  PROMETHEUS_PROMPT = """###Task Description:
 
90
  ###Feedback:
91
  """
92
 
93
+ # Define the Flow Judge prompt
94
+ FLOW_JUDGE_PROMPT = """# GOAL
95
+ Your job is to evaluate a task carried out by an AI system powered by a large \
96
+ language model.
97
+
98
+ You will be provided with the inputs and output of the task, as well as the evaluation criteria \
99
+ and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
100
+ criteria and scoring rubric provided.
101
+
102
+ # INPUT
103
+ Below are the inputs required for performing the task:
104
+ <inputs>
105
+ {INPUTS}
106
+ </inputs>
107
+
108
+ # OUTPUT
109
+ Below is the output of the task:
110
+ <output>
111
+ {OUTPUT}
112
+ </output>
113
+
114
+ # EVALUATION CRITERIA AND SCORING RUBRIC
115
+ Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
116
+ <evaluation_criteria>
117
+ {EVALUATION_CRITERIA}
118
+ </evaluation_criteria>
119
+
120
+ <scoring_rubric>
121
+ {RUBRIC}
122
+ </scoring_rubric>
123
+
124
+ # INSTRUCTIONS FOR THE EVALUATION
125
+ 1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
126
+ Review the evaluation criteria and scoring rubric to understand the different levels of \
127
+ performance and the descriptions for each score.
128
+ 2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
129
+ generated from completing the task.
130
+ 3. Compare output to score descriptions: Compare the output against the criteria and score \
131
+ descriptions in the scoring rubric. For each criterion, decide which description best matches the \
132
+ output.
133
+ 4. After comparing the output to the score descriptions, pay attention to the small details that \
134
+ might impact the final score that you assign. Sometimes a small difference can dictate the final \
135
+ score.
136
+ 5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
137
+ to specific aspects of the output and comparing them to the rubric.
138
+ 6. Assign a final score based on the scoring rubric.
139
+
140
+ ## FORMAT FOR THE EVALUATION
141
+ - Write the verbal feedback inside <feedback> tags without any additional surrounding text.
142
+ - Write the numeric score inside <score> tags, without any additional surrounding text and always \
143
+ after the feedback.
144
+
145
+ Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
146
+
147
  # Judge system prompt for non-Prometheus models
148
+ JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
149
+
150
+ ATLA_PROMPT = """<|begin_of_text|><|start_header_id|>user<|end_header_id|> You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
151
+ Here are some rules of the evaluation:
152
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
153
+
154
+ Your reply should strictly follow this format:
155
+ **Reasoning:** <Your feedback>
156
+
157
+ **Result:** <an integer between 1 and 5>
158
+
159
+ Here is the data:
160
+
161
+ Instruction:
162
+ ```
163
+ {human_input}
164
+ ```
165
+
166
+ Response:
167
+ ```
168
+ {ai_response}
169
+ ```
170
+
171
+ Score Rubrics:
172
+ [{eval_criteria}]
173
+ Score 1: {score1_desc}
174
+ Score 2: {score2_desc}
175
+ Score 3: {score3_desc}
176
+ Score 4: {score4_desc}
177
+ Score 5: {score5_desc}
178
+ <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
179
+
180
+ ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
181
+
182
+ Here are some rules of the evaluation:
183
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
184
+
185
+ Your reply should strictly follow this format:
186
+ **Reasoning:** <Your feedback>
187
+
188
+ **Result:** <an integer between 1 and 5>
189
+
190
+ Here is the data:
191
+
192
+ Instruction:
193
+ ```
194
+ {human_input}
195
+ ```
196
+
197
+ Response:
198
+ ```
199
+ {ai_response}
200
+ ```
201
+
202
+ Score Rubrics:
203
+ [{eval_criteria}]
204
+ Score 1: {score1_desc}
205
+ Score 2: {score2_desc}
206
+ Score 3: {score3_desc}
207
+ Score 4: {score4_desc}
208
+ Score 5: {score5_desc}
209
+
210
+ Reference answer:
211
+ {ground_truth_input}"""
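
For context, a hedged example of how the Flow AI branch of `get_model_response` fills in `FLOW_JUDGE_PROMPT`: the user input and response are wrapped in XML-style tags and the five score descriptions are collapsed into a single rubric string. The literal criteria and descriptions below are made up for illustration:

```python
from prompts import FLOW_JUDGE_PROMPT

rubric = "\n".join(
    f"- Score {i}: {desc}"
    for i, desc in enumerate(
        ["Very poor", "Poor", "Adequate", "Good", "Excellent"], start=1
    )
)

final_prompt = FLOW_JUDGE_PROMPT.format(
    INPUTS="<user_input>\nSummarize the article in two sentences.\n</user_input>",
    OUTPUT="<response>\nThe article argues that ...\n</response>",
    EVALUATION_CRITERIA="Does the summary capture the key points?",
    RUBRIC=rubric,
)
print(final_prompt)
```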