bergr7f committed
Commit bfa9f9a · 2 Parent(s): bcd436c e0175c8

Merge branch 'feat/add-Flow-Judge-v0.1' into pr/8

Files changed (4):
  1. app.py  +11 -1
  2. data/models.jsonl  +2 -1
  3. gen_api_answer.py  +99 -18
  4. prompts.py  +54 -0
app.py CHANGED
@@ -14,7 +14,8 @@ from gen_api_answer import (
     get_model_response,
     parse_model_response,
     prometheus_parse_model_response,
-    atla_parse_model_response
+    atla_parse_model_response,
+    flow_judge_parse_model_response,
 )
 
 from random_sample_generation import (
@@ -750,6 +751,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
     is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
     is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
+
+    is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
+    is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
 
     if is_prometheus_a:
         score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
@@ -757,6 +761,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     elif is_atla_a:
         score_a_val, critique_a_val = atla_parse_model_response(response_a)
         score_a_val = f"{score_a_val} / 5"
+    elif is_flow_judge_a:
+        score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
+        score_a_val = f"{score_a_val} / 5"
     else:
         score_a_val, critique_a_val = parse_model_response(response_a)
         score_a_val = f"{score_a_val} / 5"
@@ -767,6 +774,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     elif is_atla_b:
         score_b_val, critique_b_val = atla_parse_model_response(response_b)
         score_b_val = f"{score_b_val} / 5"
+    elif is_flow_judge_b:
+        score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
+        score_b_val = f"{score_b_val} / 5"
     else:
         score_b_val, critique_b_val = parse_model_response(response_b)
         score_b_val = f"{score_b_val} / 5"
data/models.jsonl CHANGED
@@ -21,4 +21,5 @@
 {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
 {"name": "Atla-8B-preview-2024-01-08", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview-2024-01-08", "active": true}
 {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
-{"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
+{"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
+{"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2"}
gen_api_answer.py CHANGED
@@ -12,6 +12,7 @@ from prompts import (
     PROMETHEUS_PROMPT_WITH_REFERENCE,
     ATLA_PROMPT,
     ATLA_PROMPT_WITH_REFERENCE,
+    FLOW_JUDGE_PROMPT
 )
 
 # Initialize clients
@@ -22,6 +23,8 @@ hf_api_key = os.getenv("HF_API_KEY")
 cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
 
 
+flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
+
 
 def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from OpenAI API"""
@@ -145,6 +148,30 @@ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
         return str(content_items)  # Fallback if it's not a list
     except Exception as e:
         return f"Error with Cohere model {model_name}: {str(e)}"
+
+def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
+    """Get response from Flow Judge"""
+    try:
+        response = requests.post(
+            "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {flow_judge_api_key}"
+            },
+            json={
+                "model": model_name,
+                "messages": [
+                    {"role": "user", "content": prompt}
+                ],
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "top_p": top_p
+            }
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]['message']['content']
+    except Exception as e:
+        return f"Error with Flow Judge completions model {model_name}: {str(e)}"
 
 def get_model_response(
     model_name,
@@ -164,38 +191,64 @@ def get_model_response(
     # Determine if model is Prometheus or Atla
     is_prometheus = (organization == "Prometheus")
     is_atla = (organization == "Atla")
-
+    is_flow_judge = (organization == "Flow AI")
     # For non-Prometheus/Atla models, use the Judge system prompt
-    system_prompt = None if (is_prometheus or is_atla) else JUDGE_SYSTEM_PROMPT
+    system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
 
     # Select the appropriate base prompt
     if is_atla:
         base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
-    elif use_reference:
-        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
+    elif is_flow_judge:
+        base_prompt = FLOW_JUDGE_PROMPT
     else:
-        base_prompt = PROMETHEUS_PROMPT
+        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
 
     # For non-Prometheus/non-Atla models, replace the specific instruction
-    if not (is_prometheus or is_atla):
+    if not (is_prometheus or is_atla or is_flow_judge):
         base_prompt = base_prompt.replace(
             '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
             '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
         )
 
     try:
-        # Format the prompt with the provided data, only using available keys
-        final_prompt = base_prompt.format(
-            human_input=prompt_data['human_input'],
-            ai_response=prompt_data['ai_response'],
-            ground_truth_input=prompt_data.get('ground_truth_input', ''),
-            eval_criteria=prompt_data['eval_criteria'],
-            score1_desc=prompt_data['score1_desc'],
-            score2_desc=prompt_data['score2_desc'],
-            score3_desc=prompt_data['score3_desc'],
-            score4_desc=prompt_data['score4_desc'],
-            score5_desc=prompt_data['score5_desc']
-        )
+        if not is_flow_judge:
+            # Format the prompt with the provided data, only using available keys
+            final_prompt = base_prompt.format(
+                human_input=prompt_data['human_input'],
+                ai_response=prompt_data['ai_response'],
+                ground_truth_input=prompt_data.get('ground_truth_input', ''),
+                eval_criteria=prompt_data['eval_criteria'],
+                score1_desc=prompt_data['score1_desc'],
+                score2_desc=prompt_data['score2_desc'],
+                score3_desc=prompt_data['score3_desc'],
+                score4_desc=prompt_data['score4_desc'],
+                score5_desc=prompt_data['score5_desc']
+            )
+        else:
+            human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
+            ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
+            ground_truth = prompt_data.get('ground_truth_input', '')
+            if ground_truth:
+                response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
+            else:
+                response_reference = ""
+            eval_criteria = prompt_data['eval_criteria']
+            score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
+            score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
+            score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
+            score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
+            score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
+            rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
+            if response_reference:
+                inputs = human_input + "\n" + response_reference
+            else:
+                inputs = human_input
+            final_prompt = base_prompt.format(
+                INPUTS=inputs,
+                OUTPUT=ai_response,
+                EVALUATION_CRITERIA=eval_criteria,
+                RUBRIC=rubric
+            )
     except KeyError as e:
        return f"Error formatting prompt: Missing required field {str(e)}"
 
@@ -220,6 +273,10 @@ def get_model_response(
         return get_cohere_response(
             api_model, final_prompt, system_prompt, max_tokens, temperature
         )
+    elif organization == "Flow AI":
+        return get_flow_judge_response(
+            api_model, final_prompt, max_tokens, temperature
+        )
     else:
         # All other organizations use Together API
         return get_together_response(
@@ -306,7 +363,31 @@ def prometheus_parse_model_response(output):
     except Exception as e:
         print(f"Failed to parse response: {str(e)}")
         return "Error", f"Exception during parsing: {str(e)}"
+
+def flow_judge_parse_model_response(output):
+    try:
+        print(f"Raw model response: {output}")
+        # Convert multiple line breaks to single ones and strip whitespace
+        output = re.sub(r'\n{2,}', '\n', output.strip())
+
+        # Compile regex patterns
+        feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
+        score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
+
+        feedback_match = feedback_pattern.search(output)
+        score_match = score_pattern.search(output)
 
+        if feedback_match and score_match:
+            feedback = feedback_match.group(1).strip()
+            score = int(score_match.group(1).strip())
+            return str(score), feedback
+
+        return "Error", f"Failed to parse response: {output}"
+
+    except Exception as e:
+        print(f"Failed to parse response: {str(e)}")
+        return "Error", f"Exception during parsing: {str(e)}"
+
 def atla_parse_model_response(output):
     """Parse response from ATLA model"""
     try:
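To illustrate the parsing path added above, here is a standalone sketch that mirrors the regex logic of flow_judge_parse_model_response on a Flow-Judge-style completion. The sample output text is invented for illustration; the tag patterns and normalization are the ones from the diff, and the "/ 5" rendering mirrors what app.py does with the returned score.

import re

sample_output = """<feedback>
The response follows the user's instructions and covers the requested points.
</feedback>
<score>
4
</score>"""

# Same normalization and tag-based extraction as flow_judge_parse_model_response
text = re.sub(r'\n{2,}', '\n', sample_output.strip())
feedback_match = re.search(r"<feedback>\s*(.*?)\s*</feedback>", text, re.DOTALL)
score_match = re.search(r"<score>\s*(\d+)\s*</score>", text, re.DOTALL)

if feedback_match and score_match:
    score = score_match.group(1)
    feedback = feedback_match.group(1).strip()
    print(f"{score} / 5")   # app.py formats the parsed score as "<score> / 5"
    print(feedback)
else:
    print("Error: could not parse response")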
prompts.py CHANGED
@@ -90,6 +90,60 @@ Score 5: {score5_desc}
 ###Feedback:
 """
 
+# Define the Flow Judge prompt
+FLOW_JUDGE_PROMPT = """# GOAL
+Your job is to evaluate a task carried out by an AI system powered by a large \
+language model.
+
+You will be provided with the inputs and output of the task, as well as the evaluation criteria \
+and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
+criteria and scoring rubric provided.
+
+# INPUT
+Below are the inputs required for performing the task:
+<inputs>
+{INPUTS}
+</inputs>
+
+# OUTPUT
+Below is the output of the task:
+<output>
+{OUTPUT}
+</output>
+
+# EVALUATION CRITERIA AND SCORING RUBRIC
+Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
+<evaluation_criteria>
+{EVALUATION_CRITERIA}
+</evaluation_criteria>
+
+<scoring_rubric>
+{RUBRIC}
+</scoring_rubric>
+
+# INSTRUCTIONS FOR THE EVALUATION
+1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
+Review the evaluation criteria and scoring rubric to understand the different levels of \
+performance and the descriptions for each score.
+2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
+generated from completing the task.
+3. Compare output to score descriptions: Compare the output against the criteria and score \
+descriptions in the scoring rubric. For each criterion, decide which description best matches the \
+output.
+4. After comparing the output to the score descriptions, pay attention to the small details that \
+might impact the final score that you assign. Sometimes a small difference can dictate the final \
+score.
+5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
+to specific aspects of the output and comparing them to the rubric.
+6. Assign a final score based on the scoring rubric.
+
+## FORMAT FOR THE EVALUATION
+- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
+- Write the numeric score inside <score> tags, without any additional surrounding text and always \
+after the feedback.
+
+Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
+
 # Judge system prompt for non-Prometheus models
 JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
 
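The gen_api_answer.py changes fill FLOW_JUDGE_PROMPT with str.format using the INPUTS, OUTPUT, EVALUATION_CRITERIA, and RUBRIC placeholders. The sketch below shows that wiring end to end. It uses a trimmed stand-in template with the same placeholders rather than the full prompt, and the example criteria and rubric text are invented; the tag wrapping and rubric construction mirror the Flow AI branch of get_model_response.

# Trimmed stand-in for FLOW_JUDGE_PROMPT: only the four placeholders are kept.
TEMPLATE = (
    "# INPUT\n<inputs>\n{INPUTS}\n</inputs>\n\n"
    "# OUTPUT\n<output>\n{OUTPUT}\n</output>\n\n"
    "<evaluation_criteria>\n{EVALUATION_CRITERIA}\n</evaluation_criteria>\n"
    "<scoring_rubric>\n{RUBRIC}\n</scoring_rubric>"
)

prompt_data = {
    "human_input": "Summarize the article in two sentences.",
    "ai_response": "The article argues that ...",
    "ground_truth_input": "",  # optional reference answer
    "eval_criteria": "How faithful is the summary to the article?",
    "score1_desc": "Unfaithful", "score2_desc": "Mostly unfaithful",
    "score3_desc": "Partially faithful", "score4_desc": "Mostly faithful",
    "score5_desc": "Fully faithful",
}

# Mirror the Flow AI branch: wrap fields in tags, join the rubric lines,
# and append <response_reference> only when a reference answer is present.
inputs = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
if prompt_data.get("ground_truth_input"):
    inputs += f"\n<response_reference>\n{prompt_data['ground_truth_input']}\n</response_reference>"
output = f"<response>\n{prompt_data['ai_response']}\n</response>"
rubric = "\n".join(
    f"- Score {i}: {prompt_data[f'score{i}_desc']}" for i in range(1, 6)
)

final_prompt = TEMPLATE.format(
    INPUTS=inputs,
    OUTPUT=output,
    EVALUATION_CRITERIA=prompt_data["eval_criteria"],
    RUBRIC=rubric,
)
print(final_prompt)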