bergr7f committed e0175c8 (1 parent: 60f3337)

feat: add flow judge model
Files changed (4):
  1. app.py (+12 -1)
  2. data/models.jsonl (+2 -1)
  3. gen_api_answer.py (+104 -17)
  4. prompts.py (+54 -0)
app.py CHANGED
@@ -13,7 +13,8 @@ import gradio as gr
 from gen_api_answer import (
     get_model_response,
     parse_model_response,
-    prometheus_parse_model_response
+    prometheus_parse_model_response,
+    flow_judge_parse_model_response,
 )
 
 from random_sample_generation import (
@@ -749,10 +750,17 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
         is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
         is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
+
+        # Parse the responses based on model, using Flow Judge parsing for Flow Judge models and Prometheus parsing for others
+        is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
+        is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
 
         if is_prometheus_a:
             score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
             score_a_val = f"{score_a_val} / 5"
+        elif is_flow_judge_a:
+            score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
+            score_a_val = f"{score_a_val} / 5"
         else:
             score_a_val, critique_a_val = parse_model_response(response_a)
             score_a_val = f"{score_a_val} / 5"
@@ -760,6 +768,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         if is_prometheus_b:
             score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
             score_b_val = f"{score_b_val} / 5"
+        elif is_flow_judge_b:
+            score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
+            score_b_val = f"{score_b_val} / 5"
         else:
             score_b_val, critique_b_val = parse_model_response(response_b)
             score_b_val = f"{score_b_val} / 5"
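The three-way branch above is repeated for model A and model B. A minimal sketch of how that dispatch could be factored on the judge's organization is shown below; the helper name parse_judge_response is illustrative only and is not part of this commit:

```python
from gen_api_answer import (
    parse_model_response,
    prometheus_parse_model_response,
    flow_judge_parse_model_response,
)

# Hypothetical helper (not in the repository): route a raw judge completion to the
# matching parser based on the judge's organization, mirroring the if/elif/else above.
def parse_judge_response(organization: str, response: str) -> tuple[str, str]:
    if organization == "Prometheus":
        score, critique = prometheus_parse_model_response(response)
    elif organization == "Flow AI":
        score, critique = flow_judge_parse_model_response(response)
    else:
        score, critique = parse_model_response(response)
    return f"{score} / 5", critique
```

With a helper like this, the handler would only need the organization string from model_data for each side.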
data/models.jsonl CHANGED
@@ -18,4 +18,5 @@
 {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
 {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2"}
 {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r"}
-{"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus"}
+{"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus"}
+{"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2"}
gen_api_answer.py CHANGED
@@ -10,6 +10,7 @@ from prompts import (
     JUDGE_SYSTEM_PROMPT,
     PROMETHEUS_PROMPT,
     PROMETHEUS_PROMPT_WITH_REFERENCE,
+    FLOW_JUDGE_PROMPT
 )
 
 # Initialize clients
@@ -22,6 +23,8 @@ huggingface_client = OpenAI(
     base_url="https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud/v1/",
     api_key=hf_api_key
 )
+flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
+
 
 def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from OpenAI API"""
@@ -116,6 +119,30 @@ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
         return str(content_items)  # Fallback if it's not a list
     except Exception as e:
         return f"Error with Cohere model {model_name}: {str(e)}"
+
+def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
+    """Get response from Flow Judge"""
+    try:
+        response = requests.post(
+            "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {flow_judge_api_key}"
+            },
+            json={
+                "model": model_name,
+                "messages": [
+                    {"role": "user", "content": prompt}
+                ],
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "top_p": top_p
+            }
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]['message']['content']
+    except Exception as e:
+        return f"Error with Flow Judge completions model {model_name}: {str(e)}"
 
 def get_model_response(
     model_name,
@@ -134,36 +161,68 @@ def get_model_response(
 
     # Determine if model is Prometheus
     is_prometheus = (organization == "Prometheus")
-
+    is_flow_judge = (organization == "Flow AI")
     # For non-Prometheus models, use the Judge system prompt
-    system_prompt = None if is_prometheus else JUDGE_SYSTEM_PROMPT
+    system_prompt = None if is_prometheus or is_flow_judge else JUDGE_SYSTEM_PROMPT
 
     # Select the appropriate base prompt
     if use_reference:
-        base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
+        if not is_flow_judge:
+            base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE
+        else:
+            base_prompt = FLOW_JUDGE_PROMPT
     else:
-        base_prompt = PROMETHEUS_PROMPT
+        if not is_flow_judge:
+            base_prompt = PROMETHEUS_PROMPT
+        else:
+            base_prompt = FLOW_JUDGE_PROMPT
 
     # For non-Prometheus models, replace the specific instruction
-    if not is_prometheus:
+    if not is_prometheus and not is_flow_judge:
         base_prompt = base_prompt.replace(
             '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
             '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
         )
 
     try:
-        # Format the prompt with the provided data, only using available keys
-        final_prompt = base_prompt.format(
-            human_input=prompt_data['human_input'],
-            ai_response=prompt_data['ai_response'],
-            ground_truth_input=prompt_data.get('ground_truth_input', ''),
-            eval_criteria=prompt_data['eval_criteria'],
-            score1_desc=prompt_data['score1_desc'],
-            score2_desc=prompt_data['score2_desc'],
-            score3_desc=prompt_data['score3_desc'],
-            score4_desc=prompt_data['score4_desc'],
-            score5_desc=prompt_data['score5_desc']
-        )
+        if not is_flow_judge:
+            # Format the prompt with the provided data, only using available keys
+            final_prompt = base_prompt.format(
+                human_input=prompt_data['human_input'],
+                ai_response=prompt_data['ai_response'],
+                ground_truth_input=prompt_data.get('ground_truth_input', ''),
+                eval_criteria=prompt_data['eval_criteria'],
+                score1_desc=prompt_data['score1_desc'],
+                score2_desc=prompt_data['score2_desc'],
+                score3_desc=prompt_data['score3_desc'],
+                score4_desc=prompt_data['score4_desc'],
+                score5_desc=prompt_data['score5_desc']
+            )
+        else:
+            human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
+            ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
+            ground_truth = prompt_data.get('ground_truth_input', '')
+            if ground_truth:
+                response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
+            else:
+                response_reference = ""
+            eval_criteria = prompt_data['eval_criteria']
+            score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
+            score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
+            score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
+            score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
+            score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
+            rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
+            if response_reference:
+                inputs = human_input + "\n" + response_reference
+            else:
+                inputs = human_input
+            final_prompt = base_prompt.format(
+                INPUTS=inputs,
+                OUTPUT=ai_response,
+                EVALUATION_CRITERIA=eval_criteria,
+                RUBRIC=rubric
+            )
    except KeyError as e:
        return f"Error formatting prompt: Missing required field {str(e)}"
 
@@ -184,6 +243,10 @@ def get_model_response(
         return get_cohere_response(
             api_model, final_prompt, system_prompt, max_tokens, temperature
         )
+    elif organization == "Flow AI":
+        return get_flow_judge_response(
+            api_model, final_prompt, max_tokens, temperature
+        )
     else:
         # All other organizations use Together API
         return get_together_response(
@@ -267,6 +330,30 @@ def prometheus_parse_model_response(output):
 
         return "Error", f"Failed to parse response: {output}"
 
+    except Exception as e:
+        print(f"Failed to parse response: {str(e)}")
+        return "Error", f"Exception during parsing: {str(e)}"
+
+def flow_judge_parse_model_response(output):
+    try:
+        print(f"Raw model response: {output}")
+        # Convert multiple line breaks to single ones and strip whitespace
+        output = re.sub(r'\n{2,}', '\n', output.strip())
+
+        # Compile regex patterns
+        feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
+        score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
+
+        feedback_match = feedback_pattern.search(output)
+        score_match = score_pattern.search(output)
+
+        if feedback_match and score_match:
+            feedback = feedback_match.group(1).strip()
+            score = int(score_match.group(1).strip())
+            return str(score), feedback
+
+        return "Error", f"Failed to parse response: {output}"
+
     except Exception as e:
         print(f"Failed to parse response: {str(e)}")
         return "Error", f"Exception during parsing: {str(e)}"
prompts.py CHANGED
@@ -90,5 +90,59 @@ Score 5: {score5_desc}
 ###Feedback:
 """
 
+# Define the Flow Judge prompt
+FLOW_JUDGE_PROMPT = """# GOAL
+Your job is to evaluate a task carried out by an AI system powered by a large \
+language model.
+
+You will be provided with the inputs and output of the task, as well as the evaluation criteria \
+and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
+criteria and scoring rubric provided.
+
+# INPUT
+Below are the inputs required for performing the task:
+<inputs>
+{INPUTS}
+</inputs>
+
+# OUTPUT
+Below is the output of the task:
+<output>
+{OUTPUT}
+</output>
+
+# EVALUATION CRITERIA AND SCORING RUBRIC
+Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
+<evaluation_criteria>
+{EVALUATION_CRITERIA}
+</evaluation_criteria>
+
+<scoring_rubric>
+{RUBRIC}
+</scoring_rubric>
+
+# INSTRUCTIONS FOR THE EVALUATION
+1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
+Review the evaluation criteria and scoring rubric to understand the different levels of \
+performance and the descriptions for each score.
+2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
+generated from completing the task.
+3. Compare output to score descriptions: Compare the output against the criteria and score \
+descriptions in the scoring rubric. For each criterion, decide which description best matches the \
+output.
+4. After comparing the output to the score descriptions, pay attention to the small details that \
+might impact the final score that you assign. Sometimes a small difference can dictate the final \
+score.
+5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
+to specific aspects of the output and comparing them to the rubric.
+6. Assign a final score based on the scoring rubric.
+
+## FORMAT FOR THE EVALUATION
+- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
+- Write the numeric score inside <score> tags, without any additional surrounding text and always \
+after the feedback.
+
+Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
+
 # Judge system prompt for non-Prometheus models
 JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
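FLOW_JUDGE_PROMPT exposes four placeholders (INPUTS, OUTPUT, EVALUATION_CRITERIA, RUBRIC) that get_model_response fills for Flow AI judges. A rough usage sketch is below; the criteria and rubric text are invented for illustration, and in the app these values come from prompt_data:

```python
from prompts import FLOW_JUDGE_PROMPT

# Invented example data, wrapped in the same tags get_model_response uses.
inputs = "<user_input>\nSummarize the article in one sentence.\n</user_input>"
output = "<response>\nThe article argues that unit tests make refactoring safer.\n</response>"
criteria = "Does the summary capture the article's main claim?"
rubric = "\n".join(
    f"- Score {i}: {desc}"
    for i, desc in enumerate(
        [
            "The summary is unrelated to the article.",
            "The summary names the topic but misses the claim.",
            "The summary partially captures the claim.",
            "The summary captures the claim with minor omissions.",
            "The summary captures the claim accurately and concisely.",
        ],
        start=1,
    )
)

final_prompt = FLOW_JUDGE_PROMPT.format(
    INPUTS=inputs,
    OUTPUT=output,
    EVALUATION_CRITERIA=criteria,
    RUBRIC=rubric,
)
print(final_prompt.splitlines()[0])  # "# GOAL"
```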