kaikaidai committed on
Commit cd8d5ec · verified · 1 Parent(s): 7fe530a

Remove Flow judge until ready

Files changed (1)
  1. gen_api_answer.py +1030 -417
gen_api_answer.py CHANGED
@@ -1,448 +1,1061 @@
1
- from openai import OpenAI
2
- import anthropic
3
- from together import Together
4
- import cohere
5
  import json
6
  import re
7
- import os
8
- import requests
9
  from prompts import (
10
- JUDGE_SYSTEM_PROMPT,
11
- PROMETHEUS_PROMPT,
12
- PROMETHEUS_PROMPT_WITH_REFERENCE,
13
- ATLA_PROMPT,
14
- ATLA_PROMPT_WITH_REFERENCE,
15
- FLOW_JUDGE_PROMPT
16
  )
17
- from transformers import AutoTokenizer
18
-
19
- # Initialize clients
20
- anthropic_client = anthropic.Anthropic()
21
- openai_client = OpenAI()
22
- together_client = Together()
23
- hf_api_key = os.getenv("HF_API_KEY")
24
- flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
25
- cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
26
-
27
- def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
28
- """Get response from OpenAI API"""
29
- try:
30
- response = openai_client.chat.completions.create(
31
- model=model_name,
32
- messages=[
33
- {"role": "system", "content": system_prompt},
34
- {"role": "user", "content": prompt},
35
- ],
36
- max_completion_tokens=max_tokens,
37
- temperature=temperature,
38
- )
39
- return response.choices[0].message.content
40
- except Exception as e:
41
- return f"Error with OpenAI model {model_name}: {str(e)}"
42
 
43
- def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
44
- """Get response from Anthropic API"""
45
- try:
46
- response = anthropic_client.messages.create(
47
- model=model_name,
48
- max_tokens=max_tokens,
49
- temperature=temperature,
50
- system=system_prompt,
51
- messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
52
- )
53
- return response.content[0].text
54
- except Exception as e:
55
- return f"Error with Anthropic model {model_name}: {str(e)}"
56
 
57
- def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
58
- """Get response from Together API"""
59
- try:
60
- response = together_client.chat.completions.create(
61
- model=model_name,
62
- messages=[
63
- {"role": "system", "content": system_prompt},
64
- {"role": "user", "content": prompt},
65
- ],
66
- max_tokens=max_tokens,
67
- temperature=temperature,
68
- stream=False,
69
- )
70
- return response.choices[0].message.content
71
- except Exception as e:
72
- return f"Error with Together model {model_name}: {str(e)}"
73
 
74
- def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
75
- """Get response from Hugging Face model"""
76
- try:
77
- headers = {
78
- "Accept": "application/json",
79
- "Authorization": f"Bearer {hf_api_key}",
80
- "Content-Type": "application/json"
81
- }
82
-
83
- # Create messages list for chat template
84
- messages = []
85
- if system_prompt:
86
- messages.append({"role": "system", "content": system_prompt})
87
- messages.append({"role": "user", "content": prompt})
88
-
89
- # Apply chat template
90
- model_id = "prometheus-eval/prometheus-7b-v2.0"
91
- tokenizer = AutoTokenizer.from_pretrained(model_id)
92
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
93
-
94
- payload = {
95
- "inputs": formatted_prompt,
96
- "parameters": {
97
- "max_new_tokens": max_tokens,
98
- "return_full_text": False,
99
- "temperature": temperature
100
- }
101
- }
102
-
103
- response = requests.post(
104
- "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
105
- headers=headers,
106
- json=payload
107
- )
108
- return response.json()[0]["generated_text"]
109
- except Exception as e:
110
- return f"Error with Hugging Face model {model_name}: {str(e)}"
111
 
112
- def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
113
- """Get response from HF endpoint for Atla model"""
114
- try:
115
- headers = {
116
- "Accept": "application/json",
117
- "Authorization": f"Bearer {hf_api_key}",
118
- "Content-Type": "application/json"
119
- }
120
-
121
- # Create messages list for chat template
122
- messages = []
123
- if system_prompt:
124
- messages.append({"role": "system", "content": system_prompt})
125
- messages.append({"role": "user", "content": prompt})
126
-
127
- # Apply chat template
128
- model_id = "meta-llama/Llama-3.1-8B"
129
- tokenizer = AutoTokenizer.from_pretrained(model_id)
130
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
131
-
132
- payload = {
133
- "inputs": formatted_prompt,
134
- "parameters": {
135
- "max_new_tokens": max_tokens,
136
- "return_full_text": False,
137
- "temperature": temperature,
138
- "seed": 42,
139
- "add_generation_prompt": True
140
- }
141
- }
142
-
143
- response = requests.post(
144
- "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
145
- headers=headers,
146
- json=payload
147
- )
148
- return response.json()[0]["generated_text"]
149
- except Exception as e:
150
- return f"Error with Atla model {model_name}: {str(e)}"
151
 
152
- def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
153
- """Get response from Flow Judge"""
154
- try:
155
- response = requests.post(
156
- "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
157
- headers={
158
- "Content-Type": "application/json",
159
- "Authorization": f"Bearer {flow_judge_api_key}"
160
- },
161
- json={
162
- "model": model_name,
163
- "messages": [
164
- {"role": "user", "content": prompt}
165
- ],
166
- "max_tokens": max_tokens,
167
- "temperature": temperature,
168
- "top_p": top_p
169
- }
170
- )
171
- response.raise_for_status()
172
- return response.json()["choices"][0]['message']['content']
173
- except Exception as e:
174
- return f"Error with Flow Judge completions model {model_name}: {str(e)}"
175
 
176
- def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
177
- """Get response from Cohere API"""
 
178
  try:
179
- response = cohere_client.chat(
180
- model=model_name,
181
- messages=[
182
- {"role": "system", "content": system_prompt},
183
- {"role": "user", "content": prompt}
184
- ],
185
- max_tokens=max_tokens,
186
- temperature=temperature
187
- )
188
- # Extract the text from the content items
189
- content_items = response.message.content
190
- if isinstance(content_items, list):
191
- # Get the text from the first content item
192
- return content_items[0].text
193
- return str(content_items) # Fallback if it's not a list
194
- except Exception as e:
195
- return f"Error with Cohere model {model_name}: {str(e)}"
196
-
197
- def get_model_response(
198
- model_name,
199
- model_info,
200
- prompt_data,
201
- use_reference=False,
202
- max_tokens=500,
203
- temperature=0
204
- ):
205
- """Get response from appropriate API based on model organization"""
206
- if not model_info:
207
- return "Model not found or unsupported."
208
-
209
- api_model = model_info["api_model"]
210
- organization = model_info["organization"]
211
-
212
- # Determine if model is Prometheus or Atla or Flow Judge
213
- is_prometheus = (organization == "Prometheus")
214
- is_atla = (organization == "Atla")
215
- is_flow_judge = (organization == "Flow AI")
216
- # For non-Prometheus/Atla models/Flow Judge, use the Judge system prompt
217
- system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
218
-
219
- # Select the appropriate base prompt
220
-
221
- if is_atla:
222
- base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
223
- elif is_flow_judge:
224
- base_prompt = FLOW_JUDGE_PROMPT
225
  else:
226
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
227
 
228
- # For non-Prometheus/non-Atla models, replace the specific instruction
229
- if not (is_prometheus or is_atla or is_flow_judge):
230
- base_prompt = base_prompt.replace(
231
- '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
232
- '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
233
- )
234
 
235
- try:
236
- if not is_flow_judge:
237
- # Format the prompt with the provided data, only using available keys
238
- final_prompt = base_prompt.format(
239
- human_input=prompt_data['human_input'],
240
- ai_response=prompt_data['ai_response'],
241
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
242
- eval_criteria=prompt_data['eval_criteria'],
243
- score1_desc=prompt_data['score1_desc'],
244
- score2_desc=prompt_data['score2_desc'],
245
- score3_desc=prompt_data['score3_desc'],
246
- score4_desc=prompt_data['score4_desc'],
247
- score5_desc=prompt_data['score5_desc']
248
- )
249
 
250
- else:
251
- human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
252
- ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
253
- ground_truth=prompt_data.get('ground_truth_input', '')
254
- if ground_truth:
255
- response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
256
- else:
257
- response_reference = ""
258
- eval_criteria = prompt_data['eval_criteria']
259
- score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
260
- score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
261
- score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
262
- score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
263
- score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
264
- rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
265
- if response_reference:
266
- inputs = human_input + "\n"+ response_reference
267
- else:
268
- inputs = human_input
269
- final_prompt = base_prompt.format(
270
- INPUTS=inputs,
271
- OUTPUT=ai_response,
272
- EVALUATION_CRITERIA=eval_criteria,
273
- RUBRIC=rubric
274
- )
275
 
276
- except KeyError as e:
277
- return f"Error formatting prompt: Missing required field {str(e)}"
278
 
279
- try:
280
- if organization == "OpenAI":
281
- return get_openai_response(
282
- api_model, final_prompt, system_prompt, max_tokens, temperature
283
- )
284
- elif organization == "Anthropic":
285
- return get_anthropic_response(
286
- api_model, final_prompt, system_prompt, max_tokens, temperature
287
- )
288
- elif organization == "Prometheus":
289
- return get_prometheus_response(
290
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
291
- )
292
- elif organization == "Atla":
293
- return get_atla_response(
294
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
295
  )
296
- elif organization == "Cohere":
297
- return get_cohere_response(
298
- api_model, final_prompt, system_prompt, max_tokens, temperature
299
  )
300
- elif organization == "Flow AI":
301
- return get_flow_judge_response(
302
- api_model, final_prompt, max_tokens, temperature
303
  )
304
  else:
305
- # All other organizations use Together API
306
- return get_together_response(
307
- api_model, final_prompt, system_prompt, max_tokens, temperature
308
- )
309
- except Exception as e:
310
- return f"Error with {organization} model {model_name}: {str(e)}"
311
 
312
- def parse_model_response(response):
313
- try:
314
- # Debug print
315
- print(f"Raw model response: {response}")
316
-
317
- # If response is already a dictionary, use it directly
318
- if isinstance(response, dict):
319
- return str(response.get("result", "N/A")), response.get("feedback", "N/A")
320
-
321
- # First try to parse the entire response as JSON
322
- try:
323
- data = json.loads(response)
324
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
325
- except json.JSONDecodeError:
326
- # If that fails (typically for smaller models), try to find JSON within the response
327
- json_match = re.search(r"{.*}", response, re.DOTALL)
328
- if json_match:
329
- data = json.loads(json_match.group(0))
330
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
331
- else:
332
- return "Error", f"Invalid response format returned - here is the raw model response: {response}"
333
 
334
- except Exception as e:
335
- # Debug print for error case
336
- print(f"Failed to parse response: {str(e)}")
337
-
338
- # If the error message itself contains valid JSON, try to parse that
339
- try:
340
- error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
341
- if error_json_match:
342
- data = json.loads(error_json_match.group(0))
343
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
344
- except:
345
- pass
346
-
347
- return "Error", f"Failed to parse response: {response}"
348
-
349
- def prometheus_parse_model_response(output):
350
- try:
351
- print(f"Raw model response: {output}")
352
- output = output.strip()
353
 
354
- # Remove "Feedback:" prefix if present (case insensitive)
355
- output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
356
-
357
- # New pattern to match [RESULT] X at the beginning
358
- begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
359
- begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
360
- if begin_match:
361
- score = int(begin_match.group(1))
362
- feedback = begin_match.group(2).strip()
363
- return str(score), feedback
364
-
365
- # Existing patterns for end-of-string results...
366
- pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
367
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
368
- if match:
369
- feedback = match.group(1).strip()
370
- score = int(match.group(2))
371
- return str(score), feedback
372
-
373
- # If no match, try to match "... Score: X"
374
- pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
375
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
376
- if match:
377
- feedback = match.group(1).strip()
378
- score = int(match.group(2))
379
- return str(score), feedback
380
-
381
- # Pattern to handle [Score X] at the end
382
- pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
383
- match = re.search(pattern, output, re.DOTALL)
384
- if match:
385
- feedback = match.group(1).strip()
386
- score = int(match.group(2))
387
- return str(score), feedback
388
-
389
- # Final fallback attempt
390
- pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
391
- match = re.search(pattern, output)
392
- if match:
393
- score = int(match.group(1))
394
- feedback = output[:match.start()].rstrip()
395
- # Remove any trailing brackets from feedback
396
- feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
397
- return str(score), feedback
398
-
399
- return "Error", f"Failed to parse response: {output}"
400
-
401
- except Exception as e:
402
- print(f"Failed to parse response: {str(e)}")
403
- return "Error", f"Exception during parsing: {str(e)}"
404
-
405
- def atla_parse_model_response(output):
406
- """Parse response from ATLA model"""
407
- try:
408
- print(f"Raw Atla model response: {output}")
409
- output = output.strip()
410
 
411
- # Look for the Reasoning and Result sections
412
- reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
413
- result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
414
 
415
- if reasoning_match and result_match:
416
- feedback = reasoning_match.group(1).strip()
417
- score = result_match.group(1)
418
- return str(score), feedback
419
 
420
- return "Error", f"Failed to parse ATLA response format: {output}"
421
 
422
- except Exception as e:
423
- print(f"Failed to parse ATLA response: {str(e)}")
424
- return "Error", f"Exception during parsing: {str(e)}"
425
-
426
- def flow_judge_parse_model_response(output):
427
- try:
428
- print(f"Raw model response: {output}")
429
- # Convert multiple line breaks to single ones and strip whitespace
430
- output = re.sub(r'\n{2,}', '\n', output.strip())
431
-
432
- # Compile regex patterns
433
- feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
434
- score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
435
 
436
- feedback_match = feedback_pattern.search(output)
437
- score_match = score_pattern.search(output)
438
 
439
- if feedback_match or not score_match:
440
- feedback = feedback_match.group(1).strip()
441
- score = int(score_match.group(1).strip())
442
- return str(score), feedback
443
-
444
- return "Error", f"Failed to parse response: {output}"
445
 
446
- except Exception as e:
447
- print(f"Failed to parse response: {str(e)}")
448
- return "Error", f"Exception during parsing: {str(e)}"
1
  import json
2
  import re
3
+ import random
4
+ from collections import defaultdict
5
+ from datetime import datetime
6
+ import hashlib
7
+ import gradio as gr
8
+
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+
12
+ from gen_api_answer import (
13
+ get_model_response,
14
+ parse_model_response,
15
+ prometheus_parse_model_response,
16
+ atla_parse_model_response,
17
+ flow_judge_parse_model_response
18
+ )
19
+
20
+ from random_sample_generation import (
21
+ get_random_human_ai_pair,
22
+ get_random_human_ai_ground_truth_pair,
23
+ generate_ai_response
24
+ )
25
+ from db import add_vote, create_db_connection, get_votes
26
+
27
+ from utils import Vote
28
+
29
+ from common import (
30
+ POLICY_CONTENT,
31
+ ACKNOWLEDGEMENTS,
32
+ CSS_STYLES,
33
+ MAIN_TITLE,
34
+ HOW_IT_WORKS,
35
+ )
36
  from prompts import (
37
+ DEFAULT_EVAL_PROMPT,
38
+ DEFAULT_EVAL_PROMPT_EDITABLE,
39
+ FIXED_EVAL_SUFFIX,
40
+ DEFAULT_EVAL_CRITERIA,
41
+ DEFAULT_SCORE_1,
42
+ DEFAULT_SCORE_2,
43
+ DEFAULT_SCORE_3,
44
+ DEFAULT_SCORE_4,
45
+ DEFAULT_SCORE_5,
46
+ )
47
+ from leaderboard import (
48
+ get_leaderboard,
49
+ get_leaderboard_stats,
50
+ get_model_rankings,
51
+ DEFAULT_ELO,
52
+ K_FACTOR
53
  )
54
 
55
 
56
+ elo_scores = defaultdict(lambda: DEFAULT_ELO)
57
+ vote_counts = defaultdict(int)
58
 
59
+ db = create_db_connection()
60
+ votes_collection = get_votes(db)
61
 
62
+ current_time = datetime.now()
63
 
64
 
65
+ # Load the model_data from JSONL
66
+ def load_model_data():
67
+ model_data = {}
68
  try:
69
+ with open("data/models.jsonl", "r") as f:
70
+ for line in f:
71
+ model = json.loads(line)
72
+ model_data[model["name"]] = {
73
+ "organization": model["organization"],
74
+ "license": model["license"],
75
+ "api_model": model["api_model"],
76
+ }
77
+ except FileNotFoundError:
78
+ print("Warning: models.jsonl not found")
79
+ return {}
80
+ return model_data
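load_model_data expects one JSON object per line in data/models.jsonl with name, organization, license, and api_model fields; an illustrative record with hypothetical values (the real file's entries may differ):

```python
import json

# Hypothetical example of a single data/models.jsonl line.
example_line = '{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}'

model = json.loads(example_line)
entry = {model["name"]: {
    "organization": model["organization"],
    "license": model["license"],
    "api_model": model["api_model"],
}}
print(entry)
```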
81
+
82
+
83
+ model_data = load_model_data()
84
+
85
+ def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
86
+ prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
87
+
88
+ vote = Vote(
89
+ timestamp=datetime.now().isoformat(),
90
+ prompt=prompt_value,
91
+ response_a=response_a,
92
+ response_b=response_b,
93
+ model_a=model_a,
94
+ model_b=model_b,
95
+ winner=winner,
96
+ judge_id=judge_id,
97
+ )
98
+ add_vote(vote, db)
99
+
100
+
101
+ def parse_variables(prompt):
102
+ # Extract variables enclosed in double curly braces
103
+ variables = re.findall(r"{{(.*?)}}", prompt)
104
+ # Remove duplicates while preserving order
105
+ seen = set()
106
+ variables = [
107
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
108
+ ]
109
+ return variables
110
+
111
+
112
+ def get_final_prompt(eval_prompt, variable_values):
113
+ # Replace variables in the eval prompt with their values
114
+ for var, val in variable_values.items():
115
+ eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
116
+ return eval_prompt
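Together, parse_variables and get_final_prompt implement simple {{variable}} templating; a usage sketch (the two helpers are copied verbatim so the snippet runs standalone):

```python
import re

def parse_variables(prompt):
    variables = re.findall(r"{{(.*?)}}", prompt)
    seen = set()
    return [x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))]

def get_final_prompt(eval_prompt, variable_values):
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt

template = "Evaluate the answer to {{question}} given {{answer}}."
print(parse_variables(template))                                   # ['question', 'answer']
print(get_final_prompt(template, {"question": "2+2", "answer": "4"}))
```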
117
+
118
+
119
+
120
+ def get_ip(request: gr.Request) -> str:
121
+ """Get and hash the IP address from the request."""
122
+ if "cf-connecting-ip" in request.headers:
123
+ ip = request.headers["cf-connecting-ip"]
124
+ elif "x-forwarded-for" in request.headers:
125
+ ip = request.headers["x-forwarded-for"]
126
+ if "," in ip:
127
+ ip = ip.split(",")[0]
128
  else:
129
+ ip = request.client.host
130
 
131
+ # Hash the IP address for privacy
132
+ return hashlib.sha256(ip.encode()).hexdigest()[:16]
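The judge_id is simply the first 16 hex characters of a SHA-256 digest over the raw IP; a quick standalone illustration using a documentation-range address:

```python
import hashlib

ip = "203.0.113.7"  # example address, not a real visitor
judge_id = hashlib.sha256(ip.encode()).hexdigest()[:16]
print(judge_id)  # deterministic 16-character hex string for this IP
```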
133
 
134
 
135
+ def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
136
+ """Generate appropriate message based on vote and model rankings.
137
+ Returns (title, message) tuple."""
138
+ # Get current rankings
139
+ voting_data = get_current_votes()
140
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
141
+ rankings = get_model_rankings(leaderboard)
142
+ pos_a = rankings.get(model_a, 0)
143
+ pos_b = rankings.get(model_b, 0)
144
+
145
+ if choice == "Tie":
146
+ return "It's a tie!", "Keep voting responsibly 🤗"
147
+
148
+ # Check if vote aligns with leaderboard
149
+ if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
150
+ return "The favourite wins!", "Keep voting responsibly 🤗"
151
+ else:
152
+ return "The underdog wins!", "Keep voting responsibly ���"
153
+
154
+
155
+ def vote(
156
+ choice,
157
+ model_a,
158
+ model_b,
159
+ final_prompt,
160
+ score_a,
161
+ critique_a,
162
+ score_b,
163
+ critique_b,
164
+ request: gr.Request,
165
+ ):
166
+ # Get hashed IP as judge_id
167
+ judge_id = get_ip(request)
168
+
169
+ # Update ELO scores based on user choice
170
+ elo_a = elo_scores[model_a]
171
+ elo_b = elo_scores[model_b]
172
+
173
+ # Calculate expected scores
174
+ Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
175
+ Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
176
+
177
+ # Assign actual scores
178
+ if choice == "A":
179
+ Sa, Sb = 1, 0
180
+ elif choice == "B":
181
+ Sa, Sb = 0, 1
182
+ else:
183
+ Sa, Sb = 0.5, 0.5
184
+
185
+ # Update scores and vote counts
186
+ elo_scores[model_a] += K_FACTOR * (Sa - Ea)
187
+ elo_scores[model_b] += K_FACTOR * (Sb - Eb)
188
+ vote_counts[model_a] += 1
189
+ vote_counts[model_b] += 1
190
+
191
+ # Format the full responses with score and critique
192
+ response_a = f"""{score_a}
193
+
194
+ {critique_a}"""
195
+
196
+ response_b = f"""{score_b}
197
+
198
+ {critique_b}"""
199
+
200
+ # Store the vote data with the final prompt
201
+ store_vote_data(
202
+ final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
203
+ )
204
+
205
+ # Get model positions for display
206
+ voting_data = get_current_votes()
207
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
208
+ rankings = get_model_rankings(leaderboard)
209
+ pos_a = rankings.get(model_a, 0)
210
+ pos_b = rankings.get(model_b, 0)
211
+
212
+ # Format model names with positions and win/loss indicators
213
+ if choice == "Tie":
214
+ model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
215
+ model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
216
+ else:
217
+ winner = model_a if choice == "A" else model_b
218
+ loser = model_b if choice == "A" else model_a
219
+ winner_pos = pos_a if choice == "A" else pos_b
220
+ loser_pos = pos_b if choice == "A" else pos_a
221
 
222
+ model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
223
+ model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
224
+
225
+ # Generate vote message
226
+ title, message = get_vote_message(choice, model_a, model_b)
227
+
228
+ return [
229
+ gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
230
+ gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
231
+ gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
232
+ gr.update(value=model_a_display), # model_name_a
233
+ gr.update(value=model_b_display), # model_name_b
234
+ gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
235
+ gr.update(value="🎲 New round", variant="primary"), # random_btn
236
+ gr.Info(message, title=title), # success message
237
+ ]
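The vote handler applies a standard Elo update; a worked example with illustrative numbers only (DEFAULT_ELO and K_FACTOR are actually imported from leaderboard.py):

```python
# Illustrative ratings and K; the real constants live in leaderboard.py.
elo_a, elo_b, k = 1200.0, 1000.0, 32

# Expected scores from the logistic Elo formula used above
Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))   # ~0.76
Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))   # ~0.24

# Suppose the lower-rated model B wins the vote
Sa, Sb = 0, 1
print(round(elo_a + k * (Sa - Ea), 1))  # ~1175.7 (A loses more for an upset)
print(round(elo_b + k * (Sb - Eb), 1))  # ~1024.3
```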
238
 
239
+
240
+ def get_current_votes():
241
+ """Get current votes from database."""
242
+ return get_votes(db)
243
+
244
+
245
+ # Update the refresh_leaderboard function
246
+ def refresh_leaderboard(show_preliminary):
247
+ """Refresh the leaderboard data and stats."""
248
+ voting_data = get_current_votes()
249
+ leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
250
+ data = [
251
+ [
252
+ entry["Model"],
253
+ float(entry["ELO Score"]),
254
+ entry["95% CI"],
255
+ entry["# Votes"],
256
+ entry["Organization"],
257
+ entry["License"],
258
+ ]
259
+ for entry in leaderboard
260
+ ]
261
+ stats = get_leaderboard_stats(model_data, voting_data)
262
+ return [gr.update(value=data), gr.update(value=stats)]
263
+
264
+
265
+ # Update the leaderboard table definition in the UI
266
+ leaderboard_table = gr.Dataframe(
267
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
268
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
269
+ )
270
+
271
+
272
+ def populate_random_example(request: gr.Request, compatible_mode: bool):
273
+ """Generate a random human-AI conversation example and reset judge outputs."""
274
+ if compatible_mode:
275
+ # Generate all three components when compatible mode is enabled
276
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
277
+ else:
278
+ # Generate only human and AI messages when compatible mode is disabled
279
+ human_msg, ai_msg = get_random_human_ai_pair()
280
+ ground_truth_msg = ""
281
+
282
+ return [
283
+ gr.update(value=human_msg),
284
+ gr.update(value=ai_msg),
285
+ gr.update(value="🎲", variant="secondary"), # Reset random button appearance
286
+ gr.update(value=""), # Clear score A
287
+ gr.update(value=""), # Clear critique A
288
+ gr.update(value=""), # Clear score B
289
+ gr.update(value=""), # Clear critique B
290
+ gr.update(interactive=False, variant="primary"), # Reset vote A
291
+ gr.update(interactive=False, variant="primary"), # Reset vote B
292
+ gr.update(interactive=False, variant="primary"), # Reset vote tie
293
+ gr.update(value="*Model: Hidden*"), # Reset model name A
294
+ gr.update(value="*Model: Hidden*"), # Reset model name B
295
+ gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
296
+ ]
297
+
298
+
299
+ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
300
+ gr.Markdown(MAIN_TITLE)
301
+ gr.Markdown(HOW_IT_WORKS)
302
+
303
+ # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
304
+ eval_prompt = gr.Textbox(
305
+ value=DEFAULT_EVAL_PROMPT,
306
+ visible=False
307
+ )
308
+
309
+ with gr.Tabs():
310
+ with gr.TabItem("Judge Arena"):
311
+ with gr.Row():
312
+ # Left side - Input section
313
+ with gr.Column(scale=1):
314
+ with gr.Group():
315
+ human_input = gr.TextArea(
316
+ label="👩 User Input",
317
+ lines=10,
318
+ placeholder="Enter the human message here..."
319
+ )
320
+ with gr.Row():
321
+ generate_btn = gr.Button(
322
+ "Generate AI Response",
323
+ size="sm",
324
+ interactive=False
325
+ )
326
+
327
+ ai_response = gr.TextArea(
328
+ label="🤖 AI Response",
329
+ lines=15,
330
+ placeholder="Enter the AI response here..."
331
+ )
332
+
333
+ # Ground truth response (initially hidden)
334
+ ground_truth = gr.TextArea(
335
+ label="🎯 Ground truth response",
336
+ lines=12,
337
+ placeholder="Enter the ground truth response here...",
338
+ visible=False
339
+ )
340
+
341
+ with gr.Row():
342
+ random_btn = gr.Button("🎲", scale=2)
343
+ send_btn = gr.Button(
344
+ value="Run judges",
345
+ variant="primary",
346
+ size="lg",
347
+ scale=8
348
+ )
349
+
350
+ # Right side - Model outputs
351
+ with gr.Column(scale=1):
352
+ gr.Markdown("### 👩‍⚖️ Judge A")
353
+ with gr.Group():
354
+ model_name_a = gr.Markdown("*Model: Hidden*")
355
+ with gr.Row():
356
+ with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
357
+ score_a = gr.Textbox(label="Score", lines=6, interactive=False)
358
+ vote_a = gr.Button("Vote A", variant="primary", interactive=False)
359
+ with gr.Column(scale=9, min_width=400): # Wider width for critique
360
+ critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
361
+
362
+ # Tie button row
363
+ with gr.Row() as tie_button_row:
364
+ with gr.Column():
365
+ vote_tie = gr.Button("Tie", variant="primary", interactive=False)
366
+
367
+
368
+ gr.Markdown("### 🧑‍⚖️ Judge B")
369
+ with gr.Group():
370
+ model_name_b = gr.Markdown("*Model: Hidden*")
371
+ with gr.Row():
372
+ with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
373
+ score_b = gr.Textbox(label="Score", lines=6, interactive=False)
374
+ vote_b = gr.Button("Vote B", variant="primary", interactive=False)
375
+ with gr.Column(scale=9, min_width=400): # Wider width for critique
376
+ critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
377
+ # Place Vote B button directly under Judge B
378
+
379
+ gr.Markdown("<br>")
380
+
381
+
382
+ # Replace the "Edit Judge Prompt" Accordion section with:
383
+ with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
384
+ gr.Markdown("<br>")
385
+ use_reference_toggle = gr.Checkbox(
386
+ label="Use a reference response",
387
+ value=False
388
+ )
389
+
390
+ # Hide the default prompt editor
391
+ with gr.Column(visible=False) as default_prompt_editor:
392
+ eval_prompt_editable = gr.TextArea(
393
+ value=DEFAULT_EVAL_PROMPT_EDITABLE,
394
+ label="Evaluation Criteria",
395
+ lines=12
396
+ )
397
+
398
+ with gr.Row(visible=False) as edit_buttons_row:
399
+ cancel_prompt_btn = gr.Button("Cancel")
400
+ save_prompt_btn = gr.Button("Save", variant="primary")
401
+ gr.Markdown("*The sample being evaluated is always appended as:*")
402
+ gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
403
+
404
+ # Show the compatible mode editor
405
+ with gr.Column(visible=True) as compatible_prompt_editor:
406
+ with gr.Row():
407
+ # Left column - Evaluation Criteria
408
+ with gr.Column(scale=1):
409
+ eval_criteria_text = gr.TextArea(
410
+ label="Evaluation Criteria",
411
+ lines=12,
412
+ value=DEFAULT_EVAL_CRITERIA,
413
+ placeholder="Enter the evaluation criteria..."
414
+ )
415
+ prometheus_reference = gr.Markdown(
416
+ "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
417
+ visible=True
418
+ )
419
+
420
+ # Right column - Score Descriptions
421
+ with gr.Column(scale=1):
422
+ score1_description = gr.TextArea(
423
+ label="Score 1",
424
+ value=DEFAULT_SCORE_1,
425
+ placeholder="Description for score 1",
426
+ lines=2
427
+ )
428
+ score2_description = gr.TextArea(
429
+ label="Score 2",
430
+ value=DEFAULT_SCORE_2,
431
+ placeholder="Description for score 2",
432
+ lines=2
433
+ )
434
+ score3_description = gr.TextArea(
435
+ label="Score 3",
436
+ value=DEFAULT_SCORE_3,
437
+ placeholder="Description for score 3",
438
+ lines=2
439
+ )
440
+ score4_description = gr.TextArea(
441
+ label="Score 4",
442
+ value=DEFAULT_SCORE_4,
443
+ placeholder="Description for score 4",
444
+ lines=2
445
+ )
446
+ score5_description = gr.TextArea(
447
+ label="Score 5",
448
+ value=DEFAULT_SCORE_5,
449
+ placeholder="Description for score 5",
450
+ lines=2
451
+ )
452
+
453
+ # Add save/cancel buttons for compatible mode
454
+ with gr.Row(visible=False) as compatible_edit_buttons_row:
455
+ compatible_cancel_btn = gr.Button("Cancel")
456
+ compatible_save_btn = gr.Button("Save", variant="primary")
457
+
458
+ with gr.TabItem("Leaderboard"):
459
+ with gr.Row():
460
+ with gr.Column(scale=1):
461
+ show_preliminary = gr.Checkbox(
462
+ label="Reveal preliminary results",
463
+ value=True, # Checked by default
464
+ info="Show all models, including models with less human ratings (< 300 votes)",
465
+ interactive=True
466
+ )
467
+ stats_display = gr.Markdown()
468
+ leaderboard_table = gr.Dataframe(
469
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
470
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
471
  )
472
+
473
+ gr.Markdown("""<br>
474
+ <br>
475
+ Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
476
+
477
+ [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
478
+ """)
479
+
480
+ # Add change handler for checkbox
481
+ show_preliminary.change(
482
+ fn=refresh_leaderboard,
483
+ inputs=[show_preliminary],
484
+ outputs=[leaderboard_table, stats_display]
485
  )
486
+
487
+ # Update the load event
488
+ demo.load(
489
+ fn=refresh_leaderboard,
490
+ inputs=[show_preliminary],
491
+ outputs=[leaderboard_table, stats_display]
492
  )
493
+
494
+ with gr.TabItem("Policy"):
495
+ gr.Markdown(POLICY_CONTENT)
496
+ gr.Markdown(ACKNOWLEDGEMENTS)
497
+
498
+ # Define state variables for model tracking
499
+ model_a_state = gr.State()
500
+ model_b_state = gr.State()
501
+ final_prompt_state = gr.State()
502
+ eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
503
+ is_editing = gr.State(False) # Track editing state
504
+ compatible_mode_state = gr.State(False) # Track compatible mode state
505
+
506
+ # Update model names after responses are generated
507
+ def update_model_names(model_a, model_b):
508
+ return gr.update(value=f"*Model: {model_a}*"), gr.update(
509
+ value=f"*Model: {model_b}*"
510
+ )
511
+
512
+ # Store the last submitted prompt and variables for comparison
513
+ last_submission = gr.State({})
514
+
515
+ # Update the vote button click handlers
516
+ vote_a.click(
517
+ fn=vote,
518
+ inputs=[
519
+ gr.State("A"),
520
+ model_a_state,
521
+ model_b_state,
522
+ final_prompt_state,
523
+ score_a,
524
+ critique_a,
525
+ score_b,
526
+ critique_b,
527
+ ],
528
+ outputs=[
529
+ vote_a,
530
+ vote_b,
531
+ vote_tie,
532
+ model_name_a,
533
+ model_name_b,
534
+ send_btn,
535
+ random_btn,
536
+ gr.State(), # placeholder for success message
537
+ ],
538
+ )
539
+
540
+ vote_b.click(
541
+ fn=vote,
542
+ inputs=[
543
+ gr.State("B"),
544
+ model_a_state,
545
+ model_b_state,
546
+ final_prompt_state,
547
+ score_a,
548
+ critique_a,
549
+ score_b,
550
+ critique_b,
551
+ ],
552
+ outputs=[
553
+ vote_a,
554
+ vote_b,
555
+ vote_tie,
556
+ model_name_a,
557
+ model_name_b,
558
+ send_btn,
559
+ random_btn,
560
+ gr.State(), # placeholder for success message
561
+ ],
562
+ )
563
+
564
+ vote_tie.click(
565
+ fn=vote,
566
+ inputs=[
567
+ gr.State("Tie"),
568
+ model_a_state,
569
+ model_b_state,
570
+ final_prompt_state,
571
+ score_a,
572
+ critique_a,
573
+ score_b,
574
+ critique_b,
575
+ ],
576
+ outputs=[
577
+ vote_a,
578
+ vote_b,
579
+ vote_tie,
580
+ model_name_a,
581
+ model_name_b,
582
+ send_btn,
583
+ random_btn,
584
+ gr.State(), # placeholder for success message
585
+ ],
586
+ )
587
+
588
+ # Add handlers for save/cancel buttons
589
+ def save_prompt(new_prompt, previous_prompt):
590
+ return [
591
+ gr.update(value=new_prompt), # Update the prompt
592
+ new_prompt, # Update the previous prompt state
593
+ gr.update(visible=False) # Hide the buttons
594
+ ]
595
+
596
+ def cancel_prompt(previous_prompt):
597
+ return [
598
+ gr.update(value=previous_prompt), # Revert to previous prompt
599
+ previous_prompt, # Keep the previous prompt state
600
+ gr.update(visible=False) # Hide the buttons
601
+ ]
602
+
603
+ def show_edit_buttons(current_value, previous_value):
604
+ # Show buttons only if the current value differs from the previous value
605
+ return gr.update(visible=current_value != previous_value)
606
+
607
+ # Add handlers for save/cancel buttons and prompt changes
608
+ save_prompt_btn.click(
609
+ fn=save_prompt,
610
+ inputs=[eval_prompt_editable, eval_prompt_previous],
611
+ outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
612
+ )
613
+
614
+ cancel_prompt_btn.click(
615
+ fn=cancel_prompt,
616
+ inputs=[eval_prompt_previous],
617
+ outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
618
+ )
619
+
620
+ eval_prompt_editable.change(
621
+ fn=show_edit_buttons,
622
+ inputs=[eval_prompt_editable, eval_prompt_previous],
623
+ outputs=edit_buttons_row
624
+ )
625
+
626
+ # Function to toggle visibility based on compatible mode
627
+ def toggle_use_reference(checked):
628
+ if checked:
629
+ # Get new random samples with ground truth when enabling reference mode
630
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
631
+ return {
632
+ ground_truth: gr.update(visible=True, value=ground_truth_msg),
633
+ human_input: gr.update(value=human_msg),
634
+ ai_response: gr.update(value=ai_msg),
635
+ # Reset other UI elements
636
+ score_a: gr.update(value=""),
637
+ critique_a: gr.update(value=""),
638
+ score_b: gr.update(value=""),
639
+ critique_b: gr.update(value=""),
640
+ vote_a: gr.update(interactive=False, variant="primary"),
641
+ vote_b: gr.update(interactive=False, variant="primary"),
642
+ vote_tie: gr.update(interactive=False, variant="primary"),
643
+ model_name_a: gr.update(value="*Model: Hidden*"),
644
+ model_name_b: gr.update(value="*Model: Hidden*"),
645
+ random_btn: gr.update(value="🎲", variant="secondary"),
646
+ }
647
  else:
648
+ # Just hide ground truth when disabling reference mode
649
+ return {
650
+ ground_truth: gr.update(visible=False)
651
+ }
 
 
652
 
653
+ # Update the change handler to include all necessary outputs
654
+ use_reference_toggle.change(
655
+ fn=toggle_use_reference,
656
+ inputs=[use_reference_toggle],
657
+ outputs=[
658
+ ground_truth,
659
+ human_input,
660
+ ai_response,
661
+ score_a,
662
+ critique_a,
663
+ score_b,
664
+ critique_b,
665
+ vote_a,
666
+ vote_b,
667
+ vote_tie,
668
+ model_name_a,
669
+ model_name_b,
670
+ random_btn,
671
+ ]
672
+ )
 
673
 
674
+ # Add a new state variable to track first game
675
+ first_game_state = gr.State(True) # Initialize as True
676
 
677
+ # Update the submit function to use the state variable
678
+ def submit_and_store(
679
+ use_reference,
680
+ eval_criteria_text_input,
681
+ human_input,
682
+ ai_response,
683
+ ground_truth_input,
684
+ score1_description,
685
+ score2_description,
686
+ score3_description,
687
+ score4_description,
688
+ score5_description,
689
+ is_first_game, # Add state variable as input
690
+ ):
691
+ # Build prompt data dictionary
692
+ prompt_data = {
693
+ 'human_input': human_input,
694
+ 'ai_response': ai_response,
695
+ 'ground_truth_input': ground_truth_input,
696
+ 'eval_criteria': eval_criteria_text_input,
697
+ 'score1_desc': score1_description,
698
+ 'score2_desc': score2_description,
699
+ 'score3_desc': score3_description,
700
+ 'score4_desc': score4_description,
701
+ 'score5_desc': score5_description,
702
+ }
703
+
704
+ # Get list of active models only for matches
705
+ active_models = [name for name, info in model_data.items()
706
+ if info.get("active", True)]
707
 
708
+ atla_model = "Atla-8B-preview"
 
 
709
 
710
+ if is_first_game:
711
+ # For the first game, ensure new model is one of the models to catch up on votes
712
+ other_models = [m for m in active_models if m != atla_model]
713
+ other_model = random.choice(other_models)
714
 
715
+ # Randomly assign new model to either position A or B
716
+ if random.random() < 0.5:
717
+ model_a, model_b = atla_model, other_model
718
+ else:
719
+ model_a, model_b = other_model, atla_model
720
+ else:
721
+ # For subsequent games, a new model appears 40% of the time
722
+ if random.random() < 0.4:
723
+ # Randomly choose between new models
724
+ new_model = random.choice(["Atla-8B-preview"]) # add "Flow-Judge-1.0" once ready
725
+ other_models = [m for m in active_models if m not in [new_model]]
726
+ other_model = random.choice(other_models)
727
+
728
+ if random.random() < 0.5:
729
+ model_a, model_b = new_model, other_model
730
+ else:
731
+ model_a, model_b = other_model, new_model
732
+ else:
733
+ # For other cases, exclude both Atla and Flow-Judge
734
+ non_special_models = [m for m in active_models if m not in new_model]
735
+ model1, model2 = random.sample(non_special_models, 2)
736
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
737
 
738
+ # Get responses from models
739
+ response_a = get_model_response(
740
+ model_a,
741
+ model_data.get(model_a),
742
+ prompt_data,
743
+ use_reference=use_reference
744
+ )
745
+ response_b = get_model_response(
746
+ model_b,
747
+ model_data.get(model_b),
748
+ prompt_data,
749
+ use_reference=use_reference
750
+ )
751
 
752
+ # Parse the responses based on model, using appropriate parsing for different models
753
+ is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
754
+ is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
755
+ is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
756
+ is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
757
+ is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
758
+ is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
759
 
760
+ if is_prometheus_a:
761
+ score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
762
+ score_a_val = f"{score_a_val} / 5"
763
+ elif is_atla_a:
764
+ score_a_val, critique_a_val = atla_parse_model_response(response_a)
765
+ score_a_val = f"{score_a_val} / 5"
766
+ elif is_flow_judge_a:
767
+ score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
768
+ score_a_val = f"{score_a_val} / 5"
769
+ else:
770
+ score_a_val, critique_a_val = parse_model_response(response_a)
771
+ score_a_val = f"{score_a_val} / 5"
772
+
773
+ if is_prometheus_b:
774
+ score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
775
+ score_b_val = f"{score_b_val} / 5"
776
+ elif is_atla_b:
777
+ score_b_val, critique_b_val = atla_parse_model_response(response_b)
778
+ score_b_val = f"{score_b_val} / 5"
779
+ elif is_flow_judge_b:
780
+ score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
781
+ score_b_val = f"{score_b_val} / 5"
782
+ else:
783
+ score_b_val, critique_b_val = parse_model_response(response_b)
784
+ score_b_val = f"{score_b_val} / 5"
785
+
786
+ return (
787
+ score_a_val,
788
+ critique_a_val,
789
+ score_b_val,
790
+ critique_b_val,
791
+ gr.update(interactive=True, variant="primary"), # vote_a
792
+ gr.update(interactive=True, variant="primary"), # vote_b
793
+ gr.update(interactive=True, variant="primary"), # vote_tie
794
+ model_a,
795
+ model_b,
796
+ eval_prompt,
797
+ gr.update(value="*Model: Hidden*"),
798
+ gr.update(value="*Model: Hidden*"),
799
+ gr.update(value="Regenerate judges", variant="secondary", interactive=True),
800
+ gr.update(value="🎲"), # random_btn
801
+ False, # Set first_game_state to False after first submission
802
+ )
803
+
804
+ # Update the click handler to use False for is_first_game after first submission
805
+ def create_submit_handler():
806
+ first_game = True
807
+
808
+ def handler(*args):
809
+ nonlocal first_game
810
+ result = submit_and_store(*args, first_game)
811
+ first_game = False # Set to False after first submission
812
+ return result
813
 
814
+ return handler
815
+
816
+ # Update the send_btn click handler
817
+ send_btn.click(
818
+ fn=submit_and_store,
819
+ inputs=[
820
+ use_reference_toggle,
821
+ eval_criteria_text,
822
+ human_input,
823
+ ai_response,
824
+ ground_truth,
825
+ score1_description,
826
+ score2_description,
827
+ score3_description,
828
+ score4_description,
829
+ score5_description,
830
+ first_game_state, # Add first_game_state as input
831
+ ],
832
+ outputs=[
833
+ score_a,
834
+ critique_a,
835
+ score_b,
836
+ critique_b,
837
+ vote_a,
838
+ vote_b,
839
+ vote_tie,
840
+ model_a_state,
841
+ model_b_state,
842
+ final_prompt_state,
843
+ model_name_a,
844
+ model_name_b,
845
+ send_btn,
846
+ random_btn,
847
+ first_game_state, # Add first_game_state as output
848
+ ],
849
+ )
850
+
851
+ # Add random button handler
852
+ random_btn.click(
853
+ fn=populate_random_example,
854
+ inputs=[use_reference_toggle], # Use compatible mode toggle to decide behavior
855
+ outputs=[
856
+ human_input,
857
+ ai_response,
858
+ random_btn,
859
+ score_a,
860
+ critique_a,
861
+ score_b,
862
+ critique_b,
863
+ vote_a,
864
+ vote_b,
865
+ vote_tie,
866
+ model_name_a,
867
+ model_name_b,
868
+ ground_truth, # Set ground truth
869
+ ]
870
+ )
871
+
872
+ # Add new input change handlers
873
+ def handle_input_change():
874
+ """Reset UI state when inputs are changed"""
875
+ return [
876
+ gr.update(interactive=False), # vote_a
877
+ gr.update(interactive=False), # vote_b
878
+ gr.update(interactive=False), # vote_tie
879
+ gr.update(value="Run judges", variant="primary"), # send_btn
880
+ gr.update(value="🎲", variant="secondary"), # random_btn
881
+ ]
882
+
883
+ # Update the change handlers for inputs
884
+ human_input.change(
885
+ fn=handle_input_change,
886
+ inputs=[],
887
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
888
+ )
889
+
890
+ ai_response.change(
891
+ fn=handle_input_change,
892
+ inputs=[],
893
+ outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
894
+ )
895
+
896
+ generate_btn.click(
897
+ fn=lambda msg: (
898
+ generate_ai_response(msg)[0], # Only take the response text
899
+ gr.update(
900
+ value="Generate AI Response", # Keep the label
901
+ interactive=False # Disable the button
902
+ )
903
+ ),
904
+ inputs=[human_input],
905
+ outputs=[ai_response, generate_btn]
906
+ )
907
+
908
+ human_input.change(
909
+ fn=lambda x: gr.update(interactive=bool(x.strip())),
910
+ inputs=[human_input],
911
+ outputs=[generate_btn]
912
+ )
913
+
914
+ # Update the demo.load to include the random example population
915
+ demo.load(
916
+ fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
917
+ inputs=[],
918
+ outputs=[
919
+ human_input,
920
+ ai_response,
921
+ random_btn,
922
+ score_a,
923
+ critique_a,
924
+ score_b,
925
+ critique_b,
926
+ vote_a,
927
+ vote_b,
928
+ vote_tie,
929
+ model_name_a,
930
+ model_name_b,
931
+ ground_truth,
932
+ ]
933
+ )
934
+
935
+ # Add new state variables for compatible mode
936
+ eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
937
+ score1_previous = gr.State(value=DEFAULT_SCORE_1)
938
+ score2_previous = gr.State(value=DEFAULT_SCORE_2)
939
+ score3_previous = gr.State(value=DEFAULT_SCORE_3)
940
+ score4_previous = gr.State(value=DEFAULT_SCORE_4)
941
+ score5_previous = gr.State(value=DEFAULT_SCORE_5)
942
+
943
+ # Add new functions to handle compatible mode saves/cancels
944
+ def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
945
+ return [
946
+ gr.update(value=criteria), # Update criteria
947
+ criteria, # Update previous criteria state
948
+ gr.update(value=score1),
949
+ score1,
950
+ gr.update(value=score2),
951
+ score2,
952
+ gr.update(value=score3),
953
+ score3,
954
+ gr.update(value=score4),
955
+ score4,
956
+ gr.update(value=score5),
957
+ score5,
958
+ gr.update(visible=False) # Hide buttons
959
+ ]
960
+
961
+ def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
962
+ return [
963
+ gr.update(value=prev_criteria),
964
+ prev_criteria,
965
+ gr.update(value=prev_score1),
966
+ prev_score1,
967
+ gr.update(value=prev_score2),
968
+ prev_score2,
969
+ gr.update(value=prev_score3),
970
+ prev_score3,
971
+ gr.update(value=prev_score4),
972
+ prev_score4,
973
+ gr.update(value=prev_score5),
974
+ prev_score5,
975
+ gr.update(visible=False)
976
+ ]
977
+
978
+ def show_compatible_edit_buttons(*current_values):
979
+ previous_values = current_values[1::2] # Get previous values
980
+ current_values = current_values[::2] # Get current values
981
+ return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
982
+
983
+ # Add click handlers for compatible mode buttons
984
+ compatible_save_btn.click(
985
+ fn=save_compatible_prompt,
986
+ inputs=[
987
+ eval_criteria_text,
988
+ score1_description,
989
+ score2_description,
990
+ score3_description,
991
+ score4_description,
992
+ score5_description
993
+ ],
994
+ outputs=[
995
+ eval_criteria_text,
996
+ eval_criteria_previous,
997
+ score1_description,
998
+ score1_previous,
999
+ score2_description,
1000
+ score2_previous,
1001
+ score3_description,
1002
+ score3_previous,
1003
+ score4_description,
1004
+ score4_previous,
1005
+ score5_description,
1006
+ score5_previous,
1007
+ compatible_edit_buttons_row
1008
+ ]
1009
+ )
1010
+
1011
+ compatible_cancel_btn.click(
1012
+ fn=cancel_compatible_prompt,
1013
+ inputs=[
1014
+ eval_criteria_previous,
1015
+ score1_previous,
1016
+ score2_previous,
1017
+ score3_previous,
1018
+ score4_previous,
1019
+ score5_previous
1020
+ ],
1021
+ outputs=[
1022
+ eval_criteria_text,
1023
+ eval_criteria_previous,
1024
+ score1_description,
1025
+ score1_previous,
1026
+ score2_description,
1027
+ score2_previous,
1028
+ score3_description,
1029
+ score3_previous,
1030
+ score4_description,
1031
+ score4_previous,
1032
+ score5_description,
1033
+ score5_previous,
1034
+ compatible_edit_buttons_row
1035
+ ]
1036
+ )
1037
+
1038
+ # Add change handlers for all compatible mode inputs
1039
+ for component in [eval_criteria_text, score1_description, score2_description,
1040
+ score3_description, score4_description, score5_description]:
1041
+ component.change(
1042
+ fn=show_compatible_edit_buttons,
1043
+ inputs=[
1044
+ eval_criteria_text,
1045
+ eval_criteria_previous,
1046
+ score1_description,
1047
+ score1_previous,
1048
+ score2_description,
1049
+ score2_previous,
1050
+ score3_description,
1051
+ score3_previous,
1052
+ score4_description,
1053
+ score4_previous,
1054
+ score5_description,
1055
+ score5_previous
1056
+ ],
1057
+ outputs=compatible_edit_buttons_row
1058
+ )
1059
+
1060
+ if __name__ == "__main__":
1061
+ demo.launch()