kaikaidai committed (verified)
Commit e193ee0 · 1 Parent(s): 2ecd4d6

Undo previous commit


app.py committed here

Files changed (1)
  1. gen_api_answer.py +417 -1030
gen_api_answer.py CHANGED
@@ -1,1061 +1,448 @@
1
  import json
2
  import re
3
- import random
4
- from collections import defaultdict
5
- from datetime import datetime
6
- import hashlib
7
- import gradio as gr
8
-
9
- from dotenv import load_dotenv
10
- load_dotenv()
11
-
12
- from gen_api_answer import (
13
- get_model_response,
14
- parse_model_response,
15
- prometheus_parse_model_response,
16
- atla_parse_model_response,
17
- flow_judge_parse_model_response
18
- )
19
-
20
- from random_sample_generation import (
21
- get_random_human_ai_pair,
22
- get_random_human_ai_ground_truth_pair,
23
- generate_ai_response
24
- )
25
- from db import add_vote, create_db_connection, get_votes
26
-
27
- from utils import Vote
28
-
29
- from common import (
30
- POLICY_CONTENT,
31
- ACKNOWLEDGEMENTS,
32
- CSS_STYLES,
33
- MAIN_TITLE,
34
- HOW_IT_WORKS,
35
- )
36
  from prompts import (
37
- DEFAULT_EVAL_PROMPT,
38
- DEFAULT_EVAL_PROMPT_EDITABLE,
39
- FIXED_EVAL_SUFFIX,
40
- DEFAULT_EVAL_CRITERIA,
41
- DEFAULT_SCORE_1,
42
- DEFAULT_SCORE_2,
43
- DEFAULT_SCORE_3,
44
- DEFAULT_SCORE_4,
45
- DEFAULT_SCORE_5,
46
- )
47
- from leaderboard import (
48
- get_leaderboard,
49
- get_leaderboard_stats,
50
- get_model_rankings,
51
- DEFAULT_ELO,
52
- K_FACTOR
53
  )
54
-
55
-
56
- elo_scores = defaultdict(lambda: DEFAULT_ELO)
57
- vote_counts = defaultdict(int)
58
-
59
- db = create_db_connection()
60
- votes_collection = get_votes(db)
61
-
62
- current_time = datetime.now()
63
-
64
-
65
- # Load the model_data from JSONL
66
- def load_model_data():
67
- model_data = {}
68
  try:
69
- with open("data/models.jsonl", "r") as f:
70
- for line in f:
71
- model = json.loads(line)
72
- model_data[model["name"]] = {
73
- "organization": model["organization"],
74
- "license": model["license"],
75
- "api_model": model["api_model"],
76
- }
77
- except FileNotFoundError:
78
- print("Warning: models.jsonl not found")
79
- return {}
80
- return model_data
81
-
82
-
83
- model_data = load_model_data()
84
-
85
- def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
86
- prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
87
-
88
- vote = Vote(
89
- timestamp=datetime.now().isoformat(),
90
- prompt=prompt_value,
91
- response_a=response_a,
92
- response_b=response_b,
93
- model_a=model_a,
94
- model_b=model_b,
95
- winner=winner,
96
- judge_id=judge_id,
97
- )
98
- add_vote(vote, db)
99
-
100
-
101
- def parse_variables(prompt):
102
- # Extract variables enclosed in double curly braces
103
- variables = re.findall(r"{{(.*?)}}", prompt)
104
- # Remove duplicates while preserving order
105
- seen = set()
106
- variables = [
107
- x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
108
- ]
109
- return variables
110
-
111
-
112
- def get_final_prompt(eval_prompt, variable_values):
113
- # Replace variables in the eval prompt with their values
114
- for var, val in variable_values.items():
115
- eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
116
- return eval_prompt
117
-
118
-
119
-
120
- def get_ip(request: gr.Request) -> str:
121
- """Get and hash the IP address from the request."""
122
- if "cf-connecting-ip" in request.headers:
123
- ip = request.headers["cf-connecting-ip"]
124
- elif "x-forwarded-for" in request.headers:
125
- ip = request.headers["x-forwarded-for"]
126
- if "," in ip:
127
- ip = ip.split(",")[0]
128
- else:
129
- ip = request.client.host
130
-
131
- # Hash the IP address for privacy
132
- return hashlib.sha256(ip.encode()).hexdigest()[:16]
133
-
134
-
135
- def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
136
- """Generate appropriate message based on vote and model rankings.
137
- Returns (title, message) tuple."""
138
- # Get current rankings
139
- voting_data = get_current_votes()
140
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
141
- rankings = get_model_rankings(leaderboard)
142
- pos_a = rankings.get(model_a, 0)
143
- pos_b = rankings.get(model_b, 0)
144
-
145
- if choice == "Tie":
146
- return "It's a tie!", "Keep voting responsibly 🤗"
147
-
148
- # Check if vote aligns with leaderboard
149
- if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
150
- return "The favourite wins!", "Keep voting responsibly 🤗"
151
- else:
152
- return "The underdog wins!", "Keep voting responsibly 🤗"
153
-
154
-
155
- def vote(
156
- choice,
157
- model_a,
158
- model_b,
159
- final_prompt,
160
- score_a,
161
- critique_a,
162
- score_b,
163
- critique_b,
164
- request: gr.Request,
165
- ):
166
- # Get hashed IP as judge_id
167
- judge_id = get_ip(request)
168
-
169
- # Update ELO scores based on user choice
170
- elo_a = elo_scores[model_a]
171
- elo_b = elo_scores[model_b]
172
-
173
- # Calculate expected scores
174
- Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
175
- Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))
176
-
177
- # Assign actual scores
178
- if choice == "A":
179
- Sa, Sb = 1, 0
180
- elif choice == "B":
181
- Sa, Sb = 0, 1
182
- else:
183
- Sa, Sb = 0.5, 0.5
184
-
185
- # Update scores and vote counts
186
- elo_scores[model_a] += K_FACTOR * (Sa - Ea)
187
- elo_scores[model_b] += K_FACTOR * (Sb - Eb)
188
- vote_counts[model_a] += 1
189
- vote_counts[model_b] += 1
190
-
191
- # Format the full responses with score and critique
192
- response_a = f"""{score_a}
193
-
194
- {critique_a}"""
195
 
196
- response_b = f"""{score_b}
 
 
 
 
197
 
198
- {critique_b}"""
 
 
 
 
199
 
200
- # Store the vote data with the final prompt
201
- store_vote_data(
202
- final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
203
- )
204
-
205
- # Get model positions for display
206
- voting_data = get_current_votes()
207
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
208
- rankings = get_model_rankings(leaderboard)
209
- pos_a = rankings.get(model_a, 0)
210
- pos_b = rankings.get(model_b, 0)
211
-
212
- # Format model names with positions and win/loss indicators
213
- if choice == "Tie":
214
- model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
215
- model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
216
- else:
217
- winner = model_a if choice == "A" else model_b
218
- loser = model_b if choice == "A" else model_a
219
- winner_pos = pos_a if choice == "A" else pos_b
220
- loser_pos = pos_b if choice == "A" else pos_a
221
 
222
- model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
223
- model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
224
-
225
- # Generate vote message
226
- title, message = get_vote_message(choice, model_a, model_b)
227
-
228
- return [
229
- gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
230
- gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
231
- gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
232
- gr.update(value=model_a_display), # model_name_a
233
- gr.update(value=model_b_display), # model_name_b
234
- gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
235
- gr.update(value="🎲 New round", variant="primary"), # random_btn
236
- gr.Info(message, title=title), # success message
237
- ]
238
-
239
-
240
- def get_current_votes():
241
- """Get current votes from database."""
242
- return get_votes(db)
243
-
244
-
245
- # Update the refresh_leaderboard function
246
- def refresh_leaderboard(show_preliminary):
247
- """Refresh the leaderboard data and stats."""
248
- voting_data = get_current_votes()
249
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
250
- data = [
251
- [
252
- entry["Model"],
253
- float(entry["ELO Score"]),
254
- entry["95% CI"],
255
- entry["# Votes"],
256
- entry["Organization"],
257
- entry["License"],
258
- ]
259
- for entry in leaderboard
260
- ]
261
- stats = get_leaderboard_stats(model_data, voting_data)
262
- return [gr.update(value=data), gr.update(value=stats)]
263
-
264
 
265
- # Update the leaderboard table definition in the UI
266
- leaderboard_table = gr.Dataframe(
267
- headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
268
- datatype=["str", "number", "str", "number", "str", "str", "str"],
269
- )
 
 
 
 
 
 
 
270
 
 
 
 
 
 
271
 
272
- def populate_random_example(request: gr.Request, compatible_mode: bool):
273
- """Generate a random human-AI conversation example and reset judge outputs."""
274
- if compatible_mode:
275
- # Generate all three components when compatible mode is enabled
276
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
 
 
 
 
 
 
277
  else:
278
- # Generate only human and AI messages when compatible mode is disabled
279
- human_msg, ai_msg = get_random_human_ai_pair()
280
- ground_truth_msg = ""
281
-
282
- return [
283
- gr.update(value=human_msg),
284
- gr.update(value=ai_msg),
285
- gr.update(value="🎲", variant="secondary"), # Reset random button appearance
286
- gr.update(value=""), # Clear score A
287
- gr.update(value=""), # Clear critique A
288
- gr.update(value=""), # Clear score B
289
- gr.update(value=""), # Clear critique B
290
- gr.update(interactive=False, variant="primary"), # Reset vote A
291
- gr.update(interactive=False, variant="primary"), # Reset vote B
292
- gr.update(interactive=False, variant="primary"), # Reset vote tie
293
- gr.update(value="*Model: Hidden*"), # Reset model name A
294
- gr.update(value="*Model: Hidden*"), # Reset model name B
295
- gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
296
- ]
297
-
298
-
299
- with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
300
- gr.Markdown(MAIN_TITLE)
301
- gr.Markdown(HOW_IT_WORKS)
302
 
303
- # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
304
- eval_prompt = gr.Textbox(
305
- value=DEFAULT_EVAL_PROMPT,
306
- visible=False
307
- )
308
-
309
- with gr.Tabs():
310
- with gr.TabItem("Judge Arena"):
311
- with gr.Row():
312
- # Left side - Input section
313
- with gr.Column(scale=1):
314
- with gr.Group():
315
- human_input = gr.TextArea(
316
- label="👩 User Input",
317
- lines=10,
318
- placeholder="Enter the human message here..."
319
- )
320
- with gr.Row():
321
- generate_btn = gr.Button(
322
- "Generate AI Response",
323
- size="sm",
324
- interactive=False
325
- )
326
-
327
- ai_response = gr.TextArea(
328
- label="🤖 AI Response",
329
- lines=15,
330
- placeholder="Enter the AI response here..."
331
- )
332
-
333
- # Ground truth response (initially hidden)
334
- ground_truth = gr.TextArea(
335
- label="🎯 Ground truth response",
336
- lines=12,
337
- placeholder="Enter the ground truth response here...",
338
- visible=False
339
- )
340
-
341
- with gr.Row():
342
- random_btn = gr.Button("🎲", scale=2)
343
- send_btn = gr.Button(
344
- value="Run judges",
345
- variant="primary",
346
- size="lg",
347
- scale=8
348
- )
349
-
350
- # Right side - Model outputs
351
- with gr.Column(scale=1):
352
- gr.Markdown("### 👩‍⚖️ Judge A")
353
- with gr.Group():
354
- model_name_a = gr.Markdown("*Model: Hidden*")
355
- with gr.Row():
356
- with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
357
- score_a = gr.Textbox(label="Score", lines=6, interactive=False)
358
- vote_a = gr.Button("Vote A", variant="primary", interactive=False)
359
- with gr.Column(scale=9, min_width=400): # Wider width for critique
360
- critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
361
-
362
- # Tie button row
363
- with gr.Row() as tie_button_row:
364
- with gr.Column():
365
- vote_tie = gr.Button("Tie", variant="primary", interactive=False)
366
-
367
-
368
- gr.Markdown("### 🧑‍⚖️ Judge B")
369
- with gr.Group():
370
- model_name_b = gr.Markdown("*Model: Hidden*")
371
- with gr.Row():
372
- with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
373
- score_b = gr.Textbox(label="Score", lines=6, interactive=False)
374
- vote_b = gr.Button("Vote B", variant="primary", interactive=False)
375
- with gr.Column(scale=9, min_width=400): # Wider width for critique
376
- critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
377
- # Place Vote B button directly under Judge B
378
-
379
- gr.Markdown("<br>")
380
-
381
-
382
- # Replace the "Edit Judge Prompt" Accordion section with:
383
- with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
384
- gr.Markdown("<br>")
385
- use_reference_toggle = gr.Checkbox(
386
- label="Use a reference response",
387
- value=False
388
- )
389
-
390
- # Hide the default prompt editor
391
- with gr.Column(visible=False) as default_prompt_editor:
392
- eval_prompt_editable = gr.TextArea(
393
- value=DEFAULT_EVAL_PROMPT_EDITABLE,
394
- label="Evaluation Criteria",
395
- lines=12
396
- )
397
-
398
- with gr.Row(visible=False) as edit_buttons_row:
399
- cancel_prompt_btn = gr.Button("Cancel")
400
- save_prompt_btn = gr.Button("Save", variant="primary")
401
- gr.Markdown("*The sample being evaluated is always appended as:*")
402
- gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
403
-
404
- # Show the compatible mode editor
405
- with gr.Column(visible=True) as compatible_prompt_editor:
406
- with gr.Row():
407
- # Left column - Evaluation Criteria
408
- with gr.Column(scale=1):
409
- eval_criteria_text = gr.TextArea(
410
- label="Evaluation Criteria",
411
- lines=12,
412
- value=DEFAULT_EVAL_CRITERIA,
413
- placeholder="Enter the evaluation criteria..."
414
- )
415
- prometheus_reference = gr.Markdown(
416
- "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
417
- visible=True
418
- )
419
-
420
- # Right column - Score Descriptions
421
- with gr.Column(scale=1):
422
- score1_description = gr.TextArea(
423
- label="Score 1",
424
- value=DEFAULT_SCORE_1,
425
- placeholder="Description for score 1",
426
- lines=2
427
- )
428
- score2_description = gr.TextArea(
429
- label="Score 2",
430
- value=DEFAULT_SCORE_2,
431
- placeholder="Description for score 2",
432
- lines=2
433
- )
434
- score3_description = gr.TextArea(
435
- label="Score 3",
436
- value=DEFAULT_SCORE_3,
437
- placeholder="Description for score 3",
438
- lines=2
439
- )
440
- score4_description = gr.TextArea(
441
- label="Score 4",
442
- value=DEFAULT_SCORE_4,
443
- placeholder="Description for score 4",
444
- lines=2
445
- )
446
- score5_description = gr.TextArea(
447
- label="Score 5",
448
- value=DEFAULT_SCORE_5,
449
- placeholder="Description for score 5",
450
- lines=2
451
- )
452
-
453
- # Add save/cancel buttons for compatible mode
454
- with gr.Row(visible=False) as compatible_edit_buttons_row:
455
- compatible_cancel_btn = gr.Button("Cancel")
456
- compatible_save_btn = gr.Button("Save", variant="primary")
457
 
458
- with gr.TabItem("Leaderboard"):
459
- with gr.Row():
460
- with gr.Column(scale=1):
461
- show_preliminary = gr.Checkbox(
462
- label="Reveal preliminary results",
463
- value=True, # Checked by default
464
- info="Show all models, including models with less human ratings (< 300 votes)",
465
- interactive=True
466
- )
467
- stats_display = gr.Markdown()
468
- leaderboard_table = gr.Dataframe(
469
- headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
470
- datatype=["str", "number", "str", "number", "str", "str", "str"],
471
  )
472
-
473
- gr.Markdown("""<br>
474
- <br>
475
- Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
476
 
477
- [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
478
- """)
479
-
480
- # Add change handler for checkbox
481
- show_preliminary.change(
482
- fn=refresh_leaderboard,
483
- inputs=[show_preliminary],
484
- outputs=[leaderboard_table, stats_display]
 
 
 
 
485
  )
 
 
 
486
 
487
- # Update the load event
488
- demo.load(
489
- fn=refresh_leaderboard,
490
- inputs=[show_preliminary],
491
- outputs=[leaderboard_table, stats_display]
 
 
 
 
492
  )
493
-
494
- with gr.TabItem("Policy"):
495
- gr.Markdown(POLICY_CONTENT)
496
- gr.Markdown(ACKNOWLEDGEMENTS)
497
-
498
- # Define state variables for model tracking
499
- model_a_state = gr.State()
500
- model_b_state = gr.State()
501
- final_prompt_state = gr.State()
502
- eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
503
- is_editing = gr.State(False) # Track editing state
504
- compatible_mode_state = gr.State(False) # Track compatible mode state
505
-
506
- # Update model names after responses are generated
507
- def update_model_names(model_a, model_b):
508
- return gr.update(value=f"*Model: {model_a}*"), gr.update(
509
- value=f"*Model: {model_b}*"
510
- )
511
-
512
- # Store the last submitted prompt and variables for comparison
513
- last_submission = gr.State({})
514
-
515
- # Update the vote button click handlers
516
- vote_a.click(
517
- fn=vote,
518
- inputs=[
519
- gr.State("A"),
520
- model_a_state,
521
- model_b_state,
522
- final_prompt_state,
523
- score_a,
524
- critique_a,
525
- score_b,
526
- critique_b,
527
- ],
528
- outputs=[
529
- vote_a,
530
- vote_b,
531
- vote_tie,
532
- model_name_a,
533
- model_name_b,
534
- send_btn,
535
- random_btn,
536
- gr.State(), # placeholder for success message
537
- ],
538
- )
539
-
540
- vote_b.click(
541
- fn=vote,
542
- inputs=[
543
- gr.State("B"),
544
- model_a_state,
545
- model_b_state,
546
- final_prompt_state,
547
- score_a,
548
- critique_a,
549
- score_b,
550
- critique_b,
551
- ],
552
- outputs=[
553
- vote_a,
554
- vote_b,
555
- vote_tie,
556
- model_name_a,
557
- model_name_b,
558
- send_btn,
559
- random_btn,
560
- gr.State(), # placeholder for success message
561
- ],
562
- )
563
-
564
- vote_tie.click(
565
- fn=vote,
566
- inputs=[
567
- gr.State("Tie"),
568
- model_a_state,
569
- model_b_state,
570
- final_prompt_state,
571
- score_a,
572
- critique_a,
573
- score_b,
574
- critique_b,
575
- ],
576
- outputs=[
577
- vote_a,
578
- vote_b,
579
- vote_tie,
580
- model_name_a,
581
- model_name_b,
582
- send_btn,
583
- random_btn,
584
- gr.State(), # placeholder for success message
585
- ],
586
- )
587
-
588
- # Add handlers for save/cancel buttons
589
- def save_prompt(new_prompt, previous_prompt):
590
- return [
591
- gr.update(value=new_prompt), # Update the prompt
592
- new_prompt, # Update the previous prompt state
593
- gr.update(visible=False) # Hide the buttons
594
- ]
595
-
596
- def cancel_prompt(previous_prompt):
597
- return [
598
- gr.update(value=previous_prompt), # Revert to previous prompt
599
- previous_prompt, # Keep the previous prompt state
600
- gr.update(visible=False) # Hide the buttons
601
- ]
602
-
603
- def show_edit_buttons(current_value, previous_value):
604
- # Show buttons only if the current value differs from the previous value
605
- return gr.update(visible=current_value != previous_value)
606
-
607
- # Add handlers for save/cancel buttons and prompt changes
608
- save_prompt_btn.click(
609
- fn=save_prompt,
610
- inputs=[eval_prompt_editable, eval_prompt_previous],
611
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
612
- )
613
-
614
- cancel_prompt_btn.click(
615
- fn=cancel_prompt,
616
- inputs=[eval_prompt_previous],
617
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
618
- )
619
-
620
- eval_prompt_editable.change(
621
- fn=show_edit_buttons,
622
- inputs=[eval_prompt_editable, eval_prompt_previous],
623
- outputs=edit_buttons_row
624
- )
625
-
626
- # Function to toggle visibility based on compatible mode
627
- def toggle_use_reference(checked):
628
- if checked:
629
- # Get new random samples with ground truth when enabling reference mode
630
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
631
- return {
632
- ground_truth: gr.update(visible=True, value=ground_truth_msg),
633
- human_input: gr.update(value=human_msg),
634
- ai_response: gr.update(value=ai_msg),
635
- # Reset other UI elements
636
- score_a: gr.update(value=""),
637
- critique_a: gr.update(value=""),
638
- score_b: gr.update(value=""),
639
- critique_b: gr.update(value=""),
640
- vote_a: gr.update(interactive=False, variant="primary"),
641
- vote_b: gr.update(interactive=False, variant="primary"),
642
- vote_tie: gr.update(interactive=False, variant="primary"),
643
- model_name_a: gr.update(value="*Model: Hidden*"),
644
- model_name_b: gr.update(value="*Model: Hidden*"),
645
- random_btn: gr.update(value="🎲", variant="secondary"),
646
- }
647
  else:
648
- # Just hide ground truth when disabling reference mode
649
- return {
650
- ground_truth: gr.update(visible=False)
651
- }
652
-
653
- # Update the change handler to include all necessary outputs
654
- use_reference_toggle.change(
655
- fn=toggle_use_reference,
656
- inputs=[use_reference_toggle],
657
- outputs=[
658
- ground_truth,
659
- human_input,
660
- ai_response,
661
- score_a,
662
- critique_a,
663
- score_b,
664
- critique_b,
665
- vote_a,
666
- vote_b,
667
- vote_tie,
668
- model_name_a,
669
- model_name_b,
670
- random_btn,
671
- ]
672
- )
673
-
674
- # Add a new state variable to track first game
675
- first_game_state = gr.State(True) # Initialize as True
676
 
677
- # Update the submit function to use the state variable
678
- def submit_and_store(
679
- use_reference,
680
- eval_criteria_text_input,
681
- human_input,
682
- ai_response,
683
- ground_truth_input,
684
- score1_description,
685
- score2_description,
686
- score3_description,
687
- score4_description,
688
- score5_description,
689
- is_first_game, # Add state variable as input
690
- ):
691
- # Build prompt data dictionary
692
- prompt_data = {
693
- 'human_input': human_input,
694
- 'ai_response': ai_response,
695
- 'ground_truth_input': ground_truth_input,
696
- 'eval_criteria': eval_criteria_text_input,
697
- 'score1_desc': score1_description,
698
- 'score2_desc': score2_description,
699
- 'score3_desc': score3_description,
700
- 'score4_desc': score4_description,
701
- 'score5_desc': score5_description,
702
- }
703
 
704
- # Get list of active models only for matches
705
- active_models = [name for name, info in model_data.items()
706
- if info.get("active", True)]
707
-
708
- atla_model = "Atla-8B-preview"
709
 
710
- if is_first_game:
711
- # For the first game, ensure new model is one of the models to catch up on votes
712
- other_models = [m for m in active_models if m != atla_model]
713
- other_model = random.choice(other_models)
 
 
 
 
714
 
715
- # Randomly assign new model to either position A or B
716
- if random.random() < 0.5:
717
- model_a, model_b = atla_model, other_model
718
- else:
719
- model_a, model_b = other_model, atla_model
720
- else:
721
- # For subsequent games, new models appears 40% of the time
722
- if random.random() < 0.4:
723
- # Randomly choose between new models
724
- new_model = random.choice(["Atla-8B-preview"]) # add "Flow-Judge-1.0" once ready
725
- other_models = [m for m in active_models if m not in [new_model]]
726
- other_model = random.choice(other_models)
727
-
728
- if random.random() < 0.5:
729
- model_a, model_b = new_model, other_model
730
- else:
731
- model_a, model_b = other_model, new_model
732
- else:
733
- # For other cases, exclude both Atla and Flow-Judge
734
- non_special_models = [m for m in active_models if m not in new_model]
735
- model1, model2 = random.sample(non_special_models, 2)
736
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
737
-
738
- # Get responses from models
739
- response_a = get_model_response(
740
- model_a,
741
- model_data.get(model_a),
742
- prompt_data,
743
- use_reference=use_reference
744
- )
745
- response_b = get_model_response(
746
- model_b,
747
- model_data.get(model_b),
748
- prompt_data,
749
- use_reference=use_reference
750
- )
751
-
752
- # Parse the responses based on model, using appropriate parsing for different models
753
- is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
754
- is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
755
- is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
756
- is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
757
- is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
758
- is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
759
-
760
- if is_prometheus_a:
761
- score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
762
- score_a_val = f"{score_a_val} / 5"
763
- elif is_atla_a:
764
- score_a_val, critique_a_val = atla_parse_model_response(response_a)
765
- score_a_val = f"{score_a_val} / 5"
766
- elif is_flow_judge_a:
767
- score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
768
- score_a_val = f"{score_a_val} / 5"
769
- else:
770
- score_a_val, critique_a_val = parse_model_response(response_a)
771
- score_a_val = f"{score_a_val} / 5"
772
-
773
- if is_prometheus_b:
774
- score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
775
- score_b_val = f"{score_b_val} / 5"
776
- elif is_atla_b:
777
- score_b_val, critique_b_val = atla_parse_model_response(response_b)
778
- score_b_val = f"{score_b_val} / 5"
779
- elif is_flow_judge_b:
780
- score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
781
- score_b_val = f"{score_b_val} / 5"
782
- else:
783
- score_b_val, critique_b_val = parse_model_response(response_b)
784
- score_b_val = f"{score_b_val} / 5"
785
-
786
- return (
787
- score_a_val,
788
- critique_a_val,
789
- score_b_val,
790
- critique_b_val,
791
- gr.update(interactive=True, variant="primary"), # vote_a
792
- gr.update(interactive=True, variant="primary"), # vote_b
793
- gr.update(interactive=True, variant="primary"), # vote_tie
794
- model_a,
795
- model_b,
796
- eval_prompt,
797
- gr.update(value="*Model: Hidden*"),
798
- gr.update(value="*Model: Hidden*"),
799
- gr.update(value="Regenerate judges", variant="secondary", interactive=True),
800
- gr.update(value="🎲"), # random_btn
801
- False, # Set first_game_state to False after first submission
802
- )
803
 
804
- # Update the click handler to use False for is_first_game after first submission
805
- def create_submit_handler():
806
- first_game = True
807
 
808
- def handler(*args):
809
- nonlocal first_game
810
- result = submit_and_store(*args, first_game)
811
- first_game = False # Set to False after first submission
812
- return result
 
 
 
 
 
813
 
814
- return handler
815
-
816
- # Update the send_btn click handler
817
- send_btn.click(
818
- fn=submit_and_store,
819
- inputs=[
820
- use_reference_toggle,
821
- eval_criteria_text,
822
- human_input,
823
- ai_response,
824
- ground_truth,
825
- score1_description,
826
- score2_description,
827
- score3_description,
828
- score4_description,
829
- score5_description,
830
- first_game_state, # Add first_game_state as input
831
- ],
832
- outputs=[
833
- score_a,
834
- critique_a,
835
- score_b,
836
- critique_b,
837
- vote_a,
838
- vote_b,
839
- vote_tie,
840
- model_a_state,
841
- model_b_state,
842
- final_prompt_state,
843
- model_name_a,
844
- model_name_b,
845
- send_btn,
846
- random_btn,
847
- first_game_state, # Add first_game_state as output
848
- ],
849
- )
850
-
851
- # Add random button handler
852
- random_btn.click(
853
- fn=populate_random_example,
854
- inputs=[use_reference_toggle], # Use compatible mode toggle to decide behavior
855
- outputs=[
856
- human_input,
857
- ai_response,
858
- random_btn,
859
- score_a,
860
- critique_a,
861
- score_b,
862
- critique_b,
863
- vote_a,
864
- vote_b,
865
- vote_tie,
866
- model_name_a,
867
- model_name_b,
868
- ground_truth, # Set ground truth
869
- ]
870
- )
871
-
872
- # Add new input change handlers
873
- def handle_input_change():
874
- """Reset UI state when inputs are changed"""
875
- return [
876
- gr.update(interactive=False), # vote_a
877
- gr.update(interactive=False), # vote_b
878
- gr.update(interactive=False), # vote_tie
879
- gr.update(value="Run judges", variant="primary"), # send_btn
880
- gr.update(value="🎲", variant="secondary"), # random_btn
881
- ]
882
-
883
- # Update the change handlers for inputs
884
- human_input.change(
885
- fn=handle_input_change,
886
- inputs=[],
887
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
888
- )
889
-
890
- ai_response.change(
891
- fn=handle_input_change,
892
- inputs=[],
893
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
894
- )
895
-
896
- generate_btn.click(
897
- fn=lambda msg: (
898
- generate_ai_response(msg)[0], # Only take the response text
899
- gr.update(
900
- value="Generate AI Response", # Keep the label
901
- interactive=False # Disable the button
902
- )
903
- ),
904
- inputs=[human_input],
905
- outputs=[ai_response, generate_btn]
906
- )
907
-
908
- human_input.change(
909
- fn=lambda x: gr.update(interactive=bool(x.strip())),
910
- inputs=[human_input],
911
- outputs=[generate_btn]
912
- )
913
-
914
- # Update the demo.load to include the random example population
915
- demo.load(
916
- fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
917
- inputs=[],
918
- outputs=[
919
- human_input,
920
- ai_response,
921
- random_btn,
922
- score_a,
923
- critique_a,
924
- score_b,
925
- critique_b,
926
- vote_a,
927
- vote_b,
928
- vote_tie,
929
- model_name_a,
930
- model_name_b,
931
- ground_truth,
932
- ]
933
- )
934
-
935
- # Add new state variables for compatible mode
936
- eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
937
- score1_previous = gr.State(value=DEFAULT_SCORE_1)
938
- score2_previous = gr.State(value=DEFAULT_SCORE_2)
939
- score3_previous = gr.State(value=DEFAULT_SCORE_3)
940
- score4_previous = gr.State(value=DEFAULT_SCORE_4)
941
- score5_previous = gr.State(value=DEFAULT_SCORE_5)
942
-
943
- # Add new functions to handle compatible mode saves/cancels
944
- def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
945
- return [
946
- gr.update(value=criteria), # Update criteria
947
- criteria, # Update previous criteria state
948
- gr.update(value=score1),
949
- score1,
950
- gr.update(value=score2),
951
- score2,
952
- gr.update(value=score3),
953
- score3,
954
- gr.update(value=score4),
955
- score4,
956
- gr.update(value=score5),
957
- score5,
958
- gr.update(visible=False) # Hide buttons
959
- ]
960
-
961
- def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
962
- return [
963
- gr.update(value=prev_criteria),
964
- prev_criteria,
965
- gr.update(value=prev_score1),
966
- prev_score1,
967
- gr.update(value=prev_score2),
968
- prev_score2,
969
- gr.update(value=prev_score3),
970
- prev_score3,
971
- gr.update(value=prev_score4),
972
- prev_score4,
973
- gr.update(value=prev_score5),
974
- prev_score5,
975
- gr.update(visible=False)
976
- ]
977
-
978
- def show_compatible_edit_buttons(*current_values):
979
- previous_values = current_values[1::2] # Get previous values
980
- current_values = current_values[::2] # Get current values
981
- return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
982
-
983
- # Add click handlers for compatible mode buttons
984
- compatible_save_btn.click(
985
- fn=save_compatible_prompt,
986
- inputs=[
987
- eval_criteria_text,
988
- score1_description,
989
- score2_description,
990
- score3_description,
991
- score4_description,
992
- score5_description
993
- ],
994
- outputs=[
995
- eval_criteria_text,
996
- eval_criteria_previous,
997
- score1_description,
998
- score1_previous,
999
- score2_description,
1000
- score2_previous,
1001
- score3_description,
1002
- score3_previous,
1003
- score4_description,
1004
- score4_previous,
1005
- score5_description,
1006
- score5_previous,
1007
- compatible_edit_buttons_row
1008
- ]
1009
- )
1010
 
1011
- compatible_cancel_btn.click(
1012
- fn=cancel_compatible_prompt,
1013
- inputs=[
1014
- eval_criteria_previous,
1015
- score1_previous,
1016
- score2_previous,
1017
- score3_previous,
1018
- score4_previous,
1019
- score5_previous
1020
- ],
1021
- outputs=[
1022
- eval_criteria_text,
1023
- eval_criteria_previous,
1024
- score1_description,
1025
- score1_previous,
1026
- score2_description,
1027
- score2_previous,
1028
- score3_description,
1029
- score3_previous,
1030
- score4_description,
1031
- score4_previous,
1032
- score5_description,
1033
- score5_previous,
1034
- compatible_edit_buttons_row
1035
- ]
1036
- )
1037
 
1038
- # Add change handlers for all compatible mode inputs
1039
- for component in [eval_criteria_text, score1_description, score2_description,
1040
- score3_description, score4_description, score5_description]:
1041
- component.change(
1042
- fn=show_compatible_edit_buttons,
1043
- inputs=[
1044
- eval_criteria_text,
1045
- eval_criteria_previous,
1046
- score1_description,
1047
- score1_previous,
1048
- score2_description,
1049
- score2_previous,
1050
- score3_description,
1051
- score3_previous,
1052
- score4_description,
1053
- score4_previous,
1054
- score5_description,
1055
- score5_previous
1056
- ],
1057
- outputs=compatible_edit_buttons_row
1058
- )
1059
 
1060
- if __name__ == "__main__":
1061
- demo.launch()
 
 
 
 
1
+ from openai import OpenAI
2
+ import anthropic
3
+ from together import Together
4
+ import cohere
5
  import json
6
  import re
7
+ import os
8
+ import requests
 
 
 
 
 
9
  from prompts import (
10
+ JUDGE_SYSTEM_PROMPT,
11
+ PROMETHEUS_PROMPT,
12
+ PROMETHEUS_PROMPT_WITH_REFERENCE,
13
+ ATLA_PROMPT,
14
+ ATLA_PROMPT_WITH_REFERENCE,
15
+ FLOW_JUDGE_PROMPT
 
 
 
 
16
  )
17
+ from transformers import AutoTokenizer
18
+
19
+ # Initialize clients
20
+ anthropic_client = anthropic.Anthropic()
21
+ openai_client = OpenAI()
22
+ together_client = Together()
23
+ hf_api_key = os.getenv("HF_API_KEY")
24
+ flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
25
+ cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
26
+
27
+ def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
28
+ """Get response from OpenAI API"""
 
 
29
  try:
30
+ response = openai_client.chat.completions.create(
31
+ model=model_name,
32
+ messages=[
33
+ {"role": "system", "content": system_prompt},
34
+ {"role": "user", "content": prompt},
35
+ ],
36
+ max_completion_tokens=max_tokens,
37
+ temperature=temperature,
38
+ )
39
+ return response.choices[0].message.content
40
+ except Exception as e:
41
+ return f"Error with OpenAI model {model_name}: {str(e)}"
 
 
 
 
 
42
 
43
+ def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
44
+ """Get response from Anthropic API"""
45
+ try:
46
+ response = anthropic_client.messages.create(
47
+ model=model_name,
48
+ max_tokens=max_tokens,
49
+ temperature=temperature,
50
+ system=system_prompt,
51
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
52
+ )
53
+ return response.content[0].text
54
+ except Exception as e:
55
+ return f"Error with Anthropic model {model_name}: {str(e)}"
56
 
57
+ def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
58
+ """Get response from Together API"""
59
+ try:
60
+ response = together_client.chat.completions.create(
61
+ model=model_name,
62
+ messages=[
63
+ {"role": "system", "content": system_prompt},
64
+ {"role": "user", "content": prompt},
65
+ ],
66
+ max_tokens=max_tokens,
67
+ temperature=temperature,
68
+ stream=False,
69
+ )
70
+ return response.choices[0].message.content
71
+ except Exception as e:
72
+ return f"Error with Together model {model_name}: {str(e)}"
73
 
74
+ def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
75
+ """Get response from Hugging Face model"""
76
+ try:
77
+ headers = {
78
+ "Accept": "application/json",
79
+ "Authorization": f"Bearer {hf_api_key}",
80
+ "Content-Type": "application/json"
81
+ }
 
 
 
 
82
 
83
+ # Create messages list for chat template
84
+ messages = []
85
+ if system_prompt:
86
+ messages.append({"role": "system", "content": system_prompt})
87
+ messages.append({"role": "user", "content": prompt})
88
+
89
+ # Apply chat template
90
+ model_id = "prometheus-eval/prometheus-7b-v2.0"
91
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
92
+ formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
93
+
94
+ payload = {
95
+ "inputs": formatted_prompt,
96
+ "parameters": {
97
+ "max_new_tokens": max_tokens,
98
+ "return_full_text": False,
99
+ "temperature": temperature
100
+ }
101
+ }
102
+
103
+ response = requests.post(
104
+ "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
105
+ headers=headers,
106
+ json=payload
107
+ )
108
+ return response.json()[0]["generated_text"]
109
+ except Exception as e:
110
+ return f"Error with Hugging Face model {model_name}: {str(e)}"
 
 
 
 
111
 
112
+ def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
113
+ """Get response from HF endpoint for Atla model"""
114
+ try:
115
+ headers = {
116
+ "Accept": "application/json",
117
+ "Authorization": f"Bearer {hf_api_key}",
118
+ "Content-Type": "application/json"
119
+ }
120
+
121
+ # Create messages list for chat template
122
+ messages = []
123
+ if system_prompt:
124
+ messages.append({"role": "system", "content": system_prompt})
125
+ messages.append({"role": "user", "content": prompt})
126
+
127
+ # Apply chat template
128
+ model_id = "meta-llama/Llama-3.1-8B"
129
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
130
+ formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
131
+
132
+ payload = {
133
+ "inputs": formatted_prompt,
134
+ "parameters": {
135
+ "max_new_tokens": max_tokens,
136
+ "return_full_text": False,
137
+ "temperature": temperature,
138
+ "seed": 42,
139
+ "add_generation_prompt": True
140
+ }
141
+ }
142
+
143
+ response = requests.post(
144
+ "https://azk0vbxyrc64s2v2.us-east-1.aws.endpoints.huggingface.cloud",
145
+ headers=headers,
146
+ json=payload
147
+ )
148
+ return response.json()[0]["generated_text"]
149
+ except Exception as e:
150
+ return f"Error with Atla model {model_name}: {str(e)}"
151
 
152
+ def get_flow_judge_response(model_name, prompt, max_tokens=500, temperature=0.1, top_p=0.95) -> str:
153
+ """Get response from Flow Judge"""
154
+ try:
155
+ response = requests.post(
156
+ "https://tsukuyomi.tailfa581.ts.net/v1/chat/completions",
157
+ headers={
158
+ "Content-Type": "application/json",
159
+ "Authorization": f"Bearer {flow_judge_api_key}"
160
+ },
161
+ json={
162
+ "model": model_name,
163
+ "messages": [
164
+ {"role": "user", "content": prompt}
165
+ ],
166
+ "max_tokens": max_tokens,
167
+ "temperature": temperature,
168
+ "top_p": top_p
169
+ }
170
+ )
171
+ response.raise_for_status()
172
+ return response.json()["choices"][0]['message']['content']
173
+ except Exception as e:
174
+ return f"Error with Flow Judge model {model_name}: {str(e)}"
175
 
176
+ def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
177
+ """Get response from Cohere API"""
178
+ try:
179
+ response = cohere_client.chat(
180
+ model=model_name,
181
+ messages=[
182
+ {"role": "system", "content": system_prompt},
183
+ {"role": "user", "content": prompt}
184
+ ],
185
+ max_tokens=max_tokens,
186
+ temperature=temperature
187
+ )
188
+ # Extract the text from the content items
189
+ content_items = response.message.content
190
+ if isinstance(content_items, list):
191
+ # Get the text from the first content item
192
+ return content_items[0].text
193
+ return str(content_items) # Fallback if it's not a list
194
+ except Exception as e:
195
+ return f"Error with Cohere model {model_name}: {str(e)}"
196
+
197
+ def get_model_response(
198
+ model_name,
199
+ model_info,
200
+ prompt_data,
201
+ use_reference=False,
202
+ max_tokens=500,
203
+ temperature=0
204
+ ):
205
+ """Get response from appropriate API based on model organization"""
206
+ if not model_info:
207
+ return "Model not found or unsupported."
208
+
209
+ api_model = model_info["api_model"]
210
+ organization = model_info["organization"]
211
+
212
+ # Determine whether the model is Prometheus, Atla, or Flow Judge
213
+ is_prometheus = (organization == "Prometheus")
214
+ is_atla = (organization == "Atla")
215
+ is_flow_judge = (organization == "Flow AI")
216
+ # Models other than Prometheus, Atla, and Flow Judge use the shared judge system prompt
217
+ system_prompt = None if (is_prometheus or is_atla or is_flow_judge) else JUDGE_SYSTEM_PROMPT
218
+
219
+ # Select the appropriate base prompt
220
+
221
+ if is_atla:
222
+ base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
223
+ elif is_flow_judge:
224
+ base_prompt = FLOW_JUDGE_PROMPT
225
  else:
226
+ base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
 
 
 
 
227
 
228
+ # For models other than Prometheus, Atla, and Flow Judge, swap the output-format instruction to strict JSON
229
+ if not (is_prometheus or is_atla or is_flow_judge):
230
+ base_prompt = base_prompt.replace(
231
+ '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
232
+ '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
233
+ )
 
 
 
 
 
 
234
 
235
+ try:
236
+ if not is_flow_judge:
237
+ # Format the prompt with the provided data, only using available keys
238
+ final_prompt = base_prompt.format(
239
+ human_input=prompt_data['human_input'],
240
+ ai_response=prompt_data['ai_response'],
241
+ ground_truth_input=prompt_data.get('ground_truth_input', ''),
242
+ eval_criteria=prompt_data['eval_criteria'],
243
+ score1_desc=prompt_data['score1_desc'],
244
+ score2_desc=prompt_data['score2_desc'],
245
+ score3_desc=prompt_data['score3_desc'],
246
+ score4_desc=prompt_data['score4_desc'],
247
+ score5_desc=prompt_data['score5_desc']
248
  )
 
 
 
 
249
 
250
+ else:
251
+ human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
252
+ ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
253
+ ground_truth = prompt_data.get('ground_truth_input', '')
254
+ if ground_truth:
255
+ response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
256
+ else:
257
+ response_reference = ""
258
+ eval_criteria = prompt_data['eval_criteria']
259
+ score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
260
+ score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
261
+ score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
262
+ score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
263
+ score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
264
+ rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
265
+ if response_reference:
266
+ inputs = human_input + "\n" + response_reference
267
+ else:
268
+ inputs = human_input
269
+ final_prompt = base_prompt.format(
270
+ INPUTS=inputs,
271
+ OUTPUT=ai_response,
272
+ EVALUATION_CRITERIA=eval_criteria,
273
+ RUBRIC=rubric
274
  )
275
+
276
+ except KeyError as e:
277
+ return f"Error formatting prompt: Missing required field {str(e)}"
278
 
279
+ try:
280
+ if organization == "OpenAI":
281
+ return get_openai_response(
282
+ api_model, final_prompt, system_prompt, max_tokens, temperature
283
+ )
284
+ elif organization == "Anthropic":
285
+ return get_anthropic_response(
286
+ api_model, final_prompt, system_prompt, max_tokens, temperature
287
+ )
288
+ elif organization == "Prometheus":
289
+ return get_prometheus_response(
290
+ api_model, final_prompt, system_prompt, max_tokens, temperature=0.01
291
+ )
292
+ elif organization == "Atla":
293
+ return get_atla_response(
294
+ api_model, final_prompt, system_prompt, max_tokens, temperature=0.01
295
+ )
296
+ elif organization == "Cohere":
297
+ return get_cohere_response(
298
+ api_model, final_prompt, system_prompt, max_tokens, temperature
299
+ )
300
+ elif organization == "Flow AI":
301
+ return get_flow_judge_response(
302
+ api_model, final_prompt, max_tokens, temperature
303
  )
 
 
 
 
 
 
304
  else:
305
+ # All other organizations use Together API
306
+ return get_together_response(
307
+ api_model, final_prompt, system_prompt, max_tokens, temperature
308
+ )
309
+ except Exception as e:
310
+ return f"Error with {organization} model {model_name}: {str(e)}"
 
 
 
 
311
 
312
+ def parse_model_response(response):
313
+ try:
314
+ # Debug print
315
+ print(f"Raw model response: {response}")
316
+
317
+ # If response is already a dictionary, use it directly
318
+ if isinstance(response, dict):
319
+ return str(response.get("result", "N/A")), response.get("feedback", "N/A")
320
+
321
+ # First try to parse the entire response as JSON
322
+ try:
323
+ data = json.loads(response)
324
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
325
+ except json.JSONDecodeError:
326
+ # If that fails (typically for smaller models), try to find JSON within the response
327
+ json_match = re.search(r"{.*}", response, re.DOTALL)
328
+ if json_match:
329
+ data = json.loads(json_match.group(0))
330
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
331
+ else:
332
+ return "Error", f"Invalid response format returned - here is the raw model response: {response}"
 
 
 
 
 
333
 
334
+ except Exception as e:
335
+ # Debug print for error case
336
+ print(f"Failed to parse response: {str(e)}")
 
 
337
 
338
+ # If the error message itself contains valid JSON, try to parse that
339
+ try:
340
+ error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
341
+ if error_json_match:
342
+ data = json.loads(error_json_match.group(0))
343
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
344
+ except:
345
+ pass
346
 
347
+ return "Error", f"Failed to parse response: {response}"
348
+
349
+ def prometheus_parse_model_response(output):
350
+ try:
351
+ print(f"Raw model response: {output}")
352
+ output = output.strip()
 
 
 
 
 
353
 
354
+ # Remove "Feedback:" prefix if present (case insensitive)
355
+ output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
 
356
 
357
+ # New pattern to match [RESULT] X at the beginning
358
+ begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
359
+ begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
360
+ if begin_match:
361
+ score = int(begin_match.group(1))
362
+ feedback = begin_match.group(2).strip()
363
+ return str(score), feedback
364
+
365
+ # Existing patterns for end-of-string results...
366
+ pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
367
+ match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
368
+ if match:
369
+ feedback = match.group(1).strip()
370
+ score = int(match.group(2))
371
+ return str(score), feedback
372
+
373
+ # If no match, try to match "... Score: X"
374
+ pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
375
+ match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
376
+ if match:
377
+ feedback = match.group(1).strip()
378
+ score = int(match.group(2))
379
+ return str(score), feedback
380
+
381
+ # Pattern to handle [Score X] at the end
382
+ pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
383
+ match = re.search(pattern, output, re.DOTALL)
384
+ if match:
385
+ feedback = match.group(1).strip()
386
+ score = int(match.group(2))
387
+ return str(score), feedback
388
+
389
+ # Final fallback attempt
390
+ pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
391
+ match = re.search(pattern, output)
392
+ if match:
393
+ score = int(match.group(1))
394
+ feedback = output[:match.start()].rstrip()
395
+ # Remove any trailing brackets from feedback
396
+ feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
397
+ return str(score), feedback
398
+
399
+ return "Error", f"Failed to parse response: {output}"
400
+
401
+ except Exception as e:
402
+ print(f"Failed to parse response: {str(e)}")
403
+ return "Error", f"Exception during parsing: {str(e)}"
404
+
405
+ def atla_parse_model_response(output):
406
+ """Parse response from ATLA model"""
407
+ try:
408
+ print(f"Raw Atla model response: {output}")
409
+ output = output.strip()
410
 
411
+ # Look for the Reasoning and Result sections
412
+ reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
413
+ result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
414
+
415
+ if reasoning_match and result_match:
416
+ feedback = reasoning_match.group(1).strip()
417
+ score = result_match.group(1)
418
+ return str(score), feedback
419
+
420
+ return "Error", f"Failed to parse ATLA response format: {output}"
 
 
 
 
 
 
421
 
422
+ except Exception as e:
423
+ print(f"Failed to parse ATLA response: {str(e)}")
424
+ return "Error", f"Exception during parsing: {str(e)}"
425
+
426
+ def flow_judge_parse_model_response(output):
427
+ try:
428
+ print(f"Raw model response: {output}")
429
+ # Convert multiple line breaks to single ones and strip whitespace
430
+ output = re.sub(r'\n{2,}', '\n', output.strip())
431
+
432
+ # Compile regex patterns
433
+ feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
434
+ score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
 
 
 
 
435
 
436
+ feedback_match = feedback_pattern.search(output)
437
+ score_match = score_pattern.search(output)
 
 
 
 
438
 
439
+ if feedback_match and score_match:
440
+ feedback = feedback_match.group(1).strip()
441
+ score = int(score_match.group(1).strip())
442
+ return str(score), feedback
443
+
444
+ return "Error", f"Failed to parse response: {output}"
445
+
446
+ except Exception as e:
447
+ print(f"Failed to parse response: {str(e)}")
448
+ return "Error", f"Exception during parsing: {str(e)}"
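
For context, a minimal usage sketch of the restored module follows. It is a hypothetical example based only on the function signatures in this diff: the model id in model_info, the prompt_data values, and the printed output are illustrative placeholders, not values taken from this commit.

# Hypothetical driver for the restored gen_api_answer.py (placeholder values).
from gen_api_answer import get_model_response, parse_model_response

# Mirrors one entry of data/models.jsonl; this particular API model id is assumed.
model_info = {"organization": "OpenAI", "api_model": "gpt-4o-mini", "license": "Proprietary"}

prompt_data = {
    "human_input": "Summarise the plot of Hamlet in two sentences.",
    "ai_response": "Hamlet seeks revenge for his father's murder, and the Danish court collapses around him.",
    "ground_truth_input": "",  # only used when use_reference=True
    "eval_criteria": "Is the summary accurate and concise?",
    "score1_desc": "Inaccurate and rambling.",
    "score2_desc": "Largely inaccurate.",
    "score3_desc": "Partially accurate or overly long.",
    "score4_desc": "Accurate with minor verbosity.",
    "score5_desc": "Accurate and concise.",
}

raw = get_model_response("GPT-4o mini", model_info, prompt_data, use_reference=False)

# General-purpose judges are instructed to answer in JSON, so the generic parser applies;
# Prometheus, Atla, and Flow Judge outputs go through their dedicated parsers instead.
score, feedback = parse_model_response(raw)
print(score, feedback)

The organization field in model_info drives both the API dispatch inside get_model_response and, in the arena app, the choice of parser, which is why the two are kept paired.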