kaikaidai committed
Commit 8bba8de · verified · 1 Parent(s): f2d7524

Update app.py

Files changed (1)
  1. app.py +381 -162
app.py CHANGED
@@ -13,10 +13,15 @@ load_dotenv()
 import gradio as gr
 from gen_api_answer import (
     get_model_response,
-    parse_model_response,
+    parse_model_response,
+    alternative_parse_model_response
+)
+
+from random_sample_generation import (
     get_random_human_ai_pair,
+    get_random_human_ai_ground_truth_pair,
     generate_ai_response
-)
+)
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
@@ -33,6 +38,12 @@ from common import (
     VOTING_HEADER,
     DEFAULT_EVAL_PROMPT_EDITABLE,
     FIXED_EVAL_SUFFIX,
+    DEFAULT_EVAL_CRITERIA,
+    DEFAULT_SCORE_1,
+    DEFAULT_SCORE_2,
+    DEFAULT_SCORE_3,
+    DEFAULT_SCORE_4,
+    DEFAULT_SCORE_5,
 )
 from leaderboard import (
     get_leaderboard,
@@ -292,9 +303,16 @@ leaderboard_table = gr.Dataframe(
 )


-def populate_random_example(request: gr.Request):
+def populate_random_example(request: gr.Request, compatible_mode: bool):
     """Generate a random human-AI conversation example and reset judge outputs."""
-    human_msg, ai_msg = get_random_human_ai_pair()
+    if compatible_mode:
+        # Generate all three components when compatible mode is enabled
+        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
+    else:
+        # Generate only human and AI messages when compatible mode is disabled
+        human_msg, ai_msg = get_random_human_ai_pair()
+        ground_truth_msg = ""
+
     return [
         gr.update(value=human_msg),
         gr.update(value=ai_msg),
@@ -308,6 +326,7 @@ def populate_random_example(request: gr.Request):
         gr.update(interactive=False, variant="primary"), # Reset vote tie
         gr.update(value="*Model: Hidden*"), # Reset model name A
         gr.update(value="*Model: Hidden*"), # Reset model name B
+        gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
     ]


@@ -345,6 +364,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                     placeholder="Enter the AI response here..."
                 )

+                # Ground truth response (initially hidden)
+                ground_truth = gr.TextArea(
+                    label="🎯 Ground truth response",
+                    lines=12,
+                    placeholder="Enter the ground truth response here...",
+                    visible=False
+                )
+
                 with gr.Row():
                     random_btn = gr.Button("🎲", scale=2)
                     send_btn = gr.Button(
@@ -381,22 +408,86 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                     vote_b = gr.Button("Vote B", variant="primary", interactive=False)
                 with gr.Column(scale=9, min_width=400): # Wider width for critique
                     critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
-                    # Place Vote B button directly under Judge B
+                    # Place Vote B button directly under Judge B

             gr.Markdown("<br>")
-
-            # Update Evaluator Prompt Accordion
-            with gr.Accordion("📝 Edit Judge Prompt", open=False):
-                eval_prompt_editable = gr.TextArea(
-                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
-                    label="Evaluation Criteria",
-                    lines=12
+
+
+            # Replace the "Edit Judge Prompt" Accordion section with:
+            with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
+                gr.Markdown("<br>")
+                compatible_mode_toggle = gr.Checkbox(
+                    label="Use a prompt compatible with Prometheus models",
+                    value=False
                 )
-                with gr.Row(visible=False) as edit_buttons_row: # Make buttons row initially hidden
-                    cancel_prompt_btn = gr.Button("Cancel")
-                    save_prompt_btn = gr.Button("Save", variant="primary")
-                gr.Markdown("*The sample being evaluated is always appended as:*")
-                gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
+
+                # Default prompt editor
+                with gr.Column(visible=True) as default_prompt_editor:
+                    eval_prompt_editable = gr.TextArea(
+                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
+                        label="Evaluation Criteria",
+                        lines=12
+                    )
+
+                    with gr.Row(visible=False) as edit_buttons_row:
+                        cancel_prompt_btn = gr.Button("Cancel")
+                        save_prompt_btn = gr.Button("Save", variant="primary")
+                    gr.Markdown("*The sample being evaluated is always appended as:*")
+                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
+
+                # Compatible mode editor
+                with gr.Column(visible=False) as compatible_prompt_editor:
+                    with gr.Row():
+                        # Left column - Evaluation Criteria
+                        with gr.Column(scale=1):
+                            eval_criteria_text = gr.TextArea(
+                                label="Evaluation Criteria",
+                                lines=12,
+                                value=DEFAULT_EVAL_CRITERIA,
+                                placeholder="Enter the evaluation criteria..."
+                            )
+                            prometheus_reference = gr.Markdown(
+                                "<br> *This enforces the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
+                                visible=False # Initially hidden
+                            )
+
+                        # Right column - Score Descriptions
+                        with gr.Column(scale=1):
+                            score1_description = gr.TextArea(
+                                label="Score 1",
+                                value=DEFAULT_SCORE_1,
+                                placeholder="Description for score 1",
+                                lines=2
+                            )
+                            score2_description = gr.TextArea(
+                                label="Score 2",
+                                value=DEFAULT_SCORE_2,
+                                placeholder="Description for score 2",
+                                lines=2
+                            )
+                            score3_description = gr.TextArea(
+                                label="Score 3",
+                                value=DEFAULT_SCORE_3,
+                                placeholder="Description for score 3",
+                                lines=2
+                            )
+                            score4_description = gr.TextArea(
+                                label="Score 4",
+                                value=DEFAULT_SCORE_4,
+                                placeholder="Description for score 4",
+                                lines=2
+                            )
+                            score5_description = gr.TextArea(
+                                label="Score 5",
+                                value=DEFAULT_SCORE_5,
+                                placeholder="Description for score 5",
+                                lines=2
+                            )
+
+                    # Add save/cancel buttons for compatible mode
+                    with gr.Row(visible=False) as compatible_edit_buttons_row:
+                        compatible_cancel_btn = gr.Button("Cancel")
+                        compatible_save_btn = gr.Button("Save", variant="primary")

         with gr.TabItem("Leaderboard"):
             with gr.Row():
@@ -404,7 +495,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                 show_preliminary = gr.Checkbox(
                     label="Reveal preliminary results",
                     value=True, # Checked by default
-                    info="Show all models, including models with less few human ratings (< 500 votes)",
+                    info="Show all models, including models with less human ratings (< 500 votes)",
                     interactive=True
                 )
                 stats_display = gr.Markdown()
@@ -412,7 +503,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                 headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
                 datatype=["str", "number", "str", "number", "str", "str", "str"],
             )
-
+
             gr.Markdown("""<br>
 <br>
 Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
@@ -444,62 +535,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     final_prompt_state = gr.State()
     eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
     is_editing = gr.State(False) # Track editing state
-
-    # Update variable inputs based on the eval prompt
-    #def update_variables(eval_prompt):
-    #    variables = parse_variables(eval_prompt)
-    #    updates = []
-
-    #    for i in range(len(variable_rows)):
-    #        var_row, var_input = variable_rows[i]
-    #        if i < len(variables):
-    #            var_name = variables[i]
-    #            # Set the number of lines based on the variable name
-    #            if var_name == "response":
-    #                lines = 4 # Adjust this number as needed
-    #            else:
-    #                lines = 1 # Default to single line for other variables
-    #            updates.extend(
-    #                [
-    #                    gr.update(visible=True), # Show the variable row
-    #                    gr.update(
-    #                        label=var_name, visible=True, lines=lines
-    #                    ), # Update label and lines
-    #                ]
-    #            )
-    #        else:
-    #            updates.extend(
-    #                [
-    #                    gr.update(visible=False), # Hide the variable row
-    #                    gr.update(value="", visible=False), # Clear value when hidden
-    #                ]
-    #            )
-    #    return updates
-
-    #eval_prompt.change(
-    #    fn=update_variables,
-    #    inputs=eval_prompt,
-    #    outputs=[item for sublist in variable_rows for item in sublist],
-    #)
-
-    # Regenerate button functionality
-    #regenerate_button.click(
-    #    fn=regenerate_prompt,
-    #    inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
-    #    outputs=[
-    #        score_a,
-    #        critique_a,
-    #        score_b,
-    #        critique_b,
-    #        vote_a,
-    #        vote_b,
-    #        tie_button_row,
-    #        model_name_a,
-    #        model_name_b,
-    #        model_a_state,
-    #        model_b_state,
-    #    ],
-    #)
+    compatible_mode_state = gr.State(False) # Track compatible mode state

     # Update model names after responses are generated
     def update_model_names(model_a, model_b):
@@ -621,39 +657,128 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         outputs=edit_buttons_row
     )

-    # Update the submit function to combine editable and fixed parts
-    def submit_and_store(editable_prompt, *variables):
-        # Combine the editable prompt with fixed suffix
-        full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
-
-        # Get the responses using the full prompt
-        (
-            response_a,
-            response_b,
-            buttons_visible,
-            regen_visible,
+    # Function to toggle visibility based on compatible mode
+    def toggle_compatible_mode(checked):
+        return {
+            ground_truth: gr.update(visible=checked),
+            default_prompt_editor: gr.update(visible=not checked),
+            compatible_prompt_editor: gr.update(visible=checked),
+            prometheus_reference: gr.update(visible=checked),
+        }
+
+    compatible_mode_toggle.change(
+        fn=toggle_compatible_mode,
+        inputs=[compatible_mode_toggle],
+        outputs=[
+            ground_truth,
+            default_prompt_editor,
+            compatible_prompt_editor,
+            prometheus_reference,
+        ]
+    )
+
+    # Update the submit function to handle compatible mode
+    def submit_and_store(
+        compatible_mode,
+        editable_prompt,
+        human_input,
+        ai_response,
+        ground_truth_input,
+        eval_criteria_text_input,
+        score1_desc,
+        score2_desc,
+        score3_desc,
+        score4_desc,
+        score5_desc,
+    ):
+        if compatible_mode:
+            # Build the prompt using the new format
+            prompt = f"""###Task Description:
+An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
+1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
+2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
+3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
+4. Please do not generate any other openings, closings, or explanations.
+
+###The instruction to evaluate:
+{human_input}
+
+###Response to evaluate:
+{ai_response}
+
+###Reference Answer (Score 5):
+{ground_truth_input}
+
+###Score Rubrics:
+[{eval_criteria_text_input}]
+Score 1: {score1_desc}
+Score 2: {score2_desc}
+Score 3: {score3_desc}
+Score 4: {score4_desc}
+Score 5: {score5_desc}
+
+###Feedback:
+"""
+            final_prompt = prompt
+            use_alternative_prompt = True
+        else:
+            # Combine the editable prompt with fixed suffix
+            full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
+            # Replace variables in the eval prompt
+            variable_values = {'input': human_input, 'response': ai_response}
+            final_prompt = get_final_prompt(full_prompt, variable_values)
+            use_alternative_prompt = False
+
+        # Filter models based on compatible mode
+        if compatible_mode:
+            # Include all models when compatible mode is enabled
+            models = list(model_data.keys())
+        else:
+            # Exclude Prometheus models when not in compatible mode
+            models = [
+                model_name for model_name in model_data.keys()
+                if model_data[model_name]["organization"] != "Prometheus"
+            ]
+
+        # Select two models randomly from the filtered list
+        model1, model2 = random.sample(models, 2)
+        model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
+
+        # Get responses from models
+        response_a = get_model_response(
             model_a,
+            model_data.get(model_a),
+            final_prompt,
+            use_alternative_prompt=use_alternative_prompt
+        )
+        response_b = get_model_response(
             model_b,
+            model_data.get(model_b),
             final_prompt,
-        ) = submit_prompt(full_prompt, *variables)
+            use_alternative_prompt=use_alternative_prompt
+        )

-        # Parse the responses
-        score_a, critique_a = parse_model_response(response_a)
-        score_b, critique_b = parse_model_response(response_b)
+        # Parse the responses based on mode
+        if compatible_mode:
+            score_a_val, critique_a_val = alternative_parse_model_response(response_a)
+            score_b_val, critique_b_val = alternative_parse_model_response(response_b)
+        else:
+            score_a_val, critique_a_val = parse_model_response(response_a)
+            score_b_val, critique_b_val = parse_model_response(response_b)

         # Only append "/ 5" if using the default prompt
-        if editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
-            score_a = f"{score_a} / 5"
-            score_b = f"{score_b} / 5"
+        if not compatible_mode and editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
+            score_a_val = f"{score_a_val} / 5"
+            score_b_val = f"{score_b_val} / 5"

-        # Update the last_submission state with the current values
-        last_submission.value = {"prompt": full_prompt, "variables": variables}
+        # Update the last_submission state
+        last_submission.value = {"prompt": final_prompt, "variables": [human_input, ai_response]}

         return (
-            score_a,
-            critique_a,
-            score_b,
-            critique_b,
+            score_a_val,
+            critique_a_val,
+            score_b_val,
+            critique_b_val,
             gr.update(interactive=True, variant="primary"), # vote_a
             gr.update(interactive=True, variant="primary"), # vote_b
             gr.update(interactive=True, variant="primary"), # vote_tie
@@ -662,18 +787,26 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             final_prompt,
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
-            gr.update(
-                value="Regenerate judges",
-                variant="secondary",
-                interactive=True
-            ),
+            gr.update(value="Regenerate judges", variant="secondary", interactive=True),
             gr.update(value="🎲"), # random_btn
         )

     # Update the click handler to use the editable prompt
     send_btn.click(
         fn=submit_and_store,
-        inputs=[eval_prompt_editable, human_input, ai_response],
+        inputs=[
+            compatible_mode_toggle,
+            eval_prompt_editable,
+            human_input,
+            ai_response,
+            ground_truth,
+            eval_criteria_text,
+            score1_description,
+            score2_description,
+            score3_description,
+            score4_description,
+            score5_description,
+        ],
         outputs=[
             score_a,
             critique_a,
@@ -692,64 +825,10 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         ],
     )

-    # Update the input change handlers to also disable regenerate button
-    # def handle_input_changes(prompt, *variables):
-    #     """Enable send button and manage regenerate button based on input changes"""
-    #     last_inputs = last_submission.value
-    #     current_inputs = {"prompt": prompt, "variables": variables}
-    #     inputs_changed = last_inputs != current_inputs
-    #     return [
-    #         gr.update(interactive=True), # send button always enabled
-    #         gr.update(
-    #             interactive=not inputs_changed
-    #         ), # regenerate button disabled if inputs changed
-    #     ]
-
-    # Update the change handlers for prompt and variables
-    #eval_prompt.change(
-    #    fn=handle_input_changes,
-    #    inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    #    outputs=[send_btn, regenerate_button],
-    #)
-
-    # for _, var_input in variable_rows:
-    #     var_input.change(
-    #         fn=handle_input_changes,
-    #         inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    #         outputs=[send_btn, regenerate_button],
-    #     )
-
-    # Add click handlers for metric buttons
-    #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
-
-    #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
-
-    #hallucination_btn.click(
-    #    fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
-    #)
-
-    #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
-
-    #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
-
-    #coherence_btn.click(
-    #    fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
-    #)
-
-    #faithfulness_btn.click(
-    #    fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
-    #)
-
-    # Set default metric at startup
-    demo.load(
-        #fn=lambda: set_example_metric("Hallucination"),
-        #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    )
-
     # Add random button handler
     random_btn.click(
         fn=populate_random_example,
-        inputs=[],
+        inputs=[compatible_mode_toggle], # Use compatible mode toggle to decide behavior
         outputs=[
             human_input,
             ai_response,
@@ -763,6 +842,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             vote_tie,
             model_name_a,
             model_name_b,
+            ground_truth, # Set ground truth
         ]
     )

@@ -810,10 +890,149 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

     # Update the demo.load to include the random example population
     demo.load(
-        fn=populate_random_example,
+        fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
         inputs=[],
-        outputs=[human_input, ai_response]
+        outputs=[
+            human_input,
+            ai_response,
+            random_btn,
+            score_a,
+            critique_a,
+            score_b,
+            critique_b,
+            vote_a,
+            vote_b,
+            vote_tie,
+            model_name_a,
+            model_name_b,
+            ground_truth,
+        ]
+    )
+
+    # Add new state variables for compatible mode
+    eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
+    score1_previous = gr.State(value=DEFAULT_SCORE_1)
+    score2_previous = gr.State(value=DEFAULT_SCORE_2)
+    score3_previous = gr.State(value=DEFAULT_SCORE_3)
+    score4_previous = gr.State(value=DEFAULT_SCORE_4)
+    score5_previous = gr.State(value=DEFAULT_SCORE_5)
+
+    # Add new functions to handle compatible mode saves/cancels
+    def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
+        return [
+            gr.update(value=criteria), # Update criteria
+            criteria, # Update previous criteria state
+            gr.update(value=score1),
+            score1,
+            gr.update(value=score2),
+            score2,
+            gr.update(value=score3),
+            score3,
+            gr.update(value=score4),
+            score4,
+            gr.update(value=score5),
+            score5,
+            gr.update(visible=False) # Hide buttons
+        ]
+
+    def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
+        return [
+            gr.update(value=prev_criteria),
+            prev_criteria,
+            gr.update(value=prev_score1),
+            prev_score1,
+            gr.update(value=prev_score2),
+            prev_score2,
+            gr.update(value=prev_score3),
+            prev_score3,
+            gr.update(value=prev_score4),
+            prev_score4,
+            gr.update(value=prev_score5),
+            prev_score5,
+            gr.update(visible=False)
+        ]
+
+    def show_compatible_edit_buttons(*current_values):
+        previous_values = current_values[1::2] # Get previous values
+        current_values = current_values[::2] # Get current values
+        return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
+
+    # Add click handlers for compatible mode buttons
+    compatible_save_btn.click(
+        fn=save_compatible_prompt,
+        inputs=[
+            eval_criteria_text,
+            score1_description,
+            score2_description,
+            score3_description,
+            score4_description,
+            score5_description
+        ],
+        outputs=[
+            eval_criteria_text,
+            eval_criteria_previous,
+            score1_description,
+            score1_previous,
+            score2_description,
+            score2_previous,
+            score3_description,
+            score3_previous,
+            score4_description,
+            score4_previous,
+            score5_description,
+            score5_previous,
+            compatible_edit_buttons_row
+        ]
+    )
+
+    compatible_cancel_btn.click(
+        fn=cancel_compatible_prompt,
+        inputs=[
+            eval_criteria_previous,
+            score1_previous,
+            score2_previous,
+            score3_previous,
+            score4_previous,
+            score5_previous
+        ],
+        outputs=[
+            eval_criteria_text,
+            eval_criteria_previous,
+            score1_description,
+            score1_previous,
+            score2_description,
+            score2_previous,
+            score3_description,
+            score3_previous,
+            score4_description,
+            score4_previous,
+            score5_description,
+            score5_previous,
+            compatible_edit_buttons_row
+        ]
     )

+    # Add change handlers for all compatible mode inputs
+    for component in [eval_criteria_text, score1_description, score2_description,
+                      score3_description, score4_description, score5_description]:
+        component.change(
+            fn=show_compatible_edit_buttons,
+            inputs=[
+                eval_criteria_text,
+                eval_criteria_previous,
+                score1_description,
+                score1_previous,
+                score2_description,
+                score2_previous,
+                score3_description,
+                score3_previous,
+                score4_description,
+                score4_previous,
+                score5_description,
+                score5_previous
+            ],
+            outputs=compatible_edit_buttons_row
+        )
+
 if __name__ == "__main__":
     demo.launch()