Update app.py
app.py
CHANGED
@@ -13,10 +13,15 @@ load_dotenv()
 import gradio as gr
 from gen_api_answer import (
     get_model_response,
-    parse_model_response,
+    parse_model_response,
+    alternative_parse_model_response
+)
+
+from random_sample_generation import (
     get_random_human_ai_pair,
+    get_random_human_ai_ground_truth_pair,
     generate_ai_response
-)
+)
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
@@ -33,6 +38,12 @@ from common import (
     VOTING_HEADER,
     DEFAULT_EVAL_PROMPT_EDITABLE,
     FIXED_EVAL_SUFFIX,
+    DEFAULT_EVAL_CRITERIA,
+    DEFAULT_SCORE_1,
+    DEFAULT_SCORE_2,
+    DEFAULT_SCORE_3,
+    DEFAULT_SCORE_4,
+    DEFAULT_SCORE_5,
 )
 from leaderboard import (
     get_leaderboard,
@@ -292,9 +303,16 @@ leaderboard_table = gr.Dataframe(
 )


-def populate_random_example(request: gr.Request):
+def populate_random_example(request: gr.Request, compatible_mode: bool):
     """Generate a random human-AI conversation example and reset judge outputs."""
-
+    if compatible_mode:
+        # Generate all three components when compatible mode is enabled
+        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
+    else:
+        # Generate only human and AI messages when compatible mode is disabled
+        human_msg, ai_msg = get_random_human_ai_pair()
+        ground_truth_msg = ""
+
     return [
         gr.update(value=human_msg),
         gr.update(value=ai_msg),
@@ -308,6 +326,7 @@ def populate_random_example(request: gr.Request):
         gr.update(interactive=False, variant="primary"), # Reset vote tie
         gr.update(value="*Model: Hidden*"), # Reset model name A
         gr.update(value="*Model: Hidden*"), # Reset model name B
+        gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
     ]


@@ -345,6 +364,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                     placeholder="Enter the AI response here..."
                 )

+                # Ground truth response (initially hidden)
+                ground_truth = gr.TextArea(
+                    label="🎯 Ground truth response",
+                    lines=12,
+                    placeholder="Enter the ground truth response here...",
+                    visible=False
+                )
+
                 with gr.Row():
                     random_btn = gr.Button("🎲", scale=2)
                     send_btn = gr.Button(
@@ -381,22 +408,86 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                     vote_b = gr.Button("Vote B", variant="primary", interactive=False)
                 with gr.Column(scale=9, min_width=400): # Wider width for critique
                     critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
-
+                    # Place Vote B button directly under Judge B

             gr.Markdown("<br>")
-
-
-
-
-
-
-
+
+
+            # Replace the "Edit Judge Prompt" Accordion section with:
+            with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
+                gr.Markdown("<br>")
+                compatible_mode_toggle = gr.Checkbox(
+                    label="Use a prompt compatible with Prometheus models",
+                    value=False
                 )
-
-
-
-
-
+
+                # Default prompt editor
+                with gr.Column(visible=True) as default_prompt_editor:
+                    eval_prompt_editable = gr.TextArea(
+                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
+                        label="Evaluation Criteria",
+                        lines=12
+                    )
+
+                    with gr.Row(visible=False) as edit_buttons_row:
+                        cancel_prompt_btn = gr.Button("Cancel")
+                        save_prompt_btn = gr.Button("Save", variant="primary")
+                    gr.Markdown("*The sample being evaluated is always appended as:*")
+                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
+
+                # Compatible mode editor
+                with gr.Column(visible=False) as compatible_prompt_editor:
+                    with gr.Row():
+                        # Left column - Evaluation Criteria
+                        with gr.Column(scale=1):
+                            eval_criteria_text = gr.TextArea(
+                                label="Evaluation Criteria",
+                                lines=12,
+                                value=DEFAULT_EVAL_CRITERIA,
+                                placeholder="Enter the evaluation criteria..."
+                            )
+                            prometheus_reference = gr.Markdown(
+                                "<br> *This enforces the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
+                                visible=False # Initially hidden
+                            )
+
+                        # Right column - Score Descriptions
+                        with gr.Column(scale=1):
+                            score1_description = gr.TextArea(
+                                label="Score 1",
+                                value=DEFAULT_SCORE_1,
+                                placeholder="Description for score 1",
+                                lines=2
+                            )
+                            score2_description = gr.TextArea(
+                                label="Score 2",
+                                value=DEFAULT_SCORE_2,
+                                placeholder="Description for score 2",
+                                lines=2
+                            )
+                            score3_description = gr.TextArea(
+                                label="Score 3",
+                                value=DEFAULT_SCORE_3,
+                                placeholder="Description for score 3",
+                                lines=2
+                            )
+                            score4_description = gr.TextArea(
+                                label="Score 4",
+                                value=DEFAULT_SCORE_4,
+                                placeholder="Description for score 4",
+                                lines=2
+                            )
+                            score5_description = gr.TextArea(
+                                label="Score 5",
+                                value=DEFAULT_SCORE_5,
+                                placeholder="Description for score 5",
+                                lines=2
+                            )
+
+                    # Add save/cancel buttons for compatible mode
+                    with gr.Row(visible=False) as compatible_edit_buttons_row:
+                        compatible_cancel_btn = gr.Button("Cancel")
+                        compatible_save_btn = gr.Button("Save", variant="primary")

         with gr.TabItem("Leaderboard"):
             with gr.Row():
@@ -404,7 +495,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                 show_preliminary = gr.Checkbox(
                     label="Reveal preliminary results",
                     value=True, # Checked by default
-                    info="Show all models, including models with less
+                    info="Show all models, including models with less human ratings (< 500 votes)",
                     interactive=True
                 )
             stats_display = gr.Markdown()
@@ -412,7 +503,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                 headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
                 datatype=["str", "number", "str", "number", "str", "str", "str"],
             )
-
+
             gr.Markdown("""<br>
 <br>
 Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
@@ -444,62 +535,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     final_prompt_state = gr.State()
     eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
     is_editing = gr.State(False) # Track editing state
-
-    # Update variable inputs based on the eval prompt
-    #def update_variables(eval_prompt):
-    # variables = parse_variables(eval_prompt)
-    # updates = []
-
-    # for i in range(len(variable_rows)):
-    # var_row, var_input = variable_rows[i]
-    # if i < len(variables):
-    # var_name = variables[i]
-    # # Set the number of lines based on the variable name
-    # if var_name == "response":
-    # lines = 4 # Adjust this number as needed
-    # else:
-    # lines = 1 # Default to single line for other variables
-    # updates.extend(
-    # [
-    # gr.update(visible=True), # Show the variable row
-    # gr.update(
-    # label=var_name, visible=True, lines=lines
-    # ), # Update label and lines
-    # ]
-    # )
-    # else:
-    # updates.extend(
-    # [
-    # gr.update(visible=False), # Hide the variable row
-    # gr.update(value="", visible=False), # Clear value when hidden
-    # ]
-    # )
-    # return updates
-
-    #eval_prompt.change(
-    # fn=update_variables,
-    # inputs=eval_prompt,
-    # outputs=[item for sublist in variable_rows for item in sublist],
-    #)
-
-    # Regenerate button functionality
-    #regenerate_button.click(
-    # fn=regenerate_prompt,
-    # inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
-    # outputs=[
-    # score_a,
-    # critique_a,
-    # score_b,
-    # critique_b,
-    # vote_a,
-    # vote_b,
-    # tie_button_row,
-    # model_name_a,
-    # model_name_b,
-    # model_a_state,
-    # model_b_state,
-    # ],
-    #)
+    compatible_mode_state = gr.State(False) # Track compatible mode state

     # Update model names after responses are generated
     def update_model_names(model_a, model_b):
@@ -621,39 +657,128 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         outputs=edit_buttons_row
     )

-    #
-    def
-
-
-
-
-
-
-
-
-
+    # Function to toggle visibility based on compatible mode
+    def toggle_compatible_mode(checked):
+        return {
+            ground_truth: gr.update(visible=checked),
+            default_prompt_editor: gr.update(visible=not checked),
+            compatible_prompt_editor: gr.update(visible=checked),
+            prometheus_reference: gr.update(visible=checked),
+        }
+
+    compatible_mode_toggle.change(
+        fn=toggle_compatible_mode,
+        inputs=[compatible_mode_toggle],
+        outputs=[
+            ground_truth,
+            default_prompt_editor,
+            compatible_prompt_editor,
+            prometheus_reference,
+        ]
+    )
+
+    # Update the submit function to handle compatible mode
+    def submit_and_store(
+        compatible_mode,
+        editable_prompt,
+        human_input,
+        ai_response,
+        ground_truth_input,
+        eval_criteria_text_input,
+        score1_desc,
+        score2_desc,
+        score3_desc,
+        score4_desc,
+        score5_desc,
+    ):
+        if compatible_mode:
+            # Build the prompt using the new format
+            prompt = f"""###Task Description:
+An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
+1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
+2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
+3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
+4. Please do not generate any other openings, closings, or explanations.
+
+###The instruction to evaluate:
+{human_input}
+
+###Response to evaluate:
+{ai_response}
+
+###Reference Answer (Score 5):
+{ground_truth_input}
+
+###Score Rubrics:
+[{eval_criteria_text_input}]
+Score 1: {score1_desc}
+Score 2: {score2_desc}
+Score 3: {score3_desc}
+Score 4: {score4_desc}
+Score 5: {score5_desc}
+
+###Feedback:
+"""
+            final_prompt = prompt
+            use_alternative_prompt = True
+        else:
+            # Combine the editable prompt with fixed suffix
+            full_prompt = editable_prompt + FIXED_EVAL_SUFFIX
+            # Replace variables in the eval prompt
+            variable_values = {'input': human_input, 'response': ai_response}
+            final_prompt = get_final_prompt(full_prompt, variable_values)
+            use_alternative_prompt = False
+
+        # Filter models based on compatible mode
+        if compatible_mode:
+            # Include all models when compatible mode is enabled
+            models = list(model_data.keys())
+        else:
+            # Exclude Prometheus models when not in compatible mode
+            models = [
+                model_name for model_name in model_data.keys()
+                if model_data[model_name]["organization"] != "Prometheus"
+            ]
+
+        # Select two models randomly from the filtered list
+        model1, model2 = random.sample(models, 2)
+        model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
+
+        # Get responses from models
+        response_a = get_model_response(
             model_a,
+            model_data.get(model_a),
+            final_prompt,
+            use_alternative_prompt=use_alternative_prompt
+        )
+        response_b = get_model_response(
             model_b,
+            model_data.get(model_b),
             final_prompt,
-
+            use_alternative_prompt=use_alternative_prompt
+        )

-        # Parse the responses
-
-
+        # Parse the responses based on mode
+        if compatible_mode:
+            score_a_val, critique_a_val = alternative_parse_model_response(response_a)
+            score_b_val, critique_b_val = alternative_parse_model_response(response_b)
+        else:
+            score_a_val, critique_a_val = parse_model_response(response_a)
+            score_b_val, critique_b_val = parse_model_response(response_b)

         # Only append "/ 5" if using the default prompt
-        if editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
-
-
+        if not compatible_mode and editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
+            score_a_val = f"{score_a_val} / 5"
+            score_b_val = f"{score_b_val} / 5"

-        # Update the last_submission state
-        last_submission.value = {"prompt":
+        # Update the last_submission state
+        last_submission.value = {"prompt": final_prompt, "variables": [human_input, ai_response]}

         return (
-
-
-
-
+            score_a_val,
+            critique_a_val,
+            score_b_val,
+            critique_b_val,
             gr.update(interactive=True, variant="primary"), # vote_a
             gr.update(interactive=True, variant="primary"), # vote_b
             gr.update(interactive=True, variant="primary"), # vote_tie
@@ -662,18 +787,26 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             final_prompt,
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
-            gr.update(
-                value="Regenerate judges",
-                variant="secondary",
-                interactive=True
-            ),
+            gr.update(value="Regenerate judges", variant="secondary", interactive=True),
             gr.update(value="🎲"), # random_btn
         )

     # Update the click handler to use the editable prompt
     send_btn.click(
         fn=submit_and_store,
-        inputs=[
+        inputs=[
+            compatible_mode_toggle,
+            eval_prompt_editable,
+            human_input,
+            ai_response,
+            ground_truth,
+            eval_criteria_text,
+            score1_description,
+            score2_description,
+            score3_description,
+            score4_description,
+            score5_description,
+        ],
         outputs=[
             score_a,
             critique_a,
@@ -692,64 +825,10 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         ],
     )

-    # Update the input change handlers to also disable regenerate button
-    # def handle_input_changes(prompt, *variables):
-    # """Enable send button and manage regenerate button based on input changes"""
-    # last_inputs = last_submission.value
-    # current_inputs = {"prompt": prompt, "variables": variables}
-    # inputs_changed = last_inputs != current_inputs
-    # return [
-    # gr.update(interactive=True), # send button always enabled
-    # gr.update(
-    # interactive=not inputs_changed
-    # ), # regenerate button disabled if inputs changed
-    # ]
-
-    # Update the change handlers for prompt and variables
-    #eval_prompt.change(
-    # fn=handle_input_changes,
-    # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    # outputs=[send_btn, regenerate_button],
-    #)
-
-    # for _, var_input in variable_rows:
-    # var_input.change(
-    # fn=handle_input_changes,
-    # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    # outputs=[send_btn, regenerate_button],
-    # )
-
-    # Add click handlers for metric buttons
-    #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
-
-    #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
-
-    #hallucination_btn.click(
-    # fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
-    #)
-
-    #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
-
-    #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
-
-    #coherence_btn.click(
-    # fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
-    #)
-
-    #faithfulness_btn.click(
-    # fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
-    #)
-
-    # Set default metric at startup
-    demo.load(
-    #fn=lambda: set_example_metric("Hallucination"),
-    #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
-    )
-
     # Add random button handler
     random_btn.click(
         fn=populate_random_example,
-        inputs=[],
+        inputs=[compatible_mode_toggle], # Use compatible mode toggle to decide behavior
         outputs=[
             human_input,
             ai_response,
@@ -763,6 +842,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             vote_tie,
             model_name_a,
             model_name_b,
+            ground_truth, # Set ground truth
         ]
     )

@@ -810,10 +890,149 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

     # Update the demo.load to include the random example population
     demo.load(
-        fn=populate_random_example,
+        fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
         inputs=[],
-        outputs=[
+        outputs=[
+            human_input,
+            ai_response,
+            random_btn,
+            score_a,
+            critique_a,
+            score_b,
+            critique_b,
+            vote_a,
+            vote_b,
+            vote_tie,
+            model_name_a,
+            model_name_b,
+            ground_truth,
+        ]
+    )
+
+    # Add new state variables for compatible mode
+    eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
+    score1_previous = gr.State(value=DEFAULT_SCORE_1)
+    score2_previous = gr.State(value=DEFAULT_SCORE_2)
+    score3_previous = gr.State(value=DEFAULT_SCORE_3)
+    score4_previous = gr.State(value=DEFAULT_SCORE_4)
+    score5_previous = gr.State(value=DEFAULT_SCORE_5)
+
+    # Add new functions to handle compatible mode saves/cancels
+    def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
+        return [
+            gr.update(value=criteria), # Update criteria
+            criteria, # Update previous criteria state
+            gr.update(value=score1),
+            score1,
+            gr.update(value=score2),
+            score2,
+            gr.update(value=score3),
+            score3,
+            gr.update(value=score4),
+            score4,
+            gr.update(value=score5),
+            score5,
+            gr.update(visible=False) # Hide buttons
+        ]
+
+    def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
+        return [
+            gr.update(value=prev_criteria),
+            prev_criteria,
+            gr.update(value=prev_score1),
+            prev_score1,
+            gr.update(value=prev_score2),
+            prev_score2,
+            gr.update(value=prev_score3),
+            prev_score3,
+            gr.update(value=prev_score4),
+            prev_score4,
+            gr.update(value=prev_score5),
+            prev_score5,
+            gr.update(visible=False)
+        ]
+
+    def show_compatible_edit_buttons(*current_values):
+        previous_values = current_values[1::2] # Get previous values
+        current_values = current_values[::2] # Get current values
+        return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
+
+    # Add click handlers for compatible mode buttons
+    compatible_save_btn.click(
+        fn=save_compatible_prompt,
+        inputs=[
+            eval_criteria_text,
+            score1_description,
+            score2_description,
+            score3_description,
+            score4_description,
+            score5_description
+        ],
+        outputs=[
+            eval_criteria_text,
+            eval_criteria_previous,
+            score1_description,
+            score1_previous,
+            score2_description,
+            score2_previous,
+            score3_description,
+            score3_previous,
+            score4_description,
+            score4_previous,
+            score5_description,
+            score5_previous,
+            compatible_edit_buttons_row
+        ]
+    )
+
+    compatible_cancel_btn.click(
+        fn=cancel_compatible_prompt,
+        inputs=[
+            eval_criteria_previous,
+            score1_previous,
+            score2_previous,
+            score3_previous,
+            score4_previous,
+            score5_previous
+        ],
+        outputs=[
+            eval_criteria_text,
+            eval_criteria_previous,
+            score1_description,
+            score1_previous,
+            score2_description,
+            score2_previous,
+            score3_description,
+            score3_previous,
+            score4_description,
+            score4_previous,
+            score5_description,
+            score5_previous,
+            compatible_edit_buttons_row
+        ]
     )

+    # Add change handlers for all compatible mode inputs
+    for component in [eval_criteria_text, score1_description, score2_description,
+                      score3_description, score4_description, score5_description]:
+        component.change(
+            fn=show_compatible_edit_buttons,
+            inputs=[
+                eval_criteria_text,
+                eval_criteria_previous,
+                score1_description,
+                score1_previous,
+                score2_description,
+                score2_previous,
+                score3_description,
+                score3_previous,
+                score4_description,
+                score4_previous,
+                score5_description,
+                score5_previous
+            ],
+            outputs=compatible_edit_buttons_row
+        )
+
 if __name__ == "__main__":
     demo.launch()
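Note: the compatible-mode path above calls alternative_parse_model_response, imported from gen_api_answer, but this commit does not show that function's body. Below is a minimal sketch of a parser that matches the output contract the Prometheus-style template asks for ("Feedback: (text) [RESULT] (an integer between 1 and 5)"); the function name, regex, and error handling are assumptions for illustration, not the repository's actual implementation.

import re

def alternative_parse_model_response_sketch(output: str):
    """Hypothetical parser for Prometheus-style judge output:
    'Feedback: <critique> [RESULT] <integer 1-5>'."""
    if not output:
        return "Error", "No response received"
    # The template's step 3 requires a [RESULT] token followed by the score.
    match = re.search(r"\[RESULT\]\s*\(?\s*([1-5])\s*\)?", output)
    if not match:
        # Fall back to the raw text so the UI still shows something useful.
        return "Error", output.strip()
    score = match.group(1)
    # Everything before [RESULT] is the critique; drop the optional prefix.
    critique = output.split("[RESULT]", 1)[0]
    critique = re.sub(r"^\s*Feedback:\s*", "", critique).strip()
    return score, critique

A parser along these lines would slot into the `if compatible_mode:` branch of submit_and_store, returning a (score, critique) pair just as parse_model_response does for the default prompt; the real implementation may differ in how it handles malformed outputs.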
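The show_compatible_edit_buttons handler in the diff relies on its inputs list alternating current and previous values (current, previous, current, previous, ...), which the strided slices [::2] and [1::2] then separate. A small standalone check of that slicing logic, with plain strings standing in for the Gradio component values:

def edit_buttons_visible(*values):
    # Mirrors show_compatible_edit_buttons: values arrive interleaved as
    # (current_1, previous_1, current_2, previous_2, ...).
    previous_values = values[1::2]  # items at odd indices: the saved states
    current_values = values[::2]    # items at even indices: the live inputs
    return any(curr != prev for curr, prev in zip(current_values, previous_values))

# Nothing edited: the Save/Cancel row stays hidden.
assert edit_buttons_visible("criteria", "criteria", "score 1 text", "score 1 text") is False
# One field edited: the Save/Cancel row becomes visible.
assert edit_buttons_visible("criteria EDITED", "criteria", "score 1 text", "score 1 text") is True

This only restates the slicing shown in the diff; the real handler wraps the boolean in gr.update(visible=...) so Gradio can toggle the button row.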