Spaces:
Running
Running
Update app.py
Browse files
Newer models get featured in battles more to catch up on votes
app.py
CHANGED
@@ -13,7 +13,8 @@ import gradio as gr
|
|
13 |
from gen_api_answer import (
|
14 |
get_model_response,
|
15 |
parse_model_response,
|
16 |
-
prometheus_parse_model_response
|
|
|
17 |
)
|
18 |
|
19 |
from random_sample_generation import (
|
@@ -113,40 +114,6 @@ def get_final_prompt(eval_prompt, variable_values):
|
|
113 |
return eval_prompt
|
114 |
|
115 |
|
116 |
-
def submit_prompt(eval_prompt, *variable_values):
|
117 |
-
try:
|
118 |
-
variables = parse_variables(eval_prompt)
|
119 |
-
variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
|
120 |
-
final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
|
121 |
-
|
122 |
-
models = list(model_data.keys())
|
123 |
-
model1, model2 = random.sample(models, 2)
|
124 |
-
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
125 |
-
|
126 |
-
response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
|
127 |
-
response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
|
128 |
-
|
129 |
-
return (
|
130 |
-
response_a,
|
131 |
-
response_b,
|
132 |
-
gr.update(visible=True),
|
133 |
-
gr.update(visible=True),
|
134 |
-
model_a,
|
135 |
-
model_b,
|
136 |
-
final_prompt,
|
137 |
-
)
|
138 |
-
except Exception as e:
|
139 |
-
print(f"Error in submit_prompt: {str(e)}")
|
140 |
-
return (
|
141 |
-
"Error generating response",
|
142 |
-
"Error generating response",
|
143 |
-
gr.update(visible=False),
|
144 |
-
gr.update(visible=False),
|
145 |
-
None,
|
146 |
-
None,
|
147 |
-
None,
|
148 |
-
)
|
149 |
-
|
150 |
|
151 |
def get_ip(request: gr.Request) -> str:
|
152 |
"""Get and hash the IP address from the request."""
|
@@ -492,7 +459,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
492 |
show_preliminary = gr.Checkbox(
|
493 |
label="Reveal preliminary results",
|
494 |
value=True, # Checked by default
|
495 |
-
info="Show all models, including models with less human ratings (<
|
496 |
interactive=True
|
497 |
)
|
498 |
stats_display = gr.Markdown()
|
@@ -714,6 +681,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
714 |
score3_description,
|
715 |
score4_description,
|
716 |
score5_description,
|
|
|
717 |
):
|
718 |
# Build prompt data dictionary
|
719 |
prompt_data = {
|
@@ -728,9 +696,40 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
728 |
'score5_desc': score5_description,
|
729 |
}
|
730 |
|
731 |
-
models
|
732 |
-
|
733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
734 |
|
735 |
# Get responses from models
|
736 |
response_a = get_model_response(
|
@@ -746,13 +745,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
746 |
use_reference=use_reference
|
747 |
)
|
748 |
|
749 |
-
# Parse the responses based on model, using
|
750 |
is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
|
751 |
is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
|
|
|
|
|
752 |
|
753 |
if is_prometheus_a:
|
754 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
755 |
score_a_val = f"{score_a_val} / 5"
|
|
|
|
|
|
|
756 |
else:
|
757 |
score_a_val, critique_a_val = parse_model_response(response_a)
|
758 |
score_a_val = f"{score_a_val} / 5"
|
@@ -760,6 +764,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
760 |
if is_prometheus_b:
|
761 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
762 |
score_b_val = f"{score_b_val} / 5"
|
|
|
|
|
|
|
763 |
else:
|
764 |
score_b_val, critique_b_val = parse_model_response(response_b)
|
765 |
score_b_val = f"{score_b_val} / 5"
|
@@ -781,9 +788,21 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
781 |
gr.update(value="🎲"), # random_btn
|
782 |
)
|
783 |
|
784 |
-
# Update the click handler to use
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
785 |
send_btn.click(
|
786 |
-
fn=submit_and_store,
|
787 |
inputs=[
|
788 |
use_reference_toggle,
|
789 |
eval_criteria_text,
|
|
|
13 |
from gen_api_answer import (
|
14 |
get_model_response,
|
15 |
parse_model_response,
|
16 |
+
prometheus_parse_model_response,
|
17 |
+
atla_parse_model_response
|
18 |
)
|
19 |
|
20 |
from random_sample_generation import (
|
|
|
114 |
return eval_prompt
|
115 |
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
def get_ip(request: gr.Request) -> str:
|
119 |
"""Get and hash the IP address from the request."""
|
|
|
459 |
show_preliminary = gr.Checkbox(
|
460 |
label="Reveal preliminary results",
|
461 |
value=True, # Checked by default
|
462 |
+
info="Show all models, including models with less human ratings (< 300 votes)",
|
463 |
interactive=True
|
464 |
)
|
465 |
stats_display = gr.Markdown()
|
|
|
681 |
score3_description,
|
682 |
score4_description,
|
683 |
score5_description,
|
684 |
+
is_first_game=False
|
685 |
):
|
686 |
# Build prompt data dictionary
|
687 |
prompt_data = {
|
|
|
696 |
'score5_desc': score5_description,
|
697 |
}
|
698 |
|
699 |
+
# Get list of active models only for matches
|
700 |
+
active_models = [name for name, info in model_data.items()
|
701 |
+
if info.get("active", True)] # Default to True for backward compatibility
|
702 |
+
|
703 |
+
# Modified model selection logic
|
704 |
+
atla_model = "Atla-8B-preview-2024-01-08"
|
705 |
+
|
706 |
+
if is_first_game:
|
707 |
+
# For the first game, ensure Atla is one of the models
|
708 |
+
other_models = [m for m in active_models if m != atla_model]
|
709 |
+
other_model = random.choice(other_models)
|
710 |
+
|
711 |
+
# Randomly assign Atla to either position A or B
|
712 |
+
if random.random() < 0.5:
|
713 |
+
model_a, model_b = atla_model, other_model
|
714 |
+
else:
|
715 |
+
model_a, model_b = other_model, atla_model
|
716 |
+
else:
|
717 |
+
# For subsequent games, Atla appears 30% of the time
|
718 |
+
if random.random() < 0.3:
|
719 |
+
# Include Atla in this battle
|
720 |
+
other_models = [m for m in active_models if m != atla_model]
|
721 |
+
other_model = random.choice(other_models)
|
722 |
+
|
723 |
+
# Randomly assign Atla to either position A or B
|
724 |
+
if random.random() < 0.5:
|
725 |
+
model_a, model_b = atla_model, other_model
|
726 |
+
else:
|
727 |
+
model_a, model_b = other_model, atla_model
|
728 |
+
else:
|
729 |
+
# Battle between two non-Atla models
|
730 |
+
non_atla_models = [m for m in active_models if m != atla_model]
|
731 |
+
model1, model2 = random.sample(non_atla_models, 2)
|
732 |
+
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
733 |
|
734 |
# Get responses from models
|
735 |
response_a = get_model_response(
|
|
|
745 |
use_reference=use_reference
|
746 |
)
|
747 |
|
748 |
+
# Parse the responses based on model, using appropriate parsing for different models
|
749 |
is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
|
750 |
is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
|
751 |
+
is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
|
752 |
+
is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
|
753 |
|
754 |
if is_prometheus_a:
|
755 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
756 |
score_a_val = f"{score_a_val} / 5"
|
757 |
+
elif is_atla_a:
|
758 |
+
score_a_val, critique_a_val = atla_parse_model_response(response_a)
|
759 |
+
score_a_val = f"{score_a_val} / 5"
|
760 |
else:
|
761 |
score_a_val, critique_a_val = parse_model_response(response_a)
|
762 |
score_a_val = f"{score_a_val} / 5"
|
|
|
764 |
if is_prometheus_b:
|
765 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
766 |
score_b_val = f"{score_b_val} / 5"
|
767 |
+
elif is_atla_b:
|
768 |
+
score_b_val, critique_b_val = atla_parse_model_response(response_b)
|
769 |
+
score_b_val = f"{score_b_val} / 5"
|
770 |
else:
|
771 |
score_b_val, critique_b_val = parse_model_response(response_b)
|
772 |
score_b_val = f"{score_b_val} / 5"
|
|
|
788 |
gr.update(value="🎲"), # random_btn
|
789 |
)
|
790 |
|
791 |
+
# Update the click handler to use False for is_first_game after first submission
|
792 |
+
def create_submit_handler():
|
793 |
+
first_game = True
|
794 |
+
|
795 |
+
def handler(*args):
|
796 |
+
nonlocal first_game
|
797 |
+
result = submit_and_store(*args, first_game)
|
798 |
+
first_game = False # Set to False after first submission
|
799 |
+
return result
|
800 |
+
|
801 |
+
return handler
|
802 |
+
|
803 |
+
# Update the send_btn click handler
|
804 |
send_btn.click(
|
805 |
+
fn=create_submit_handler(),
|
806 |
inputs=[
|
807 |
use_reference_toggle,
|
808 |
eval_criteria_text,
|