kaikaidai commited on
Commit
281eda1
·
verified ·
1 Parent(s): 60f3337

Update app.py

Browse files

Newer models are featured in battles more often so they can catch up on votes

Files changed (1) hide show
  1. app.py +61 -42
app.py CHANGED
@@ -13,7 +13,8 @@ import gradio as gr
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
- prometheus_parse_model_response
 
17
  )
18
 
19
  from random_sample_generation import (
@@ -113,40 +114,6 @@ def get_final_prompt(eval_prompt, variable_values):
113
  return eval_prompt
114
 
115
 
116
- def submit_prompt(eval_prompt, *variable_values):
117
- try:
118
- variables = parse_variables(eval_prompt)
119
- variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
120
- final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
121
-
122
- models = list(model_data.keys())
123
- model1, model2 = random.sample(models, 2)
124
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
125
-
126
- response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
127
- response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
128
-
129
- return (
130
- response_a,
131
- response_b,
132
- gr.update(visible=True),
133
- gr.update(visible=True),
134
- model_a,
135
- model_b,
136
- final_prompt,
137
- )
138
- except Exception as e:
139
- print(f"Error in submit_prompt: {str(e)}")
140
- return (
141
- "Error generating response",
142
- "Error generating response",
143
- gr.update(visible=False),
144
- gr.update(visible=False),
145
- None,
146
- None,
147
- None,
148
- )
149
-
150
 
151
  def get_ip(request: gr.Request) -> str:
152
  """Get and hash the IP address from the request."""
@@ -492,7 +459,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
492
  show_preliminary = gr.Checkbox(
493
  label="Reveal preliminary results",
494
  value=True, # Checked by default
495
- info="Show all models, including models with less human ratings (< 500 votes)",
496
  interactive=True
497
  )
498
  stats_display = gr.Markdown()
@@ -714,6 +681,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
714
  score3_description,
715
  score4_description,
716
  score5_description,
 
717
  ):
718
  # Build prompt data dictionary
719
  prompt_data = {
@@ -728,9 +696,40 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
728
  'score5_desc': score5_description,
729
  }
730
 
731
- models = list(model_data.keys())
732
- model1, model2 = random.sample(models, 2)
733
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
 
735
  # Get responses from models
736
  response_a = get_model_response(
@@ -746,13 +745,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
746
  use_reference=use_reference
747
  )
748
 
749
- # Parse the responses based on model, using Prometheus parsing for Prometheus models and JSON parsing for others
750
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
751
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
 
 
752
 
753
  if is_prometheus_a:
754
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
755
  score_a_val = f"{score_a_val} / 5"
 
 
 
756
  else:
757
  score_a_val, critique_a_val = parse_model_response(response_a)
758
  score_a_val = f"{score_a_val} / 5"
@@ -760,6 +764,9 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
760
  if is_prometheus_b:
761
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
762
  score_b_val = f"{score_b_val} / 5"
 
 
 
763
  else:
764
  score_b_val, critique_b_val = parse_model_response(response_b)
765
  score_b_val = f"{score_b_val} / 5"
@@ -781,9 +788,21 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
781
  gr.update(value="🎲"), # random_btn
782
  )
783
 
784
- # Update the click handler to use the editable prompt
 
 
 
 
 
 
 
 
 
 
 
 
785
  send_btn.click(
786
- fn=submit_and_store,
787
  inputs=[
788
  use_reference_toggle,
789
  eval_criteria_text,
 
13
  from gen_api_answer import (
14
  get_model_response,
15
  parse_model_response,
16
+ prometheus_parse_model_response,
17
+ atla_parse_model_response
18
  )
19
 
20
  from random_sample_generation import (
 
114
  return eval_prompt
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  def get_ip(request: gr.Request) -> str:
119
  """Get and hash the IP address from the request."""
 
459
  show_preliminary = gr.Checkbox(
460
  label="Reveal preliminary results",
461
  value=True, # Checked by default
462
+ info="Show all models, including models with less human ratings (< 300 votes)",
463
  interactive=True
464
  )
465
  stats_display = gr.Markdown()
 
681
  score3_description,
682
  score4_description,
683
  score5_description,
684
+ is_first_game=False
685
  ):
686
  # Build prompt data dictionary
687
  prompt_data = {
 
696
  'score5_desc': score5_description,
697
  }
698
 
699
+ # Get list of active models only for matches
700
+ active_models = [name for name, info in model_data.items()
701
+ if info.get("active", True)] # Default to True for backward compatibility
702
+
703
+ # Modified model selection logic
704
+ atla_model = "Atla-8B-preview-2024-01-08"
705
+
706
+ if is_first_game:
707
+ # For the first game, ensure Atla is one of the models
708
+ other_models = [m for m in active_models if m != atla_model]
709
+ other_model = random.choice(other_models)
710
+
711
+ # Randomly assign Atla to either position A or B
712
+ if random.random() < 0.5:
713
+ model_a, model_b = atla_model, other_model
714
+ else:
715
+ model_a, model_b = other_model, atla_model
716
+ else:
717
+ # For subsequent games, Atla appears 30% of the time
718
+ if random.random() < 0.3:
719
+ # Include Atla in this battle
720
+ other_models = [m for m in active_models if m != atla_model]
721
+ other_model = random.choice(other_models)
722
+
723
+ # Randomly assign Atla to either position A or B
724
+ if random.random() < 0.5:
725
+ model_a, model_b = atla_model, other_model
726
+ else:
727
+ model_a, model_b = other_model, atla_model
728
+ else:
729
+ # Battle between two non-Atla models
730
+ non_atla_models = [m for m in active_models if m != atla_model]
731
+ model1, model2 = random.sample(non_atla_models, 2)
732
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
733
 
734
  # Get responses from models
735
  response_a = get_model_response(
 
745
  use_reference=use_reference
746
  )
747
 
748
+ # Parse the responses based on model, using appropriate parsing for different models
749
  is_prometheus_a = (model_data.get(model_a)['organization'] == 'Prometheus')
750
  is_prometheus_b = (model_data.get(model_b)['organization'] == 'Prometheus')
751
+ is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
752
+ is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
753
 
754
  if is_prometheus_a:
755
  score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
756
  score_a_val = f"{score_a_val} / 5"
757
+ elif is_atla_a:
758
+ score_a_val, critique_a_val = atla_parse_model_response(response_a)
759
+ score_a_val = f"{score_a_val} / 5"
760
  else:
761
  score_a_val, critique_a_val = parse_model_response(response_a)
762
  score_a_val = f"{score_a_val} / 5"
 
764
  if is_prometheus_b:
765
  score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
766
  score_b_val = f"{score_b_val} / 5"
767
+ elif is_atla_b:
768
+ score_b_val, critique_b_val = atla_parse_model_response(response_b)
769
+ score_b_val = f"{score_b_val} / 5"
770
  else:
771
  score_b_val, critique_b_val = parse_model_response(response_b)
772
  score_b_val = f"{score_b_val} / 5"
 
788
  gr.update(value="🎲"), # random_btn
789
  )
790
 
791
+ # Update the click handler to use False for is_first_game after first submission
792
+ def create_submit_handler():
793
+ first_game = True
794
+
795
+ def handler(*args):
796
+ nonlocal first_game
797
+ result = submit_and_store(*args, first_game)
798
+ first_game = False # Set to False after first submission
799
+ return result
800
+
801
+ return handler
802
+
803
+ # Update the send_btn click handler
804
  send_btn.click(
805
+ fn=create_submit_handler(),
806
  inputs=[
807
  use_reference_toggle,
808
  eval_criteria_text,