pminervini committed on
Commit
61c2746
1 Parent(s): 9af5ebf
Files changed (3) hide show
  1. app.py +1 -5
  2. src/display/utils.py +13 -0
  3. src/tools/plots.py +17 -1
app.py CHANGED
@@ -30,11 +30,7 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
  from src.submission.submit import add_new_eval
31
  # from src.submission.check_validity import already_submitted_models
32
  # from src.tools.collections import update_collections
33
- from src.tools.plots import (
34
- create_metric_plot_obj,
35
- create_plot_df,
36
- create_scores_df,
37
- )
38
 
39
 
40
  def restart_space():
 
30
  from src.submission.submit import add_new_eval
31
  # from src.submission.check_validity import already_submitted_models
32
  # from src.tools.collections import update_collections
33
+ # from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
 
 
34
 
35
 
36
  def restart_space():
src/display/utils.py CHANGED
@@ -62,6 +62,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
62
  # We use make dataclass to dynamically fill the scores from Tasks
63
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
64
 
 
65
  @dataclass(frozen=True)
66
  class EvalQueueColumn: # Queue column
67
  model = ColumnContent("model", "markdown", True)
@@ -72,6 +73,18 @@ class EvalQueueColumn: # Queue column
72
  status = ColumnContent("status", "str", True)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  @dataclass
76
  class ModelDetails:
77
  name: str
 
62
  # We use make dataclass to dynamically fill the scores from Tasks
63
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
64
 
65
+
66
  @dataclass(frozen=True)
67
  class EvalQueueColumn: # Queue column
68
  model = ColumnContent("model", "markdown", True)
 
73
  status = ColumnContent("status", "str", True)
74
 
75
 
76
+ # Define the human baselines
77
+ human_baseline_row = {
78
+ AutoEvalColumn.model.name: "<p>Human performance</p>",
79
+ AutoEvalColumn.revision.name: "N/A",
80
+ AutoEvalColumn.precision.name: None,
81
+ AutoEvalColumn.average.name: 100.0,
82
+ AutoEvalColumn.nqopen.name: 100.0,
83
+ AutoEvalColumn.triviaqa.name: 100.0,
84
+ AutoEvalColumn.dummy.name: "human_baseline",
85
+ AutoEvalColumn.model_type.name: "",
86
+ }
87
+
88
  @dataclass
89
  class ModelDetails:
90
  name: str
src/tools/plots.py CHANGED
@@ -93,7 +93,8 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
93
  df = df[df["task"].isin(metrics)]
94
 
95
  # Filter the human baselines based on the specified metrics
96
- # filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
97
 
98
  # Create a line figure using plotly express with specified markers and custom data
99
  fig = px.line(
@@ -128,6 +129,21 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
128
  for trace in fig.data:
129
  metric_color_mapping[trace.name] = trace.line.color
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  return fig
132
 
133
 
 
93
  df = df[df["task"].isin(metrics)]
94
 
95
  # Filter the human baselines based on the specified metrics
96
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE
97
+ filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
98
 
99
  # Create a line figure using plotly express with specified markers and custom data
100
  fig = px.line(
 
129
  for trace in fig.data:
130
  metric_color_mapping[trace.name] = trace.line.color
131
 
132
+ # Iterate over filtered human baselines and add horizontal lines to the figure
133
+ for metric, value in filtered_human_baselines.items():
134
+ color = metric_color_mapping.get(metric, "blue") # Retrieve color from mapping; default to blue if not found
135
+ location = "top left" if metric == "HellaSwag" else "bottom left" # Set annotation position
136
+ # Add horizontal line with matched color and positioned annotation
137
+ fig.add_hline(
138
+ y=value,
139
+ line_dash="dot",
140
+ annotation_text=f"{metric} human baseline",
141
+ annotation_position=location,
142
+ annotation_font_size=10,
143
+ annotation_font_color=color,
144
+ line_color=color,
145
+ )
146
+
147
  return fig
148
 
149