pminervini committed · Commit 90dff75 · 1 Parent(s): 669da77

update
- src/backend/envs.py  +3 -2
- src/display/utils.py  +0 -12
- src/tools/plots.py  +0 -19
src/backend/envs.py  CHANGED

@@ -20,8 +20,9 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task0 = Task("nq_open", "em", "NQ Open", 64)
-    task1 = Task("triviaqa", "em", "TriviaQA", 64)
+    task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
+    task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper
+    task2 = Task("truthfulqa:mc", "mc2", "TruthfulQA", 0)  # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
 
 
 # NUM_FEWSHOT = 64 # Change with your few shot
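For context, a minimal sketch of how these four-field Task entries might be declared and consumed. The actual Task type and its consumers live elsewhere in the repository, so the NamedTuple definition and the num_fewshot_by_task helper below are illustrative assumptions, not the repo's code.

from enum import Enum
from typing import NamedTuple


class Task(NamedTuple):
    # Assumed shape: lm-eval task key, metric key, leaderboard display name, few-shot count.
    benchmark: str
    metric: str
    col_name: str
    num_fewshot: int


class Tasks(Enum):
    task0 = Task("nq_open", "em", "NQ Open", 64)           # 64-shot, as in the ATLAS paper
    task1 = Task("triviaqa", "em", "TriviaQA", 64)          # 64-shot, as in the ATLAS paper
    task2 = Task("truthfulqa:mc", "mc2", "TruthfulQA", 0)   # zero-shot by design


# Hypothetical consumer: map each benchmark to its few-shot setting for the eval harness.
num_fewshot_by_task = {t.value.benchmark: t.value.num_fewshot for t in Tasks}
print(num_fewshot_by_task)  # {'nq_open': 64, 'triviaqa': 64, 'truthfulqa:mc': 0}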
src/display/utils.py  CHANGED

@@ -73,18 +73,6 @@ class EvalQueueColumn: # Queue column
     status = ColumnContent("status", "str", True)
 
 
-# Define the human baselines
-human_baseline_row = {
-    AutoEvalColumn.model.name: "<p>Human performance</p>",
-    AutoEvalColumn.revision.name: "N/A",
-    AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 100.0,
-    AutoEvalColumn.nqopen.name: 100.0,
-    AutoEvalColumn.triviaqa.name: 100.0,
-    AutoEvalColumn.dummy.name: "human_baseline",
-    AutoEvalColumn.model_type.name: "",
-}
-
 @dataclass
 class ModelDetails:
     name: str
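Removing the module-level human_baseline_row only works because its downstream import in src/tools/plots.py (next file) is dropped in the same commit; a stale import would otherwise fail at startup. A hypothetical illustration of that failure mode, not code from the repo:

# Hypothetical: any module still importing the removed name raises at import time,
# which would surface as a startup error for the Space.
try:
    from src.display.utils import human_baseline_row  # removed in this commit
except ImportError as exc:
    print(f"Stale reference detected: {exc}")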
src/tools/plots.py  CHANGED

@@ -92,10 +92,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
     # Filter the DataFrame based on the specified metrics
     df = df[df["task"].isin(metrics)]
 
-    # Filter the human baselines based on the specified metrics
-    from src.display.utils import human_baseline_row as HUMAN_BASELINE
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
-
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
@@ -129,21 +125,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
     for trace in fig.data:
         metric_color_mapping[trace.name] = trace.line.color
 
-    # Iterate over filtered human baselines and add horizontal lines to the figure
-    for metric, value in filtered_human_baselines.items():
-        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
-        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
-        # Add horizontal line with matched color and positioned annotation
-        fig.add_hline(
-            y=value,
-            line_dash="dot",
-            annotation_text=f"{metric} human baseline",
-            annotation_position=location,
-            annotation_font_size=10,
-            annotation_font_color=color,
-            line_color=color,
-        )
-
     return fig
 
 
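If baseline reference lines are ever reintroduced, the removed add_hline pattern can be reproduced with stock plotly. A minimal, standalone sketch on dummy data; the 100.0 value mirrors the removed human_baseline_row entries and is not a measured baseline:

import pandas as pd
import plotly.express as px

# Dummy data standing in for the leaderboard results frame.
df = pd.DataFrame({
    "date": ["2024-01-01", "2024-02-01", "2024-03-01"],
    "score": [41.0, 44.5, 47.2],
    "task": ["NQ Open"] * 3,
})

fig = px.line(df, x="date", y="score", color="task", markers=True)

# Dotted horizontal reference line, as the removed loop added per metric.
fig.add_hline(
    y=100.0,  # the removed human_baseline_row pinned NQ Open / TriviaQA at 100.0
    line_dash="dot",
    annotation_text="NQ Open human baseline",
    annotation_position="bottom left",
    annotation_font_size=10,
)
fig.show()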