brainz committed
Commit e48ac8d • Parent: 9b1894c

update space

Files changed (3)
  1. app.py +23 -21
  2. src/about.py +2 -2
  3. src/leaderboard/read_evals.py +6 -5
app.py CHANGED
@@ -36,7 +36,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-## Space initialisation
+# Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -54,6 +54,7 @@ except Exception:
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+# print("Before calling init_leaderboard:", LEADERBOARD_DF)
 
 (
     finished_eval_queue_df,
@@ -61,6 +62,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -74,20 +76,20 @@ def init_leaderboard(dataframe):
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -97,7 +99,7 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
@@ -106,16 +108,16 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
+            # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+            #     gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                    with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                        gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
+                    # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+                    #     gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
 
             with gr.Column():
                 with gr.Accordion(
@@ -221,4 +223,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(share=True)
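
Note on the last hunk: `share=True` only adds a temporary public `*.gradio.live` URL on top of the Space's own endpoint; the half-hourly restart scheduling is unchanged. A minimal sketch of this launch pattern, assuming `API` is a `huggingface_hub.HfApi` instance and `REPO_ID` names this Space (in the real file both come from the project's env/config module; the repo id below is a hypothetical placeholder):

# Sketch only: assumes an authenticated HfApi token is available for restart_space.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi()
REPO_ID = "owner/space-name"  # hypothetical placeholder

def restart_space():
    API.restart_space(repo_id=REPO_ID)

with gr.Blocks() as demo:
    gr.Markdown("leaderboard UI goes here")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # restart every 30 minutes
scheduler.start()
# share=True additionally exposes a temporary public gradio.live link
demo.queue(default_concurrency_limit=40).launch(share=True)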
src/about.py CHANGED
@@ -13,8 +13,8 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mmmlu", "acc", "MMMLU")
-    task1 = Task("mmlu", "acc", "MMLU")
-    task2 = Task("cmmlu", "acc", "CMMLU")
+    # task1 = Task("mmlu", "acc", "MMLU")
+    # task2 = Task("cmmlu", "acc", "CMMLU")
     task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")
     task4 = Task("mmmlu_bn", "acc", "MMMLU_BN")
     task5 = Task("mmmlu_de", "acc", "MMMLU_DE")
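
For context, a sketch of the `Task`/`Tasks` pattern this file follows (field names as in the standard leaderboard template, so treat them as assumptions): downstream code derives the benchmark columns by iterating the enum, so commenting an entry out removes its column and drops it from the averaged score.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results json
    metric: str      # metric key in the results json
    col_name: str    # column name shown on the leaderboard

class Tasks(Enum):
    task0 = Task("mmmlu", "acc", "MMMLU")
    # task1 = Task("mmlu", "acc", "MMLU")    # dropped by this commit
    # task2 = Task("cmmlu", "acc", "CMMLU")  # dropped by this commit
    task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")

# Typical derivation of the leaderboard columns from the enum:
BENCHMARK_COLS = [task.value.col_name for task in Tasks]
print(BENCHMARK_COLS)  # ['MMMLU', 'MMMLU_AR']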
src/leaderboard/read_evals.py CHANGED
@@ -96,7 +96,7 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model.split("/")[-1], self.precision.value.name)
-        # print("########",request_file)
+        # print("########",requests_path,self.full_model.split("/")[-1])
 
         try:
             with open(request_file, "r") as f:
@@ -112,9 +112,10 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
-        average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
-            keys_to_average)
+        # keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
+        # average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
+        #     keys_to_average)
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -182,6 +183,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        print(results_path)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -198,5 +200,4 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
             results.append(v)
         except KeyError:  # not all eval values present
             continue
-
     return results
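
The averaging change in `to_dict()` is the functional core of this commit: instead of averaging the three hard-coded keys `mmmlu`, `mmlu`, `cmmlu`, the mean is now taken over every non-None score and divided by `len(Tasks)`, so a model missing a task score is still penalised in the denominator. A toy illustration with made-up numbers and a stand-in `Tasks` enum:

from enum import Enum

class Tasks(Enum):  # stand-in for src/about.py; assume three tasks remain
    task0 = "mmmlu"
    task3 = "mmmlu_ar"
    task4 = "mmmlu_bn"

results = {"mmmlu": 0.60, "mmmlu_ar": 0.40, "mmmlu_bn": None}  # one missing score

# Old behaviour: average over a hard-coded key list, denominator fixed at 3.
keys_to_average = ["mmmlu", "mmlu", "cmmlu"]
old_average = sum(results[k] for k in keys_to_average if results.get(k) is not None) / len(keys_to_average)

# New behaviour: sum every non-None score, divide by the number of tasks.
new_average = sum(v for v in results.values() if v is not None) / len(Tasks)

print(old_average, new_average)  # about 0.2 vs about 0.33; missing scores still deflate the mean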