pufanyi committed
Commit 046ddc7 · Parent: e2606ab

chore: Update Tasks enum values in about.py

Files changed (6):
  1. app.py +36 -41
  2. src/about.py +6 -5
  3. src/display/utils.py +10 -10
  4. src/envs.py +6 -8
  5. src/leaderboard/read_evals.py +2 -2
  6. src/populate.py +17 -9
app.py CHANGED
@@ -24,7 +24,7 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, GOOGLE_SHEET_ID
+from src.envs import API, EVAL_DETAILED_RESULTS_PATH, EVAL_RESULTS_PATH, EVAL_DETAILED_RESULTS_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -34,9 +34,9 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
+    print(EVAL_DETAILED_RESULTS_REPO)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=EVAL_DETAILED_RESULTS_REPO, local_dir=EVAL_DETAILED_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
@@ -49,13 +49,8 @@ except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(RESULTS_REPO, EVAL_RESULTS_PATH, "2024-06")
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -63,29 +58,29 @@ def init_leaderboard(dataframe):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="dropdown", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
+        # select_columns=SelectColumns(
+        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        #     label="Select Columns to Display:",
+        # ),
+        # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="dropdown", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
+        # bool_checkboxgroup_label="Hide models",
+        # interactive=False,
     )
 
 
@@ -101,15 +96,15 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
+    # with gr.Row():
+    #     with gr.Accordion("📙 Citation", open=False):
+    #         citation_button = gr.Textbox(
+    #             value=CITATION_BUTTON_TEXT,
+    #             label=CITATION_BUTTON_LABEL,
+    #             lines=20,
+    #             elem_id="citation-button",
+    #             show_copy_button=True,
+    #         )
 
     scheduler = BackgroundScheduler()
     scheduler.add_job(restart_space, "interval", seconds=1800)
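
Net effect on the UI code: with the selection, search, filter, and citation widgets commented out, init_leaderboard now renders a bare table. A minimal sketch of what remains, assuming the standard template's gradio_leaderboard import and its fields helper; the body of the empty-dataframe branch is not visible in this hunk and is taken from the template:

# Sketch only: what init_leaderboard effectively does after this commit.
# Leaderboard is assumed to come from the gradio_leaderboard package used by
# the template; fields() is the template's helper returning ColumnContent defaults.
from gradio_leaderboard import Leaderboard
from src.display.utils import AutoEvalColumn, fields


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        # Assumed from the template; this branch body is outside the hunk above.
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
    )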
src/about.py CHANGED
@@ -8,12 +8,13 @@ class Task:
     col_name: str
 
 
-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    basic_understanding = Task("Basic Understanding", "acc", "Basic Understanding")
+    contextual_analysis = Task("Contextual Analysis", "acc", "Contextual Analysis")
+    deeper_implications = Task("Deeper Implications", "acc", "Deeper Implications")
+    broader_implications = Task("Broader Implications", "acc", "Broader Implications")
+    further_insights = Task("Further Insights", "acc", "Further Insights")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
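
For orientation, each Tasks member is consumed elsewhere in the template roughly as below. The benchmark and metric field names are assumed from the standard Task dataclass (only col_name is visible in this hunk), so here the category string doubles as both the result key and the leaderboard column title:

# Sketch: how the new Tasks entries are read downstream (benchmark/metric are
# assumed template field names; col_name is the one shown above).
from src.about import Tasks

for task in Tasks:
    # e.g. task.name == "basic_understanding",
    #      task.value.benchmark == task.value.col_name == "Basic Understanding",
    #      task.value.metric == "acc"
    print(task.name, task.value.benchmark, task.value.metric, task.value.col_name)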
src/display/utils.py CHANGED
@@ -26,19 +26,19 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["Overall", ColumnContent, ColumnContent("Overall", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
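
With the metadata columns commented out, the generated AutoEvalColumn dataclass carries only the model identity columns, the new "Overall" score, and one column per Tasks entry. A quick sketch of the resulting column names, assuming the template's own fields helper:

# Sketch: column names exposed by the trimmed AutoEvalColumn (assumes the
# template's fields() helper, which returns the ColumnContent defaults).
from src.display.utils import AutoEvalColumn, fields

print([c.name for c in fields(AutoEvalColumn)])
# Expected, given the Tasks enum in this commit:
# ['T', 'Model', 'Overall', 'Basic Understanding', 'Contextual Analysis',
#  'Deeper Implications', 'Broader Implications', 'Further Insights']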
src/envs.py CHANGED
@@ -6,22 +6,20 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "lmms-lab" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+EVAL_DETAILED_RESULTS_REPO = f"{OWNER}/LiveBenchDetailedResults"
+RESULTS_REPO = f"{OWNER}/LiveBenchResults"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_DETAILED_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-detailed-results")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
-
-GOOGLE_SHEET_ID = "1uxHISx8UF6ykm6XH0yZdS35q808t0_Vu2vpEP8vLnHg"
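
For reference, with OWNER = "lmms-lab" and HF_HOME unset, the constants above resolve as follows (a worked example, not additional code in the repo):

# Resolved values of the constants defined in src/envs.py after this commit.
REPO_ID = "lmms-lab/leaderboard"
EVAL_DETAILED_RESULTS_REPO = "lmms-lab/LiveBenchDetailedResults"
RESULTS_REPO = "lmms-lab/LiveBenchResults"
EVAL_DETAILED_RESULTS_PATH = "./eval-detailed-results"  # os.path.join(".", "eval-detailed-results")
EVAL_RESULTS_PATH = "./eval-results"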
 
 
src/leaderboard/read_evals.py CHANGED
@@ -154,7 +154,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, detailed_results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -176,7 +176,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        eval_result.update_with_request_file(detailed_results_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
src/populate.py CHANGED
@@ -7,18 +7,26 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
+from datasets import load_dataset
+
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+# def get_leaderboard_df(results_path: str, detailed_results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+#     """Creates a dataframe from all the individual experiment results"""
+#     raw_data = get_raw_eval_results(results_path, detailed_results_path)
+#     all_data_json = [v.to_dict() for v in raw_data]
 
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+#     df = pd.DataFrame.from_records(all_data_json)
+#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+#     df = df[cols].round(decimals=2)
+#
+#     # filter out if any of the benchmarks have not been produced
+#     df = df[has_no_nan_values(df, benchmark_cols)]
+#     print(df)
+#     return df
+#
+def get_leaderboard_df(results_repo, results_path, dataset_version):
+    hf_leaderboard = load_dataset(results_repo, dataset_version, split="test", cache_dir=results_path)
+    df = hf_leaderboard.to_pandas()
     print(df)
     return df
 
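
Usage sketch: this is how app.py calls the rewritten function elsewhere in this commit. load_dataset treats "2024-06" as the dataset configuration name and caches it under the local results path before the test split is converted to pandas:

# Sketch of the call site (names come from src/envs.py in this commit).
from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO
from src.populate import get_leaderboard_df

# Downloads the "2024-06" config of lmms-lab/LiveBenchResults into
# EVAL_RESULTS_PATH and returns its "test" split as a pandas DataFrame.
LEADERBOARD_DF = get_leaderboard_df(RESULTS_REPO, EVAL_RESULTS_PATH, "2024-06")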