hysts (HF staff) committed
Commit 99a4ea0 · 1 parent: 560790f

Use preprocessed table dataset (WIP)

Files changed (4):
  1. app.py +3 -14
  2. src/envs.py +1 -2
  3. src/leaderboard/read_evals.py +0 -233
  4. src/populate.py +28 -7
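
In short: instead of snapshotting the raw results repo and parsing per-run JSON files (src/leaderboard/read_evals.py, now deleted), the app loads a single preprocessed table dataset from the Hub and renames its columns for display. A minimal sketch of the new loading path, assuming the `llm-jp/leaderboard-contents` dataset (what CONTENTS_REPO resolves to) exposes a `train` split with one row per evaluation run:

    import datasets

    # Load the preprocessed leaderboard table published as a Hub dataset;
    # this is the call the new get_leaderboard_df() builds on.
    df = datasets.load_dataset("llm-jp/leaderboard-contents", split="train").to_pandas()

    # Columns such as "model", "model_type", "params" and the per-task metrics
    # arrive already flattened, so no per-run JSON parsing is needed.
    print(len(df), "rows")
    print(df.columns.tolist())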
app.py CHANGED
@@ -37,7 +37,7 @@ from src.display.utils import (
     VllmVersion,
     fields,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, CONTENTS_REPO, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID
 from src.i18n import (
     CITATION_ACCORDION_LABEL,
     CITATION_ACCORDION_LABEL_JA,
@@ -68,17 +68,6 @@ try:
     )
 except Exception:
     restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        local_dir=EVAL_RESULTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-    )
-except Exception:
-    restart_space()


 # Get dataframes
@@ -90,7 +79,7 @@ except Exception:
     FAILED_EVAL_QUEUE_DF,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-ORIGINAL_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ORIGINAL_DF = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)
 MAX_MODEL_SIZE = ORIGINAL_DF["#Params (B)"].max()


@@ -316,7 +305,7 @@ def plot_size_vs_score(df_filtered: pd.DataFrame) -> go.Figure:
     df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
     df[AVG_COLUMNS] = df[AVG_COLUMNS].astype(float)
     df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
-    df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"] + "-shot)"
+    df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
     df = pd.melt(
         df,
         id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],
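
A note on the last app.py hunk above: the added `.astype(str)` suggests that, in the preprocessed table, the few-shot count is stored as a number rather than a string, so it must be cast before being concatenated into the legend label. A small pandas sketch with hypothetical values (not leaderboard data):

    import pandas as pd

    # Hypothetical rows mimicking the columns after the rename step in plot_size_vs_score.
    df = pd.DataFrame({"Model": ["org-a/model-x", "org-b/model-y"], "n-shot": [0, 4]})

    # With an integer "n-shot" column, concatenating without a cast raises a TypeError;
    # casting to str reproduces the "<model> (N-shot)" legend label.
    df["model_name_without_org_name"] = (
        df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
    )
    print(df["model_name_without_org_name"].tolist())
    # ['model-x (0-shot)', 'model-y (4-shot)']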
src/envs.py CHANGED
@@ -11,14 +11,13 @@ OWNER = "llm-jp" # Change to your org - don't forget to create a results and re

 REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
 QUEUE_REPO = f"{OWNER}/leaderboard-requests"
-RESULTS_REPO = f"{OWNER}/leaderboard-contents"
+CONTENTS_REPO = f"{OWNER}/leaderboard-contents"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")

 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
src/leaderboard/read_evals.py DELETED
@@ -1,233 +0,0 @@
-import glob
-import json
-import os
-from dataclasses import dataclass
-from decimal import Decimal
-
-import dateutil
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Backend, ModelType, Tasks, Version, WeightType
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
-
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    # precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    precision: str = "Unknown"
-    # model_type: str = "Unknown"
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    num_few_shots: str = "0"
-    add_special_tokens: str = ""
-    llm_jp_eval_version: str = ""
-    vllm_version: str = ""
-    backend: str = ""
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-        metainfo = config.get("metainfo", {})
-        model_config = config.get("model", {})
-
-        # Get model type from metainfo
-        # model_type_str = metainfo.get("model_type", "")
-        # model_type = ModelType.from_str(model_type_str)
-        # model_type = metainfo.get("model_type", "Unknown")
-
-        # Get num_few_shots from metainfo
-        num_few_shots = str(metainfo.get("num_few_shots", 0))
-
-        # Precision
-        # precision = Precision.from_str(config.get("dtype"))
-        precision = model_config.get("dtype", "Unknown")
-
-        # Add Special Tokens
-        add_special_tokens = str(
-            config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
-        )
-
-        version = Version.from_str(metainfo.get("version", "?")).value.name
-
-        # Get vllm version from metainfo
-        vllm_version = metainfo.get("vllm-version", "")
-
-        backend = Backend.from_str(model_config.get("_target_", "?").split(".")[0]).value.name
-        revision = model_config.get("revision", "")
-
-        # Get model and org
-        # org_and_model = config.get("model_name", config.get("offline_inference").get("model_name", None))
-        org_and_model = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))
-        org_and_model = org_and_model.split("/", 1)
-
-        # If org_and_model is a list, join it with "/"
-        if isinstance(org_and_model, list):
-            full_model = "/".join(org_and_model)
-        else:
-            full_model = org_and_model
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            # result_key = f"{model}_{precision.value.name}"
-            result_key = f"{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            # result_key = f"{org}_{model}_{precision.value.name}"
-            result_key = f"{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}"
-            full_model = "/".join(org_and_model)
-
-        if "scores" not in data:
-            raise KeyError(f"'scores' key not found in JSON file: {json_filepath}")
-
-        scores = data["scores"]
-        results = {}
-        for task in Tasks:
-            task_value = task.value
-            score = scores.get(task_value.metric)
-            results[task_value.metric] = score
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=revision,
-            num_few_shots=num_few_shots,
-            add_special_tokens=add_special_tokens,
-            llm_jp_eval_version=version,
-            vllm_version=vllm_version,
-            backend=backend,
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-            self.architecture = request.get("architecture", "?")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision}")
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.dummy.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            # AutoEvalColumn.average.name: None,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.num_few_shots.name: self.num_few_shots,
-            AutoEvalColumn.add_special_tokens.name: self.add_special_tokens,
-            AutoEvalColumn.llm_jp_eval_version.name: self.llm_jp_eval_version,
-            AutoEvalColumn.vllm_version.name: self.vllm_version,
-            AutoEvalColumn.backend.name: self.backend,
-        }
-
-        # for task in Tasks:
-        #     task_value = task.value
-        #     data_dict[task_value.col_name] = self.results.get(task_value.benchmark, None)
-        for task in Tasks:
-            task_value = task.value
-            value = self.results.get(task_value.metric)
-            data_dict[task_value.col_name] = Decimal(value)
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))

-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-        # print(f"Processing file: {model_result_filepath}")
-        # print(f"Eval result: {eval_result.to_dict()}")
-
-    return results
src/populate.py CHANGED
@@ -1,19 +1,40 @@
 import json
 import os
+from decimal import Decimal

+import datasets
 import pandas as pd

+from src.about import Tasks
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(contents_repo: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
+    df = datasets.load_dataset(contents_repo, split="train").to_pandas()
+    df["Model"] = df["model"].map(make_clickable_model)
+    df["T"] = df["model_type"].map(lambda x: x.split(":")[0].strip())
+    df["Type"] = df["model_type"].map(lambda x: x.split(":")[1].strip())
+    df["Backend Library"] = "vllm"
+    df = df.rename(columns={task.value.metric: task.value.col_name for task in Tasks})
+    df = df.rename(
+        columns={
+            "architecture": "Architecture",
+            "weight_type": "Weight type",
+            "precision": "Precision",
+            "license": "Hub License",
+            "params": "#Params (B)",
+            "likes": "Hub ❤️",
+            "revision": "Revision",
+            "num_few_shot": "Few-shot",
+            "add_special_tokens": "Add Special Tokens",
+            "llm_jp_eval_version": "llm-jp-eval version",
+            "vllm_version": "vllm version",
+            "model": "model_name_for_query",
+        }
+    )
+    df[[task.value.col_name for task in Tasks]] = df[[task.value.col_name for task in Tasks]].map(lambda x: Decimal(x))

     # Add a row ID column
     df[AutoEvalColumn.row_id.name] = range(len(df))
@@ -32,7 +53,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     existing_score_cols = [col for col in score_cols if col in df.columns]

     # Divide the score columns by 100 and format them as .4f strings
-    df[existing_score_cols] = (df[existing_score_cols] / 100).applymap(lambda x: f"{x:.4f}")
+    df[existing_score_cols] = (df[existing_score_cols] / 100).map(lambda x: f"{x:.4f}")
     df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
     df = df[cols].round(decimals=2)
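
One detail in the last populate.py hunk: the switch from `applymap` to `map` matches pandas 2.1, where `DataFrame.applymap` was deprecated in favour of the element-wise `DataFrame.map`; the formatting itself is unchanged. A self-contained illustration of that step (column names and values are illustrative, not real leaderboard scores):

    import pandas as pd

    # Score columns are divided by 100 and formatted to four decimals, as in get_leaderboard_df.
    scores = pd.DataFrame({"AVG": [55.12, 61.07], "EL": [48.3, 52.9]})

    # pandas >= 2.1: DataFrame.map replaces the deprecated DataFrame.applymap.
    formatted = (scores / 100).map(lambda x: f"{x:.4f}")
    print(formatted.iloc[0].tolist())  # ['0.5512', '0.4830']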