Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Merge branch 'main' into select-column-accordion
Browse files- README.md +9 -3
- app.py +15 -15
- src/about.py +88 -5
- src/display/utils.py +1 -1
- style.css +2 -0
README.md
CHANGED
@@ -1,14 +1,20 @@
|
|
1 |
---
|
2 |
title: Open Japanese LLM Leaderboard
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
app_file: app.py
|
8 |
pinned: true
|
9 |
license: apache-2.0
|
10 |
sdk_version: 5.1.0
|
11 |
fullWidth: true
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
---
|
13 |
|
14 |
# Start the configuration
|
|
|
1 |
---
|
2 |
title: Open Japanese LLM Leaderboard
|
3 |
+
emoji: 🌸
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: gray
|
6 |
sdk: gradio
|
7 |
app_file: app.py
|
8 |
pinned: true
|
9 |
license: apache-2.0
|
10 |
sdk_version: 5.1.0
|
11 |
fullWidth: true
|
12 |
+
tags:
|
13 |
+
- 日本語
|
14 |
+
- Japanese
|
15 |
+
- leaderboard
|
16 |
+
- language:日本語
|
17 |
+
- language:Japanese
|
18 |
---
|
19 |
|
20 |
# Start the configuration
|
app.py
CHANGED
@@ -79,7 +79,7 @@ def filter_models(
|
|
79 |
add_special_tokens_query: list,
|
80 |
num_few_shots_query: list,
|
81 |
version_query: list,
|
82 |
-
backend_query: list,
|
83 |
) -> pd.DataFrame:
|
84 |
print(f"Initial df shape: {df.shape}")
|
85 |
print(f"Initial df content:\n{df}")
|
@@ -118,8 +118,8 @@ def filter_models(
|
|
118 |
print(f"After version filter: {filtered_df.shape}")
|
119 |
|
120 |
# Backend フィルタリング
|
121 |
-
filtered_df = filtered_df[filtered_df["Backend Library"].isin(backend_query)]
|
122 |
-
print(f"After backend filter: {filtered_df.shape}")
|
123 |
|
124 |
print("Filtered dataframe head:")
|
125 |
print(filtered_df.head())
|
@@ -188,7 +188,7 @@ def update_table(
|
|
188 |
add_special_tokens_query: list,
|
189 |
num_few_shots_query: list,
|
190 |
version_query: list,
|
191 |
-
backend_query: list,
|
192 |
query: str,
|
193 |
*columns,
|
194 |
):
|
@@ -206,7 +206,7 @@ def update_table(
|
|
206 |
add_special_tokens_query,
|
207 |
num_few_shots_query,
|
208 |
version_query,
|
209 |
-
backend_query,
|
210 |
)
|
211 |
print(f"filtered_df shape after filter_models: {filtered_df.shape}")
|
212 |
|
@@ -253,7 +253,7 @@ leaderboard_df = filter_models(
|
|
253 |
[i.value.name for i in AddSpecialTokens],
|
254 |
[i.value.name for i in NumFewShots],
|
255 |
[i.value.name for i in Version],
|
256 |
-
[i.value.name for i in Backend],
|
257 |
)
|
258 |
|
259 |
leaderboard_df_filtered = filter_models(
|
@@ -264,7 +264,7 @@ leaderboard_df_filtered = filter_models(
|
|
264 |
[i.value.name for i in AddSpecialTokens],
|
265 |
[i.value.name for i in NumFewShots],
|
266 |
[i.value.name for i in Version],
|
267 |
-
[i.value.name for i in Backend],
|
268 |
)
|
269 |
|
270 |
# DataFrameの初期化部分のみを修正
|
@@ -350,12 +350,12 @@ with gr.Blocks() as demo_leaderboard:
|
|
350 |
value=[i.value.name for i in Version],
|
351 |
elem_id="filter-columns-version",
|
352 |
)
|
353 |
-
filter_columns_backend = gr.CheckboxGroup(
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
)
|
359 |
|
360 |
# DataFrameコンポーネントの初期化
|
361 |
leaderboard_table = gr.Dataframe(
|
@@ -387,7 +387,7 @@ with gr.Blocks() as demo_leaderboard:
|
|
387 |
filter_columns_add_special_tokens.change,
|
388 |
filter_columns_num_few_shots.change,
|
389 |
filter_columns_version.change,
|
390 |
-
filter_columns_backend.change,
|
391 |
search_bar.submit,
|
392 |
]
|
393 |
+ [shown_columns.change for shown_columns in shown_columns_dict.values()],
|
@@ -400,7 +400,7 @@ with gr.Blocks() as demo_leaderboard:
|
|
400 |
filter_columns_add_special_tokens,
|
401 |
filter_columns_num_few_shots,
|
402 |
filter_columns_version,
|
403 |
-
filter_columns_backend,
|
404 |
search_bar,
|
405 |
]
|
406 |
+ [shown_columns for shown_columns in shown_columns_dict.values()],
|
|
|
79 |
add_special_tokens_query: list,
|
80 |
num_few_shots_query: list,
|
81 |
version_query: list,
|
82 |
+
# backend_query: list,
|
83 |
) -> pd.DataFrame:
|
84 |
print(f"Initial df shape: {df.shape}")
|
85 |
print(f"Initial df content:\n{df}")
|
|
|
118 |
print(f"After version filter: {filtered_df.shape}")
|
119 |
|
120 |
# Backend フィルタリング
|
121 |
+
# filtered_df = filtered_df[filtered_df["Backend Library"].isin(backend_query)]
|
122 |
+
# print(f"After backend filter: {filtered_df.shape}")
|
123 |
|
124 |
print("Filtered dataframe head:")
|
125 |
print(filtered_df.head())
|
|
|
188 |
add_special_tokens_query: list,
|
189 |
num_few_shots_query: list,
|
190 |
version_query: list,
|
191 |
+
# backend_query: list,
|
192 |
query: str,
|
193 |
*columns,
|
194 |
):
|
|
|
206 |
add_special_tokens_query,
|
207 |
num_few_shots_query,
|
208 |
version_query,
|
209 |
+
# backend_query,
|
210 |
)
|
211 |
print(f"filtered_df shape after filter_models: {filtered_df.shape}")
|
212 |
|
|
|
253 |
[i.value.name for i in AddSpecialTokens],
|
254 |
[i.value.name for i in NumFewShots],
|
255 |
[i.value.name for i in Version],
|
256 |
+
# [i.value.name for i in Backend],
|
257 |
)
|
258 |
|
259 |
leaderboard_df_filtered = filter_models(
|
|
|
264 |
[i.value.name for i in AddSpecialTokens],
|
265 |
[i.value.name for i in NumFewShots],
|
266 |
[i.value.name for i in Version],
|
267 |
+
# [i.value.name for i in Backend],
|
268 |
)
|
269 |
|
270 |
# DataFrameの初期化部分のみを修正
|
|
|
350 |
value=[i.value.name for i in Version],
|
351 |
elem_id="filter-columns-version",
|
352 |
)
|
353 |
+
# filter_columns_backend = gr.CheckboxGroup(
|
354 |
+
# label="Backend Library",
|
355 |
+
# choices=[i.value.name for i in Backend],
|
356 |
+
# value=[i.value.name for i in Backend],
|
357 |
+
# elem_id="filter-columns-backend",
|
358 |
+
# )
|
359 |
|
360 |
# DataFrameコンポーネントの初期化
|
361 |
leaderboard_table = gr.Dataframe(
|
|
|
387 |
filter_columns_add_special_tokens.change,
|
388 |
filter_columns_num_few_shots.change,
|
389 |
filter_columns_version.change,
|
390 |
+
# filter_columns_backend.change,
|
391 |
search_bar.submit,
|
392 |
]
|
393 |
+ [shown_columns.change for shown_columns in shown_columns_dict.values()],
|
|
|
400 |
filter_columns_add_special_tokens,
|
401 |
filter_columns_num_few_shots,
|
402 |
filter_columns_version,
|
403 |
+
# filter_columns_backend,
|
404 |
search_bar,
|
405 |
]
|
406 |
+ [shown_columns for shown_columns in shown_columns_dict.values()],
|
src/about.py
CHANGED
@@ -36,16 +36,16 @@ class Tasks(Enum):
|
|
36 |
EL = Task(
|
37 |
"scores", "EL", "EL - エンティティリンキング", TaskType.EL, True
|
38 |
) # Entity Linking - エンティティリンキング
|
39 |
-
FA = Task("scores", "FA", "FA -
|
40 |
-
HE = Task("scores", "HE", "HE -
|
41 |
MC = Task(
|
42 |
-
"scores", "MC", "MC -
|
43 |
-
) # Multiple Choice question answering -
|
44 |
MR = Task("scores", "MR", "MR - 数学的推論", TaskType.MR, True) # Mathematical Reasoning - 数学的推論
|
45 |
MT = Task("scores", "MT", "MT - 機械翻訳", TaskType.MT, True) # Machine Translation - 機械翻訳
|
46 |
NLI = Task("scores", "NLI", "NLI - 自然言語推論", TaskType.NLI, True) # Natural Language Inference - 自然言語推論
|
47 |
QA = Task("scores", "QA", "QA - 質問応答", TaskType.QA, True) # Question Answering - 質問応答
|
48 |
-
RC = Task("scores", "RC", "RC -
|
49 |
SUM = Task("scores", "SUM", "SUM - 要約", TaskType.SUM, True) # Summarization - 要約
|
50 |
alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
|
51 |
alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
|
@@ -225,6 +225,89 @@ To reproduce our results, please follow the instructions of the evaluation tool,
|
|
225 |
|
226 |
"""
|
227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
EVALUATION_QUEUE_TEXT = """
|
229 |
## Some good practices before submitting a model
|
230 |
|
|
|
36 |
EL = Task(
|
37 |
"scores", "EL", "EL - エンティティリンキング", TaskType.EL, True
|
38 |
) # Entity Linking - エンティティリンキング
|
39 |
+
FA = Task("scores", "FA", "FA - 基礎解析", TaskType.FA, True) # Fundamental Analysis - 基礎解析
|
40 |
+
HE = Task("scores", "HE", "HE - 試験問題", TaskType.HE, True) # Human Examination - 試験問題
|
41 |
MC = Task(
|
42 |
+
"scores", "MC", "MC - 多肢選択式問題", TaskType.MC, True
|
43 |
+
) # Multiple Choice question answering - 多肢選択式問題
|
44 |
MR = Task("scores", "MR", "MR - 数学的推論", TaskType.MR, True) # Mathematical Reasoning - 数学的推論
|
45 |
MT = Task("scores", "MT", "MT - 機械翻訳", TaskType.MT, True) # Machine Translation - 機械翻訳
|
46 |
NLI = Task("scores", "NLI", "NLI - 自然言語推論", TaskType.NLI, True) # Natural Language Inference - 自然言語推論
|
47 |
QA = Task("scores", "QA", "QA - 質問応答", TaskType.QA, True) # Question Answering - 質問応答
|
48 |
+
RC = Task("scores", "RC", "RC - 文章読解", TaskType.RC, True) # Reading Comprehension - 文章読解
|
49 |
SUM = Task("scores", "SUM", "SUM - 要約", TaskType.SUM, True) # Summarization - 要約
|
50 |
alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
|
51 |
alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
|
|
|
225 |
|
226 |
"""
|
227 |
|
228 |
+
LLM_BENCHMARKS_TEXT_JP = """
|
229 |
+
## 仕組み
|
230 |
+
📈 私たちは評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
|
231 |
+
|
232 |
+
**NLI(自然言語推論)**
|
233 |
+
|
234 |
+
* `Jamp`、時間推論に焦点を当てた日本語NLIベンチマーク [ソース](https://github.com/tomo-ut/temporalNLI_dataset)(ライセンス CC BY-SA 4.0)
|
235 |
+
|
236 |
+
* `JaNLI`、日本語の敵対的推論データセット [ソース](https://github.com/verypluming/JaNLI)(ライセンス CC BY-SA 4.0)
|
237 |
+
|
238 |
+
* `JNLI`、日本語自然言語推論(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
239 |
+
|
240 |
+
* `JSeM`、日本語意味論テストセット [ソース](https://github.com/DaisukeBekki/JSeM)(ライセンス BSD 3-Clause)
|
241 |
+
|
242 |
+
* `JSICK`、構成的知識を含む日本語文 [ソース](https://github.com/verypluming/JSICK)(ライセンス CC BY-SA 4.0)
|
243 |
+
|
244 |
+
**QA(質問応答)**
|
245 |
+
|
246 |
+
* `JEMHopQA`、日本語の説明可能なマルチホップ質問応答 [ソース](https://github.com/aiishii/JEMHopQA)(ライセンス CC BY-SA 4.0)
|
247 |
+
|
248 |
+
* `NIILC`、NIILC質問応答データセット [ソース](https://github.com/mynlp/niilc-qa)(ライセンス CC BY-SA 4.0)
|
249 |
+
|
250 |
+
* `JAQKET`、クイズを題材とした日本語QAデータセット [ソース](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/)(ライセンス CC BY-SA 4.0 - 企業利用には別途ライセンスが必要)
|
251 |
+
|
252 |
+
**RC(読解)**
|
253 |
+
|
254 |
+
* `JSQuAD`、SQuADの日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
255 |
+
|
256 |
+
**MC(選択式質問応答)**
|
257 |
+
|
258 |
+
* `JCommonsenseMorality`、常識的な道徳理解を評価する日本語データセット [ソース](https://github.com/Language-Media-Lab/commonsense-moral-ja)(ライセンス MIT License)
|
259 |
+
|
260 |
+
* `JCommonsenseQA`、CommonsenseQAの日本語版 [ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
261 |
+
|
262 |
+
* `KUCI`、京都大学常識推論データセット [ソース](https://github.com/ku-nlp/KUCI)(ライセンス CC BY-SA 4.0)
|
263 |
+
|
264 |
+
**EL(エンティティリンキング)**
|
265 |
+
|
266 |
+
* `chABSA`、アスペクトベースの感情分析データセット [ソース](https://github.com/chakki-works/chABSA-dataset)(ライセンス CC BY-SA 4.0)
|
267 |
+
|
268 |
+
**FA(基礎解析)**
|
269 |
+
|
270 |
+
* `Wikipedia Annotated Corpus`、[ソース](https://github.com/ku-nlp/WikipediaAnnotatedCorpus)(ライセンス CC BY-SA 4.0)
|
271 |
+
|
272 |
+
タスク一覧:(読み推定、固有表現認識(NER)、依存構造解析、述語項構造解析(PAS)、共参照解析)
|
273 |
+
|
274 |
+
**MR(数学的推論)**
|
275 |
+
|
276 |
+
* `MAWPS`、MAWPS(A Math Word Problem Repository)の日本語版 [ソース](https://github.com/nlp-waseda/chain-of-thought-ja-dataset)(ライセンス Apache-2.0)
|
277 |
+
|
278 |
+
* `MGSM`、MGSM(Multilingual Grade School Math Benchmark)の日本語部分 [ソース](https://huggingface.co/datasets/juletxara/mgsm)(ライセンス MIT License)
|
279 |
+
|
280 |
+
**MT(機械翻訳)**
|
281 |
+
|
282 |
+
* `ALT`、アジア言語ツリーバンク(ALT) - 並列コーパス [ソース](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/index.html)(ライセンス CC BY-SA 4.0)
|
283 |
+
|
284 |
+
* `WikiCorpus`、京都市に関するWikipedia記事の日本語-英語対訳コーパス [ソース](https://alaginrc.nict.go.jp/WikiCorpus/)(ライセンス CC BY-SA 3.0)
|
285 |
+
|
286 |
+
**STS(意味的テキスト類似度)**
|
287 |
+
|
288 |
+
このタスクはllm-jp-evalでサポートされていますが、評価スコアの平均には含まれていません。
|
289 |
+
|
290 |
+
* `JSTS`、STS(Semantic Textual Similarity)の日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
|
291 |
+
|
292 |
+
**HE(試験問題)**
|
293 |
+
|
294 |
+
* `MMLU`、大規模マルチタスク言語理解の測定 [ソース](https://github.com/hendrycks/test)(ライセンス MIT License)
|
295 |
+
|
296 |
+
* `JMMLU`、日本語大規模マルチタスク言語理解ベンチマーク [ソース](https://github.com/nlp-waseda/JMMLU)(ライセンス CC BY-SA 4.0(3つのタスクはCC BY-NC-ND 4.0ライセンス))
|
297 |
+
|
298 |
+
**CG(コード生成)**
|
299 |
+
|
300 |
+
* `MBPP`、Mostly Basic Python Problems(MBPP)の日本語版 [ソース](https://huggingface.co/datasets/llm-jp/mbpp-ja)(ライセンス CC BY-SA 4.0)
|
301 |
+
|
302 |
+
**SUM(要約)**
|
303 |
+
|
304 |
+
* `XL-Sum`、44言語の大規模な多言語抽象要約 [ソース](https://github.com/csebuetnlp/xl-sum)(ライセンス CC BY-NC-SA 4.0。非商用ライセンスのため、ライセンスと利用規約に明示的に同意した場合を除き、このデータセットは使用しません)
|
305 |
+
|
306 |
+
## 再現性
|
307 |
+
私たちの結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
|
308 |
+
"""
|
309 |
+
|
310 |
+
|
311 |
EVALUATION_QUEUE_TEXT = """
|
312 |
## Some good practices before submitting a model
|
313 |
|
src/display/utils.py
CHANGED
@@ -56,7 +56,7 @@ auto_eval_column_dict.append(["add_special_tokens", ColumnContent, ColumnContent
|
|
56 |
auto_eval_column_dict.append(
|
57 |
["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
|
58 |
)
|
59 |
-
auto_eval_column_dict.append(["backend", ColumnContent, ColumnContent("Backend Library", "str", False)])
|
60 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
61 |
|
62 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
|
56 |
auto_eval_column_dict.append(
|
57 |
["llm_jp_eval_version", ColumnContent, ColumnContent("llm-jp-eval version", "str", False)]
|
58 |
)
|
59 |
+
auto_eval_column_dict.append(["backend", ColumnContent, ColumnContent("Backend Library", "str", False, dummy=True)])
|
60 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
61 |
|
62 |
# We use make dataclass to dynamically fill the scores from Tasks
|
style.css
CHANGED
@@ -6,6 +6,8 @@
|
|
6 |
}
|
7 |
|
8 |
/* Hides the final AutoEvalColumn */
|
|
|
|
|
9 |
#llm-benchmark-tab-table table td:last-child,
|
10 |
#llm-benchmark-tab-table table th:last-child {
|
11 |
display: none;
|
|
|
6 |
}
|
7 |
|
8 |
/* Hides the final two AutoEvalColumns (the last and second-to-last table columns) */
|
9 |
+
#llm-benchmark-tab-table table td:nth-last-child(2),
|
10 |
+
#llm-benchmark-tab-table table th:nth-last-child(2),
|
11 |
#llm-benchmark-tab-table table td:last-child,
|
12 |
#llm-benchmark-tab-table table th:last-child {
|
13 |
display: none;
|