StarscreamDeceptions committed
Commit 70a352d
1 Parent(s): f359f0f

Update src/about.py

Files changed (1): src/about.py (+79 -57)
src/about.py CHANGED
@@ -15,27 +15,28 @@ class Tasks(Enum):
     # task0 = Task("mmmlu", "acc", "MMMLU")
     # task1 = Task("mmlu", "acc", "MMLU")
     # task2 = Task("cmmlu", "acc", "CMMLU")
-    mmmlu_ar = Task("mmmlu_ar", "acc", "MMMLU_AR")
-    mmmlu_bn = Task("mmmlu_bn", "acc", "MMMLU_BN")
-    mmmlu_de = Task("mmmlu_de", "acc", "MMMLU_DE")
-    mmmlu_es = Task("mmmlu_es", "acc", "MMMLU_ES")
-    mmmlu_fr = Task("mmmlu_fr", "acc", "MMMLU_FR")
-    mmmlu_hi = Task("mmmlu_hi", "acc", "MMMLU_HI")
-    mmmlu_id = Task("mmmlu_id", "acc", "MMMLU_ID")
-    mmmlu_it = Task("mmmlu_it", "acc", "MMMLU_IT")
-    mmmlu_ja = Task("mmmlu_ja", "acc", "MMMLU_JA")
-    mmmlu_ko = Task("mmmlu_ko", "acc", "MMMLU_KO")
-    mmmlu_pt = Task("mmmlu_pt", "acc", "MMMLU_PT")
-    mmmlu_sw = Task("mmmlu_sw", "acc", "MMMLU_SW")
-    mmmlu_yo = Task("mmmlu_yo", "acc", "MMMLU_YO")
-    mmmlu_zh = Task("mmmlu_zh", "acc", "MMMLU_ZH")
+    mmmlu_ar = Task("mmmlu_ar", "acc", "AR")
+    mmmlu_bn = Task("mmmlu_bn", "acc", "BN")
+    mmmlu_de = Task("mmmlu_de", "acc", "DE")
+    mmmlu_es = Task("mmmlu_es", "acc", "ES")
+    mmmlu_fr = Task("mmmlu_fr", "acc", "FR")
+    mmmlu_hi = Task("mmmlu_hi", "acc", "HI")
+    mmmlu_id = Task("mmmlu_id", "acc", "ID")
+    mmmlu_it = Task("mmmlu_it", "acc", "IT")
+    mmmlu_ja = Task("mmmlu_ja", "acc", "JA")
+    mmmlu_ko = Task("mmmlu_ko", "acc", "KO")
+    mmmlu_pt = Task("mmmlu_pt", "acc", "PT")
+    mmmlu_sw = Task("mmmlu_sw", "acc", "SW")
+    mmmlu_yo = Task("mmmlu_yo", "acc", "YO")
+    mmmlu_zh = Task("mmmlu_zh", "acc", "ZH")
 NUM_FEWSHOT = 5  # Change with your few shot
 # ---------------------------------------------------



 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Multilingual MMLU Benchmark Leaderboard</h1>"""
+TITLE = """<img src="https://raw.githubusercontent.com/BobTsang1995/Multilingual-MMLU-Benchmark-Leaderboard/main/static/title/title.png" style="width:30%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
+

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
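A note on the `Tasks` hunk above: in the stock Hugging Face leaderboard template that `src/about.py` is built on, `Task` is a small dataclass whose third field is only the column header displayed in the UI, so renaming `"MMMLU_AR"` to `"AR"` shortens the column labels without changing how scores are looked up. A minimal sketch, with the field names assumed from that template rather than shown in this diff:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key used to find this task's scores in the results files
    metric: str     # metric name within that task's results, e.g. "acc"
    col_name: str   # column header shown on the leaderboard ("AR", "BN", ...)

class Tasks(Enum):
    # One member per evaluated language; the diff above lists all fourteen.
    mmmlu_ar = Task("mmmlu_ar", "acc", "AR")
    mmmlu_zh = Task("mmmlu_zh", "acc", "ZH")
```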
@@ -49,9 +50,6 @@ INTRODUCTION_TEXT_ZH = """
 LLM_BENCHMARKS_TEXT = """
 ## 💡 About the "Multilingual MMLU Benchmark Leaderboard"

-- Press release: [TBD - XXX](#), [TBD - XXX](#), [TBD - XXX](#), [TBD - XXX](#)
-- YouTube: [TBD - XXX](#)
-
 ### Overview
 The **Multilingual Massive Multitask Language Understanding (MMMLU)** benchmark is a comprehensive evaluation platform designed to assess the general knowledge capabilities of AI models across a wide range of domains. It includes a series of **Question Answering (QA)** tasks across **57 distinct domains**, ranging from elementary-level knowledge to advanced professional subjects such as law, physics, history, and computer science.
@@ -107,48 +105,59 @@ Notes:

 You can find:

-- Detailed numerical results in the [results dataset](link_to_results)
-- Community queries and running status in the [requests dataset](link_to_requests)
+- Detailed numerical results in the [results dataset](https://huggingface.co/datasets/StarscreamDeceptions/results)
+- Community queries and running status in the [requests dataset](https://huggingface.co/datasets/StarscreamDeceptions/requests)

 ### ✅ Reproducibility

-To reproduce the results, you can use [our fork of lm_eval](#), as not all of our PRs are currently integrated into the main repository.
+To reproduce the results, you can use [our fork of opencompass](https://github.com/BobTsang1995/opencompass), as not all of our PRs are currently integrated into the main repository. Since many open-source models cannot fully follow instructions on QA tasks, we post-process the results with Qwen2.5-7B-Instruct to extract each model's answer (one of the options A, B, C, or D) from its raw output. Answer extraction is a relatively simple task, so the model's true choice can generally be recovered.
+```
+git clone git@github.com:BobTsang1995/opencompass.git
+cd opencompass
+pip install -e .
+pip install lmdeploy
+python run.py --models lmdeploy_qwen2_7b_instruct --datasets mmmlu_gen_5_shot -a lmdeploy
+```

 ## 🙌 Acknowledgements

-This leaderboard was developed as part of the [#ProjectName](link_to_project) led by [OrganizationName](link_to_organization) thanks to the donation of high-quality evaluation datasets by:
+This leaderboard was independently developed as a non-profit initiative with the support of several academic institutions, whose valuable assistance made this effort possible. We extend our heartfelt gratitude to them:

-- [Institution 1](link_to_institution_1)
-- [Institution 2](link_to_institution_2)
-- [Institution 3](link_to_institution_3)
-- [Institution 4](link_to_institution_4)
-- [Institution 5](link_to_institution_5)
-- [Institution 6](link_to_institution_6)
-- [Institution 7](link_to_institution_7)
-- [Institution 8](link_to_institution_8)
-- [Institution 9](link_to_institution_9)
+- [Technische Universität München (TUM)](https://www.tum.de/)
+- [Tsinghua University](https://www.tsinghua.edu.cn/en/)
+- [Universiteit van Amsterdam](https://uva.nl/)
+- [Mohamed Bin Zayed University of Artificial Intelligence](https://mbzuai.ac.ae/)
+- [University of Macau](https://www.um.edu.mo/)
+- [Cardiff University](https://www.cardiff.ac.uk/)
+- [Nara Institute of Science and Technology](https://www.naist.jp/en/)
+- [Shanghai Jiao Tong University](https://en.sjtu.edu.cn/)
+- [Dublin City University](https://www.dcu.ie/)
+- [Université Grenoble Alpes](https://www.univ-grenoble-alpes.fr/)
+- [Universidade de Coimbra](https://www.uc.pt/)
+- [The Ohio State University](https://www.osu.edu/)
+- [RMIT University](https://www.rmit.edu.au/)

 The entities above are ordered chronologically by the date they joined the project. However, the logos in the footer are ordered by the number of datasets donated.

 Thank you in particular to:
-- Task implementation: [Name 1], [Name 2], [Name 3], [Name 4], [Name 5], [Name 6], [Name 7], [Name 8], [Name 9], [Name 10]
-- Leaderboard implementation: [Name 11], [Name 12]
-- Model evaluation: [Name 13], [Name 14], [Name 15], [Name 16], [Name 17]
-- Communication: [Name 18], [Name 19]
-- Organization & colab leads: [Name 20], [Name 21], [Name 22], [Name 23], [Name 24], [Name 25], [Name 26], [Name 27], [Name 28], [Name 29], [Name 30]
+- Task implementation: Bo Zeng, Yue Zhao, Chengyang Lyu, Huifeng Yin
+- Leaderboard implementation: Bo Zeng, Longyue Wang
+- Model evaluation: Bo Zeng, Tianqi Shi, Fengye Liu, Lingfeng Ming, Xue Yang, Yiyu Wang
+- Communication: Longyue Wang, Weihua Luo, Kaifu Zhang
+- Organization & colab leads: Yi Zhou (Cardiff University), Yusuke Sakai (Nara Institute of Science and Technology), Yongxin Zhou (Université Grenoble Alpes), Haonan Li (MBZUAI), Jiahui Geng (MBZUAI), Qing Li (MBZUAI), Wenxi Li (Tsinghua University / Shanghai Jiao Tong University), Yuanyu Lin (University of Macau), Andy Way (Dublin City University), Zhuang Li (RMIT University), Zhongwei Wan (The Ohio State University), Di Wu (University of Amsterdam), Wen Lai (Technical University of Munich)

 For information about the dataset authors please check the corresponding Dataset Cards (linked in the "Tasks" tab) and papers (included in the "Citation" section below). We would like to specially thank the teams that created or open-sourced their datasets specifically for the leaderboard (in chronological order):
-- [Dataset1 Placeholder] and [Dataset2 Placeholder]: [Team members placeholder]
-- [Dataset3 Placeholder], [Dataset4 Placeholder] and [Dataset5 Placeholder]: [Team members placeholder]
-- [Dataset6 Placeholder]: [Team members placeholder]
+- [MMMLU](https://huggingface.co/datasets/openai/MMMLU): OpenAI

-We also thank [Institution1 Placeholder], [Institution2 Placeholder], [Organization Placeholder], [Person1 Placeholder], [Person2 Placeholder] and [Institution3 Placeholder] for sponsoring the inference GPUs.
+We also thank the MacroPolo Team and Alibaba International Digital Commerce for sponsoring the inference GPUs.

 ## 🚀 Collaborate!

 We would like to create a leaderboard as diverse as possible; reach out if you would like us to include your evaluation dataset!

-Comments and suggestions are more than welcome! Visit the [👏 Community](<Community Page Placeholder>) page, tell us what you think about MMMLU Leaderboard and how we can improve it, or go ahead and open a PR!
+Comments and suggestions are more than welcome! Visit the [👏 Multilingual-MMLU-Benchmark-Leaderboard discussions](https://huggingface.co/spaces/StarscreamDeceptions/Multilingual-MMLU-Benchmark-Leaderboard/discussions) page, tell us what you think about the MMMLU Leaderboard and how we can improve it, or go ahead and open a PR!

 Thank you very much! 💛

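The Qwen2.5-7B-Instruct post-processing step described in the Reproducibility text above is not included in this commit. A hypothetical sketch of what the answer extraction could look like, with the prompt wording and the regex fallback as assumptions:

```python
import re
from transformers import pipeline

# Judge model used to normalize free-form answers; weights download on first use.
extractor = pipeline("text-generation", model="Qwen/Qwen2.5-7B-Instruct")

def extract_choice(raw_output: str) -> str | None:
    """Return the option letter 'A'-'D' chosen in raw_output, or None."""
    # Cheap path first: many outputs already contain a bare option letter.
    m = re.search(r"\b([ABCD])\b", raw_output)
    if m:
        return m.group(1)
    # Otherwise ask the judge model to map the answer to a letter.
    prompt = (
        "Which option (A, B, C or D) does the following answer choose? "
        f"Reply with a single letter.\nAnswer: {raw_output}\nOption:"
    )
    reply = extractor(prompt, max_new_tokens=4, return_full_text=False)[0]["generated_text"]
    m = re.search(r"[ABCD]", reply)
    return m.group(0) if m else None
```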
@@ -292,7 +301,19 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
+CITATION_BUTTON_TEXT = r"""@misc{MultilingualMMLUBenchmarkLeaderboard2024,
+    author = {Bo Zeng and Tianqi Shi and Yefeng Liu and Lingfeng Ming and Xue Yang and Yiyu Wang and Yue Zhao and Chengyang Lyu and Huifeng Yin and Longyue Wang},
+    title = {Multilingual MMLU Benchmark Leaderboard},
+    year = {2024},
+    publisher = {Hugging Face},
+    howpublished = "\url{https://huggingface.co/spaces/StarscreamDeceptions/Multilingual-MMLU-Benchmark-Leaderboard}"
+}
+
+@article{hendrycks2020measuring,
+    title={Measuring massive multitask language understanding},
+    author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
+    journal={arXiv preprint arXiv:2009.03300},
+    year={2020}
+}
 """
 EVALUATION_QUEUE_TEXT_ZH = """
 ## 提交模型前的一些良好实践
@@ -320,20 +341,21 @@ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 模型失败时的处理
 如果你的模型出现在 FAILED 分类中,表示其执行停止。首先确保你已经遵循了上述步骤。如果一切都完成,检查你是否可以使用上面的命令在本地启动 EleutherAIHarness 来测试你的模型(你可以添加 --limit 来限制每个任务的示例数)。 """

-CITATION_BUTTON_LABEL = "复制以下代码引用这些结果"
-CITATION_BUTTON_TEXT = r"""
-"""
+# CITATION_BUTTON_LABEL = "复制以下代码引用这些结果"
+# CITATION_BUTTON_TEXT = r"""
+# """
 LOGOS = [
-    "logo/amsterdam-logo.png",
-    "logo/cardiff-logo.png",
-    "logo/coimbra-logo.png",
-    "logo/dcu-logo.png",
-    "logo/MBZU-logo.png",
-    "logo/NAIST-logo.png",
-    "logo/OSU-logo.png",
-    "logo/rmit.png",
-    "logo/sjtu-logo.png",
-    "logo/tsinghua-logo.png",
-    "logo/UGA-logo.png",
-    "logo/um-logo.png"
+    # "logo/amsterdam-logo.png",
+    # "logo/cardiff-logo.png",
+    # "logo/coimbra-logo.png",
+    # "logo/dcu-logo.png",
+    # "logo/MBZU-logo.png",
+    # "logo/NAIST-logo.png",
+    # "logo/OSU-logo.png",
+    # "logo/rmit.png",
+    # "logo/sjtu-logo.png",
+    # "logo/tsinghua-logo.png",
+    # "logo/UGA-logo.png",
+    # "logo/um-logo.png"
+    "logo/all.png"
 ]
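The `LOGOS` hunk swaps twelve per-institution images for a single combined `logo/all.png`. How the list is consumed is not part of this diff; one plausible rendering in the Space's Gradio app (everything below, including the `file=` URL scheme and the footer layout, is an assumption rather than the repository's actual code):

```python
import gradio as gr

LOGOS = ["logo/all.png"]

def footer_html(paths):
    """Concatenate <img> tags for each logo path into a centered strip."""
    imgs = "".join(
        f'<img src="file={p}" style="height:60px;margin:8px;">' for p in paths
    )
    return f'<div style="display:flex;justify-content:center;">{imgs}</div>'

with gr.Blocks() as demo:
    gr.HTML(footer_html(LOGOS))

if __name__ == "__main__":
    demo.launch(allowed_paths=["logo"])  # permit serving files from the logo/ dir
```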
 