Ali-C137 committed on
Commit
249112c
1 Parent(s): 866ba51

Update src/about.py

Browse files
Files changed (1) hide show
  1. src/about.py +104 -5
src/about.py CHANGED
@@ -35,26 +35,64 @@ NUM_FEWSHOT = 0 # Change with your few shot
35
 
36
 
37
  # Your leaderboard name
38
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
 
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
- Intro text
 
 
 
 
43
  """
44
 
45
  # Which evaluations are you running? how can people reproduce what you have?
46
  LLM_BENCHMARKS_TEXT = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  ## How it works
 
 
 
 
 
 
 
48
 
49
- ## Reproducibility
50
- To reproduce our results, here is the commands you can run:
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """
53
 
54
  EVALUATION_QUEUE_TEXT = """
55
  ## Some good practices before submitting a model
56
 
57
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 
58
  ```python
59
  from transformers import AutoConfig, AutoModel, AutoTokenizer
60
  config = AutoConfig.from_pretrained("your model name", revision=revision)
@@ -78,9 +116,70 @@ When we add extra information about models to the leaderboard, it will be automa
78
  ## In case of model failure
79
  If your model is displayed in the `FAILED` category, its execution stopped.
80
  Make sure you have followed the above steps first.
81
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
82
  """
83
 
84
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
85
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  """
 
35
 
36
 
37
  # Your leaderboard name
38
+ TITLE = """<h1 align="center" id="space-title">Open Arabic LLM Leaderboard</h1>"""
39
+ # TITLE = """<img src="image.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
40
 
41
  # What does your leaderboard evaluate?
42
  INTRODUCTION_TEXT = """
43
+ 🚀 The Open Arabic LLM Leaderboard : Objectively evaluates and compares the performance of Arabic Large Language Models (LLMs).
44
+
45
+ When you submit a model on the "Submit here!" page, it is automatically evaluated on a set of benchmarks. The GPU used for evaluation is operated with the support of __[Technology Innovation Institute (TII)](https://www.tii.ae/)__.
46
+ The datasets used for evaluation consist of datasets that are Arabic-native, like the `AlGhafa` benchmark from [TII](https://www.tii.ae/) and the `ACVA` benchmark from [FreedomIntelligence](https://huggingface.co/FreedomIntelligence), to assess reasoning, language understanding, commonsense, and more.
47
+ More details about the benchmarks and the evaluation process are provided on the “About” page.
48
  """
49
 
50
  # Which evaluations are you running? how can people reproduce what you have?
51
  LLM_BENCHMARKS_TEXT = f"""
52
+ # Context
53
+ While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Open Arabic LLM Leaderboard (OALL), to evaluate models that reflect the characteristics of the Arabic language, culture and heritage. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in the Arab region 🔥.
54
+
55
+ ## Icons
56
+ {ModelType.PT.to_str(" : ")} model
57
+ {ModelType.IFT.to_str(" : ")} model
58
+ {ModelType.RL.to_str(" : ")} model
59
+ If the icon is "?", it indicates that there is insufficient information about the model.
60
+ Please provide information about the model through an issue! 🤩
61
+
62
+ Note : Some models might get selected as a subject of caution by the community, implying that users should exercise restraint when using them.
63
+ (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
64
+
65
  ## How it works
66
+ 📈 We evaluate models using the impressive [LightEval](https://github.com/huggingface/lighteval), a unified and straightforward framework from the HuggingFace Eval Team to test and assess causal language models on a large number of different evaluation tasks.
67
+ We have set up a benchmark using datasets, most of them translated to Arabic, and validated by native Arabic speakers. We also added `AlGhafa`, a new benchmark prepared from scratch natively for Arabic, alongside the `ACVA` benchmark introduced in the [AceGPT](https://arxiv.org/abs/2309.12053) paper by [FreedomIntelligence](https://huggingface.co/FreedomIntelligence).
68
+
69
+ Find below the Native benchmarks :
70
+
71
+ - AlGhafa : Find more details [here](https://aclanthology.org/2023.arabicnlp-1.21.pdf) - (provided by [TII](https://www.tii.ae/))
72
+ - Arabic-Culture-Value-Alignment (ACVA) : Find more details [here](https://arxiv.org/pdf/2309.12053) - (provided by [FreedomIntelligence](https://huggingface.co/FreedomIntelligence))
73
 
 
 
74
 
75
+ And here find all the translated benchmarks provided by the Language evaluation team at [Technology Innovation Institute](https://www.tii.ae/) :
76
+
77
+ - `Arabic-MMLU`, `Arabic-EXAMS`, `Arabic-ARC-Challenge`, `Arabic-ARC-Easy`, `Arabic-BOOLQ`, `Arabic-COPA`, `Arabic-HELLASWAG`, `Arabic-OPENBOOK-QA`, `Arabic-PIQA`, `Arabic-RACE`, `Arabic-SCIQ`, `Arabic-TOXIGEN`. All part of the extended version of the AlGhafa benchmark (AlGhafa-T version)
78
+
79
+ Please, consider reaching out to us through the discussions tab if you are working on benchmarks for Arabic LLMs and would like to see them on this leaderboard as well. Your benchmark might change the whole game for Arabic models !
80
+
81
+ GPUs are provided by __[Technology Innovation Institute (TII)](https://www.tii.ae/)__ for the evaluations.
82
+
83
+ ## Details and Logs
84
+ - Detailed numerical results in the `results` OALL dataset: https://huggingface.co/datasets/OALL/results
85
+ - Community queries and running status in the `requests` OALL dataset: https://huggingface.co/datasets/OALL/requests
86
+
87
+ ## More resources
88
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/OALL/leaderboard-test-2/discussions/1)!
89
  """
90
 
91
  EVALUATION_QUEUE_TEXT = """
92
  ## Some good practices before submitting a model
93
 
94
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
95
+
96
  ```python
97
  from transformers import AutoConfig, AutoModel, AutoTokenizer
98
  config = AutoConfig.from_pretrained("your model name", revision=revision)
 
116
  ## In case of model failure
117
  If your model is displayed in the `FAILED` category, its execution stopped.
118
  Make sure you have followed the above steps first.
119
+ If everything is done, check you can launch the LightEval script on your model locally, using [this script](https://gist.github.com/alielfilali01/d486cfc962dca3ed4091b7c562a4377f).
120
  """
121
 
122
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
123
  CITATION_BUTTON_TEXT = r"""
124
+ @misc{OALL,
125
+ author = {Elfilali, Ali and Alobeidli, Hamza and Fourrier, Clémentine and Cojocaru, Ruxandra and Habib, Nathan},
126
+ title = {Open Arabic LLM Leaderboard},
127
+ year = {2024},
128
+ publisher = {OALL},
129
+ howpublished = "\url{https://huggingface.co/spaces/OALL/leaderboard-test-2}"
130
+ }
131
+ @inproceedings{almazrouei-etal-2023-alghafa,
132
+ title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
133
+ author = "Almazrouei, Ebtesam and
134
+ Cojocaru, Ruxandra and
135
+ Baldo, Michele and
136
+ Malartic, Quentin and
137
+ Alobeidli, Hamza and
138
+ Mazzotta, Daniele and
139
+ Penedo, Guilherme and
140
+ Campesan, Giulia and
141
+ Farooq, Mugariya and
142
+ Alhammadi, Maitha and
143
+ Launay, Julien and
144
+ Noune, Badreddine",
145
+ editor = "Sawaf, Hassan and
146
+ El-Beltagy, Samhaa and
147
+ Zaghouani, Wajdi and
148
+ Magdy, Walid and
149
+ Abdelali, Ahmed and
150
+ Tomeh, Nadi and
151
+ Abu Farha, Ibrahim and
152
+ Habash, Nizar and
153
+ Khalifa, Salam and
154
+ Keleg, Amr and
155
+ Haddad, Hatem and
156
+ Zitouni, Imed and
157
+ Mrini, Khalil and
158
+ Almatham, Rawan",
159
+ booktitle = "Proceedings of ArabicNLP 2023",
160
+ month = dec,
161
+ year = "2023",
162
+ address = "Singapore (Hybrid)",
163
+ publisher = "Association for Computational Linguistics",
164
+ url = "https://aclanthology.org/2023.arabicnlp-1.21",
165
+ doi = "10.18653/v1/2023.arabicnlp-1.21",
166
+ pages = "244--275",
167
+ abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs.",
168
+ }
169
+ @misc{huang2023acegpt,
170
+ title={AceGPT, Localizing Large Language Models in Arabic},
171
+ author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
172
+ year={2023},
173
+ eprint={2309.12053},
174
+ archivePrefix={arXiv},
175
+ primaryClass={cs.CL}
176
+ }
177
+ @misc{lighteval,
178
+ author = {Fourrier, Clémentine and Habib, Nathan and Wolf, Thomas},
179
+ title = {LightEval: A lightweight framework for LLM evaluation},
180
+ year = {2024},
181
+ publisher = {GitHub},
182
+ journal = {GitHub repository},
183
+ url = {https://github.com/huggingface/lighteval}
184
+ }
185
  """