Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
version: 1.1.0 | |
config: | |
REPO_ID: "eduagarcia/open_pt_llm_leaderboard" | |
QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests | |
RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results | |
RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results | |
DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info" | |
PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6" | |
IS_PUBLIC: true | |
LEADERBOARD_NAME: "Open Portuguese LLM Leaderboard" | |
GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true | |
TRUST_REMOTE_CODE: true | |
SHOW_INCOMPLETE_EVALS: false | |
REQUIRE_MODEL_CARD: true | |
REQUIRE_MODEL_LICENSE: false | |
readme: | |
general_description: | | |
📐 The 🚀 Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of | |
Large Language Models (LLMs) in the Portuguese language across a variety of tasks | |
and datasets. | |
support_description: | | |
This leaderboard is made possible by the support of the | |
[Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the | |
[Federal University of Goiás (UFG)](https://international.ufg.br/). | |
If you have any questions, suggestions, or would like to contribute to the leaderboard, | |
please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia). | |
about_description: | | |
The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of | |
Large Language Models (LLMs) in the Portuguese language. | |
The leaderboard is open to submissions of models from the community and | |
is designed to be a resource for researchers, practitioners, and enthusiasts interested | |
in the development and evaluation of LLMs for the Portuguese language. | |
Supported by the [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the | |
[Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard | |
operates on a backend of Nvidia A100-80G GPUs. Evaluations are subject to | |
resource availability, which is not exclusive. Therefore, please be patient if | |
your model is in the queue. | |
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with | |
Portuguese benchmarks. | |
Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard) | |
citation: | | |
@misc{open-pt-llm-leaderboard, | |
author = {Garcia, Eduardo A. S.}, | |
title = {Open Portuguese LLM Leaderboard}, | |
year = {2024}, | |
publisher = {Hugging Face}, | |
howpublished = "\url{https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard}" | |
} | |
tasks: | |
enem_challenge: | |
benchmark: enem_challenge | |
col_name: ENEM | |
task_list: | |
- enem_challenge | |
metric: acc | |
few_shot: 3 | |
limit: null | |
baseline: 20.0 #random baseline | |
#https://www.sejalguem.com/enem | |
#https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html | |
human_baseline: 35.0 # ~60 / 180 acertos - nota ~500 | |
expert_human_baseline: 70.0 # ~124 / 180 acertos - nota ~700 | |
description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School | |
level exam widely applied every year by the Brazilian government to students that | |
wish to undertake a University degree. This dataset contains 1,430 questions from the exams of | |
2010 to 2018, 2022 and 2023 that do not require image understanding." | |
link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf | |
sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"] | |
baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"] | |
citation: | | |
@InProceedings{ENEM-Challenge, | |
author = {Silveira, Igor Cataneo and Mau\'a, Denis Deratani}, | |
booktitle = {Proceedings of the 6th Brazilian Conference on Intelligent Systems}, | |
series = {BRACIS}, | |
title = {University Entrance Exam as a Guiding Test for Artificial Intelligence}, | |
pages = {426--431}, | |
year = {2017} | |
} | |
@misc{nunes2023evaluating, | |
title={Evaluating GPT-3.5 and GPT-4 Models on Brazilian University Admission Exams}, | |
author={Desnes Nunes and Ricardo Primi and Ramon Pires and Roberto Lotufo and Rodrigo Nogueira}, | |
year={2023}, | |
eprint={2303.17003}, | |
archivePrefix={arXiv}, | |
primaryClass={cs.CL} | |
} | |
@misc{pires2023evaluating, | |
title={Evaluating GPT-4's Vision Capabilities on Brazilian University Admission Exams}, | |
author={Ramon Pires and Thales Sales Almeida and Hugo Abonizio and Rodrigo Nogueira}, | |
year={2023}, | |
eprint={2311.14169}, | |
archivePrefix={arXiv}, | |
primaryClass={cs.CL} | |
} | |
bluex: | |
benchmark: bluex | |
col_name: BLUEX | |
task_list: | |
- bluex | |
metric: acc | |
few_shot: 3 | |
limit: null | |
baseline: 22.5 #random baseline | |
#https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99 | |
#https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43,4% - ~77% @ top-.99 | |
human_baseline: 50.0 | |
expert_human_baseline: 82.5 | |
description: "BLUEX is a multimodal dataset consisting of the two leading | |
university entrance exams conducted in Brazil: Comvest (Unicamp) and Fuvest (USP), | |
spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images." | |
link: https://arxiv.org/abs/2307.05410 | |
sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"] | |
baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"] | |
citation: | | |
@misc{almeida2023bluex, | |
title={BLUEX: A benchmark based on Brazilian Leading Universities Entrance eXams}, | |
author={Thales Sales Almeida and Thiago Laitz and Giovana K. Bonás and Rodrigo Nogueira}, | |
year={2023}, | |
eprint={2307.05410}, | |
archivePrefix={arXiv}, | |
primaryClass={cs.CL} | |
} | |
oab_exams: | |
benchmark: oab_exams | |
col_name: OAB Exams | |
task_list: | |
- oab_exams | |
metric: acc | |
few_shot: 3 | |
limit: null | |
baseline: 25.0 #random baseline | |
#https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46% | |
# http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3 | |
# Acertou +70% = 17214 / 638500 = top-97,5% | |
# desvio top-97,5% -> 46 - 70.0% = 24 | |
# z score 97,5% ~ 1,9675 | |
# desvio padrao estimado -> 12,2 | |
# top 99% = 46 + 2,33*12,2 = ~75.0 | |
human_baseline: 46.0 | |
expert_human_baseline: 75.0 | |
description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar | |
Association's exams, from 2010 to 2018. | |
link: https://arxiv.org/abs/1712.05128 | |
sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"] | |
baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"] | |
citation: | | |
@inproceedings{d2017passing, | |
title={Passing the Brazilian OAB Exam: Data Preparation and Some Experiments}, | |
author={Delfino, Pedro and Cuconato, Bruno and Haeusler, Edward Hermann and Rademaker, Alexandre}, | |
booktitle={Legal Knowledge and Information Systems: JURIX 2017: The Thirtieth Annual Conference}, | |
volume={302}, | |
pages={89}, | |
year={2017}, | |
organization={IOS Press} | |
} | |
assin2_rte: | |
benchmark: assin2_rte | |
col_name: ASSIN2 RTE | |
task_list: | |
- assin2_rte | |
metric: f1_macro | |
few_shot: 15 | |
limit: null | |
baseline: 50.0 #random baseline | |
human_baseline: null | |
expert_human_baseline: null | |
description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual - | |
Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN, | |
an evaluation shared task in the scope of the computational processing | |
of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language | |
Inference (NLI), is the task of predicting whether a given text (premise) entails (implies) | |
another text (hypothesis)." | |
link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39 | |
sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"] | |
citation: | | |
@inproceedings{real2020assin, | |
title={The assin 2 shared task: a quick overview}, | |
author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo}, | |
booktitle={International Conference on Computational Processing of the Portuguese Language}, | |
pages={406--412}, | |
year={2020}, | |
organization={Springer} | |
} | |
assin2_sts: | |
benchmark: assin2_sts | |
col_name: ASSIN2 STS | |
task_list: | |
- assin2_sts | |
metric: pearson | |
few_shot: 15 | |
limit: null | |
baseline: 0.0 #random baseline | |
human_baseline: null | |
expert_human_baseline: null | |
description: "Same dataset as above. Semantic Textual Similarity (STS) | |
‘measures the degree of semantic equivalence between two sentences’." | |
link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39 | |
sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"] | |
faquad_nli: | |
benchmark: faquad_nli | |
col_name: FAQUAD NLI | |
task_list: | |
- faquad_nli | |
metric: f1_macro | |
few_shot: 15 | |
limit: null | |
baseline: 45.6 #random baseline | |
human_baseline: null | |
expert_human_baseline: null | |
description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the | |
Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of | |
abundant questions sent by academics whose answers are found in available institutional | |
documents in the Brazilian higher education system. It consists of 900 questions about | |
249 reading passages taken from 18 official documents of a computer science college | |
from a Brazilian federal university and 21 Wikipedia articles related to the | |
Brazilian higher education system. FaQuAD-NLI is a modified version of the | |
FaQuAD dataset that repurposes the question answering task as a textual | |
entailment task between a question and its possible answers." | |
link: https://ieeexplore.ieee.org/abstract/document/8923668 | |
sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"] | |
citation: | | |
@inproceedings{8923668, | |
author={Sayama, Hélio Fonseca and Araujo, Anderson Viçoso and Fernandes, Eraldo Rezende}, | |
booktitle={2019 8th Brazilian Conference on Intelligent Systems (BRACIS)}, | |
title={FaQuAD: Reading Comprehension Dataset in the Domain of Brazilian Higher Education}, | |
year={2019}, | |
volume={}, | |
number={}, | |
pages={443-448}, | |
keywords={Training;Context modeling;Encyclopedias;Electronic publishing;Internet;Natural Language Processing;Machine Reading Comprehension;Dataset}, | |
doi={10.1109/BRACIS.2019.00084} | |
} | |
@software{Chaves_Rodrigues_napolab_2023, | |
author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo}, | |
doi = {10.5281/zenodo.7781848}, | |
month = {3}, | |
title = {{Natural Portuguese Language Benchmark (Napolab)}}, | |
url = {https://github.com/ruanchaves/napolab}, | |
version = {1.0.0}, | |
year = {2023} | |
} | |
hatebr_offensive: | |
benchmark: hatebr_offensive | |
col_name: HateBR | |
task_list: | |
- hatebr_offensive | |
metric: f1_macro | |
few_shot: 25 | |
limit: null | |
baseline: 50.0 | |
human_baseline: null | |
expert_human_baseline: null | |
description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection | |
on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated | |
by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive | |
versus non-offensive comments)." | |
link: https://arxiv.org/abs/2103.14972 | |
sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"] | |
citation: | | |
@inproceedings{vargas-etal-2022-hatebr, | |
title = "{H}ate{BR}: A Large Expert Annotated Corpus of {B}razilian {I}nstagram Comments for Offensive Language and Hate Speech Detection", | |
author = "Vargas, Francielle and | |
Carvalho, Isabelle and | |
Rodrigues de G{\'o}es, Fabiana and | |
Pardo, Thiago and | |
Benevenuto, Fabr{\'\i}cio", | |
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", | |
month = jun, | |
year = "2022", | |
address = "Marseille, France", | |
publisher = "European Language Resources Association", | |
url = "https://aclanthology.org/2022.lrec-1.777", | |
pages = "7174--7183" | |
} | |
portuguese_hate_speech: | |
benchmark: portuguese_hate_speech | |
col_name: PT Hate Speech | |
task_list: | |
- portuguese_hate_speech | |
metric: f1_macro | |
few_shot: 25 | |
limit: null | |
baseline: 47.9 | |
human_baseline: null | |
expert_human_baseline: null | |
description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')" | |
link: https://aclanthology.org/W19-3510/ | |
sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"] | |
citation: | | |
@inproceedings{fortuna-etal-2019-hierarchically, | |
title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset", | |
author = "Fortuna, Paula and | |
Rocha da Silva, Jo{\~a}o and | |
Soler-Company, Juan and | |
Wanner, Leo and | |
Nunes, S{\'e}rgio", | |
booktitle = "Proceedings of the 3rd Workshop on Abusive Language Online (ALW3)", | |
year = "2019", | |
publisher = "Association for Computational Linguistics", | |
url = "https://aclanthology.org/W19-3510", | |
doi = "10.18653/v1/W19-3510", | |
pages = "94--104", | |
} | |
tweetsentbr: | |
benchmark: tweetsentbr | |
col_name: tweetSentBR | |
task_list: | |
- tweetsentbr | |
metric: f1_macro | |
few_shot: 25 | |
limit: null | |
baseline: 32.8 | |
human_baseline: null | |
expert_human_baseline: null | |
description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese. | |
It was labeled by several annotators following steps established in the literature for | |
improving reliability on the task of Sentiment Analysis. Each Tweet was annotated | |
in one of the three following classes: Positive, Negative, Neutral." | |
link: https://arxiv.org/abs/1712.08917 | |
sources: ["https://bitbucket.org/HBrum/tweetsentbr", "eduagarcia/tweetsentbr_fewshot"] | |
citation: | | |
@InProceedings{BRUM18.389, | |
author = {Henrico Brum and Maria das Gra\c{c}as Volpe Nunes}, | |
title = "{Building a Sentiment Corpus of Tweets in Brazilian Portuguese}", | |
booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, | |
year = {2018}, | |
month = {May 7-12, 2018}, | |
address = {Miyazaki, Japan}, | |
editor = {Nicoletta Calzolari (Conference chair) and Khalid Choukri and Christopher Cieri and Thierry Declerck and Sara Goggi and Koiti Hasida and Hitoshi Isahara and Bente Maegaard and Joseph Mariani and Hélène Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis and Takenobu Tokunaga}, | |
publisher = {European Language Resources Association (ELRA)}, | |
isbn = {979-10-95546-00-9}, | |
language = {english} | |
} | |