version: 1.0.0
config:
  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
  IS_PUBLIC: true
  LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
  TRUST_REMOTE_CODE: true
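  # A minimal sketch, kept as YAML comments so this file stays valid, of how the
  # repos above could be fetched with huggingface_hub; repo_type="dataset" is an
  # assumption about how the queue/results repos are hosted:
  #   from huggingface_hub import snapshot_download
  #   snapshot_download(repo_id="eduagarcia-temp/llm_pt_leaderboard_requests",
  #                     repo_type="dataset")  # local copy of the eval queue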
tasks:
  enem_challenge:
    benchmark: enem_challenge
    col_name: ENEM
    task_list:
    - enem_challenge
    metric: acc
    few_shot: 3
    limit: null
    baseline: 20.0 #random baseline
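    # likely 100 / 5, since ENEM items are five-alternative multiple-choice questions (an inference)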
    #https://www.sejalguem.com/enem
    #https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
      level exam widely applied every year by the Brazilian government to students that 
      wish to undertake a University degree. This dataset contains 1,430 questions that don't require
      image understanding of the exams from 2010 to 2018, 2022 and 2023."  
    link: https://huggingface.co/datasets/eduagarcia/enem_challenge
    sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
  bluex:
    benchmark: bluex
    col_name: BLUEX
    task_list:
    - bluex
    metric: acc
    few_shot: 3
    limit: null
    baseline: 22.5 #random baseline
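    # plausibly the mean of random guessing on 4-option (Unicamp, 25%) and 5-option (Fuvest, 20%) questions (an inference)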
    #https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99 
    #https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
    human_baseline: 50.0
    expert_human_baseline: 82.5
    description: "BLUEX is a multimodal dataset consisting of the two leading 
    university entrance exams conducted in Brazil: Convest (Unicamp) and Fuvest (USP), 
    spanning from 2018 to 2024. The benchmark comprises of 724 questions that do not have accompanying images"   
    link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
    sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
    - oab_exams
    metric: acc
    few_shot: 3
    limit: null
    baseline: 25.0 #random baseline
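    # 100 / 4: OAB exam questions have four alternatives, so random guessing yields 25%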
    #https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
    # 70%+ correct = 17,214 / 638,500 candidates = top 2.5% (97.5th percentile)
    # deviation at the 97.5th percentile -> 70.0 - 46 = 24 points
    # z-score at the 97.5th percentile ~ 1.9675
    # estimated standard deviation -> 24 / 1.9675 ~ 12.2
    # 99th percentile = 46 + 2.33 * 12.2 = ~75.0
    human_baseline: 46.0
    expert_human_baseline: 75.0
    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
      Association's exams, from 2010 to 2018.
    link: https://huggingface.co/datasets/eduagarcia/oab_exams
    sources: ["https://github.com/legal-nlp/oab-exams"]
    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
  assin2_rte:
    benchmark: assin2_rte
    col_name: ASSIN2 RTE
    task_list:
    - assin2_rte
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 50.0 #random baseline
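    # the RTE split is a balanced binary task, so random guessing yields ~50 macro-F1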
    human_baseline: null
    expert_human_baseline: null
    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual - 
    Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN, 
    an evaluation shared task in the scope of the computational processing 
    of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language 
    Inference (NLI), is the task of predicting if a given text (premise) entails (implies) in
    other text (hypothesis)."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  assin2_sts:
    benchmark: assin2_sts
    col_name: ASSIN2 STS
    task_list:
    - assin2_sts
    metric: pearson
    few_shot: 15
    limit: null
    baseline: 0.0 #random baseline
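    # random score predictions are uncorrelated with the gold scores, so Pearson ~ 0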
    human_baseline: null
    expert_human_baseline: null
    description: "Same as dataset as above. Semantic Textual Similarity (STS) 
    ‘measures the degree of semantic equivalence between two sentences’."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  faquad_nli:
    benchmark: faquad_nli
    col_name: FAQUAD NLI
    task_list:
    - faquad_nli
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 45.6 #random baseline
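    # presumably the expected macro-F1 of random guessing under this dataset's label imbalance (an inference)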
    human_baseline: null
    expert_human_baseline: null
    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the 
    Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of 
    abundant questions sent by academics whose answers are found in available institutional 
    documents in the Brazilian higher education system. It consists of 900 questions about 
    249 reading passages taken from 18 official documents of a computer science college
    from a Brazilian federal university and 21 Wikipedia articles related to the 
    Brazilian higher education system. FaQuAD-NLI is a modified version of the 
    FaQuAD dataset that repurposes the question answering task as a textual 
    entailment task between a question and its possible answers."
    link: https://huggingface.co/datasets/ruanchaves/faquad-nli
    sources: ["https://github.com/liafacom/faquad/"]
  sparrow_pt:
    benchmark: sparrow_pt
    col_name: Sparrow POR
    task_list:
    - sparrow_emotion-2021-cortiz-por
    - sparrow_hate-2019-fortuna-por
    - sparrow_sentiment-2016-mozetic-por
    - sparrow_sentiment-2018-brum-por
    metric: f1_macro
    few_shot: 15
    limit: 500
    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
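    # mean of the four per-dataset random baselines above: (3.3 + 48.8 + 33.1 + 33.0) / 4 = 29.55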
    human_baseline: null
    expert_human_baseline: null
    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding. 
    SPARROW comprises 169 datasets encompassing 64 different languages, 
    this split evaluates only on the validation set of 4 datasets avaliable for the Portuguese language.
    One on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021) 
    and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
    All were extracted and manually annotated from Twitter/X."
    link: https://huggingface.co/datasets/UBC-NLP/sparrow
    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]