open_pt_llm_leaderboard / tasks_config.yaml
eduagarcia's picture
Refactor Tasks to load by yaml configuration file
4445ad2
raw
history blame
1.56 kB
---
# Benchmark task configuration.
#
# Each entry under `tasks` is keyed by a task identifier and carries the
# fields below. Field meanings are inferred from the key names and values
# in this file -- confirm against the code that loads it.
#   benchmark       identifier of the benchmark (matches the entry key here)
#   col_name        short human-readable name (presumably a column title)
#   task_list       list of evaluation task names belonging to the benchmark
#   metric          name of the metric reported for the benchmark
#   few_shot        integer; presumably the number of few-shot examples
#   limit           integer or null; presumably a cap on evaluated examples
#   baseline        reference score (presumably the random-guess score)
#   human_baseline  reference score (presumably human-level performance)
#   description     free-text description of the dataset
#   link            URL of the dataset

# Quoted so the version is always read as a string, never as a number.
version: "0.0.4"

tasks:
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
      - oab_exams_generate
    metric: exact_match
    few_shot: 5
    limit: null
    baseline: 25.0
    human_baseline: 50.0
    description: >-
      OAB Exams is a dataset of 2,000 questions from the Brazilian Bar
      Association's exams.
    link: https://huggingface.co/datasets/eduagarcia/oab_exams

  brazilian_court_decisions_judgment:
    benchmark: brazilian_court_decisions_judgment
    col_name: BR Court Decisions
    task_list:
      - brazilian_court_decisions_judgment_generate
    metric: f1_macro
    few_shot: 5
    limit: null
    baseline: 33.33
    human_baseline: 100.0
    description: >-
      A classification dataset of court decisions from the Tribunal de
      Justiça de Alagoas (TJAL), the State Supreme Court of Alagoas (Brazil).
    link: https://huggingface.co/datasets/joelniklaus/brazilian_court_decisions

  datalawyer_frases:
    benchmark: datalawyer_frases
    col_name: DL Frases
    task_list:
      - datalawyer_frases_generate
    metric: f1_macro
    few_shot: 15
    limit: 2000
    baseline: 10.0
    human_baseline: 100.0
    description: A classification dataset
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark

  rrip:
    benchmark: rrip
    col_name: RRIP
    task_list:
      - rrip_generate
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 12.5
    human_baseline: 100.0
    description: A classification dataset
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark