|
{ |
|
"results": { |
|
"assin2_rte": { |
|
"f1_macro,all": 0.3333333333333333, |
|
"f1_macro_stderr,all": 0.0031730251394380704, |
|
"acc,all": 0.5, |
|
"acc_stderr,all": 0.007138073526203421, |
|
"alias": "assin2_rte" |
|
}, |
|
"assin2_sts": { |
|
"pearson,all": 0.07790738376477561, |
|
"pearson_stderr,all": 0.014716684813439029, |
|
"mse,all": 2.066646241830065, |
|
"mse_stderr,all": "N/A", |
|
"alias": "assin2_sts" |
|
}, |
|
"bluex": { |
|
"acc,all": 0.20584144645340752, |
|
"acc_stderr,all": 0.008708200533939503, |
|
"acc,exam_id__USP_2024": 0.24390243902439024, |
|
"acc_stderr,exam_id__USP_2024": 0.038765983941525854, |
|
"acc,exam_id__UNICAMP_2018": 0.2222222222222222, |
|
"acc_stderr,exam_id__UNICAMP_2018": 0.032540841899570724, |
|
"acc,exam_id__USP_2018": 0.16666666666666666, |
|
"acc_stderr,exam_id__USP_2018": 0.029320906143906797, |
|
"acc,exam_id__USP_2022": 0.20408163265306123, |
|
"acc_stderr,exam_id__USP_2022": 0.03318725036103681, |
|
"acc,exam_id__USP_2021": 0.1346153846153846, |
|
"acc_stderr,exam_id__USP_2021": 0.027253785013691273, |
|
"acc,exam_id__UNICAMP_2021_2": 0.27450980392156865, |
|
"acc_stderr,exam_id__UNICAMP_2021_2": 0.0360141604917446, |
|
"acc,exam_id__UNICAMP_2021_1": 0.2826086956521739, |
|
"acc_stderr,exam_id__UNICAMP_2021_1": 0.038353605844743385, |
|
"acc,exam_id__USP_2020": 0.08928571428571429, |
|
"acc_stderr,exam_id__USP_2020": 0.021921367122397676, |
|
"acc,exam_id__UNICAMP_2022": 0.3076923076923077, |
|
"acc_stderr,exam_id__UNICAMP_2022": 0.042727899536711536, |
|
"acc,exam_id__UNICAMP_2023": 0.16279069767441862, |
|
"acc_stderr,exam_id__UNICAMP_2023": 0.03238294764474062, |
|
"acc,exam_id__UNICAMP_2024": 0.3111111111111111, |
|
"acc_stderr,exam_id__UNICAMP_2024": 0.0398333923513593, |
|
"acc,exam_id__UNICAMP_2020": 0.18181818181818182, |
|
"acc_stderr,exam_id__UNICAMP_2020": 0.02992924246353086, |
|
"acc,exam_id__USP_2023": 0.18181818181818182, |
|
"acc_stderr,exam_id__USP_2023": 0.03345273573325805, |
|
"acc,exam_id__UNICAMP_2019": 0.2, |
|
"acc_stderr,exam_id__UNICAMP_2019": 0.03269771596389771, |
|
"acc,exam_id__USP_2019": 0.175, |
|
"acc_stderr,exam_id__USP_2019": 0.0348121283420538, |
|
"alias": "bluex" |
|
}, |
|
"enem_challenge": { |
|
"alias": "enem", |
|
"acc,all": 0.2092372288313506, |
|
"acc_stderr,all": 0.006229253057208555, |
|
"acc,exam_id__2016_2": 0.21138211382113822, |
|
"acc_stderr,exam_id__2016_2": 0.02123691588211098, |
|
"acc,exam_id__2023": 0.1925925925925926, |
|
"acc_stderr,exam_id__2023": 0.019583861187616968, |
|
"acc,exam_id__2013": 0.2037037037037037, |
|
"acc_stderr,exam_id__2013": 0.02238599079325693, |
|
"acc,exam_id__2012": 0.12931034482758622, |
|
"acc_stderr,exam_id__2012": 0.01803175905554286, |
|
"acc,exam_id__2009": 0.23478260869565218, |
|
"acc_stderr,exam_id__2009": 0.022814803582640184, |
|
"acc,exam_id__2022": 0.17293233082706766, |
|
"acc_stderr,exam_id__2022": 0.01889827424607104, |
|
"acc,exam_id__2015": 0.31932773109243695, |
|
"acc_stderr,exam_id__2015": 0.02472561230954832, |
|
"acc,exam_id__2014": 0.21100917431192662, |
|
"acc_stderr,exam_id__2014": 0.022513127008089658, |
|
"acc,exam_id__2016": 0.18181818181818182, |
|
"acc_stderr,exam_id__2016": 0.020248897347876847, |
|
"acc,exam_id__2010": 0.23076923076923078, |
|
"acc_stderr,exam_id__2010": 0.02246971773699712, |
|
"acc,exam_id__2017": 0.27586206896551724, |
|
"acc_stderr,exam_id__2017": 0.023994074423977864, |
|
"acc,exam_id__2011": 0.15384615384615385, |
|
"acc_stderr,exam_id__2011": 0.019264278502154123 |
|
}, |
|
"faquad_nli": { |
|
"f1_macro,all": 0.4396551724137931, |
|
"f1_macro_stderr,all": 0.0035796984729087084, |
|
"acc,all": 0.7846153846153846, |
|
"acc_stderr,all": 0.011396120309131327, |
|
"alias": "faquad_nli" |
|
}, |
|
"hatebr_offensive": { |
|
"alias": "hatebr_offensive_binary", |
|
"f1_macro,all": 0.43054708155379295, |
|
"f1_macro_stderr,all": 0.009093679844467082, |
|
"acc,all": 0.4742857142857143, |
|
"acc_stderr,all": 0.009437507998400261 |
|
}, |
|
"oab_exams": { |
|
"acc,all": 0.25968109339407747, |
|
"acc_stderr,all": 0.005403181658894358, |
|
"acc,exam_id__2017-22": 0.2375, |
|
"acc_stderr,exam_id__2017-22": 0.027511429390216682, |
|
"acc,exam_id__2016-20a": 0.2, |
|
"acc_stderr,exam_id__2016-20a": 0.02584175311098727, |
|
"acc,exam_id__2011-04": 0.2875, |
|
"acc_stderr,exam_id__2011-04": 0.02919454405528515, |
|
"acc,exam_id__2013-12": 0.2875, |
|
"acc_stderr,exam_id__2013-12": 0.029277381115049662, |
|
"acc,exam_id__2013-11": 0.325, |
|
"acc_stderr,exam_id__2013-11": 0.030286419424458838, |
|
"acc,exam_id__2010-02": 0.32, |
|
"acc_stderr,exam_id__2010-02": 0.026888774775418785, |
|
"acc,exam_id__2012-07": 0.2375, |
|
"acc_stderr,exam_id__2012-07": 0.02737558649609428, |
|
"acc,exam_id__2016-21": 0.25, |
|
"acc_stderr,exam_id__2016-21": 0.027994547544285982, |
|
"acc,exam_id__2015-17": 0.24358974358974358, |
|
"acc_stderr,exam_id__2015-17": 0.02793267139214751, |
|
"acc,exam_id__2015-18": 0.2625, |
|
"acc_stderr,exam_id__2015-18": 0.028396710161944567, |
|
"acc,exam_id__2014-14": 0.225, |
|
"acc_stderr,exam_id__2014-14": 0.026939185801353988, |
|
"acc,exam_id__2015-16": 0.3375, |
|
"acc_stderr,exam_id__2015-16": 0.030631826713546063, |
|
"acc,exam_id__2012-09": 0.23376623376623376, |
|
"acc_stderr,exam_id__2012-09": 0.02787359925121907, |
|
"acc,exam_id__2011-03": 0.26262626262626265, |
|
"acc_stderr,exam_id__2011-03": 0.025505720074946718, |
|
"acc,exam_id__2016-19": 0.2692307692307692, |
|
"acc_stderr,exam_id__2016-19": 0.028948751914583667, |
|
"acc,exam_id__2012-06a": 0.2375, |
|
"acc_stderr,exam_id__2012-06a": 0.027440075549438697, |
|
"acc,exam_id__2011-05": 0.2625, |
|
"acc_stderr,exam_id__2011-05": 0.02835789202564455, |
|
"acc,exam_id__2013-10": 0.2125, |
|
"acc_stderr,exam_id__2013-10": 0.026367641247603036, |
|
"acc,exam_id__2017-24": 0.25, |
|
"acc_stderr,exam_id__2017-24": 0.028053164455460838, |
|
"acc,exam_id__2016-20": 0.2625, |
|
"acc_stderr,exam_id__2016-20": 0.0283327789711091, |
|
"acc,exam_id__2012-08": 0.275, |
|
"acc_stderr,exam_id__2012-08": 0.02893752928626648, |
|
"acc,exam_id__2014-13": 0.2, |
|
"acc_stderr,exam_id__2014-13": 0.025784866156114444, |
|
"acc,exam_id__2018-25": 0.25, |
|
"acc_stderr,exam_id__2018-25": 0.027961840366717016, |
|
"acc,exam_id__2017-23": 0.3, |
|
"acc_stderr,exam_id__2017-23": 0.02953160157687412, |
|
"acc,exam_id__2014-15": 0.2564102564102564, |
|
"acc_stderr,exam_id__2014-15": 0.028456647275964232, |
|
"acc,exam_id__2012-06": 0.275, |
|
"acc_stderr,exam_id__2012-06": 0.028790584320040398, |
|
"acc,exam_id__2010-01": 0.23529411764705882, |
|
"acc_stderr,exam_id__2010-01": 0.026470002521428834, |
|
"alias": "oab_exams" |
|
}, |
|
"portuguese_hate_speech": { |
|
"alias": "portuguese_hate_speech_binary", |
|
"f1_macro,all": 0.35895915678524376, |
|
"f1_macro_stderr,all": 0.011204300451999685, |
|
"acc,all": 0.381903642773208, |
|
"acc_stderr,all": 0.011741654959752653 |
|
}, |
|
"tweetsentbr": { |
|
"f1_macro,all": 0.2114730555936155, |
|
"f1_macro_stderr,all": 0.0056538657419370805, |
|
"acc,all": 0.3169154228855721, |
|
"acc_stderr,all": 0.007343236186351586, |
|
"alias": "tweetsentbr" |
|
} |
|
}, |
|
"configs": { |
|
"assin2_rte": { |
|
"task": "assin2_rte", |
|
"group": [ |
|
"pt_benchmark", |
|
"assin2" |
|
], |
|
"dataset_path": "assin2", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:", |
|
"doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}", |
|
"description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
1, |
|
3251, |
|
2, |
|
3252, |
|
3, |
|
4, |
|
5, |
|
6, |
|
3253, |
|
7, |
|
3254, |
|
3255, |
|
3256, |
|
8, |
|
9, |
|
10, |
|
3257, |
|
11, |
|
3258, |
|
12, |
|
13, |
|
14, |
|
15, |
|
3259, |
|
3260, |
|
3261, |
|
3262, |
|
3263, |
|
16, |
|
17, |
|
3264, |
|
18, |
|
3265, |
|
3266, |
|
3267, |
|
19, |
|
20, |
|
3268, |
|
3269, |
|
21, |
|
3270, |
|
3271, |
|
22, |
|
3272, |
|
3273, |
|
23, |
|
3274, |
|
24, |
|
25, |
|
3275 |
|
], |
|
"id_column": "sentence_pair_id" |
|
} |
|
}, |
|
"num_fewshot": 15, |
|
"metric_list": [ |
|
{ |
|
"metric": "f1_macro", |
|
"aggregation": "f1_macro", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "find_similar_label", |
|
"labels": [ |
|
"Sim", |
|
"Não" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.1 |
|
} |
|
}, |
|
"assin2_sts": { |
|
"task": "assin2_sts", |
|
"group": [ |
|
"pt_benchmark", |
|
"assin2" |
|
], |
|
"dataset_path": "assin2", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:", |
|
"doc_to_target": "<function assin2_float_to_pt_str at 0x14c26cd9a5c0>", |
|
"description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
1, |
|
3251, |
|
2, |
|
3252, |
|
3, |
|
4, |
|
5, |
|
6, |
|
3253, |
|
7, |
|
3254, |
|
3255, |
|
3256, |
|
8, |
|
9, |
|
10, |
|
3257, |
|
11, |
|
3258, |
|
12, |
|
13, |
|
14, |
|
15, |
|
3259, |
|
3260, |
|
3261, |
|
3262, |
|
3263, |
|
16, |
|
17, |
|
3264, |
|
18, |
|
3265, |
|
3266, |
|
3267, |
|
19, |
|
20, |
|
3268, |
|
3269, |
|
21, |
|
3270, |
|
3271, |
|
22, |
|
3272, |
|
3273, |
|
23, |
|
3274, |
|
24, |
|
25, |
|
3275 |
|
], |
|
"id_column": "sentence_pair_id" |
|
} |
|
}, |
|
"num_fewshot": 10, |
|
"metric_list": [ |
|
{ |
|
"metric": "pearson", |
|
"aggregation": "pearsonr", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "mse", |
|
"aggregation": "mean_squared_error", |
|
"higher_is_better": false |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "number_filter", |
|
"type": "float", |
|
"range_min": 1.0, |
|
"range_max": 5.0, |
|
"on_outside_range": "clip", |
|
"fallback": 5.0 |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.1 |
|
} |
|
}, |
|
"bluex": { |
|
"task": "bluex", |
|
"group": [ |
|
"pt_benchmark", |
|
"vestibular" |
|
], |
|
"dataset_path": "eduagarcia-temp/BLUEX_without_images", |
|
"test_split": "train", |
|
"fewshot_split": "train", |
|
"doc_to_text": "<function enem_doc_to_text at 0x14c26cd99b20>", |
|
"doc_to_target": "{{answerKey}}", |
|
"description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
"USP_2018_3", |
|
"UNICAMP_2018_2", |
|
"USP_2018_35", |
|
"UNICAMP_2018_16", |
|
"USP_2018_89" |
|
], |
|
"id_column": "id", |
|
"exclude_from_task": true |
|
} |
|
}, |
|
"num_fewshot": 3, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "normalize_spaces" |
|
}, |
|
{ |
|
"function": "remove_accents" |
|
}, |
|
{ |
|
"function": "find_choices", |
|
"choices": [ |
|
"A", |
|
"B", |
|
"C", |
|
"D", |
|
"E" |
|
], |
|
"regex_patterns": [ |
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b", |
|
"\\b([ABCDE])\\.", |
|
"\\b([ABCDE]) ?[.):-]", |
|
"\\b([ABCDE])$", |
|
"\\b([ABCDE])\\b" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
], |
|
"group_by": { |
|
"column": "exam_id" |
|
} |
|
} |
|
], |
|
"should_decontaminate": true, |
|
"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd99e40>", |
|
"metadata": { |
|
"version": 1.1 |
|
} |
|
}, |
|
"enem_challenge": { |
|
"task": "enem_challenge", |
|
"task_alias": "enem", |
|
"group": [ |
|
"pt_benchmark", |
|
"vestibular" |
|
], |
|
"dataset_path": "eduagarcia/enem_challenge", |
|
"test_split": "train", |
|
"fewshot_split": "train", |
|
"doc_to_text": "<function enem_doc_to_text at 0x14c26cd9a020>", |
|
"doc_to_target": "{{answerKey}}", |
|
"description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
"2022_21", |
|
"2022_88", |
|
"2022_143" |
|
], |
|
"id_column": "id", |
|
"exclude_from_task": true |
|
} |
|
}, |
|
"num_fewshot": 3, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "normalize_spaces" |
|
}, |
|
{ |
|
"function": "remove_accents" |
|
}, |
|
{ |
|
"function": "find_choices", |
|
"choices": [ |
|
"A", |
|
"B", |
|
"C", |
|
"D", |
|
"E" |
|
], |
|
"regex_patterns": [ |
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b", |
|
"\\b([ABCDE])\\.", |
|
"\\b([ABCDE]) ?[.):-]", |
|
"\\b([ABCDE])$", |
|
"\\b([ABCDE])\\b" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
], |
|
"group_by": { |
|
"column": "exam_id" |
|
} |
|
} |
|
], |
|
"should_decontaminate": true, |
|
"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14c26cd9a200>", |
|
"metadata": { |
|
"version": 1.1 |
|
} |
|
}, |
|
"faquad_nli": { |
|
"task": "faquad_nli", |
|
"group": [ |
|
"pt_benchmark" |
|
], |
|
"dataset_path": "ruanchaves/faquad-nli", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?", |
|
"doc_to_target": "{{['Não', 'Sim'][label]}}", |
|
"description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "first_n", |
|
"sampler_config": { |
|
"fewshot_indices": [ |
|
1893, |
|
949, |
|
663, |
|
105, |
|
1169, |
|
2910, |
|
2227, |
|
2813, |
|
974, |
|
558, |
|
1503, |
|
1958, |
|
2918, |
|
601, |
|
1560, |
|
984, |
|
2388, |
|
995, |
|
2233, |
|
1982, |
|
165, |
|
2788, |
|
1312, |
|
2285, |
|
522, |
|
1113, |
|
1670, |
|
323, |
|
236, |
|
1263, |
|
1562, |
|
2519, |
|
1049, |
|
432, |
|
1167, |
|
1394, |
|
2022, |
|
2551, |
|
2194, |
|
2187, |
|
2282, |
|
2816, |
|
108, |
|
301, |
|
1185, |
|
1315, |
|
1420, |
|
2436, |
|
2322, |
|
766 |
|
] |
|
} |
|
}, |
|
"num_fewshot": 15, |
|
"metric_list": [ |
|
{ |
|
"metric": "f1_macro", |
|
"aggregation": "f1_macro", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "find_similar_label", |
|
"labels": [ |
|
"Sim", |
|
"Não" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.1 |
|
} |
|
}, |
|
"hatebr_offensive": { |
|
"task": "hatebr_offensive", |
|
"task_alias": "hatebr_offensive_binary", |
|
"group": [ |
|
"pt_benchmark" |
|
], |
|
"dataset_path": "eduagarcia/portuguese_benchmark", |
|
"dataset_name": "HateBR_offensive_binary", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:", |
|
"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}", |
|
"description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
48, |
|
44, |
|
36, |
|
20, |
|
3511, |
|
88, |
|
3555, |
|
16, |
|
56, |
|
3535, |
|
60, |
|
40, |
|
3527, |
|
4, |
|
76, |
|
3579, |
|
3523, |
|
3551, |
|
68, |
|
3503, |
|
84, |
|
3539, |
|
64, |
|
3599, |
|
80, |
|
3563, |
|
3559, |
|
3543, |
|
3547, |
|
3587, |
|
3595, |
|
3575, |
|
3567, |
|
3591, |
|
24, |
|
96, |
|
92, |
|
3507, |
|
52, |
|
72, |
|
8, |
|
3571, |
|
3515, |
|
3519, |
|
3531, |
|
28, |
|
32, |
|
0, |
|
12, |
|
3583 |
|
], |
|
"id_column": "idx" |
|
} |
|
}, |
|
"num_fewshot": 25, |
|
"metric_list": [ |
|
{ |
|
"metric": "f1_macro", |
|
"aggregation": "f1_macro", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "find_similar_label", |
|
"labels": [ |
|
"Sim", |
|
"Não" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"oab_exams": { |
|
"task": "oab_exams", |
|
"group": [ |
|
"legal_benchmark", |
|
"pt_benchmark" |
|
], |
|
"dataset_path": "eduagarcia/oab_exams", |
|
"test_split": "train", |
|
"fewshot_split": "train", |
|
"doc_to_text": "<function doc_to_text at 0x14c26cd9ad40>", |
|
"doc_to_target": "{{answerKey}}", |
|
"description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
"2010-01_1", |
|
"2010-01_11", |
|
"2010-01_13", |
|
"2010-01_23", |
|
"2010-01_26", |
|
"2010-01_28", |
|
"2010-01_38", |
|
"2010-01_48", |
|
"2010-01_58", |
|
"2010-01_68", |
|
"2010-01_76", |
|
"2010-01_83", |
|
"2010-01_85", |
|
"2010-01_91", |
|
"2010-01_99" |
|
], |
|
"id_column": "id", |
|
"exclude_from_task": true |
|
} |
|
}, |
|
"num_fewshot": 3, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "normalize_spaces" |
|
}, |
|
{ |
|
"function": "remove_accents" |
|
}, |
|
{ |
|
"function": "find_choices", |
|
"choices": [ |
|
"A", |
|
"B", |
|
"C", |
|
"D" |
|
], |
|
"regex_patterns": [ |
|
"(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b", |
|
"\\b([ABCD])\\.", |
|
"\\b([ABCD]) ?[.):-]", |
|
"\\b([ABCD])$", |
|
"\\b([ABCD])\\b" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
], |
|
"group_by": { |
|
"column": "exam_id" |
|
} |
|
} |
|
], |
|
"should_decontaminate": true, |
|
"doc_to_decontamination_query": "<function doc_to_text at 0x14c26cd9afc0>", |
|
"metadata": { |
|
"version": 1.5 |
|
} |
|
}, |
|
"portuguese_hate_speech": { |
|
"task": "portuguese_hate_speech", |
|
"task_alias": "portuguese_hate_speech_binary", |
|
"group": [ |
|
"pt_benchmark" |
|
], |
|
"dataset_path": "eduagarcia/portuguese_benchmark", |
|
"dataset_name": "Portuguese_Hate_Speech_binary", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:", |
|
"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}", |
|
"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "id_sampler", |
|
"sampler_config": { |
|
"id_list": [ |
|
52, |
|
50, |
|
39, |
|
28, |
|
3, |
|
105, |
|
22, |
|
25, |
|
60, |
|
11, |
|
66, |
|
41, |
|
9, |
|
4, |
|
91, |
|
42, |
|
7, |
|
20, |
|
76, |
|
1, |
|
104, |
|
13, |
|
67, |
|
54, |
|
97, |
|
27, |
|
24, |
|
14, |
|
16, |
|
48, |
|
53, |
|
40, |
|
34, |
|
49, |
|
32, |
|
119, |
|
114, |
|
2, |
|
58, |
|
83, |
|
18, |
|
36, |
|
5, |
|
6, |
|
10, |
|
35, |
|
38, |
|
0, |
|
21, |
|
46 |
|
], |
|
"id_column": "idx" |
|
} |
|
}, |
|
"num_fewshot": 25, |
|
"metric_list": [ |
|
{ |
|
"metric": "f1_macro", |
|
"aggregation": "f1_macro", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "find_similar_label", |
|
"labels": [ |
|
"Sim", |
|
"Não" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"tweetsentbr": { |
|
"task": "tweetsentbr", |
|
"group": [ |
|
"pt_benchmark" |
|
], |
|
"dataset_path": "eduagarcia/tweetsentbr_fewshot", |
|
"test_split": "test", |
|
"fewshot_split": "train", |
|
"doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:", |
|
"doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}", |
|
"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"fewshot_config": { |
|
"sampler": "first_n" |
|
}, |
|
"num_fewshot": 25, |
|
"metric_list": [ |
|
{ |
|
"metric": "f1_macro", |
|
"aggregation": "f1_macro", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc", |
|
"aggregation": "acc", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "generate_until", |
|
"generation_kwargs": { |
|
"max_gen_toks": 32, |
|
"do_sample": false, |
|
"temperature": 0.0, |
|
"top_k": null, |
|
"top_p": null, |
|
"until": [ |
|
"\n\n" |
|
] |
|
}, |
|
"repeats": 1, |
|
"filter_list": [ |
|
{ |
|
"name": "all", |
|
"filter": [ |
|
{ |
|
"function": "find_similar_label", |
|
"labels": [ |
|
"Positivo", |
|
"Neutro", |
|
"Negativo" |
|
] |
|
}, |
|
{ |
|
"function": "take_first" |
|
} |
|
] |
|
} |
|
], |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
} |
|
}, |
|
"versions": { |
|
"assin2_rte": 1.1, |
|
"assin2_sts": 1.1, |
|
"bluex": 1.1, |
|
"enem_challenge": 1.1, |
|
"faquad_nli": 1.1, |
|
"hatebr_offensive": 1.0, |
|
"oab_exams": 1.5, |
|
"portuguese_hate_speech": 1.0, |
|
"tweetsentbr": 1.0 |
|
}, |
|
"n-shot": { |
|
"assin2_rte": 15, |
|
"assin2_sts": 10, |
|
"bluex": 3, |
|
"enem_challenge": 3, |
|
"faquad_nli": 15, |
|
"hatebr_offensive": 25, |
|
"oab_exams": 3, |
|
"portuguese_hate_speech": 25, |
|
"tweetsentbr": 25 |
|
}, |
|
"model_meta": { |
|
"truncated": 0, |
|
"non_truncated": 14150, |
|
"padded": 0, |
|
"non_padded": 14150, |
|
"fewshots_truncated": 0, |
|
"has_chat_template": true, |
|
"chat_type": "user_assistant", |
|
"n_gpus": 1, |
|
"accelerate_num_process": null, |
|
"model_sha": "None", |
|
"model_dtype": "torch.bfloat16", |
|
"model_memory_footprint": 4889264960, |
|
"model_num_parameters": 2444628480, |
|
"model_is_loaded_in_4bit": null, |
|
"model_is_loaded_in_8bit": null, |
|
"model_is_quantized": null, |
|
"model_device": "cuda:0", |
|
"batch_size": 16, |
|
"max_length": 4096, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32 |
|
}, |
|
"task_model_meta": { |
|
"assin2_rte": { |
|
"sample_size": 2448, |
|
"truncated": 0, |
|
"non_truncated": 2448, |
|
"padded": 0, |
|
"non_padded": 2448, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1061.423202614379, |
|
"min_seq_length": 1046, |
|
"max_seq_length": 1100, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 15.0, |
|
"mean_effective_fewshot_size": 15.0 |
|
}, |
|
"assin2_sts": { |
|
"sample_size": 2448, |
|
"truncated": 0, |
|
"non_truncated": 2448, |
|
"padded": 0, |
|
"non_padded": 2448, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 747.4232026143791, |
|
"min_seq_length": 732, |
|
"max_seq_length": 786, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 10.0, |
|
"mean_effective_fewshot_size": 10.0 |
|
}, |
|
"bluex": { |
|
"sample_size": 719, |
|
"truncated": 0, |
|
"non_truncated": 719, |
|
"padded": 0, |
|
"non_padded": 719, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1198.817802503477, |
|
"min_seq_length": 932, |
|
"max_seq_length": 1829, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 3.0, |
|
"mean_effective_fewshot_size": 3.0 |
|
}, |
|
"enem_challenge": { |
|
"sample_size": 1429, |
|
"truncated": 0, |
|
"non_truncated": 1429, |
|
"padded": 0, |
|
"non_padded": 1429, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1035.4177746675998, |
|
"min_seq_length": 857, |
|
"max_seq_length": 2512, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 3.0, |
|
"mean_effective_fewshot_size": 3.0 |
|
}, |
|
"faquad_nli": { |
|
"sample_size": 650, |
|
"truncated": 0, |
|
"non_truncated": 650, |
|
"padded": 0, |
|
"non_padded": 650, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1083.1338461538462, |
|
"min_seq_length": 1051, |
|
"max_seq_length": 1149, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 15.0, |
|
"mean_effective_fewshot_size": 15.0 |
|
}, |
|
"hatebr_offensive": { |
|
"sample_size": 1400, |
|
"truncated": 0, |
|
"non_truncated": 1400, |
|
"padded": 0, |
|
"non_padded": 1400, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1090.4407142857142, |
|
"min_seq_length": 1075, |
|
"max_seq_length": 1284, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 25.0, |
|
"mean_effective_fewshot_size": 25.0 |
|
}, |
|
"oab_exams": { |
|
"sample_size": 2195, |
|
"truncated": 0, |
|
"non_truncated": 2195, |
|
"padded": 0, |
|
"non_padded": 2195, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 863.024145785877, |
|
"min_seq_length": 690, |
|
"max_seq_length": 1139, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 3.0, |
|
"mean_effective_fewshot_size": 3.0 |
|
}, |
|
"portuguese_hate_speech": { |
|
"sample_size": 851, |
|
"truncated": 0, |
|
"non_truncated": 851, |
|
"padded": 0, |
|
"non_padded": 851, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1442.021151586369, |
|
"min_seq_length": 1415, |
|
"max_seq_length": 1478, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 25.0, |
|
"mean_effective_fewshot_size": 25.0 |
|
}, |
|
"tweetsentbr": { |
|
"sample_size": 2010, |
|
"truncated": 0, |
|
"non_truncated": 2010, |
|
"padded": 0, |
|
"non_padded": 2010, |
|
"fewshots_truncated": 0, |
|
"mean_seq_length": 1370.4194029850746, |
|
"min_seq_length": 1353, |
|
"max_seq_length": 1427, |
|
"max_ctx_length": 4064, |
|
"max_gen_toks": 32, |
|
"mean_original_fewshots_size": 25.0, |
|
"mean_effective_fewshot_size": 25.0 |
|
} |
|
}, |
|
"config": { |
|
"model": "huggingface", |
|
"model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17782345/step_42164", |
|
"batch_size": "auto", |
|
"batch_sizes": [], |
|
"device": "cuda:0", |
|
"use_cache": null, |
|
"limit": null, |
|
"bootstrap_iters": 100000, |
|
"gen_kwargs": null |
|
}, |
|
"git_hash": null |
|
} |