diff --git "a/lm-evaluation-harness.ipynb" "b/lm-evaluation-harness.ipynb" new file mode 100644--- /dev/null +++ "b/lm-evaluation-harness.ipynb" @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"Ac6wadk3rmkK"},"source":["# LM Evaluation Harness (by [EleutherAI](https://www.eleuther.ai/))\n","\n","This [`LM-Evaluation-Harness`](https://github.com/EleutherAI/lm-evaluation-harness) provides a unified framework to test generative language models on a large number of different evaluation tasks. For a complete list of available tasks, see the [task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md), or scroll to the bottom of the page.\n","\n","1. Clone the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and install the necessary libraries (`sentencepiece` is required for the Llama tokenizer)."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":40508,"status":"ok","timestamp":1698580809187,"user":{"displayName":"Nicholas CorrΓͺa","userId":"09736120585766268588"},"user_tz":-60},"id":"UA5I86u91e0A","outputId":"2342bf64-d93b-441f-8643-8e4003c6ef6c"},"outputs":[],"source":["!git clone --branch master https://github.com/EleutherAI/lm-evaluation-harness\n","!cd lm-evaluation-harness && pip install -e . -q\n","!pip install cohere tiktoken sentencepiece -q"]},{"cell_type":"markdown","metadata":{},"source":["2. 
Run the evaluation harness on the selected tasks."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1753416,"status":"ok","timestamp":1698583348574,"user":{"displayName":"Nicholas CorrΓͺa","userId":"09736120585766268588"},"user_tz":-60},"id":"pnHoAVK25QZn","outputId":"23f65f99-82f8-423f-9c8a-b1d4f2bdbd56"},"outputs":[],"source":["!huggingface-cli login --token $HF_TOKEN # set the HF_TOKEN environment variable beforehand; never hardcode access tokens\n","!cd lm-evaluation-harness && python main.py \\\n"," --model hf-causal \\\n"," --model_args pretrained=nicholasKluge/Aira-OPT-125M \\\n"," --tasks arc_challenge,truthfulqa_mc,toxigen \\\n"," --device cuda:0"]},{"cell_type":"markdown","metadata":{"id":"4Bm78wiZ4Own"},"source":["## Task Table πŸ“š\n","\n","| Task Name |Train|Val|Test|Val/Test Docs| Metrics |\n","|---------------------------------------------------------|-----|---|----|------------:|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n","|anagrams1 | |βœ“ | | 10000|acc |\n","|anagrams2 | |βœ“ | | 10000|acc |\n","|anli_r1 |βœ“ |βœ“ |βœ“ | 1000|acc |\n","|anli_r2 |βœ“ |βœ“ |βœ“ | 1000|acc |\n","|anli_r3 |βœ“ |βœ“ |βœ“ | 1200|acc |\n","|arc_challenge |βœ“ |βœ“ |βœ“ | 1172|acc, acc_norm |\n","|arc_easy |βœ“ |βœ“ |βœ“ | 2376|acc, acc_norm |\n","|arithmetic_1dc | |βœ“ | | 2000|acc |\n","|arithmetic_2da | |βœ“ | | 2000|acc |\n","|arithmetic_2dm | |βœ“ | | 2000|acc |\n","|arithmetic_2ds | |βœ“ | | 2000|acc |\n","|arithmetic_3da | |βœ“ | | 2000|acc |\n","|arithmetic_3ds | |βœ“ | | 2000|acc |\n","|arithmetic_4da | |βœ“ | | 2000|acc |\n","|arithmetic_4ds | |βœ“ | | 2000|acc |\n","|arithmetic_5da | |βœ“ | | 2000|acc |\n","|arithmetic_5ds | |βœ“ | | 2000|acc |\n","|bigbench_causal_judgement | | |βœ“ | 190|multiple_choice_grade, exact_str_match |\n","|bigbench_date_understanding | | |βœ“ | 
369|multiple_choice_grade, exact_str_match |\n","|bigbench_disambiguation_qa | | |βœ“ | 258|multiple_choice_grade, exact_str_match |\n","|bigbench_dyck_languages | | |βœ“ | 1000|multiple_choice_grade, exact_str_match |\n","|bigbench_formal_fallacies_syllogisms_negation | | |βœ“ | 14200|multiple_choice_grade, exact_str_match |\n","|bigbench_geometric_shapes | | |βœ“ | 359|multiple_choice_grade, exact_str_match |\n","|bigbench_hyperbaton | | |βœ“ | 50000|multiple_choice_grade, exact_str_match |\n","|bigbench_logical_deduction_five_objects | | |βœ“ | 500|multiple_choice_grade, exact_str_match |\n","|bigbench_logical_deduction_seven_objects | | |βœ“ | 700|multiple_choice_grade, exact_str_match |\n","|bigbench_logical_deduction_three_objects | | |βœ“ | 300|multiple_choice_grade, exact_str_match |\n","|bigbench_movie_recommendation | | |βœ“ | 500|multiple_choice_grade, exact_str_match |\n","|bigbench_navigate | | |βœ“ | 1000|multiple_choice_grade, exact_str_match |\n","|bigbench_reasoning_about_colored_objects | | |βœ“ | 2000|multiple_choice_grade, exact_str_match |\n","|bigbench_ruin_names | | |βœ“ | 448|multiple_choice_grade, exact_str_match |\n","|bigbench_salient_translation_error_detection | | |βœ“ | 998|multiple_choice_grade, exact_str_match |\n","|bigbench_snarks | | |βœ“ | 181|multiple_choice_grade, exact_str_match |\n","|bigbench_sports_understanding | | |βœ“ | 986|multiple_choice_grade, exact_str_match |\n","|bigbench_temporal_sequences | | |βœ“ | 1000|multiple_choice_grade, exact_str_match |\n","|bigbench_tracking_shuffled_objects_five_objects | | |βœ“ | 1250|multiple_choice_grade, exact_str_match |\n","|bigbench_tracking_shuffled_objects_seven_objects | | |βœ“ | 1750|multiple_choice_grade, exact_str_match |\n","|bigbench_tracking_shuffled_objects_three_objects | | |βœ“ | 300|multiple_choice_grade, exact_str_match |\n","|blimp_adjunct_island | |βœ“ | | 1000|acc |\n","|blimp_anaphor_gender_agreement | |βœ“ | | 1000|acc |\n","|blimp_anaphor_number_agreement | 
|βœ“ | | 1000|acc |\n","|blimp_animate_subject_passive | |βœ“ | | 1000|acc |\n","|blimp_animate_subject_trans | |βœ“ | | 1000|acc |\n","|blimp_causative | |βœ“ | | 1000|acc |\n","|blimp_complex_NP_island | |βœ“ | | 1000|acc |\n","|blimp_coordinate_structure_constraint_complex_left_branch| |βœ“ | | 1000|acc |\n","|blimp_coordinate_structure_constraint_object_extraction | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_1 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_2 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_irregular_1 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_irregular_2 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_with_adj_2 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_with_adj_irregular_1 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_with_adj_irregular_2 | |βœ“ | | 1000|acc |\n","|blimp_determiner_noun_agreement_with_adjective_1 | |βœ“ | | 1000|acc |\n","|blimp_distractor_agreement_relational_noun | |βœ“ | | 1000|acc |\n","|blimp_distractor_agreement_relative_clause | |βœ“ | | 1000|acc |\n","|blimp_drop_argument | |βœ“ | | 1000|acc |\n","|blimp_ellipsis_n_bar_1 | |βœ“ | | 1000|acc |\n","|blimp_ellipsis_n_bar_2 | |βœ“ | | 1000|acc |\n","|blimp_existential_there_object_raising | |βœ“ | | 1000|acc |\n","|blimp_existential_there_quantifiers_1 | |βœ“ | | 1000|acc |\n","|blimp_existential_there_quantifiers_2 | |βœ“ | | 1000|acc |\n","|blimp_existential_there_subject_raising | |βœ“ | | 1000|acc |\n","|blimp_expletive_it_object_raising | |βœ“ | | 1000|acc |\n","|blimp_inchoative | |βœ“ | | 1000|acc |\n","|blimp_intransitive | |βœ“ | | 1000|acc |\n","|blimp_irregular_past_participle_adjectives | |βœ“ | | 1000|acc |\n","|blimp_irregular_past_participle_verbs | |βœ“ | | 1000|acc |\n","|blimp_irregular_plural_subject_verb_agreement_1 | |βœ“ | | 1000|acc |\n","|blimp_irregular_plural_subject_verb_agreement_2 | |βœ“ | | 1000|acc |\n","|blimp_left_branch_island_echo_question | |βœ“ 
| | 1000|acc |\n","|blimp_left_branch_island_simple_question | |βœ“ | | 1000|acc |\n","|blimp_matrix_question_npi_licensor_present | |βœ“ | | 1000|acc |\n","|blimp_npi_present_1 | |βœ“ | | 1000|acc |\n","|blimp_npi_present_2 | |βœ“ | | 1000|acc |\n","|blimp_only_npi_licensor_present | |βœ“ | | 1000|acc |\n","|blimp_only_npi_scope | |βœ“ | | 1000|acc |\n","|blimp_passive_1 | |βœ“ | | 1000|acc |\n","|blimp_passive_2 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_c_command | |βœ“ | | 1000|acc |\n","|blimp_principle_A_case_1 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_case_2 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_domain_1 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_domain_2 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_domain_3 | |βœ“ | | 1000|acc |\n","|blimp_principle_A_reconstruction | |βœ“ | | 1000|acc |\n","|blimp_regular_plural_subject_verb_agreement_1 | |βœ“ | | 1000|acc |\n","|blimp_regular_plural_subject_verb_agreement_2 | |βœ“ | | 1000|acc |\n","|blimp_sentential_negation_npi_licensor_present | |βœ“ | | 1000|acc |\n","|blimp_sentential_negation_npi_scope | |βœ“ | | 1000|acc |\n","|blimp_sentential_subject_island | |βœ“ | | 1000|acc |\n","|blimp_superlative_quantifiers_1 | |βœ“ | | 1000|acc |\n","|blimp_superlative_quantifiers_2 | |βœ“ | | 1000|acc |\n","|blimp_tough_vs_raising_1 | |βœ“ | | 1000|acc |\n","|blimp_tough_vs_raising_2 | |βœ“ | | 1000|acc |\n","|blimp_transitive | |βœ“ | | 1000|acc |\n","|blimp_wh_island | |βœ“ | | 1000|acc |\n","|blimp_wh_questions_object_gap | |βœ“ | | 1000|acc |\n","|blimp_wh_questions_subject_gap | |βœ“ | | 1000|acc |\n","|blimp_wh_questions_subject_gap_long_distance | |βœ“ | | 1000|acc |\n","|blimp_wh_vs_that_no_gap | |βœ“ | | 1000|acc |\n","|blimp_wh_vs_that_no_gap_long_distance | |βœ“ | | 1000|acc |\n","|blimp_wh_vs_that_with_gap | |βœ“ | | 1000|acc |\n","|blimp_wh_vs_that_with_gap_long_distance | |βœ“ | | 1000|acc |\n","|boolq |βœ“ |βœ“ | | 3270|acc |\n","|cb |βœ“ |βœ“ | | 56|acc, f1 |\n","|cola |βœ“ |βœ“ | | 1043|mcc 
|\n","|copa |βœ“ |βœ“ | | 100|acc |\n","|coqa |βœ“ |βœ“ | | 500|f1, em |\n","|crows_pairs_english | |βœ“ | | 1677|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_age | |βœ“ | | 91|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_autre | |βœ“ | | 11|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_disability | |βœ“ | | 65|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_gender | |βœ“ | | 320|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_nationality | |βœ“ | | 216|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_physical_appearance | |βœ“ | | 72|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_race_color | |βœ“ | | 508|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_religion | |βœ“ | | 111|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_sexual_orientation | |βœ“ | | 93|likelihood_difference, pct_stereotype |\n","|crows_pairs_english_socioeconomic | |βœ“ | | 190|likelihood_difference, pct_stereotype |\n","|crows_pairs_french | |βœ“ | | 1677|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_age | |βœ“ | | 90|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_autre | |βœ“ | | 13|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_disability | |βœ“ | | 66|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_gender | |βœ“ | | 321|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_nationality | |βœ“ | | 253|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_physical_appearance | |βœ“ | | 72|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_race_color | |βœ“ | | 460|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_religion | |βœ“ | | 115|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_sexual_orientation | |βœ“ | | 91|likelihood_difference, pct_stereotype |\n","|crows_pairs_french_socioeconomic | |βœ“ | | 
196|likelihood_difference, pct_stereotype |\n","|cycle_letters | |βœ“ | | 10000|acc |\n","|drop |βœ“ |βœ“ | | 9536|em, f1 |\n","|ethics_cm |βœ“ | |βœ“ | 3885|acc |\n","|ethics_deontology |βœ“ | |βœ“ | 3596|acc, em |\n","|ethics_justice |βœ“ | |βœ“ | 2704|acc, em |\n","|ethics_utilitarianism |βœ“ | |βœ“ | 4808|acc |\n","|ethics_utilitarianism_original | | |βœ“ | 4808|acc |\n","|ethics_virtue |βœ“ | |βœ“ | 4975|acc, em |\n","|gsm8k |βœ“ | |βœ“ | 1319|acc |\n","|headqa |βœ“ |βœ“ |βœ“ | 2742|acc, acc_norm |\n","|headqa_en |βœ“ |βœ“ |βœ“ | 2742|acc, acc_norm |\n","|headqa_es |βœ“ |βœ“ |βœ“ | 2742|acc, acc_norm |\n","|hellaswag |βœ“ |βœ“ | | 10042|acc, acc_norm |\n","|hendrycksTest-abstract_algebra | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-anatomy | |βœ“ |βœ“ | 135|acc, acc_norm |\n","|hendrycksTest-astronomy | |βœ“ |βœ“ | 152|acc, acc_norm |\n","|hendrycksTest-business_ethics | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-clinical_knowledge | |βœ“ |βœ“ | 265|acc, acc_norm |\n","|hendrycksTest-college_biology | |βœ“ |βœ“ | 144|acc, acc_norm |\n","|hendrycksTest-college_chemistry | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-college_computer_science | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-college_mathematics | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-college_medicine | |βœ“ |βœ“ | 173|acc, acc_norm |\n","|hendrycksTest-college_physics | |βœ“ |βœ“ | 102|acc, acc_norm |\n","|hendrycksTest-computer_security | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-conceptual_physics | |βœ“ |βœ“ | 235|acc, acc_norm |\n","|hendrycksTest-econometrics | |βœ“ |βœ“ | 114|acc, acc_norm |\n","|hendrycksTest-electrical_engineering | |βœ“ |βœ“ | 145|acc, acc_norm |\n","|hendrycksTest-elementary_mathematics | |βœ“ |βœ“ | 378|acc, acc_norm |\n","|hendrycksTest-formal_logic | |βœ“ |βœ“ | 126|acc, acc_norm |\n","|hendrycksTest-global_facts | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-high_school_biology | |βœ“ |βœ“ | 310|acc, acc_norm 
|\n","|hendrycksTest-high_school_chemistry | |βœ“ |βœ“ | 203|acc, acc_norm |\n","|hendrycksTest-high_school_computer_science | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-high_school_european_history | |βœ“ |βœ“ | 165|acc, acc_norm |\n","|hendrycksTest-high_school_geography | |βœ“ |βœ“ | 198|acc, acc_norm |\n","|hendrycksTest-high_school_government_and_politics | |βœ“ |βœ“ | 193|acc, acc_norm |\n","|hendrycksTest-high_school_macroeconomics | |βœ“ |βœ“ | 390|acc, acc_norm |\n","|hendrycksTest-high_school_mathematics | |βœ“ |βœ“ | 270|acc, acc_norm |\n","|hendrycksTest-high_school_microeconomics | |βœ“ |βœ“ | 238|acc, acc_norm |\n","|hendrycksTest-high_school_physics | |βœ“ |βœ“ | 151|acc, acc_norm |\n","|hendrycksTest-high_school_psychology | |βœ“ |βœ“ | 545|acc, acc_norm |\n","|hendrycksTest-high_school_statistics | |βœ“ |βœ“ | 216|acc, acc_norm |\n","|hendrycksTest-high_school_us_history | |βœ“ |βœ“ | 204|acc, acc_norm |\n","|hendrycksTest-high_school_world_history | |βœ“ |βœ“ | 237|acc, acc_norm |\n","|hendrycksTest-human_aging | |βœ“ |βœ“ | 223|acc, acc_norm |\n","|hendrycksTest-human_sexuality | |βœ“ |βœ“ | 131|acc, acc_norm |\n","|hendrycksTest-international_law | |βœ“ |βœ“ | 121|acc, acc_norm |\n","|hendrycksTest-jurisprudence | |βœ“ |βœ“ | 108|acc, acc_norm |\n","|hendrycksTest-logical_fallacies | |βœ“ |βœ“ | 163|acc, acc_norm |\n","|hendrycksTest-machine_learning | |βœ“ |βœ“ | 112|acc, acc_norm |\n","|hendrycksTest-management | |βœ“ |βœ“ | 103|acc, acc_norm |\n","|hendrycksTest-marketing | |βœ“ |βœ“ | 234|acc, acc_norm |\n","|hendrycksTest-medical_genetics | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-miscellaneous | |βœ“ |βœ“ | 783|acc, acc_norm |\n","|hendrycksTest-moral_disputes | |βœ“ |βœ“ | 346|acc, acc_norm |\n","|hendrycksTest-moral_scenarios | |βœ“ |βœ“ | 895|acc, acc_norm |\n","|hendrycksTest-nutrition | |βœ“ |βœ“ | 306|acc, acc_norm |\n","|hendrycksTest-philosophy | |βœ“ |βœ“ | 311|acc, acc_norm |\n","|hendrycksTest-prehistory | |βœ“ 
|βœ“ | 324|acc, acc_norm |\n","|hendrycksTest-professional_accounting | |βœ“ |βœ“ | 282|acc, acc_norm |\n","|hendrycksTest-professional_law | |βœ“ |βœ“ | 1534|acc, acc_norm |\n","|hendrycksTest-professional_medicine | |βœ“ |βœ“ | 272|acc, acc_norm |\n","|hendrycksTest-professional_psychology | |βœ“ |βœ“ | 612|acc, acc_norm |\n","|hendrycksTest-public_relations | |βœ“ |βœ“ | 110|acc, acc_norm |\n","|hendrycksTest-security_studies | |βœ“ |βœ“ | 245|acc, acc_norm |\n","|hendrycksTest-sociology | |βœ“ |βœ“ | 201|acc, acc_norm |\n","|hendrycksTest-us_foreign_policy | |βœ“ |βœ“ | 100|acc, acc_norm |\n","|hendrycksTest-virology | |βœ“ |βœ“ | 166|acc, acc_norm |\n","|hendrycksTest-world_religions | |βœ“ |βœ“ | 171|acc, acc_norm |\n","|iwslt17-ar-en | | |βœ“ | 1460|bleu, chrf, ter |\n","|iwslt17-en-ar | | |βœ“ | 1460|bleu, chrf, ter |\n","|lambada_openai | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_cloze | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_mt_de | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_mt_en | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_mt_es | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_mt_fr | | |βœ“ | 5153|ppl, acc |\n","|lambada_openai_mt_it | | |βœ“ | 5153|ppl, acc |\n","|lambada_standard | |βœ“ |βœ“ | 5153|ppl, acc |\n","|lambada_standard_cloze | |βœ“ |βœ“ | 5153|ppl, acc |\n","|logiqa |βœ“ |βœ“ |βœ“ | 651|acc, acc_norm |\n","|math_algebra |βœ“ | |βœ“ | 1187|acc |\n","|math_asdiv | |βœ“ | | 2305|acc |\n","|math_counting_and_prob |βœ“ | |βœ“ | 474|acc |\n","|math_geometry |βœ“ | |βœ“ | 479|acc |\n","|math_intermediate_algebra |βœ“ | |βœ“ | 903|acc |\n","|math_num_theory |βœ“ | |βœ“ | 540|acc |\n","|math_prealgebra |βœ“ | |βœ“ | 871|acc |\n","|math_precalc |βœ“ | |βœ“ | 546|acc |\n","|mathqa |βœ“ |βœ“ |βœ“ | 2985|acc, acc_norm |\n","|mc_taco | |βœ“ |βœ“ | 9442|f1, em |\n","|mgsm_bn |βœ“ | |βœ“ | 250|acc |\n","|mgsm_de |βœ“ | |βœ“ | 250|acc |\n","|mgsm_en |βœ“ | |βœ“ | 250|acc |\n","|mgsm_es |βœ“ | |βœ“ | 250|acc |\n","|mgsm_fr |βœ“ | |βœ“ | 250|acc 
|\n","|mgsm_ja |βœ“ | |βœ“ | 250|acc |\n","|mgsm_ru |βœ“ | |βœ“ | 250|acc |\n","|mgsm_sw |βœ“ | |βœ“ | 250|acc |\n","|mgsm_te |βœ“ | |βœ“ | 250|acc |\n","|mgsm_th |βœ“ | |βœ“ | 250|acc |\n","|mgsm_zh |βœ“ | |βœ“ | 250|acc |\n","|mnli |βœ“ |βœ“ | | 9815|acc |\n","|mnli_mismatched |βœ“ |βœ“ | | 9832|acc |\n","|mrpc |βœ“ |βœ“ | | 408|acc, f1 |\n","|multirc |βœ“ |βœ“ | | 4848|acc |\n","|mutual |βœ“ |βœ“ | | 886|r@1, r@2, mrr |\n","|mutual_plus |βœ“ |βœ“ | | 886|r@1, r@2, mrr |\n","|openbookqa |βœ“ |βœ“ |βœ“ | 500|acc, acc_norm |\n","|pawsx_de |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_en |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_es |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_fr |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_ja |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_ko |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pawsx_zh |βœ“ |βœ“ |βœ“ | 2000|acc |\n","|pile_arxiv | |βœ“ |βœ“ | 2407|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_bookcorpus2 | |βœ“ |βœ“ | 28|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_books3 | |βœ“ |βœ“ | 269|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_dm-mathematics | |βœ“ |βœ“ | 1922|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_enron | |βœ“ |βœ“ | 1010|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_europarl | |βœ“ |βœ“ | 157|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_freelaw | |βœ“ |βœ“ | 5101|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_github | |βœ“ |βœ“ | 18195|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_gutenberg | |βœ“ |βœ“ | 80|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_hackernews | |βœ“ |βœ“ | 1632|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_nih-exporter | |βœ“ |βœ“ | 1884|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_opensubtitles | |βœ“ |βœ“ | 642|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_openwebtext2 | |βœ“ |βœ“ | 32925|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_philpapers 
| |βœ“ |βœ“ | 68|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_pile-cc | |βœ“ |βœ“ | 52790|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_pubmed-abstracts | |βœ“ |βœ“ | 29895|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_pubmed-central | |βœ“ |βœ“ | 5911|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_stackexchange | |βœ“ |βœ“ | 30378|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_ubuntu-irc | |βœ“ |βœ“ | 22|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_uspto | |βœ“ |βœ“ | 11415|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_wikipedia | |βœ“ |βœ“ | 17511|word_perplexity, byte_perplexity, bits_per_byte |\n","|pile_youtubesubtitles | |βœ“ |βœ“ | 342|word_perplexity, byte_perplexity, bits_per_byte |\n","|piqa |βœ“ |βœ“ | | 1838|acc, acc_norm |\n","|prost | | |βœ“ | 18736|acc, acc_norm |\n","|pubmedqa | | |βœ“ | 1000|acc |\n","|qa4mre_2011 | | |βœ“ | 120|acc, acc_norm |\n","|qa4mre_2012 | | |βœ“ | 160|acc, acc_norm |\n","|qa4mre_2013 | | |βœ“ | 284|acc, acc_norm |\n","|qasper |βœ“ |βœ“ | | 1764|f1_yesno, f1_abstractive |\n","|qnli |βœ“ |βœ“ | | 5463|acc |\n","|qqp |βœ“ |βœ“ | | 40430|acc, f1 |\n","|race |βœ“ |βœ“ |βœ“ | 1045|acc |\n","|random_insertion | |βœ“ | | 10000|acc |\n","|record |βœ“ |βœ“ | | 10000|f1, em |\n","|reversed_words | |βœ“ | | 10000|acc |\n","|rte |βœ“ |βœ“ | | 277|acc |\n","|sciq |βœ“ |βœ“ |βœ“ | 1000|acc, acc_norm |\n","|scrolls_contractnli |βœ“ |βœ“ | | 1037|em, acc, acc_norm |\n","|scrolls_govreport |βœ“ |βœ“ | | 972|rouge1, rouge2, rougeL |\n","|scrolls_narrativeqa |βœ“ |βœ“ | | 3425|f1 |\n","|scrolls_qasper |βœ“ |βœ“ | | 984|f1 |\n","|scrolls_qmsum |βœ“ |βœ“ | | 272|rouge1, rouge2, rougeL |\n","|scrolls_quality |βœ“ |βœ“ | | 2086|em, acc, acc_norm |\n","|scrolls_summscreenfd |βœ“ |βœ“ | | 338|rouge1, rouge2, rougeL |\n","|squad2 |βœ“ |βœ“ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 |\n","|sst |βœ“ |βœ“ | | 
872|acc |\n","|swag |βœ“ |βœ“ | | 20006|acc, acc_norm |\n","|toxigen |βœ“ | |βœ“ | 940|acc, acc_norm |\n","|triviaqa |βœ“ |βœ“ | | 11313|acc |\n","|truthfulqa_gen | |βœ“ | | 817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff|\n","|truthfulqa_mc | |βœ“ | | 817|mc1, mc2 |\n","|webqs |βœ“ | |βœ“ | 2032|acc |\n","|wic |βœ“ |βœ“ | | 638|acc |\n","|wikitext |βœ“ |βœ“ |βœ“ | 62|word_perplexity, byte_perplexity, bits_per_byte |\n","|winogrande |βœ“ |βœ“ | | 1267|acc |\n","|wmt14-en-fr | | |βœ“ | 3003|bleu, chrf, ter |\n","|wmt14-fr-en | | |βœ“ | 3003|bleu, chrf, ter |\n","|wmt16-de-en | | |βœ“ | 2999|bleu, chrf, ter |\n","|wmt16-en-de | | |βœ“ | 2999|bleu, chrf, ter |\n","|wmt16-en-ro | | |βœ“ | 1999|bleu, chrf, ter |\n","|wmt16-ro-en | | |βœ“ | 1999|bleu, chrf, ter |\n","|wmt20-cs-en | | |βœ“ | 664|bleu, chrf, ter |\n","|wmt20-de-en | | |βœ“ | 785|bleu, chrf, ter |\n","|wmt20-de-fr | | |βœ“ | 1619|bleu, chrf, ter |\n","|wmt20-en-cs | | |βœ“ | 1418|bleu, chrf, ter |\n","|wmt20-en-de | | |βœ“ | 1418|bleu, chrf, ter |\n","|wmt20-en-iu | | |βœ“ | 2971|bleu, chrf, ter |\n","|wmt20-en-ja | | |βœ“ | 1000|bleu, chrf, ter |\n","|wmt20-en-km | | |βœ“ | 2320|bleu, chrf, ter |\n","|wmt20-en-pl | | |βœ“ | 1000|bleu, chrf, ter |\n","|wmt20-en-ps | | |βœ“ | 2719|bleu, chrf, ter |\n","|wmt20-en-ru | | |βœ“ | 2002|bleu, chrf, ter |\n","|wmt20-en-ta | | |βœ“ | 1000|bleu, chrf, ter |\n","|wmt20-en-zh | | |βœ“ | 1418|bleu, chrf, ter |\n","|wmt20-fr-de | | |βœ“ | 1619|bleu, chrf, ter |\n","|wmt20-iu-en | | |βœ“ | 2971|bleu, chrf, ter |\n","|wmt20-ja-en | | |βœ“ | 993|bleu, chrf, ter |\n","|wmt20-km-en | | |βœ“ | 2320|bleu, chrf, ter |\n","|wmt20-pl-en | | |βœ“ | 1001|bleu, chrf, ter |\n","|wmt20-ps-en | | |βœ“ | 2719|bleu, chrf, ter |\n","|wmt20-ru-en | | |βœ“ | 991|bleu, chrf, ter |\n","|wmt20-ta-en | | |βœ“ | 997|bleu, chrf, ter |\n","|wmt20-zh-en | | |βœ“ | 
2000|bleu, chrf, ter |\n","|wnli |βœ“ |βœ“ | | 71|acc |\n","|wsc |βœ“ |βœ“ | | 104|acc |\n","|wsc273 | | |βœ“ | 273|acc |\n","|xcopa_et | |βœ“ |βœ“ | 500|acc |\n","|xcopa_ht | |βœ“ |βœ“ | 500|acc |\n","|xcopa_id | |βœ“ |βœ“ | 500|acc |\n","|xcopa_it | |βœ“ |βœ“ | 500|acc |\n","|xcopa_qu | |βœ“ |βœ“ | 500|acc |\n","|xcopa_sw | |βœ“ |βœ“ | 500|acc |\n","|xcopa_ta | |βœ“ |βœ“ | 500|acc |\n","|xcopa_th | |βœ“ |βœ“ | 500|acc |\n","|xcopa_tr | |βœ“ |βœ“ | 500|acc |\n","|xcopa_vi | |βœ“ |βœ“ | 500|acc |\n","|xcopa_zh | |βœ“ |βœ“ | 500|acc |\n","|xnli_ar |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_bg |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_de |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_el |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_en |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_es |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_fr |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_hi |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_ru |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_sw |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_th |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_tr |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_ur |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_vi |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xnli_zh |βœ“ |βœ“ |βœ“ | 5010|acc |\n","|xstory_cloze_ar |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_en |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_es |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_eu |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_hi |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_id |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_my |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_ru |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_sw |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_te |βœ“ |βœ“ | | 1511|acc |\n","|xstory_cloze_zh |βœ“ |βœ“ | | 1511|acc |\n","|xwinograd_en | | |βœ“ | 2325|acc |\n","|xwinograd_fr | | |βœ“ | 83|acc |\n","|xwinograd_jp | | |βœ“ | 959|acc |\n","|xwinograd_pt | | |βœ“ | 263|acc |\n","|xwinograd_ru | | |βœ“ | 315|acc |\n","|xwinograd_zh | | |βœ“ | 504|acc |\n","| Ceval-valid-computer_network | | βœ“ | | 19 | acc |\n","| Ceval-valid-operating_system | | βœ“ | | 
19 | acc |\n","| Ceval-valid-computer_architecture | | βœ“ | | 21 | acc |\n","| Ceval-valid-college_programming | | βœ“ | | 37 | acc |\n","| Ceval-valid-college_physics | | βœ“ | | 19 | acc |\n","| Ceval-valid-college_chemistry | | βœ“ | | 24 | acc |\n","| Ceval-valid-advanced_mathematics | | βœ“ | | 19 | acc |\n","| Ceval-valid-probability_and_statistics | | βœ“ | | 18 | acc |\n","| Ceval-valid-discrete_mathematics | | βœ“ | | 16 | acc |\n","| Ceval-valid-electrical_engineer | | βœ“ | | 37 | acc |\n","| Ceval-valid-metrology_engineer | | βœ“ | | 24 | acc |\n","| Ceval-valid-high_school_mathematics | | βœ“ | | 18 | acc |\n","| Ceval-valid-high_school_physics | | βœ“ | | 19 | acc |\n","| Ceval-valid-high_school_chemistry | | βœ“ | | 19 | acc |\n","| Ceval-valid-high_school_biology | | βœ“ | | 19 | acc |\n","| Ceval-valid-middle_school_mathematics | | βœ“ | | 19 | acc |\n","| Ceval-valid-middle_school_biology | | βœ“ | | 21 | acc |\n","| Ceval-valid-middle_school_physics | | βœ“ | | 19 | acc |\n","| Ceval-valid-middle_school_chemistry | | βœ“ | | 20 | acc |\n","| Ceval-valid-veterinary_medicine | | βœ“ | | 23 | acc |\n","| Ceval-valid-college_economics | | βœ“ | | 55 | acc |\n","| Ceval-valid-business_administration | | βœ“ | | 33 | acc |\n","| Ceval-valid-marxism | | βœ“ | | 19 | acc |\n","| Ceval-valid-mao_zedong_thought | | βœ“ | | 24 | acc |\n","| Ceval-valid-education_science | | βœ“ | | 29 | acc |\n","| Ceval-valid-teacher_qualification | | βœ“ | | 44 | acc |\n","| Ceval-valid-high_school_politics | | βœ“ | | 19 | acc |\n","| Ceval-valid-high_school_geography | | βœ“ | | 19 | acc |\n","| Ceval-valid-middle_school_politics | | βœ“ | | 21 | acc |\n","| Ceval-valid-middle_school_geography | | βœ“ | | 12 | acc |\n","| Ceval-valid-modern_chinese_history | | βœ“ | | 23 | acc |\n","| Ceval-valid-ideological_and_moral_cultivation | | βœ“ | | 19 | acc |\n","| Ceval-valid-logic | | βœ“ | | 22 | acc |\n","| Ceval-valid-law | | βœ“ | | 24 | acc |\n","| 
Ceval-valid-chinese_language_and_literature | | βœ“ | | 23 | acc |\n","| Ceval-valid-art_studies | | βœ“ | | 33 | acc |\n","| Ceval-valid-professional_tour_guide | | βœ“ | | 29 | acc |\n","| Ceval-valid-legal_professional | | βœ“ | | 23 | acc |\n","| Ceval-valid-high_school_chinese | | βœ“ | | 19 | acc |\n","| Ceval-valid-high_school_history | | βœ“ | | 20 | acc |\n","| Ceval-valid-middle_school_history | | βœ“ | | 22 | acc |\n","| Ceval-valid-civil_servant | | βœ“ | | 47 | acc |\n","| Ceval-valid-sports_science | | βœ“ | | 19 | acc |\n","| Ceval-valid-plant_protection | | βœ“ | | 22 | acc |\n","| Ceval-valid-basic_medicine | | βœ“ | | 19 | acc |\n","| Ceval-valid-clinical_medicine | | βœ“ | | 22 | acc |\n","| Ceval-valid-urban_and_rural_planner | | βœ“ | | 46 | acc |\n","| Ceval-valid-accountant | | βœ“ | | 49 | acc |\n","| Ceval-valid-fire_engineer | | βœ“ | | 31 | acc |\n","| Ceval-valid-environmental_impact_assessment_engineer | | βœ“ | | 31 | acc |\n","| Ceval-valid-tax_accountant | | βœ“ | | 49 | acc |\n","| Ceval-valid-physician | | βœ“ | | 49 | acc |"]}],"metadata":{"accelerator":"GPU","colab":{"authorship_tag":"ABX9TyOYjJFbDr/lKnnIcv2j6MLc","gpuType":"T4","machine_shape":"hm","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}