Muennighoff's picture
Add files
aa2424b
raw
history blame
No virus
13.9 kB
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5878
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.5492
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.4018
amazon_reviews_multi_en,median,accuracy,0.5492
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.53
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.5042
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.37
amazon_reviews_multi_es,median,accuracy,0.5042
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.5294
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.514
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3664
amazon_reviews_multi_fr,median,accuracy,0.514
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.503
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4882
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3656
amazon_reviews_multi_zh,median,accuracy,0.4882
aqua_rat_raw,Answer questions from options,accuracy,0.2283464566929134
aqua_rat_raw,answer_quiz,accuracy,0.23228346456692914
aqua_rat_raw,select_the_best_option,accuracy,0.2125984251968504
aqua_rat_raw,median,accuracy,0.2283464566929134
art_None,choose_hypothesis,accuracy,0.6527415143603134
art_None,choose_hypothesis_believable,accuracy,0.6586161879895561
art_None,choose_hypothesis_desc,accuracy,0.5718015665796344
art_None,choose_hypothesis_likely,accuracy,0.5535248041775457
art_None,choose_hypothesis_options,accuracy,0.6298955613577023
art_None,median,accuracy,0.6298955613577023
banking77_None,direct_to_which_department,accuracy,0.22077922077922077
banking77_None,help_page_topic,accuracy,0.3198051948051948
banking77_None,rephrase_as_banking_term,accuracy,0.31915584415584414
banking77_None,median,accuracy,0.31915584415584414
blbooksgenre_title_genre_classifiction,classify,accuracy,0.25172811059907835
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.5161290322580645
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7517281105990783
blbooksgenre_title_genre_classifiction,median,accuracy,0.5161290322580645
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.519
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.36
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.506
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.527
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.49
blimp_adjunct_island,median,accuracy,0.506
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.23908794788273616
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.43713355048859937
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.21172638436482086
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.4
climate_fever_None,third_evidence_claim_pair,accuracy,0.36351791530944627
climate_fever_None,median,accuracy,0.36351791530944627
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.7917867435158501
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7921469740634006
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.797550432276657
codah_codah,median,accuracy,0.7921469740634006
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6961506961506961
commonsense_qa_None,most_suitable_answer,accuracy,0.7583947583947583
commonsense_qa_None,question_answering,accuracy,0.7444717444717445
commonsense_qa_None,median,accuracy,0.7444717444717445
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.21444012105490703
conv_ai_3_None,score_how_much,accuracy,0.21444012105490703
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.5142378559463987
craigslist_bargains_None,good deal for seller,accuracy,0.2663316582914573
craigslist_bargains_None,good deal for seller no list price,accuracy,0.135678391959799
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.23785594639865998
craigslist_bargains_None,median,accuracy,0.2520938023450586
emotion_None,answer_question_with_emotion_label,accuracy,0.4005
emotion_None,answer_with_class_label,accuracy,0.3595
emotion_None,choose_the_best_emotion_label,accuracy,0.4435
emotion_None,reply_with_emoation_label,accuracy,0.469
emotion_None,median,accuracy,0.42200000000000004
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3365724381625442
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.09143109540636042
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.36130742049469966
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.36925795053003535
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.03931095406360424
financial_phrasebank_sentences_allagree,median,accuracy,0.3365724381625442
glue_cola,Following sentence acceptable,accuracy,0.4506232023010546
glue_cola,Make sense yes no,accuracy,0.3595397890699904
glue_cola,Previous sentence acceptable,accuracy,0.3096836049856184
glue_cola,editing,accuracy,0.3202301054650048
glue_cola,is_this_correct,accuracy,0.3288590604026846
glue_cola,median,accuracy,0.3288590604026846
glue_sst2,following positive negative,accuracy,0.9541284403669725
glue_sst2,happy or mad,accuracy,0.9197247706422018
glue_sst2,positive negative after,accuracy,0.9472477064220184
glue_sst2,review,accuracy,0.9541284403669725
glue_sst2,said,accuracy,0.8555045871559633
glue_sst2,median,accuracy,0.9472477064220184
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.3374816983894583
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.32723279648609077
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.47144948755490484
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.4538799414348463
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.45095168374816985
head_qa_en,median,accuracy,0.45095168374816985
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.32503660322108346
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.32210834553440704
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.4685212298682284
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.4289897510980966
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.4333821376281113
head_qa_es,median,accuracy,0.4289897510980966
health_fact_None,claim_explanation_classification,accuracy,0.49551020408163265
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.21551020408163266
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.05142857142857143
health_fact_None,median,accuracy,0.21551020408163266
hlgd_None,is_same_event_editor_asks,accuracy,0.6872885451909135
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6795553407443209
hlgd_None,is_same_event_refer,accuracy,0.8221362977283712
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.8144030932817786
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.8090865152247463
hlgd_None,median,accuracy,0.8090865152247463
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6170542635658914
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6248062015503876
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6186046511627907
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.627906976744186
hyperpartisan_news_detection_byarticle,median,accuracy,0.6248062015503876
liar_None,Given statement guess category,accuracy,0.20249221183800623
liar_None,median,accuracy,0.20249221183800623
lince_sa_spaeng,express sentiment,accuracy,0.5874125874125874
lince_sa_spaeng,negation template,accuracy,0.5879505110274341
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5901022054868209
lince_sa_spaeng,sentiment trying to express,accuracy,0.5809575040344271
lince_sa_spaeng,the author seem,accuracy,0.5911780527165142
lince_sa_spaeng,median,accuracy,0.5879505110274341
math_qa_None,choose_correct_og,accuracy,0.24489112227805696
math_qa_None,first_choice_then_problem,accuracy,0.2020100502512563
math_qa_None,gre_problem,accuracy,0.22981574539363483
math_qa_None,pick_the_correct,accuracy,0.23819095477386934
math_qa_None,problem_set_type,accuracy,0.4824120603015075
math_qa_None,median,accuracy,0.23819095477386934
mlsum_es,layman_summ_es,bleu,0.027240547697036887
mlsum_es,palm_prompt,bleu,0.03916653973278921
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.031798181363636026
mlsum_es,median,bleu,0.031798181363636026
movie_rationales_None,Evidences + review,accuracy,0.98
movie_rationales_None,Evidences sentiment classification,accuracy,0.995
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.97
movie_rationales_None,median,accuracy,0.98
mwsc_None,in-the-sentence,accuracy,0.7804878048780488
mwsc_None,in-the-sentence-question-first,accuracy,0.7195121951219512
mwsc_None,is-correct,accuracy,0.524390243902439
mwsc_None,options-or,accuracy,0.7926829268292683
mwsc_None,what-think,accuracy,0.8048780487804879
mwsc_None,median,accuracy,0.7804878048780488
onestop_english_None,ara_context,accuracy,0.6490299823633157
onestop_english_None,assess,accuracy,0.5802469135802469
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.6596119929453262
onestop_english_None,esl_context,accuracy,0.562610229276896
onestop_english_None,esl_variation,accuracy,0.6261022927689595
onestop_english_None,median,accuracy,0.6261022927689595
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.24761904761904763
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.26666666666666666
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.2571428571428571
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.22857142857142856
poem_sentiment_None,question_answer_format,accuracy,0.26666666666666666
poem_sentiment_None,median,accuracy,0.2571428571428571
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.698
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.741
pubmed_qa_pqa_labeled,median,accuracy,0.7195
riddle_sense_None,answer_given_question_without_options,accuracy,0.5337904015670911
riddle_sense_None,most_suitable_answer,accuracy,0.5288932419196866
riddle_sense_None,question_answering,accuracy,0.4975514201762977
riddle_sense_None,question_to_answer_index,accuracy,0.5239960822722821
riddle_sense_None,median,accuracy,0.5264446620959844
scicite_None,Classify intent,accuracy,0.3078602620087336
scicite_None,Classify intent (choices first),accuracy,0.16921397379912664
scicite_None,Classify intent (select choice),accuracy,0.3067685589519651
scicite_None,Classify intent w/section (select choice),accuracy,0.38427947598253276
scicite_None,can_describe,accuracy,0.34497816593886466
scicite_None,median,accuracy,0.3078602620087336
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.913375796178344
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9452229299363057
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.8267515923566879
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.9095541401273886
selqa_answer_selection_analysis,median,accuracy,0.9114649681528663
snips_built_in_intents_None,categorize_query,accuracy,0.7926829268292683
snips_built_in_intents_None,categorize_query_brief,accuracy,0.7195121951219512
snips_built_in_intents_None,intent_query,accuracy,0.4268292682926829
snips_built_in_intents_None,query_intent,accuracy,0.7408536585365854
snips_built_in_intents_None,voice_intent,accuracy,0.7652439024390244
snips_built_in_intents_None,median,accuracy,0.7408536585365854
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.05708641852963801
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.021803001724318146
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.13227227858954294
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.0376456058717039
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.013771806987654605
wmt14_fr_en_en-fr,median,bleu,0.0376456058717039
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.3568647218780612
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.35195872833563757
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.3255636093987592
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.3391775518002708
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.12924351243878487
wmt14_fr_en_fr-en,median,bleu,0.3391775518002708
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.0068217085836424505
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0020182870719270517
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.004838104299715251
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0045496025805613965
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.006476009503302996
wmt14_hi_en_en-hi,median,bleu,0.004838104299715251
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.2459406560241934
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.24756498535867513
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,0.2014199423501091
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.22687582168757378
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.04035937869116407
wmt14_hi_en_hi-en,median,bleu,0.22687582168757378
multiple,average,multiple,0.4720152103704419