File size: 13,875 Bytes
aa2424b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5878
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.5492
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.4018
amazon_reviews_multi_en,median,accuracy,0.5492
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.53
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.5042
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.37
amazon_reviews_multi_es,median,accuracy,0.5042
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.5294
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.514
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3664
amazon_reviews_multi_fr,median,accuracy,0.514
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.503
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4882
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3656
amazon_reviews_multi_zh,median,accuracy,0.4882
aqua_rat_raw,Answer questions from options,accuracy,0.2283464566929134
aqua_rat_raw,answer_quiz,accuracy,0.23228346456692914
aqua_rat_raw,select_the_best_option,accuracy,0.2125984251968504
aqua_rat_raw,median,accuracy,0.2283464566929134
art_None,choose_hypothesis,accuracy,0.6527415143603134
art_None,choose_hypothesis_believable,accuracy,0.6586161879895561
art_None,choose_hypothesis_desc,accuracy,0.5718015665796344
art_None,choose_hypothesis_likely,accuracy,0.5535248041775457
art_None,choose_hypothesis_options,accuracy,0.6298955613577023
art_None,median,accuracy,0.6298955613577023
banking77_None,direct_to_which_department,accuracy,0.22077922077922077
banking77_None,help_page_topic,accuracy,0.3198051948051948
banking77_None,rephrase_as_banking_term,accuracy,0.31915584415584414
banking77_None,median,accuracy,0.31915584415584414
blbooksgenre_title_genre_classifiction,classify,accuracy,0.25172811059907835
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.5161290322580645
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7517281105990783
blbooksgenre_title_genre_classifiction,median,accuracy,0.5161290322580645
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.519
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.36
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.506
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.527
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.49
blimp_adjunct_island,median,accuracy,0.506
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.23908794788273616
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.43713355048859937
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.21172638436482086
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.4
climate_fever_None,third_evidence_claim_pair,accuracy,0.36351791530944627
climate_fever_None,median,accuracy,0.36351791530944627
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.7917867435158501
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7921469740634006
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.797550432276657
codah_codah,median,accuracy,0.7921469740634006
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6961506961506961
commonsense_qa_None,most_suitable_answer,accuracy,0.7583947583947583
commonsense_qa_None,question_answering,accuracy,0.7444717444717445
commonsense_qa_None,median,accuracy,0.7444717444717445
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.21444012105490703
conv_ai_3_None,score_how_much,accuracy,0.21444012105490703
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.5142378559463987
craigslist_bargains_None,good deal for seller,accuracy,0.2663316582914573
craigslist_bargains_None,good deal for seller no list price,accuracy,0.135678391959799
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.23785594639865998
craigslist_bargains_None,median,accuracy,0.2520938023450586
emotion_None,answer_question_with_emotion_label,accuracy,0.4005
emotion_None,answer_with_class_label,accuracy,0.3595
emotion_None,choose_the_best_emotion_label,accuracy,0.4435
emotion_None,reply_with_emoation_label,accuracy,0.469
emotion_None,median,accuracy,0.42200000000000004
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3365724381625442
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.09143109540636042
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.36130742049469966
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.36925795053003535
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.03931095406360424
financial_phrasebank_sentences_allagree,median,accuracy,0.3365724381625442
glue_cola,Following sentence acceptable,accuracy,0.4506232023010546
glue_cola,Make sense yes no,accuracy,0.3595397890699904
glue_cola,Previous sentence acceptable,accuracy,0.3096836049856184
glue_cola,editing,accuracy,0.3202301054650048
glue_cola,is_this_correct,accuracy,0.3288590604026846
glue_cola,median,accuracy,0.3288590604026846
glue_sst2,following positive negative,accuracy,0.9541284403669725
glue_sst2,happy or mad,accuracy,0.9197247706422018
glue_sst2,positive negative after,accuracy,0.9472477064220184
glue_sst2,review,accuracy,0.9541284403669725
glue_sst2,said,accuracy,0.8555045871559633
glue_sst2,median,accuracy,0.9472477064220184
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.3374816983894583
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.32723279648609077
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.47144948755490484
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.4538799414348463
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.45095168374816985
head_qa_en,median,accuracy,0.45095168374816985
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.32503660322108346
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.32210834553440704
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.4685212298682284
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.4289897510980966
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.4333821376281113
head_qa_es,median,accuracy,0.4289897510980966
health_fact_None,claim_explanation_classification,accuracy,0.49551020408163265
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.21551020408163266
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.05142857142857143
health_fact_None,median,accuracy,0.21551020408163266
hlgd_None,is_same_event_editor_asks,accuracy,0.6872885451909135
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6795553407443209
hlgd_None,is_same_event_refer,accuracy,0.8221362977283712
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.8144030932817786
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.8090865152247463
hlgd_None,median,accuracy,0.8090865152247463
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6170542635658914
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6248062015503876
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6186046511627907
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.627906976744186
hyperpartisan_news_detection_byarticle,median,accuracy,0.6248062015503876
liar_None,Given statement guess category,accuracy,0.20249221183800623
liar_None,median,accuracy,0.20249221183800623
lince_sa_spaeng,express sentiment,accuracy,0.5874125874125874
lince_sa_spaeng,negation template,accuracy,0.5879505110274341
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5901022054868209
lince_sa_spaeng,sentiment trying to express,accuracy,0.5809575040344271
lince_sa_spaeng,the author seem,accuracy,0.5911780527165142
lince_sa_spaeng,median,accuracy,0.5879505110274341
math_qa_None,choose_correct_og,accuracy,0.24489112227805696
math_qa_None,first_choice_then_problem,accuracy,0.2020100502512563
math_qa_None,gre_problem,accuracy,0.22981574539363483
math_qa_None,pick_the_correct,accuracy,0.23819095477386934
math_qa_None,problem_set_type,accuracy,0.4824120603015075
math_qa_None,median,accuracy,0.23819095477386934
mlsum_es,layman_summ_es,bleu,0.027240547697036887
mlsum_es,palm_prompt,bleu,0.03916653973278921
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.031798181363636026
mlsum_es,median,bleu,0.031798181363636026
movie_rationales_None,Evidences + review,accuracy,0.98
movie_rationales_None,Evidences sentiment classification,accuracy,0.995
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.97
movie_rationales_None,median,accuracy,0.98
mwsc_None,in-the-sentence,accuracy,0.7804878048780488
mwsc_None,in-the-sentence-question-first,accuracy,0.7195121951219512
mwsc_None,is-correct,accuracy,0.524390243902439
mwsc_None,options-or,accuracy,0.7926829268292683
mwsc_None,what-think,accuracy,0.8048780487804879
mwsc_None,median,accuracy,0.7804878048780488
onestop_english_None,ara_context,accuracy,0.6490299823633157
onestop_english_None,assess,accuracy,0.5802469135802469
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.6596119929453262
onestop_english_None,esl_context,accuracy,0.562610229276896
onestop_english_None,esl_variation,accuracy,0.6261022927689595
onestop_english_None,median,accuracy,0.6261022927689595
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.24761904761904763
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.26666666666666666
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.2571428571428571
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.22857142857142856
poem_sentiment_None,question_answer_format,accuracy,0.26666666666666666
poem_sentiment_None,median,accuracy,0.2571428571428571
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.698
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.741
pubmed_qa_pqa_labeled,median,accuracy,0.7195
riddle_sense_None,answer_given_question_without_options,accuracy,0.5337904015670911
riddle_sense_None,most_suitable_answer,accuracy,0.5288932419196866
riddle_sense_None,question_answering,accuracy,0.4975514201762977
riddle_sense_None,question_to_answer_index,accuracy,0.5239960822722821
riddle_sense_None,median,accuracy,0.5264446620959844
scicite_None,Classify intent,accuracy,0.3078602620087336
scicite_None,Classify intent (choices first),accuracy,0.16921397379912664
scicite_None,Classify intent (select choice),accuracy,0.3067685589519651
scicite_None,Classify intent w/section (select choice),accuracy,0.38427947598253276
scicite_None,can_describe,accuracy,0.34497816593886466
scicite_None,median,accuracy,0.3078602620087336
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.913375796178344
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9452229299363057
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.8267515923566879
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.9095541401273886
selqa_answer_selection_analysis,median,accuracy,0.9114649681528663
snips_built_in_intents_None,categorize_query,accuracy,0.7926829268292683
snips_built_in_intents_None,categorize_query_brief,accuracy,0.7195121951219512
snips_built_in_intents_None,intent_query,accuracy,0.4268292682926829
snips_built_in_intents_None,query_intent,accuracy,0.7408536585365854
snips_built_in_intents_None,voice_intent,accuracy,0.7652439024390244
snips_built_in_intents_None,median,accuracy,0.7408536585365854
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.05708641852963801
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.021803001724318146
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.13227227858954294
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.0376456058717039
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.013771806987654605
wmt14_fr_en_en-fr,median,bleu,0.0376456058717039
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.3568647218780612
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.35195872833563757
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.3255636093987592
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.3391775518002708
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.12924351243878487
wmt14_fr_en_fr-en,median,bleu,0.3391775518002708
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.0068217085836424505
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0020182870719270517
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.004838104299715251
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0045496025805613965
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.006476009503302996
wmt14_hi_en_en-hi,median,bleu,0.004838104299715251
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.2459406560241934
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.24756498535867513
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,0.2014199423501091
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.22687582168757378
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.04035937869116407
wmt14_hi_en_hi-en,median,bleu,0.22687582168757378
multiple,average,multiple,0.4720152103704419