File size: 10,503 Bytes
f2124a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.486
anli_dev_r1,MNLI crowdsource,accuracy,0.427
anli_dev_r1,can we infer,accuracy,0.474
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.39
anli_dev_r1,justified in saying,accuracy,0.46
anli_dev_r1,median,accuracy,0.46
anli_dev_r2,GPT-3 style,accuracy,0.441
anli_dev_r2,MNLI crowdsource,accuracy,0.406
anli_dev_r2,can we infer,accuracy,0.426
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.36
anli_dev_r2,justified in saying,accuracy,0.419
anli_dev_r2,median,accuracy,0.419
anli_dev_r3,GPT-3 style,accuracy,0.455
anli_dev_r3,MNLI crowdsource,accuracy,0.42
anli_dev_r3,can we infer,accuracy,0.445
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.32083333333333336
anli_dev_r3,justified in saying,accuracy,0.4266666666666667
anli_dev_r3,median,accuracy,0.4266666666666667
story_cloze_2016,Answer Given options,accuracy,0.9567076429716729
story_cloze_2016,Choose Story Ending,accuracy,0.9625868519508284
story_cloze_2016,Generate Ending,accuracy,0.7814003206841261
story_cloze_2016,Novel Correct Ending,accuracy,0.9577765900587921
story_cloze_2016,Story Continuation and Options,accuracy,0.951362907536077
story_cloze_2016,median,accuracy,0.9567076429716729
super_glue_cb,GPT-3 style,accuracy,0.8214285714285714
super_glue_cb,MNLI crowdsource,accuracy,0.375
super_glue_cb,can we infer,accuracy,0.8214285714285714
super_glue_cb,guaranteed/possible/impossible,accuracy,0.7321428571428571
super_glue_cb,justified in saying,accuracy,0.7678571428571429
super_glue_cb,median,accuracy,0.7678571428571429
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.62
super_glue_copa,best_option,accuracy,0.87
super_glue_copa,cause_effect,accuracy,0.88
super_glue_copa,i_am_hesitating,accuracy,0.91
super_glue_copa,plausible_alternatives,accuracy,0.88
super_glue_copa,median,accuracy,0.88
super_glue_rte,GPT-3 style,accuracy,0.8303249097472925
super_glue_rte,MNLI crowdsource,accuracy,0.855595667870036
super_glue_rte,does it follow that,accuracy,0.7833935018050542
super_glue_rte,guaranteed true,accuracy,0.8122743682310469
super_glue_rte,should assume,accuracy,0.8194945848375451
super_glue_rte,median,accuracy,0.8194945848375451
winogrande_winogrande_xl,Replace,accuracy,0.584846093133386
winogrande_winogrande_xl,True or False,accuracy,0.5217048145224941
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5840568271507498
winogrande_winogrande_xl,stand for,accuracy,0.5114443567482242
winogrande_winogrande_xl,underscore refer to,accuracy,0.5927387529597474
winogrande_winogrande_xl,median,accuracy,0.5840568271507498
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_id,best_option,accuracy,0.78
xcopa_id,cause_effect,accuracy,0.86
xcopa_id,i_am_hesitating,accuracy,0.79
xcopa_id,plausible_alternatives,accuracy,0.84
xcopa_id,median,accuracy,0.79
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_sw,best_option,accuracy,0.6
xcopa_sw,cause_effect,accuracy,0.6
xcopa_sw,i_am_hesitating,accuracy,0.64
xcopa_sw,plausible_alternatives,accuracy,0.62
xcopa_sw,median,accuracy,0.6
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_ta,best_option,accuracy,0.67
xcopa_ta,cause_effect,accuracy,0.67
xcopa_ta,i_am_hesitating,accuracy,0.68
xcopa_ta,plausible_alternatives,accuracy,0.69
xcopa_ta,median,accuracy,0.67
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.83
xcopa_vi,cause_effect,accuracy,0.87
xcopa_vi,i_am_hesitating,accuracy,0.84
xcopa_vi,plausible_alternatives,accuracy,0.86
xcopa_vi,median,accuracy,0.84
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_zh,best_option,accuracy,0.83
xcopa_zh,cause_effect,accuracy,0.9
xcopa_zh,i_am_hesitating,accuracy,0.9
xcopa_zh,plausible_alternatives,accuracy,0.86
xcopa_zh,median,accuracy,0.86
xnli_ar,GPT-3 style,accuracy,0.5357429718875502
xnli_ar,MNLI crowdsource,accuracy,0.41004016064257026
xnli_ar,can we infer,accuracy,0.5606425702811245
xnli_ar,guaranteed/possible/impossible,accuracy,0.6068273092369478
xnli_ar,justified in saying,accuracy,0.5437751004016064
xnli_ar,median,accuracy,0.5437751004016064
xnli_en,GPT-3 style,accuracy,0.6168674698795181
xnli_en,MNLI crowdsource,accuracy,0.45502008032128516
xnli_en,can we infer,accuracy,0.6092369477911647
xnli_en,guaranteed/possible/impossible,accuracy,0.6746987951807228
xnli_en,justified in saying,accuracy,0.5895582329317269
xnli_en,median,accuracy,0.6092369477911647
xnli_es,GPT-3 style,accuracy,0.585140562248996
xnli_es,MNLI crowdsource,accuracy,0.4357429718875502
xnli_es,can we infer,accuracy,0.5883534136546185
xnli_es,guaranteed/possible/impossible,accuracy,0.6124497991967871
xnli_es,justified in saying,accuracy,0.5734939759036145
xnli_es,median,accuracy,0.585140562248996
xnli_fr,GPT-3 style,accuracy,0.5771084337349398
xnli_fr,MNLI crowdsource,accuracy,0.43012048192771085
xnli_fr,can we infer,accuracy,0.5807228915662651
xnli_fr,guaranteed/possible/impossible,accuracy,0.6136546184738956
xnli_fr,justified in saying,accuracy,0.5694779116465863
xnli_fr,median,accuracy,0.5771084337349398
xnli_hi,GPT-3 style,accuracy,0.5248995983935743
xnli_hi,MNLI crowdsource,accuracy,0.3795180722891566
xnli_hi,can we infer,accuracy,0.5506024096385542
xnli_hi,guaranteed/possible/impossible,accuracy,0.5682730923694779
xnli_hi,justified in saying,accuracy,0.5353413654618474
xnli_hi,median,accuracy,0.5353413654618474
xnli_sw,GPT-3 style,accuracy,0.4795180722891566
xnli_sw,MNLI crowdsource,accuracy,0.39196787148594375
xnli_sw,can we infer,accuracy,0.5208835341365462
xnli_sw,guaranteed/possible/impossible,accuracy,0.5036144578313253
xnli_sw,justified in saying,accuracy,0.5184738955823294
xnli_sw,median,accuracy,0.5036144578313253
xnli_ur,GPT-3 style,accuracy,0.46586345381526106
xnli_ur,MNLI crowdsource,accuracy,0.3718875502008032
xnli_ur,can we infer,accuracy,0.5080321285140562
xnli_ur,guaranteed/possible/impossible,accuracy,0.4995983935742972
xnli_ur,justified in saying,accuracy,0.5080321285140562
xnli_ur,median,accuracy,0.4995983935742972
xnli_vi,GPT-3 style,accuracy,0.5578313253012048
xnli_vi,MNLI crowdsource,accuracy,0.42449799196787147
xnli_vi,can we infer,accuracy,0.5678714859437751
xnli_vi,guaranteed/possible/impossible,accuracy,0.6100401606425703
xnli_vi,justified in saying,accuracy,0.5538152610441767
xnli_vi,median,accuracy,0.5578313253012048
xnli_zh,GPT-3 style,accuracy,0.5526104417670683
xnli_zh,MNLI crowdsource,accuracy,0.38473895582329315
xnli_zh,can we infer,accuracy,0.5690763052208835
xnli_zh,guaranteed/possible/impossible,accuracy,0.5674698795180723
xnli_zh,justified in saying,accuracy,0.5622489959839357
xnli_zh,median,accuracy,0.5622489959839357
xstory_cloze_ar,Answer Given options,accuracy,0.7968232958305758
xstory_cloze_ar,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_ar,Generate Ending,accuracy,0.6677696889477167
xstory_cloze_ar,Novel Correct Ending,accuracy,0.9265387160820648
xstory_cloze_ar,Story Continuation and Options,accuracy,0.9126406353408338
xstory_cloze_ar,median,accuracy,0.9126406353408338
xstory_cloze_es,Answer Given options,accuracy,0.8729318332230311
xstory_cloze_es,Choose Story Ending,accuracy,0.9417604235605559
xstory_cloze_es,Generate Ending,accuracy,0.7359364659166115
xstory_cloze_es,Novel Correct Ending,accuracy,0.9430840502978161
xstory_cloze_es,Story Continuation and Options,accuracy,0.9318332230311053
xstory_cloze_es,median,accuracy,0.9318332230311053
xstory_cloze_eu,Answer Given options,accuracy,0.7054930509596293
xstory_cloze_eu,Choose Story Ending,accuracy,0.8663136995367307
xstory_cloze_eu,Generate Ending,accuracy,0.6320317670416943
xstory_cloze_eu,Novel Correct Ending,accuracy,0.8689609530112509
xstory_cloze_eu,Story Continuation and Options,accuracy,0.8524156187954997
xstory_cloze_eu,median,accuracy,0.8524156187954997
xstory_cloze_hi,Answer Given options,accuracy,0.798808735936466
xstory_cloze_hi,Choose Story Ending,accuracy,0.8702845797485109
xstory_cloze_hi,Generate Ending,accuracy,0.6604897418927862
xstory_cloze_hi,Novel Correct Ending,accuracy,0.8788881535407015
xstory_cloze_hi,Story Continuation and Options,accuracy,0.870946393117141
xstory_cloze_hi,median,accuracy,0.8702845797485109
xstory_cloze_id,Answer Given options,accuracy,0.8557246856386499
xstory_cloze_id,Choose Story Ending,accuracy,0.9212442091330245
xstory_cloze_id,Generate Ending,accuracy,0.7041694242223693
xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
xstory_cloze_id,Story Continuation and Options,accuracy,0.9066843150231635
xstory_cloze_id,median,accuracy,0.9066843150231635
xstory_cloze_zh,Answer Given options,accuracy,0.900066181336863
xstory_cloze_zh,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_zh,Generate Ending,accuracy,0.684976836532098
xstory_cloze_zh,Novel Correct Ending,accuracy,0.9311714096624751
xstory_cloze_zh,Story Continuation and Options,accuracy,0.9199205823957644
xstory_cloze_zh,median,accuracy,0.9199205823957644
xwinograd_en,Replace,accuracy,0.6847311827956989
xwinograd_en,True or False,accuracy,0.5135483870967742
xwinograd_en,does underscore refer to,accuracy,0.6787096774193548
xwinograd_en,stand for,accuracy,0.5053763440860215
xwinograd_en,underscore refer to,accuracy,0.690752688172043
xwinograd_en,median,accuracy,0.6787096774193548
xwinograd_fr,Replace,accuracy,0.6506024096385542
xwinograd_fr,True or False,accuracy,0.4939759036144578
xwinograd_fr,does underscore refer to,accuracy,0.6867469879518072
xwinograd_fr,stand for,accuracy,0.46987951807228917
xwinograd_fr,underscore refer to,accuracy,0.6626506024096386
xwinograd_fr,median,accuracy,0.6506024096385542
xwinograd_pt,Replace,accuracy,0.6349809885931559
xwinograd_pt,True or False,accuracy,0.4866920152091255
xwinograd_pt,does underscore refer to,accuracy,0.6387832699619772
xwinograd_pt,stand for,accuracy,0.49429657794676807
xwinograd_pt,underscore refer to,accuracy,0.6425855513307985
xwinograd_pt,median,accuracy,0.6349809885931559
xwinograd_zh,Replace,accuracy,0.6865079365079365
xwinograd_zh,True or False,accuracy,0.5277777777777778
xwinograd_zh,does underscore refer to,accuracy,0.6884920634920635
xwinograd_zh,stand for,accuracy,0.4861111111111111
xwinograd_zh,underscore refer to,accuracy,0.6904761904761905
xwinograd_zh,median,accuracy,0.6865079365079365
multiple,average,multiple,0.6903830754158429
|