File size: 10,526 Bytes
1bcb202 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.351
anli_dev_r1,MNLI crowdsource,accuracy,0.334
anli_dev_r1,can we infer,accuracy,0.351
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.288
anli_dev_r1,justified in saying,accuracy,0.345
anli_dev_r1,median,accuracy,0.345
anli_dev_r2,GPT-3 style,accuracy,0.339
anli_dev_r2,MNLI crowdsource,accuracy,0.335
anli_dev_r2,can we infer,accuracy,0.354
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.297
anli_dev_r2,justified in saying,accuracy,0.345
anli_dev_r2,median,accuracy,0.339
anli_dev_r3,GPT-3 style,accuracy,0.37583333333333335
anli_dev_r3,MNLI crowdsource,accuracy,0.3408333333333333
anli_dev_r3,can we infer,accuracy,0.36333333333333334
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.31083333333333335
anli_dev_r3,justified in saying,accuracy,0.34
anli_dev_r3,median,accuracy,0.3408333333333333
story_cloze_2016,Answer Given options,accuracy,0.8305718866916088
story_cloze_2016,Choose Story Ending,accuracy,0.8706574024585783
story_cloze_2016,Generate Ending,accuracy,0.7183324425440941
story_cloze_2016,Novel Correct Ending,accuracy,0.848743987172635
story_cloze_2016,Story Continuation and Options,accuracy,0.8466060929983966
story_cloze_2016,median,accuracy,0.8466060929983966
super_glue_cb,GPT-3 style,accuracy,0.625
super_glue_cb,MNLI crowdsource,accuracy,0.08928571428571429
super_glue_cb,can we infer,accuracy,0.5892857142857143
super_glue_cb,guaranteed/possible/impossible,accuracy,0.5
super_glue_cb,justified in saying,accuracy,0.5357142857142857
super_glue_cb,median,accuracy,0.5357142857142857
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
super_glue_copa,best_option,accuracy,0.67
super_glue_copa,cause_effect,accuracy,0.78
super_glue_copa,i_am_hesitating,accuracy,0.8
super_glue_copa,plausible_alternatives,accuracy,0.81
super_glue_copa,median,accuracy,0.78
super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
super_glue_rte,MNLI crowdsource,accuracy,0.7220216606498195
super_glue_rte,does it follow that,accuracy,0.6678700361010831
super_glue_rte,guaranteed true,accuracy,0.6714801444043321
super_glue_rte,should assume,accuracy,0.6678700361010831
super_glue_rte,median,accuracy,0.6714801444043321
winogrande_winogrande_xl,Replace,accuracy,0.5406471981057617
winogrande_winogrande_xl,True or False,accuracy,0.5074980268350434
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5177584846093133
winogrande_winogrande_xl,stand for,accuracy,0.510655090765588
winogrande_winogrande_xl,underscore refer to,accuracy,0.5256511444356748
winogrande_winogrande_xl,median,accuracy,0.5177584846093133
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.47
xcopa_id,best_option,accuracy,0.51
xcopa_id,cause_effect,accuracy,0.65
xcopa_id,i_am_hesitating,accuracy,0.66
xcopa_id,plausible_alternatives,accuracy,0.67
xcopa_id,median,accuracy,0.65
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_sw,best_option,accuracy,0.57
xcopa_sw,cause_effect,accuracy,0.46
xcopa_sw,i_am_hesitating,accuracy,0.48
xcopa_sw,plausible_alternatives,accuracy,0.45
xcopa_sw,median,accuracy,0.48
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_ta,best_option,accuracy,0.67
xcopa_ta,cause_effect,accuracy,0.71
xcopa_ta,i_am_hesitating,accuracy,0.71
xcopa_ta,plausible_alternatives,accuracy,0.69
xcopa_ta,median,accuracy,0.69
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.61
xcopa_vi,cause_effect,accuracy,0.67
xcopa_vi,i_am_hesitating,accuracy,0.66
xcopa_vi,plausible_alternatives,accuracy,0.65
xcopa_vi,median,accuracy,0.65
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.62
xcopa_zh,best_option,accuracy,0.61
xcopa_zh,cause_effect,accuracy,0.77
xcopa_zh,i_am_hesitating,accuracy,0.72
xcopa_zh,plausible_alternatives,accuracy,0.74
xcopa_zh,median,accuracy,0.72
xnli_ar,GPT-3 style,accuracy,0.5040160642570282
xnli_ar,MNLI crowdsource,accuracy,0.39879518072289155
xnli_ar,can we infer,accuracy,0.506425702811245
xnli_ar,guaranteed/possible/impossible,accuracy,0.4799196787148594
xnli_ar,justified in saying,accuracy,0.41526104417670684
xnli_ar,median,accuracy,0.4799196787148594
xnli_en,GPT-3 style,accuracy,0.5590361445783133
xnli_en,MNLI crowdsource,accuracy,0.342570281124498
xnli_en,can we infer,accuracy,0.5449799196787148
xnli_en,guaranteed/possible/impossible,accuracy,0.41164658634538154
xnli_en,justified in saying,accuracy,0.4634538152610442
xnli_en,median,accuracy,0.4634538152610442
xnli_es,GPT-3 style,accuracy,0.5373493975903615
xnli_es,MNLI crowdsource,accuracy,0.40441767068273093
xnli_es,can we infer,accuracy,0.5277108433734939
xnli_es,guaranteed/possible/impossible,accuracy,0.44216867469879517
xnli_es,justified in saying,accuracy,0.4534136546184739
xnli_es,median,accuracy,0.4534136546184739
xnli_fr,GPT-3 style,accuracy,0.5248995983935743
xnli_fr,MNLI crowdsource,accuracy,0.3895582329317269
xnli_fr,can we infer,accuracy,0.5337349397590362
xnli_fr,guaranteed/possible/impossible,accuracy,0.42971887550200805
xnli_fr,justified in saying,accuracy,0.4738955823293173
xnli_fr,median,accuracy,0.4738955823293173
xnli_hi,GPT-3 style,accuracy,0.4983935742971888
xnli_hi,MNLI crowdsource,accuracy,0.38714859437751004
xnli_hi,can we infer,accuracy,0.45542168674698796
xnli_hi,guaranteed/possible/impossible,accuracy,0.41405622489959837
xnli_hi,justified in saying,accuracy,0.38795180722891565
xnli_hi,median,accuracy,0.41405622489959837
xnli_sw,GPT-3 style,accuracy,0.43493975903614457
xnli_sw,MNLI crowdsource,accuracy,0.363855421686747
xnli_sw,can we infer,accuracy,0.42891566265060244
xnli_sw,guaranteed/possible/impossible,accuracy,0.3457831325301205
xnli_sw,justified in saying,accuracy,0.3650602409638554
xnli_sw,median,accuracy,0.3650602409638554
xnli_ur,GPT-3 style,accuracy,0.43493975903614457
xnli_ur,MNLI crowdsource,accuracy,0.3895582329317269
xnli_ur,can we infer,accuracy,0.45180722891566266
xnli_ur,guaranteed/possible/impossible,accuracy,0.40120481927710844
xnli_ur,justified in saying,accuracy,0.37630522088353413
xnli_ur,median,accuracy,0.40120481927710844
xnli_vi,GPT-3 style,accuracy,0.5196787148594377
xnli_vi,MNLI crowdsource,accuracy,0.38112449799196785
xnli_vi,can we infer,accuracy,0.5080321285140562
xnli_vi,guaranteed/possible/impossible,accuracy,0.38393574297188754
xnli_vi,justified in saying,accuracy,0.43614457831325304
xnli_vi,median,accuracy,0.43614457831325304
xnli_zh,GPT-3 style,accuracy,0.5052208835341365
xnli_zh,MNLI crowdsource,accuracy,0.4
xnli_zh,can we infer,accuracy,0.5228915662650603
xnli_zh,guaranteed/possible/impossible,accuracy,0.4738955823293173
xnli_zh,justified in saying,accuracy,0.45863453815261046
xnli_zh,median,accuracy,0.4738955823293173
xstory_cloze_ar,Answer Given options,accuracy,0.7518199867637326
xstory_cloze_ar,Choose Story Ending,accuracy,0.7749834546657842
xstory_cloze_ar,Generate Ending,accuracy,0.586366644606221
xstory_cloze_ar,Novel Correct Ending,accuracy,0.7518199867637326
xstory_cloze_ar,Story Continuation and Options,accuracy,0.7438782263401721
xstory_cloze_ar,median,accuracy,0.7518199867637326
xstory_cloze_es,Answer Given options,accuracy,0.7835870284579749
xstory_cloze_es,Choose Story Ending,accuracy,0.8292521508934481
xstory_cloze_es,Generate Ending,accuracy,0.6399735274652548
xstory_cloze_es,Novel Correct Ending,accuracy,0.7935142289874255
xstory_cloze_es,Story Continuation and Options,accuracy,0.7888815354070152
xstory_cloze_es,median,accuracy,0.7888815354070152
xstory_cloze_eu,Answer Given options,accuracy,0.7041694242223693
xstory_cloze_eu,Choose Story Ending,accuracy,0.6823295830575777
xstory_cloze_eu,Generate Ending,accuracy,0.5625413633355394
xstory_cloze_eu,Novel Correct Ending,accuracy,0.6671078755790867
xstory_cloze_eu,Story Continuation and Options,accuracy,0.671740569159497
xstory_cloze_eu,median,accuracy,0.671740569159497
xstory_cloze_hi,Answer Given options,accuracy,0.6915949702183984
xstory_cloze_hi,Choose Story Ending,accuracy,0.7220383851753805
xstory_cloze_hi,Generate Ending,accuracy,0.5883520847121112
xstory_cloze_hi,Novel Correct Ending,accuracy,0.6743878226340172
xstory_cloze_hi,Story Continuation and Options,accuracy,0.6816677696889477
xstory_cloze_hi,median,accuracy,0.6816677696889477
xstory_cloze_id,Answer Given options,accuracy,0.7445400397088021
xstory_cloze_id,Choose Story Ending,accuracy,0.771012574454004
xstory_cloze_id,Generate Ending,accuracy,0.6029119788219722
xstory_cloze_id,Novel Correct Ending,accuracy,0.7485109199205824
xstory_cloze_id,Story Continuation and Options,accuracy,0.7438782263401721
xstory_cloze_id,median,accuracy,0.7445400397088021
xstory_cloze_zh,Answer Given options,accuracy,0.7610853739245532
xstory_cloze_zh,Choose Story Ending,accuracy,0.7961614824619457
xstory_cloze_zh,Generate Ending,accuracy,0.6214427531436135
xstory_cloze_zh,Novel Correct Ending,accuracy,0.7696889477167439
xstory_cloze_zh,Story Continuation and Options,accuracy,0.7670416942422237
xstory_cloze_zh,median,accuracy,0.7670416942422237
xwinograd_en,Replace,accuracy,0.5225806451612903
xwinograd_en,True or False,accuracy,0.48946236559139783
xwinograd_en,does underscore refer to,accuracy,0.5281720430107527
xwinograd_en,stand for,accuracy,0.5062365591397849
xwinograd_en,underscore refer to,accuracy,0.5372043010752688
xwinograd_en,median,accuracy,0.5225806451612903
xwinograd_fr,Replace,accuracy,0.5060240963855421
xwinograd_fr,True or False,accuracy,0.5421686746987951
xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
xwinograd_fr,stand for,accuracy,0.4819277108433735
xwinograd_fr,underscore refer to,accuracy,0.5301204819277109
xwinograd_fr,median,accuracy,0.5301204819277109
xwinograd_pt,Replace,accuracy,0.5133079847908745
xwinograd_pt,True or False,accuracy,0.4714828897338403
xwinograd_pt,does underscore refer to,accuracy,0.5209125475285171
xwinograd_pt,stand for,accuracy,0.5019011406844106
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
xwinograd_pt,median,accuracy,0.5133079847908745
xwinograd_zh,Replace,accuracy,0.5257936507936508
xwinograd_zh,True or False,accuracy,0.5297619047619048
xwinograd_zh,does underscore refer to,accuracy,0.5218253968253969
xwinograd_zh,stand for,accuracy,0.4444444444444444
xwinograd_zh,underscore refer to,accuracy,0.5198412698412699
xwinograd_zh,median,accuracy,0.5218253968253969
multiple,average,multiple,0.5631550819200618
|