|
dataset,prompt,metric,value
|
|
anli_dev_r1,GPT-3 style,accuracy,0.351
|
|
anli_dev_r1,MNLI crowdsource,accuracy,0.334
|
|
anli_dev_r1,can we infer,accuracy,0.351
|
|
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.288
|
|
anli_dev_r1,justified in saying,accuracy,0.345
|
|
anli_dev_r1,median,accuracy,0.345
|
|
anli_dev_r2,GPT-3 style,accuracy,0.339
|
|
anli_dev_r2,MNLI crowdsource,accuracy,0.335
|
|
anli_dev_r2,can we infer,accuracy,0.354
|
|
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.297
|
|
anli_dev_r2,justified in saying,accuracy,0.345
|
|
anli_dev_r2,median,accuracy,0.339
|
|
anli_dev_r3,GPT-3 style,accuracy,0.37583333333333335
|
|
anli_dev_r3,MNLI crowdsource,accuracy,0.3408333333333333
|
|
anli_dev_r3,can we infer,accuracy,0.36333333333333334
|
|
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.31083333333333335
|
|
anli_dev_r3,justified in saying,accuracy,0.34
|
|
anli_dev_r3,median,accuracy,0.3408333333333333
|
|
story_cloze_2016,Answer Given options,accuracy,0.8305718866916088
|
|
story_cloze_2016,Choose Story Ending,accuracy,0.8706574024585783
|
|
story_cloze_2016,Generate Ending,accuracy,0.7183324425440941
|
|
story_cloze_2016,Novel Correct Ending,accuracy,0.848743987172635
|
|
story_cloze_2016,Story Continuation and Options,accuracy,0.8466060929983966
|
|
story_cloze_2016,median,accuracy,0.8466060929983966
|
|
super_glue_cb,GPT-3 style,accuracy,0.625
|
|
super_glue_cb,MNLI crowdsource,accuracy,0.08928571428571429
|
|
super_glue_cb,can we infer,accuracy,0.5892857142857143
|
|
super_glue_cb,guaranteed/possible/impossible,accuracy,0.5
|
|
super_glue_cb,justified in saying,accuracy,0.5357142857142857
|
|
super_glue_cb,median,accuracy,0.5357142857142857
|
|
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
|
|
super_glue_copa,best_option,accuracy,0.67
|
|
super_glue_copa,cause_effect,accuracy,0.78
|
|
super_glue_copa,i_am_hesitating,accuracy,0.8
|
|
super_glue_copa,plausible_alternatives,accuracy,0.81
|
|
super_glue_copa,median,accuracy,0.78
|
|
super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
|
|
super_glue_rte,MNLI crowdsource,accuracy,0.7220216606498195
|
|
super_glue_rte,does it follow that,accuracy,0.6678700361010831
|
|
super_glue_rte,guaranteed true,accuracy,0.6714801444043321
|
|
super_glue_rte,should assume,accuracy,0.6678700361010831
|
|
super_glue_rte,median,accuracy,0.6714801444043321
|
|
winogrande_winogrande_xl,Replace,accuracy,0.5406471981057617
|
|
winogrande_winogrande_xl,True or False,accuracy,0.5074980268350434
|
|
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5177584846093133
|
|
winogrande_winogrande_xl,stand for,accuracy,0.510655090765588
|
|
winogrande_winogrande_xl,underscore refer to,accuracy,0.5256511444356748
|
|
winogrande_winogrande_xl,median,accuracy,0.5177584846093133
|
|
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.47
|
|
xcopa_id,best_option,accuracy,0.51
|
|
xcopa_id,cause_effect,accuracy,0.65
|
|
xcopa_id,i_am_hesitating,accuracy,0.66
|
|
xcopa_id,plausible_alternatives,accuracy,0.67
|
|
xcopa_id,median,accuracy,0.65
|
|
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.58
|
|
xcopa_sw,best_option,accuracy,0.57
|
|
xcopa_sw,cause_effect,accuracy,0.46
|
|
xcopa_sw,i_am_hesitating,accuracy,0.48
|
|
xcopa_sw,plausible_alternatives,accuracy,0.45
|
|
xcopa_sw,median,accuracy,0.48
|
|
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
|
|
xcopa_ta,best_option,accuracy,0.67
|
|
xcopa_ta,cause_effect,accuracy,0.71
|
|
xcopa_ta,i_am_hesitating,accuracy,0.71
|
|
xcopa_ta,plausible_alternatives,accuracy,0.69
|
|
xcopa_ta,median,accuracy,0.69
|
|
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
|
|
xcopa_vi,best_option,accuracy,0.61
|
|
xcopa_vi,cause_effect,accuracy,0.67
|
|
xcopa_vi,i_am_hesitating,accuracy,0.66
|
|
xcopa_vi,plausible_alternatives,accuracy,0.65
|
|
xcopa_vi,median,accuracy,0.65
|
|
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.62
|
|
xcopa_zh,best_option,accuracy,0.61
|
|
xcopa_zh,cause_effect,accuracy,0.77
|
|
xcopa_zh,i_am_hesitating,accuracy,0.72
|
|
xcopa_zh,plausible_alternatives,accuracy,0.74
|
|
xcopa_zh,median,accuracy,0.72
|
|
xnli_ar,GPT-3 style,accuracy,0.5040160642570282
|
|
xnli_ar,MNLI crowdsource,accuracy,0.39879518072289155
|
|
xnli_ar,can we infer,accuracy,0.506425702811245
|
|
xnli_ar,guaranteed/possible/impossible,accuracy,0.4799196787148594
|
|
xnli_ar,justified in saying,accuracy,0.41526104417670684
|
|
xnli_ar,median,accuracy,0.4799196787148594
|
|
xnli_en,GPT-3 style,accuracy,0.5590361445783133
|
|
xnli_en,MNLI crowdsource,accuracy,0.342570281124498
|
|
xnli_en,can we infer,accuracy,0.5449799196787148
|
|
xnli_en,guaranteed/possible/impossible,accuracy,0.41164658634538154
|
|
xnli_en,justified in saying,accuracy,0.4634538152610442
|
|
xnli_en,median,accuracy,0.4634538152610442
|
|
xnli_es,GPT-3 style,accuracy,0.5373493975903615
|
|
xnli_es,MNLI crowdsource,accuracy,0.40441767068273093
|
|
xnli_es,can we infer,accuracy,0.5277108433734939
|
|
xnli_es,guaranteed/possible/impossible,accuracy,0.44216867469879517
|
|
xnli_es,justified in saying,accuracy,0.4534136546184739
|
|
xnli_es,median,accuracy,0.4534136546184739
|
|
xnli_fr,GPT-3 style,accuracy,0.5248995983935743
|
|
xnli_fr,MNLI crowdsource,accuracy,0.3895582329317269
|
|
xnli_fr,can we infer,accuracy,0.5337349397590362
|
|
xnli_fr,guaranteed/possible/impossible,accuracy,0.42971887550200805
|
|
xnli_fr,justified in saying,accuracy,0.4738955823293173
|
|
xnli_fr,median,accuracy,0.4738955823293173
|
|
xnli_hi,GPT-3 style,accuracy,0.4983935742971888
|
|
xnli_hi,MNLI crowdsource,accuracy,0.38714859437751004
|
|
xnli_hi,can we infer,accuracy,0.45542168674698796
|
|
xnli_hi,guaranteed/possible/impossible,accuracy,0.41405622489959837
|
|
xnli_hi,justified in saying,accuracy,0.38795180722891565
|
|
xnli_hi,median,accuracy,0.41405622489959837
|
|
xnli_sw,GPT-3 style,accuracy,0.43493975903614457
|
|
xnli_sw,MNLI crowdsource,accuracy,0.363855421686747
|
|
xnli_sw,can we infer,accuracy,0.42891566265060244
|
|
xnli_sw,guaranteed/possible/impossible,accuracy,0.3457831325301205
|
|
xnli_sw,justified in saying,accuracy,0.3650602409638554
|
|
xnli_sw,median,accuracy,0.3650602409638554
|
|
xnli_ur,GPT-3 style,accuracy,0.43493975903614457
|
|
xnli_ur,MNLI crowdsource,accuracy,0.3895582329317269
|
|
xnli_ur,can we infer,accuracy,0.45180722891566266
|
|
xnli_ur,guaranteed/possible/impossible,accuracy,0.40120481927710844
|
|
xnli_ur,justified in saying,accuracy,0.37630522088353413
|
|
xnli_ur,median,accuracy,0.40120481927710844
|
|
xnli_vi,GPT-3 style,accuracy,0.5196787148594377
|
|
xnli_vi,MNLI crowdsource,accuracy,0.38112449799196785
|
|
xnli_vi,can we infer,accuracy,0.5080321285140562
|
|
xnli_vi,guaranteed/possible/impossible,accuracy,0.38393574297188754
|
|
xnli_vi,justified in saying,accuracy,0.43614457831325304
|
|
xnli_vi,median,accuracy,0.43614457831325304
|
|
xnli_zh,GPT-3 style,accuracy,0.5052208835341365
|
|
xnli_zh,MNLI crowdsource,accuracy,0.4
|
|
xnli_zh,can we infer,accuracy,0.5228915662650603
|
|
xnli_zh,guaranteed/possible/impossible,accuracy,0.4738955823293173
|
|
xnli_zh,justified in saying,accuracy,0.45863453815261046
|
|
xnli_zh,median,accuracy,0.4738955823293173
|
|
xstory_cloze_ar,Answer Given options,accuracy,0.7518199867637326
|
|
xstory_cloze_ar,Choose Story Ending,accuracy,0.7749834546657842
|
|
xstory_cloze_ar,Generate Ending,accuracy,0.586366644606221
|
|
xstory_cloze_ar,Novel Correct Ending,accuracy,0.7518199867637326
|
|
xstory_cloze_ar,Story Continuation and Options,accuracy,0.7438782263401721
|
|
xstory_cloze_ar,median,accuracy,0.7518199867637326
|
|
xstory_cloze_es,Answer Given options,accuracy,0.7835870284579749
|
|
xstory_cloze_es,Choose Story Ending,accuracy,0.8292521508934481
|
|
xstory_cloze_es,Generate Ending,accuracy,0.6399735274652548
|
|
xstory_cloze_es,Novel Correct Ending,accuracy,0.7935142289874255
|
|
xstory_cloze_es,Story Continuation and Options,accuracy,0.7888815354070152
|
|
xstory_cloze_es,median,accuracy,0.7888815354070152
|
|
xstory_cloze_eu,Answer Given options,accuracy,0.7041694242223693
|
|
xstory_cloze_eu,Choose Story Ending,accuracy,0.6823295830575777
|
|
xstory_cloze_eu,Generate Ending,accuracy,0.5625413633355394
|
|
xstory_cloze_eu,Novel Correct Ending,accuracy,0.6671078755790867
|
|
xstory_cloze_eu,Story Continuation and Options,accuracy,0.671740569159497
|
|
xstory_cloze_eu,median,accuracy,0.671740569159497
|
|
xstory_cloze_hi,Answer Given options,accuracy,0.6915949702183984
|
|
xstory_cloze_hi,Choose Story Ending,accuracy,0.7220383851753805
|
|
xstory_cloze_hi,Generate Ending,accuracy,0.5883520847121112
|
|
xstory_cloze_hi,Novel Correct Ending,accuracy,0.6743878226340172
|
|
xstory_cloze_hi,Story Continuation and Options,accuracy,0.6816677696889477
|
|
xstory_cloze_hi,median,accuracy,0.6816677696889477
|
|
xstory_cloze_id,Answer Given options,accuracy,0.7445400397088021
|
|
xstory_cloze_id,Choose Story Ending,accuracy,0.771012574454004
|
|
xstory_cloze_id,Generate Ending,accuracy,0.6029119788219722
|
|
xstory_cloze_id,Novel Correct Ending,accuracy,0.7485109199205824
|
|
xstory_cloze_id,Story Continuation and Options,accuracy,0.7438782263401721
|
|
xstory_cloze_id,median,accuracy,0.7445400397088021
|
|
xstory_cloze_zh,Answer Given options,accuracy,0.7610853739245532
|
|
xstory_cloze_zh,Choose Story Ending,accuracy,0.7961614824619457
|
|
xstory_cloze_zh,Generate Ending,accuracy,0.6214427531436135
|
|
xstory_cloze_zh,Novel Correct Ending,accuracy,0.7696889477167439
|
|
xstory_cloze_zh,Story Continuation and Options,accuracy,0.7670416942422237
|
|
xstory_cloze_zh,median,accuracy,0.7670416942422237
|
|
xwinograd_en,Replace,accuracy,0.5225806451612903
|
|
xwinograd_en,True or False,accuracy,0.48946236559139783
|
|
xwinograd_en,does underscore refer to,accuracy,0.5281720430107527
|
|
xwinograd_en,stand for,accuracy,0.5062365591397849
|
|
xwinograd_en,underscore refer to,accuracy,0.5372043010752688
|
|
xwinograd_en,median,accuracy,0.5225806451612903
|
|
xwinograd_fr,Replace,accuracy,0.5060240963855421
|
|
xwinograd_fr,True or False,accuracy,0.5421686746987951
|
|
xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
|
|
xwinograd_fr,stand for,accuracy,0.4819277108433735
|
|
xwinograd_fr,underscore refer to,accuracy,0.5301204819277109
|
|
xwinograd_fr,median,accuracy,0.5301204819277109
|
|
xwinograd_pt,Replace,accuracy,0.5133079847908745
|
|
xwinograd_pt,True or False,accuracy,0.4714828897338403
|
|
xwinograd_pt,does underscore refer to,accuracy,0.5209125475285171
|
|
xwinograd_pt,stand for,accuracy,0.5019011406844106
|
|
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
|
|
xwinograd_pt,median,accuracy,0.5133079847908745
|
|
xwinograd_zh,Replace,accuracy,0.5257936507936508
|
|
xwinograd_zh,True or False,accuracy,0.5297619047619048
|
|
xwinograd_zh,does underscore refer to,accuracy,0.5218253968253969
|
|
xwinograd_zh,stand for,accuracy,0.4444444444444444
|
|
xwinograd_zh,underscore refer to,accuracy,0.5198412698412699
|
|
xwinograd_zh,median,accuracy,0.5218253968253969
|
|
multiple,average,multiple,0.5631550819200618
|
|
|