bloomz / evaluation_l1 /merged.csv
Muennighoff's picture
Upload files
f2124a1
raw
history blame
10.5 kB
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.486
anli_dev_r1,MNLI crowdsource,accuracy,0.427
anli_dev_r1,can we infer,accuracy,0.474
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.39
anli_dev_r1,justified in saying,accuracy,0.46
anli_dev_r1,median,accuracy,0.46
anli_dev_r2,GPT-3 style,accuracy,0.441
anli_dev_r2,MNLI crowdsource,accuracy,0.406
anli_dev_r2,can we infer,accuracy,0.426
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.36
anli_dev_r2,justified in saying,accuracy,0.419
anli_dev_r2,median,accuracy,0.419
anli_dev_r3,GPT-3 style,accuracy,0.455
anli_dev_r3,MNLI crowdsource,accuracy,0.42
anli_dev_r3,can we infer,accuracy,0.445
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.32083333333333336
anli_dev_r3,justified in saying,accuracy,0.4266666666666667
anli_dev_r3,median,accuracy,0.4266666666666667
story_cloze_2016,Answer Given options,accuracy,0.9567076429716729
story_cloze_2016,Choose Story Ending,accuracy,0.9625868519508284
story_cloze_2016,Generate Ending,accuracy,0.7814003206841261
story_cloze_2016,Novel Correct Ending,accuracy,0.9577765900587921
story_cloze_2016,Story Continuation and Options,accuracy,0.951362907536077
story_cloze_2016,median,accuracy,0.9567076429716729
super_glue_cb,GPT-3 style,accuracy,0.8214285714285714
super_glue_cb,MNLI crowdsource,accuracy,0.375
super_glue_cb,can we infer,accuracy,0.8214285714285714
super_glue_cb,guaranteed/possible/impossible,accuracy,0.7321428571428571
super_glue_cb,justified in saying,accuracy,0.7678571428571429
super_glue_cb,median,accuracy,0.7678571428571429
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.62
super_glue_copa,best_option,accuracy,0.87
super_glue_copa,cause_effect,accuracy,0.88
super_glue_copa,i_am_hesitating,accuracy,0.91
super_glue_copa,plausible_alternatives,accuracy,0.88
super_glue_copa,median,accuracy,0.88
super_glue_rte,GPT-3 style,accuracy,0.8303249097472925
super_glue_rte,MNLI crowdsource,accuracy,0.855595667870036
super_glue_rte,does it follow that,accuracy,0.7833935018050542
super_glue_rte,guaranteed true,accuracy,0.8122743682310469
super_glue_rte,should assume,accuracy,0.8194945848375451
super_glue_rte,median,accuracy,0.8194945848375451
winogrande_winogrande_xl,Replace,accuracy,0.584846093133386
winogrande_winogrande_xl,True or False,accuracy,0.5217048145224941
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5840568271507498
winogrande_winogrande_xl,stand for,accuracy,0.5114443567482242
winogrande_winogrande_xl,underscore refer to,accuracy,0.5927387529597474
winogrande_winogrande_xl,median,accuracy,0.5840568271507498
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_id,best_option,accuracy,0.78
xcopa_id,cause_effect,accuracy,0.86
xcopa_id,i_am_hesitating,accuracy,0.79
xcopa_id,plausible_alternatives,accuracy,0.84
xcopa_id,median,accuracy,0.79
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_sw,best_option,accuracy,0.6
xcopa_sw,cause_effect,accuracy,0.6
xcopa_sw,i_am_hesitating,accuracy,0.64
xcopa_sw,plausible_alternatives,accuracy,0.62
xcopa_sw,median,accuracy,0.6
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_ta,best_option,accuracy,0.67
xcopa_ta,cause_effect,accuracy,0.67
xcopa_ta,i_am_hesitating,accuracy,0.68
xcopa_ta,plausible_alternatives,accuracy,0.69
xcopa_ta,median,accuracy,0.67
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.83
xcopa_vi,cause_effect,accuracy,0.87
xcopa_vi,i_am_hesitating,accuracy,0.84
xcopa_vi,plausible_alternatives,accuracy,0.86
xcopa_vi,median,accuracy,0.84
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_zh,best_option,accuracy,0.83
xcopa_zh,cause_effect,accuracy,0.9
xcopa_zh,i_am_hesitating,accuracy,0.9
xcopa_zh,plausible_alternatives,accuracy,0.86
xcopa_zh,median,accuracy,0.86
xnli_ar,GPT-3 style,accuracy,0.5357429718875502
xnli_ar,MNLI crowdsource,accuracy,0.41004016064257026
xnli_ar,can we infer,accuracy,0.5606425702811245
xnli_ar,guaranteed/possible/impossible,accuracy,0.6068273092369478
xnli_ar,justified in saying,accuracy,0.5437751004016064
xnli_ar,median,accuracy,0.5437751004016064
xnli_en,GPT-3 style,accuracy,0.6168674698795181
xnli_en,MNLI crowdsource,accuracy,0.45502008032128516
xnli_en,can we infer,accuracy,0.6092369477911647
xnli_en,guaranteed/possible/impossible,accuracy,0.6746987951807228
xnli_en,justified in saying,accuracy,0.5895582329317269
xnli_en,median,accuracy,0.6092369477911647
xnli_es,GPT-3 style,accuracy,0.585140562248996
xnli_es,MNLI crowdsource,accuracy,0.4357429718875502
xnli_es,can we infer,accuracy,0.5883534136546185
xnli_es,guaranteed/possible/impossible,accuracy,0.6124497991967871
xnli_es,justified in saying,accuracy,0.5734939759036145
xnli_es,median,accuracy,0.585140562248996
xnli_fr,GPT-3 style,accuracy,0.5771084337349398
xnli_fr,MNLI crowdsource,accuracy,0.43012048192771085
xnli_fr,can we infer,accuracy,0.5807228915662651
xnli_fr,guaranteed/possible/impossible,accuracy,0.6136546184738956
xnli_fr,justified in saying,accuracy,0.5694779116465863
xnli_fr,median,accuracy,0.5771084337349398
xnli_hi,GPT-3 style,accuracy,0.5248995983935743
xnli_hi,MNLI crowdsource,accuracy,0.3795180722891566
xnli_hi,can we infer,accuracy,0.5506024096385542
xnli_hi,guaranteed/possible/impossible,accuracy,0.5682730923694779
xnli_hi,justified in saying,accuracy,0.5353413654618474
xnli_hi,median,accuracy,0.5353413654618474
xnli_sw,GPT-3 style,accuracy,0.4795180722891566
xnli_sw,MNLI crowdsource,accuracy,0.39196787148594375
xnli_sw,can we infer,accuracy,0.5208835341365462
xnli_sw,guaranteed/possible/impossible,accuracy,0.5036144578313253
xnli_sw,justified in saying,accuracy,0.5184738955823294
xnli_sw,median,accuracy,0.5036144578313253
xnli_ur,GPT-3 style,accuracy,0.46586345381526106
xnli_ur,MNLI crowdsource,accuracy,0.3718875502008032
xnli_ur,can we infer,accuracy,0.5080321285140562
xnli_ur,guaranteed/possible/impossible,accuracy,0.4995983935742972
xnli_ur,justified in saying,accuracy,0.5080321285140562
xnli_ur,median,accuracy,0.4995983935742972
xnli_vi,GPT-3 style,accuracy,0.5578313253012048
xnli_vi,MNLI crowdsource,accuracy,0.42449799196787147
xnli_vi,can we infer,accuracy,0.5678714859437751
xnli_vi,guaranteed/possible/impossible,accuracy,0.6100401606425703
xnli_vi,justified in saying,accuracy,0.5538152610441767
xnli_vi,median,accuracy,0.5578313253012048
xnli_zh,GPT-3 style,accuracy,0.5526104417670683
xnli_zh,MNLI crowdsource,accuracy,0.38473895582329315
xnli_zh,can we infer,accuracy,0.5690763052208835
xnli_zh,guaranteed/possible/impossible,accuracy,0.5674698795180723
xnli_zh,justified in saying,accuracy,0.5622489959839357
xnli_zh,median,accuracy,0.5622489959839357
xstory_cloze_ar,Answer Given options,accuracy,0.7968232958305758
xstory_cloze_ar,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_ar,Generate Ending,accuracy,0.6677696889477167
xstory_cloze_ar,Novel Correct Ending,accuracy,0.9265387160820648
xstory_cloze_ar,Story Continuation and Options,accuracy,0.9126406353408338
xstory_cloze_ar,median,accuracy,0.9126406353408338
xstory_cloze_es,Answer Given options,accuracy,0.8729318332230311
xstory_cloze_es,Choose Story Ending,accuracy,0.9417604235605559
xstory_cloze_es,Generate Ending,accuracy,0.7359364659166115
xstory_cloze_es,Novel Correct Ending,accuracy,0.9430840502978161
xstory_cloze_es,Story Continuation and Options,accuracy,0.9318332230311053
xstory_cloze_es,median,accuracy,0.9318332230311053
xstory_cloze_eu,Answer Given options,accuracy,0.7054930509596293
xstory_cloze_eu,Choose Story Ending,accuracy,0.8663136995367307
xstory_cloze_eu,Generate Ending,accuracy,0.6320317670416943
xstory_cloze_eu,Novel Correct Ending,accuracy,0.8689609530112509
xstory_cloze_eu,Story Continuation and Options,accuracy,0.8524156187954997
xstory_cloze_eu,median,accuracy,0.8524156187954997
xstory_cloze_hi,Answer Given options,accuracy,0.798808735936466
xstory_cloze_hi,Choose Story Ending,accuracy,0.8702845797485109
xstory_cloze_hi,Generate Ending,accuracy,0.6604897418927862
xstory_cloze_hi,Novel Correct Ending,accuracy,0.8788881535407015
xstory_cloze_hi,Story Continuation and Options,accuracy,0.870946393117141
xstory_cloze_hi,median,accuracy,0.8702845797485109
xstory_cloze_id,Answer Given options,accuracy,0.8557246856386499
xstory_cloze_id,Choose Story Ending,accuracy,0.9212442091330245
xstory_cloze_id,Generate Ending,accuracy,0.7041694242223693
xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
xstory_cloze_id,Story Continuation and Options,accuracy,0.9066843150231635
xstory_cloze_id,median,accuracy,0.9066843150231635
xstory_cloze_zh,Answer Given options,accuracy,0.900066181336863
xstory_cloze_zh,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_zh,Generate Ending,accuracy,0.684976836532098
xstory_cloze_zh,Novel Correct Ending,accuracy,0.9311714096624751
xstory_cloze_zh,Story Continuation and Options,accuracy,0.9199205823957644
xstory_cloze_zh,median,accuracy,0.9199205823957644
xwinograd_en,Replace,accuracy,0.6847311827956989
xwinograd_en,True or False,accuracy,0.5135483870967742
xwinograd_en,does underscore refer to,accuracy,0.6787096774193548
xwinograd_en,stand for,accuracy,0.5053763440860215
xwinograd_en,underscore refer to,accuracy,0.690752688172043
xwinograd_en,median,accuracy,0.6787096774193548
xwinograd_fr,Replace,accuracy,0.6506024096385542
xwinograd_fr,True or False,accuracy,0.4939759036144578
xwinograd_fr,does underscore refer to,accuracy,0.6867469879518072
xwinograd_fr,stand for,accuracy,0.46987951807228917
xwinograd_fr,underscore refer to,accuracy,0.6626506024096386
xwinograd_fr,median,accuracy,0.6506024096385542
xwinograd_pt,Replace,accuracy,0.6349809885931559
xwinograd_pt,True or False,accuracy,0.4866920152091255
xwinograd_pt,does underscore refer to,accuracy,0.6387832699619772
xwinograd_pt,stand for,accuracy,0.49429657794676807
xwinograd_pt,underscore refer to,accuracy,0.6425855513307985
xwinograd_pt,median,accuracy,0.6349809885931559
xwinograd_zh,Replace,accuracy,0.6865079365079365
xwinograd_zh,True or False,accuracy,0.5277777777777778
xwinograd_zh,does underscore refer to,accuracy,0.6884920634920635
xwinograd_zh,stand for,accuracy,0.4861111111111111
xwinograd_zh,underscore refer to,accuracy,0.6904761904761905
xwinograd_zh,median,accuracy,0.6865079365079365
multiple,average,multiple,0.6903830754158429