hynky HF staff commited on
Commit
0932e7b
·
1 Parent(s): c85811e

new plotting code (JIT)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. data/plots/c4_filters_hellaswag.json +0 -1
  2. data/plots/c4_filters_hellaswag/agg_score.json +1 -0
  3. data/plots/c4_filters_hellaswag/arc_acc_norm.json +1 -0
  4. data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json +1 -0
  5. data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json +1 -0
  6. data/plots/c4_filters_hellaswag/index.json +1 -0
  7. data/plots/c4_filters_hellaswag/mmlu_acc_norm.json +1 -0
  8. data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json +1 -0
  9. data/plots/c4_filters_hellaswag/piqa_acc_norm.json +1 -0
  10. data/plots/c4_filters_hellaswag/winogrande_acc_norm.json +1 -0
  11. data/plots/cross_ind_unfiltered_comparison.json +0 -0
  12. data/plots/cross_ind_unfiltered_comparison/agg_score.json +1 -0
  13. data/plots/cross_ind_unfiltered_comparison/arc_acc_norm.json +1 -0
  14. data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json +1 -0
  15. data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json +1 -0
  16. data/plots/cross_ind_unfiltered_comparison/index.json +1 -0
  17. data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json +1 -0
  18. data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json +1 -0
  19. data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json +1 -0
  20. data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json +1 -0
  21. data/plots/custom-_ilters.json +0 -1
  22. data/plots/custom-_ilters/agg_score.json +1 -0
  23. data/plots/custom-_ilters/arc_acc_norm.json +1 -0
  24. data/plots/custom-_ilters/commonsense_qa_acc_norm.json +1 -0
  25. data/plots/custom-_ilters/hellaswag_acc_norm.json +1 -0
  26. data/plots/custom-_ilters/index.json +1 -0
  27. data/plots/custom-_ilters/mmlu_acc_norm.json +1 -0
  28. data/plots/custom-_ilters/openbookqa_acc_norm.json +1 -0
  29. data/plots/custom-_ilters/piqa_acc_norm.json +1 -0
  30. data/plots/custom-_ilters/winogrande_acc_norm.json +1 -0
  31. data/plots/custom-filters.json +0 -1
  32. data/plots/custom-filters/agg_score.json +1 -0
  33. data/plots/custom-filters/arc_acc_norm.json +1 -0
  34. data/plots/custom-filters/commonsense_qa_acc_norm.json +1 -0
  35. data/plots/custom-filters/hellaswag_acc_norm.json +1 -0
  36. data/plots/custom-filters/index.json +1 -0
  37. data/plots/custom-filters/mmlu_acc_norm.json +1 -0
  38. data/plots/custom-filters/openbookqa_acc_norm.json +1 -0
  39. data/plots/custom-filters/piqa_acc_norm.json +1 -0
  40. data/plots/custom-filters/winogrande_acc_norm.json +1 -0
  41. data/plots/custom_filters.json +0 -1
  42. data/plots/custom_filters/agg_score.json +1 -0
  43. data/plots/custom_filters/arc_acc_norm.json +1 -0
  44. data/plots/custom_filters/commonsense_qa_acc_norm.json +1 -0
  45. data/plots/custom_filters/hellaswag_acc_norm.json +1 -0
  46. data/plots/custom_filters/index.json +1 -0
  47. data/plots/custom_filters/mmlu_acc_norm.json +1 -0
  48. data/plots/custom_filters/openbookqa_acc_norm.json +1 -0
  49. data/plots/custom_filters/piqa_acc_norm.json +1 -0
  50. data/plots/custom_filters/winogrande_acc_norm.json +1 -0
data/plots/c4_filters_hellaswag.json DELETED
@@ -1 +0,0 @@
1
- {"data":{"agg_score":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308296035975218,0.35613923892378807,0.3746252153068781,0.38806260935962195,0.39690930768847466,0.4043668694794178,0.40220927633345127,0.41070565767586226,0.41399387270212173,0.4170555509626865,0.42098715901374817,0.4210818205028772,0.42051274701952934,0.424176013097167,0.4225243702530861],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3583905678242445,0.38119001872837543,0.3873079549521208,0.39723034016788,0.4043100867420435,0.40908974781632423,0.4140731003135443,0.41894380562007427,0.41736695170402527,0.4232212919741869,0.4229240976274013,0.4236308634281158,0.42750727012753487,0.4268195778131485],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36182260885834694,0.3764855917543173,0.3928546328097582,0.3978128544986248,0.4073755294084549,0.4112890623509884,0.41486112400889397,0.4196756165474653,0.4235504809767008,0.42218128964304924,0.4228535555303097,0.4249562546610832,0.42740595713257784,0.42711055465042586],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36000680737197394,0.37551611103117466,0.38802069239318365,0.3933942876756191,0.4043118376284838,0.40780537389218807,0.4112964067608118,0.4137573726475239,0.41791345551609993,0.4173779133707285,0.42117033526301384,0.42073468305170536,0.42412591539323324,0.4260616712272167],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36066408455371857,0.3812380563467741,0.394003426656127,0.40062618628144264,0.4117735456675291,0.4165923688560724,0.4175422675907612,0.42100309208035464,0.42246321588754654,0.42360376194119453,0.42823668196797365,0.4299001637846231,0.4302353039383888,0.4310380257666111],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.330924579873681,0.35825083684176207,0.37912008538842196,0.38942993618547916,0.3983491826802492,0.4053049590438604,0.4079726096242666,0.4135104585438967,0.41717425361275673,0.41904263757169247,0.4211529679596424,0.4212619122117758,0.42373160831630224,0.42435371689498425,0.4279126934707165],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3580685469011466,0.3740996705989043,0.39048008372386295,0.39857714250683784,0.40837346265713376,0.4111154315372308,0.41773712386687595,0.4196594481666882,0.42379963273803395,0.4276047808428605,0.42980752388636273,0.43098293244838715,0.43155378103256226,0.4327609067161878],"label":"C4"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2584999948740005,0.2850000113248825,0.30850000679492945,0.30149999260902405,0.31049999594688416,0.3079999983310699,0.3150000125169754,0.32199999690055847,0.3244999945163727,0.3205000013113022,0.3244999945163727,0.3279999941587448,0.33149999380111694,0.32850000262260437],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2619999945163727,0.288000002503395,0.29749999940395355,0.30399999022483826,0.3149999976158142,0.3245000094175339,0.3230000138282776,0.3240000009536743,0.3245000094175339,0.33550000190734863,0.335999995470047,0.32999999821186066,0.3375000059604645,0.34049999713897705],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2650000005960464,0.28599999845027924,0.3110000044107437,0.2944999933242798,0.3085000067949295,0.32199999690055847,0.31949999928474426,0.3240000009536743,0.32500000298023224,0.3245000094175339,0.32199999690055847,0.3265000134706497,0.3295000046491623,0.32999999821186066],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.2824999988079071,0.2985000014305115,0.3050000071525574,0.3119999915361404,0.3110000044107437,0.3164999932050705,0.32199999690055847,0.3279999941587448,0.3365000039339065,0.3375000059604645,0.3384999930858612,0.340499997138977,0.341499999165535],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26299999654293055,0.2864999920129776,0.2944999933242798,0.2985000014305115,0.3165000081062317,0.3194999992847442,0.318000003695488,0.32500000298023224,0.32899999618530273,0.3254999965429306,0.33150000870227814,0.3330000042915344,0.33200000226497645,0.3330000042915344],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25800000131130213,0.2849999964237213,0.29200001060962677,0.289000004529953,0.30349999666213984,0.30400000512599945,0.3139999955892563,0.3139999955892563,0.318000003695488,0.32299999892711634,0.3174999952316284,0.3215000033378601,0.32250000536441803,0.32549999654293055],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25700000921885174,0.2786666651566823,0.2960000038146972,0.3049999972184499,0.3053333262602488,0.3120000064373016,0.31733333071072894,0.3163333336512248,0.3186666667461395,0.3226666748523712,0.3286666671435038,0.3240000009536743,0.32900000611941016,0.3283333381017049],"label":"C4"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28949999809265137,0.32599999010562897,0.34450000524520874,0.3725000023841858,0.38500000536441803,0.39499999582767487,0.408500000834465,0.41700001060962677,0.4174999892711639,0.4284999966621399,0.42849999666213984,0.43150000274181366,0.4399999976158142,0.4375],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29749999940395355,0.3240000009536743,0.34849999845027924,0.3725000023841858,0.3895000070333481,0.39800000190734863,0.41000001132488245,0.4214999973773956,0.42149999737739563,0.42499999701976776,0.42750000953674316,0.4364999979734421,0.4354999959468841,0.4385000020265579],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2939999997615814,0.3295000046491623,0.3684999942779541,0.38449999690055847,0.398499995470047,0.3959999978542328,0.4204999953508377,0.4335000067949295,0.445499986410141,0.443000003695488,0.455499991774559,0.45250000059604645,0.4529999941587448,0.4545000046491623],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29100000858306885,0.32400000095367426,0.3439999967813492,0.3575000017881393,0.3800000101327896,0.40049999952316284,0.4134999960660934,0.42099998891353607,0.4204999953508377,0.4280000030994415,0.44099999964237213,0.43799999356269836,0.44200000166893005,0.44600000977516174],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29950000345706934,0.33799999952316284,0.3789999932050705,0.3970000147819519,0.42149999737739563,0.431999996304512,0.4440000057220459,0.4490000009536743,0.45949999988079065,0.4714999943971634,0.48000000417232513,0.47749999165534973,0.48100000619888306,0.48950000107288355],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2955000102519989,0.3385000079870224,0.36800000071525574,0.40099999308586115,0.4099999964237213,0.41700001060962677,0.42400000989437103,0.4389999955892563,0.4414999932050705,0.4484999924898147,0.455499991774559,0.45799998939037323,0.4660000056028366,0.471000000834465],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29699999094009394,0.3369999925295512,0.3699999948342641,0.3930000066757202,0.41233333945274353,0.42733333508173627,0.43799999356269836,0.4506666660308838,0.454666664203008,0.47166667381922406,0.47766666611035663,0.476666659116745,0.48366666833559663,0.4853333334128062],"label":"C4"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.27699999511241913,0.288000002503395,0.2980000078678131,0.31199999153614044,0.29500000178813934,0.3139999955892563,0.31199999153614044,0.31200000643730164,0.3369999974966049,0.32899999618530273,0.3200000077486038,0.3310000002384186,0.3330000042915344],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25800000131130213,0.29899999499320984,0.27900001406669617,0.296999990940094,0.2980000078678131,0.3149999976158142,0.3179999887943268,0.32500000298023224,0.3079999983310699,0.32900001108646393,0.32599999010562897,0.3190000057220459,0.3279999941587448,0.3229999989271164],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.27400000393390656,0.2929999977350235,0.29600000381469727,0.306999996304512,0.3199999928474426,0.3190000057220459,0.31299999356269836,0.3229999989271164,0.3210000097751617,0.3270000070333481,0.3230000138282776,0.33399999141693115,0.3260000050067901],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2690000087022781,0.27300000190734863,0.28599999845027924,0.28299999237060547,0.3050000071525574,0.30900000035762787,0.31199999153614044,0.3200000077486038,0.33200000226497645,0.31200000643730164,0.3230000138282776,0.32299999892711634,0.32899999618530273,0.3320000022649765],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2590000033378601,0.278999999165535,0.2979999929666519,0.29899999499320984,0.3270000070333481,0.32800000905990595,0.32899999618530273,0.3369999974966049,0.33200000226497645,0.3260000050067901,0.33599999547004694,0.335999995470047,0.33500000834465027,0.3330000042915344],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2500000074505806,0.2759999930858612,0.2800000011920929,0.29099999368190765,0.3070000112056732,0.3070000112056732,0.3229999989271164,0.3240000009536743,0.31700000166893005,0.3100000023841858,0.31300000846385956,0.31700000166893005,0.3100000023841858,0.3189999908208847],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2526666720708211,0.26533332467079157,0.26600000262260437,0.29333333174387616,0.3059999942779541,0.30933333436648053,0.31600000460942584,0.31466667850812274,0.32933333516120905,0.3346666693687439,0.3366666634877522,0.3386666675408681,0.33799999952316284,0.33066666126251215],"label":"C4"}},"piqa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6105000078678131,0.6350000202655792,0.6620000004768372,0.675000011920929,0.6940000057220459,0.6974999904632568,0.7054999768733978,0.7060000002384186,0.7059999704360962,0.7084999978542328,0.7060000002384186,0.7084999978542328,0.7144999802112579,0.7134999930858612],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6149999797344208,0.6520000100135803,0.6789999902248383,0.69200000166893,0.6949999928474426,0.6955000162124634,0.7055000066757202,0.7150000035762787,0.7169999778270721,0.7184999883174896,0.7235000133514404,0.7240000069141388,0.723499983549118,0.7249999940395355],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6549999713897705,0.6695000231266022,0.6860000193119049,0.6994999945163727,0.6980000138282776,0.7084999978542328,0.7120000123977661,0.7124999761581421,0.7160000205039978,0.7179999947547913,0.7195000052452087,0.7229999899864197,0.723499983549118],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6229999959468842,0.6590000092983246,0.6714999973773956,0.6820000112056732,0.6949999928474426,0.6940000057220459,0.7064999938011169,0.7005000114440918,0.6989999711513519,0.7084999978542328,0.7060000002384186,0.7099999785423279,0.7160000205039978,0.7150000035762787],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6215000152587891,0.6580000221729279,0.6784999966621399,0.69200000166893,0.703499972820282,0.7029999792575836,0.710999995470047,0.7139999866485596,0.7179999947547913,0.7150000035762787,0.715499997138977,0.7184999883174896,0.7160000205039978,0.7224999964237213],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6520000100135803,0.6800000071525574,0.6895000040531158,0.6949999928474426,0.6990000009536743,0.7045000195503235,0.7114999890327454,0.710999995470047,0.7159999907016754,0.7199999988079071,0.7199999988079071,0.7204999923706055,0.7254999876022339],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6196666558583578,0.6583333412806193,0.6833333373069763,0.6829999883969625,0.6983333230018616,0.702999989191691,0.7056666612625122,0.7076666553815206,0.7139999866485596,0.7209999958674113,0.7179999947547913,0.7273333470026652,0.7209999958674113,0.7273333271344503],"label":"C4"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48950000107288355,0.48950000107288355,0.5049999952316284,0.5125000178813934,0.5004999935626984,0.5065000057220459,0.5055000185966492,0.511000007390976,0.5160000026226044,0.5209999978542328,0.5270000100135803,0.5219999849796295,0.5149999856948853,0.5125000178813934],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48350000381469727,0.5024999976158142,0.5039999932050705,0.5049999952316284,0.5115000009536743,0.50450000166893,0.5120000243186951,0.5144999921321869,0.5194999873638153,0.5250000059604645,0.5170000195503235,0.5180000066757202,0.527999997138977,0.5259999930858612],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.48900000751018524,0.5080000162124634,0.50450000166893,0.5185000002384186,0.5175000131130219,0.5099999904632568,0.526500016450882,0.5320000052452087,0.5230000019073486,0.5105000138282776,0.5214999914169312,0.523499995470047,0.5264999866485596],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.49000000953674316,0.4999999850988388,0.4989999830722809,0.5115000009536743,0.5105000138282776,0.5069999992847443,0.5109999775886536,0.5164999961853027,0.5059999823570251,0.5129999816417694,0.5059999823570251,0.5115000009536743,0.5164999961853027],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.4989999979734421,0.5064999908208847,0.49800001084804535,0.5040000081062317,0.5139999985694885,0.5160000026226044,0.5109999775886536,0.5070000141859055,0.5115000009536743,0.5105000138282776,0.5175000131130219,0.5200000107288361,0.5135000050067902],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49150000512599945,0.49900001287460327,0.49300000071525574,0.5015000104904175,0.5094999969005585,0.5109999775886536,0.5085000097751617,0.507500022649765,0.5205000042915344,0.5125000178813934,0.5160000026226044,0.5175000131130219,0.5150000154972076,0.5179999768733978],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4933333396911621,0.48733333746592206,0.5056666731834412,0.5066666503747305,0.5116666754086813,0.5076666871706644,0.5213333169619242,0.5150000055631002,0.5183333357175192,0.5169999996821085,0.515333334604899,0.5193333427111307,0.5143333276112875,0.5196666717529297],"label":"C4"}},"arc/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.32549999654293055,0.3307500034570694,0.3467499911785126,0.3500000089406967,0.3452499955892563,0.3622500002384185,0.35999999940395355,0.37024998664855957,0.3684999942779541,0.3675000071525574,0.37249998748302454,0.37675000727176666,0.3760000020265579],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29250000417232513,0.3184999972581863,0.3297500014305115,0.34450000524520874,0.3512499928474426,0.35724999010562897,0.36375001072883606,0.3665000051259994,0.3684999942779541,0.3712499886751175,0.37375000119209284,0.37800000607967377,0.3840000033378601,0.37950000166893005],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2947500050067901,0.31974999606609344,0.3344999998807907,0.3445000052452087,0.351500004529953,0.35199999809265137,0.35925000905990595,0.3634999990463257,0.36374999582767487,0.36550000309944153,0.36775000393390656,0.3677499890327453,0.36900000274181366,0.36650000512599945],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.28949999809265137,0.3187499940395355,0.33825001120567316,0.35074999928474426,0.3604999929666519,0.36274999380111694,0.3634999990463257,0.3645000010728836,0.3644999861717224,0.3669999986886978,0.3642500042915344,0.3722499907016754,0.37499999999999994,0.37549999356269836],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.30024999380111694,0.32724998891353607,0.33374999463558197,0.34574998915195465,0.351749986410141,0.36124999821186066,0.3527500033378601,0.3582500070333481,0.35850000381469727,0.36075000464916224,0.364750012755394,0.37049999833106995,0.3729999959468841,0.36974999308586115],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.294500008225441,0.32725000381469727,0.3352499902248382,0.3504999876022339,0.3487499952316284,0.3557500094175339,0.35324999690055847,0.36374999582767487,0.36474999785423273,0.372749999165535,0.36775000393390656,0.3707500100135803,0.3734999895095825,0.375],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2928333381811778,0.3191666702429453,0.3451666633288066,0.342166672150294,0.35983332991600037,0.35483332475026447,0.3643333315849304,0.3631666700045268,0.3698333303133647,0.3696666657924652,0.37433333198229474,0.3805000086625417,0.3800000051657359,0.3798333406448364],"label":"C4"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25013685226440424,0.25661391019821167,0.2620016932487488,0.2657508552074432,0.2710244506597519,0.2744349539279938,0.27642421424388885,0.2818952649831772,0.2794509679079056,0.2831944525241852,0.28439727425575256,0.2866545617580414,0.2866020053625107,0.28615814447402954,0.2871949374675751],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25762456655502314,0.2630201578140259,0.2672136425971985,0.27234274148941034,0.2702306807041168,0.27446796000003815,0.27583475410938263,0.2770504504442215,0.2794356495141983,0.28302033245563507,0.28214274346828455,0.2855468988418579,0.2840581685304642,0.28505663573741913],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2583308666944504,0.2611347585916519,0.26333703100681305,0.2685028165578842,0.2725042402744293,0.27531248331069946,0.27463899552822113,0.2784048914909363,0.27915388345718384,0.27945026755332947,0.28207844495773315,0.281900018453598,0.2822476774454117,0.28188446164131165],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25205445289611816,0.2613788843154907,0.26891554892063135,0.2724043130874634,0.27449470758438105,0.27719296514987946,0.27587129175662994,0.2815589904785156,0.2833077013492584,0.2830233126878738,0.28461267054080963,0.2871275246143341,0.28650729358196253,0.2869933694601059],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25806266069412226,0.26165445148944855,0.26727744936943054,0.2677594721317291,0.2689383774995804,0.2724889665842056,0.27308812737464905,0.27327476441860193,0.27370570600032806,0.277080088853836,0.27814342081546783,0.2782013118267059,0.27888238430023193,0.2795541882514953],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25500668585300446,0.26221066713333124,0.26368947327136993,0.2702934741973877,0.27218967676162714,0.27553085982799524,0.27833363413810724,0.2786440253257751,0.2810910940170288,0.2834737300872803,0.2833452969789505,0.2836028486490249,0.28682972490787506,0.2868015915155411],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2557150324185689,0.25763070583343506,0.2643406589825948,0.26745049158732087,0.2721543808778127,0.2737567722797394,0.2732303539911906,0.27877557277679443,0.27923040588696796,0.2798382341861725,0.2831268608570099,0.28203009565671283,0.2810969154040019,0.28292057911554974],"label":"C4"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}},"defaultWindowSize":3,"defaultMetric":"hellaswag/acc_norm"}
 
 
data/plots/c4_filters_hellaswag/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308296035975218,0.35613923892378807,0.3746252153068781,0.38806260935962195,0.39690930768847466,0.4043668694794178,0.40220927633345127,0.41070565767586226,0.41399387270212173,0.4170555509626865,0.42098715901374817,0.4210818205028772,0.42051274701952934,0.424176013097167,0.4225243702530861],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3583905678242445,0.38119001872837543,0.3873079549521208,0.39723034016788,0.4043100867420435,0.40908974781632423,0.4140731003135443,0.41894380562007427,0.41736695170402527,0.4232212919741869,0.4229240976274013,0.4236308634281158,0.42750727012753487,0.4268195778131485],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36182260885834694,0.3764855917543173,0.3928546328097582,0.3978128544986248,0.4073755294084549,0.4112890623509884,0.41486112400889397,0.4196756165474653,0.4235504809767008,0.42218128964304924,0.4228535555303097,0.4249562546610832,0.42740595713257784,0.42711055465042586],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36000680737197394,0.37551611103117466,0.38802069239318365,0.3933942876756191,0.4043118376284838,0.40780537389218807,0.4112964067608118,0.4137573726475239,0.41791345551609993,0.4173779133707285,0.42117033526301384,0.42073468305170536,0.42412591539323324,0.4260616712272167],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36066408455371857,0.3812380563467741,0.394003426656127,0.40062618628144264,0.4117735456675291,0.4165923688560724,0.4175422675907612,0.42100309208035464,0.42246321588754654,0.42360376194119453,0.42823668196797365,0.4299001637846231,0.4302353039383888,0.4310380257666111],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.330924579873681,0.35825083684176207,0.37912008538842196,0.38942993618547916,0.3983491826802492,0.4053049590438604,0.4079726096242666,0.4135104585438967,0.41717425361275673,0.41904263757169247,0.4211529679596424,0.4212619122117758,0.42373160831630224,0.42435371689498425,0.4279126934707165],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3580685469011466,0.3740996705989043,0.39048008372386295,0.39857714250683784,0.40837346265713376,0.4111154315372308,0.41773712386687595,0.4196594481666882,0.42379963273803395,0.4276047808428605,0.42980752388636273,0.43098293244838715,0.43155378103256226,0.4327609067161878],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.32549999654293055,0.3307500034570694,0.3467499911785126,0.3500000089406967,0.3452499955892563,0.3622500002384185,0.35999999940395355,0.37024998664855957,0.3684999942779541,0.3675000071525574,0.37249998748302454,0.37675000727176666,0.3760000020265579],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29250000417232513,0.3184999972581863,0.3297500014305115,0.34450000524520874,0.3512499928474426,0.35724999010562897,0.36375001072883606,0.3665000051259994,0.3684999942779541,0.3712499886751175,0.37375000119209284,0.37800000607967377,0.3840000033378601,0.37950000166893005],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2947500050067901,0.31974999606609344,0.3344999998807907,0.3445000052452087,0.351500004529953,0.35199999809265137,0.35925000905990595,0.3634999990463257,0.36374999582767487,0.36550000309944153,0.36775000393390656,0.3677499890327453,0.36900000274181366,0.36650000512599945],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.28949999809265137,0.3187499940395355,0.33825001120567316,0.35074999928474426,0.3604999929666519,0.36274999380111694,0.3634999990463257,0.3645000010728836,0.3644999861717224,0.3669999986886978,0.3642500042915344,0.3722499907016754,0.37499999999999994,0.37549999356269836],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.30024999380111694,0.32724998891353607,0.33374999463558197,0.34574998915195465,0.351749986410141,0.36124999821186066,0.3527500033378601,0.3582500070333481,0.35850000381469727,0.36075000464916224,0.364750012755394,0.37049999833106995,0.3729999959468841,0.36974999308586115],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.294500008225441,0.32725000381469727,0.3352499902248382,0.3504999876022339,0.3487499952316284,0.3557500094175339,0.35324999690055847,0.36374999582767487,0.36474999785423273,0.372749999165535,0.36775000393390656,0.3707500100135803,0.3734999895095825,0.375],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2928333381811778,0.3191666702429453,0.3451666633288066,0.342166672150294,0.35983332991600037,0.35483332475026447,0.3643333315849304,0.3631666700045268,0.3698333303133647,0.3696666657924652,0.37433333198229474,0.3805000086625417,0.3800000051657359,0.3798333406448364],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2584999948740005,0.2850000113248825,0.30850000679492945,0.30149999260902405,0.31049999594688416,0.3079999983310699,0.3150000125169754,0.32199999690055847,0.3244999945163727,0.3205000013113022,0.3244999945163727,0.3279999941587448,0.33149999380111694,0.32850000262260437],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2619999945163727,0.288000002503395,0.29749999940395355,0.30399999022483826,0.3149999976158142,0.3245000094175339,0.3230000138282776,0.3240000009536743,0.3245000094175339,0.33550000190734863,0.335999995470047,0.32999999821186066,0.3375000059604645,0.34049999713897705],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2650000005960464,0.28599999845027924,0.3110000044107437,0.2944999933242798,0.3085000067949295,0.32199999690055847,0.31949999928474426,0.3240000009536743,0.32500000298023224,0.3245000094175339,0.32199999690055847,0.3265000134706497,0.3295000046491623,0.32999999821186066],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.2824999988079071,0.2985000014305115,0.3050000071525574,0.3119999915361404,0.3110000044107437,0.3164999932050705,0.32199999690055847,0.3279999941587448,0.3365000039339065,0.3375000059604645,0.3384999930858612,0.340499997138977,0.341499999165535],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26299999654293055,0.2864999920129776,0.2944999933242798,0.2985000014305115,0.3165000081062317,0.3194999992847442,0.318000003695488,0.32500000298023224,0.32899999618530273,0.3254999965429306,0.33150000870227814,0.3330000042915344,0.33200000226497645,0.3330000042915344],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25800000131130213,0.2849999964237213,0.29200001060962677,0.289000004529953,0.30349999666213984,0.30400000512599945,0.3139999955892563,0.3139999955892563,0.318000003695488,0.32299999892711634,0.3174999952316284,0.3215000033378601,0.32250000536441803,0.32549999654293055],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25700000921885174,0.2786666651566823,0.2960000038146972,0.3049999972184499,0.3053333262602488,0.3120000064373016,0.31733333071072894,0.3163333336512248,0.3186666667461395,0.3226666748523712,0.3286666671435038,0.3240000009536743,0.32900000611941016,0.3283333381017049],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28949999809265137,0.32599999010562897,0.34450000524520874,0.3725000023841858,0.38500000536441803,0.39499999582767487,0.408500000834465,0.41700001060962677,0.4174999892711639,0.4284999966621399,0.42849999666213984,0.43150000274181366,0.4399999976158142,0.4375],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29749999940395355,0.3240000009536743,0.34849999845027924,0.3725000023841858,0.3895000070333481,0.39800000190734863,0.41000001132488245,0.4214999973773956,0.42149999737739563,0.42499999701976776,0.42750000953674316,0.4364999979734421,0.4354999959468841,0.4385000020265579],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2939999997615814,0.3295000046491623,0.3684999942779541,0.38449999690055847,0.398499995470047,0.3959999978542328,0.4204999953508377,0.4335000067949295,0.445499986410141,0.443000003695488,0.455499991774559,0.45250000059604645,0.4529999941587448,0.4545000046491623],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29100000858306885,0.32400000095367426,0.3439999967813492,0.3575000017881393,0.3800000101327896,0.40049999952316284,0.4134999960660934,0.42099998891353607,0.4204999953508377,0.4280000030994415,0.44099999964237213,0.43799999356269836,0.44200000166893005,0.44600000977516174],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29950000345706934,0.33799999952316284,0.3789999932050705,0.3970000147819519,0.42149999737739563,0.431999996304512,0.4440000057220459,0.4490000009536743,0.45949999988079065,0.4714999943971634,0.48000000417232513,0.47749999165534973,0.48100000619888306,0.48950000107288355],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2955000102519989,0.3385000079870224,0.36800000071525574,0.40099999308586115,0.4099999964237213,0.41700001060962677,0.42400000989437103,0.4389999955892563,0.4414999932050705,0.4484999924898147,0.455499991774559,0.45799998939037323,0.4660000056028366,0.471000000834465],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29699999094009394,0.3369999925295512,0.3699999948342641,0.3930000066757202,0.41233333945274353,0.42733333508173627,0.43799999356269836,0.4506666660308838,0.454666664203008,0.47166667381922406,0.47766666611035663,0.476666659116745,0.48366666833559663,0.4853333334128062],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":10,"default":3}}}
data/plots/c4_filters_hellaswag/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25013685226440424,0.25661391019821167,0.2620016932487488,0.2657508552074432,0.2710244506597519,0.2744349539279938,0.27642421424388885,0.2818952649831772,0.2794509679079056,0.2831944525241852,0.28439727425575256,0.2866545617580414,0.2866020053625107,0.28615814447402954,0.2871949374675751],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25762456655502314,0.2630201578140259,0.2672136425971985,0.27234274148941034,0.2702306807041168,0.27446796000003815,0.27583475410938263,0.2770504504442215,0.2794356495141983,0.28302033245563507,0.28214274346828455,0.2855468988418579,0.2840581685304642,0.28505663573741913],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2583308666944504,0.2611347585916519,0.26333703100681305,0.2685028165578842,0.2725042402744293,0.27531248331069946,0.27463899552822113,0.2784048914909363,0.27915388345718384,0.27945026755332947,0.28207844495773315,0.281900018453598,0.2822476774454117,0.28188446164131165],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25205445289611816,0.2613788843154907,0.26891554892063135,0.2724043130874634,0.27449470758438105,0.27719296514987946,0.27587129175662994,0.2815589904785156,0.2833077013492584,0.2830233126878738,0.28461267054080963,0.2871275246143341,0.28650729358196253,0.2869933694601059],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25806266069412226,0.26165445148944855,0.26727744936943054,0.2677594721317291,0.2689383774995804,0.2724889665842056,0.27308812737464905,0.27327476441860193,0.27370570600032806,0.277080088853836,0.27814342081546783,0.2782013118267059,0.27888238430023193,0.2795541882514953],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25500668585300446,0.26221066713333124,0.26368947327136993,0.2702934741973877,0.27218967676162714,0.27553085982799524,0.27833363413810724,0.2786440253257751,0.2810910940170288,0.2834737300872803,0.2833452969789505,0.2836028486490249,0.28682972490787506,0.2868015915155411],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2557150324185689,0.25763070583343506,0.2643406589825948,0.26745049158732087,0.2721543808778127,0.2737567722797394,0.2732303539911906,0.27877557277679443,0.27923040588696796,0.2798382341861725,0.2831268608570099,0.28203009565671283,0.2810969154040019,0.28292057911554974],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.27699999511241913,0.288000002503395,0.2980000078678131,0.31199999153614044,0.29500000178813934,0.3139999955892563,0.31199999153614044,0.31200000643730164,0.3369999974966049,0.32899999618530273,0.3200000077486038,0.3310000002384186,0.3330000042915344],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25800000131130213,0.29899999499320984,0.27900001406669617,0.296999990940094,0.2980000078678131,0.3149999976158142,0.3179999887943268,0.32500000298023224,0.3079999983310699,0.32900001108646393,0.32599999010562897,0.3190000057220459,0.3279999941587448,0.3229999989271164],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.27400000393390656,0.2929999977350235,0.29600000381469727,0.306999996304512,0.3199999928474426,0.3190000057220459,0.31299999356269836,0.3229999989271164,0.3210000097751617,0.3270000070333481,0.3230000138282776,0.33399999141693115,0.3260000050067901],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2690000087022781,0.27300000190734863,0.28599999845027924,0.28299999237060547,0.3050000071525574,0.30900000035762787,0.31199999153614044,0.3200000077486038,0.33200000226497645,0.31200000643730164,0.3230000138282776,0.32299999892711634,0.32899999618530273,0.3320000022649765],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2590000033378601,0.278999999165535,0.2979999929666519,0.29899999499320984,0.3270000070333481,0.32800000905990595,0.32899999618530273,0.3369999974966049,0.33200000226497645,0.3260000050067901,0.33599999547004694,0.335999995470047,0.33500000834465027,0.3330000042915344],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2500000074505806,0.2759999930858612,0.2800000011920929,0.29099999368190765,0.3070000112056732,0.3070000112056732,0.3229999989271164,0.3240000009536743,0.31700000166893005,0.3100000023841858,0.31300000846385956,0.31700000166893005,0.3100000023841858,0.3189999908208847],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2526666720708211,0.26533332467079157,0.26600000262260437,0.29333333174387616,0.3059999942779541,0.30933333436648053,0.31600000460942584,0.31466667850812274,0.32933333516120905,0.3346666693687439,0.3366666634877522,0.3386666675408681,0.33799999952316284,0.33066666126251215],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6105000078678131,0.6350000202655792,0.6620000004768372,0.675000011920929,0.6940000057220459,0.6974999904632568,0.7054999768733978,0.7060000002384186,0.7059999704360962,0.7084999978542328,0.7060000002384186,0.7084999978542328,0.7144999802112579,0.7134999930858612],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6149999797344208,0.6520000100135803,0.6789999902248383,0.69200000166893,0.6949999928474426,0.6955000162124634,0.7055000066757202,0.7150000035762787,0.7169999778270721,0.7184999883174896,0.7235000133514404,0.7240000069141388,0.723499983549118,0.7249999940395355],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6549999713897705,0.6695000231266022,0.6860000193119049,0.6994999945163727,0.6980000138282776,0.7084999978542328,0.7120000123977661,0.7124999761581421,0.7160000205039978,0.7179999947547913,0.7195000052452087,0.7229999899864197,0.723499983549118],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6229999959468842,0.6590000092983246,0.6714999973773956,0.6820000112056732,0.6949999928474426,0.6940000057220459,0.7064999938011169,0.7005000114440918,0.6989999711513519,0.7084999978542328,0.7060000002384186,0.7099999785423279,0.7160000205039978,0.7150000035762787],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6215000152587891,0.6580000221729279,0.6784999966621399,0.69200000166893,0.703499972820282,0.7029999792575836,0.710999995470047,0.7139999866485596,0.7179999947547913,0.7150000035762787,0.715499997138977,0.7184999883174896,0.7160000205039978,0.7224999964237213],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6520000100135803,0.6800000071525574,0.6895000040531158,0.6949999928474426,0.6990000009536743,0.7045000195503235,0.7114999890327454,0.710999995470047,0.7159999907016754,0.7199999988079071,0.7199999988079071,0.7204999923706055,0.7254999876022339],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6196666558583578,0.6583333412806193,0.6833333373069763,0.6829999883969625,0.6983333230018616,0.702999989191691,0.7056666612625122,0.7076666553815206,0.7139999866485596,0.7209999958674113,0.7179999947547913,0.7273333470026652,0.7209999958674113,0.7273333271344503],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/c4_filters_hellaswag/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48950000107288355,0.48950000107288355,0.5049999952316284,0.5125000178813934,0.5004999935626984,0.5065000057220459,0.5055000185966492,0.511000007390976,0.5160000026226044,0.5209999978542328,0.5270000100135803,0.5219999849796295,0.5149999856948853,0.5125000178813934],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48350000381469727,0.5024999976158142,0.5039999932050705,0.5049999952316284,0.5115000009536743,0.50450000166893,0.5120000243186951,0.5144999921321869,0.5194999873638153,0.5250000059604645,0.5170000195503235,0.5180000066757202,0.527999997138977,0.5259999930858612],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.48900000751018524,0.5080000162124634,0.50450000166893,0.5185000002384186,0.5175000131130219,0.5099999904632568,0.526500016450882,0.5320000052452087,0.5230000019073486,0.5105000138282776,0.5214999914169312,0.523499995470047,0.5264999866485596],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.49000000953674316,0.4999999850988388,0.4989999830722809,0.5115000009536743,0.5105000138282776,0.5069999992847443,0.5109999775886536,0.5164999961853027,0.5059999823570251,0.5129999816417694,0.5059999823570251,0.5115000009536743,0.5164999961853027],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.4989999979734421,0.5064999908208847,0.49800001084804535,0.5040000081062317,0.5139999985694885,0.5160000026226044,0.5109999775886536,0.5070000141859055,0.5115000009536743,0.5105000138282776,0.5175000131130219,0.5200000107288361,0.5135000050067902],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49150000512599945,0.49900001287460327,0.49300000071525574,0.5015000104904175,0.5094999969005585,0.5109999775886536,0.5085000097751617,0.507500022649765,0.5205000042915344,0.5125000178813934,0.5160000026226044,0.5175000131130219,0.5150000154972076,0.5179999768733978],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4933333396911621,0.48733333746592206,0.5056666731834412,0.5066666503747305,0.5116666754086813,0.5076666871706644,0.5213333169619242,0.5150000055631002,0.5183333357175192,0.5169999996821085,0.515333334604899,0.5193333427111307,0.5143333276112875,0.5196666717529297],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}}}
data/plots/cross_ind_unfiltered_comparison.json DELETED
The diff for this file is too large to render. See raw diff
 
data/plots/cross_ind_unfiltered_comparison/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3534814938902855,0.3764607086777687,0.38782499730587,0.3981050960719585,0.4028486795723438,0.4125883243978023,0.4117814563214779,0.414029736071825,0.4197172522544861,0.4211113378405571,0.4279881417751312,0.4280137903988361,0.4280424378812313,0.4291964024305343,0.4326301179826259,0.4371833503246307,0.4346669465303421,0.4336562640964985,0.4432648755609989,0.4401291646063328,0.4394684173166752,0.4476612061262131,0.4465444348752498,0.4472153298556804,0.4433343075215816,0.4510187618434429,0.4459567815065384,0.4460812956094742,0.4498684890568256,0.4529943652451038,0.4528274349868297,0.4551213420927524,0.4549156539142132,0.4564928151667118,0.4576693661510944,0.4557182416319847,0.4536240361630916,0.457439012825489,0.4570476822555065,0.4589823484420776,0.462024375796318,0.4540738053619861,0.4550252184271812,0.4576593860983848,0.4573238864541054,0.4575810581445694,0.4622134491801262,0.4592566937208175,0.4614734016358852,0.4637473002076149,0.4625372551381588,0.4613912180066108,0.4597448222339153,0.4594792164862156,0.4662549719214439,0.4634026065468788,0.4633508697152138,0.4635734222829342,0.4628961533308029,0.4670135043561458,0.4639505892992019,0.4631133340299129,0.4665167145431041,0.4672448337078094,0.4693268723785877,0.4630668573081493,0.4676454700529575,0.4646359197795391,0.4621579721570015,0.4692446552217006,0.4704835228621959,0.4663223996758461,0.4680556617677212,0.466339822858572,0.4682099223136902,0.4711195565760135,0.4722655527293682,0.4727961830794811,0.4676857478916645,0.4719390422105789,0.4713102728128433,0.4712141714990139,0.4721613004803657,0.4713456854224205,0.4682970903813839,0.4679934531450271,0.4685162976384163,0.4679946713149547,0.4681242071092129,0.4702276065945625,0.472664151340723,0.4730790853500366,0.4731674715876579,0.4718914777040481,0.4719801284372806,0.4761029370129108,0.4735167175531387,0.4730370938777923,0.4730173237621784,0.4735377207398414,0.4777223989367485,0.4796326830983162,0.4734170883893966,0.4739485755562782,0.4748299159109592,0.4765299335122108,0.4745025858283043,0.4754423759877682,0.4784592799842357,0.4761341325938701,0.4760282784700393,0.4769757278263569,0.47154351323843,0.4786738082766533,0.4804279990494251,0.4777076803147793,0.4798569902777672,0.4759011939167976,0.4784621745347976,0.479673832654953,0.4780617095530033,0.48076206818223,0.47995800152421,0.4790860973298549,0.4817167408764362,0.4811586998403072,0.482547752559185,0.4816697351634502,0.4809327870607376,0.4816545359790325,0.4804601892828941,0.4776877984404564,0.4813711903989315,0.4844604581594467,0.4819537848234176,0.4820829331874847,0.4778126627206802,0.482935007661581,0.48230691999197,0.4826001971960068,0.4823969900608063,0.4811219945549965,0.4789146520197391,0.484035175293684,0.4848698377609253,0.4855728335678577,0.4825376532971859,0.485215101391077,0.4824351668357849,0.4835342466831207,0.4822137206792831,0.4838785007596016,0.4837255179882049,0.4853012599050998,0.4857851006090641,0.4863366298377514,0.4856646582484245,0.4842503517866134,0.4838776960968971,0.4846346862614155,0.4837041422724724,0.4813097268342972,0.4873070046305656,0.4841253720223903,0.4837464913725853,0.483069509267807,0.4851242564618587,0.4861010462045669],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3551952373236418,0.3736435137689113,0.3814037963747978,0.3948809280991554,0.3996850810945034,0.4089604057371616,0.4100853353738785,0.4119834117591381,0.4168377220630646,0.4186493046581745,0.4169826358556747,0.4234288297593593,0.4229162000119686,0.4273439794778824,0.4290364980697632,0.4291782416403293,0.4296907968819141,0.4311576783657074,0.4326641112565994,0.430318683385849,0.430436260998249,0.4339037239551544,0.4363459683954716,0.4357402548193931,0.4342963136732578,0.4366712383925915,0.4363959729671478,0.436981026083231,0.4447868093848228,0.4411709941923618,0.4406092017889023,0.4424176625907421,0.4423875361680984,0.4422253370285034,0.4410557933151722,0.4447037056088447,0.4454837813973427,0.4435960277915001,0.4468514993786812,0.4479999616742134,0.4428562931716442,0.445764634758234,0.4456562362611294,0.4488007053732872,0.4475954286754131,0.4468922987580299,0.4548408314585686,0.4511027485132217,0.4530330970883369,0.4483681954443455,0.4531726539134979,0.45334542542696,0.4544384703040123,0.4530758671462536,0.4540613554418087,0.4510113634169101,0.4538320265710354,0.4518541917204857,0.4536847211420536,0.4532708041369915,0.4552236869931221,0.455034039914608,0.4562875479459762,0.4532428197562694,0.4574853852391243,0.4517738744616508,0.4579889141023159,0.4538268558681011,0.456730306148529,0.4526018649339676,0.4562746733427048,0.4560015797615051,0.4555426277220249,0.4561501257121563,0.4524396173655987,0.4557023830711841,0.4589769169688225,0.4581078588962555,0.4620813727378845,0.4586601965129375,0.4568093195557594,0.4569808952510357,0.4567535072565079,0.4575250148773193,0.4606908001005649,0.4603964723646641,0.4622848592698574,0.4594669193029403,0.4640629850327968,0.4604269936680794,0.4634841009974479,0.4644578285515308,0.4642514958977699,0.4666304066777229,0.4616626128554344,0.4588956907391548,0.4620226770639419,0.4628621749579906,0.4595407098531723,0.4635516740381717,0.46005355194211,0.4601523540914058,0.4644204638898372,0.4620639197528362,0.46614545956254,0.4636696502566337,0.4610077403485775,0.4640897810459137,0.4636163525283336,0.4630545899271965,0.466012816876173,0.4650349207222461,0.4613720141351223,0.4644323363900184,0.4647249802947044,0.4656480401754379,0.4651664271950722,0.4622530452907085,0.4655019529163837,0.4650313258171081,0.466718140989542,0.4661559611558914,0.4661237150430679,0.4664223715662956,0.4640601389110088,0.4642657749354839,0.4633881188929081,0.4629989042878151,0.4685831367969513,0.4675870984792709,0.467183344066143,0.4678030684590339,0.4660939238965511,0.4691914953291416,0.4670972637832165,0.468262892216444,0.4672016054391861,0.4676182121038437,0.4698677137494087,0.4658828042447567,0.4701816700398922,0.4684622809290886,0.466015312820673,0.4675401039421558,0.4693200923502445,0.4702670983970165,0.4679145030677318,0.4676233418285846,0.4674933589994907,0.4678357951343059,0.4669915996491909,0.4657857678830623,0.4666901864111423,0.4669371582567692,0.4672787226736545,0.4684535376727581,0.4685697965323925,0.4694835692644119,0.4683254994451999,0.4712230190634727,0.4683987610042095,0.4707653746008873,0.4663059376180172,0.4683133698999882,0.4686385430395603,0.4657671600580215,0.4692615270614624],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3605199865996837,0.3733148723840713,0.3882005847990513,0.3934122696518898,0.3947227671742439,0.4042885974049568,0.3974800482392311,0.4055779427289963,0.4133470430970192,0.4117913842201233,0.4113653488457203,0.4149517640471458,0.4187851920723915,0.4252083078026771,0.4206527359783649,0.4240428246557712,0.422003373503685,0.4280910938978195,0.4244147576391697,0.4316282644867897,0.4295645765960216,0.4310102686285972,0.4360743537545204,0.4313482865691185,0.4350991360843181,0.4378576353192329,0.4335876516997814,0.4347924515604973,0.4348904751241207,0.436600212007761,0.430036511272192,0.4350974671542644,0.4399556629359722,0.4371416717767715,0.4363861419260502,0.4376698136329651,0.4405004419386387,0.4373639523983001,0.4379038028419018,0.4371281825006008,0.4393439553678036,0.440426729619503,0.4401675276458263,0.4429537951946258,0.4449137263000011,0.4434786736965179,0.4450470842421055,0.4454202279448509,0.4394537284970283,0.442185215651989,0.4461225643754005,0.4427758157253265,0.4430646039545536,0.4476901069283485,0.4478763341903686,0.4493869319558143,0.4448477327823639,0.450044184923172,0.4498609118163585,0.4457665979862213,0.4506924152374267,0.449855338782072,0.448790930211544,0.4474099352955818,0.4546772800385952,0.4529431238770485,0.452015146613121,0.4502020999789238,0.4493804536759853,0.4523266032338142,0.4551868587732315,0.4501944817602634,0.4493303671479225,0.4526805207133293,0.4533850513398647,0.4518048763275146,0.4518973492085933,0.4531301632523536,0.4518006071448326,0.4553494565188885,0.4528752230107784,0.4536322727799415,0.4561733976006508,0.4549491256475448,0.4574789106845855,0.4577847123146057,0.4563642293214798,0.4578686729073524,0.4561499990522861,0.4537816494703293,0.4542164430022239,0.4559455662965774,0.4554723873734474,0.4575514122843742,0.4575202167034149,0.4592722058296203,0.4585275091230869,0.4580587856471538,0.456934317946434,0.4577495418488979,0.4540119916200638,0.4570806957781315,0.4608120545744896,0.4588425755500793,0.4578334167599678,0.4610816091299057,0.4598177038133144,0.461849745362997,0.4631866924464702,0.4601576402783394,0.4646804705262184,0.4632389545440674,0.4604574106633663,0.4602976888418197,0.4581312239170074,0.4654182009398937,0.4655338563024997,0.4616620391607284,0.461054053157568,0.4613021649420261,0.4658613465726375,0.4633531905710697,0.4613638147711754,0.4643996246159076,0.462500050663948,0.4650798961520195,0.4648764543235302,0.4639869071543216,0.4634246975183487,0.46585888043046,0.4639799632132053,0.4630857892334461,0.4644265696406364,0.4642998576164245,0.4686848931014538,0.4687492996454239,0.4650243632495403,0.4627032242715359,0.4665953740477562,0.4660026729106903,0.4664581045508384,0.4676475040614605,0.4657339677214622,0.4664678275585174,0.4673498086631298,0.4676674827933311,0.4680955372750759,0.4681585058569908,0.4659864418208599,0.4686457589268684,0.4661462865769863,0.4658931568264961,0.4674226939678192,0.46805215254426,0.4682257212698459,0.4689070098102093,0.4699570722877979,0.4655096270143986,0.4688013233244419,0.4707522802054882,0.4661469310522079,0.4688841328024864,0.4671329781413078,0.4662554152309894,0.4697433896362781,0.4698473587632179,0.4676505327224731,0.4696521013975143],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.3308933284133672,0.3608616776764393,0.3745453506708145,0.3862277194857597,0.3989979773759842,0.406296543776989,0.4094927236437797,0.4138859286904335,0.4177777022123337,0.4208802655339241,0.4254550077021122,0.4283009432256222,0.429458349943161,0.4330311268568039,0.4303463362157345,0.4349483698606491,0.4348161295056343,0.438955657184124,0.4389265701174736,0.4393925778567791,0.4383306242525577,0.4436748661100864,0.4423373565077781,0.4460027255117893,0.4440812170505523,0.4476902261376381,0.4465879611670971,0.4497823156416416,0.4513350501656532,0.4518667235970497,0.45149727165699,0.4513994492590427,0.4521937072277069,0.4520382955670357,0.4530793912708759,0.4516105614602566,0.4530563354492187,0.4495660625398159,0.4520940892398357,0.4561133235692978,0.4522969461977482,0.4575686641037464,0.4589144177734852,0.4582882039248943,0.457970168441534,0.4554797261953354,0.4622044861316681,0.4596928395330906,0.4624353349208832,0.4619148448109627,0.461100060492754,0.458431463688612,0.4620467089116573,0.4562215581536293,0.4620163068175316,0.4631462283432483,0.4600549824535846,0.4620365314185619,0.458735141903162,0.461642112582922,0.461245734244585,0.4645131677389145,0.4629777930676937,0.4651660025119781,0.4653937108814716,0.4676259346306324,0.4667201824486255,0.4650012850761413,0.4676916748285293,0.4708514772355556,0.4673572592437267,0.4689626581966877,0.4678038358688354,0.4667215310037136,0.4646228328347206,0.4662510119378567,0.4674677737057209,0.4690804108977318,0.4634581170976162,0.4701276533305645,0.4676450751721859,0.4672758504748344,0.4674397967755794,0.4656238108873367,0.4690065123140812,0.4677213467657566,0.4678985886275768,0.4735414572060108,0.4705612398684025,0.4703374318778515,0.4704933613538742,0.4688010476529598,0.4699571952223778,0.4674785658717155,0.4701188169419765,0.4682065695524215,0.4729971997439861,0.4748715870082378,0.4745333231985569,0.4737020246684551,0.4747246317565441,0.4771635122597217,0.4740425907075405,0.475264236330986,0.4744705818593502,0.474684040993452,0.4721556939184665,0.475641455501318,0.476833701133728,0.4746401384472847,0.4742486327886581,0.4730467088520527,0.4773029200732708,0.4760043211281299,0.4770320989191532,0.4742161482572555,0.4780259765684604,0.4806670732796192,0.4784667380154133,0.4788618609309196,0.4762138128280639,0.4777246937155723,0.4796081893146038,0.4798486456274986,0.475479181855917,0.4779988899827003,0.4765858314931392,0.4772914499044418,0.47843898832798,0.4799034222960472,0.4803600236773491,0.4751846008002758,0.4777872562408447,0.4779460839927196,0.4787487275898456,0.4808406494557857,0.4810357913374901,0.4797308407723903,0.4800078608095646,0.4806460626423359,0.4810502976179123,0.4797912389039993,0.477332629263401,0.4818884879350662,0.482621606439352,0.4833096489310264,0.4821632876992225,0.4831674285233021,0.4830279909074306,0.4849893450736999,0.4845218025147915,0.4825541749596596,0.4833571836352348,0.4853803217411041,0.483093187212944,0.4850797094404697,0.485261783003807,0.4837660938501358,0.4835929833352566,0.4855643883347511,0.4832059442996979,0.484714712947607,0.4839249886572361,0.4829078912734985,0.4818423055112362,0.482727088034153,0.4824129492044449,0.4820138849318027,0.4865870922803879],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2899999916553497,0.31700000166893,0.3409999907016754,0.3425000011920929,0.3485000133514404,0.3555000126361847,0.3574999868869781,0.3585000038146972,0.363999992609024,0.3619999885559082,0.3675000071525574,0.3865000009536743,0.3810000121593475,0.3740000128746032,0.3810000121593475,0.3810000121593475,0.3860000073909759,0.3810000121593475,0.3894999921321869,0.3849999904632568,0.3855000138282776,0.3989999890327453,0.3980000019073486,0.3995000123977661,0.395000010728836,0.4084999859333038,0.4040000140666961,0.4004999995231628,0.3955000042915344,0.4135000109672546,0.4070000052452087,0.4104999899864197,0.4014999866485595,0.4099999964237213,0.4199999868869781,0.414000004529953,0.402999997138977,0.4214999973773956,0.4095000028610229,0.4059999883174896,0.4090000092983246,0.4074999988079071,0.4120000004768371,0.4154999852180481,0.4189999997615814,0.4149999916553497,0.429500013589859,0.4154999852180481,0.4214999973773956,0.4244999885559082,0.4205000102519989,0.4269999861717224,0.4214999973773956,0.4180000126361847,0.4415000081062317,0.4320000112056732,0.4350000023841858,0.4259999990463257,0.4300000071525574,0.4259999990463257,0.4189999997615814,0.4269999861717224,0.4199999868869781,0.426499992609024,0.4350000023841858,0.4289999902248382,0.4345000088214874,0.4259999990463257,0.426499992609024,0.4395000040531158,0.4395000040531158,0.4359999895095825,0.4280000030994415,0.4370000064373016,0.4329999983310699,0.4309999942779541,0.4490000009536743,0.4399999976158142,0.4339999854564667,0.4399999976158142,0.4345000088214874,0.429500013589859,0.4370000064373016,0.4379999935626983,0.4284999966621399,0.4309999942779541,0.4350000023841858,0.4399999976158142,0.4314999878406524,0.4300000071525574,0.4410000145435333,0.4345000088214874,0.4410000145435333,0.4345000088214874,0.4339999854564667,0.4460000097751617,0.4410000145435333,0.4469999969005584,0.4480000138282776,0.4435000121593475,0.4375,0.4519999921321869,0.4480000138282776,0.4429999887943268,0.4519999921321869,0.4435000121593475,0.4334999918937683,0.4460000097751617,0.4564999938011169,0.4469999969005584,0.453000009059906,0.4485000073909759,0.4410000145435333,0.4444999992847442,0.4485000073909759,0.457500010728836,0.4469999969005584,0.4535000026226043,0.4535000026226043,0.4485000073909759,0.4490000009536743,0.4505000114440918,0.4595000147819519,0.4544999897480011,0.453000009059906,0.4605000019073486,0.4620000123977661,0.457500010728836,0.453000009059906,0.4550000131130218,0.460999995470047,0.4449999928474426,0.4474999904632568,0.457500010728836,0.4584999978542328,0.4494999945163727,0.4474999904632568,0.4625000059604645,0.4639999866485595,0.4555000066757202,0.4469999969005584,0.4600000083446502,0.453000009059906,0.4629999995231628,0.4589999914169311,0.4614999890327453,0.4555000066757202,0.4560000002384186,0.4580000042915344,0.4584999978542328,0.4560000002384186,0.4605000019073486,0.4595000147819519,0.4639999866485595,0.4614999890327453,0.4564999938011169,0.4634999930858612,0.4625000059604645,0.4614999890327453,0.4679999947547912,0.4584999978542328,0.4595000147819519,0.4505000114440918,0.4544999897480011,0.4595000147819519,0.4620000123977661,0.4670000076293945,0.4555000066757202],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2904999852180481,0.3289999961853027,0.3379999995231628,0.3400000035762787,0.3535000085830688,0.3700000047683716,0.3619999885559082,0.3695000112056732,0.3625000119209289,0.3745000064373016,0.3804999887943268,0.3835000097751617,0.3810000121593475,0.3785000145435333,0.3799999952316284,0.3885000050067901,0.3919999897480011,0.3899999856948852,0.3939999938011169,0.4004999995231628,0.3889999985694885,0.4000000059604645,0.3930000066757202,0.4025000035762787,0.398499995470047,0.3939999938011169,0.3989999890327453,0.4020000100135803,0.4079999923706054,0.4129999876022339,0.4014999866485595,0.4129999876022339,0.4079999923706054,0.4115000069141388,0.4070000052452087,0.4095000028610229,0.4199999868869781,0.4165000021457672,0.4239999949932098,0.4129999876022339,0.4034999907016754,0.4050000011920929,0.4135000109672546,0.4189999997615814,0.418500006198883,0.4199999868869781,0.4365000128746032,0.4320000112056732,0.4255000054836273,0.4259999990463257,0.4244999885559082,0.4275000095367431,0.4259999990463257,0.4210000038146972,0.421999990940094,0.4099999964237213,0.4305000007152557,0.4239999949932098,0.4194999933242798,0.4205000102519989,0.4255000054836273,0.414000004529953,0.4210000038146972,0.4180000126361847,0.4429999887943268,0.429500013589859,0.4165000021457672,0.4239999949932098,0.4255000054836273,0.4180000126361847,0.4325000047683716,0.4305000007152557,0.4329999983310699,0.4325000047683716,0.4320000112056732,0.4375,0.4410000145435333,0.4395000040531158,0.4379999935626983,0.4280000030994415,0.4365000128746032,0.4205000102519989,0.426499992609024,0.4280000030994415,0.4354999959468841,0.4314999878406524,0.429500013589859,0.421999990940094,0.4345000088214874,0.429500013589859,0.4354999959468841,0.4314999878406524,0.4404999911785126,0.4384999871253967,0.4359999895095825,0.4345000088214874,0.4320000112056732,0.4345000088214874,0.4375,0.4410000145435333,0.4280000030994415,0.4320000112056732,0.44200000166893,0.4460000097751617,0.4390000104904175,0.4314999878406524,0.4339999854564667,0.4390000104904175,0.4460000097751617,0.4309999942779541,0.4444999992847442,0.44200000166893,0.4404999911785126,0.4395000040531158,0.4370000064373016,0.4519999921321869,0.4429999887943268,0.4395000040531158,0.4415000081062317,0.4384999871253967,0.4494999945163727,0.4469999969005584,0.4375,0.4395000040531158,0.4345000088214874,0.4390000104904175,0.4375,0.4309999942779541,0.4320000112056732,0.4415000081062317,0.4354999959468841,0.445499986410141,0.4404999911785126,0.4429999887943268,0.4395000040531158,0.4354999959468841,0.4429999887943268,0.4410000145435333,0.4494999945163727,0.4429999887943268,0.4460000097751617,0.445499986410141,0.4429999887943268,0.4429999887943268,0.4350000023841858,0.4474999904632568,0.4415000081062317,0.4424999952316284,0.4375,0.4444999992847442,0.4424999952316284,0.4354999959468841,0.445499986410141,0.4379999935626983,0.4449999928474426,0.4365000128746032,0.4474999904632568,0.4440000057220459,0.4465000033378601,0.445499986410141,0.4474999904632568,0.4494999945163727,0.4449999928474426,0.4444999992847442,0.44200000166893,0.4345000088214874,0.4404999911785126],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2894999980926513,0.3235000073909759,0.3389999866485595,0.3384999930858612,0.3459999859333038,0.359499990940094,0.3429999947547912,0.3619999885559082,0.3564999997615814,0.3625000119209289,0.363999992609024,0.3680000007152557,0.3680000007152557,0.3785000145435333,0.3684999942779541,0.375,0.3734999895095825,0.3849999904632568,0.3944999873638153,0.3865000009536743,0.395000010728836,0.3935000002384186,0.3980000019073486,0.3910000026226043,0.3885000050067901,0.3914999961853027,0.3815000057220459,0.395000010728836,0.3894999921321869,0.395000010728836,0.3935000002384186,0.4034999907016754,0.4004999995231628,0.3970000147819519,0.3975000083446502,0.3995000123977661,0.3980000019073486,0.4034999907016754,0.3959999978542328,0.3989999890327453,0.402999997138977,0.3880000114440918,0.3980000019073486,0.4040000140666961,0.3989999890327453,0.3970000147819519,0.3925000131130218,0.4120000004768371,0.3935000002384186,0.395000010728836,0.4070000052452087,0.3935000002384186,0.4034999907016754,0.4189999997615814,0.4129999876022339,0.4160000085830688,0.4149999916553497,0.418500006198883,0.4225000143051147,0.4174999892711639,0.4210000038146972,0.4045000076293945,0.4079999923706054,0.4124999940395355,0.4144999980926513,0.4169999957084656,0.4194999933242798,0.4154999852180481,0.4169999957084656,0.4225000143051147,0.4225000143051147,0.4230000078678131,0.4160000085830688,0.4325000047683716,0.4325000047683716,0.4199999868869781,0.4199999868869781,0.4189999997615814,0.4269999861717224,0.4259999990463257,0.4230000078678131,0.4144999980926513,0.4329999983310699,0.4275000095367431,0.4305000007152557,0.4289999902248382,0.4235000014305115,0.4235000014305115,0.4325000047683716,0.4244999885559082,0.4314999878406524,0.4194999933242798,0.4350000023841858,0.4269999861717224,0.4235000014305115,0.4300000071525574,0.4284999966621399,0.4255000054836273,0.4280000030994415,0.4345000088214874,0.4225000143051147,0.4334999918937683,0.4300000071525574,0.4350000023841858,0.429500013589859,0.4325000047683716,0.4384999871253967,0.4345000088214874,0.4354999959468841,0.4359999895095825,0.4354999959468841,0.4424999952316284,0.4424999952316284,0.4320000112056732,0.4280000030994415,0.4390000104904175,0.4480000138282776,0.4415000081062317,0.4384999871253967,0.4390000104904175,0.4494999945163727,0.4449999928474426,0.4384999871253967,0.4424999952316284,0.4359999895095825,0.445499986410141,0.4399999976158142,0.4375,0.4410000145435333,0.4384999871253967,0.4375,0.4329999983310699,0.4370000064373016,0.4354999959468841,0.4440000057220459,0.4384999871253967,0.4384999871253967,0.4390000104904175,0.4424999952316284,0.4379999935626983,0.4345000088214874,0.4354999959468841,0.4440000057220459,0.4395000040531158,0.4465000033378601,0.4404999911785126,0.4505000114440918,0.4480000138282776,0.4449999928474426,0.445499986410141,0.4410000145435333,0.4485000073909759,0.4460000097751617,0.4480000138282776,0.4465000033378601,0.4460000097751617,0.4460000097751617,0.4395000040531158,0.4474999904632568,0.4469999969005584,0.4404999911785126,0.4440000057220459,0.4435000121593475,0.4435000121593475,0.4514999985694885,0.4474999904632568,0.4474999904632568,0.445499986410141],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2509999871253967,0.2939999997615814,0.3174999952316284,0.3294999897480011,0.3510000109672546,0.3485000133514404,0.3634999990463257,0.3700000047683716,0.3524999916553497,0.375,0.3804999887943268,0.37950000166893,0.3824999928474426,0.3799999952316284,0.3865000009536743,0.395000010728836,0.3844999969005584,0.3894999921321869,0.3855000138282776,0.3955000042915344,0.3995000123977661,0.4009999930858612,0.3939999938011169,0.3970000147819519,0.3955000042915344,0.3955000042915344,0.4079999923706054,0.3959999978542328,0.4090000092983246,0.4045000076293945,0.3930000066757202,0.4099999964237213,0.4054999947547912,0.4124999940395355,0.4160000085830688,0.4149999916553497,0.4070000052452087,0.4110000133514404,0.4144999980926513,0.4120000004768371,0.4050000011920929,0.4165000021457672,0.4180000126361847,0.4050000011920929,0.4120000004768371,0.4135000109672546,0.4320000112056732,0.4284999966621399,0.4269999861717224,0.414000004529953,0.4255000054836273,0.4165000021457672,0.4144999980926513,0.4079999923706054,0.4205000102519989,0.4180000126361847,0.4244999885559082,0.4235000014305115,0.4244999885559082,0.4300000071525574,0.4160000085830688,0.4205000102519989,0.4329999983310699,0.4280000030994415,0.4244999885559082,0.4375,0.4244999885559082,0.4365000128746032,0.4329999983310699,0.4424999952316284,0.4390000104904175,0.4449999928474426,0.445499986410141,0.4320000112056732,0.4365000128746032,0.4244999885559082,0.429500013589859,0.4395000040531158,0.4284999966621399,0.44200000166893,0.4370000064373016,0.4399999976158142,0.4334999918937683,0.4429999887943268,0.44200000166893,0.4334999918937683,0.4384999871253967,0.4365000128746032,0.4390000104904175,0.4354999959468841,0.44200000166893,0.4350000023841858,0.4390000104904175,0.4404999911785126,0.4410000145435333,0.4305000007152557,0.4490000009536743,0.4510000050067901,0.4605000019073486,0.4490000009536743,0.449999988079071,0.4595000147819519,0.4514999985694885,0.4490000009536743,0.4474999904632568,0.4444999992847442,0.4524999856948852,0.4465000033378601,0.4519999921321869,0.4550000131130218,0.4524999856948852,0.4429999887943268,0.4550000131130218,0.4510000050067901,0.4560000002384186,0.4465000033378601,0.4485000073909759,0.4524999856948852,0.4440000057220459,0.457500010728836,0.4544999897480011,0.4480000138282776,0.4584999978542328,0.4544999897480011,0.4569999873638153,0.4584999978542328,0.4444999992847442,0.4629999995231628,0.457500010728836,0.4555000066757202,0.4569999873638153,0.4474999904632568,0.4564999938011169,0.4595000147819519,0.4634999930858612,0.4555000066757202,0.453000009059906,0.457500010728836,0.4614999890327453,0.460999995470047,0.4539999961853027,0.4595000147819519,0.4629999995231628,0.4670000076293945,0.4580000042915344,0.4639999866485595,0.457500010728836,0.4595000147819519,0.4665000140666961,0.4584999978542328,0.4629999995231628,0.4595000147819519,0.4659999907016754,0.4645000100135803,0.4675000011920929,0.4690000116825104,0.4715000092983246,0.4634999930858612,0.4634999930858612,0.4639999866485595,0.465499997138977,0.4675000011920929,0.4670000076293945,0.4600000083446502,0.4595000147819519,0.4625000059604645,0.4600000083446502,0.4645000100135803,0.4715000092983246],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2529999911785126,0.2800000011920929,0.2870000004768371,0.3179999887943268,0.3129999935626983,0.3210000097751617,0.3160000145435333,0.3210000097751617,0.31700000166893,0.3330000042915344,0.3389999866485595,0.3289999961853027,0.3429999947547912,0.3370000123977661,0.3379999995231628,0.3459999859333038,0.3490000069141388,0.3470000028610229,0.3600000143051147,0.3569999933242798,0.3449999988079071,0.3650000095367431,0.3499999940395355,0.3540000021457672,0.3569999933242798,0.3619999885559082,0.3619999885559082,0.3580000102519989,0.3740000128746032,0.3709999918937683,0.3720000088214874,0.3759999871253967,0.3720000088214874,0.3659999966621399,0.3790000081062317,0.3610000014305115,0.3650000095367431,0.3650000095367431,0.3720000088214874,0.3729999959468841,0.3790000081062317,0.3680000007152557,0.3659999966621399,0.3680000007152557,0.3619999885559082,0.3619999885559082,0.3729999959468841,0.3720000088214874,0.3650000095367431,0.3759999871253967,0.367000013589859,0.3650000095367431,0.3680000007152557,0.3580000102519989,0.3589999973773956,0.3700000047683716,0.3680000007152557,0.367000013589859,0.3709999918937683,0.3880000114440918,0.3810000121593475,0.375,0.4040000140666961,0.3860000073909759,0.3840000033378601,0.3779999911785126,0.3729999959468841,0.3720000088214874,0.3799999952316284,0.3799999952316284,0.3779999911785126,0.3689999878406524,0.3770000040531158,0.3740000128746032,0.3819999992847442,0.3899999856948852,0.3799999952316284,0.3919999897480011,0.3720000088214874,0.3770000040531158,0.3930000066757202,0.3849999904632568,0.3899999856948852,0.3740000128746032,0.3740000128746032,0.3799999952316284,0.3779999911785126,0.3880000114440918,0.3709999918937683,0.3810000121593475,0.3880000114440918,0.3980000019073486,0.3819999992847442,0.3849999904632568,0.3810000121593475,0.3819999992847442,0.3889999985694885,0.3840000033378601,0.3910000026226043,0.3899999856948852,0.3959999978542328,0.3880000114440918,0.3869999945163727,0.3779999911785126,0.3819999992847442,0.3919999897480011,0.3849999904632568,0.3860000073909759,0.3919999897480011,0.3819999992847442,0.3819999992847442,0.3889999985694885,0.3889999985694885,0.3860000073909759,0.3880000114440918,0.3889999985694885,0.3939999938011169,0.3899999856948852,0.3869999945163727,0.3910000026226043,0.3910000026226043,0.3910000026226043,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3970000147819519,0.3939999938011169,0.4000000059604645,0.3970000147819519,0.402999997138977,0.3959999978542328,0.3959999978542328,0.4000000059604645,0.4040000140666961,0.4020000100135803,0.3989999890327453,0.3919999897480011,0.3930000066757202,0.3930000066757202,0.3980000019073486,0.4000000059604645,0.395000010728836,0.3899999856948852,0.4059999883174896,0.4020000100135803,0.4020000100135803,0.4059999883174896,0.3970000147819519,0.4110000133514404,0.4050000011920929,0.4000000059604645,0.4090000092983246,0.3989999890327453,0.402999997138977,0.4009999930858612,0.3980000019073486,0.4090000092983246,0.4079999923706054,0.4079999923706054,0.4020000100135803,0.402999997138977,0.402999997138977,0.4059999883174896,0.4040000140666961,0.4059999883174896,0.3989999890327453,0.4070000052452087,0.4059999883174896],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2540000081062317,0.2870000004768371,0.2829999923706054,0.3210000097751617,0.3079999983310699,0.3230000138282776,0.3179999887943268,0.3160000145435333,0.3289999961853027,0.3199999928474426,0.324999988079071,0.3310000002384186,0.3260000050067901,0.335999995470047,0.335999995470047,0.3310000002384186,0.335999995470047,0.3339999914169311,0.3459999859333038,0.3330000042915344,0.3449999988079071,0.3429999947547912,0.3479999899864197,0.3420000076293945,0.3479999899864197,0.3459999859333038,0.3339999914169311,0.3350000083446502,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3379999995231628,0.3420000076293945,0.3610000014305115,0.3409999907016754,0.356000006198883,0.3630000054836273,0.3519999980926513,0.3510000109672546,0.3619999885559082,0.3569999933242798,0.3479999899864197,0.3529999852180481,0.3569999933242798,0.3529999852180481,0.3519999980926513,0.3549999892711639,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3619999885559082,0.3459999859333038,0.3519999980926513,0.3529999852180481,0.3680000007152557,0.3519999980926513,0.3580000102519989,0.3549999892711639,0.3490000069141388,0.3499999940395355,0.3600000143051147,0.3709999918937683,0.3659999966621399,0.3569999933242798,0.3510000109672546,0.3600000143051147,0.367000013589859,0.3529999852180481,0.363999992609024,0.3630000054836273,0.3619999885559082,0.356000006198883,0.367000013589859,0.3600000143051147,0.3540000021457672,0.3589999973773956,0.3610000014305115,0.356000006198883,0.3680000007152557,0.3519999980926513,0.3549999892711639,0.3479999899864197,0.3549999892711639,0.3519999980926513,0.367000013589859,0.3600000143051147,0.3600000143051147,0.3680000007152557,0.356000006198883,0.3610000014305115,0.3689999878406524,0.367000013589859,0.3689999878406524,0.3720000088214874,0.3680000007152557,0.3569999933242798,0.3650000095367431,0.363999992609024,0.3610000014305115,0.3709999918937683,0.3569999933242798,0.3540000021457672,0.3619999885559082,0.3549999892711639,0.3650000095367431,0.3680000007152557,0.3589999973773956,0.356000006198883,0.3610000014305115,0.3619999885559082,0.3740000128746032,0.3700000047683716,0.3650000095367431,0.3819999992847442,0.3770000040531158,0.3810000121593475,0.3729999959468841,0.3680000007152557,0.3689999878406524,0.3740000128746032,0.3779999911785126,0.3720000088214874,0.3740000128746032,0.367000013589859,0.363999992609024,0.367000013589859,0.3689999878406524,0.3709999918937683,0.3709999918937683,0.375,0.3680000007152557,0.375,0.3630000054836273,0.3720000088214874,0.3819999992847442,0.3729999959468841,0.3689999878406524,0.363999992609024,0.3709999918937683,0.3659999966621399,0.3700000047683716,0.367000013589859,0.3709999918937683,0.3759999871253967,0.3759999871253967,0.3729999959468841,0.3729999959468841,0.3729999959468841,0.3779999911785126,0.375,0.3700000047683716,0.3659999966621399,0.3759999871253967,0.3779999911785126,0.3709999918937683,0.3840000033378601,0.3720000088214874,0.375,0.367000013589859,0.3770000040531158,0.3709999918937683,0.375,0.3709999918937683,0.3740000128746032,0.3740000128746032,0.375,0.3770000040531158],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2599999904632568,0.277999997138977,0.2910000085830688,0.3070000112056732,0.3140000104904175,0.3019999861717224,0.3059999942779541,0.3210000097751617,0.3230000138282776,0.324999988079071,0.3149999976158142,0.3109999895095825,0.3339999914169311,0.3289999961853027,0.3319999873638153,0.3319999873638153,0.3300000131130218,0.3370000123977661,0.3219999969005584,0.3370000123977661,0.328000009059906,0.3339999914169311,0.3420000076293945,0.3400000035762787,0.3440000116825104,0.3510000109672546,0.3409999907016754,0.3449999988079071,0.3339999914169311,0.3540000021457672,0.3339999914169311,0.3470000028610229,0.3470000028610229,0.3440000116825104,0.3589999973773956,0.3569999933242798,0.3630000054836273,0.3549999892711639,0.3589999973773956,0.3449999988079071,0.3549999892711639,0.3449999988079071,0.3389999866485595,0.3499999940395355,0.3610000014305115,0.3619999885559082,0.3600000143051147,0.3519999980926513,0.3479999899864197,0.356000006198883,0.3519999980926513,0.3440000116825104,0.3490000069141388,0.3519999980926513,0.3470000028610229,0.3589999973773956,0.3449999988079071,0.3490000069141388,0.356000006198883,0.3619999885559082,0.3569999933242798,0.3659999966621399,0.3610000014305115,0.3549999892711639,0.3700000047683716,0.363999992609024,0.3600000143051147,0.3580000102519989,0.3549999892711639,0.3619999885559082,0.3689999878406524,0.3630000054836273,0.363999992609024,0.3700000047683716,0.367000013589859,0.3630000054836273,0.3630000054836273,0.3700000047683716,0.3589999973773956,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3589999973773956,0.3650000095367431,0.3709999918937683,0.3680000007152557,0.3689999878406524,0.3650000095367431,0.3729999959468841,0.3619999885559082,0.3689999878406524,0.3569999933242798,0.3510000109672546,0.3680000007152557,0.363999992609024,0.3700000047683716,0.3659999966621399,0.3659999966621399,0.363999992609024,0.3619999885559082,0.3659999966621399,0.3680000007152557,0.3610000014305115,0.3720000088214874,0.3729999959468841,0.3810000121593475,0.3630000054836273,0.3689999878406524,0.3709999918937683,0.3759999871253967,0.382999986410141,0.3729999959468841,0.3720000088214874,0.3680000007152557,0.3659999966621399,0.3650000095367431,0.363999992609024,0.3589999973773956,0.356000006198883,0.3650000095367431,0.3659999966621399,0.367000013589859,0.3729999959468841,0.3720000088214874,0.375,0.3740000128746032,0.3700000047683716,0.3569999933242798,0.3759999871253967,0.3740000128746032,0.367000013589859,0.3770000040531158,0.3759999871253967,0.3709999918937683,0.3779999911785126,0.3709999918937683,0.3689999878406524,0.3799999952316284,0.3630000054836273,0.375,0.3700000047683716,0.3700000047683716,0.3729999959468841,0.3720000088214874,0.3790000081062317,0.375,0.3729999959468841,0.3770000040531158,0.3799999952316284,0.3779999911785126,0.3720000088214874,0.3799999952316284,0.3759999871253967,0.3799999952316284,0.3790000081062317,0.375,0.3740000128746032,0.3729999959468841,0.3840000033378601,0.3659999966621399,0.3759999871253967,0.3720000088214874,0.3720000088214874,0.3759999871253967,0.375,0.3650000095367431,0.3729999959468841],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2329999953508377,0.2639999985694885,0.2790000140666961,0.296999990940094,0.3109999895095825,0.3240000009536743,0.3070000112056732,0.3210000097751617,0.31700000166893,0.3339999914169311,0.324999988079071,0.3260000050067901,0.3330000042915344,0.3409999907016754,0.3350000083446502,0.3400000035762787,0.3529999852180481,0.3400000035762787,0.3490000069141388,0.3529999852180481,0.3499999940395355,0.3459999859333038,0.3370000123977661,0.356000006198883,0.3490000069141388,0.3429999947547912,0.3490000069141388,0.3610000014305115,0.3499999940395355,0.3569999933242798,0.3610000014305115,0.3619999885559082,0.3449999988079071,0.3409999907016754,0.3420000076293945,0.3449999988079071,0.3409999907016754,0.3379999995231628,0.3420000076293945,0.3569999933242798,0.3529999852180481,0.3610000014305115,0.363999992609024,0.3600000143051147,0.3540000021457672,0.3499999940395355,0.3689999878406524,0.367000013589859,0.3569999933242798,0.3610000014305115,0.3680000007152557,0.3630000054836273,0.3709999918937683,0.3540000021457672,0.3580000102519989,0.367000013589859,0.3529999852180481,0.356000006198883,0.3569999933242798,0.3610000014305115,0.3700000047683716,0.375,0.3709999918937683,0.3819999992847442,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3650000095367431,0.3709999918937683,0.3840000033378601,0.3740000128746032,0.375,0.356000006198883,0.3689999878406524,0.3700000047683716,0.3819999992847442,0.3799999952316284,0.3779999911785126,0.3729999959468841,0.3709999918937683,0.3759999871253967,0.3709999918937683,0.3759999871253967,0.3779999911785126,0.3779999911785126,0.3689999878406524,0.3840000033378601,0.3860000073909759,0.3849999904632568,0.3790000081062317,0.375,0.3849999904632568,0.3720000088214874,0.3770000040531158,0.3799999952316284,0.3810000121593475,0.382999986410141,0.3650000095367431,0.3740000128746032,0.382999986410141,0.3689999878406524,0.3759999871253967,0.3869999945163727,0.3889999985694885,0.3860000073909759,0.3819999992847442,0.3689999878406524,0.3860000073909759,0.3810000121593475,0.382999986410141,0.3819999992847442,0.3840000033378601,0.3889999985694885,0.3880000114440918,0.3849999904632568,0.3799999952316284,0.3910000026226043,0.3989999890327453,0.3880000114440918,0.3880000114440918,0.3840000033378601,0.3880000114440918,0.3860000073909759,0.3919999897480011,0.3880000114440918,0.3939999938011169,0.3869999945163727,0.3919999897480011,0.3910000026226043,0.382999986410141,0.3930000066757202,0.3840000033378601,0.3880000114440918,0.3840000033378601,0.3819999992847442,0.382999986410141,0.3880000114440918,0.3860000073909759,0.3860000073909759,0.3869999945163727,0.3860000073909759,0.3899999856948852,0.3819999992847442,0.3860000073909759,0.3889999985694885,0.3840000033378601,0.395000010728836,0.3899999856948852,0.3899999856948852,0.3910000026226043,0.3959999978542328,0.3959999978542328,0.3919999897480011,0.3980000019073486,0.3880000114440918,0.3930000066757202,0.4000000059604645,0.3919999897480011,0.3919999897480011,0.4040000140666961,0.3930000066757202,0.3970000147819519,0.3889999985694885,0.3959999978542328,0.3930000066757202,0.3939999938011169,0.3970000147819519,0.3910000026226043,0.4020000100135803],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2759999930858612,0.328000009059906,0.3499999940395355,0.3889999985694885,0.3910000026226043,0.402999997138977,0.4210000038146972,0.4280000030994415,0.4359999895095825,0.4469999969005584,0.4440000057220459,0.4600000083446502,0.4690000116825104,0.4600000083446502,0.4679999947547912,0.4729999899864197,0.4760000109672546,0.4839999973773956,0.4939999878406524,0.488999992609024,0.4990000128746032,0.4979999959468841,0.4979999959468841,0.5009999871253967,0.5,0.5090000033378601,0.5070000290870667,0.5180000066757202,0.5199999809265137,0.5109999775886536,0.5130000114440918,0.5249999761581421,0.5149999856948853,0.5299999713897705,0.5339999794960022,0.5189999938011169,0.5289999842643738,0.5249999761581421,0.5320000052452087,0.5460000038146973,0.5419999957084656,0.5260000228881836,0.5289999842643738,0.546999990940094,0.5419999957084656,0.5419999957084656,0.5460000038146973,0.5419999957084656,0.5389999747276306,0.5440000295639038,0.5569999814033508,0.5450000166893005,0.5329999923706055,0.5580000281333923,0.5339999794960022,0.5540000200271606,0.5460000038146973,0.5479999780654907,0.5529999732971191,0.5540000200271606,0.5619999766349792,0.5490000247955322,0.5410000085830688,0.5490000247955322,0.5569999814033508,0.550000011920929,0.5479999780654907,0.5630000233650208,0.546999990940094,0.5559999942779541,0.5600000023841858,0.5509999990463257,0.5569999814033508,0.5569999814033508,0.5580000281333923,0.5619999766349792,0.5580000281333923,0.5669999718666077,0.5569999814033508,0.5709999799728394,0.5529999732971191,0.5649999976158142,0.5659999847412109,0.5659999847412109,0.5690000057220459,0.5600000023841858,0.5580000281333923,0.5540000200271606,0.5640000104904175,0.5680000185966492,0.5709999799728394,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5640000104904175,0.5799999833106995,0.5699999928474426,0.5669999718666077,0.5680000185966492,0.5770000219345093,0.5709999799728394,0.5759999752044678,0.5690000057220459,0.5789999961853027,0.5740000009536743,0.5709999799728394,0.5789999961853027,0.5709999799728394,0.5770000219345093,0.5770000219345093,0.5730000138282776,0.5809999704360962,0.5720000267028809,0.5849999785423279,0.5820000171661377,0.5799999833106995,0.5830000042915344,0.5759999752044678,0.5730000138282776,0.5799999833106995,0.5830000042915344,0.5860000252723694,0.5789999961853027,0.5789999961853027,0.5860000252723694,0.5979999899864197,0.5920000076293945,0.5820000171661377,0.5870000123977661,0.5889999866485596,0.5839999914169312,0.5849999785423279,0.5899999737739563,0.5920000076293945,0.593999981880188,0.597000002861023,0.5889999866485596,0.5889999866485596,0.5849999785423279,0.5899999737739563,0.5989999771118164,0.5899999737739563,0.5839999914169312,0.5910000205039978,0.5910000205039978,0.5929999947547913,0.5920000076293945,0.5929999947547913,0.5889999866485596,0.5899999737739563,0.593999981880188,0.5910000205039978,0.5960000157356262,0.5920000076293945,0.5889999866485596,0.593999981880188,0.5879999995231628,0.5960000157356262,0.5920000076293945,0.5960000157356262,0.5960000157356262,0.5920000076293945,0.6010000109672546,0.5920000076293945,0.5899999737739563,0.5889999866485596,0.5920000076293945,0.6019999980926514],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3009999990463257,0.3149999976158142,0.3400000035762787,0.3610000014305115,0.3680000007152557,0.3799999952316284,0.4020000100135803,0.4180000126361847,0.4129999876022339,0.4259999990463257,0.4239999949932098,0.4440000057220459,0.44200000166893,0.4440000057220459,0.4580000042915344,0.4510000050067901,0.4560000002384186,0.4650000035762787,0.4569999873638153,0.460999995470047,0.4659999907016754,0.4679999947547912,0.4779999852180481,0.4740000069141388,0.4600000083446502,0.4860000014305115,0.4790000021457672,0.4880000054836273,0.4930000007152557,0.4860000014305115,0.4850000143051147,0.4900000095367431,0.4850000143051147,0.4900000095367431,0.4959999918937683,0.492000013589859,0.4850000143051147,0.4970000088214874,0.4900000095367431,0.4979999959468841,0.503000020980835,0.5040000081062317,0.4990000128746032,0.4979999959468841,0.5080000162124634,0.5019999742507935,0.4970000088214874,0.4939999878406524,0.5120000243186951,0.5070000290870667,0.503000020980835,0.5070000290870667,0.503000020980835,0.5109999775886536,0.5080000162124634,0.5009999871253967,0.5090000033378601,0.5,0.5149999856948853,0.5109999775886536,0.5099999904632568,0.5130000114440918,0.5080000162124634,0.5080000162124634,0.5109999775886536,0.5099999904632568,0.5239999890327454,0.5180000066757202,0.5130000114440918,0.5120000243186951,0.5180000066757202,0.515999972820282,0.5260000228881836,0.5199999809265137,0.5239999890327454,0.5220000147819519,0.527999997138977,0.5249999761581421,0.5270000100135803,0.5249999761581421,0.5189999938011169,0.5230000019073486,0.5249999761581421,0.5199999809265137,0.5230000019073486,0.5299999713897705,0.5350000262260437,0.5339999794960022,0.5329999923706055,0.5249999761581421,0.5299999713897705,0.5360000133514404,0.5329999923706055,0.5410000085830688,0.5249999761581421,0.5289999842643738,0.5360000133514404,0.5360000133514404,0.5370000004768372,0.5389999747276306,0.5289999842643738,0.5299999713897705,0.5410000085830688,0.5329999923706055,0.5419999957084656,0.5410000085830688,0.527999997138977,0.5370000004768372,0.5429999828338623,0.5419999957084656,0.5389999747276306,0.5320000052452087,0.5350000262260437,0.5419999957084656,0.5410000085830688,0.5339999794960022,0.5440000295639038,0.5329999923706055,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5429999828338623,0.5479999780654907,0.550000011920929,0.5490000247955322,0.5410000085830688,0.5450000166893005,0.5429999828338623,0.550000011920929,0.5529999732971191,0.5490000247955322,0.5450000166893005,0.5450000166893005,0.5519999861717224,0.5569999814033508,0.5460000038146973,0.546999990940094,0.5509999990463257,0.5509999990463257,0.5450000166893005,0.5440000295639038,0.5440000295639038,0.546999990940094,0.5479999780654907,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5440000295639038,0.5410000085830688,0.5440000295639038,0.5389999747276306,0.5410000085830688,0.546999990940094,0.546999990940094,0.5479999780654907,0.546999990940094,0.550000011920929,0.546999990940094,0.5460000038146973,0.546999990940094,0.5479999780654907,0.5479999780654907,0.5519999861717224,0.550000011920929],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.2809999883174896,0.3230000138282776,0.3409999907016754,0.3600000143051147,0.3569999933242798,0.3889999985694885,0.395000010728836,0.4199999868869781,0.4180000126361847,0.421999990940094,0.4289999902248382,0.4350000023841858,0.4359999895095825,0.4469999969005584,0.4350000023841858,0.4480000138282776,0.4480000138282776,0.453000009059906,0.4550000131130218,0.4589999914169311,0.4639999866485595,0.4600000083446502,0.460999995470047,0.4589999914169311,0.481000006198883,0.4769999980926513,0.4709999859333038,0.4740000069141388,0.4679999947547912,0.4790000021457672,0.4729999899864197,0.4819999933242798,0.4850000143051147,0.4819999933242798,0.4819999933242798,0.4880000054836273,0.4869999885559082,0.4959999918937683,0.4850000143051147,0.4959999918937683,0.492000013589859,0.503000020980835,0.4930000007152557,0.5099999904632568,0.5040000081062317,0.5009999871253967,0.4970000088214874,0.4979999959468841,0.5059999823570251,0.5070000290870667,0.5040000081062317,0.5059999823570251,0.5049999952316284,0.5080000162124634,0.5049999952316284,0.5019999742507935,0.5120000243186951,0.5170000195503235,0.5170000195503235,0.5090000033378601,0.5239999890327454,0.527999997138977,0.5230000019073486,0.5210000276565552,0.5149999856948853,0.5189999938011169,0.5270000100135803,0.5149999856948853,0.5099999904632568,0.5299999713897705,0.5199999809265137,0.5230000019073486,0.5260000228881836,0.5249999761581421,0.5239999890327454,0.5329999923706055,0.5210000276565552,0.5260000228881836,0.5170000195503235,0.531000018119812,0.5289999842643738,0.531000018119812,0.5270000100135803,0.5299999713897705,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5329999923706055,0.5360000133514404,0.5299999713897705,0.5360000133514404,0.5270000100135803,0.5450000166893005,0.5410000085830688,0.546999990940094,0.5329999923706055,0.5329999923706055,0.5379999876022339,0.5299999713897705,0.5429999828338623,0.5360000133514404,0.5339999794960022,0.5419999957084656,0.5410000085830688,0.5370000004768372,0.5389999747276306,0.527999997138977,0.5400000214576721,0.5400000214576721,0.531000018119812,0.5440000295639038,0.5460000038146973,0.5479999780654907,0.5460000038146973,0.5410000085830688,0.5509999990463257,0.5479999780654907,0.5410000085830688,0.5389999747276306,0.550000011920929,0.5569999814033508,0.550000011920929,0.5490000247955322,0.5490000247955322,0.5569999814033508,0.5519999861717224,0.5479999780654907,0.5559999942779541,0.5550000071525574,0.5460000038146973,0.5540000200271606,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5460000038146973,0.5550000071525574,0.5479999780654907,0.5479999780654907,0.5540000200271606,0.5550000071525574,0.5529999732971191,0.5529999732971191,0.5509999990463257,0.5509999990463257,0.5419999957084656,0.546999990940094,0.5509999990463257,0.5559999942779541,0.5490000247955322,0.5509999990463257,0.5529999732971191,0.550000011920929,0.5540000200271606,0.5550000071525574,0.5580000281333923,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5519999861717224,0.5519999861717224,0.5559999942779541,0.5569999814033508,0.5559999942779541,0.5550000071525574,0.5559999942779541,0.5490000247955322,0.5550000071525574,0.5600000023841858],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.257999986410141,0.3019999861717224,0.3059999942779541,0.335999995470047,0.3610000014305115,0.3819999992847442,0.4009999930858612,0.4020000100135803,0.4250000119209289,0.4309999942779541,0.4469999969005584,0.4519999921321869,0.453000009059906,0.4580000042915344,0.4729999899864197,0.4749999940395355,0.4699999988079071,0.4799999892711639,0.4749999940395355,0.4769999980926513,0.481000006198883,0.4839999973773956,0.4959999918937683,0.5040000081062317,0.4970000088214874,0.4979999959468841,0.5070000290870667,0.5049999952316284,0.5109999775886536,0.515999972820282,0.5120000243186951,0.5120000243186951,0.515999972820282,0.5120000243186951,0.5249999761581421,0.5170000195503235,0.5199999809265137,0.5270000100135803,0.5170000195503235,0.5220000147819519,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5370000004768372,0.5339999794960022,0.5329999923706055,0.531000018119812,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.5389999747276306,0.5419999957084656,0.5429999828338623,0.5360000133514404,0.5299999713897705,0.546999990940094,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5350000262260437,0.5339999794960022,0.5419999957084656,0.5450000166893005,0.5460000038146973,0.5370000004768372,0.5490000247955322,0.5440000295639038,0.550000011920929,0.5490000247955322,0.5450000166893005,0.5490000247955322,0.5559999942779541,0.5559999942779541,0.5410000085830688,0.5419999957084656,0.5529999732971191,0.5460000038146973,0.5540000200271606,0.5379999876022339,0.5509999990463257,0.5540000200271606,0.5419999957084656,0.546999990940094,0.5479999780654907,0.5460000038146973,0.5460000038146973,0.5519999861717224,0.5600000023841858,0.5540000200271606,0.5509999990463257,0.5609999895095825,0.5619999766349792,0.5590000152587891,0.5559999942779541,0.5580000281333923,0.5640000104904175,0.5649999976158142,0.5590000152587891,0.5550000071525574,0.5630000233650208,0.5630000233650208,0.5609999895095825,0.5559999942779541,0.5609999895095825,0.5630000233650208,0.5680000185966492,0.5630000233650208,0.5690000057220459,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5690000057220459,0.5640000104904175,0.5630000233650208,0.574999988079071,0.5630000233650208,0.5619999766349792,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5590000152587891,0.5600000023841858,0.5619999766349792,0.5799999833106995,0.5619999766349792,0.5699999928474426,0.5709999799728394,0.5669999718666077,0.5680000185966492,0.5609999895095825,0.5649999976158142,0.5680000185966492,0.5730000138282776,0.5720000267028809,0.5709999799728394,0.5770000219345093,0.574999988079071,0.5730000138282776,0.5690000057220459,0.5740000009536743,0.578000009059906,0.574999988079071,0.5820000171661377,0.5730000138282776,0.5740000009536743,0.574999988079071,0.5770000219345093,0.5789999961853027,0.5759999752044678,0.5720000267028809,0.5770000219345093,0.5759999752044678,0.5789999961853027,0.5789999961853027,0.5730000138282776,0.5789999961853027,0.5759999752044678,0.5690000057220459,0.5849999785423279,0.5759999752044678,0.5699999928474426,0.5789999961853027,0.5820000171661377,0.5730000138282776,0.5730000138282776,0.5789999961853027],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":30,"default":5}}}
data/plots/cross_ind_unfiltered_comparison/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2528519630432129,0.2616856694221496,0.2665999829769134,0.2683407664299011,0.2742894291877746,0.2762066125869751,0.2807516455650329,0.2767378389835357,0.2807380557060241,0.2788906991481781,0.2844051718711853,0.2856102883815765,0.2883394360542297,0.2875711619853973,0.2890409529209137,0.2894668281078338,0.2883355319499969,0.2872501015663147,0.291619062423706,0.2900333702564239,0.2962473034858703,0.2962896525859833,0.297355443239212,0.2932226359844208,0.2886744439601898,0.29665008187294,0.2976542115211487,0.2991503179073334,0.3004479110240936,0.3044549524784088,0.2976194322109222,0.3014707863330841,0.3048252463340759,0.3039425611495971,0.303354948759079,0.3027459383010864,0.2999922931194305,0.3050121665000915,0.2998814284801483,0.2978588044643402,0.3041949570178985,0.3010904192924499,0.3022017180919647,0.2997751235961914,0.3015910983085632,0.3096485137939453,0.3012076020240783,0.3065535724163055,0.3042872548103332,0.3104783594608307,0.2997980415821075,0.3051296770572662,0.303458571434021,0.3088337182998657,0.3145398199558258,0.3032208085060119,0.310806930065155,0.3075874149799347,0.3101692199707031,0.310107946395874,0.3066047430038452,0.3109066784381866,0.3081336915493011,0.3084586262702942,0.3086149394512176,0.3085348606109619,0.3136637806892395,0.3110873103141784,0.31076380610466,0.3084572553634643,0.3133681714534759,0.3125792145729065,0.3124453127384186,0.3097185790538788,0.3106793165206909,0.3089564740657806,0.3111244142055511,0.3123694658279419,0.3144859969615936,0.3135123550891876,0.311982125043869,0.3142133951187134,0.3122903704643249,0.3147654831409454,0.3078767359256744,0.314947634935379,0.3171303570270538,0.3129573762416839,0.3154936134815216,0.3158208429813385,0.3153132200241089,0.3141326904296875,0.3163397014141083,0.3166318237781524,0.3168410360813141,0.3198235332965851,0.3201336860656738,0.3212967813014984,0.3191385567188263,0.3178017139434814,0.3192791938781738,0.323061466217041,0.320336639881134,0.3165886104106903,0.3206393420696258,0.3167395293712616,0.3135207295417785,0.315539002418518,0.3191742599010467,0.321073055267334,0.3222262561321258,0.3193058371543884,0.3213480710983276,0.3198905289173126,0.3219239711761474,0.3211614489555359,0.318855881690979,0.3177095353603363,0.324197381734848,0.3208906352519989,0.3264936804771423,0.3245965242385864,0.3231639564037323,0.3221887946128845,0.3277338445186615,0.3227696120738983,0.3263820111751556,0.3258577883243561,0.3264622390270233,0.3222362995147705,0.3286814987659454,0.3235024213790893,0.32446950674057,0.3311836123466491,0.328130304813385,0.3271634578704834,0.3250012993812561,0.3309800624847412,0.3274554014205932,0.3273015916347503,0.3261759579181671,0.32697594165802,0.3303172886371612,0.3282814025878906,0.3289586305618286,0.3260826468467712,0.3258011937141418,0.3297208249568939,0.3254813551902771,0.3287739753723144,0.3287097811698913,0.3275279700756073,0.3293041586875915,0.3314100801944732,0.3287808299064636,0.3251930773258209,0.3288172781467438,0.3265027701854706,0.3275215625762939,0.3290774822235107,0.3261331617832184,0.3299777805805206,0.331955999135971,0.3305029273033142,0.3274719417095184,0.3235560953617096,0.3269940316677093,0.3323083519935608],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2510619163513183,0.2621481418609619,0.2632303833961487,0.2720474302768707,0.2719806432723999,0.2726832032203674,0.2786827087402344,0.2823672890663147,0.276201844215393,0.2816944718360901,0.280361145734787,0.2819306254386902,0.2823295891284942,0.2892518043518066,0.2872919738292694,0.2859259247779846,0.2885263860225677,0.2862614393234253,0.2933129370212555,0.2930494546890259,0.2884900867938995,0.2942298054695129,0.2927677929401397,0.2954220175743103,0.2918704748153686,0.2943699061870575,0.2891678512096405,0.291848212480545,0.2942944765090942,0.2973679602146148,0.2953736186027527,0.2963412702083587,0.297100305557251,0.2963026762008667,0.2944463491439819,0.2971296310424804,0.293870210647583,0.2982682287693023,0.2978119254112243,0.2989997565746307,0.2993503510951996,0.298117071390152,0.2977498769760132,0.3004056811332702,0.3012634217739105,0.3001384139060974,0.3052266240119934,0.3038219809532165,0.3037647306919098,0.3009455502033233,0.3038812279701233,0.303263396024704,0.3025077581405639,0.3056069612503052,0.3024908602237701,0.3050909340381622,0.3001562356948852,0.303833544254303,0.3019777834415436,0.3036664128303528,0.3022894859313965,0.3042722940444946,0.3023003339767456,0.3069425821304321,0.307883083820343,0.3026910126209259,0.3054113090038299,0.3046148121356964,0.305342435836792,0.3048149049282074,0.3066973984241485,0.3055126965045929,0.3063409924507141,0.307701051235199,0.3075169324874878,0.3091190159320831,0.3098153173923492,0.31436288356781,0.3096509575843811,0.3022815883159637,0.3119745552539825,0.3083471357822418,0.3085280954837799,0.3082001209259033,0.3080264329910278,0.3116717934608459,0.3097788393497467,0.3117353916168213,0.3170038759708404,0.3099159002304077,0.3133728504180908,0.3161626160144806,0.3095119595527649,0.3135432302951813,0.3103009164333343,0.3126655519008636,0.3121814131736755,0.3123973608016968,0.3148256838321686,0.3144133985042572,0.3124284744262695,0.3102188408374786,0.3123636841773987,0.3115113973617553,0.3151636719703674,0.3148572146892547,0.315061867237091,0.3127182424068451,0.3139308094978332,0.3134367167949676,0.3136025071144104,0.3172793388366699,0.3134761154651642,0.3109587132930755,0.3127998411655426,0.3161843717098236,0.3163313865661621,0.3145243525505066,0.3155156075954437,0.3127505779266357,0.3182451128959656,0.3162476718425751,0.3124897480010986,0.3128789663314819,0.3119811117649078,0.314126193523407,0.3136049509048462,0.3149912655353546,0.3146650791168213,0.3151968121528625,0.3179666996002197,0.3169245719909668,0.3202513754367828,0.3185319602489471,0.3202781081199646,0.3186031281948089,0.3166128396987915,0.3199457228183746,0.3194417059421539,0.3170624077320099,0.3184532523155212,0.3191981911659241,0.3191225528717041,0.3173209130764007,0.3195607960224151,0.3166368305683136,0.3188160359859466,0.3174867630004883,0.3184468746185303,0.3211863338947296,0.3184327483177185,0.3177861273288727,0.3180214762687683,0.3194973170757293,0.3212297558784485,0.3211282789707184,0.3200584352016449,0.3168685734272003,0.3211040198802948,0.3222841620445251,0.3196901082992553,0.3236229419708252,0.3204475045204162,0.3210069537162781,0.3191083669662475,0.31863734126091,0.3195922076702118],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2516599297523498,0.2610189318656921,0.2666046619415283,0.2667981088161468,0.2667821645736694,0.2708088159561157,0.2738403379917145,0.2726235687732696,0.2762763500213623,0.2768311202526092,0.2809228301048279,0.2836140990257263,0.2822815179824829,0.2831664383411407,0.2797218561172485,0.286342591047287,0.2855269610881805,0.2847287058830261,0.2888180613517761,0.286526083946228,0.2865165770053863,0.294582188129425,0.2925947606563568,0.2947863042354584,0.2892930805683136,0.2903610467910766,0.288201242685318,0.2873396277427673,0.2916238009929657,0.2908017039299011,0.2907920777797699,0.2952797412872314,0.2941452264785766,0.2921333611011505,0.2925891280174255,0.2968584895133972,0.2980035543441772,0.2964116632938385,0.2962304651737213,0.2950254380702972,0.2977516651153564,0.2944138348102569,0.3003402054309845,0.2976303696632385,0.3013098239898681,0.302829384803772,0.3018766045570373,0.305361807346344,0.2971298694610595,0.3014816343784332,0.3019805550575256,0.3037064969539642,0.2970167994499206,0.2995208501815796,0.2970106601715088,0.2990955114364624,0.3027818500995636,0.3048534691333771,0.2993872463703155,0.2986327707767486,0.3015393316745758,0.3003426790237427,0.3003274798393249,0.3017795085906982,0.3019182682037353,0.3015450537204742,0.3046211004257202,0.3031167984008789,0.3020436763763428,0.3011128306388855,0.3029948472976684,0.3045558631420135,0.301642894744873,0.3029441833496094,0.3035804331302643,0.3004390001296997,0.3021787703037262,0.306041270494461,0.3064048886299133,0.3087956011295318,0.3070018291473388,0.3065581619739532,0.3093871772289276,0.3060930073261261,0.3033313155174255,0.3072777390480041,0.306413859128952,0.3104493916034698,0.3056999444961548,0.3077532052993774,0.309231549501419,0.3070645034313202,0.3117790520191192,0.3114112913608551,0.312661737203598,0.3181777000427246,0.3117201030254364,0.3099702894687652,0.3074746131896972,0.3064963519573211,0.3105958700180053,0.3111456036567688,0.3084964454174042,0.3087405860424042,0.3121673166751861,0.3121528625488281,0.3100416660308838,0.3142979145050049,0.3129935264587402,0.3112611472606659,0.3119436800479889,0.3154115974903106,0.3091593086719513,0.3103814721107483,0.3130497634410858,0.3133455514907837,0.3152708411216736,0.3137963414192199,0.3099324703216553,0.3164172768592834,0.3133907914161682,0.3128255009651184,0.3134104907512665,0.3106969892978668,0.3130004107952118,0.3131391704082489,0.3130116462707519,0.3143952488899231,0.3143975436687469,0.3143710494041443,0.3163396418094635,0.3166862726211548,0.3184126019477844,0.3178988993167877,0.317479133605957,0.3184944093227386,0.316694974899292,0.3176258206367492,0.3182629346847534,0.3200214207172394,0.3181648552417755,0.320680022239685,0.3178716897964477,0.3182425796985626,0.3182984292507171,0.3158398568630218,0.3152642548084259,0.3132680356502533,0.3178914785385132,0.3156660795211792,0.3161703050136566,0.3176451921463012,0.3173815906047821,0.3194171786308288,0.3193057179450989,0.3172560334205627,0.317656546831131,0.3155770003795624,0.3199106156826019,0.3170182108879089,0.3156754970550537,0.3180731236934662,0.3205638229846954,0.3175432682037353,0.3184471428394317,0.3192788958549499,0.3197042346000671,0.3177168369293213],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2501466572284698,0.2558934390544891,0.2618628144264221,0.2683217823505401,0.2699837982654571,0.2738722860813141,0.2744417488574981,0.2740873992443084,0.2807216048240661,0.2820421457290649,0.2891400754451751,0.2879075407981872,0.2881667613983154,0.2892490327358246,0.2882707118988037,0.2935869693756103,0.2870290875434875,0.2911452651023865,0.2949125170707702,0.2916406095027923,0.2981449663639068,0.2953989207744598,0.2946988642215729,0.297021746635437,0.3001497685909271,0.3010218441486358,0.2977036237716675,0.2992585003376007,0.2986803948879242,0.2994338274002075,0.2989781498908996,0.3041955828666687,0.3030496537685394,0.303806334733963,0.3036351203918457,0.3058845102787018,0.300450712442398,0.3025284707546234,0.3072526752948761,0.3039065897464752,0.3073755502700805,0.3070493042469024,0.3083153367042541,0.3123056292533874,0.307761400938034,0.3053378164768219,0.3116358816623688,0.3080427348613739,0.308482676744461,0.307318776845932,0.3083004653453827,0.3089516758918762,0.3088736236095428,0.3077724277973175,0.3126304149627685,0.3101697862148285,0.3159398734569549,0.314792275428772,0.3103811144828796,0.3111368715763092,0.3129658997058868,0.311605304479599,0.3118223249912262,0.3133279979228973,0.3146496713161468,0.3195074200630188,0.3142614662647247,0.3125102519989013,0.3115333616733551,0.3183117806911468,0.3168580532073974,0.3187012672424316,0.3179306983947754,0.3157722651958465,0.3214826583862304,0.3145081698894501,0.3172421753406524,0.3151432573795318,0.3181649446487427,0.3180212080478668,0.3171605765819549,0.3212067782878876,0.3180184066295624,0.3209905624389648,0.319052129983902,0.3212707936763763,0.3196887373924255,0.3188316226005554,0.3164899051189422,0.3241994678974151,0.3179469406604767,0.3214083909988403,0.3206575512886047,0.3263285160064697,0.3219505250453949,0.3181525468826294,0.3219776451587677,0.3259726762771606,0.3197665512561798,0.3236161768436432,0.3177970349788666,0.3258080780506134,0.3208407461643219,0.3251138925552368,0.3242645859718323,0.3229723274707794,0.3227455914020538,0.3206316232681274,0.3256695866584778,0.3241210877895355,0.3224890530109405,0.3263737261295318,0.3214233517646789,0.3240345120429992,0.3222567737102508,0.3242291808128357,0.3257078528404236,0.3278365731239319,0.3277338743209839,0.3253948092460632,0.3232105076313019,0.3267974853515625,0.3263654410839081,0.3262891769409179,0.3238334357738495,0.3294911682605743,0.3261866867542267,0.3243315815925598,0.3250119090080261,0.326727420091629,0.3268802464008331,0.3269768059253692,0.3257980346679687,0.3280686736106872,0.3274897634983063,0.3282252252101898,0.3272863030433655,0.328346699476242,0.325562834739685,0.3301684856414795,0.3284023404121399,0.3268299400806427,0.3286610245704651,0.3291078805923462,0.324972927570343,0.3314772248268127,0.3278062343597412,0.326839417219162,0.3277239501476288,0.330414742231369,0.3271744549274444,0.3279334008693695,0.3288575112819671,0.3285425007343292,0.3282454907894134,0.3296376466751098,0.3305942714214325,0.3276287615299225,0.3292438983917236,0.329515129327774,0.3281475007534027,0.3282177448272705,0.3333999514579773,0.3302631080150604,0.330238401889801,0.3323166668415069,0.3313035368919372,0.32961106300354,0.3321967124938965],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2840000092983246,0.3059999942779541,0.3059999942779541,0.2980000078678131,0.3240000009536743,0.3100000023841858,0.3000000119209289,0.3160000145435333,0.3140000104904175,0.3260000050067901,0.3199999928474426,0.2980000078678131,0.3179999887943268,0.3179999887943268,0.3319999873638153,0.3019999861717224,0.2939999997615814,0.3319999873638153,0.3319999873638153,0.3219999969005584,0.3379999995231628,0.3379999995231628,0.3339999914169311,0.3240000009536743,0.3479999899864197,0.3300000131130218,0.3240000009536743,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3319999873638153,0.3379999995231628,0.356000006198883,0.3339999914169311,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.3479999899864197,0.3339999914169311,0.3400000035762787,0.3479999899864197,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3499999940395355,0.3420000076293945,0.3659999966621399,0.3400000035762787,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3400000035762787,0.356000006198883,0.3339999914169311,0.3339999914169311,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3339999914169311,0.3440000116825104,0.3400000035762787,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3499999940395355,0.3420000076293945,0.3379999995231628,0.335999995470047,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3519999980926513,0.3400000035762787,0.3440000116825104,0.356000006198883,0.3400000035762787,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3479999899864197,0.3379999995231628,0.3440000116825104,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3499999940395355,0.3600000143051147,0.3440000116825104,0.3499999940395355,0.356000006198883,0.3420000076293945,0.3479999899864197,0.3379999995231628,0.3379999995231628,0.3459999859333038,0.356000006198883,0.328000009059906,0.3459999859333038,0.3519999980926513,0.3499999940395355,0.3519999980926513,0.3420000076293945,0.3499999940395355,0.3420000076293945,0.3339999914169311,0.335999995470047,0.3379999995231628,0.3379999995231628,0.3540000021457672,0.356000006198883,0.356000006198883,0.335999995470047,0.363999992609024,0.363999992609024,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3540000021457672,0.3459999859333038,0.3479999899864197,0.3519999980926513,0.3519999980926513,0.3420000076293945,0.3440000116825104,0.3379999995231628,0.3519999980926513,0.356000006198883,0.3420000076293945,0.3580000102519989,0.3499999940395355,0.3619999885559082,0.3519999980926513,0.3600000143051147,0.3459999859333038,0.3519999980926513,0.3519999980926513,0.3499999940395355,0.3580000102519989,0.356000006198883,0.3580000102519989,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3440000116825104,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3600000143051147,0.3580000102519989,0.3540000021457672,0.3519999980926513,0.3459999859333038,0.3459999859333038,0.3540000021457672,0.335999995470047,0.3540000021457672,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3499999940395355,0.356000006198883],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2460000067949295,0.2720000147819519,0.270000010728836,0.2939999997615814,0.2960000038146972,0.3240000009536743,0.3019999861717224,0.2879999876022339,0.3179999887943268,0.3059999942779541,0.2899999916553497,0.3100000023841858,0.3179999887943268,0.3219999969005584,0.3219999969005584,0.3300000131130218,0.3140000104904175,0.3240000009536743,0.3079999983310699,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3179999887943268,0.3260000050067901,0.3260000050067901,0.3240000009536743,0.3379999995231628,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3339999914169311,0.328000009059906,0.3319999873638153,0.3199999928474426,0.3000000119209289,0.3260000050067901,0.3240000009536743,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3440000116825104,0.3199999928474426,0.3319999873638153,0.3219999969005584,0.335999995470047,0.3519999980926513,0.3379999995231628,0.328000009059906,0.3300000131130218,0.335999995470047,0.3479999899864197,0.3459999859333038,0.3479999899864197,0.3540000021457672,0.3479999899864197,0.3300000131130218,0.356000006198883,0.3479999899864197,0.356000006198883,0.335999995470047,0.335999995470047,0.3479999899864197,0.3339999914169311,0.3540000021457672,0.3300000131130218,0.3479999899864197,0.3499999940395355,0.3400000035762787,0.3459999859333038,0.3339999914169311,0.3479999899864197,0.335999995470047,0.3400000035762787,0.3179999887943268,0.335999995470047,0.328000009059906,0.328000009059906,0.3540000021457672,0.3479999899864197,0.3420000076293945,0.3580000102519989,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3440000116825104,0.3499999940395355,0.335999995470047,0.3540000021457672,0.356000006198883,0.3400000035762787,0.3600000143051147,0.3580000102519989,0.3519999980926513,0.3499999940395355,0.3540000021457672,0.3519999980926513,0.3499999940395355,0.3440000116825104,0.356000006198883,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3499999940395355,0.3440000116825104,0.3519999980926513,0.3440000116825104,0.356000006198883,0.3459999859333038,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3420000076293945,0.3379999995231628,0.3479999899864197,0.3459999859333038,0.3499999940395355,0.3400000035762787,0.3440000116825104,0.3420000076293945,0.3420000076293945,0.3499999940395355,0.3459999859333038,0.3420000076293945,0.3459999859333038,0.3459999859333038,0.3479999899864197,0.3440000116825104,0.3720000088214874,0.3619999885559082,0.356000006198883,0.3519999980926513,0.3459999859333038,0.3440000116825104,0.3420000076293945,0.3580000102519989,0.3600000143051147,0.3519999980926513,0.3600000143051147,0.3440000116825104,0.3600000143051147,0.3619999885559082,0.3499999940395355,0.3499999940395355,0.363999992609024,0.3580000102519989,0.3499999940395355,0.3479999899864197,0.3479999899864197,0.3580000102519989,0.3540000021457672,0.3600000143051147,0.3420000076293945,0.3519999980926513,0.3440000116825104,0.3519999980926513,0.3540000021457672,0.356000006198883,0.3459999859333038,0.3499999940395355,0.3519999980926513,0.3580000102519989,0.3440000116825104,0.3499999940395355,0.3580000102519989,0.3479999899864197,0.3479999899864197],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2560000121593475,0.2720000147819519,0.2980000078678131,0.2840000092983246,0.2879999876022339,0.3039999902248382,0.2860000133514404,0.2899999916553497,0.3019999861717224,0.2960000038146972,0.3039999902248382,0.3100000023841858,0.3160000145435333,0.3260000050067901,0.3160000145435333,0.3260000050067901,0.3179999887943268,0.3420000076293945,0.3219999969005584,0.328000009059906,0.3240000009536743,0.3300000131130218,0.328000009059906,0.3199999928474426,0.3379999995231628,0.3400000035762787,0.3240000009536743,0.3120000064373016,0.3319999873638153,0.3260000050067901,0.3120000064373016,0.3160000145435333,0.3140000104904175,0.3179999887943268,0.3160000145435333,0.3199999928474426,0.3240000009536743,0.3260000050067901,0.3179999887943268,0.3300000131130218,0.3179999887943268,0.328000009059906,0.3240000009536743,0.328000009059906,0.3260000050067901,0.3199999928474426,0.3400000035762787,0.3339999914169311,0.328000009059906,0.328000009059906,0.3339999914169311,0.328000009059906,0.328000009059906,0.335999995470047,0.3580000102519989,0.3499999940395355,0.3260000050067901,0.3499999940395355,0.3420000076293945,0.3160000145435333,0.3339999914169311,0.335999995470047,0.3400000035762787,0.3240000009536743,0.3319999873638153,0.3379999995231628,0.3400000035762787,0.3379999995231628,0.3319999873638153,0.3319999873638153,0.3440000116825104,0.3300000131130218,0.3219999969005584,0.3260000050067901,0.3219999969005584,0.3339999914169311,0.328000009059906,0.3300000131130218,0.3219999969005584,0.3379999995231628,0.3400000035762787,0.3319999873638153,0.328000009059906,0.3440000116825104,0.3339999914169311,0.328000009059906,0.3379999995231628,0.3499999940395355,0.3339999914169311,0.3300000131130218,0.328000009059906,0.335999995470047,0.3240000009536743,0.335999995470047,0.3240000009536743,0.3400000035762787,0.3400000035762787,0.3420000076293945,0.3319999873638153,0.3339999914169311,0.3300000131130218,0.3400000035762787,0.3459999859333038,0.3400000035762787,0.3379999995231628,0.3459999859333038,0.3379999995231628,0.3300000131130218,0.3519999980926513,0.3379999995231628,0.356000006198883,0.335999995470047,0.3420000076293945,0.3400000035762787,0.328000009059906,0.3540000021457672,0.3499999940395355,0.3479999899864197,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3540000021457672,0.3440000116825104,0.3499999940395355,0.356000006198883,0.356000006198883,0.356000006198883,0.363999992609024,0.3600000143051147,0.356000006198883,0.3479999899864197,0.356000006198883,0.3459999859333038,0.3479999899864197,0.3619999885559082,0.363999992609024,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.356000006198883,0.3519999980926513,0.3540000021457672,0.3619999885559082,0.3580000102519989,0.3540000021457672,0.356000006198883,0.3479999899864197,0.3519999980926513,0.356000006198883,0.3499999940395355,0.3379999995231628,0.3479999899864197,0.3499999940395355,0.3440000116825104,0.3580000102519989,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3540000021457672,0.3519999980926513,0.3540000021457672,0.356000006198883,0.363999992609024,0.356000006198883,0.356000006198883],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.2860000133514404,0.2660000026226043,0.277999997138977,0.2820000052452087,0.3079999983310699,0.3140000104904175,0.3260000050067901,0.3039999902248382,0.3319999873638153,0.3240000009536743,0.3199999928474426,0.3379999995231628,0.3339999914169311,0.3319999873638153,0.3179999887943268,0.3319999873638153,0.3219999969005584,0.3319999873638153,0.3379999995231628,0.3199999928474426,0.3179999887943268,0.3400000035762787,0.3219999969005584,0.335999995470047,0.3339999914169311,0.3420000076293945,0.3240000009536743,0.3440000116825104,0.3420000076293945,0.3379999995231628,0.3459999859333038,0.328000009059906,0.3420000076293945,0.3459999859333038,0.3479999899864197,0.3379999995231628,0.356000006198883,0.3379999995231628,0.3440000116825104,0.3400000035762787,0.3379999995231628,0.3499999940395355,0.3540000021457672,0.3479999899864197,0.3479999899864197,0.3440000116825104,0.3459999859333038,0.3440000116825104,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3379999995231628,0.356000006198883,0.3400000035762787,0.3519999980926513,0.3479999899864197,0.3479999899864197,0.3400000035762787,0.3459999859333038,0.3519999980926513,0.3440000116825104,0.3400000035762787,0.356000006198883,0.3420000076293945,0.356000006198883,0.3540000021457672,0.3600000143051147,0.3339999914169311,0.3499999940395355,0.3580000102519989,0.3440000116825104,0.3479999899864197,0.3580000102519989,0.3519999980926513,0.3339999914169311,0.3540000021457672,0.3459999859333038,0.3459999859333038,0.3400000035762787,0.356000006198883,0.356000006198883,0.3420000076293945,0.3420000076293945,0.3400000035762787,0.3479999899864197,0.3519999980926513,0.3319999873638153,0.3580000102519989,0.356000006198883,0.356000006198883,0.3499999940395355,0.3479999899864197,0.3400000035762787,0.3440000116825104,0.3339999914169311,0.3379999995231628,0.3479999899864197,0.3680000007152557,0.3619999885559082,0.3440000116825104,0.3619999885559082,0.3580000102519989,0.356000006198883,0.3600000143051147,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3540000021457672,0.3600000143051147,0.356000006198883,0.3540000021457672,0.3519999980926513,0.356000006198883,0.3600000143051147,0.3540000021457672,0.3540000021457672,0.363999992609024,0.3580000102519989,0.3680000007152557,0.3580000102519989,0.356000006198883,0.3519999980926513,0.3519999980926513,0.3519999980926513,0.3459999859333038,0.3499999940395355,0.356000006198883,0.3540000021457672,0.3540000021457672,0.3659999966621399,0.3619999885559082,0.3420000076293945,0.363999992609024,0.3580000102519989,0.3619999885559082,0.3759999871253967,0.3740000128746032,0.363999992609024,0.3580000102519989,0.3700000047683716,0.3700000047683716,0.363999992609024,0.3440000116825104,0.3580000102519989,0.3680000007152557,0.3700000047683716,0.3740000128746032,0.3619999885559082,0.3619999885559082,0.3700000047683716,0.363999992609024,0.363999992609024,0.363999992609024,0.3700000047683716,0.3600000143051147,0.3680000007152557,0.363999992609024,0.3659999966621399,0.363999992609024,0.3680000007152557,0.3580000102519989,0.363999992609024,0.3659999966621399,0.363999992609024,0.3580000102519989,0.3600000143051147,0.3600000143051147,0.3580000102519989,0.3600000143051147],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6019999980926514,0.652999997138977,0.6710000038146973,0.6740000247955322,0.6899999976158142,0.6919999718666077,0.6909999847412109,0.7070000171661377,0.7089999914169312,0.7129999995231628,0.7229999899864197,0.7120000123977661,0.7200000286102295,0.7300000190734863,0.7279999852180481,0.7369999885559082,0.7390000224113464,0.7350000143051147,0.7319999933242798,0.7279999852180481,0.7269999980926514,0.7459999918937683,0.7400000095367432,0.7390000224113464,0.7319999933242798,0.7390000224113464,0.7379999756813049,0.7390000224113464,0.7360000014305115,0.7440000176429749,0.7400000095367432,0.7360000014305115,0.7480000257492065,0.7360000014305115,0.7440000176429749,0.7459999918937683,0.7409999966621399,0.746999979019165,0.7440000176429749,0.7450000047683716,0.753000020980835,0.7390000224113464,0.7490000128746033,0.7419999837875366,0.7390000224113464,0.7559999823570251,0.7519999742507935,0.7549999952316284,0.7419999837875366,0.7490000128746033,0.7540000081062317,0.7480000257492065,0.7450000047683716,0.7429999709129333,0.7509999871253967,0.7549999952316284,0.7490000128746033,0.7490000128746033,0.7400000095367432,0.753000020980835,0.75,0.7509999871253967,0.7570000290870667,0.7590000033378601,0.7570000290870667,0.7329999804496765,0.7540000081062317,0.746999979019165,0.7409999966621399,0.7590000033378601,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7580000162124634,0.7639999985694885,0.7630000114440918,0.7590000033378601,0.7549999952316284,0.7480000257492065,0.7509999871253967,0.7570000290870667,0.75,0.7540000081062317,0.7480000257492065,0.7549999952316284,0.7559999823570251,0.7580000162124634,0.7580000162124634,0.753000020980835,0.7490000128746033,0.7540000081062317,0.7639999985694885,0.7580000162124634,0.7519999742507935,0.7590000033378601,0.75,0.7570000290870667,0.7620000243186951,0.7710000276565552,0.7739999890327454,0.7620000243186951,0.7549999952316284,0.7599999904632568,0.765999972820282,0.7680000066757202,0.7639999985694885,0.7540000081062317,0.7649999856948853,0.7649999856948853,0.7609999775886536,0.7549999952316284,0.765999972820282,0.7639999985694885,0.7580000162124634,0.7710000276565552,0.7570000290870667,0.7630000114440918,0.7580000162124634,0.7599999904632568,0.7649999856948853,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7559999823570251,0.7609999775886536,0.7620000243186951,0.7620000243186951,0.7609999775886536,0.753000020980835,0.7570000290870667,0.7620000243186951,0.7609999775886536,0.7609999775886536,0.7559999823570251,0.7540000081062317,0.7570000290870667,0.7639999985694885,0.7590000033378601,0.7680000066757202,0.7680000066757202,0.765999972820282,0.765999972820282,0.7670000195503235,0.7739999890327454,0.7649999856948853,0.7749999761581421,0.7699999809265137,0.7639999985694885,0.7680000066757202,0.7630000114440918,0.7680000066757202,0.7699999809265137,0.7739999890327454,0.7749999761581421,0.765999972820282,0.7680000066757202,0.7710000276565552,0.7680000066757202,0.765999972820282,0.7689999938011169,0.7760000228881836,0.7710000276565552,0.7680000066757202,0.7649999856948853,0.7720000147819519,0.7730000019073486],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6169999837875366,0.6359999775886536,0.6769999861717224,0.6769999861717224,0.6970000267028809,0.6990000009536743,0.6970000267028809,0.6959999799728394,0.7049999833106995,0.7089999914169312,0.7179999947547913,0.7099999785423279,0.7160000205039978,0.7260000109672546,0.7229999899864197,0.7179999947547913,0.7210000157356262,0.7200000286102295,0.734000027179718,0.7089999914169312,0.7229999899864197,0.7239999771118164,0.7310000061988831,0.7300000190734863,0.7260000109672546,0.7250000238418579,0.7239999771118164,0.7289999723434448,0.7390000224113464,0.7229999899864197,0.7310000061988831,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7289999723434448,0.7329999804496765,0.7300000190734863,0.7319999933242798,0.7440000176429749,0.746999979019165,0.7310000061988831,0.7329999804496765,0.7480000257492065,0.7429999709129333,0.7369999885559082,0.7269999980926514,0.7269999980926514,0.7379999756813049,0.75,0.7360000014305115,0.746999979019165,0.7409999966621399,0.7369999885559082,0.7459999918937683,0.7400000095367432,0.7409999966621399,0.746999979019165,0.7360000014305115,0.7459999918937683,0.7400000095367432,0.7429999709129333,0.7350000143051147,0.7390000224113464,0.7379999756813049,0.7480000257492065,0.7329999804496765,0.734000027179718,0.7390000224113464,0.7459999918937683,0.7360000014305115,0.7419999837875366,0.7429999709129333,0.7400000095367432,0.7379999756813049,0.7310000061988831,0.7360000014305115,0.7390000224113464,0.75,0.7369999885559082,0.7570000290870667,0.7409999966621399,0.7459999918937683,0.7350000143051147,0.7459999918937683,0.7509999871253967,0.7429999709129333,0.7419999837875366,0.7419999837875366,0.75,0.7440000176429749,0.7450000047683716,0.75,0.7409999966621399,0.7490000128746033,0.7409999966621399,0.7419999837875366,0.7429999709129333,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.75,0.753000020980835,0.75,0.746999979019165,0.7519999742507935,0.746999979019165,0.7570000290870667,0.7549999952316284,0.75,0.7540000081062317,0.7480000257492065,0.7490000128746033,0.7419999837875366,0.7419999837875366,0.746999979019165,0.746999979019165,0.75,0.7519999742507935,0.7580000162124634,0.7549999952316284,0.7490000128746033,0.7480000257492065,0.7519999742507935,0.7590000033378601,0.7450000047683716,0.75,0.7440000176429749,0.7419999837875366,0.7519999742507935,0.7450000047683716,0.753000020980835,0.7450000047683716,0.7440000176429749,0.7559999823570251,0.7509999871253967,0.7540000081062317,0.7440000176429749,0.7509999871253967,0.753000020980835,0.7490000128746033,0.7570000290870667,0.7490000128746033,0.746999979019165,0.746999979019165,0.7509999871253967,0.7509999871253967,0.7519999742507935,0.7570000290870667,0.7540000081062317,0.7440000176429749,0.7480000257492065,0.7509999871253967,0.7509999871253967,0.7509999871253967,0.7549999952316284,0.75,0.7559999823570251,0.746999979019165,0.7609999775886536,0.7549999952316284,0.746999979019165,0.7490000128746033,0.753000020980835,0.753000020980835,0.7609999775886536,0.746999979019165,0.7580000162124634],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.621999979019165,0.6439999938011169,0.6700000166893005,0.6790000200271606,0.6869999766349792,0.6959999799728394,0.6790000200271606,0.6880000233650208,0.7049999833106995,0.699999988079071,0.6990000009536743,0.6940000057220459,0.7110000252723694,0.7089999914169312,0.7120000123977661,0.7070000171661377,0.7070000171661377,0.6990000009536743,0.7009999752044678,0.7160000205039978,0.7200000286102295,0.7149999737739563,0.7250000238418579,0.7210000157356262,0.722000002861023,0.7310000061988831,0.7289999723434448,0.7319999933242798,0.7250000238418579,0.722000002861023,0.7210000157356262,0.7170000076293945,0.7260000109672546,0.7250000238418579,0.7210000157356262,0.7200000286102295,0.7379999756813049,0.7239999771118164,0.7239999771118164,0.7080000042915344,0.7289999723434448,0.7289999723434448,0.7300000190734863,0.7329999804496765,0.7319999933242798,0.7350000143051147,0.7390000224113464,0.7350000143051147,0.7289999723434448,0.734000027179718,0.7329999804496765,0.7400000095367432,0.7409999966621399,0.7310000061988831,0.7350000143051147,0.7360000014305115,0.7360000014305115,0.7409999966621399,0.7319999933242798,0.7409999966621399,0.7400000095367432,0.7390000224113464,0.7329999804496765,0.7459999918937683,0.753000020980835,0.746999979019165,0.734000027179718,0.7369999885559082,0.7419999837875366,0.734000027179718,0.7419999837875366,0.7289999723434448,0.7350000143051147,0.7300000190734863,0.7519999742507935,0.7390000224113464,0.7400000095367432,0.7409999966621399,0.7429999709129333,0.7450000047683716,0.7329999804496765,0.7260000109672546,0.7570000290870667,0.7360000014305115,0.7519999742507935,0.7419999837875366,0.7379999756813049,0.7390000224113464,0.7490000128746033,0.734000027179718,0.7360000014305115,0.7390000224113464,0.7440000176429749,0.7450000047683716,0.7319999933242798,0.7429999709129333,0.7519999742507935,0.7540000081062317,0.7519999742507935,0.753000020980835,0.7480000257492065,0.7440000176429749,0.7459999918937683,0.7369999885559082,0.7419999837875366,0.7480000257492065,0.7419999837875366,0.765999972820282,0.746999979019165,0.7459999918937683,0.7570000290870667,0.7390000224113464,0.7409999966621399,0.7459999918937683,0.75,0.7570000290870667,0.753000020980835,0.7549999952316284,0.7519999742507935,0.7490000128746033,0.746999979019165,0.7459999918937683,0.7459999918937683,0.746999979019165,0.7409999966621399,0.7419999837875366,0.7459999918937683,0.7440000176429749,0.7459999918937683,0.7490000128746033,0.7450000047683716,0.7409999966621399,0.7419999837875366,0.7490000128746033,0.7590000033378601,0.7549999952316284,0.7549999952316284,0.746999979019165,0.753000020980835,0.7549999952316284,0.746999979019165,0.7580000162124634,0.7490000128746033,0.753000020980835,0.75,0.75,0.7540000081062317,0.7540000081062317,0.7490000128746033,0.7570000290870667,0.7570000290870667,0.7590000033378601,0.7559999823570251,0.7620000243186951,0.7590000033378601,0.7509999871253967,0.7639999985694885,0.7580000162124634,0.7599999904632568,0.7620000243186951,0.7590000033378601,0.7609999775886536,0.7559999823570251,0.75,0.7509999871253967,0.7549999952316284,0.7540000081062317,0.7540000081062317],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.5099999904632568,0.6209999918937683,0.6549999713897705,0.6800000071525574,0.6830000281333923,0.703000009059906,0.7020000219345093,0.7110000252723694,0.7160000205039978,0.7129999995231628,0.7210000157356262,0.7250000238418579,0.7210000157356262,0.7310000061988831,0.7269999980926514,0.7269999980926514,0.7329999804496765,0.7459999918937683,0.734000027179718,0.7409999966621399,0.7390000224113464,0.7350000143051147,0.7509999871253967,0.7440000176429749,0.7379999756813049,0.7599999904632568,0.7400000095367432,0.7409999966621399,0.7590000033378601,0.7409999966621399,0.7440000176429749,0.7400000095367432,0.7450000047683716,0.75,0.7440000176429749,0.7409999966621399,0.7429999709129333,0.7440000176429749,0.7440000176429749,0.7559999823570251,0.7459999918937683,0.7559999823570251,0.7540000081062317,0.7599999904632568,0.7559999823570251,0.7490000128746033,0.7490000128746033,0.7429999709129333,0.7609999775886536,0.7519999742507935,0.7480000257492065,0.7490000128746033,0.7620000243186951,0.7580000162124634,0.7580000162124634,0.7540000081062317,0.7509999871253967,0.7519999742507935,0.7440000176429749,0.7459999918937683,0.7559999823570251,0.7620000243186951,0.746999979019165,0.7570000290870667,0.7620000243186951,0.7570000290870667,0.7540000081062317,0.7540000081062317,0.7570000290870667,0.7590000033378601,0.7519999742507935,0.75,0.7559999823570251,0.7590000033378601,0.7559999823570251,0.7519999742507935,0.7639999985694885,0.7620000243186951,0.7549999952316284,0.7490000128746033,0.7559999823570251,0.7639999985694885,0.7609999775886536,0.7609999775886536,0.7519999742507935,0.7549999952316284,0.7570000290870667,0.7620000243186951,0.7599999904632568,0.7639999985694885,0.7559999823570251,0.753000020980835,0.7649999856948853,0.753000020980835,0.7549999952316284,0.7609999775886536,0.7599999904632568,0.7680000066757202,0.7540000081062317,0.7559999823570251,0.7590000033378601,0.7590000033378601,0.7649999856948853,0.7639999985694885,0.7710000276565552,0.7699999809265137,0.7609999775886536,0.765999972820282,0.7670000195503235,0.7720000147819519,0.7639999985694885,0.7609999775886536,0.7549999952316284,0.7630000114440918,0.7670000195503235,0.7599999904632568,0.765999972820282,0.7670000195503235,0.7670000195503235,0.7670000195503235,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7829999923706055,0.7630000114440918,0.7720000147819519,0.7649999856948853,0.7630000114440918,0.7699999809265137,0.7720000147819519,0.7720000147819519,0.7689999938011169,0.777999997138977,0.7689999938011169,0.7760000228881836,0.7730000019073486,0.7799999713897705,0.7720000147819519,0.7760000228881836,0.7710000276565552,0.7770000100135803,0.777999997138977,0.7670000195503235,0.7789999842643738,0.7799999713897705,0.7749999761581421,0.7730000019073486,0.777999997138977,0.777999997138977,0.7799999713897705,0.7770000100135803,0.7770000100135803,0.7789999842643738,0.7760000228881836,0.7770000100135803,0.7770000100135803,0.7770000100135803,0.7739999890327454,0.7689999938011169,0.7760000228881836,0.777999997138977,0.7699999809265137,0.7739999890327454,0.7670000195503235,0.7699999809265137,0.7710000276565552,0.7730000019073486,0.7739999890327454,0.7680000066757202],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/cross_ind_unfiltered_comparison/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5,0.4979999959468841,0.4950000047683716,0.4950000047683716,0.5049999952316284,0.5329999923706055,0.5220000147819519,0.5139999985694885,0.5339999794960022,0.5130000114440918,0.5389999747276306,0.5400000214576721,0.5270000100135803,0.5320000052452087,0.5260000228881836,0.5370000004768372,0.527999997138977,0.5289999842643738,0.5339999794960022,0.5270000100135803,0.531000018119812,0.527999997138977,0.5400000214576721,0.5479999780654907,0.550000011920929,0.5400000214576721,0.5350000262260437,0.5410000085830688,0.5379999876022339,0.5299999713897705,0.5490000247955322,0.5509999990463257,0.5519999861717224,0.5429999828338623,0.5429999828338623,0.5440000295639038,0.5379999876022339,0.5379999876022339,0.5419999957084656,0.5609999895095825,0.5540000200271606,0.5370000004768372,0.5440000295639038,0.5410000085830688,0.5379999876022339,0.5329999923706055,0.5419999957084656,0.5419999957084656,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5400000214576721,0.5450000166893005,0.5509999990463257,0.5569999814033508,0.5550000071525574,0.5590000152587891,0.5479999780654907,0.5550000071525574,0.5440000295639038,0.5460000038146973,0.546999990940094,0.5559999942779541,0.5550000071525574,0.5490000247955322,0.5440000295639038,0.546999990940094,0.5450000166893005,0.546999990940094,0.5649999976158142,0.5490000247955322,0.5519999861717224,0.550000011920929,0.5509999990463257,0.5519999861717224,0.5519999861717224,0.5529999732971191,0.5490000247955322,0.546999990940094,0.550000011920929,0.5720000267028809,0.5619999766349792,0.5490000247955322,0.5680000185966492,0.5519999861717224,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.5630000233650208,0.5529999732971191,0.5619999766349792,0.5609999895095825,0.550000011920929,0.5479999780654907,0.5529999732971191,0.5519999861717224,0.5580000281333923,0.5590000152587891,0.5529999732971191,0.550000011920929,0.5680000185966492,0.5580000281333923,0.5630000233650208,0.5630000233650208,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5649999976158142,0.5659999847412109,0.5559999942779541,0.5659999847412109,0.5630000233650208,0.5509999990463257,0.5669999718666077,0.5669999718666077,0.5479999780654907,0.5540000200271606,0.5580000281333923,0.5519999861717224,0.5590000152587891,0.5590000152587891,0.5619999766349792,0.5509999990463257,0.546999990940094,0.5609999895095825,0.5540000200271606,0.5630000233650208,0.5580000281333923,0.5559999942779541,0.5680000185966492,0.5649999976158142,0.5619999766349792,0.5580000281333923,0.5630000233650208,0.5559999942779541,0.5540000200271606,0.5540000200271606,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5600000023841858,0.5460000038146973,0.5429999828338623,0.5580000281333923,0.5550000071525574,0.5580000281333923,0.5540000200271606,0.5609999895095825,0.5519999861717224,0.550000011920929,0.5519999861717224,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5590000152587891,0.5690000057220459,0.5640000104904175,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5640000104904175,0.5600000023841858,0.5550000071525574,0.5640000104904175,0.5600000023841858,0.5540000200271606],"label":"RefinedWeb"},"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4869999885559082,0.4959999918937683,0.4979999959468841,0.5099999904632568,0.515999972820282,0.5080000162124634,0.5249999761581421,0.5239999890327454,0.5299999713897705,0.5239999890327454,0.5149999856948853,0.5270000100135803,0.5249999761581421,0.5180000066757202,0.5220000147819519,0.5329999923706055,0.5289999842643738,0.5239999890327454,0.5299999713897705,0.5230000019073486,0.5130000114440918,0.5180000066757202,0.5299999713897705,0.5199999809265137,0.5270000100135803,0.5230000019073486,0.5299999713897705,0.5320000052452087,0.5429999828338623,0.527999997138977,0.5379999876022339,0.527999997138977,0.5419999957084656,0.5329999923706055,0.5450000166893005,0.5320000052452087,0.5410000085830688,0.5249999761581421,0.5400000214576721,0.5249999761581421,0.5289999842643738,0.5320000052452087,0.5339999794960022,0.5320000052452087,0.5350000262260437,0.5400000214576721,0.5450000166893005,0.5440000295639038,0.5400000214576721,0.5379999876022339,0.5350000262260437,0.5410000085830688,0.5490000247955322,0.531000018119812,0.5389999747276306,0.546999990940094,0.5529999732971191,0.5370000004768372,0.5440000295639038,0.5400000214576721,0.5490000247955322,0.550000011920929,0.5580000281333923,0.5609999895095825,0.5429999828338623,0.5529999732971191,0.5519999861717224,0.5450000166893005,0.550000011920929,0.5379999876022339,0.5490000247955322,0.5460000038146973,0.5419999957084656,0.5569999814033508,0.5509999990463257,0.5490000247955322,0.5529999732971191,0.5479999780654907,0.5590000152587891,0.5479999780654907,0.5509999990463257,0.5440000295639038,0.5509999990463257,0.5540000200271606,0.5559999942779541,0.5630000233650208,0.5649999976158142,0.5640000104904175,0.5649999976158142,0.5490000247955322,0.5709999799728394,0.5659999847412109,0.5630000233650208,0.5640000104904175,0.5580000281333923,0.546999990940094,0.5550000071525574,0.5580000281333923,0.5429999828338623,0.5440000295639038,0.5569999814033508,0.5569999814033508,0.5540000200271606,0.5550000071525574,0.5649999976158142,0.5540000200271606,0.5630000233650208,0.5609999895095825,0.5580000281333923,0.5509999990463257,0.5550000071525574,0.5550000071525574,0.5519999861717224,0.5609999895095825,0.5630000233650208,0.5509999990463257,0.550000011920929,0.5490000247955322,0.5540000200271606,0.550000011920929,0.5529999732971191,0.5460000038146973,0.550000011920929,0.5529999732971191,0.5519999861717224,0.5529999732971191,0.5609999895095825,0.5590000152587891,0.5550000071525574,0.550000011920929,0.5609999895095825,0.5619999766349792,0.5609999895095825,0.5540000200271606,0.550000011920929,0.5600000023841858,0.5559999942779541,0.5609999895095825,0.5569999814033508,0.5600000023841858,0.5680000185966492,0.5580000281333923,0.5559999942779541,0.5569999814033508,0.5669999718666077,0.5709999799728394,0.5640000104904175,0.5569999814033508,0.5600000023841858,0.5569999814033508,0.5649999976158142,0.5600000023841858,0.5580000281333923,0.5609999895095825,0.5590000152587891,0.5640000104904175,0.5529999732971191,0.5640000104904175,0.5649999976158142,0.5659999847412109,0.5630000233650208,0.5630000233650208,0.5619999766349792,0.5609999895095825,0.5559999942779541,0.5529999732971191,0.5600000023841858],"label":"FineWeb full MinHash"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.5239999890327454,0.4900000095367431,0.5040000081062317,0.5099999904632568,0.4990000128746032,0.5170000195503235,0.5040000081062317,0.5009999871253967,0.5230000019073486,0.5109999775886536,0.5059999823570251,0.5130000114440918,0.5090000033378601,0.5180000066757202,0.5220000147819519,0.5189999938011169,0.5180000066757202,0.5220000147819519,0.5120000243186951,0.5460000038146973,0.5239999890327454,0.5289999842643738,0.5440000295639038,0.5339999794960022,0.5299999713897705,0.5260000228881836,0.5360000133514404,0.5339999794960022,0.5360000133514404,0.5299999713897705,0.5180000066757202,0.5249999761581421,0.5440000295639038,0.5299999713897705,0.5339999794960022,0.5239999890327454,0.527999997138977,0.5139999985694885,0.5289999842643738,0.5360000133514404,0.5260000228881836,0.5389999747276306,0.5460000038146973,0.5270000100135803,0.5339999794960022,0.5320000052452087,0.5329999923706055,0.5260000228881836,0.5220000147819519,0.5260000228881836,0.5379999876022339,0.5410000085830688,0.5350000262260437,0.5389999747276306,0.5320000052452087,0.5389999747276306,0.5379999876022339,0.5329999923706055,0.5270000100135803,0.5170000195503235,0.5329999923706055,0.5370000004768372,0.5379999876022339,0.5249999761581421,0.5479999780654907,0.546999990940094,0.5400000214576721,0.5440000295639038,0.5360000133514404,0.5450000166893005,0.5440000295639038,0.5370000004768372,0.5370000004768372,0.5479999780654907,0.5379999876022339,0.5400000214576721,0.5479999780654907,0.5379999876022339,0.5509999990463257,0.5440000295639038,0.5379999876022339,0.550000011920929,0.5389999747276306,0.5370000004768372,0.5379999876022339,0.5419999957084656,0.5360000133514404,0.5509999990463257,0.5360000133514404,0.5419999957084656,0.5419999957084656,0.550000011920929,0.5360000133514404,0.5519999861717224,0.5540000200271606,0.546999990940094,0.5370000004768372,0.5379999876022339,0.5519999861717224,0.5329999923706055,0.5400000214576721,0.5429999828338623,0.550000011920929,0.5490000247955322,0.5360000133514404,0.550000011920929,0.5569999814033508,0.5490000247955322,0.5490000247955322,0.5479999780654907,0.5350000262260437,0.5490000247955322,0.5370000004768372,0.5440000295639038,0.5329999923706055,0.5440000295639038,0.5429999828338623,0.5389999747276306,0.5450000166893005,0.5320000052452087,0.5450000166893005,0.5400000214576721,0.5419999957084656,0.5460000038146973,0.5370000004768372,0.5400000214576721,0.5460000038146973,0.5370000004768372,0.5370000004768372,0.5460000038146973,0.5400000214576721,0.5490000247955322,0.5529999732971191,0.5379999876022339,0.5460000038146973,0.5450000166893005,0.5429999828338623,0.5460000038146973,0.5400000214576721,0.5479999780654907,0.5460000038146973,0.5540000200271606,0.5400000214576721,0.5350000262260437,0.5490000247955322,0.5460000038146973,0.5460000038146973,0.5509999990463257,0.5410000085830688,0.5429999828338623,0.5379999876022339,0.5450000166893005,0.5389999747276306,0.5400000214576721,0.5400000214576721,0.550000011920929,0.5440000295639038,0.5389999747276306,0.5450000166893005,0.5400000214576721,0.5389999747276306,0.5419999957084656,0.5410000085830688,0.5440000295639038,0.5519999861717224,0.5479999780654907,0.5450000166893005,0.5569999814033508],"label":"FineWeb filtered only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[0.4970000088214874,0.4880000054836273,0.492000013589859,0.5059999823570251,0.5139999985694885,0.5070000290870667,0.5090000033378601,0.5230000019073486,0.5189999938011169,0.5189999938011169,0.5220000147819519,0.5149999856948853,0.5260000228881836,0.5329999923706055,0.5230000019073486,0.5180000066757202,0.5289999842643738,0.5400000214576721,0.5410000085830688,0.5440000295639038,0.5329999923706055,0.550000011920929,0.5419999957084656,0.5360000133514404,0.5429999828338623,0.5429999828338623,0.5450000166893005,0.5490000247955322,0.5400000214576721,0.5509999990463257,0.5559999942779541,0.5479999780654907,0.5540000200271606,0.5490000247955322,0.5400000214576721,0.5429999828338623,0.5460000038146973,0.5370000004768372,0.5479999780654907,0.5550000071525574,0.5490000247955322,0.5400000214576721,0.5410000085830688,0.5460000038146973,0.546999990940094,0.5479999780654907,0.546999990940094,0.5509999990463257,0.5450000166893005,0.5590000152587891,0.5419999957084656,0.5540000200271606,0.5440000295639038,0.5450000166893005,0.5580000281333923,0.5540000200271606,0.5440000295639038,0.5619999766349792,0.5450000166893005,0.5600000023841858,0.5559999942779541,0.5600000023841858,0.5400000214576721,0.5569999814033508,0.5600000023841858,0.5619999766349792,0.5529999732971191,0.5649999976158142,0.5609999895095825,0.5550000071525574,0.5609999895095825,0.5580000281333923,0.5550000071525574,0.5619999766349792,0.5550000071525574,0.5519999861717224,0.5600000023841858,0.5550000071525574,0.5550000071525574,0.5590000152587891,0.5490000247955322,0.5580000281333923,0.5600000023841858,0.5419999957084656,0.5559999942779541,0.5559999942779541,0.5529999732971191,0.5609999895095825,0.5519999861717224,0.5569999814033508,0.5569999814033508,0.5509999990463257,0.5619999766349792,0.546999990940094,0.5619999766349792,0.5460000038146973,0.5529999732971191,0.5619999766349792,0.5690000057220459,0.5680000185966492,0.5720000267028809,0.5640000104904175,0.5550000071525574,0.5509999990463257,0.550000011920929,0.5600000023841858,0.5609999895095825,0.5630000233650208,0.5649999976158142,0.5529999732971191,0.5540000200271606,0.5529999732971191,0.5659999847412109,0.5600000023841858,0.5590000152587891,0.5619999766349792,0.5600000023841858,0.5730000138282776,0.5569999814033508,0.5690000057220459,0.5619999766349792,0.5680000185966492,0.578000009059906,0.5730000138282776,0.5550000071525574,0.5529999732971191,0.5600000023841858,0.5630000233650208,0.5590000152587891,0.5659999847412109,0.5669999718666077,0.5609999895095825,0.5630000233650208,0.5569999814033508,0.5490000247955322,0.5619999766349792,0.5550000071525574,0.5630000233650208,0.5559999942779541,0.5559999942779541,0.5649999976158142,0.5569999814033508,0.5619999766349792,0.5559999942779541,0.5669999718666077,0.5609999895095825,0.5690000057220459,0.5770000219345093,0.5690000057220459,0.5720000267028809,0.5619999766349792,0.5649999976158142,0.5669999718666077,0.5680000185966492,0.5699999928474426,0.5640000104904175,0.5609999895095825,0.5740000009536743,0.5690000057220459,0.5669999718666077,0.5720000267028809,0.5699999928474426,0.5709999799728394,0.5740000009536743,0.5680000185966492,0.5619999766349792,0.5690000057220459,0.5659999847412109,0.574999988079071],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/custom-_ilters.json DELETED
@@ -1 +0,0 @@
1
- {"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}},"defaultWindowSize":3}
 
 
data/plots/custom-_ilters/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":10,"default":3}}}
data/plots/custom-_ilters/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-_ilters/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters.json DELETED
@@ -1 +0,0 @@
1
- {"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}},"defaultWindowSize":3}
 
 
data/plots/custom-filters/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":10,"default":3}}}
data/plots/custom-filters/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom-filters/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}}}
data/plots/custom_filters.json DELETED
@@ -1 +0,0 @@
1
- {"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}},"defaultWindowSize":3}
 
 
data/plots/custom_filters/agg_score.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/arc_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/commonsense_qa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/hellaswag_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"files":{"agg_score":{"file":"agg_score.json"},"commonsense_qa/acc_norm":{"file":"commonsense_qa_acc_norm.json"},"hellaswag/acc_norm":{"file":"hellaswag_acc_norm.json"},"openbookqa/acc_norm":{"file":"openbookqa_acc_norm.json"},"piqa/acc_norm":{"file":"piqa_acc_norm.json"},"winogrande/acc_norm":{"file":"winogrande_acc_norm.json"},"arc/acc_norm":{"file":"arc_acc_norm.json"},"mmlu/acc_norm":{"file":"mmlu_acc_norm.json"}},"settings":{"slider":{"min":0,"max":10,"default":3}}}
data/plots/custom_filters/mmlu_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/openbookqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/piqa_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}
data/plots/custom_filters/winogrande_acc_norm.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters combined"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}}}