blogpost-fineweb-v1 / data /plots /wet_comparison /openbookqa_acc_norm.json
hynky's picture
hynky HF staff
new plotting code (JIT)
0932e7b
raw
history blame
1.28 kB
{"data":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26500000059604645,0.24800000339746475,0.28299999237060547,0.28200000524520874,0.30900000035762787,0.3100000023841858,0.3020000010728836,0.3149999976158142,0.3110000044107437,0.32100000977516174,0.31700000166893005,0.31599999964237213,0.31599999964237213,0.31900000572204584],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2580000013113022,0.2719999998807907,0.2770000100135803,0.27300000190734863,0.2880000025033951,0.2989999949932098,0.29500000178813934,0.29899999499320984,0.3100000023841858,0.30300000309944153,0.30600000917911524,0.3040000051259994,0.3110000044107437,0.30300000309944153],"label":"WET data"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}