File size: 1,265 Bytes
0932e7b
1
{"data":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6130000054836273,0.6599999964237213,0.6704999804496765,0.6845000088214874,0.6854999959468842,0.6895000040531158,0.7005000114440918,0.6990000009536743,0.7090000212192535,0.707999974489212,0.7125000059604645,0.7114999890327454,0.7094999849796295,0.7150000035762787],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.5995000004768372,0.6385000050067902,0.6534999907016754,0.6675000190734863,0.6755000054836273,0.6814999878406525,0.6859999895095825,0.6840000152587891,0.6924999952316284,0.6944999992847443,0.69200000166893,0.6995000243186951,0.6960000097751617,0.6979999840259552],"label":"WET data"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}