{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 93654,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003203280158882696,
      "grad_norm": 0.9955105781555176,
      "learning_rate": 5e-06,
      "loss": 1.2173,
      "num_input_tokens_seen": 819200,
      "step": 100
    },
    {
      "epoch": 0.006406560317765392,
      "grad_norm": 8.62949275970459,
      "learning_rate": 1e-05,
      "loss": 1.1953,
      "num_input_tokens_seen": 1638400,
      "step": 200
    },
    {
      "epoch": 0.009609840476648087,
      "grad_norm": 1.0293811559677124,
      "learning_rate": 1.5e-05,
      "loss": 1.1905,
      "num_input_tokens_seen": 2457600,
      "step": 300
    },
    {
      "epoch": 0.012813120635530783,
      "grad_norm": 6.295543193817139,
      "learning_rate": 2e-05,
      "loss": 1.1391,
      "num_input_tokens_seen": 3276800,
      "step": 400
    },
    {
      "epoch": 0.01601640079441348,
      "grad_norm": 3.0551528930664062,
      "learning_rate": 2.5e-05,
      "loss": 1.1383,
      "num_input_tokens_seen": 4096000,
      "step": 500
    },
    {
      "epoch": 0.019219680953296174,
      "grad_norm": 0.8111634850502014,
      "learning_rate": 3e-05,
      "loss": 1.1022,
      "num_input_tokens_seen": 4915200,
      "step": 600
    },
    {
      "epoch": 0.022422961112178872,
      "grad_norm": 0.77763432264328,
      "learning_rate": 3.5e-05,
      "loss": 1.0805,
      "num_input_tokens_seen": 5734400,
      "step": 700
    },
    {
      "epoch": 0.025626241271061567,
      "grad_norm": 1.9141496419906616,
      "learning_rate": 4e-05,
      "loss": 1.0755,
      "num_input_tokens_seen": 6553600,
      "step": 800
    },
    {
      "epoch": 0.028829521429944265,
      "grad_norm": 0.8061490058898926,
      "learning_rate": 4.5e-05,
      "loss": 1.0995,
      "num_input_tokens_seen": 7372800,
      "step": 900
    },
    {
      "epoch": 0.03203280158882696,
      "grad_norm": 0.6671661734580994,
      "learning_rate": 5e-05,
      "loss": 1.0835,
      "num_input_tokens_seen": 8192000,
      "step": 1000
    },
    {
      "epoch": 0.035236081747709654,
      "grad_norm": 2.4559221267700195,
      "learning_rate": 4.9999856291983216e-05,
      "loss": 1.0848,
      "num_input_tokens_seen": 9011200,
      "step": 1100
    },
    {
      "epoch": 0.03843936190659235,
      "grad_norm": 0.6218218803405762,
      "learning_rate": 4.9999425169585025e-05,
      "loss": 1.0621,
      "num_input_tokens_seen": 9830400,
      "step": 1200
    },
    {
      "epoch": 0.04164264206547505,
      "grad_norm": 1.1977851390838623,
      "learning_rate": 4.999870663776188e-05,
      "loss": 1.0774,
      "num_input_tokens_seen": 10649600,
      "step": 1300
    },
    {
      "epoch": 0.044845922224357744,
      "grad_norm": 0.581513524055481,
      "learning_rate": 4.99977007047745e-05,
      "loss": 1.0204,
      "num_input_tokens_seen": 11468800,
      "step": 1400
    },
    {
      "epoch": 0.04804920238324044,
      "grad_norm": 0.6710864901542664,
      "learning_rate": 4.999640738218772e-05,
      "loss": 1.0509,
      "num_input_tokens_seen": 12288000,
      "step": 1500
    },
    {
      "epoch": 0.05125248254212313,
      "grad_norm": 2.048499345779419,
      "learning_rate": 4.99948266848704e-05,
      "loss": 1.1401,
      "num_input_tokens_seen": 13107200,
      "step": 1600
    },
    {
      "epoch": 0.05445576270100583,
      "grad_norm": 0.6593829989433289,
      "learning_rate": 4.999295863099528e-05,
      "loss": 1.042,
      "num_input_tokens_seen": 13926400,
      "step": 1700
    },
    {
      "epoch": 0.05765904285988853,
      "grad_norm": 0.5166763663291931,
      "learning_rate": 4.999080324203867e-05,
      "loss": 1.1398,
      "num_input_tokens_seen": 14745600,
      "step": 1800
    },
    {
      "epoch": 0.060862323018771224,
      "grad_norm": 0.4539300203323364,
      "learning_rate": 4.9988360542780333e-05,
      "loss": 1.0759,
      "num_input_tokens_seen": 15564800,
      "step": 1900
    },
    {
      "epoch": 0.06406560317765392,
      "grad_norm": 0.7282894253730774,
      "learning_rate": 4.998563056130308e-05,
      "loss": 1.0988,
      "num_input_tokens_seen": 16384000,
      "step": 2000
    },
    {
      "epoch": 0.06726888333653662,
      "grad_norm": 0.6337546706199646,
      "learning_rate": 4.998261332899255e-05,
      "loss": 1.0642,
      "num_input_tokens_seen": 17203200,
      "step": 2100
    },
    {
      "epoch": 0.07047216349541931,
      "grad_norm": 0.6283242702484131,
      "learning_rate": 4.997930888053677e-05,
      "loss": 1.076,
      "num_input_tokens_seen": 18022400,
      "step": 2200
    },
    {
      "epoch": 0.07367544365430201,
      "grad_norm": 0.6066380739212036,
      "learning_rate": 4.99757172539258e-05,
      "loss": 1.0616,
      "num_input_tokens_seen": 18841600,
      "step": 2300
    },
    {
      "epoch": 0.0768787238131847,
      "grad_norm": 0.506839394569397,
      "learning_rate": 4.997183849045129e-05,
      "loss": 1.0691,
      "num_input_tokens_seen": 19660800,
      "step": 2400
    },
    {
      "epoch": 0.0800820039720674,
      "grad_norm": 0.6370711922645569,
      "learning_rate": 4.996767263470599e-05,
      "loss": 1.0463,
      "num_input_tokens_seen": 20480000,
      "step": 2500
    },
    {
      "epoch": 0.0832852841309501,
      "grad_norm": 2.0462234020233154,
      "learning_rate": 4.996321973458325e-05,
      "loss": 1.0703,
      "num_input_tokens_seen": 21299200,
      "step": 2600
    },
    {
      "epoch": 0.08648856428983279,
      "grad_norm": 0.6036199331283569,
      "learning_rate": 4.9958479841276446e-05,
      "loss": 1.0397,
      "num_input_tokens_seen": 22118400,
      "step": 2700
    },
    {
      "epoch": 0.08969184444871549,
      "grad_norm": 0.6303982138633728,
      "learning_rate": 4.995345300927845e-05,
      "loss": 1.0837,
      "num_input_tokens_seen": 22937600,
      "step": 2800
    },
    {
      "epoch": 0.09289512460759818,
      "grad_norm": 0.5572041869163513,
      "learning_rate": 4.994813929638096e-05,
      "loss": 1.0399,
      "num_input_tokens_seen": 23756800,
      "step": 2900
    },
    {
      "epoch": 0.09609840476648088,
      "grad_norm": 0.6958311200141907,
      "learning_rate": 4.9942538763673794e-05,
      "loss": 1.0634,
      "num_input_tokens_seen": 24576000,
      "step": 3000
    },
    {
      "epoch": 0.09930168492536358,
      "grad_norm": 0.583613395690918,
      "learning_rate": 4.993665147554429e-05,
      "loss": 1.0472,
      "num_input_tokens_seen": 25395200,
      "step": 3100
    },
    {
      "epoch": 0.10250496508424627,
      "grad_norm": 0.5093560814857483,
      "learning_rate": 4.9930477499676495e-05,
      "loss": 1.0774,
      "num_input_tokens_seen": 26214400,
      "step": 3200
    },
    {
      "epoch": 0.10570824524312897,
      "grad_norm": 1.930864691734314,
      "learning_rate": 4.992401690705038e-05,
      "loss": 1.0402,
      "num_input_tokens_seen": 27033600,
      "step": 3300
    },
    {
      "epoch": 0.10891152540201166,
      "grad_norm": 0.6102778911590576,
      "learning_rate": 4.9917269771941056e-05,
      "loss": 1.0353,
      "num_input_tokens_seen": 27852800,
      "step": 3400
    },
    {
      "epoch": 0.11211480556089436,
      "grad_norm": 0.5592427849769592,
      "learning_rate": 4.991023617191792e-05,
      "loss": 1.0776,
      "num_input_tokens_seen": 28672000,
      "step": 3500
    },
    {
      "epoch": 0.11531808571977706,
      "grad_norm": 0.6671651005744934,
      "learning_rate": 4.990291618784377e-05,
      "loss": 1.1083,
      "num_input_tokens_seen": 29491200,
      "step": 3600
    },
    {
      "epoch": 0.11852136587865975,
      "grad_norm": 1.4246577024459839,
      "learning_rate": 4.989530990387381e-05,
      "loss": 1.0262,
      "num_input_tokens_seen": 30310400,
      "step": 3700
    },
    {
      "epoch": 0.12172464603754245,
      "grad_norm": 2.4318628311157227,
      "learning_rate": 4.988741740745477e-05,
      "loss": 1.0441,
      "num_input_tokens_seen": 31129600,
      "step": 3800
    },
    {
      "epoch": 0.12492792619642513,
      "grad_norm": 2.1933786869049072,
      "learning_rate": 4.987923878932386e-05,
      "loss": 1.0375,
      "num_input_tokens_seen": 31948800,
      "step": 3900
    },
    {
      "epoch": 0.12813120635530784,
      "grad_norm": 0.5265761017799377,
      "learning_rate": 4.9870774143507696e-05,
      "loss": 1.0041,
      "num_input_tokens_seen": 32768000,
      "step": 4000
    },
    {
      "epoch": 0.13133448651419052,
      "grad_norm": 0.6378248929977417,
      "learning_rate": 4.98620235673213e-05,
      "loss": 1.0798,
      "num_input_tokens_seen": 33587200,
      "step": 4100
    },
    {
      "epoch": 0.13453776667307324,
      "grad_norm": 0.5426807999610901,
      "learning_rate": 4.9852987161366895e-05,
      "loss": 1.1014,
      "num_input_tokens_seen": 34406400,
      "step": 4200
    },
    {
      "epoch": 0.13774104683195593,
      "grad_norm": 0.587978720664978,
      "learning_rate": 4.9843665029532796e-05,
      "loss": 1.0321,
      "num_input_tokens_seen": 35225600,
      "step": 4300
    },
    {
      "epoch": 0.14094432699083861,
      "grad_norm": 0.8025338649749756,
      "learning_rate": 4.983405727899221e-05,
      "loss": 0.9954,
      "num_input_tokens_seen": 36044800,
      "step": 4400
    },
    {
      "epoch": 0.1441476071497213,
      "grad_norm": 0.5788518786430359,
      "learning_rate": 4.982416402020201e-05,
      "loss": 1.0049,
      "num_input_tokens_seen": 36864000,
      "step": 4500
    },
    {
      "epoch": 0.14735088730860402,
      "grad_norm": 0.629861056804657,
      "learning_rate": 4.9813985366901435e-05,
      "loss": 1.0586,
      "num_input_tokens_seen": 37683200,
      "step": 4600
    },
    {
      "epoch": 0.1505541674674867,
      "grad_norm": 0.5835918188095093,
      "learning_rate": 4.980352143611081e-05,
      "loss": 1.0949,
      "num_input_tokens_seen": 38502400,
      "step": 4700
    },
    {
      "epoch": 0.1537574476263694,
      "grad_norm": 0.5552580952644348,
      "learning_rate": 4.979277234813021e-05,
      "loss": 1.0374,
      "num_input_tokens_seen": 39321600,
      "step": 4800
    },
    {
      "epoch": 0.1569607277852521,
      "grad_norm": 0.7137876749038696,
      "learning_rate": 4.978173822653802e-05,
      "loss": 1.0195,
      "num_input_tokens_seen": 40140800,
      "step": 4900
    },
    {
      "epoch": 0.1601640079441348,
      "grad_norm": 0.6314465403556824,
      "learning_rate": 4.9770419198189595e-05,
      "loss": 1.0661,
      "num_input_tokens_seen": 40960000,
      "step": 5000
    },
    {
      "epoch": 0.16336728810301748,
      "grad_norm": 0.5494422316551208,
      "learning_rate": 4.975881539321574e-05,
      "loss": 1.0168,
      "num_input_tokens_seen": 41779200,
      "step": 5100
    },
    {
      "epoch": 0.1665705682619002,
      "grad_norm": 2.2284624576568604,
      "learning_rate": 4.974692694502123e-05,
      "loss": 1.0523,
      "num_input_tokens_seen": 42598400,
      "step": 5200
    },
    {
      "epoch": 0.16977384842078289,
      "grad_norm": 0.5189602375030518,
      "learning_rate": 4.973475399028331e-05,
      "loss": 1.0294,
      "num_input_tokens_seen": 43417600,
      "step": 5300
    },
    {
      "epoch": 0.17297712857966557,
      "grad_norm": 2.1537561416625977,
      "learning_rate": 4.972229666895006e-05,
      "loss": 0.9866,
      "num_input_tokens_seen": 44236800,
      "step": 5400
    },
    {
      "epoch": 0.17618040873854826,
      "grad_norm": 0.5834473967552185,
      "learning_rate": 4.970955512423884e-05,
      "loss": 0.99,
      "num_input_tokens_seen": 45056000,
      "step": 5500
    },
    {
      "epoch": 0.17938368889743098,
      "grad_norm": 0.6151788830757141,
      "learning_rate": 4.969652950263462e-05,
      "loss": 1.0292,
      "num_input_tokens_seen": 45875200,
      "step": 5600
    },
    {
      "epoch": 0.18258696905631366,
      "grad_norm": 0.641342043876648,
      "learning_rate": 4.96832199538883e-05,
      "loss": 1.0712,
      "num_input_tokens_seen": 46694400,
      "step": 5700
    },
    {
      "epoch": 0.18579024921519635,
      "grad_norm": 0.7882746458053589,
      "learning_rate": 4.966962663101499e-05,
      "loss": 1.0279,
      "num_input_tokens_seen": 47513600,
      "step": 5800
    },
    {
      "epoch": 0.18899352937407907,
      "grad_norm": 0.633734405040741,
      "learning_rate": 4.965574969029223e-05,
      "loss": 1.0448,
      "num_input_tokens_seen": 48332800,
      "step": 5900
    },
    {
      "epoch": 0.19219680953296175,
      "grad_norm": 1.5470919609069824,
      "learning_rate": 4.9641589291258255e-05,
      "loss": 1.0492,
      "num_input_tokens_seen": 49152000,
      "step": 6000
    },
    {
      "epoch": 0.19540008969184444,
      "grad_norm": 1.6563118696212769,
      "learning_rate": 4.962714559671008e-05,
      "loss": 1.0593,
      "num_input_tokens_seen": 49971200,
      "step": 6100
    },
    {
      "epoch": 0.19860336985072716,
      "grad_norm": 0.6741557717323303,
      "learning_rate": 4.961241877270169e-05,
      "loss": 1.0054,
      "num_input_tokens_seen": 50790400,
      "step": 6200
    },
    {
      "epoch": 0.20180665000960984,
      "grad_norm": 0.6842678785324097,
      "learning_rate": 4.9597408988542096e-05,
      "loss": 0.9865,
      "num_input_tokens_seen": 51609600,
      "step": 6300
    },
    {
      "epoch": 0.20500993016849253,
      "grad_norm": 8.189310073852539,
      "learning_rate": 4.958211641679339e-05,
      "loss": 1.0529,
      "num_input_tokens_seen": 52428800,
      "step": 6400
    },
    {
      "epoch": 0.20821321032737522,
      "grad_norm": 0.8904711604118347,
      "learning_rate": 4.956654123326881e-05,
      "loss": 1.0272,
      "num_input_tokens_seen": 53248000,
      "step": 6500
    },
    {
      "epoch": 0.21141649048625794,
      "grad_norm": 0.7857553362846375,
      "learning_rate": 4.9550683617030624e-05,
      "loss": 1.0295,
      "num_input_tokens_seen": 54067200,
      "step": 6600
    },
    {
      "epoch": 0.21461977064514062,
      "grad_norm": 0.6658555865287781,
      "learning_rate": 4.9534543750388185e-05,
      "loss": 0.9849,
      "num_input_tokens_seen": 54886400,
      "step": 6700
    },
    {
      "epoch": 0.2178230508040233,
      "grad_norm": 0.6390406489372253,
      "learning_rate": 4.951812181889573e-05,
      "loss": 0.9597,
      "num_input_tokens_seen": 55705600,
      "step": 6800
    },
    {
      "epoch": 0.22102633096290603,
      "grad_norm": 0.5161400437355042,
      "learning_rate": 4.950141801135034e-05,
      "loss": 1.0008,
      "num_input_tokens_seen": 56524800,
      "step": 6900
    },
    {
      "epoch": 0.2242296111217887,
      "grad_norm": 0.7651511430740356,
      "learning_rate": 4.948443251978968e-05,
      "loss": 0.9889,
      "num_input_tokens_seen": 57344000,
      "step": 7000
    },
    {
      "epoch": 0.2274328912806714,
      "grad_norm": 0.5069282054901123,
      "learning_rate": 4.946716553948987e-05,
      "loss": 0.9869,
      "num_input_tokens_seen": 58163200,
      "step": 7100
    },
    {
      "epoch": 0.23063617143955412,
      "grad_norm": 0.5041384696960449,
      "learning_rate": 4.9449617268963164e-05,
      "loss": 0.9669,
      "num_input_tokens_seen": 58982400,
      "step": 7200
    },
    {
      "epoch": 0.2338394515984368,
      "grad_norm": 1.7203638553619385,
      "learning_rate": 4.943178790995576e-05,
      "loss": 1.0426,
      "num_input_tokens_seen": 59801600,
      "step": 7300
    },
    {
      "epoch": 0.2370427317573195,
      "grad_norm": 0.8364699482917786,
      "learning_rate": 4.941367766744539e-05,
      "loss": 0.9894,
      "num_input_tokens_seen": 60620800,
      "step": 7400
    },
    {
      "epoch": 0.24024601191620218,
      "grad_norm": 0.42120370268821716,
      "learning_rate": 4.939528674963902e-05,
      "loss": 0.996,
      "num_input_tokens_seen": 61440000,
      "step": 7500
    },
    {
      "epoch": 0.2434492920750849,
      "grad_norm": 4.017838001251221,
      "learning_rate": 4.937661536797044e-05,
      "loss": 1.0557,
      "num_input_tokens_seen": 62259200,
      "step": 7600
    },
    {
      "epoch": 0.24665257223396758,
      "grad_norm": 0.7951923608779907,
      "learning_rate": 4.9357663737097824e-05,
      "loss": 1.0614,
      "num_input_tokens_seen": 63078400,
      "step": 7700
    },
    {
      "epoch": 0.24985585239285027,
      "grad_norm": 0.7139900922775269,
      "learning_rate": 4.9338432074901276e-05,
      "loss": 1.0525,
      "num_input_tokens_seen": 63897600,
      "step": 7800
    },
    {
      "epoch": 0.25305913255173296,
      "grad_norm": 0.6686214208602905,
      "learning_rate": 4.931892060248032e-05,
      "loss": 1.0947,
      "num_input_tokens_seen": 64716800,
      "step": 7900
    },
    {
      "epoch": 0.2562624127106157,
      "grad_norm": 0.737429678440094,
      "learning_rate": 4.929912954415135e-05,
      "loss": 0.9886,
      "num_input_tokens_seen": 65536000,
      "step": 8000
    },
    {
      "epoch": 0.2594656928694984,
      "grad_norm": 0.49794241786003113,
      "learning_rate": 4.9279059127445074e-05,
      "loss": 1.0407,
      "num_input_tokens_seen": 66355200,
      "step": 8100
    },
    {
      "epoch": 0.26266897302838105,
      "grad_norm": 0.6615239977836609,
      "learning_rate": 4.925870958310388e-05,
      "loss": 1.021,
      "num_input_tokens_seen": 67174400,
      "step": 8200
    },
    {
      "epoch": 0.26587225318726376,
      "grad_norm": 1.568616509437561,
      "learning_rate": 4.923808114507916e-05,
      "loss": 1.027,
      "num_input_tokens_seen": 67993600,
      "step": 8300
    },
    {
      "epoch": 0.2690755333461465,
      "grad_norm": 0.6627603769302368,
      "learning_rate": 4.921717405052868e-05,
      "loss": 1.0552,
      "num_input_tokens_seen": 68812800,
      "step": 8400
    },
    {
      "epoch": 0.27227881350502914,
      "grad_norm": 0.5849776864051819,
      "learning_rate": 4.9195988539813814e-05,
      "loss": 1.0552,
      "num_input_tokens_seen": 69632000,
      "step": 8500
    },
    {
      "epoch": 0.27548209366391185,
      "grad_norm": 1.6558514833450317,
      "learning_rate": 4.917452485649677e-05,
      "loss": 1.0516,
      "num_input_tokens_seen": 70451200,
      "step": 8600
    },
    {
      "epoch": 0.27868537382279457,
      "grad_norm": 0.5784972310066223,
      "learning_rate": 4.9152783247337823e-05,
      "loss": 1.0425,
      "num_input_tokens_seen": 71270400,
      "step": 8700
    },
    {
      "epoch": 0.28188865398167723,
      "grad_norm": 0.713585376739502,
      "learning_rate": 4.9130763962292453e-05,
      "loss": 1.0633,
      "num_input_tokens_seen": 72089600,
      "step": 8800
    },
    {
      "epoch": 0.28509193414055994,
      "grad_norm": 0.678617000579834,
      "learning_rate": 4.9108467254508487e-05,
      "loss": 1.0208,
      "num_input_tokens_seen": 72908800,
      "step": 8900
    },
    {
      "epoch": 0.2882952142994426,
      "grad_norm": 0.6494852900505066,
      "learning_rate": 4.908589338032316e-05,
      "loss": 1.0193,
      "num_input_tokens_seen": 73728000,
      "step": 9000
    },
    {
      "epoch": 0.2914984944583253,
      "grad_norm": 0.6913178563117981,
      "learning_rate": 4.9063042599260234e-05,
      "loss": 0.9783,
      "num_input_tokens_seen": 74547200,
      "step": 9100
    },
    {
      "epoch": 0.29470177461720803,
      "grad_norm": 0.6419298648834229,
      "learning_rate": 4.9039915174026916e-05,
      "loss": 1.0251,
      "num_input_tokens_seen": 75366400,
      "step": 9200
    },
    {
      "epoch": 0.2979050547760907,
      "grad_norm": 0.6663874983787537,
      "learning_rate": 4.9016511370510945e-05,
      "loss": 1.009,
      "num_input_tokens_seen": 76185600,
      "step": 9300
    },
    {
      "epoch": 0.3011083349349734,
      "grad_norm": 0.5730396509170532,
      "learning_rate": 4.8992831457777446e-05,
      "loss": 1.0154,
      "num_input_tokens_seen": 77004800,
      "step": 9400
    },
    {
      "epoch": 0.3043116150938561,
      "grad_norm": 0.5048360824584961,
      "learning_rate": 4.896887570806588e-05,
      "loss": 1.0498,
      "num_input_tokens_seen": 77824000,
      "step": 9500
    },
    {
      "epoch": 0.3075148952527388,
      "grad_norm": 1.7296109199523926,
      "learning_rate": 4.89446443967869e-05,
      "loss": 1.0426,
      "num_input_tokens_seen": 78643200,
      "step": 9600
    },
    {
      "epoch": 0.3107181754116215,
      "grad_norm": 0.8863735198974609,
      "learning_rate": 4.892013780251922e-05,
      "loss": 0.9947,
      "num_input_tokens_seen": 79462400,
      "step": 9700
    },
    {
      "epoch": 0.3139214555705042,
      "grad_norm": 2.7898573875427246,
      "learning_rate": 4.889535620700635e-05,
      "loss": 1.0301,
      "num_input_tokens_seen": 80281600,
      "step": 9800
    },
    {
      "epoch": 0.3171247357293869,
      "grad_norm": 0.5569226741790771,
      "learning_rate": 4.887029989515341e-05,
      "loss": 0.976,
      "num_input_tokens_seen": 81100800,
      "step": 9900
    },
    {
      "epoch": 0.3203280158882696,
      "grad_norm": 0.46732258796691895,
      "learning_rate": 4.884496915502385e-05,
      "loss": 1.0477,
      "num_input_tokens_seen": 81920000,
      "step": 10000
    },
    {
      "epoch": 0.3235312960471523,
      "grad_norm": 0.45553821325302124,
      "learning_rate": 4.881936427783607e-05,
      "loss": 1.0019,
      "num_input_tokens_seen": 82739200,
      "step": 10100
    },
    {
      "epoch": 0.32673457620603497,
      "grad_norm": 0.7193503379821777,
      "learning_rate": 4.879348555796018e-05,
      "loss": 0.997,
      "num_input_tokens_seen": 83558400,
      "step": 10200
    },
    {
      "epoch": 0.3299378563649177,
      "grad_norm": 0.6309390664100647,
      "learning_rate": 4.8767333292914544e-05,
      "loss": 0.9891,
      "num_input_tokens_seen": 84377600,
      "step": 10300
    },
    {
      "epoch": 0.3331411365238004,
      "grad_norm": 0.555618166923523,
      "learning_rate": 4.874090778336235e-05,
      "loss": 1.0175,
      "num_input_tokens_seen": 85196800,
      "step": 10400
    },
    {
      "epoch": 0.33634441668268306,
      "grad_norm": 1.5369619131088257,
      "learning_rate": 4.8714209333108236e-05,
      "loss": 1.0151,
      "num_input_tokens_seen": 86016000,
      "step": 10500
    },
    {
      "epoch": 0.33954769684156577,
      "grad_norm": 0.5254389047622681,
      "learning_rate": 4.868723824909469e-05,
      "loss": 1.025,
      "num_input_tokens_seen": 86835200,
      "step": 10600
    },
    {
      "epoch": 0.3427509770004485,
      "grad_norm": 0.5323970913887024,
      "learning_rate": 4.8659994841398594e-05,
      "loss": 1.0334,
      "num_input_tokens_seen": 87654400,
      "step": 10700
    },
    {
      "epoch": 0.34595425715933115,
      "grad_norm": 0.602602481842041,
      "learning_rate": 4.863247942322764e-05,
      "loss": 1.0237,
      "num_input_tokens_seen": 88473600,
      "step": 10800
    },
    {
      "epoch": 0.34915753731821386,
      "grad_norm": 2.1106760501861572,
      "learning_rate": 4.860469231091671e-05,
      "loss": 1.0181,
      "num_input_tokens_seen": 89292800,
      "step": 10900
    },
    {
      "epoch": 0.3523608174770965,
      "grad_norm": 0.6294669508934021,
      "learning_rate": 4.857663382392428e-05,
      "loss": 1.0289,
      "num_input_tokens_seen": 90112000,
      "step": 11000
    },
    {
      "epoch": 0.35556409763597924,
      "grad_norm": 0.5473527908325195,
      "learning_rate": 4.854830428482871e-05,
      "loss": 1.0296,
      "num_input_tokens_seen": 90931200,
      "step": 11100
    },
    {
      "epoch": 0.35876737779486195,
      "grad_norm": 0.5963702201843262,
      "learning_rate": 4.851970401932454e-05,
      "loss": 0.9784,
      "num_input_tokens_seen": 91750400,
      "step": 11200
    },
    {
      "epoch": 0.3619706579537446,
      "grad_norm": 1.5987745523452759,
      "learning_rate": 4.849083335621878e-05,
      "loss": 1.0842,
      "num_input_tokens_seen": 92569600,
      "step": 11300
    },
    {
      "epoch": 0.3651739381126273,
      "grad_norm": 1.9906154870986938,
      "learning_rate": 4.846169262742709e-05,
      "loss": 1.0196,
      "num_input_tokens_seen": 93388800,
      "step": 11400
    },
    {
      "epoch": 0.36837721827151004,
      "grad_norm": 0.7897935509681702,
      "learning_rate": 4.843228216796996e-05,
      "loss": 1.0103,
      "num_input_tokens_seen": 94208000,
      "step": 11500
    },
    {
      "epoch": 0.3715804984303927,
      "grad_norm": 0.6737790107727051,
      "learning_rate": 4.8402602315968905e-05,
      "loss": 1.0551,
      "num_input_tokens_seen": 95027200,
      "step": 11600
    },
    {
      "epoch": 0.3747837785892754,
      "grad_norm": 0.5573664307594299,
      "learning_rate": 4.837265341264253e-05,
      "loss": 1.0221,
      "num_input_tokens_seen": 95846400,
      "step": 11700
    },
    {
      "epoch": 0.37798705874815813,
      "grad_norm": 0.6558005809783936,
      "learning_rate": 4.834243580230266e-05,
      "loss": 0.975,
      "num_input_tokens_seen": 96665600,
      "step": 11800
    },
    {
      "epoch": 0.3811903389070408,
      "grad_norm": 0.7646604776382446,
      "learning_rate": 4.831194983235029e-05,
      "loss": 1.0152,
      "num_input_tokens_seen": 97484800,
      "step": 11900
    },
    {
      "epoch": 0.3843936190659235,
      "grad_norm": 0.5662313103675842,
      "learning_rate": 4.82811958532717e-05,
      "loss": 0.9909,
      "num_input_tokens_seen": 98304000,
      "step": 12000
    },
    {
      "epoch": 0.3875968992248062,
      "grad_norm": 0.5597667098045349,
      "learning_rate": 4.825017421863436e-05,
      "loss": 1.0208,
      "num_input_tokens_seen": 99123200,
      "step": 12100
    },
    {
      "epoch": 0.3908001793836889,
      "grad_norm": 0.5832675099372864,
      "learning_rate": 4.821888528508287e-05,
      "loss": 1.0189,
      "num_input_tokens_seen": 99942400,
      "step": 12200
    },
    {
      "epoch": 0.3940034595425716,
      "grad_norm": 1.6424989700317383,
      "learning_rate": 4.8187329412334884e-05,
      "loss": 1.055,
      "num_input_tokens_seen": 100761600,
      "step": 12300
    },
    {
      "epoch": 0.3972067397014543,
      "grad_norm": 0.4590611755847931,
      "learning_rate": 4.815550696317695e-05,
      "loss": 1.0586,
      "num_input_tokens_seen": 101580800,
      "step": 12400
    },
    {
      "epoch": 0.400410019860337,
      "grad_norm": 0.5123792290687561,
      "learning_rate": 4.812341830346035e-05,
      "loss": 1.0073,
      "num_input_tokens_seen": 102400000,
      "step": 12500
    },
    {
      "epoch": 0.4036133000192197,
      "grad_norm": 1.7758103609085083,
      "learning_rate": 4.80910638020969e-05,
      "loss": 1.0012,
      "num_input_tokens_seen": 103219200,
      "step": 12600
    },
    {
      "epoch": 0.40681658017810235,
      "grad_norm": 0.6465420722961426,
      "learning_rate": 4.805844383105469e-05,
      "loss": 0.9919,
      "num_input_tokens_seen": 104038400,
      "step": 12700
    },
    {
      "epoch": 0.41001986033698506,
      "grad_norm": 0.6052021980285645,
      "learning_rate": 4.802555876535383e-05,
      "loss": 1.0369,
      "num_input_tokens_seen": 104857600,
      "step": 12800
    },
    {
      "epoch": 0.4132231404958678,
      "grad_norm": 0.5069152116775513,
      "learning_rate": 4.799240898306214e-05,
      "loss": 1.0105,
      "num_input_tokens_seen": 105676800,
      "step": 12900
    },
    {
      "epoch": 0.41642642065475044,
      "grad_norm": 0.6421388387680054,
      "learning_rate": 4.7958994865290766e-05,
      "loss": 0.9861,
      "num_input_tokens_seen": 106496000,
      "step": 13000
    },
    {
      "epoch": 0.41962970081363316,
      "grad_norm": 0.6774849891662598,
      "learning_rate": 4.7925316796189826e-05,
      "loss": 0.9771,
      "num_input_tokens_seen": 107315200,
      "step": 13100
    },
    {
      "epoch": 0.42283298097251587,
      "grad_norm": 2.159661293029785,
      "learning_rate": 4.789137516294402e-05,
      "loss": 1.0182,
      "num_input_tokens_seen": 108134400,
      "step": 13200
    },
    {
      "epoch": 0.42603626113139853,
      "grad_norm": 0.6035510301589966,
      "learning_rate": 4.785717035576812e-05,
      "loss": 1.036,
      "num_input_tokens_seen": 108953600,
      "step": 13300
    },
    {
      "epoch": 0.42923954129028125,
      "grad_norm": 1.6665889024734497,
      "learning_rate": 4.782270276790254e-05,
      "loss": 1.0713,
      "num_input_tokens_seen": 109772800,
      "step": 13400
    },
    {
      "epoch": 0.43244282144916396,
      "grad_norm": 0.702918291091919,
      "learning_rate": 4.778797279560876e-05,
      "loss": 0.9708,
      "num_input_tokens_seen": 110592000,
      "step": 13500
    },
    {
      "epoch": 0.4356461016080466,
      "grad_norm": 0.6358348727226257,
      "learning_rate": 4.775298083816482e-05,
      "loss": 0.9967,
      "num_input_tokens_seen": 111411200,
      "step": 13600
    },
    {
      "epoch": 0.43884938176692934,
      "grad_norm": 0.652087390422821,
      "learning_rate": 4.77177272978607e-05,
      "loss": 1.0333,
      "num_input_tokens_seen": 112230400,
      "step": 13700
    },
    {
      "epoch": 0.44205266192581205,
      "grad_norm": 0.6892516016960144,
      "learning_rate": 4.768221257999373e-05,
      "loss": 1.0308,
      "num_input_tokens_seen": 113049600,
      "step": 13800
    },
    {
      "epoch": 0.4452559420846947,
      "grad_norm": 0.6279174089431763,
      "learning_rate": 4.764643709286386e-05,
      "loss": 1.057,
      "num_input_tokens_seen": 113868800,
      "step": 13900
    },
    {
      "epoch": 0.4484592222435774,
      "grad_norm": 0.6180372834205627,
      "learning_rate": 4.761040124776904e-05,
      "loss": 1.0059,
      "num_input_tokens_seen": 114688000,
      "step": 14000
    },
    {
      "epoch": 0.45166250240246014,
      "grad_norm": 0.6153070330619812,
      "learning_rate": 4.757410545900047e-05,
      "loss": 1.0717,
      "num_input_tokens_seen": 115507200,
      "step": 14100
    },
    {
      "epoch": 0.4548657825613428,
      "grad_norm": 0.5821653604507446,
      "learning_rate": 4.7537550143837796e-05,
      "loss": 1.0313,
      "num_input_tokens_seen": 116326400,
      "step": 14200
    },
    {
      "epoch": 0.4580690627202255,
      "grad_norm": 0.5773714780807495,
      "learning_rate": 4.750073572254438e-05,
      "loss": 1.0296,
      "num_input_tokens_seen": 117145600,
      "step": 14300
    },
    {
      "epoch": 0.46127234287910823,
      "grad_norm": 0.7084370255470276,
      "learning_rate": 4.746366261836242e-05,
      "loss": 0.9977,
      "num_input_tokens_seen": 117964800,
      "step": 14400
    },
    {
      "epoch": 0.4644756230379909,
      "grad_norm": 0.719439685344696,
      "learning_rate": 4.742633125750808e-05,
      "loss": 0.9753,
      "num_input_tokens_seen": 118784000,
      "step": 14500
    },
    {
      "epoch": 0.4676789031968736,
      "grad_norm": 0.6266898512840271,
      "learning_rate": 4.738874206916665e-05,
      "loss": 0.9722,
      "num_input_tokens_seen": 119603200,
      "step": 14600
    },
    {
      "epoch": 0.47088218335575627,
      "grad_norm": 0.6483869552612305,
      "learning_rate": 4.7350895485487526e-05,
      "loss": 1.066,
      "num_input_tokens_seen": 120422400,
      "step": 14700
    },
    {
      "epoch": 0.474085463514639,
      "grad_norm": 0.5138384699821472,
      "learning_rate": 4.731279194157933e-05,
      "loss": 0.973,
      "num_input_tokens_seen": 121241600,
      "step": 14800
    },
    {
      "epoch": 0.4772887436735217,
      "grad_norm": 0.6580103039741516,
      "learning_rate": 4.727443187550481e-05,
      "loss": 0.9922,
      "num_input_tokens_seen": 122060800,
      "step": 14900
    },
    {
      "epoch": 0.48049202383240436,
      "grad_norm": 0.6680930852890015,
      "learning_rate": 4.723581572827592e-05,
      "loss": 0.9851,
      "num_input_tokens_seen": 122880000,
      "step": 15000
    },
    {
      "epoch": 0.4836953039912871,
      "grad_norm": 2.329383373260498,
      "learning_rate": 4.719694394384863e-05,
      "loss": 1.0284,
      "num_input_tokens_seen": 123699200,
      "step": 15100
    },
    {
      "epoch": 0.4868985841501698,
      "grad_norm": 0.7416221499443054,
      "learning_rate": 4.715781696911792e-05,
      "loss": 0.9828,
      "num_input_tokens_seen": 124518400,
      "step": 15200
    },
    {
      "epoch": 0.49010186430905245,
      "grad_norm": 0.5373809337615967,
      "learning_rate": 4.7118435253912575e-05,
      "loss": 0.9621,
      "num_input_tokens_seen": 125337600,
      "step": 15300
    },
    {
      "epoch": 0.49330514446793516,
      "grad_norm": 0.5429302453994751,
      "learning_rate": 4.7078799250990056e-05,
      "loss": 1.013,
      "num_input_tokens_seen": 126156800,
      "step": 15400
    },
    {
      "epoch": 0.4965084246268179,
      "grad_norm": 0.5449560284614563,
      "learning_rate": 4.7038909416031276e-05,
      "loss": 1.0564,
      "num_input_tokens_seen": 126976000,
      "step": 15500
    },
    {
      "epoch": 0.49971170478570054,
      "grad_norm": 0.6629030704498291,
      "learning_rate": 4.699876620763535e-05,
      "loss": 0.9828,
      "num_input_tokens_seen": 127795200,
      "step": 15600
    },
    {
      "epoch": 0.5029149849445832,
      "grad_norm": 0.6022646427154541,
      "learning_rate": 4.6958370087314344e-05,
      "loss": 1.0435,
      "num_input_tokens_seen": 128614400,
      "step": 15700
    },
    {
      "epoch": 0.5061182651034659,
      "grad_norm": 1.8832833766937256,
      "learning_rate": 4.691772151948799e-05,
      "loss": 0.9438,
      "num_input_tokens_seen": 129433600,
      "step": 15800
    },
    {
      "epoch": 0.5093215452623486,
      "grad_norm": 0.7114049196243286,
      "learning_rate": 4.687682097147826e-05,
      "loss": 0.947,
      "num_input_tokens_seen": 130252800,
      "step": 15900
    },
    {
      "epoch": 0.5125248254212313,
      "grad_norm": 1.7428299188613892,
      "learning_rate": 4.683566891350412e-05,
      "loss": 0.9461,
      "num_input_tokens_seen": 131072000,
      "step": 16000
    },
    {
      "epoch": 0.5157281055801141,
      "grad_norm": 0.7306798100471497,
      "learning_rate": 4.679426581867599e-05,
      "loss": 0.9964,
      "num_input_tokens_seen": 131891200,
      "step": 16100
    },
    {
      "epoch": 0.5189313857389968,
      "grad_norm": 0.6088542938232422,
      "learning_rate": 4.675261216299042e-05,
      "loss": 0.9499,
      "num_input_tokens_seen": 132710400,
      "step": 16200
    },
    {
      "epoch": 0.5221346658978794,
      "grad_norm": 1.0487473011016846,
      "learning_rate": 4.6710708425324545e-05,
      "loss": 1.0205,
      "num_input_tokens_seen": 133529600,
      "step": 16300
    },
    {
      "epoch": 0.5253379460567621,
      "grad_norm": 0.4886884093284607,
      "learning_rate": 4.6668555087430605e-05,
      "loss": 0.9996,
      "num_input_tokens_seen": 134348800,
      "step": 16400
    },
    {
      "epoch": 0.5285412262156448,
      "grad_norm": 0.8639355301856995,
      "learning_rate": 4.662615263393041e-05,
      "loss": 1.0013,
      "num_input_tokens_seen": 135168000,
      "step": 16500
    },
    {
      "epoch": 0.5317445063745275,
      "grad_norm": 2.132063865661621,
      "learning_rate": 4.658350155230976e-05,
      "loss": 1.0437,
      "num_input_tokens_seen": 135987200,
      "step": 16600
    },
    {
      "epoch": 0.5349477865334102,
      "grad_norm": 0.5800316333770752,
      "learning_rate": 4.6540602332912854e-05,
      "loss": 1.0094,
      "num_input_tokens_seen": 136806400,
      "step": 16700
    },
    {
      "epoch": 0.538151066692293,
      "grad_norm": 0.48361486196517944,
      "learning_rate": 4.6497455468936606e-05,
      "loss": 1.0141,
      "num_input_tokens_seen": 137625600,
      "step": 16800
    },
    {
      "epoch": 0.5413543468511756,
      "grad_norm": 0.5760986804962158,
      "learning_rate": 4.645406145642506e-05,
      "loss": 1.0359,
      "num_input_tokens_seen": 138444800,
      "step": 16900
    },
    {
      "epoch": 0.5445576270100583,
      "grad_norm": 0.42741426825523376,
      "learning_rate": 4.64104207942636e-05,
      "loss": 0.9605,
      "num_input_tokens_seen": 139264000,
      "step": 17000
    },
    {
      "epoch": 0.547760907168941,
      "grad_norm": 0.6151024103164673,
      "learning_rate": 4.6366533984173274e-05,
      "loss": 0.9502,
      "num_input_tokens_seen": 140083200,
      "step": 17100
    },
    {
      "epoch": 0.5509641873278237,
      "grad_norm": 5.775717735290527,
      "learning_rate": 4.6322401530704995e-05,
      "loss": 1.016,
      "num_input_tokens_seen": 140902400,
      "step": 17200
    },
    {
      "epoch": 0.5541674674867064,
      "grad_norm": 0.5886793732643127,
      "learning_rate": 4.627802394123375e-05,
      "loss": 1.0039,
      "num_input_tokens_seen": 141721600,
      "step": 17300
    },
    {
      "epoch": 0.5573707476455891,
      "grad_norm": 2.4064829349517822,
      "learning_rate": 4.623340172595277e-05,
      "loss": 0.9972,
      "num_input_tokens_seen": 142540800,
      "step": 17400
    },
    {
      "epoch": 0.5605740278044717,
      "grad_norm": 0.5964205861091614,
      "learning_rate": 4.6188535397867675e-05,
      "loss": 0.9894,
      "num_input_tokens_seen": 143360000,
      "step": 17500
    },
    {
      "epoch": 0.5637773079633545,
      "grad_norm": 0.5683798789978027,
      "learning_rate": 4.614342547279052e-05,
      "loss": 1.0721,
      "num_input_tokens_seen": 144179200,
      "step": 17600
    },
    {
      "epoch": 0.5669805881222372,
      "grad_norm": 0.5441416501998901,
      "learning_rate": 4.609807246933395e-05,
      "loss": 1.0183,
      "num_input_tokens_seen": 144998400,
      "step": 17700
    },
    {
      "epoch": 0.5701838682811199,
      "grad_norm": 2.547898530960083,
      "learning_rate": 4.605247690890518e-05,
      "loss": 1.0083,
      "num_input_tokens_seen": 145817600,
      "step": 17800
    },
    {
      "epoch": 0.5733871484400026,
      "grad_norm": 0.7640330791473389,
      "learning_rate": 4.600663931570001e-05,
      "loss": 0.9927,
      "num_input_tokens_seen": 146636800,
      "step": 17900
    },
    {
      "epoch": 0.5765904285988852,
      "grad_norm": 0.6045035123825073,
      "learning_rate": 4.596056021669681e-05,
      "loss": 1.0144,
      "num_input_tokens_seen": 147456000,
      "step": 18000
    },
    {
      "epoch": 0.5797937087577679,
      "grad_norm": 0.5718028545379639,
      "learning_rate": 4.591424014165047e-05,
      "loss": 1.0417,
      "num_input_tokens_seen": 148275200,
      "step": 18100
    },
    {
      "epoch": 0.5829969889166506,
      "grad_norm": 0.49183499813079834,
      "learning_rate": 4.586767962308625e-05,
      "loss": 1.0124,
      "num_input_tokens_seen": 149094400,
      "step": 18200
    },
    {
      "epoch": 0.5862002690755334,
      "grad_norm": 0.5138664841651917,
      "learning_rate": 4.5820879196293756e-05,
      "loss": 0.9961,
      "num_input_tokens_seen": 149913600,
      "step": 18300
    },
    {
      "epoch": 0.5894035492344161,
      "grad_norm": 0.6507889628410339,
      "learning_rate": 4.577383939932069e-05,
      "loss": 1.0066,
      "num_input_tokens_seen": 150732800,
      "step": 18400
    },
    {
      "epoch": 0.5926068293932988,
      "grad_norm": 0.48219242691993713,
      "learning_rate": 4.572656077296676e-05,
      "loss": 1.0422,
      "num_input_tokens_seen": 151552000,
      "step": 18500
    },
    {
      "epoch": 0.5958101095521814,
      "grad_norm": 2.981851100921631,
      "learning_rate": 4.567904386077734e-05,
      "loss": 1.0647,
      "num_input_tokens_seen": 152371200,
      "step": 18600
    },
    {
      "epoch": 0.5990133897110641,
      "grad_norm": 1.6492716073989868,
      "learning_rate": 4.563128920903735e-05,
      "loss": 1.0465,
      "num_input_tokens_seen": 153190400,
      "step": 18700
    },
    {
      "epoch": 0.6022166698699468,
      "grad_norm": 0.6568962335586548,
      "learning_rate": 4.558329736676488e-05,
      "loss": 1.0505,
      "num_input_tokens_seen": 154009600,
      "step": 18800
    },
    {
      "epoch": 0.6054199500288295,
      "grad_norm": 0.77339768409729,
      "learning_rate": 4.553506888570494e-05,
      "loss": 1.0287,
      "num_input_tokens_seen": 154828800,
      "step": 18900
    },
    {
      "epoch": 0.6086232301877122,
      "grad_norm": 0.6354805827140808,
      "learning_rate": 4.548660432032307e-05,
      "loss": 0.9675,
      "num_input_tokens_seen": 155648000,
      "step": 19000
    },
    {
      "epoch": 0.611826510346595,
      "grad_norm": 0.6528341770172119,
      "learning_rate": 4.5437904227799e-05,
      "loss": 1.0027,
      "num_input_tokens_seen": 156467200,
      "step": 19100
    },
    {
      "epoch": 0.6150297905054776,
      "grad_norm": 0.7518653273582458,
      "learning_rate": 4.538896916802023e-05,
      "loss": 1.0002,
      "num_input_tokens_seen": 157286400,
      "step": 19200
    },
    {
      "epoch": 0.6182330706643603,
      "grad_norm": 1.2601783275604248,
      "learning_rate": 4.533979970357558e-05,
      "loss": 1.0698,
      "num_input_tokens_seen": 158105600,
      "step": 19300
    },
    {
      "epoch": 0.621436350823243,
      "grad_norm": 0.7242873311042786,
      "learning_rate": 4.529039639974876e-05,
      "loss": 0.9834,
      "num_input_tokens_seen": 158924800,
      "step": 19400
    },
    {
      "epoch": 0.6246396309821257,
      "grad_norm": 2.0396833419799805,
      "learning_rate": 4.524075982451183e-05,
      "loss": 0.9634,
      "num_input_tokens_seen": 159744000,
      "step": 19500
    },
    {
      "epoch": 0.6278429111410084,
      "grad_norm": 2.7037477493286133,
      "learning_rate": 4.5190890548518696e-05,
      "loss": 1.0221,
      "num_input_tokens_seen": 160563200,
      "step": 19600
    },
    {
      "epoch": 0.631046191299891,
      "grad_norm": 1.6231496334075928,
      "learning_rate": 4.5140789145098536e-05,
      "loss": 1.0582,
      "num_input_tokens_seen": 161382400,
      "step": 19700
    },
    {
      "epoch": 0.6342494714587738,
      "grad_norm": 0.6004766225814819,
      "learning_rate": 4.509045619024921e-05,
      "loss": 1.0112,
      "num_input_tokens_seen": 162201600,
      "step": 19800
    },
    {
      "epoch": 0.6374527516176565,
      "grad_norm": 12.123788833618164,
      "learning_rate": 4.5039892262630656e-05,
      "loss": 1.0078,
      "num_input_tokens_seen": 163020800,
      "step": 19900
    },
    {
      "epoch": 0.6406560317765392,
      "grad_norm": 3.2375683784484863,
      "learning_rate": 4.498909794355821e-05,
      "loss": 1.0239,
      "num_input_tokens_seen": 163840000,
      "step": 20000
    },
    {
      "epoch": 0.6438593119354219,
      "grad_norm": 0.8260817527770996,
      "learning_rate": 4.493807381699595e-05,
      "loss": 1.009,
      "num_input_tokens_seen": 164659200,
      "step": 20100
    },
    {
      "epoch": 0.6470625920943046,
      "grad_norm": 0.7712699174880981,
      "learning_rate": 4.488682046954994e-05,
      "loss": 0.9565,
      "num_input_tokens_seen": 165478400,
      "step": 20200
    },
    {
      "epoch": 0.6502658722531872,
      "grad_norm": 0.5889214277267456,
      "learning_rate": 4.483533849046155e-05,
      "loss": 1.0225,
      "num_input_tokens_seen": 166297600,
      "step": 20300
    },
    {
      "epoch": 0.6534691524120699,
      "grad_norm": 1.2388112545013428,
      "learning_rate": 4.4783628471600636e-05,
      "loss": 1.0642,
      "num_input_tokens_seen": 167116800,
      "step": 20400
    },
    {
      "epoch": 0.6566724325709526,
      "grad_norm": 0.6664971709251404,
      "learning_rate": 4.473169100745871e-05,
      "loss": 0.9598,
      "num_input_tokens_seen": 167936000,
      "step": 20500
    },
    {
      "epoch": 0.6598757127298354,
      "grad_norm": 0.5350831151008606,
      "learning_rate": 4.4679526695142195e-05,
      "loss": 1.0391,
      "num_input_tokens_seen": 168755200,
      "step": 20600
    },
    {
      "epoch": 0.6630789928887181,
      "grad_norm": 0.6643035411834717,
      "learning_rate": 4.4627136134365463e-05,
      "loss": 0.998,
      "num_input_tokens_seen": 169574400,
      "step": 20700
    },
    {
      "epoch": 0.6662822730476008,
      "grad_norm": 0.5972053408622742,
      "learning_rate": 4.457451992744402e-05,
      "loss": 1.0335,
      "num_input_tokens_seen": 170393600,
      "step": 20800
    },
    {
      "epoch": 0.6694855532064834,
      "grad_norm": 0.5102434754371643,
      "learning_rate": 4.452167867928751e-05,
      "loss": 1.0459,
      "num_input_tokens_seen": 171212800,
      "step": 20900
    },
    {
      "epoch": 0.6726888333653661,
      "grad_norm": 0.5346103310585022,
      "learning_rate": 4.4468612997392824e-05,
      "loss": 0.9922,
      "num_input_tokens_seen": 172032000,
      "step": 21000
    },
    {
      "epoch": 0.6758921135242488,
      "grad_norm": 0.5129193663597107,
      "learning_rate": 4.441532349183706e-05,
      "loss": 1.0024,
      "num_input_tokens_seen": 172851200,
      "step": 21100
    },
    {
      "epoch": 0.6790953936831315,
      "grad_norm": 0.5462967753410339,
      "learning_rate": 4.4361810775270554e-05,
      "loss": 0.994,
      "num_input_tokens_seen": 173670400,
      "step": 21200
    },
    {
      "epoch": 0.6822986738420143,
      "grad_norm": 1.2343724966049194,
      "learning_rate": 4.430807546290982e-05,
      "loss": 0.9669,
      "num_input_tokens_seen": 174489600,
      "step": 21300
    },
    {
      "epoch": 0.685501954000897,
      "grad_norm": 0.653947651386261,
      "learning_rate": 4.425411817253048e-05,
      "loss": 1.0029,
      "num_input_tokens_seen": 175308800,
      "step": 21400
    },
    {
      "epoch": 0.6887052341597796,
      "grad_norm": 2.948323965072632,
      "learning_rate": 4.419993952446013e-05,
      "loss": 1.0158,
      "num_input_tokens_seen": 176128000,
      "step": 21500
    },
    {
      "epoch": 0.6919085143186623,
      "grad_norm": 1.577588438987732,
      "learning_rate": 4.414554014157127e-05,
      "loss": 1.0571,
      "num_input_tokens_seen": 176947200,
      "step": 21600
    },
    {
      "epoch": 0.695111794477545,
      "grad_norm": 1.0136100053787231,
      "learning_rate": 4.4090920649274095e-05,
      "loss": 0.9647,
      "num_input_tokens_seen": 177766400,
      "step": 21700
    },
    {
      "epoch": 0.6983150746364277,
      "grad_norm": 0.5571495294570923,
      "learning_rate": 4.40360816755093e-05,
      "loss": 0.9609,
      "num_input_tokens_seen": 178585600,
      "step": 21800
    },
    {
      "epoch": 0.7015183547953104,
      "grad_norm": 0.5548049211502075,
      "learning_rate": 4.3981023850740926e-05,
      "loss": 0.9524,
      "num_input_tokens_seen": 179404800,
      "step": 21900
    },
    {
      "epoch": 0.704721634954193,
      "grad_norm": 0.9693801999092102,
      "learning_rate": 4.392574780794901e-05,
      "loss": 0.9641,
      "num_input_tokens_seen": 180224000,
      "step": 22000
    },
    {
      "epoch": 0.7079249151130758,
      "grad_norm": 0.6628372669219971,
      "learning_rate": 4.387025418262242e-05,
      "loss": 0.9838,
      "num_input_tokens_seen": 181043200,
      "step": 22100
    },
    {
      "epoch": 0.7111281952719585,
      "grad_norm": 0.5312179923057556,
      "learning_rate": 4.381454361275143e-05,
      "loss": 1.0309,
      "num_input_tokens_seen": 181862400,
      "step": 22200
    },
    {
      "epoch": 0.7143314754308412,
      "grad_norm": 0.6137087941169739,
      "learning_rate": 4.3758616738820506e-05,
      "loss": 1.0029,
      "num_input_tokens_seen": 182681600,
      "step": 22300
    },
    {
      "epoch": 0.7175347555897239,
      "grad_norm": 1.6591495275497437,
      "learning_rate": 4.370247420380085e-05,
      "loss": 0.9842,
      "num_input_tokens_seen": 183500800,
      "step": 22400
    },
    {
      "epoch": 0.7207380357486066,
      "grad_norm": 0.677762508392334,
      "learning_rate": 4.3646116653143046e-05,
      "loss": 0.9606,
      "num_input_tokens_seen": 184320000,
      "step": 22500
    },
    {
      "epoch": 0.7239413159074892,
      "grad_norm": 0.602687418460846,
      "learning_rate": 4.358954473476965e-05,
      "loss": 0.9781,
      "num_input_tokens_seen": 185139200,
      "step": 22600
    },
    {
      "epoch": 0.7271445960663719,
      "grad_norm": 0.5638014674186707,
      "learning_rate": 4.353275909906772e-05,
      "loss": 0.9823,
      "num_input_tokens_seen": 185958400,
      "step": 22700
    },
    {
      "epoch": 0.7303478762252547,
      "grad_norm": 1.6680676937103271,
      "learning_rate": 4.3475760398881325e-05,
      "loss": 0.988,
      "num_input_tokens_seen": 186777600,
      "step": 22800
    },
    {
      "epoch": 0.7335511563841374,
      "grad_norm": 0.6449896097183228,
      "learning_rate": 4.3418549289504096e-05,
      "loss": 0.9878,
      "num_input_tokens_seen": 187596800,
      "step": 22900
    },
    {
      "epoch": 0.7367544365430201,
      "grad_norm": 2.6768717765808105,
      "learning_rate": 4.3361126428671636e-05,
      "loss": 1.0091,
      "num_input_tokens_seen": 188416000,
      "step": 23000
    },
    {
      "epoch": 0.7399577167019028,
      "grad_norm": 1.079026460647583,
      "learning_rate": 4.330349247655398e-05,
      "loss": 1.0383,
      "num_input_tokens_seen": 189235200,
      "step": 23100
    },
    {
      "epoch": 0.7431609968607854,
      "grad_norm": 0.6426740288734436,
      "learning_rate": 4.324564809574799e-05,
      "loss": 0.9801,
      "num_input_tokens_seen": 190054400,
      "step": 23200
    },
    {
      "epoch": 0.7463642770196681,
      "grad_norm": 0.8264270424842834,
      "learning_rate": 4.318759395126979e-05,
      "loss": 1.0095,
      "num_input_tokens_seen": 190873600,
      "step": 23300
    },
    {
      "epoch": 0.7495675571785508,
      "grad_norm": 0.5160927176475525,
      "learning_rate": 4.3129330710547035e-05,
      "loss": 0.9601,
      "num_input_tokens_seen": 191692800,
      "step": 23400
    },
    {
      "epoch": 0.7527708373374336,
      "grad_norm": 0.6011959910392761,
      "learning_rate": 4.307085904341133e-05,
      "loss": 0.9837,
      "num_input_tokens_seen": 192512000,
      "step": 23500
    },
    {
      "epoch": 0.7559741174963163,
      "grad_norm": 0.5961838960647583,
      "learning_rate": 4.3012179622090436e-05,
      "loss": 0.9647,
      "num_input_tokens_seen": 193331200,
      "step": 23600
    },
    {
      "epoch": 0.7591773976551989,
      "grad_norm": 0.8201313614845276,
      "learning_rate": 4.295329312120063e-05,
      "loss": 0.9439,
      "num_input_tokens_seen": 194150400,
      "step": 23700
    },
    {
      "epoch": 0.7623806778140816,
      "grad_norm": 0.5474829077720642,
      "learning_rate": 4.289420021773889e-05,
      "loss": 0.9708,
      "num_input_tokens_seen": 194969600,
      "step": 23800
    },
    {
      "epoch": 0.7655839579729643,
      "grad_norm": 0.5124524235725403,
      "learning_rate": 4.283490159107513e-05,
      "loss": 1.0109,
      "num_input_tokens_seen": 195788800,
      "step": 23900
    },
    {
      "epoch": 0.768787238131847,
      "grad_norm": 0.6800445318222046,
      "learning_rate": 4.27753979229444e-05,
      "loss": 1.0119,
      "num_input_tokens_seen": 196608000,
      "step": 24000
    },
    {
      "epoch": 0.7719905182907297,
      "grad_norm": 0.5350146889686584,
      "learning_rate": 4.271568989743903e-05,
      "loss": 0.9659,
      "num_input_tokens_seen": 197427200,
      "step": 24100
    },
    {
      "epoch": 0.7751937984496124,
      "grad_norm": 0.6650831699371338,
      "learning_rate": 4.265577820100076e-05,
      "loss": 0.9729,
      "num_input_tokens_seen": 198246400,
      "step": 24200
    },
    {
      "epoch": 0.778397078608495,
      "grad_norm": 0.5228304862976074,
      "learning_rate": 4.2595663522412884e-05,
      "loss": 0.9633,
      "num_input_tokens_seen": 199065600,
      "step": 24300
    },
    {
      "epoch": 0.7816003587673778,
      "grad_norm": 0.532375693321228,
      "learning_rate": 4.253534655279232e-05,
      "loss": 0.9687,
      "num_input_tokens_seen": 199884800,
      "step": 24400
    },
    {
      "epoch": 0.7848036389262605,
      "grad_norm": 0.8860092759132385,
      "learning_rate": 4.247482798558161e-05,
      "loss": 1.0017,
      "num_input_tokens_seen": 200704000,
      "step": 24500
    },
    {
      "epoch": 0.7880069190851432,
      "grad_norm": 2.975177526473999,
      "learning_rate": 4.241410851654102e-05,
      "loss": 0.9905,
      "num_input_tokens_seen": 201523200,
      "step": 24600
    },
    {
      "epoch": 0.7912101992440259,
      "grad_norm": 0.622031033039093,
      "learning_rate": 4.235318884374051e-05,
      "loss": 1.0358,
      "num_input_tokens_seen": 202342400,
      "step": 24700
    },
    {
      "epoch": 0.7944134794029086,
      "grad_norm": 1.7574553489685059,
      "learning_rate": 4.229206966755172e-05,
      "loss": 1.0105,
      "num_input_tokens_seen": 203161600,
      "step": 24800
    },
    {
      "epoch": 0.7976167595617912,
      "grad_norm": 0.7439371347427368,
      "learning_rate": 4.223075169063989e-05,
      "loss": 0.9345,
      "num_input_tokens_seen": 203980800,
      "step": 24900
    },
    {
      "epoch": 0.800820039720674,
      "grad_norm": 0.5452560782432556,
      "learning_rate": 4.21692356179558e-05,
      "loss": 0.9655,
      "num_input_tokens_seen": 204800000,
      "step": 25000
    },
    {
      "epoch": 0.8040233198795567,
      "grad_norm": 0.5876986384391785,
      "learning_rate": 4.210752215672769e-05,
      "loss": 0.949,
      "num_input_tokens_seen": 205619200,
      "step": 25100
    },
    {
      "epoch": 0.8072266000384394,
      "grad_norm": 2.6809980869293213,
      "learning_rate": 4.204561201645307e-05,
      "loss": 1.0082,
      "num_input_tokens_seen": 206438400,
      "step": 25200
    },
    {
      "epoch": 0.8104298801973221,
      "grad_norm": 0.647762656211853,
      "learning_rate": 4.198350590889064e-05,
      "loss": 1.0074,
      "num_input_tokens_seen": 207257600,
      "step": 25300
    },
    {
      "epoch": 0.8136331603562047,
      "grad_norm": 0.4822922945022583,
      "learning_rate": 4.192120454805203e-05,
      "loss": 0.9638,
      "num_input_tokens_seen": 208076800,
      "step": 25400
    },
    {
      "epoch": 0.8168364405150874,
      "grad_norm": 9.964862823486328,
      "learning_rate": 4.185870865019364e-05,
      "loss": 0.9793,
      "num_input_tokens_seen": 208896000,
      "step": 25500
    },
    {
      "epoch": 0.8200397206739701,
      "grad_norm": 0.6270651817321777,
      "learning_rate": 4.17960189338084e-05,
      "loss": 0.9515,
      "num_input_tokens_seen": 209715200,
      "step": 25600
    },
    {
      "epoch": 0.8232430008328528,
      "grad_norm": 0.5813098549842834,
      "learning_rate": 4.17331361196175e-05,
      "loss": 0.9659,
      "num_input_tokens_seen": 210534400,
      "step": 25700
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 0.5864317417144775,
      "learning_rate": 4.167006093056209e-05,
      "loss": 1.0496,
      "num_input_tokens_seen": 211353600,
      "step": 25800
    },
    {
      "epoch": 0.8296495611506183,
      "grad_norm": 2.7955405712127686,
      "learning_rate": 4.1606794091795e-05,
      "loss": 0.9466,
      "num_input_tokens_seen": 212172800,
      "step": 25900
    },
    {
      "epoch": 0.8328528413095009,
      "grad_norm": 0.5431935787200928,
      "learning_rate": 4.154333633067238e-05,
      "loss": 0.9308,
      "num_input_tokens_seen": 212992000,
      "step": 26000
    },
    {
      "epoch": 0.8360561214683836,
      "grad_norm": 2.313504934310913,
      "learning_rate": 4.147968837674535e-05,
      "loss": 0.9996,
      "num_input_tokens_seen": 213811200,
      "step": 26100
    },
    {
      "epoch": 0.8392594016272663,
      "grad_norm": 0.6028672456741333,
      "learning_rate": 4.141585096175162e-05,
      "loss": 0.9862,
      "num_input_tokens_seen": 214630400,
      "step": 26200
    },
    {
      "epoch": 0.842462681786149,
      "grad_norm": 1.6038614511489868,
      "learning_rate": 4.1351824819607056e-05,
      "loss": 1.0175,
      "num_input_tokens_seen": 215449600,
      "step": 26300
    },
    {
      "epoch": 0.8456659619450317,
      "grad_norm": 0.6132040619850159,
      "learning_rate": 4.128761068639723e-05,
      "loss": 0.9903,
      "num_input_tokens_seen": 216268800,
      "step": 26400
    },
    {
      "epoch": 0.8488692421039145,
      "grad_norm": 1.7026666402816772,
      "learning_rate": 4.122320930036902e-05,
      "loss": 1.0261,
      "num_input_tokens_seen": 217088000,
      "step": 26500
    },
    {
      "epoch": 0.8520725222627971,
      "grad_norm": 0.6355572938919067,
      "learning_rate": 4.1158621401922046e-05,
      "loss": 1.0048,
      "num_input_tokens_seen": 217907200,
      "step": 26600
    },
    {
      "epoch": 0.8552758024216798,
      "grad_norm": 0.683513879776001,
      "learning_rate": 4.109384773360023e-05,
      "loss": 0.9659,
      "num_input_tokens_seen": 218726400,
      "step": 26700
    },
    {
      "epoch": 0.8584790825805625,
      "grad_norm": 0.6867396831512451,
      "learning_rate": 4.10288890400832e-05,
      "loss": 1.0134,
      "num_input_tokens_seen": 219545600,
      "step": 26800
    },
    {
      "epoch": 0.8616823627394452,
      "grad_norm": 0.4578529894351959,
      "learning_rate": 4.0963746068177744e-05,
      "loss": 1.0011,
      "num_input_tokens_seen": 220364800,
      "step": 26900
    },
    {
      "epoch": 0.8648856428983279,
      "grad_norm": 0.5275700688362122,
      "learning_rate": 4.089841956680927e-05,
      "loss": 1.0777,
      "num_input_tokens_seen": 221184000,
      "step": 27000
    },
    {
      "epoch": 0.8680889230572106,
      "grad_norm": 0.5704593658447266,
      "learning_rate": 4.08329102870131e-05,
      "loss": 1.0113,
      "num_input_tokens_seen": 222003200,
      "step": 27100
    },
    {
      "epoch": 0.8712922032160932,
      "grad_norm": 0.5546739101409912,
      "learning_rate": 4.076721898192597e-05,
      "loss": 1.0181,
      "num_input_tokens_seen": 222822400,
      "step": 27200
    },
    {
      "epoch": 0.874495483374976,
      "grad_norm": 0.4796381890773773,
      "learning_rate": 4.070134640677722e-05,
      "loss": 0.9882,
      "num_input_tokens_seen": 223641600,
      "step": 27300
    },
    {
      "epoch": 0.8776987635338587,
      "grad_norm": 8.13311767578125,
      "learning_rate": 4.063529331888024e-05,
      "loss": 0.9378,
      "num_input_tokens_seen": 224460800,
      "step": 27400
    },
    {
      "epoch": 0.8809020436927414,
      "grad_norm": 0.4969484806060791,
      "learning_rate": 4.056906047762368e-05,
      "loss": 0.9867,
      "num_input_tokens_seen": 225280000,
      "step": 27500
    },
    {
      "epoch": 0.8841053238516241,
      "grad_norm": 3.9572601318359375,
      "learning_rate": 4.0502648644462774e-05,
      "loss": 0.9645,
      "num_input_tokens_seen": 226099200,
      "step": 27600
    },
    {
      "epoch": 0.8873086040105067,
      "grad_norm": 2.1928722858428955,
      "learning_rate": 4.043605858291053e-05,
      "loss": 0.9678,
      "num_input_tokens_seen": 226918400,
      "step": 27700
    },
    {
      "epoch": 0.8905118841693894,
      "grad_norm": 0.7099782824516296,
      "learning_rate": 4.036929105852901e-05,
      "loss": 1.0127,
      "num_input_tokens_seen": 227737600,
      "step": 27800
    },
    {
      "epoch": 0.8937151643282721,
      "grad_norm": 0.6126459836959839,
      "learning_rate": 4.0302346838920514e-05,
      "loss": 1.0439,
      "num_input_tokens_seen": 228556800,
      "step": 27900
    },
    {
      "epoch": 0.8969184444871549,
      "grad_norm": 0.6163774728775024,
      "learning_rate": 4.02352266937187e-05,
      "loss": 0.9393,
      "num_input_tokens_seen": 229376000,
      "step": 28000
    },
    {
      "epoch": 0.9001217246460376,
      "grad_norm": 0.6306945085525513,
      "learning_rate": 4.016793139457982e-05,
      "loss": 0.8966,
      "num_input_tokens_seen": 230195200,
      "step": 28100
    },
    {
      "epoch": 0.9033250048049203,
      "grad_norm": 0.6520447134971619,
      "learning_rate": 4.0100461715173777e-05,
      "loss": 0.9861,
      "num_input_tokens_seen": 231014400,
      "step": 28200
    },
    {
      "epoch": 0.9065282849638029,
      "grad_norm": 0.5960193276405334,
      "learning_rate": 4.003281843117528e-05,
      "loss": 1.0012,
      "num_input_tokens_seen": 231833600,
      "step": 28300
    },
    {
      "epoch": 0.9097315651226856,
      "grad_norm": 0.6080912947654724,
      "learning_rate": 3.9965002320254924e-05,
      "loss": 0.9602,
      "num_input_tokens_seen": 232652800,
      "step": 28400
    },
    {
      "epoch": 0.9129348452815683,
      "grad_norm": 0.6659435033798218,
      "learning_rate": 3.989701416207019e-05,
      "loss": 0.988,
      "num_input_tokens_seen": 233472000,
      "step": 28500
    },
    {
      "epoch": 0.916138125440451,
      "grad_norm": 2.5207667350769043,
      "learning_rate": 3.9828854738256564e-05,
      "loss": 1.0339,
      "num_input_tokens_seen": 234291200,
      "step": 28600
    },
    {
      "epoch": 0.9193414055993337,
      "grad_norm": 2.4952239990234375,
      "learning_rate": 3.976052483241849e-05,
      "loss": 1.0025,
      "num_input_tokens_seen": 235110400,
      "step": 28700
    },
    {
      "epoch": 0.9225446857582165,
      "grad_norm": 0.6766204237937927,
      "learning_rate": 3.969202523012038e-05,
      "loss": 1.0335,
      "num_input_tokens_seen": 235929600,
      "step": 28800
    },
    {
      "epoch": 0.9257479659170991,
      "grad_norm": 0.666861891746521,
      "learning_rate": 3.9623356718877605e-05,
      "loss": 0.9721,
      "num_input_tokens_seen": 236748800,
      "step": 28900
    },
    {
      "epoch": 0.9289512460759818,
      "grad_norm": 0.5322718620300293,
      "learning_rate": 3.955452008814741e-05,
      "loss": 0.9866,
      "num_input_tokens_seen": 237568000,
      "step": 29000
    },
    {
      "epoch": 0.9321545262348645,
      "grad_norm": 0.6603706479072571,
      "learning_rate": 3.9485516129319844e-05,
      "loss": 0.9863,
      "num_input_tokens_seen": 238387200,
      "step": 29100
    },
    {
      "epoch": 0.9353578063937472,
      "grad_norm": 0.6650800704956055,
      "learning_rate": 3.9416345635708676e-05,
      "loss": 0.9902,
      "num_input_tokens_seen": 239206400,
      "step": 29200
    },
    {
      "epoch": 0.9385610865526299,
      "grad_norm": 2.477098226547241,
      "learning_rate": 3.9347009402542256e-05,
      "loss": 0.991,
      "num_input_tokens_seen": 240025600,
      "step": 29300
    },
    {
      "epoch": 0.9417643667115125,
      "grad_norm": 0.6523051261901855,
      "learning_rate": 3.9277508226954394e-05,
      "loss": 0.9851,
      "num_input_tokens_seen": 240844800,
      "step": 29400
    },
    {
      "epoch": 0.9449676468703953,
      "grad_norm": 0.7197608351707458,
      "learning_rate": 3.920784290797519e-05,
      "loss": 1.0144,
      "num_input_tokens_seen": 241664000,
      "step": 29500
    },
    {
      "epoch": 0.948170927029278,
"grad_norm": 0.6857073903083801,
|
|
"learning_rate": 3.9138014246521806e-05,
|
|
"loss": 0.9529,
|
|
"num_input_tokens_seen": 242483200,
|
|
"step": 29600
|
|
},
|
|
{
|
|
"epoch": 0.9513742071881607,
|
|
"grad_norm": 0.616074800491333,
|
|
"learning_rate": 3.906802304538935e-05,
|
|
"loss": 0.9949,
|
|
"num_input_tokens_seen": 243302400,
|
|
"step": 29700
|
|
},
|
|
{
|
|
"epoch": 0.9545774873470434,
|
|
"grad_norm": 0.5982092022895813,
|
|
"learning_rate": 3.899787010924152e-05,
|
|
"loss": 0.9596,
|
|
"num_input_tokens_seen": 244121600,
|
|
"step": 29800
|
|
},
|
|
{
|
|
"epoch": 0.9577807675059261,
|
|
"grad_norm": 0.6943311095237732,
|
|
"learning_rate": 3.8927556244601495e-05,
|
|
"loss": 0.9813,
|
|
"num_input_tokens_seen": 244940800,
|
|
"step": 29900
|
|
},
|
|
{
|
|
"epoch": 0.9609840476648087,
|
|
"grad_norm": 0.7715808153152466,
|
|
"learning_rate": 3.885708225984254e-05,
|
|
"loss": 0.9747,
|
|
"num_input_tokens_seen": 245760000,
|
|
"step": 30000
|
|
},
|
|
{
|
|
"epoch": 0.9641873278236914,
|
|
"grad_norm": 0.6129135489463806,
|
|
"learning_rate": 3.878644896517879e-05,
|
|
"loss": 0.9933,
|
|
"num_input_tokens_seen": 246579200,
|
|
"step": 30100
|
|
},
|
|
{
|
|
"epoch": 0.9673906079825741,
|
|
"grad_norm": 0.7009174227714539,
|
|
"learning_rate": 3.87156571726559e-05,
|
|
"loss": 0.964,
|
|
"num_input_tokens_seen": 247398400,
|
|
"step": 30200
|
|
},
|
|
{
|
|
"epoch": 0.9705938881414569,
|
|
"grad_norm": 0.7255650758743286,
|
|
"learning_rate": 3.8644707696141704e-05,
|
|
"loss": 0.9784,
|
|
"num_input_tokens_seen": 248217600,
|
|
"step": 30300
|
|
},
|
|
{
|
|
"epoch": 0.9737971683003396,
|
|
"grad_norm": 4.299106597900391,
|
|
"learning_rate": 3.857360135131691e-05,
|
|
"loss": 1.0191,
|
|
"num_input_tokens_seen": 249036800,
|
|
"step": 30400
|
|
},
|
|
{
|
|
"epoch": 0.9770004484592223,
|
|
"grad_norm": 0.5924736261367798,
|
|
"learning_rate": 3.8502338955665644e-05,
|
|
"loss": 0.9769,
|
|
"num_input_tokens_seen": 249856000,
|
|
"step": 30500
|
|
},
|
|
{
|
|
"epoch": 0.9802037286181049,
|
|
"grad_norm": 0.7270549535751343,
|
|
"learning_rate": 3.843092132846613e-05,
|
|
"loss": 1.0179,
|
|
"num_input_tokens_seen": 250675200,
|
|
"step": 30600
|
|
},
|
|
{
|
|
"epoch": 0.9834070087769876,
|
|
"grad_norm": 0.7704394459724426,
|
|
"learning_rate": 3.835934929078119e-05,
|
|
"loss": 0.9206,
|
|
"num_input_tokens_seen": 251494400,
|
|
"step": 30700
|
|
},
|
|
{
|
|
"epoch": 0.9866102889358703,
|
|
"grad_norm": 0.612688422203064,
|
|
"learning_rate": 3.828762366544888e-05,
|
|
"loss": 0.9686,
|
|
"num_input_tokens_seen": 252313600,
|
|
"step": 30800
|
|
},
|
|
{
|
|
"epoch": 0.989813569094753,
|
|
"grad_norm": 0.5262284278869629,
|
|
"learning_rate": 3.8215745277073e-05,
|
|
"loss": 0.9694,
|
|
"num_input_tokens_seen": 253132800,
|
|
"step": 30900
|
|
},
|
|
{
|
|
"epoch": 0.9930168492536358,
|
|
"grad_norm": 0.5798372626304626,
|
|
"learning_rate": 3.8143714952013584e-05,
|
|
"loss": 0.8879,
|
|
"num_input_tokens_seen": 253952000,
|
|
"step": 31000
|
|
},
|
|
{
|
|
"epoch": 0.9962201294125185,
|
|
"grad_norm": 0.5605859756469727,
|
|
"learning_rate": 3.807153351837746e-05,
|
|
"loss": 0.9948,
|
|
"num_input_tokens_seen": 254771200,
|
|
"step": 31100
|
|
},
|
|
{
|
|
"epoch": 0.9994234095714011,
|
|
"grad_norm": 1.9532912969589233,
|
|
"learning_rate": 3.799920180600868e-05,
|
|
"loss": 1.027,
|
|
"num_input_tokens_seen": 255590400,
|
|
"step": 31200
|
|
},
|
|
{
|
|
"epoch": 1.0026266897302838,
|
|
"grad_norm": 0.6683017611503601,
|
|
"learning_rate": 3.792672064647898e-05,
|
|
"loss": 0.9665,
|
|
"num_input_tokens_seen": 256409600,
|
|
"step": 31300
|
|
},
|
|
{
|
|
"epoch": 1.0058299698891664,
|
|
"grad_norm": 0.5574291348457336,
|
|
"learning_rate": 3.785409087307828e-05,
|
|
"loss": 0.8671,
|
|
"num_input_tokens_seen": 257228800,
|
|
"step": 31400
|
|
},
|
|
{
|
|
"epoch": 1.0090332500480492,
|
|
"grad_norm": 0.6487427949905396,
|
|
"learning_rate": 3.778131332080503e-05,
|
|
"loss": 0.9356,
|
|
"num_input_tokens_seen": 258048000,
|
|
"step": 31500
|
|
},
|
|
{
|
|
"epoch": 1.0122365302069318,
|
|
"grad_norm": 0.6974719166755676,
|
|
"learning_rate": 3.7708388826356636e-05,
|
|
"loss": 0.9751,
|
|
"num_input_tokens_seen": 258867200,
|
|
"step": 31600
|
|
},
|
|
{
|
|
"epoch": 1.0154398103658147,
|
|
"grad_norm": 0.6754201054573059,
|
|
"learning_rate": 3.763531822811986e-05,
|
|
"loss": 0.8963,
|
|
"num_input_tokens_seen": 259686400,
|
|
"step": 31700
|
|
},
|
|
{
|
|
"epoch": 1.0186430905246973,
|
|
"grad_norm": 0.5839199423789978,
|
|
"learning_rate": 3.756210236616117e-05,
|
|
"loss": 0.9021,
|
|
"num_input_tokens_seen": 260505600,
|
|
"step": 31800
|
|
},
|
|
{
|
|
"epoch": 1.02184637068358,
|
|
"grad_norm": 0.5535345673561096,
|
|
"learning_rate": 3.7488742082217064e-05,
|
|
"loss": 0.947,
|
|
"num_input_tokens_seen": 261324800,
|
|
"step": 31900
|
|
},
|
|
{
|
|
"epoch": 1.0250496508424627,
|
|
"grad_norm": 1.948480248451233,
|
|
"learning_rate": 3.741523821968441e-05,
|
|
"loss": 0.9314,
|
|
"num_input_tokens_seen": 262144000,
|
|
"step": 32000
|
|
},
|
|
{
|
|
"epoch": 1.0282529310013453,
|
|
"grad_norm": 0.8400202393531799,
|
|
"learning_rate": 3.734159162361077e-05,
|
|
"loss": 0.9523,
|
|
"num_input_tokens_seen": 262963200,
|
|
"step": 32100
|
|
},
|
|
{
|
|
"epoch": 1.0314562111602281,
|
|
"grad_norm": 0.7016623020172119,
|
|
"learning_rate": 3.7267803140684635e-05,
|
|
"loss": 0.9119,
|
|
"num_input_tokens_seen": 263782400,
|
|
"step": 32200
|
|
},
|
|
{
|
|
"epoch": 1.0346594913191107,
|
|
"grad_norm": 0.6084064841270447,
|
|
"learning_rate": 3.719387361922573e-05,
|
|
"loss": 0.9027,
|
|
"num_input_tokens_seen": 264601600,
|
|
"step": 32300
|
|
},
|
|
{
|
|
"epoch": 1.0378627714779936,
|
|
"grad_norm": 1.551859736442566,
|
|
"learning_rate": 3.711980390917523e-05,
|
|
"loss": 0.9126,
|
|
"num_input_tokens_seen": 265420800,
|
|
"step": 32400
|
|
},
|
|
{
|
|
"epoch": 1.0410660516368762,
|
|
"grad_norm": 0.6663823127746582,
|
|
"learning_rate": 3.7045594862086065e-05,
|
|
"loss": 0.909,
|
|
"num_input_tokens_seen": 266240000,
|
|
"step": 32500
|
|
},
|
|
{
|
|
"epoch": 1.0442693317957588,
|
|
"grad_norm": 0.6280916333198547,
|
|
"learning_rate": 3.697124733111299e-05,
|
|
"loss": 0.8809,
|
|
"num_input_tokens_seen": 267059200,
|
|
"step": 32600
|
|
},
|
|
{
|
|
"epoch": 1.0474726119546416,
|
|
"grad_norm": 0.7370727062225342,
|
|
"learning_rate": 3.689676217100293e-05,
|
|
"loss": 0.9155,
|
|
"num_input_tokens_seen": 267878400,
|
|
"step": 32700
|
|
},
|
|
{
|
|
"epoch": 1.0506758921135242,
|
|
"grad_norm": 0.5798324942588806,
|
|
"learning_rate": 3.682214023808506e-05,
|
|
"loss": 0.9514,
|
|
"num_input_tokens_seen": 268697600,
|
|
"step": 32800
|
|
},
|
|
{
|
|
"epoch": 1.053879172272407,
|
|
"grad_norm": 0.6621294021606445,
|
|
"learning_rate": 3.674738239026097e-05,
|
|
"loss": 0.9057,
|
|
"num_input_tokens_seen": 269516800,
|
|
"step": 32900
|
|
},
|
|
{
|
|
"epoch": 1.0570824524312896,
|
|
"grad_norm": 0.9696263074874878,
|
|
"learning_rate": 3.667248948699482e-05,
|
|
"loss": 0.9083,
|
|
"num_input_tokens_seen": 270336000,
|
|
"step": 33000
|
|
},
|
|
{
|
|
"epoch": 1.0602857325901724,
|
|
"grad_norm": 1.3327863216400146,
|
|
"learning_rate": 3.659746238930345e-05,
|
|
"loss": 0.9211,
|
|
"num_input_tokens_seen": 271155200,
|
|
"step": 33100
|
|
},
|
|
{
|
|
"epoch": 1.063489012749055,
|
|
"grad_norm": 0.7066917419433594,
|
|
"learning_rate": 3.6522301959746514e-05,
|
|
"loss": 0.9384,
|
|
"num_input_tokens_seen": 271974400,
|
|
"step": 33200
|
|
},
|
|
{
|
|
"epoch": 1.0666922929079377,
|
|
"grad_norm": 0.6944926977157593,
|
|
"learning_rate": 3.6447009062416506e-05,
|
|
"loss": 0.9296,
|
|
"num_input_tokens_seen": 272793600,
|
|
"step": 33300
|
|
},
|
|
{
|
|
"epoch": 1.0698955730668205,
|
|
"grad_norm": 2.94767165184021,
|
|
"learning_rate": 3.637158456292885e-05,
|
|
"loss": 0.8913,
|
|
"num_input_tokens_seen": 273612800,
|
|
"step": 33400
|
|
},
|
|
{
|
|
"epoch": 1.073098853225703,
|
|
"grad_norm": 0.671801745891571,
|
|
"learning_rate": 3.629602932841199e-05,
|
|
"loss": 0.9251,
|
|
"num_input_tokens_seen": 274432000,
|
|
"step": 33500
|
|
},
|
|
{
|
|
"epoch": 1.076302133384586,
|
|
"grad_norm": 0.6639389991760254,
|
|
"learning_rate": 3.622034422749734e-05,
|
|
"loss": 0.9024,
|
|
"num_input_tokens_seen": 275251200,
|
|
"step": 33600
|
|
},
|
|
{
|
|
"epoch": 1.0795054135434685,
|
|
"grad_norm": 0.6131206154823303,
|
|
"learning_rate": 3.614453013030936e-05,
|
|
"loss": 0.8965,
|
|
"num_input_tokens_seen": 276070400,
|
|
"step": 33700
|
|
},
|
|
{
|
|
"epoch": 1.0827086937023511,
|
|
"grad_norm": 2.824341058731079,
|
|
"learning_rate": 3.606858790845555e-05,
|
|
"loss": 0.9058,
|
|
"num_input_tokens_seen": 276889600,
|
|
"step": 33800
|
|
},
|
|
{
|
|
"epoch": 1.085911973861234,
|
|
"grad_norm": 0.4830228388309479,
|
|
"learning_rate": 3.5992518435016376e-05,
|
|
"loss": 0.9052,
|
|
"num_input_tokens_seen": 277708800,
|
|
"step": 33900
|
|
},
|
|
{
|
|
"epoch": 1.0891152540201166,
|
|
"grad_norm": 0.49670127034187317,
|
|
"learning_rate": 3.59163225845353e-05,
|
|
"loss": 0.9027,
|
|
"num_input_tokens_seen": 278528000,
|
|
"step": 34000
|
|
},
|
|
{
|
|
"epoch": 1.0923185341789994,
|
|
"grad_norm": 0.7440226674079895,
|
|
"learning_rate": 3.584000123300869e-05,
|
|
"loss": 0.8947,
|
|
"num_input_tokens_seen": 279347200,
|
|
"step": 34100
|
|
},
|
|
{
|
|
"epoch": 1.095521814337882,
|
|
"grad_norm": 0.515023410320282,
|
|
"learning_rate": 3.576355525787576e-05,
|
|
"loss": 0.8998,
|
|
"num_input_tokens_seen": 280166400,
|
|
"step": 34200
|
|
},
|
|
{
|
|
"epoch": 1.0987250944967646,
|
|
"grad_norm": 0.8011521100997925,
|
|
"learning_rate": 3.5686985538008445e-05,
|
|
"loss": 0.8951,
|
|
"num_input_tokens_seen": 280985600,
|
|
"step": 34300
|
|
},
|
|
{
|
|
"epoch": 1.1019283746556474,
|
|
"grad_norm": 0.5452113151550293,
|
|
"learning_rate": 3.561029295370138e-05,
|
|
"loss": 0.9009,
|
|
"num_input_tokens_seen": 281804800,
|
|
"step": 34400
|
|
},
|
|
{
|
|
"epoch": 1.10513165481453,
|
|
"grad_norm": 0.8674356937408447,
|
|
"learning_rate": 3.5533478386661665e-05,
|
|
"loss": 0.9592,
|
|
"num_input_tokens_seen": 282624000,
|
|
"step": 34500
|
|
},
|
|
{
|
|
"epoch": 1.1083349349734128,
|
|
"grad_norm": 0.653605043888092,
|
|
"learning_rate": 3.545654271999886e-05,
|
|
"loss": 0.8587,
|
|
"num_input_tokens_seen": 283443200,
|
|
"step": 34600
|
|
},
|
|
{
|
|
"epoch": 1.1115382151322954,
|
|
"grad_norm": 0.5951905846595764,
|
|
"learning_rate": 3.5379486838214715e-05,
|
|
"loss": 0.906,
|
|
"num_input_tokens_seen": 284262400,
|
|
"step": 34700
|
|
},
|
|
{
|
|
"epoch": 1.1147414952911783,
|
|
"grad_norm": 0.6143243908882141,
|
|
"learning_rate": 3.530231162719307e-05,
|
|
"loss": 0.8925,
|
|
"num_input_tokens_seen": 285081600,
|
|
"step": 34800
|
|
},
|
|
{
|
|
"epoch": 1.1179447754500609,
|
|
"grad_norm": 0.569734513759613,
|
|
"learning_rate": 3.5225017974189644e-05,
|
|
"loss": 0.8922,
|
|
"num_input_tokens_seen": 285900800,
|
|
"step": 34900
|
|
},
|
|
{
|
|
"epoch": 1.1211480556089435,
|
|
"grad_norm": 1.6546896696090698,
|
|
"learning_rate": 3.5147606767821846e-05,
|
|
"loss": 0.884,
|
|
"num_input_tokens_seen": 286720000,
|
|
"step": 35000
|
|
},
|
|
{
|
|
"epoch": 1.1243513357678263,
|
|
"grad_norm": 0.7131773829460144,
|
|
"learning_rate": 3.507007889805856e-05,
|
|
"loss": 0.8941,
|
|
"num_input_tokens_seen": 287539200,
|
|
"step": 35100
|
|
},
|
|
{
|
|
"epoch": 1.127554615926709,
|
|
"grad_norm": 1.8620835542678833,
|
|
"learning_rate": 3.499243525620988e-05,
|
|
"loss": 0.9209,
|
|
"num_input_tokens_seen": 288358400,
|
|
"step": 35200
|
|
},
|
|
{
|
|
"epoch": 1.1307578960855917,
|
|
"grad_norm": 1.936231017112732,
|
|
"learning_rate": 3.491467673491692e-05,
|
|
"loss": 0.9284,
|
|
"num_input_tokens_seen": 289177600,
|
|
"step": 35300
|
|
},
|
|
{
|
|
"epoch": 1.1339611762444743,
|
|
"grad_norm": 0.5847631096839905,
|
|
"learning_rate": 3.483680422814152e-05,
|
|
"loss": 0.9036,
|
|
"num_input_tokens_seen": 289996800,
|
|
"step": 35400
|
|
},
|
|
{
|
|
"epoch": 1.137164456403357,
|
|
"grad_norm": 0.6272117495536804,
|
|
"learning_rate": 3.4758818631155934e-05,
|
|
"loss": 0.8766,
|
|
"num_input_tokens_seen": 290816000,
|
|
"step": 35500
|
|
},
|
|
{
|
|
"epoch": 1.1403677365622398,
|
|
"grad_norm": 0.50895756483078,
|
|
"learning_rate": 3.4680720840532636e-05,
|
|
"loss": 0.8996,
|
|
"num_input_tokens_seen": 291635200,
|
|
"step": 35600
|
|
},
|
|
{
|
|
"epoch": 1.1435710167211224,
|
|
"grad_norm": 0.8421196341514587,
|
|
"learning_rate": 3.460251175413388e-05,
|
|
"loss": 0.932,
|
|
"num_input_tokens_seen": 292454400,
|
|
"step": 35700
|
|
},
|
|
{
|
|
"epoch": 1.1467742968800052,
|
|
"grad_norm": 1.1610244512557983,
|
|
"learning_rate": 3.452419227110151e-05,
|
|
"loss": 0.9095,
|
|
"num_input_tokens_seen": 293273600,
|
|
"step": 35800
|
|
},
|
|
{
|
|
"epoch": 1.1499775770388878,
|
|
"grad_norm": 0.5575504302978516,
|
|
"learning_rate": 3.444576329184651e-05,
|
|
"loss": 0.9166,
|
|
"num_input_tokens_seen": 294092800,
|
|
"step": 35900
|
|
},
|
|
{
|
|
"epoch": 1.1531808571977704,
|
|
"grad_norm": 0.5330684781074524,
|
|
"learning_rate": 3.436722571803874e-05,
|
|
"loss": 0.9445,
|
|
"num_input_tokens_seen": 294912000,
|
|
"step": 36000
|
|
},
|
|
{
|
|
"epoch": 1.1563841373566532,
|
|
"grad_norm": 0.7490949630737305,
|
|
"learning_rate": 3.428858045259652e-05,
|
|
"loss": 0.8947,
|
|
"num_input_tokens_seen": 295731200,
|
|
"step": 36100
|
|
},
|
|
{
|
|
"epoch": 1.1595874175155358,
|
|
"grad_norm": 1.870923399925232,
|
|
"learning_rate": 3.420982839967624e-05,
|
|
"loss": 0.9532,
|
|
"num_input_tokens_seen": 296550400,
|
|
"step": 36200
|
|
},
|
|
{
|
|
"epoch": 1.1627906976744187,
|
|
"grad_norm": 3.164524555206299,
|
|
"learning_rate": 3.413097046466203e-05,
|
|
"loss": 0.9716,
|
|
"num_input_tokens_seen": 297369600,
|
|
"step": 36300
|
|
},
|
|
{
|
|
"epoch": 1.1659939778333013,
|
|
"grad_norm": 1.375303864479065,
|
|
"learning_rate": 3.405200755415527e-05,
|
|
"loss": 0.9364,
|
|
"num_input_tokens_seen": 298188800,
|
|
"step": 36400
|
|
},
|
|
{
|
|
"epoch": 1.169197257992184,
|
|
"grad_norm": 2.2876625061035156,
|
|
"learning_rate": 3.397294057596424e-05,
|
|
"loss": 0.8933,
|
|
"num_input_tokens_seen": 299008000,
|
|
"step": 36500
|
|
},
|
|
{
|
|
"epoch": 1.1724005381510667,
|
|
"grad_norm": 0.5776546597480774,
|
|
"learning_rate": 3.389377043909361e-05,
|
|
"loss": 0.8916,
|
|
"num_input_tokens_seen": 299827200,
|
|
"step": 36600
|
|
},
|
|
{
|
|
"epoch": 1.1756038183099493,
|
|
"grad_norm": 0.7254892587661743,
|
|
"learning_rate": 3.381449805373406e-05,
|
|
"loss": 0.922,
|
|
"num_input_tokens_seen": 300646400,
|
|
"step": 36700
|
|
},
|
|
{
|
|
"epoch": 1.1788070984688321,
|
|
"grad_norm": 0.7244319319725037,
|
|
"learning_rate": 3.3735124331251764e-05,
|
|
"loss": 0.9093,
|
|
"num_input_tokens_seen": 301465600,
|
|
"step": 36800
|
|
},
|
|
{
|
|
"epoch": 1.1820103786277147,
|
|
"grad_norm": 0.5166808366775513,
|
|
"learning_rate": 3.3655650184177957e-05,
|
|
"loss": 0.9553,
|
|
"num_input_tokens_seen": 302284800,
|
|
"step": 36900
|
|
},
|
|
{
|
|
"epoch": 1.1852136587865976,
|
|
"grad_norm": 1.6987115144729614,
|
|
"learning_rate": 3.357607652619839e-05,
|
|
"loss": 0.8768,
|
|
"num_input_tokens_seen": 303104000,
|
|
"step": 37000
|
|
},
|
|
{
|
|
"epoch": 1.1884169389454802,
|
|
"grad_norm": 0.8271929621696472,
|
|
"learning_rate": 3.349640427214287e-05,
|
|
"loss": 0.9632,
|
|
"num_input_tokens_seen": 303923200,
|
|
"step": 37100
|
|
},
|
|
{
|
|
"epoch": 1.1916202191043628,
|
|
"grad_norm": 0.7163927555084229,
|
|
"learning_rate": 3.341663433797474e-05,
|
|
"loss": 0.8682,
|
|
"num_input_tokens_seen": 304742400,
|
|
"step": 37200
|
|
},
|
|
{
|
|
"epoch": 1.1948234992632456,
|
|
"grad_norm": 0.6233458518981934,
|
|
"learning_rate": 3.33367676407803e-05,
|
|
"loss": 0.9334,
|
|
"num_input_tokens_seen": 305561600,
|
|
"step": 37300
|
|
},
|
|
{
|
|
"epoch": 1.1980267794221282,
|
|
"grad_norm": 1.0882517099380493,
|
|
"learning_rate": 3.3256805098758346e-05,
|
|
"loss": 0.9073,
|
|
"num_input_tokens_seen": 306380800,
|
|
"step": 37400
|
|
},
|
|
{
|
|
"epoch": 1.201230059581011,
|
|
"grad_norm": 0.8322218656539917,
|
|
"learning_rate": 3.3176747631209534e-05,
|
|
"loss": 0.9343,
|
|
"num_input_tokens_seen": 307200000,
|
|
"step": 37500
|
|
},
|
|
{
|
|
"epoch": 1.2044333397398936,
|
|
"grad_norm": 1.4540088176727295,
|
|
"learning_rate": 3.309659615852586e-05,
|
|
"loss": 0.8541,
|
|
"num_input_tokens_seen": 308019200,
|
|
"step": 37600
|
|
},
|
|
{
|
|
"epoch": 1.2076366198987762,
|
|
"grad_norm": 0.6830178499221802,
|
|
"learning_rate": 3.301635160218005e-05,
|
|
"loss": 0.8889,
|
|
"num_input_tokens_seen": 308838400,
|
|
"step": 37700
|
|
},
|
|
{
|
|
"epoch": 1.210839900057659,
|
|
"grad_norm": 1.9847421646118164,
|
|
"learning_rate": 3.293601488471499e-05,
|
|
"loss": 0.883,
|
|
"num_input_tokens_seen": 309657600,
|
|
"step": 37800
|
|
},
|
|
{
|
|
"epoch": 1.2140431802165417,
|
|
"grad_norm": 0.8129870891571045,
|
|
"learning_rate": 3.285558692973312e-05,
|
|
"loss": 0.9474,
|
|
"num_input_tokens_seen": 310476800,
|
|
"step": 37900
|
|
},
|
|
{
|
|
"epoch": 1.2172464603754245,
|
|
"grad_norm": 0.6733205914497375,
|
|
"learning_rate": 3.277506866188577e-05,
|
|
"loss": 0.904,
|
|
"num_input_tokens_seen": 311296000,
|
|
"step": 38000
|
|
},
|
|
{
|
|
"epoch": 1.220449740534307,
|
|
"grad_norm": 1.2211860418319702,
|
|
"learning_rate": 3.269446100686261e-05,
|
|
"loss": 0.8879,
|
|
"num_input_tokens_seen": 312115200,
|
|
"step": 38100
|
|
},
|
|
{
|
|
"epoch": 1.22365302069319,
|
|
"grad_norm": 0.7225973010063171,
|
|
"learning_rate": 3.261376489138092e-05,
|
|
"loss": 0.9139,
|
|
"num_input_tokens_seen": 312934400,
|
|
"step": 38200
|
|
},
|
|
{
|
|
"epoch": 1.2268563008520725,
|
|
"grad_norm": 0.7631468772888184,
|
|
"learning_rate": 3.253298124317502e-05,
|
|
"loss": 0.959,
|
|
"num_input_tokens_seen": 313753600,
|
|
"step": 38300
|
|
},
|
|
{
|
|
"epoch": 1.2300595810109551,
|
|
"grad_norm": 0.6244317889213562,
|
|
"learning_rate": 3.245211099098551e-05,
|
|
"loss": 0.9155,
|
|
"num_input_tokens_seen": 314572800,
|
|
"step": 38400
|
|
},
|
|
{
|
|
"epoch": 1.233262861169838,
|
|
"grad_norm": 0.5164452791213989,
|
|
"learning_rate": 3.237115506454869e-05,
|
|
"loss": 0.8758,
|
|
"num_input_tokens_seen": 315392000,
|
|
"step": 38500
|
|
},
|
|
{
|
|
"epoch": 1.2364661413287206,
|
|
"grad_norm": 0.7463127970695496,
|
|
"learning_rate": 3.2290114394585815e-05,
|
|
"loss": 0.9116,
|
|
"num_input_tokens_seen": 316211200,
|
|
"step": 38600
|
|
},
|
|
{
|
|
"epoch": 1.2396694214876034,
|
|
"grad_norm": 0.697425901889801,
|
|
"learning_rate": 3.22089899127924e-05,
|
|
"loss": 0.8743,
|
|
"num_input_tokens_seen": 317030400,
|
|
"step": 38700
|
|
},
|
|
{
|
|
"epoch": 1.242872701646486,
|
|
"grad_norm": 0.6725397706031799,
|
|
"learning_rate": 3.212778255182752e-05,
|
|
"loss": 0.9507,
|
|
"num_input_tokens_seen": 317849600,
|
|
"step": 38800
|
|
},
|
|
{
|
|
"epoch": 1.2460759818053686,
|
|
"grad_norm": 0.5633911490440369,
|
|
"learning_rate": 3.2046493245303066e-05,
|
|
"loss": 0.9114,
|
|
"num_input_tokens_seen": 318668800,
|
|
"step": 38900
|
|
},
|
|
{
|
|
"epoch": 1.2492792619642514,
|
|
"grad_norm": 0.4953620135784149,
|
|
"learning_rate": 3.196512292777305e-05,
|
|
"loss": 0.9392,
|
|
"num_input_tokens_seen": 319488000,
|
|
"step": 39000
|
|
},
|
|
{
|
|
"epoch": 1.252482542123134,
|
|
"grad_norm": 0.5511077642440796,
|
|
"learning_rate": 3.1883672534722824e-05,
|
|
"loss": 0.9277,
|
|
"num_input_tokens_seen": 320307200,
|
|
"step": 39100
|
|
},
|
|
{
|
|
"epoch": 1.2556858222820169,
|
|
"grad_norm": 1.671002745628357,
|
|
"learning_rate": 3.180214300255834e-05,
|
|
"loss": 0.8868,
|
|
"num_input_tokens_seen": 321126400,
|
|
"step": 39200
|
|
},
|
|
{
|
|
"epoch": 1.2588891024408995,
|
|
"grad_norm": 0.47333982586860657,
|
|
"learning_rate": 3.1720535268595406e-05,
|
|
"loss": 0.9129,
|
|
"num_input_tokens_seen": 321945600,
|
|
"step": 39300
|
|
},
|
|
{
|
|
"epoch": 1.262092382599782,
|
|
"grad_norm": 0.6256750226020813,
|
|
"learning_rate": 3.1638850271048845e-05,
|
|
"loss": 0.9237,
|
|
"num_input_tokens_seen": 322764800,
|
|
"step": 39400
|
|
},
|
|
{
|
|
"epoch": 1.265295662758665,
|
|
"grad_norm": 1.6359134912490845,
|
|
"learning_rate": 3.15570889490218e-05,
|
|
"loss": 0.8913,
|
|
"num_input_tokens_seen": 323584000,
|
|
"step": 39500
|
|
},
|
|
{
|
|
"epoch": 1.2684989429175475,
|
|
"grad_norm": 0.7079516649246216,
|
|
"learning_rate": 3.1475252242494855e-05,
|
|
"loss": 0.9312,
|
|
"num_input_tokens_seen": 324403200,
|
|
"step": 39600
|
|
},
|
|
{
|
|
"epoch": 1.2717022230764303,
|
|
"grad_norm": 0.5469818711280823,
|
|
"learning_rate": 3.139334109231527e-05,
|
|
"loss": 0.8776,
|
|
"num_input_tokens_seen": 325222400,
|
|
"step": 39700
|
|
},
|
|
{
|
|
"epoch": 1.274905503235313,
|
|
"grad_norm": 0.6753129959106445,
|
|
"learning_rate": 3.131135644018617e-05,
|
|
"loss": 0.9715,
|
|
"num_input_tokens_seen": 326041600,
|
|
"step": 39800
|
|
},
|
|
{
|
|
"epoch": 1.2781087833941958,
|
|
"grad_norm": 1.3139586448669434,
|
|
"learning_rate": 3.1229299228655683e-05,
|
|
"loss": 0.9268,
|
|
"num_input_tokens_seen": 326860800,
|
|
"step": 39900
|
|
},
|
|
{
|
|
"epoch": 1.2813120635530784,
|
|
"grad_norm": 0.6371886730194092,
|
|
"learning_rate": 3.1147170401106154e-05,
|
|
"loss": 0.9286,
|
|
"num_input_tokens_seen": 327680000,
|
|
"step": 40000
|
|
},
|
|
{
|
|
"epoch": 1.284515343711961,
|
|
"grad_norm": 0.9212737083435059,
|
|
"learning_rate": 3.106497090174325e-05,
|
|
"loss": 0.9317,
|
|
"num_input_tokens_seen": 328499200,
|
|
"step": 40100
|
|
},
|
|
{
|
|
"epoch": 1.2877186238708438,
|
|
"grad_norm": 0.6135571002960205,
|
|
"learning_rate": 3.098270167558514e-05,
|
|
"loss": 0.9152,
|
|
"num_input_tokens_seen": 329318400,
|
|
"step": 40200
|
|
},
|
|
{
|
|
"epoch": 1.2909219040297264,
|
|
"grad_norm": 0.6993789076805115,
|
|
"learning_rate": 3.09003636684516e-05,
|
|
"loss": 0.9283,
|
|
"num_input_tokens_seen": 330137600,
|
|
"step": 40300
|
|
},
|
|
{
|
|
"epoch": 1.294125184188609,
|
|
"grad_norm": 0.7431827783584595,
|
|
"learning_rate": 3.081795782695317e-05,
|
|
"loss": 0.9307,
|
|
"num_input_tokens_seen": 330956800,
|
|
"step": 40400
|
|
},
|
|
{
|
|
"epoch": 1.2973284643474918,
|
|
"grad_norm": 0.9774760603904724,
|
|
"learning_rate": 3.0735485098480255e-05,
|
|
"loss": 0.8917,
|
|
"num_input_tokens_seen": 331776000,
|
|
"step": 40500
|
|
},
|
|
{
|
|
"epoch": 1.3005317445063747,
|
|
"grad_norm": 0.5644115209579468,
|
|
"learning_rate": 3.0652946431192244e-05,
|
|
"loss": 0.9321,
|
|
"num_input_tokens_seen": 332595200,
|
|
"step": 40600
|
|
},
|
|
{
|
|
"epoch": 1.3037350246652573,
|
|
"grad_norm": 2.2749266624450684,
|
|
"learning_rate": 3.057034277400658e-05,
|
|
"loss": 0.9211,
|
|
"num_input_tokens_seen": 333414400,
|
|
"step": 40700
|
|
},
|
|
{
|
|
"epoch": 1.3069383048241399,
|
|
"grad_norm": 0.6312987804412842,
|
|
"learning_rate": 3.048767507658788e-05,
|
|
"loss": 0.913,
|
|
"num_input_tokens_seen": 334233600,
|
|
"step": 40800
|
|
},
|
|
{
|
|
"epoch": 1.3101415849830227,
|
|
"grad_norm": 0.5494056344032288,
|
|
"learning_rate": 3.0404944289337034e-05,
|
|
"loss": 0.9423,
|
|
"num_input_tokens_seen": 335052800,
|
|
"step": 40900
|
|
},
|
|
{
|
|
"epoch": 1.3133448651419053,
|
|
"grad_norm": 1.3932960033416748,
|
|
"learning_rate": 3.0322151363380202e-05,
|
|
"loss": 0.9409,
|
|
"num_input_tokens_seen": 335872000,
|
|
"step": 41000
|
|
},
|
|
{
|
|
"epoch": 1.316548145300788,
|
|
"grad_norm": 0.7711178660392761,
|
|
"learning_rate": 3.023929725055798e-05,
|
|
"loss": 0.9187,
|
|
"num_input_tokens_seen": 336691200,
|
|
"step": 41100
|
|
},
|
|
{
|
|
"epoch": 1.3197514254596707,
|
|
"grad_norm": 0.9086521863937378,
|
|
"learning_rate": 3.0156382903414383e-05,
|
|
"loss": 1.0063,
|
|
"num_input_tokens_seen": 337510400,
|
|
"step": 41200
|
|
},
|
|
{
|
|
"epoch": 1.3229547056185533,
|
|
"grad_norm": 0.6938414573669434,
|
|
"learning_rate": 3.007340927518591e-05,
|
|
"loss": 0.8821,
|
|
"num_input_tokens_seen": 338329600,
|
|
"step": 41300
|
|
},
|
|
{
|
|
"epoch": 1.3261579857774362,
|
|
"grad_norm": 0.5269713401794434,
|
|
"learning_rate": 2.999037731979063e-05,
|
|
"loss": 0.8968,
|
|
"num_input_tokens_seen": 339148800,
|
|
"step": 41400
|
|
},
|
|
{
|
|
"epoch": 1.3293612659363188,
|
|
"grad_norm": 0.69822096824646,
|
|
"learning_rate": 2.9907287991817128e-05,
|
|
"loss": 0.955,
|
|
"num_input_tokens_seen": 339968000,
|
|
"step": 41500
|
|
},
|
|
{
|
|
"epoch": 1.3325645460952016,
|
|
"grad_norm": 1.9268356561660767,
|
|
"learning_rate": 2.9824142246513624e-05,
|
|
"loss": 0.9096,
|
|
"num_input_tokens_seen": 340787200,
|
|
"step": 41600
|
|
},
|
|
{
|
|
"epoch": 1.3357678262540842,
|
|
"grad_norm": 0.5475559234619141,
|
|
"learning_rate": 2.9740941039776925e-05,
|
|
"loss": 0.8828,
|
|
"num_input_tokens_seen": 341606400,
|
|
"step": 41700
|
|
},
|
|
{
|
|
"epoch": 1.3389711064129668,
|
|
"grad_norm": 1.9515366554260254,
|
|
"learning_rate": 2.9657685328141466e-05,
|
|
"loss": 0.9614,
|
|
"num_input_tokens_seen": 342425600,
|
|
"step": 41800
|
|
},
|
|
{
|
|
"epoch": 1.3421743865718496,
|
|
"grad_norm": 0.6959076523780823,
|
|
"learning_rate": 2.95743760687683e-05,
|
|
"loss": 0.8739,
|
|
"num_input_tokens_seen": 343244800,
|
|
"step": 41900
|
|
},
|
|
{
|
|
"epoch": 1.3453776667307322,
|
|
"grad_norm": 0.761962890625,
|
|
"learning_rate": 2.9491014219434105e-05,
|
|
"loss": 0.9595,
|
|
"num_input_tokens_seen": 344064000,
|
|
"step": 42000
|
|
},
|
|
{
|
|
"epoch": 1.3485809468896148,
|
|
"grad_norm": 0.6127232909202576,
|
|
"learning_rate": 2.9407600738520162e-05,
|
|
"loss": 0.9026,
|
|
"num_input_tokens_seen": 344883200,
|
|
"step": 42100
|
|
},
|
|
{
|
|
"epoch": 1.3517842270484977,
|
|
"grad_norm": 0.6869720220565796,
|
|
"learning_rate": 2.9324136585001348e-05,
|
|
"loss": 0.9488,
|
|
"num_input_tokens_seen": 345702400,
|
|
"step": 42200
|
|
},
|
|
{
|
|
"epoch": 1.3549875072073805,
|
|
"grad_norm": 0.7109299898147583,
|
|
"learning_rate": 2.9240622718435107e-05,
|
|
"loss": 0.9433,
|
|
"num_input_tokens_seen": 346521600,
|
|
"step": 42300
|
|
},
|
|
{
|
|
"epoch": 1.358190787366263,
|
|
"grad_norm": 0.6879071593284607,
|
|
"learning_rate": 2.9157060098950395e-05,
|
|
"loss": 0.8783,
|
|
"num_input_tokens_seen": 347340800,
|
|
"step": 42400
|
|
},
|
|
{
|
|
"epoch": 1.3613940675251457,
|
|
"grad_norm": 0.5623328685760498,
|
|
"learning_rate": 2.9073449687236688e-05,
|
|
"loss": 0.8925,
|
|
"num_input_tokens_seen": 348160000,
|
|
"step": 42500
|
|
},
|
|
{
|
|
"epoch": 1.3645973476840285,
|
|
"grad_norm": 0.9881012439727783,
|
|
"learning_rate": 2.8989792444532892e-05,
|
|
"loss": 0.9417,
|
|
"num_input_tokens_seen": 348979200,
|
|
"step": 42600
|
|
},
|
|
{
|
|
"epoch": 1.3678006278429111,
|
|
"grad_norm": 0.6569281816482544,
|
|
"learning_rate": 2.890608933261633e-05,
|
|
"loss": 0.9262,
|
|
"num_input_tokens_seen": 349798400,
|
|
"step": 42700
|
|
},
|
|
{
|
|
"epoch": 1.3710039080017937,
|
|
"grad_norm": 0.9453611969947815,
|
|
"learning_rate": 2.882234131379167e-05,
|
|
"loss": 0.9022,
|
|
"num_input_tokens_seen": 350617600,
|
|
"step": 42800
|
|
},
|
|
{
|
|
"epoch": 1.3742071881606766,
|
|
"grad_norm": 0.5668920874595642,
|
|
"learning_rate": 2.8738549350879824e-05,
|
|
"loss": 0.9306,
|
|
"num_input_tokens_seen": 351436800,
|
|
"step": 42900
|
|
},
|
|
{
|
|
"epoch": 1.3774104683195592,
|
|
"grad_norm": 0.8056479692459106,
|
|
"learning_rate": 2.8654714407206956e-05,
|
|
"loss": 0.8878,
|
|
"num_input_tokens_seen": 352256000,
|
|
"step": 43000
|
|
},
|
|
{
|
|
"epoch": 1.380613748478442,
|
|
"grad_norm": 0.863929271697998,
|
|
"learning_rate": 2.8570837446593336e-05,
|
|
"loss": 0.9391,
|
|
"num_input_tokens_seen": 353075200,
|
|
"step": 43100
|
|
},
|
|
{
|
|
"epoch": 1.3838170286373246,
|
|
"grad_norm": 0.5808566808700562,
|
|
"learning_rate": 2.8486919433342295e-05,
|
|
"loss": 0.9061,
|
|
"num_input_tokens_seen": 353894400,
|
|
"step": 43200
|
|
},
|
|
{
|
|
"epoch": 1.3870203087962074,
|
|
"grad_norm": 0.8920639157295227,
|
|
"learning_rate": 2.8402961332229143e-05,
|
|
"loss": 0.8854,
|
|
"num_input_tokens_seen": 354713600,
|
|
"step": 43300
|
|
},
|
|
{
|
|
"epoch": 1.39022358895509,
|
|
"grad_norm": 0.6987112760543823,
|
|
"learning_rate": 2.831896410849005e-05,
|
|
"loss": 0.893,
|
|
"num_input_tokens_seen": 355532800,
|
|
"step": 43400
|
|
},
|
|
{
|
|
"epoch": 1.3934268691139726,
|
|
"grad_norm": 0.6486085653305054,
|
|
"learning_rate": 2.823492872781098e-05,
|
|
"loss": 0.9166,
|
|
"num_input_tokens_seen": 356352000,
|
|
"step": 43500
|
|
},
|
|
{
|
|
"epoch": 1.3966301492728554,
|
|
"grad_norm": 1.6597498655319214,
|
|
"learning_rate": 2.815085615631654e-05,
|
|
"loss": 0.9473,
|
|
"num_input_tokens_seen": 357171200,
|
|
"step": 43600
|
|
},
|
|
{
|
|
"epoch": 1.399833429431738,
|
|
"grad_norm": 0.598414957523346,
|
|
"learning_rate": 2.8066747360558966e-05,
|
|
"loss": 0.9046,
|
|
"num_input_tokens_seen": 357990400,
|
|
"step": 43700
|
|
},
|
|
{
|
|
"epoch": 1.4030367095906209,
|
|
"grad_norm": 2.125504732131958,
|
|
"learning_rate": 2.798260330750689e-05,
|
|
"loss": 0.9325,
|
|
"num_input_tokens_seen": 358809600,
|
|
"step": 43800
|
|
},
|
|
{
|
|
"epoch": 1.4062399897495035,
|
|
"grad_norm": 0.798989474773407,
|
|
"learning_rate": 2.789842496453432e-05,
|
|
"loss": 0.9057,
|
|
"num_input_tokens_seen": 359628800,
|
|
"step": 43900
|
|
},
|
|
{
|
|
"epoch": 1.4094432699083863,
|
|
"grad_norm": 0.8189502954483032,
|
|
"learning_rate": 2.7814213299409475e-05,
|
|
"loss": 0.923,
|
|
"num_input_tokens_seen": 360448000,
|
|
"step": 44000
|
|
},
|
|
{
|
|
"epoch": 1.412646550067269,
|
|
"grad_norm": 0.5460119247436523,
|
|
"learning_rate": 2.7729969280283662e-05,
|
|
"loss": 0.8764,
|
|
"num_input_tokens_seen": 361267200,
|
|
"step": 44100
|
|
},
|
|
{
|
|
"epoch": 1.4158498302261515,
|
|
"grad_norm": 0.6900705695152283,
|
|
"learning_rate": 2.7645693875680163e-05,
|
|
"loss": 0.9295,
|
|
"num_input_tokens_seen": 362086400,
|
|
"step": 44200
|
|
},
|
|
{
|
|
"epoch": 1.4190531103850343,
|
|
"grad_norm": 0.7309842705726624,
|
|
"learning_rate": 2.7561388054483074e-05,
|
|
"loss": 0.8883,
|
|
"num_input_tokens_seen": 362905600,
|
|
"step": 44300
|
|
},
|
|
{
|
|
"epoch": 1.422256390543917,
|
|
"grad_norm": 0.9340581297874451,
|
|
"learning_rate": 2.7477052785926178e-05,
|
|
"loss": 0.8784,
|
|
"num_input_tokens_seen": 363724800,
|
|
"step": 44400
|
|
},
|
|
{
|
|
"epoch": 1.4254596707027996,
|
|
"grad_norm": 0.6001551151275635,
|
|
"learning_rate": 2.7392689039581815e-05,
|
|
"loss": 0.949,
|
|
"num_input_tokens_seen": 364544000,
|
|
"step": 44500
|
|
},
|
|
{
|
|
"epoch": 1.4286629508616824,
|
|
"grad_norm": 0.5180249810218811,
|
|
"learning_rate": 2.7308297785349724e-05,
|
|
"loss": 0.8738,
|
|
"num_input_tokens_seen": 365363200,
|
|
"step": 44600
|
|
},
|
|
{
|
|
"epoch": 1.431866231020565,
|
|
"grad_norm": 0.6243082284927368,
|
|
"learning_rate": 2.7223879993445873e-05,
|
|
"loss": 0.9074,
|
|
"num_input_tokens_seen": 366182400,
|
|
"step": 44700
|
|
},
|
|
{
|
|
"epoch": 1.4350695111794478,
|
|
"grad_norm": 0.6807756423950195,
|
|
"learning_rate": 2.713943663439135e-05,
|
|
"loss": 0.953,
|
|
"num_input_tokens_seen": 367001600,
|
|
"step": 44800
|
|
},
|
|
{
|
|
"epoch": 1.4382727913383304,
|
|
"grad_norm": 0.6057282090187073,
|
|
"learning_rate": 2.7054968679001174e-05,
|
|
"loss": 0.8736,
|
|
"num_input_tokens_seen": 367820800,
|
|
"step": 44900
|
|
},
|
|
{
|
|
"epoch": 1.4414760714972132,
|
|
"grad_norm": 0.593506395816803,
|
|
"learning_rate": 2.697047709837312e-05,
|
|
"loss": 0.8405,
|
|
"num_input_tokens_seen": 368640000,
|
|
"step": 45000
|
|
},
|
|
{
|
|
"epoch": 1.4446793516560958,
|
|
"grad_norm": 0.7090416550636292,
|
|
"learning_rate": 2.6885962863876596e-05,
|
|
"loss": 0.8852,
|
|
"num_input_tokens_seen": 369459200,
|
|
"step": 45100
|
|
},
|
|
{
|
|
"epoch": 1.4478826318149784,
|
|
"grad_norm": 0.5391395092010498,
|
|
"learning_rate": 2.6801426947141435e-05,
|
|
"loss": 0.9029,
|
|
"num_input_tokens_seen": 370278400,
|
|
"step": 45200
|
|
},
|
|
{
|
|
"epoch": 1.4510859119738613,
|
|
"grad_norm": 0.5424131751060486,
|
|
"learning_rate": 2.671687032004676e-05,
|
|
"loss": 0.8751,
|
|
"num_input_tokens_seen": 371097600,
|
|
"step": 45300
|
|
},
|
|
{
|
|
"epoch": 1.4542891921327439,
|
|
"grad_norm": 0.5781705975532532,
|
|
"learning_rate": 2.6632293954709785e-05,
|
|
"loss": 0.9417,
|
|
"num_input_tokens_seen": 371916800,
|
|
"step": 45400
|
|
},
|
|
{
|
|
"epoch": 1.4574924722916267,
|
|
"grad_norm": 0.5788801312446594,
|
|
"learning_rate": 2.654769882347464e-05,
|
|
"loss": 0.9022,
|
|
"num_input_tokens_seen": 372736000,
|
|
"step": 45500
|
|
},
|
|
{
|
|
"epoch": 1.4606957524505093,
|
|
"grad_norm": 0.6637430787086487,
|
|
"learning_rate": 2.646308589890123e-05,
|
|
"loss": 0.9017,
|
|
"num_input_tokens_seen": 373555200,
|
|
"step": 45600
|
|
},
|
|
{
|
|
"epoch": 1.4638990326093921,
|
|
"grad_norm": 0.7034772634506226,
|
|
"learning_rate": 2.637845615375397e-05,
|
|
"loss": 0.883,
|
|
"num_input_tokens_seen": 374374400,
|
|
"step": 45700
|
|
},
|
|
{
|
|
"epoch": 1.4671023127682747,
|
|
"grad_norm": 0.6476500630378723,
|
|
"learning_rate": 2.629381056099071e-05,
|
|
"loss": 0.9469,
|
|
"num_input_tokens_seen": 375193600,
|
|
"step": 45800
|
|
},
|
|
{
|
|
"epoch": 1.4703055929271573,
|
|
"grad_norm": 0.560495913028717,
|
|
"learning_rate": 2.6209150093751473e-05,
|
|
"loss": 0.885,
|
|
"num_input_tokens_seen": 376012800,
|
|
"step": 45900
|
|
},
|
|
{
|
|
"epoch": 1.4735088730860402,
|
|
"grad_norm": 1.9203239679336548,
|
|
"learning_rate": 2.612447572534727e-05,
|
|
"loss": 0.9248,
|
|
"num_input_tokens_seen": 376832000,
|
|
"step": 46000
|
|
},
|
|
{
|
|
"epoch": 1.4767121532449228,
|
|
"grad_norm": 2.3468987941741943,
|
|
"learning_rate": 2.6039788429248957e-05,
|
|
"loss": 0.9041,
|
|
"num_input_tokens_seen": 377651200,
|
|
"step": 46100
|
|
},
|
|
{
|
|
"epoch": 1.4799154334038054,
|
|
"grad_norm": 0.6502100825309753,
|
|
"learning_rate": 2.5955089179075997e-05,
|
|
"loss": 0.9431,
|
|
"num_input_tokens_seen": 378470400,
|
|
"step": 46200
|
|
},
|
|
{
|
|
"epoch": 1.4831187135626882,
|
|
"grad_norm": 3.609816551208496,
|
|
"learning_rate": 2.5870378948585295e-05,
|
|
"loss": 0.8893,
|
|
"num_input_tokens_seen": 379289600,
|
|
"step": 46300
|
|
},
|
|
{
|
|
"epoch": 1.4863219937215708,
|
|
"grad_norm": 0.58833247423172,
|
|
"learning_rate": 2.5785658711659987e-05,
|
|
"loss": 0.9181,
|
|
"num_input_tokens_seen": 380108800,
|
|
"step": 46400
|
|
},
|
|
{
|
|
"epoch": 1.4895252738804536,
|
|
"grad_norm": 1.7303794622421265,
|
|
"learning_rate": 2.570092944229826e-05,
|
|
"loss": 0.8921,
|
|
"num_input_tokens_seen": 380928000,
|
|
"step": 46500
|
|
},
|
|
{
|
|
"epoch": 1.4927285540393362,
|
|
"grad_norm": 0.7278485894203186,
|
|
"learning_rate": 2.5616192114602127e-05,
|
|
"loss": 0.8693,
|
|
"num_input_tokens_seen": 381747200,
|
|
"step": 46600
|
|
},
|
|
{
|
|
"epoch": 1.495931834198219,
|
|
"grad_norm": 0.7616570591926575,
|
|
"learning_rate": 2.5531447702766254e-05,
|
|
"loss": 0.9397,
|
|
"num_input_tokens_seen": 382566400,
|
|
"step": 46700
|
|
},
|
|
{
|
|
"epoch": 1.4991351143571017,
|
|
"grad_norm": 0.11684958636760712,
|
|
"learning_rate": 2.5446697181066747e-05,
|
|
"loss": 0.8526,
|
|
"num_input_tokens_seen": 383385600,
|
|
"step": 46800
|
|
},
|
|
{
|
|
"epoch": 1.5023383945159843,
|
|
"grad_norm": 0.7726488709449768,
|
|
"learning_rate": 2.536194152384997e-05,
|
|
"loss": 0.9122,
|
|
"num_input_tokens_seen": 384204800,
|
|
"step": 46900
|
|
},
|
|
{
|
|
"epoch": 1.505541674674867,
|
|
"grad_norm": 0.7091355323791504,
|
|
"learning_rate": 2.527718170552129e-05,
|
|
"loss": 0.8666,
|
|
"num_input_tokens_seen": 385024000,
|
|
"step": 47000
|
|
},
|
|
{
|
|
"epoch": 1.5087449548337497,
|
|
"grad_norm": 2.5142340660095215,
|
|
"learning_rate": 2.519241870053396e-05,
|
|
"loss": 0.911,
|
|
"num_input_tokens_seen": 385843200,
|
|
"step": 47100
|
|
},
|
|
{
|
|
"epoch": 1.5119482349926323,
|
|
"grad_norm": 0.6862989664077759,
|
|
"learning_rate": 2.5107653483377852e-05,
|
|
"loss": 0.974,
|
|
"num_input_tokens_seen": 386662400,
|
|
"step": 47200
|
|
},
|
|
{
|
|
"epoch": 1.5151515151515151,
|
|
"grad_norm": 2.351198196411133,
|
|
"learning_rate": 2.502288702856824e-05,
|
|
"loss": 0.8986,
|
|
"num_input_tokens_seen": 387481600,
|
|
"step": 47300
|
|
},
|
|
{
|
|
"epoch": 1.518354795310398,
|
|
"grad_norm": 0.7517640590667725,
|
|
"learning_rate": 2.4938120310634682e-05,
|
|
"loss": 0.8549,
|
|
"num_input_tokens_seen": 388300800,
|
|
"step": 47400
|
|
},
|
|
{
|
|
"epoch": 1.5215580754692806,
|
|
"grad_norm": 2.709975004196167,
|
|
"learning_rate": 2.485335430410972e-05,
|
|
"loss": 0.899,
|
|
"num_input_tokens_seen": 389120000,
|
|
"step": 47500
|
|
},
|
|
{
|
|
"epoch": 1.5247613556281632,
|
|
"grad_norm": 0.7952636480331421,
|
|
"learning_rate": 2.4768589983517716e-05,
|
|
"loss": 0.8622,
|
|
"num_input_tokens_seen": 389939200,
|
|
"step": 47600
|
|
},
|
|
{
|
|
"epoch": 1.527964635787046,
|
|
"grad_norm": 0.7378533482551575,
|
|
"learning_rate": 2.4683828323363687e-05,
|
|
"loss": 0.8334,
|
|
"num_input_tokens_seen": 390758400,
|
|
"step": 47700
|
|
},
|
|
{
|
|
"epoch": 1.5311679159459286,
|
|
"grad_norm": 2.5980470180511475,
|
|
"learning_rate": 2.459907029812203e-05,
|
|
"loss": 0.9028,
|
|
"num_input_tokens_seen": 391577600,
|
|
"step": 47800
|
|
},
|
|
{
|
|
"epoch": 1.5343711961048112,
|
|
"grad_norm": 0.6807860732078552,
|
|
"learning_rate": 2.4514316882225347e-05,
|
|
"loss": 0.9259,
|
|
"num_input_tokens_seen": 392396800,
|
|
"step": 47900
|
|
},
|
|
{
|
|
"epoch": 1.537574476263694,
|
|
"grad_norm": 2.3691670894622803,
|
|
"learning_rate": 2.442956905005328e-05,
|
|
"loss": 0.8639,
|
|
"num_input_tokens_seen": 393216000,
|
|
"step": 48000
|
|
},
|
|
{
|
|
"epoch": 1.5407777564225769,
|
|
"grad_norm": 0.7466169595718384,
|
|
"learning_rate": 2.434482777592125e-05,
|
|
"loss": 0.8828,
|
|
"num_input_tokens_seen": 394035200,
|
|
"step": 48100
|
|
},
|
|
{
|
|
"epoch": 1.5439810365814595,
|
|
"grad_norm": 0.5329868793487549,
|
|
"learning_rate": 2.426009403406931e-05,
|
|
"loss": 0.8802,
|
|
"num_input_tokens_seen": 394854400,
|
|
"step": 48200
|
|
},
|
|
{
|
|
"epoch": 1.547184316740342,
|
|
"grad_norm": 0.6394245028495789,
|
|
"learning_rate": 2.4175368798650884e-05,
|
|
"loss": 0.8811,
|
|
"num_input_tokens_seen": 395673600,
|
|
"step": 48300
|
|
},
|
|
{
|
|
"epoch": 1.550387596899225,
|
|
"grad_norm": 0.9404513239860535,
|
|
"learning_rate": 2.4090653043721612e-05,
|
|
"loss": 0.8663,
|
|
"num_input_tokens_seen": 396492800,
|
|
"step": 48400
|
|
},
|
|
{
|
|
"epoch": 1.5535908770581075,
|
|
"grad_norm": 0.7973567843437195,
|
|
"learning_rate": 2.4005947743228157e-05,
|
|
"loss": 0.9452,
|
|
"num_input_tokens_seen": 397312000,
|
|
"step": 48500
|
|
},
|
|
{
|
|
"epoch": 1.55679415721699,
|
|
"grad_norm": 1.8970893621444702,
|
|
"learning_rate": 2.3921253870996972e-05,
|
|
"loss": 0.8968,
|
|
"num_input_tokens_seen": 398131200,
|
|
"step": 48600
|
|
},
|
|
{
|
|
"epoch": 1.559997437375873,
|
|
"grad_norm": 0.7782315015792847,
|
|
"learning_rate": 2.383657240072314e-05,
|
|
"loss": 0.9475,
|
|
"num_input_tokens_seen": 398950400,
|
|
"step": 48700
|
|
},
|
|
{
|
|
"epoch": 1.5632007175347555,
|
|
"grad_norm": 0.72723788022995,
|
|
"learning_rate": 2.375190430595914e-05,
|
|
"loss": 0.9347,
|
|
"num_input_tokens_seen": 399769600,
|
|
"step": 48800
|
|
},
|
|
{
|
|
"epoch": 1.5664039976936381,
|
|
"grad_norm": 0.5238316655158997,
|
|
"learning_rate": 2.366725056010369e-05,
|
|
"loss": 0.8969,
|
|
"num_input_tokens_seen": 400588800,
|
|
"step": 48900
|
|
},
|
|
{
|
|
"epoch": 1.569607277852521,
|
|
"grad_norm": 0.7676683664321899,
|
|
"learning_rate": 2.3582612136390556e-05,
|
|
"loss": 0.8926,
|
|
"num_input_tokens_seen": 401408000,
|
|
"step": 49000
|
|
},
|
|
{
|
|
"epoch": 1.5728105580114038,
|
|
"grad_norm": 1.64457106590271,
|
|
"learning_rate": 2.349799000787733e-05,
|
|
"loss": 0.9027,
|
|
"num_input_tokens_seen": 402227200,
|
|
"step": 49100
|
|
},
|
|
{
|
|
"epoch": 1.5760138381702864,
|
|
"grad_norm": 0.5461480617523193,
|
|
"learning_rate": 2.3413385147434285e-05,
|
|
"loss": 0.8651,
|
|
"num_input_tokens_seen": 403046400,
|
|
"step": 49200
|
|
},
|
|
{
|
|
"epoch": 1.579217118329169,
|
|
"grad_norm": 0.527300238609314,
|
|
"learning_rate": 2.332879852773314e-05,
|
|
"loss": 0.8354,
|
|
"num_input_tokens_seen": 403865600,
|
|
"step": 49300
|
|
},
|
|
{
|
|
"epoch": 1.5824203984880518,
|
|
"grad_norm": 0.8455817699432373,
|
|
"learning_rate": 2.3244231121235936e-05,
|
|
"loss": 0.903,
|
|
"num_input_tokens_seen": 404684800,
|
|
"step": 49400
|
|
},
|
|
{
|
|
"epoch": 1.5856236786469344,
|
|
"grad_norm": 0.8457258939743042,
|
|
"learning_rate": 2.3159683900183812e-05,
|
|
"loss": 0.9085,
|
|
"num_input_tokens_seen": 405504000,
|
|
"step": 49500
|
|
},
|
|
{
|
|
"epoch": 1.588826958805817,
|
|
"grad_norm": 0.7063552141189575,
|
|
"learning_rate": 2.3075157836585854e-05,
|
|
"loss": 0.9002,
|
|
"num_input_tokens_seen": 406323200,
|
|
"step": 49600
|
|
},
|
|
{
|
|
"epoch": 1.5920302389646999,
|
|
"grad_norm": 0.6034948229789734,
|
|
"learning_rate": 2.2990653902207875e-05,
|
|
"loss": 0.8665,
|
|
"num_input_tokens_seen": 407142400,
|
|
"step": 49700
|
|
},
|
|
{
|
|
"epoch": 1.5952335191235827,
|
|
"grad_norm": 0.6883265972137451,
|
|
"learning_rate": 2.2906173068561324e-05,
|
|
"loss": 0.9031,
|
|
"num_input_tokens_seen": 407961600,
|
|
"step": 49800
|
|
},
|
|
{
|
|
"epoch": 1.5984367992824653,
|
|
"grad_norm": 0.6610883474349976,
|
|
"learning_rate": 2.282171630689203e-05,
|
|
"loss": 0.9153,
|
|
"num_input_tokens_seen": 408780800,
|
|
"step": 49900
|
|
},
|
|
{
|
|
"epoch": 1.601640079441348,
|
|
"grad_norm": 1.8148962259292603,
|
|
"learning_rate": 2.2737284588169107e-05,
|
|
"loss": 0.8904,
|
|
"num_input_tokens_seen": 409600000,
|
|
"step": 50000
|
|
},
|
|
{
|
|
"epoch": 1.6048433596002307,
|
|
"grad_norm": 0.8317341804504395,
|
|
"learning_rate": 2.2652878883073736e-05,
|
|
"loss": 0.8847,
|
|
"num_input_tokens_seen": 410419200,
|
|
"step": 50100
|
|
},
|
|
{
|
|
"epoch": 1.6080466397591133,
|
|
"grad_norm": 0.5359209179878235,
|
|
"learning_rate": 2.2568500161988023e-05,
|
|
"loss": 0.8983,
|
|
"num_input_tokens_seen": 411238400,
|
|
"step": 50200
|
|
},
|
|
{
|
|
"epoch": 1.611249919917996,
|
|
"grad_norm": 0.6819952726364136,
|
|
"learning_rate": 2.2484149394983882e-05,
|
|
"loss": 0.9138,
|
|
"num_input_tokens_seen": 412057600,
|
|
"step": 50300
|
|
},
|
|
{
|
|
"epoch": 1.6144532000768788,
|
|
"grad_norm": 0.8475795984268188,
|
|
"learning_rate": 2.239982755181181e-05,
|
|
"loss": 0.8536,
|
|
"num_input_tokens_seen": 412876800,
|
|
"step": 50400
|
|
},
|
|
{
|
|
"epoch": 1.6176564802357616,
|
|
"grad_norm": 1.1045705080032349,
|
|
"learning_rate": 2.2315535601889814e-05,
|
|
"loss": 0.9137,
|
|
"num_input_tokens_seen": 413696000,
|
|
"step": 50500
|
|
},
|
|
{
|
|
"epoch": 1.620859760394644,
|
|
"grad_norm": 0.6131917834281921,
|
|
"learning_rate": 2.2231274514292196e-05,
|
|
"loss": 0.8992,
|
|
"num_input_tokens_seen": 414515200,
|
|
"step": 50600
|
|
},
|
|
{
|
|
"epoch": 1.6240630405535268,
|
|
"grad_norm": 0.6096556186676025,
|
|
"learning_rate": 2.214704525773846e-05,
|
|
"loss": 0.9211,
|
|
"num_input_tokens_seen": 415334400,
|
|
"step": 50700
|
|
},
|
|
{
|
|
"epoch": 1.6272663207124096,
|
|
"grad_norm": 0.5279362797737122,
|
|
"learning_rate": 2.2062848800582168e-05,
|
|
"loss": 0.9231,
|
|
"num_input_tokens_seen": 416153600,
|
|
"step": 50800
|
|
},
|
|
{
|
|
"epoch": 1.6304696008712922,
|
|
"grad_norm": 0.5645897388458252,
|
|
"learning_rate": 2.197868611079978e-05,
|
|
"loss": 0.8579,
|
|
"num_input_tokens_seen": 416972800,
|
|
"step": 50900
|
|
},
|
|
{
|
|
"epoch": 1.6336728810301748,
|
|
"grad_norm": 0.5469439029693604,
|
|
"learning_rate": 2.189455815597957e-05,
|
|
"loss": 0.8802,
|
|
"num_input_tokens_seen": 417792000,
|
|
"step": 51000
|
|
},
|
|
{
|
|
"epoch": 1.6368761611890577,
|
|
"grad_norm": 0.7165865898132324,
|
|
"learning_rate": 2.1810465903310445e-05,
|
|
"loss": 0.897,
|
|
"num_input_tokens_seen": 418611200,
|
|
"step": 51100
|
|
},
|
|
{
|
|
"epoch": 1.6400794413479403,
|
|
"grad_norm": 0.49263107776641846,
|
|
"learning_rate": 2.1726410319570874e-05,
|
|
"loss": 0.9145,
|
|
"num_input_tokens_seen": 419430400,
|
|
"step": 51200
|
|
},
|
|
{
|
|
"epoch": 1.6432827215068229,
|
|
"grad_norm": 0.7984305620193481,
|
|
"learning_rate": 2.164239237111776e-05,
|
|
"loss": 0.9656,
|
|
"num_input_tokens_seen": 420249600,
|
|
"step": 51300
|
|
},
|
|
{
|
|
"epoch": 1.6464860016657057,
|
|
"grad_norm": 0.6783995628356934,
|
|
"learning_rate": 2.1558413023875334e-05,
|
|
"loss": 0.8937,
|
|
"num_input_tokens_seen": 421068800,
|
|
"step": 51400
|
|
},
|
|
{
|
|
"epoch": 1.6496892818245885,
|
|
"grad_norm": 0.6700116395950317,
|
|
"learning_rate": 2.147447324332403e-05,
|
|
"loss": 0.8966,
|
|
"num_input_tokens_seen": 421888000,
|
|
"step": 51500
|
|
},
|
|
{
|
|
"epoch": 1.6528925619834711,
|
|
"grad_norm": 2.6840033531188965,
|
|
"learning_rate": 2.1390573994489377e-05,
|
|
"loss": 0.9922,
|
|
"num_input_tokens_seen": 422707200,
|
|
"step": 51600
|
|
},
|
|
{
|
|
"epoch": 1.6560958421423537,
|
|
"grad_norm": 0.6062913537025452,
|
|
"learning_rate": 2.1306716241930968e-05,
|
|
"loss": 0.9201,
|
|
"num_input_tokens_seen": 423526400,
|
|
"step": 51700
|
|
},
|
|
{
|
|
"epoch": 1.6592991223012366,
|
|
"grad_norm": 0.7637689113616943,
|
|
"learning_rate": 2.1222900949731297e-05,
|
|
"loss": 0.9039,
|
|
"num_input_tokens_seen": 424345600,
|
|
"step": 51800
|
|
},
|
|
{
|
|
"epoch": 1.6625024024601192,
|
|
"grad_norm": 3.154482841491699,
|
|
"learning_rate": 2.1139129081484734e-05,
|
|
"loss": 0.968,
|
|
"num_input_tokens_seen": 425164800,
|
|
"step": 51900
|
|
},
|
|
{
|
|
"epoch": 1.6657056826190018,
|
|
"grad_norm": 1.900366187095642,
|
|
"learning_rate": 2.1055401600286386e-05,
|
|
"loss": 0.9064,
|
|
"num_input_tokens_seen": 425984000,
|
|
"step": 52000
|
|
},
|
|
{
|
|
"epoch": 1.6689089627778846,
|
|
"grad_norm": 0.6276770830154419,
|
|
"learning_rate": 2.0971719468721077e-05,
|
|
"loss": 0.8786,
|
|
"num_input_tokens_seen": 426803200,
|
|
"step": 52100
|
|
},
|
|
{
|
|
"epoch": 1.6721122429367674,
|
|
"grad_norm": 0.7337915301322937,
|
|
"learning_rate": 2.0888083648852267e-05,
|
|
"loss": 0.9213,
|
|
"num_input_tokens_seen": 427622400,
|
|
"step": 52200
|
|
},
|
|
{
|
|
"epoch": 1.6753155230956498,
|
|
"grad_norm": 0.6604040861129761,
|
|
"learning_rate": 2.0804495102210975e-05,
|
|
"loss": 0.944,
|
|
"num_input_tokens_seen": 428441600,
|
|
"step": 52300
|
|
},
|
|
{
|
|
"epoch": 1.6785188032545326,
|
|
"grad_norm": 0.6165716648101807,
|
|
"learning_rate": 2.0720954789784753e-05,
|
|
"loss": 0.8767,
|
|
"num_input_tokens_seen": 429260800,
|
|
"step": 52400
|
|
},
|
|
{
|
|
"epoch": 1.6817220834134154,
|
|
"grad_norm": 1.7939884662628174,
|
|
"learning_rate": 2.0637463672006595e-05,
|
|
"loss": 0.9095,
|
|
"num_input_tokens_seen": 430080000,
|
|
"step": 52500
|
|
},
|
|
{
|
|
"epoch": 1.684925363572298,
|
|
"grad_norm": 0.6687926054000854,
|
|
"learning_rate": 2.0554022708743943e-05,
|
|
"loss": 0.8976,
|
|
"num_input_tokens_seen": 430899200,
|
|
"step": 52600
|
|
},
|
|
{
|
|
"epoch": 1.6881286437311807,
|
|
"grad_norm": 0.7300702929496765,
|
|
"learning_rate": 2.0470632859287628e-05,
|
|
"loss": 0.9377,
|
|
"num_input_tokens_seen": 431718400,
|
|
"step": 52700
|
|
},
|
|
{
|
|
"epoch": 1.6913319238900635,
|
|
"grad_norm": 0.590376615524292,
|
|
"learning_rate": 2.0387295082340835e-05,
|
|
"loss": 0.8911,
|
|
"num_input_tokens_seen": 432537600,
|
|
"step": 52800
|
|
},
|
|
{
|
|
"epoch": 1.694535204048946,
|
|
"grad_norm": 0.556515097618103,
|
|
"learning_rate": 2.0304010336008112e-05,
|
|
"loss": 0.8771,
|
|
"num_input_tokens_seen": 433356800,
|
|
"step": 52900
|
|
},
|
|
{
|
|
"epoch": 1.6977384842078287,
|
|
"grad_norm": 0.6625654101371765,
|
|
"learning_rate": 2.0220779577784298e-05,
|
|
"loss": 0.9529,
|
|
"num_input_tokens_seen": 434176000,
|
|
"step": 53000
|
|
},
|
|
{
|
|
"epoch": 1.7009417643667115,
|
|
"grad_norm": 0.5537979602813721,
|
|
"learning_rate": 2.0137603764543573e-05,
|
|
"loss": 0.8813,
|
|
"num_input_tokens_seen": 434995200,
|
|
"step": 53100
|
|
},
|
|
{
|
|
"epoch": 1.7041450445255943,
|
|
"grad_norm": 0.49151819944381714,
|
|
"learning_rate": 2.0054483852528435e-05,
|
|
"loss": 0.8268,
|
|
"num_input_tokens_seen": 435814400,
|
|
"step": 53200
|
|
},
|
|
{
|
|
"epoch": 1.707348324684477,
|
|
"grad_norm": 0.6030770540237427,
|
|
"learning_rate": 1.9971420797338708e-05,
|
|
"loss": 0.9116,
|
|
"num_input_tokens_seen": 436633600,
|
|
"step": 53300
|
|
},
|
|
{
|
|
"epoch": 1.7105516048433596,
|
|
"grad_norm": 0.872156023979187,
|
|
"learning_rate": 1.9888415553920525e-05,
|
|
"loss": 0.8564,
|
|
"num_input_tokens_seen": 437452800,
|
|
"step": 53400
|
|
},
|
|
{
|
|
"epoch": 1.7137548850022424,
|
|
"grad_norm": 0.608736515045166,
|
|
"learning_rate": 1.9805469076555418e-05,
|
|
"loss": 0.8656,
|
|
"num_input_tokens_seen": 438272000,
|
|
"step": 53500
|
|
},
|
|
{
|
|
"epoch": 1.716958165161125,
|
|
"grad_norm": 0.6439238786697388,
|
|
"learning_rate": 1.9722582318849274e-05,
|
|
"loss": 0.8819,
|
|
"num_input_tokens_seen": 439091200,
|
|
"step": 53600
|
|
},
|
|
{
|
|
"epoch": 1.7201614453200076,
|
|
"grad_norm": 0.5254938006401062,
|
|
"learning_rate": 1.9639756233721433e-05,
|
|
"loss": 0.9118,
|
|
"num_input_tokens_seen": 439910400,
|
|
"step": 53700
|
|
},
|
|
{
|
|
"epoch": 1.7233647254788904,
|
|
"grad_norm": 0.6956652998924255,
|
|
"learning_rate": 1.9556991773393686e-05,
|
|
"loss": 0.8578,
|
|
"num_input_tokens_seen": 440729600,
|
|
"step": 53800
|
|
},
|
|
{
|
|
"epoch": 1.7265680056377732,
|
|
"grad_norm": 0.5322553515434265,
|
|
"learning_rate": 1.9474289889379334e-05,
|
|
"loss": 0.8907,
|
|
"num_input_tokens_seen": 441548800,
|
|
"step": 53900
|
|
},
|
|
{
|
|
"epoch": 1.7297712857966556,
|
|
"grad_norm": 0.706683874130249,
|
|
"learning_rate": 1.9391651532472296e-05,
|
|
"loss": 0.8853,
|
|
"num_input_tokens_seen": 442368000,
|
|
"step": 54000
|
|
},
|
|
{
|
|
"epoch": 1.7329745659555384,
|
|
"grad_norm": 1.7393512725830078,
|
|
"learning_rate": 1.930907765273611e-05,
|
|
"loss": 0.8942,
|
|
"num_input_tokens_seen": 443187200,
|
|
"step": 54100
|
|
},
|
|
{
|
|
"epoch": 1.7361778461144213,
|
|
"grad_norm": 0.6126461029052734,
|
|
"learning_rate": 1.922656919949306e-05,
|
|
"loss": 0.861,
|
|
"num_input_tokens_seen": 444006400,
|
|
"step": 54200
|
|
},
|
|
{
|
|
"epoch": 1.7393811262733039,
|
|
"grad_norm": 15.058053016662598,
|
|
"learning_rate": 1.914412712131325e-05,
|
|
"loss": 0.8764,
|
|
"num_input_tokens_seen": 444825600,
|
|
"step": 54300
|
|
},
|
|
{
|
|
"epoch": 1.7425844064321865,
|
|
"grad_norm": 1.590517520904541,
|
|
"learning_rate": 1.906175236600366e-05,
|
|
"loss": 0.9054,
|
|
"num_input_tokens_seen": 445644800,
|
|
"step": 54400
|
|
},
|
|
{
|
|
"epoch": 1.7457876865910693,
|
|
"grad_norm": 2.823185920715332,
|
|
"learning_rate": 1.8979445880597332e-05,
|
|
"loss": 0.9166,
|
|
"num_input_tokens_seen": 446464000,
|
|
"step": 54500
|
|
},
|
|
{
|
|
"epoch": 1.748990966749952,
|
|
"grad_norm": 0.6295785903930664,
|
|
"learning_rate": 1.8897208611342392e-05,
|
|
"loss": 0.893,
|
|
"num_input_tokens_seen": 447283200,
|
|
"step": 54600
|
|
},
|
|
{
|
|
"epoch": 1.7521942469088345,
|
|
"grad_norm": 2.9604554176330566,
|
|
"learning_rate": 1.881504150369125e-05,
|
|
"loss": 0.8883,
|
|
"num_input_tokens_seen": 448102400,
|
|
"step": 54700
|
|
},
|
|
{
|
|
"epoch": 1.7553975270677173,
|
|
"grad_norm": 0.12940554320812225,
|
|
"learning_rate": 1.873294550228965e-05,
|
|
"loss": 0.9114,
|
|
"num_input_tokens_seen": 448921600,
|
|
"step": 54800
|
|
},
|
|
{
|
|
"epoch": 1.7586008072266002,
|
|
"grad_norm": 0.6710172891616821,
|
|
"learning_rate": 1.8650921550965884e-05,
|
|
"loss": 0.9675,
|
|
"num_input_tokens_seen": 449740800,
|
|
"step": 54900
|
|
},
|
|
{
|
|
"epoch": 1.7618040873854828,
|
|
"grad_norm": 0.5467862486839294,
|
|
"learning_rate": 1.8568970592719903e-05,
|
|
"loss": 0.9055,
|
|
"num_input_tokens_seen": 450560000,
|
|
"step": 55000
|
|
},
|
|
{
|
|
"epoch": 1.7650073675443654,
|
|
"grad_norm": 1.6943007707595825,
|
|
"learning_rate": 1.8487093569712482e-05,
|
|
"loss": 0.8754,
|
|
"num_input_tokens_seen": 451379200,
|
|
"step": 55100
|
|
},
|
|
{
|
|
"epoch": 1.7682106477032482,
|
|
"grad_norm": 0.6068347692489624,
|
|
"learning_rate": 1.84052914232544e-05,
|
|
"loss": 0.9695,
|
|
"num_input_tokens_seen": 452198400,
|
|
"step": 55200
|
|
},
|
|
{
|
|
"epoch": 1.7714139278621308,
|
|
"grad_norm": 2.650592565536499,
|
|
"learning_rate": 1.8323565093795576e-05,
|
|
"loss": 0.8756,
|
|
"num_input_tokens_seen": 453017600,
|
|
"step": 55300
|
|
},
|
|
{
|
|
"epoch": 1.7746172080210134,
|
|
"grad_norm": 2.3554019927978516,
|
|
"learning_rate": 1.824191552091431e-05,
|
|
"loss": 0.8884,
|
|
"num_input_tokens_seen": 453836800,
|
|
"step": 55400
|
|
},
|
|
{
|
|
"epoch": 1.7778204881798962,
|
|
"grad_norm": 0.5100352764129639,
|
|
"learning_rate": 1.8160343643306467e-05,
|
|
"loss": 0.901,
|
|
"num_input_tokens_seen": 454656000,
|
|
"step": 55500
|
|
},
|
|
{
|
|
"epoch": 1.781023768338779,
|
|
"grad_norm": 2.276134490966797,
|
|
"learning_rate": 1.8078850398774666e-05,
|
|
"loss": 0.8653,
|
|
"num_input_tokens_seen": 455475200,
|
|
"step": 55600
|
|
},
|
|
{
|
|
"epoch": 1.7842270484976614,
|
|
"grad_norm": 0.6568858027458191,
|
|
"learning_rate": 1.7997436724217517e-05,
|
|
"loss": 0.9307,
|
|
"num_input_tokens_seen": 456294400,
|
|
"step": 55700
|
|
},
|
|
{
|
|
"epoch": 1.7874303286565443,
|
|
"grad_norm": 0.5729939341545105,
|
|
"learning_rate": 1.7916103555618818e-05,
|
|
"loss": 0.8938,
|
|
"num_input_tokens_seen": 457113600,
|
|
"step": 55800
|
|
},
|
|
{
|
|
"epoch": 1.790633608815427,
|
|
"grad_norm": 0.4960566759109497,
|
|
"learning_rate": 1.7834851828036855e-05,
|
|
"loss": 0.8622,
|
|
"num_input_tokens_seen": 457932800,
|
|
"step": 55900
|
|
},
|
|
{
|
|
"epoch": 1.7938368889743097,
|
|
"grad_norm": 0.6195512413978577,
|
|
"learning_rate": 1.7753682475593587e-05,
|
|
"loss": 0.9165,
|
|
"num_input_tokens_seen": 458752000,
|
|
"step": 56000
|
|
},
|
|
{
|
|
"epoch": 1.7970401691331923,
|
|
"grad_norm": 0.7224614024162292,
|
|
"learning_rate": 1.7672596431463963e-05,
|
|
"loss": 0.9159,
|
|
"num_input_tokens_seen": 459571200,
|
|
"step": 56100
|
|
},
|
|
{
|
|
"epoch": 1.8002434492920751,
|
|
"grad_norm": 0.683172881603241,
|
|
"learning_rate": 1.7591594627865134e-05,
|
|
"loss": 0.928,
|
|
"num_input_tokens_seen": 460390400,
|
|
"step": 56200
|
|
},
|
|
{
|
|
"epoch": 1.8034467294509577,
|
|
"grad_norm": 0.6346443891525269,
|
|
"learning_rate": 1.7510677996045787e-05,
|
|
"loss": 0.8891,
|
|
"num_input_tokens_seen": 461209600,
|
|
"step": 56300
|
|
},
|
|
{
|
|
"epoch": 1.8066500096098403,
|
|
"grad_norm": 0.5797076225280762,
|
|
"learning_rate": 1.7429847466275424e-05,
|
|
"loss": 0.9163,
|
|
"num_input_tokens_seen": 462028800,
|
|
"step": 56400
|
|
},
|
|
{
|
|
"epoch": 1.8098532897687232,
|
|
"grad_norm": 1.201037883758545,
|
|
"learning_rate": 1.734910396783364e-05,
|
|
"loss": 0.9401,
|
|
"num_input_tokens_seen": 462848000,
|
|
"step": 56500
|
|
},
|
|
{
|
|
"epoch": 1.813056569927606,
|
|
"grad_norm": 0.6015352606773376,
|
|
"learning_rate": 1.7268448428999508e-05,
|
|
"loss": 0.9391,
|
|
"num_input_tokens_seen": 463667200,
|
|
"step": 56600
|
|
},
|
|
{
|
|
"epoch": 1.8162598500864886,
|
|
"grad_norm": 0.6725329756736755,
|
|
"learning_rate": 1.71878817770408e-05,
|
|
"loss": 0.8751,
|
|
"num_input_tokens_seen": 464486400,
|
|
"step": 56700
|
|
},
|
|
{
|
|
"epoch": 1.8194631302453712,
|
|
"grad_norm": 0.7582192420959473,
|
|
"learning_rate": 1.7107404938203422e-05,
|
|
"loss": 0.9578,
|
|
"num_input_tokens_seen": 465305600,
|
|
"step": 56800
|
|
},
|
|
{
|
|
"epoch": 1.822666410404254,
|
|
"grad_norm": 0.5181425213813782,
|
|
"learning_rate": 1.702701883770074e-05,
|
|
"loss": 0.9462,
|
|
"num_input_tokens_seen": 466124800,
|
|
"step": 56900
|
|
},
|
|
{
|
|
"epoch": 1.8258696905631366,
|
|
"grad_norm": 0.672991931438446,
|
|
"learning_rate": 1.6946724399702905e-05,
|
|
"loss": 0.8676,
|
|
"num_input_tokens_seen": 466944000,
|
|
"step": 57000
|
|
},
|
|
{
|
|
"epoch": 1.8290729707220192,
|
|
"grad_norm": 2.6324303150177,
|
|
"learning_rate": 1.6866522547326292e-05,
|
|
"loss": 0.9282,
|
|
"num_input_tokens_seen": 467763200,
|
|
"step": 57100
|
|
},
|
|
{
|
|
"epoch": 1.832276250880902,
|
|
"grad_norm": 0.5964205861091614,
|
|
"learning_rate": 1.6786414202622818e-05,
|
|
"loss": 0.8611,
|
|
"num_input_tokens_seen": 468582400,
|
|
"step": 57200
|
|
},
|
|
{
|
|
"epoch": 1.835479531039785,
|
|
"grad_norm": 1.6168113946914673,
|
|
"learning_rate": 1.670640028656939e-05,
|
|
"loss": 0.8977,
|
|
"num_input_tokens_seen": 469401600,
|
|
"step": 57300
|
|
},
|
|
{
|
|
"epoch": 1.8386828111986673,
|
|
"grad_norm": 0.5584040284156799,
|
|
"learning_rate": 1.662648171905731e-05,
|
|
"loss": 0.9157,
|
|
"num_input_tokens_seen": 470220800,
|
|
"step": 57400
|
|
},
|
|
{
|
|
"epoch": 1.84188609135755,
|
|
"grad_norm": 0.6906948685646057,
|
|
"learning_rate": 1.654665941888169e-05,
|
|
"loss": 0.8808,
|
|
"num_input_tokens_seen": 471040000,
|
|
"step": 57500
|
|
},
|
|
{
|
|
"epoch": 1.845089371516433,
|
|
"grad_norm": 0.8261626958847046,
|
|
"learning_rate": 1.6466934303730866e-05,
|
|
"loss": 0.9322,
|
|
"num_input_tokens_seen": 471859200,
|
|
"step": 57600
|
|
},
|
|
{
|
|
"epoch": 1.8482926516753155,
|
|
"grad_norm": 0.5074647068977356,
|
|
"learning_rate": 1.6387307290175914e-05,
|
|
"loss": 0.9141,
|
|
"num_input_tokens_seen": 472678400,
|
|
"step": 57700
|
|
},
|
|
{
|
|
"epoch": 1.8514959318341981,
|
|
"grad_norm": 1.8539708852767944,
|
|
"learning_rate": 1.6307779293660034e-05,
|
|
"loss": 0.8777,
|
|
"num_input_tokens_seen": 473497600,
|
|
"step": 57800
|
|
},
|
|
{
|
|
"epoch": 1.854699211993081,
|
|
"grad_norm": 2.2079038619995117,
|
|
"learning_rate": 1.622835122848809e-05,
|
|
"loss": 0.8596,
|
|
"num_input_tokens_seen": 474316800,
|
|
"step": 57900
|
|
},
|
|
{
|
|
"epoch": 1.8579024921519636,
|
|
"grad_norm": 0.670155942440033,
|
|
"learning_rate": 1.6149024007816067e-05,
|
|
"loss": 0.9112,
|
|
"num_input_tokens_seen": 475136000,
|
|
"step": 58000
|
|
},
|
|
{
|
|
"epoch": 1.8611057723108462,
|
|
"grad_norm": 0.8173292875289917,
|
|
"learning_rate": 1.6069798543640543e-05,
|
|
"loss": 0.9513,
|
|
"num_input_tokens_seen": 475955200,
|
|
"step": 58100
|
|
},
|
|
{
|
|
"epoch": 1.864309052469729,
|
|
"grad_norm": 0.5929046273231506,
|
|
"learning_rate": 1.599067574678829e-05,
|
|
"loss": 0.8633,
|
|
"num_input_tokens_seen": 476774400,
|
|
"step": 58200
|
|
},
|
|
{
|
|
"epoch": 1.8675123326286118,
|
|
"grad_norm": 0.6177115440368652,
|
|
"learning_rate": 1.591165652690571e-05,
|
|
"loss": 0.8829,
|
|
"num_input_tokens_seen": 477593600,
|
|
"step": 58300
|
|
},
|
|
{
|
|
"epoch": 1.8707156127874944,
|
|
"grad_norm": 5.405032157897949,
|
|
"learning_rate": 1.5832741792448447e-05,
|
|
"loss": 0.853,
|
|
"num_input_tokens_seen": 478412800,
|
|
"step": 58400
|
|
},
|
|
{
|
|
"epoch": 1.873918892946377,
|
|
"grad_norm": 0.8819538950920105,
|
|
"learning_rate": 1.5753932450670892e-05,
|
|
"loss": 0.8632,
|
|
"num_input_tokens_seen": 479232000,
|
|
"step": 58500
|
|
},
|
|
{
|
|
"epoch": 1.8771221731052599,
|
|
"grad_norm": 0.7577266693115234,
|
|
"learning_rate": 1.5675229407615773e-05,
|
|
"loss": 0.8691,
|
|
"num_input_tokens_seen": 480051200,
|
|
"step": 58600
|
|
},
|
|
{
|
|
"epoch": 1.8803254532641425,
|
|
"grad_norm": 0.5581927299499512,
|
|
"learning_rate": 1.5596633568103764e-05,
|
|
"loss": 0.8898,
|
|
"num_input_tokens_seen": 480870400,
|
|
"step": 58700
|
|
},
|
|
{
|
|
"epoch": 1.883528733423025,
|
|
"grad_norm": 1.5271930694580078,
|
|
"learning_rate": 1.5518145835723034e-05,
|
|
"loss": 0.9001,
|
|
"num_input_tokens_seen": 481689600,
|
|
"step": 58800
|
|
},
|
|
{
|
|
"epoch": 1.886732013581908,
|
|
"grad_norm": 0.594035804271698,
|
|
"learning_rate": 1.54397671128189e-05,
|
|
"loss": 0.8988,
|
|
"num_input_tokens_seen": 482508800,
|
|
"step": 58900
|
|
},
|
|
{
|
|
"epoch": 1.8899352937407907,
|
|
"grad_norm": 0.778454601764679,
|
|
"learning_rate": 1.5361498300483423e-05,
|
|
"loss": 0.8744,
|
|
"num_input_tokens_seen": 483328000,
|
|
"step": 59000
|
|
},
|
|
{
|
|
"epoch": 1.893138573899673,
|
|
"grad_norm": 0.6719622611999512,
|
|
"learning_rate": 1.5283340298545056e-05,
|
|
"loss": 0.9189,
|
|
"num_input_tokens_seen": 484147200,
|
|
"step": 59100
|
|
},
|
|
{
|
|
"epoch": 1.896341854058556,
|
|
"grad_norm": 0.7632321119308472,
|
|
"learning_rate": 1.5205294005558335e-05,
|
|
"loss": 0.9133,
|
|
"num_input_tokens_seen": 484966400,
|
|
"step": 59200
|
|
},
|
|
{
|
|
"epoch": 1.8995451342174388,
|
|
"grad_norm": 2.033229112625122,
|
|
"learning_rate": 1.5127360318793481e-05,
|
|
"loss": 0.8913,
|
|
"num_input_tokens_seen": 485785600,
|
|
"step": 59300
|
|
},
|
|
{
|
|
"epoch": 1.9027484143763214,
|
|
"grad_norm": 0.598871648311615,
|
|
"learning_rate": 1.5049540134226158e-05,
|
|
"loss": 0.8857,
|
|
"num_input_tokens_seen": 486604800,
|
|
"step": 59400
|
|
},
|
|
{
|
|
"epoch": 1.905951694535204,
|
|
"grad_norm": 1.5140035152435303,
|
|
"learning_rate": 1.4971834346527102e-05,
|
|
"loss": 0.9104,
|
|
"num_input_tokens_seen": 487424000,
|
|
"step": 59500
|
|
},
|
|
{
|
|
"epoch": 1.9091549746940868,
|
|
"grad_norm": 1.2196921110153198,
|
|
"learning_rate": 1.4894243849051889e-05,
|
|
"loss": 0.8936,
|
|
"num_input_tokens_seen": 488243200,
|
|
"step": 59600
|
|
},
|
|
{
|
|
"epoch": 1.9123582548529694,
|
|
"grad_norm": 0.6041728854179382,
|
|
"learning_rate": 1.4816769533830638e-05,
|
|
"loss": 0.9233,
|
|
"num_input_tokens_seen": 489062400,
|
|
"step": 59700
|
|
},
|
|
{
|
|
"epoch": 1.915561535011852,
|
|
"grad_norm": 0.585239589214325,
|
|
"learning_rate": 1.4739412291557774e-05,
|
|
"loss": 0.893,
|
|
"num_input_tokens_seen": 489881600,
|
|
"step": 59800
|
|
},
|
|
{
|
|
"epoch": 1.9187648151707348,
|
|
"grad_norm": 0.5198357701301575,
|
|
"learning_rate": 1.4662173011581757e-05,
|
|
"loss": 0.8643,
|
|
"num_input_tokens_seen": 490700800,
|
|
"step": 59900
|
|
},
|
|
{
|
|
"epoch": 1.9219680953296177,
|
|
"grad_norm": 1.5068873167037964,
|
|
"learning_rate": 1.4585052581894881e-05,
|
|
"loss": 0.9376,
|
|
"num_input_tokens_seen": 491520000,
|
|
"step": 60000
|
|
},
|
|
{
|
|
"epoch": 1.9251713754885003,
|
|
"grad_norm": 1.573378562927246,
|
|
"learning_rate": 1.4508051889123075e-05,
|
|
"loss": 0.9354,
|
|
"num_input_tokens_seen": 492339200,
|
|
"step": 60100
|
|
},
|
|
{
|
|
"epoch": 1.9283746556473829,
|
|
"grad_norm": 0.7995052933692932,
|
|
"learning_rate": 1.4431171818515698e-05,
|
|
"loss": 0.8201,
|
|
"num_input_tokens_seen": 493158400,
|
|
"step": 60200
|
|
},
|
|
{
|
|
"epoch": 1.9315779358062657,
|
|
"grad_norm": 0.7116925716400146,
|
|
"learning_rate": 1.4354413253935336e-05,
|
|
"loss": 0.8322,
|
|
"num_input_tokens_seen": 493977600,
|
|
"step": 60300
|
|
},
|
|
{
|
|
"epoch": 1.9347812159651483,
|
|
"grad_norm": 0.714451253414154,
|
|
"learning_rate": 1.4277777077847665e-05,
|
|
"loss": 0.9181,
|
|
"num_input_tokens_seen": 494796800,
|
|
"step": 60400
|
|
},
|
|
{
|
|
"epoch": 1.937984496124031,
|
|
"grad_norm": 0.7062659859657288,
|
|
"learning_rate": 1.420126417131133e-05,
|
|
"loss": 0.8783,
|
|
"num_input_tokens_seen": 495616000,
|
|
"step": 60500
|
|
},
|
|
{
|
|
"epoch": 1.9411877762829137,
|
|
"grad_norm": 0.5767313838005066,
|
|
"learning_rate": 1.4124875413967767e-05,
|
|
"loss": 0.9239,
|
|
"num_input_tokens_seen": 496435200,
|
|
"step": 60600
|
|
},
|
|
{
|
|
"epoch": 1.9443910564417966,
|
|
"grad_norm": 0.7007090449333191,
|
|
"learning_rate": 1.4048611684031138e-05,
|
|
"loss": 0.8908,
|
|
"num_input_tokens_seen": 497254400,
|
|
"step": 60700
|
|
},
|
|
{
|
|
"epoch": 1.947594336600679,
|
|
"grad_norm": 0.663779079914093,
|
|
"learning_rate": 1.3972473858278184e-05,
|
|
"loss": 0.8845,
|
|
"num_input_tokens_seen": 498073600,
|
|
"step": 60800
|
|
},
|
|
{
|
|
"epoch": 1.9507976167595618,
|
|
"grad_norm": 1.9937938451766968,
|
|
"learning_rate": 1.3896462812038168e-05,
|
|
"loss": 0.8902,
|
|
"num_input_tokens_seen": 498892800,
|
|
"step": 60900
|
|
},
|
|
{
|
|
"epoch": 1.9540008969184446,
|
|
"grad_norm": 0.5911014676094055,
|
|
"learning_rate": 1.3820579419182838e-05,
|
|
"loss": 0.9283,
|
|
"num_input_tokens_seen": 499712000,
|
|
"step": 61000
|
|
},
|
|
{
|
|
"epoch": 1.9572041770773272,
|
|
"grad_norm": 0.680264949798584,
|
|
"learning_rate": 1.3744824552116343e-05,
|
|
"loss": 0.9166,
|
|
"num_input_tokens_seen": 500531200,
|
|
"step": 61100
|
|
},
|
|
{
|
|
"epoch": 1.9604074572362098,
|
|
"grad_norm": 0.5298569202423096,
|
|
"learning_rate": 1.3669199081765232e-05,
|
|
"loss": 0.9069,
|
|
"num_input_tokens_seen": 501350400,
|
|
"step": 61200
|
|
},
|
|
{
|
|
"epoch": 1.9636107373950926,
|
|
"grad_norm": 2.5101547241210938,
|
|
"learning_rate": 1.3593703877568407e-05,
|
|
"loss": 0.9138,
|
|
"num_input_tokens_seen": 502169600,
|
|
"step": 61300
|
|
},
|
|
{
|
|
"epoch": 1.9668140175539752,
|
|
"grad_norm": 1.6266756057739258,
|
|
"learning_rate": 1.3518339807467138e-05,
|
|
"loss": 0.8311,
|
|
"num_input_tokens_seen": 502988800,
|
|
"step": 61400
|
|
},
|
|
{
|
|
"epoch": 1.9700172977128578,
|
|
"grad_norm": 0.6949862241744995,
|
|
"learning_rate": 1.3443107737895121e-05,
|
|
"loss": 0.9508,
|
|
"num_input_tokens_seen": 503808000,
|
|
"step": 61500
|
|
},
|
|
{
|
|
"epoch": 1.9732205778717407,
|
|
"grad_norm": 1.9142687320709229,
|
|
"learning_rate": 1.3368008533768478e-05,
|
|
"loss": 0.8986,
|
|
"num_input_tokens_seen": 504627200,
|
|
"step": 61600
|
|
},
|
|
{
|
|
"epoch": 1.9764238580306235,
|
|
"grad_norm": 1.5811573266983032,
|
|
"learning_rate": 1.3293043058475835e-05,
|
|
"loss": 0.8775,
|
|
"num_input_tokens_seen": 505446400,
|
|
"step": 61700
|
|
},
|
|
{
|
|
"epoch": 1.979627138189506,
|
|
"grad_norm": 0.5435724258422852,
|
|
"learning_rate": 1.321821217386836e-05,
|
|
"loss": 0.8588,
|
|
"num_input_tokens_seen": 506265600,
|
|
"step": 61800
|
|
},
|
|
{
|
|
"epoch": 1.9828304183483887,
|
|
"grad_norm": 0.5689346194267273,
|
|
"learning_rate": 1.314351674024989e-05,
|
|
"loss": 0.9,
|
|
"num_input_tokens_seen": 507084800,
|
|
"step": 61900
|
|
},
|
|
{
|
|
"epoch": 1.9860336985072715,
|
|
"grad_norm": 0.5658956170082092,
|
|
"learning_rate": 1.3068957616367045e-05,
|
|
"loss": 0.8931,
|
|
"num_input_tokens_seen": 507904000,
|
|
"step": 62000
|
|
},
|
|
{
|
|
"epoch": 1.9892369786661541,
|
|
"grad_norm": 0.6352538466453552,
|
|
"learning_rate": 1.2994535659399327e-05,
|
|
"loss": 0.9254,
|
|
"num_input_tokens_seen": 508723200,
|
|
"step": 62100
|
|
},
|
|
{
|
|
"epoch": 1.9924402588250367,
|
|
"grad_norm": 1.6909618377685547,
|
|
"learning_rate": 1.2920251724949296e-05,
|
|
"loss": 0.8628,
|
|
"num_input_tokens_seen": 509542400,
|
|
"step": 62200
|
|
},
|
|
{
|
|
"epoch": 1.9956435389839196,
|
|
"grad_norm": 0.6590949892997742,
|
|
"learning_rate": 1.2846106667032693e-05,
|
|
"loss": 0.8509,
|
|
"num_input_tokens_seen": 510361600,
|
|
"step": 62300
|
|
},
|
|
{
|
|
"epoch": 1.9988468191428024,
|
|
"grad_norm": 2.059828042984009,
|
|
"learning_rate": 1.2772101338068649e-05,
|
|
"loss": 0.8547,
|
|
"num_input_tokens_seen": 511180800,
|
|
"step": 62400
|
|
},
|
|
{
|
|
"epoch": 2.0020500993016848,
|
|
"grad_norm": 0.8146264553070068,
|
|
"learning_rate": 1.2698236588869894e-05,
|
|
"loss": 0.8274,
|
|
"num_input_tokens_seen": 512000000,
|
|
"step": 62500
|
|
},
|
|
{
|
|
"epoch": 2.0052533794605676,
|
|
"grad_norm": 0.5894434452056885,
|
|
"learning_rate": 1.2624513268632967e-05,
|
|
"loss": 0.8213,
|
|
"num_input_tokens_seen": 512819200,
|
|
"step": 62600
|
|
},
|
|
{
|
|
"epoch": 2.0084566596194504,
|
|
"grad_norm": 1.9424681663513184,
|
|
"learning_rate": 1.2550932224928425e-05,
|
|
"loss": 0.8608,
|
|
"num_input_tokens_seen": 513638400,
|
|
"step": 62700
|
|
},
|
|
{
|
|
"epoch": 2.011659939778333,
|
|
"grad_norm": 0.6579126715660095,
|
|
"learning_rate": 1.2477494303691157e-05,
|
|
"loss": 0.836,
|
|
"num_input_tokens_seen": 514457600,
|
|
"step": 62800
|
|
},
|
|
{
|
|
"epoch": 2.0148632199372156,
|
|
"grad_norm": 0.5051004886627197,
|
|
"learning_rate": 1.2404200349210577e-05,
|
|
"loss": 0.8208,
|
|
"num_input_tokens_seen": 515276800,
|
|
"step": 62900
|
|
},
|
|
{
|
|
"epoch": 2.0180665000960984,
|
|
"grad_norm": 0.6397780179977417,
|
|
"learning_rate": 1.2331051204121009e-05,
|
|
"loss": 0.8293,
|
|
"num_input_tokens_seen": 516096000,
|
|
"step": 63000
|
|
},
|
|
{
|
|
"epoch": 2.0212697802549813,
|
|
"grad_norm": 0.7705442309379578,
|
|
"learning_rate": 1.2258047709391945e-05,
|
|
"loss": 0.8663,
|
|
"num_input_tokens_seen": 516915200,
|
|
"step": 63100
|
|
},
|
|
{
|
|
"epoch": 2.0244730604138637,
|
|
"grad_norm": 0.711100697517395,
|
|
"learning_rate": 1.218519070431836e-05,
|
|
"loss": 0.8186,
|
|
"num_input_tokens_seen": 517734400,
|
|
"step": 63200
|
|
},
|
|
{
|
|
"epoch": 2.0276763405727465,
|
|
"grad_norm": 0.6769080758094788,
|
|
"learning_rate": 1.2112481026511138e-05,
|
|
"loss": 0.8468,
|
|
"num_input_tokens_seen": 518553600,
|
|
"step": 63300
|
|
},
|
|
{
|
|
"epoch": 2.0308796207316293,
|
|
"grad_norm": 0.7686530351638794,
|
|
"learning_rate": 1.2039919511887338e-05,
|
|
"loss": 0.7955,
|
|
"num_input_tokens_seen": 519372800,
|
|
"step": 63400
|
|
},
|
|
{
|
|
"epoch": 2.0340829008905117,
|
|
"grad_norm": 0.826252281665802,
|
|
"learning_rate": 1.1967506994660685e-05,
|
|
"loss": 0.8313,
|
|
"num_input_tokens_seen": 520192000,
|
|
"step": 63500
|
|
},
|
|
{
|
|
"epoch": 2.0372861810493945,
|
|
"grad_norm": 1.5545631647109985,
|
|
"learning_rate": 1.1895244307331923e-05,
|
|
"loss": 0.8387,
|
|
"num_input_tokens_seen": 521011200,
|
|
"step": 63600
|
|
},
|
|
{
|
|
"epoch": 2.0404894612082773,
|
|
"grad_norm": 2.142545461654663,
|
|
"learning_rate": 1.1823132280679235e-05,
|
|
"loss": 0.8087,
|
|
"num_input_tokens_seen": 521830400,
|
|
"step": 63700
|
|
},
|
|
{
|
|
"epoch": 2.04369274136716,
|
|
"grad_norm": 1.7032113075256348,
|
|
"learning_rate": 1.1751171743748737e-05,
|
|
"loss": 0.8357,
|
|
"num_input_tokens_seen": 522649600,
|
|
"step": 63800
|
|
},
|
|
{
|
|
"epoch": 2.0468960215260426,
|
|
"grad_norm": 0.6579723358154297,
|
|
"learning_rate": 1.1679363523844918e-05,
|
|
"loss": 0.8435,
|
|
"num_input_tokens_seen": 523468800,
|
|
"step": 63900
|
|
},
|
|
{
|
|
"epoch": 2.0500993016849254,
|
|
"grad_norm": 0.6495528817176819,
|
|
"learning_rate": 1.1607708446521125e-05,
|
|
"loss": 0.8702,
|
|
"num_input_tokens_seen": 524288000,
|
|
"step": 64000
|
|
},
|
|
{
|
|
"epoch": 2.053302581843808,
|
|
"grad_norm": 0.5699741840362549,
|
|
"learning_rate": 1.153620733557007e-05,
|
|
"loss": 0.8436,
|
|
"num_input_tokens_seen": 525107200,
|
|
"step": 64100
|
|
},
|
|
{
|
|
"epoch": 2.0565058620026906,
|
|
"grad_norm": 0.5475245118141174,
|
|
"learning_rate": 1.1464861013014391e-05,
|
|
"loss": 0.825,
|
|
"num_input_tokens_seen": 525926400,
|
|
"step": 64200
|
|
},
|
|
{
|
|
"epoch": 2.0597091421615734,
|
|
"grad_norm": 2.3118770122528076,
|
|
"learning_rate": 1.139367029909717e-05,
|
|
"loss": 0.8469,
|
|
"num_input_tokens_seen": 526745600,
|
|
"step": 64300
|
|
},
|
|
{
|
|
"epoch": 2.0629124223204562,
|
|
"grad_norm": 0.7807962894439697,
|
|
"learning_rate": 1.1322636012272517e-05,
|
|
"loss": 0.8397,
|
|
"num_input_tokens_seen": 527564800,
|
|
"step": 64400
|
|
},
|
|
{
|
|
"epoch": 2.0661157024793386,
|
|
"grad_norm": 1.0216293334960938,
|
|
"learning_rate": 1.1251758969196147e-05,
|
|
"loss": 0.7898,
|
|
"num_input_tokens_seen": 528384000,
|
|
"step": 64500
|
|
},
|
|
{
|
|
"epoch": 2.0693189826382214,
|
|
"grad_norm": 0.7191298604011536,
|
|
"learning_rate": 1.1181039984715991e-05,
|
|
"loss": 0.8449,
|
|
"num_input_tokens_seen": 529203200,
|
|
"step": 64600
|
|
},
|
|
{
|
|
"epoch": 2.0725222627971043,
|
|
"grad_norm": 0.4787365198135376,
|
|
"learning_rate": 1.1110479871862862e-05,
|
|
"loss": 0.7879,
|
|
"num_input_tokens_seen": 530022400,
|
|
"step": 64700
|
|
},
|
|
{
|
|
"epoch": 2.075725542955987,
|
|
"grad_norm": 0.7449747323989868,
|
|
"learning_rate": 1.1040079441841065e-05,
|
|
"loss": 0.866,
|
|
"num_input_tokens_seen": 530841600,
|
|
"step": 64800
|
|
},
|
|
{
|
|
"epoch": 2.0789288231148695,
|
|
"grad_norm": 0.7580021619796753,
|
|
"learning_rate": 1.0969839504019108e-05,
|
|
"loss": 0.851,
|
|
"num_input_tokens_seen": 531660800,
|
|
"step": 64900
|
|
},
|
|
{
|
|
"epoch": 2.0821321032737523,
|
|
"grad_norm": 0.6036601662635803,
|
|
"learning_rate": 1.0899760865920355e-05,
|
|
"loss": 0.814,
|
|
"num_input_tokens_seen": 532480000,
|
|
"step": 65000
|
|
},
|
|
{
|
|
"epoch": 2.085335383432635,
|
|
"grad_norm": 0.553875207901001,
|
|
"learning_rate": 1.0829844333213766e-05,
|
|
"loss": 0.8307,
|
|
"num_input_tokens_seen": 533299200,
|
|
"step": 65100
|
|
},
|
|
{
|
|
"epoch": 2.0885386635915175,
|
|
"grad_norm": 0.6239012479782104,
|
|
"learning_rate": 1.0760090709704642e-05,
|
|
"loss": 0.8406,
|
|
"num_input_tokens_seen": 534118400,
|
|
"step": 65200
|
|
},
|
|
{
|
|
"epoch": 2.0917419437504003,
|
|
"grad_norm": 0.8101912140846252,
|
|
"learning_rate": 1.0690500797325387e-05,
|
|
"loss": 0.8263,
|
|
"num_input_tokens_seen": 534937600,
|
|
"step": 65300
|
|
},
|
|
{
|
|
"epoch": 2.094945223909283,
|
|
"grad_norm": 0.827496349811554,
|
|
"learning_rate": 1.0621075396126265e-05,
|
|
"loss": 0.7959,
|
|
"num_input_tokens_seen": 535756800,
|
|
"step": 65400
|
|
},
|
|
{
|
|
"epoch": 2.098148504068166,
|
|
"grad_norm": 0.7722252607345581,
|
|
"learning_rate": 1.055181530426621e-05,
|
|
"loss": 0.8417,
|
|
"num_input_tokens_seen": 536576000,
|
|
"step": 65500
|
|
},
|
|
{
|
|
"epoch": 2.1013517842270484,
|
|
"grad_norm": 0.8276936411857605,
|
|
"learning_rate": 1.0482721318003644e-05,
|
|
"loss": 0.8267,
|
|
"num_input_tokens_seen": 537395200,
|
|
"step": 65600
|
|
},
|
|
{
|
|
"epoch": 2.104555064385931,
|
|
"grad_norm": 0.5818492770195007,
|
|
"learning_rate": 1.0413794231687357e-05,
|
|
"loss": 0.811,
|
|
"num_input_tokens_seen": 538214400,
|
|
"step": 65700
|
|
},
|
|
{
|
|
"epoch": 2.107758344544814,
|
|
"grad_norm": 1.9946190118789673,
|
|
"learning_rate": 1.0345034837747342e-05,
|
|
"loss": 0.8376,
|
|
"num_input_tokens_seen": 539033600,
|
|
"step": 65800
|
|
},
|
|
{
|
|
"epoch": 2.1109616247036964,
|
|
"grad_norm": 0.5959033370018005,
|
|
"learning_rate": 1.0276443926685694e-05,
|
|
"loss": 0.8641,
|
|
"num_input_tokens_seen": 539852800,
|
|
"step": 65900
|
|
},
|
|
{
|
|
"epoch": 2.1141649048625792,
|
|
"grad_norm": 0.9433934092521667,
|
|
"learning_rate": 1.0208022287067509e-05,
|
|
"loss": 0.8445,
|
|
"num_input_tokens_seen": 540672000,
|
|
"step": 66000
|
|
},
|
|
{
|
|
"epoch": 2.117368185021462,
|
|
"grad_norm": 1.3814393281936646,
|
|
"learning_rate": 1.0139770705511833e-05,
|
|
"loss": 0.8783,
|
|
"num_input_tokens_seen": 541491200,
|
|
"step": 66100
|
|
},
|
|
{
|
|
"epoch": 2.120571465180345,
|
|
"grad_norm": 0.5552910566329956,
|
|
"learning_rate": 1.0071689966682623e-05,
|
|
"loss": 0.7836,
|
|
"num_input_tokens_seen": 542310400,
|
|
"step": 66200
|
|
},
|
|
{
|
|
"epoch": 2.1237747453392273,
|
|
"grad_norm": 0.6831013560295105,
|
|
"learning_rate": 1.0003780853279732e-05,
|
|
"loss": 0.8143,
|
|
"num_input_tokens_seen": 543129600,
|
|
"step": 66300
|
|
},
|
|
{
|
|
"epoch": 2.12697802549811,
|
|
"grad_norm": 1.8912497758865356,
|
|
"learning_rate": 9.936044146029855e-06,
|
|
"loss": 0.8582,
|
|
"num_input_tokens_seen": 543948800,
|
|
"step": 66400
|
|
},
|
|
{
|
|
"epoch": 2.130181305656993,
|
|
"grad_norm": 0.6759600639343262,
|
|
"learning_rate": 9.868480623677643e-06,
|
|
"loss": 0.8295,
|
|
"num_input_tokens_seen": 544768000,
|
|
"step": 66500
|
|
},
|
|
{
|
|
"epoch": 2.1333845858158753,
|
|
"grad_norm": 0.6555814146995544,
|
|
"learning_rate": 9.801091062976665e-06,
|
|
"loss": 0.7856,
|
|
"num_input_tokens_seen": 545587200,
|
|
"step": 66600
|
|
},
|
|
{
|
|
"epoch": 2.136587865974758,
|
|
"grad_norm": 0.7342298626899719,
|
|
"learning_rate": 9.733876238680531e-06,
|
|
"loss": 0.8144,
|
|
"num_input_tokens_seen": 546406400,
|
|
"step": 66700
|
|
},
|
|
{
|
|
"epoch": 2.139791146133641,
|
|
"grad_norm": 1.6135506629943848,
|
|
"learning_rate": 9.666836923533987e-06,
|
|
"loss": 0.7658,
|
|
"num_input_tokens_seen": 547225600,
|
|
"step": 66800
|
|
},
|
|
{
|
|
"epoch": 2.1429944262925233,
|
|
"grad_norm": 0.6479013562202454,
|
|
"learning_rate": 9.599973888263972e-06,
|
|
"loss": 0.7818,
|
|
"num_input_tokens_seen": 548044800,
|
|
"step": 66900
|
|
},
|
|
{
|
|
"epoch": 2.146197706451406,
|
|
"grad_norm": 0.8639338612556458,
|
|
"learning_rate": 9.533287901570843e-06,
|
|
"loss": 0.8259,
|
|
"num_input_tokens_seen": 548864000,
|
|
"step": 67000
|
|
},
|
|
{
|
|
"epoch": 2.149400986610289,
|
|
"grad_norm": 0.852070152759552,
|
|
"learning_rate": 9.466779730119449e-06,
|
|
"loss": 0.84,
|
|
"num_input_tokens_seen": 549683200,
|
|
"step": 67100
|
|
},
|
|
{
|
|
"epoch": 2.152604266769172,
|
|
"grad_norm": 0.8585788607597351,
|
|
"learning_rate": 9.400450138530394e-06,
|
|
"loss": 0.8595,
|
|
"num_input_tokens_seen": 550502400,
|
|
"step": 67200
|
|
},
|
|
{
|
|
"epoch": 2.155807546928054,
|
|
"grad_norm": 2.652194023132324,
|
|
"learning_rate": 9.334299889371217e-06,
|
|
"loss": 0.8404,
|
|
"num_input_tokens_seen": 551321600,
|
|
"step": 67300
|
|
},
|
|
{
|
|
"epoch": 2.159010827086937,
|
|
"grad_norm": 0.6588045954704285,
|
|
"learning_rate": 9.268329743147583e-06,
|
|
"loss": 0.7933,
|
|
"num_input_tokens_seen": 552140800,
|
|
"step": 67400
|
|
},
|
|
{
|
|
"epoch": 2.16221410724582,
|
|
"grad_norm": 2.807159423828125,
|
|
"learning_rate": 9.202540458294623e-06,
|
|
"loss": 0.8066,
|
|
"num_input_tokens_seen": 552960000,
|
|
"step": 67500
|
|
},
|
|
{
|
|
"epoch": 2.1654173874047022,
|
|
"grad_norm": 0.7351047396659851,
|
|
"learning_rate": 9.136932791168132e-06,
|
|
"loss": 0.8831,
|
|
"num_input_tokens_seen": 553779200,
|
|
"step": 67600
|
|
},
|
|
{
|
|
"epoch": 2.168620667563585,
|
|
"grad_norm": 0.6064037084579468,
|
|
"learning_rate": 9.071507496035943e-06,
|
|
"loss": 0.7602,
|
|
"num_input_tokens_seen": 554598400,
|
|
"step": 67700
|
|
},
|
|
{
|
|
"epoch": 2.171823947722468,
|
|
"grad_norm": 0.6641263365745544,
|
|
"learning_rate": 9.006265325069197e-06,
|
|
"loss": 0.7984,
|
|
"num_input_tokens_seen": 555417600,
|
|
"step": 67800
|
|
},
|
|
{
|
|
"epoch": 2.1750272278813503,
|
|
"grad_norm": 0.6006192564964294,
|
|
"learning_rate": 8.941207028333737e-06,
|
|
"loss": 0.7831,
|
|
"num_input_tokens_seen": 556236800,
|
|
"step": 67900
|
|
},
|
|
{
|
|
"epoch": 2.178230508040233,
|
|
"grad_norm": 0.6849149465560913,
|
|
"learning_rate": 8.876333353781468e-06,
|
|
"loss": 0.829,
|
|
"num_input_tokens_seen": 557056000,
|
|
"step": 68000
|
|
},
|
|
{
|
|
"epoch": 2.181433788199116,
|
|
"grad_norm": 0.7569016218185425,
|
|
"learning_rate": 8.811645047241767e-06,
|
|
"loss": 0.8623,
|
|
"num_input_tokens_seen": 557875200,
|
|
"step": 68100
|
|
},
|
|
{
|
|
"epoch": 2.1846370683579988,
|
|
"grad_norm": 0.7035521268844604,
|
|
"learning_rate": 8.74714285241289e-06,
|
|
"loss": 0.8444,
|
|
"num_input_tokens_seen": 558694400,
|
|
"step": 68200
|
|
},
|
|
{
|
|
"epoch": 2.187840348516881,
|
|
"grad_norm": 0.7252819538116455,
|
|
"learning_rate": 8.682827510853426e-06,
|
|
"loss": 0.8287,
|
|
"num_input_tokens_seen": 559513600,
|
|
"step": 68300
|
|
},
|
|
{
|
|
"epoch": 2.191043628675764,
|
|
"grad_norm": 0.5455666780471802,
|
|
"learning_rate": 8.618699761973792e-06,
|
|
"loss": 0.7785,
|
|
"num_input_tokens_seen": 560332800,
|
|
"step": 68400
|
|
},
|
|
{
|
|
"epoch": 2.194246908834647,
|
|
"grad_norm": 0.8008429408073425,
|
|
"learning_rate": 8.554760343027724e-06,
|
|
"loss": 0.8595,
|
|
"num_input_tokens_seen": 561152000,
|
|
"step": 68500
|
|
},
|
|
{
|
|
"epoch": 2.197450188993529,
|
|
"grad_norm": 0.755208432674408,
|
|
"learning_rate": 8.491009989103796e-06,
|
|
"loss": 0.8538,
|
|
"num_input_tokens_seen": 561971200,
|
|
"step": 68600
|
|
},
|
|
{
|
|
"epoch": 2.200653469152412,
|
|
"grad_norm": 0.5776748657226562,
|
|
"learning_rate": 8.427449433116952e-06,
|
|
"loss": 0.8333,
|
|
"num_input_tokens_seen": 562790400,
|
|
"step": 68700
|
|
},
|
|
{
|
|
"epoch": 2.203856749311295,
|
|
"grad_norm": 0.6535948514938354,
|
|
"learning_rate": 8.364079405800105e-06,
|
|
"loss": 0.8281,
|
|
"num_input_tokens_seen": 563609600,
|
|
"step": 68800
|
|
},
|
|
{
|
|
"epoch": 2.2070600294701777,
|
|
"grad_norm": 0.5949485898017883,
|
|
"learning_rate": 8.30090063569573e-06,
|
|
"loss": 0.7887,
|
|
"num_input_tokens_seen": 564428800,
|
|
"step": 68900
|
|
},
|
|
{
|
|
"epoch": 2.21026330962906,
|
|
"grad_norm": 3.0284650325775146,
|
|
"learning_rate": 8.237913849147497e-06,
|
|
"loss": 0.8451,
|
|
"num_input_tokens_seen": 565248000,
|
|
"step": 69000
|
|
},
|
|
{
|
|
"epoch": 2.213466589787943,
|
|
"grad_norm": 0.5593298673629761,
|
|
"learning_rate": 8.1751197702919e-06,
|
|
"loss": 0.8596,
|
|
"num_input_tokens_seen": 566067200,
|
|
"step": 69100
|
|
},
|
|
{
|
|
"epoch": 2.2166698699468257,
|
|
"grad_norm": 0.670230507850647,
|
|
"learning_rate": 8.112519121049942e-06,
|
|
"loss": 0.8584,
|
|
"num_input_tokens_seen": 566886400,
|
|
"step": 69200
|
|
},
|
|
{
|
|
"epoch": 2.219873150105708,
|
|
"grad_norm": 1.34910249710083,
|
|
"learning_rate": 8.050112621118822e-06,
|
|
"loss": 0.8518,
|
|
"num_input_tokens_seen": 567705600,
|
|
"step": 69300
|
|
},
|
|
{
|
|
"epoch": 2.223076430264591,
|
|
"grad_norm": 0.6535902619361877,
|
|
"learning_rate": 7.987900987963695e-06,
|
|
"loss": 0.8544,
|
|
"num_input_tokens_seen": 568524800,
|
|
"step": 69400
|
|
},
|
|
{
|
|
"epoch": 2.2262797104234737,
|
|
"grad_norm": 0.594032883644104,
|
|
"learning_rate": 7.925884936809396e-06,
|
|
"loss": 0.8395,
|
|
"num_input_tokens_seen": 569344000,
|
|
"step": 69500
|
|
},
|
|
{
|
|
"epoch": 2.2294829905823565,
|
|
"grad_norm": 0.6679059863090515,
|
|
"learning_rate": 7.864065180632233e-06,
|
|
"loss": 0.8681,
|
|
"num_input_tokens_seen": 570163200,
|
|
"step": 69600
|
|
},
|
|
{
|
|
"epoch": 2.232686270741239,
|
|
"grad_norm": 0.5853981375694275,
|
|
"learning_rate": 7.802442430151757e-06,
|
|
"loss": 0.7735,
|
|
"num_input_tokens_seen": 570982400,
|
|
"step": 69700
|
|
},
|
|
{
|
|
"epoch": 2.2358895509001218,
|
|
"grad_norm": 1.4077626466751099,
|
|
"learning_rate": 7.741017393822628e-06,
|
|
"loss": 0.7853,
|
|
"num_input_tokens_seen": 571801600,
|
|
"step": 69800
|
|
},
|
|
{
|
|
"epoch": 2.2390928310590046,
|
|
"grad_norm": 0.6583539247512817,
|
|
"learning_rate": 7.679790777826459e-06,
|
|
"loss": 0.8403,
|
|
"num_input_tokens_seen": 572620800,
|
|
"step": 69900
|
|
},
|
|
{
|
|
"epoch": 2.242296111217887,
|
|
"grad_norm": 0.8946901559829712,
|
|
"learning_rate": 7.618763286063698e-06,
|
|
"loss": 0.8336,
|
|
"num_input_tokens_seen": 573440000,
|
|
"step": 70000
|
|
},
|
|
{
|
|
"epoch": 2.24549939137677,
|
|
"grad_norm": 0.7540560364723206,
|
|
"learning_rate": 7.55793562014554e-06,
|
|
"loss": 0.7682,
|
|
"num_input_tokens_seen": 574259200,
|
|
"step": 70100
|
|
},
|
|
{
|
|
"epoch": 2.2487026715356526,
|
|
"grad_norm": 0.7601240873336792,
|
|
"learning_rate": 7.497308479385831e-06,
|
|
"loss": 0.8367,
|
|
"num_input_tokens_seen": 575078400,
|
|
"step": 70200
|
|
},
|
|
{
|
|
"epoch": 2.2519059516945354,
|
|
"grad_norm": 0.7198605537414551,
|
|
"learning_rate": 7.43688256079306e-06,
|
|
"loss": 0.8119,
|
|
"num_input_tokens_seen": 575897600,
|
|
"step": 70300
|
|
},
|
|
{
|
|
"epoch": 2.255109231853418,
|
|
"grad_norm": 0.7405291199684143,
|
|
"learning_rate": 7.376658559062349e-06,
|
|
"loss": 0.8231,
|
|
"num_input_tokens_seen": 576716800,
|
|
"step": 70400
|
|
},
|
|
{
|
|
"epoch": 2.2583125120123007,
|
|
"grad_norm": 0.6844334602355957,
|
|
"learning_rate": 7.31663716656745e-06,
|
|
"loss": 0.852,
|
|
"num_input_tokens_seen": 577536000,
|
|
"step": 70500
|
|
},
|
|
{
|
|
"epoch": 2.2615157921711835,
|
|
"grad_norm": 3.182279348373413,
|
|
"learning_rate": 7.256819073352775e-06,
|
|
"loss": 0.82,
|
|
"num_input_tokens_seen": 578355200,
|
|
"step": 70600
|
|
},
|
|
{
|
|
"epoch": 2.264719072330066,
|
|
"grad_norm": 0.7010332345962524,
|
|
"learning_rate": 7.197204967125498e-06,
|
|
"loss": 0.8417,
|
|
"num_input_tokens_seen": 579174400,
|
|
"step": 70700
|
|
},
|
|
{
|
|
"epoch": 2.2679223524889487,
|
|
"grad_norm": 3.276526927947998,
|
|
"learning_rate": 7.137795533247604e-06,
|
|
"loss": 0.8252,
|
|
"num_input_tokens_seen": 579993600,
|
|
"step": 70800
|
|
},
|
|
{
|
|
"epoch": 2.2711256326478315,
|
|
"grad_norm": 0.6692455410957336,
|
|
"learning_rate": 7.078591454728056e-06,
|
|
"loss": 0.8195,
|
|
"num_input_tokens_seen": 580812800,
|
|
"step": 70900
|
|
},
|
|
{
|
|
"epoch": 2.274328912806714,
|
|
"grad_norm": 0.6837947368621826,
|
|
"learning_rate": 7.019593412214914e-06,
|
|
"loss": 0.8012,
|
|
"num_input_tokens_seen": 581632000,
|
|
"step": 71000
|
|
},
|
|
{
|
|
"epoch": 2.2775321929655967,
|
|
"grad_norm": 0.8453261256217957,
|
|
"learning_rate": 6.960802083987503e-06,
|
|
"loss": 0.8097,
|
|
"num_input_tokens_seen": 582451200,
|
|
"step": 71100
|
|
},
|
|
{
|
|
"epoch": 2.2807354731244796,
|
|
"grad_norm": 0.7615090608596802,
|
|
"learning_rate": 6.902218145948647e-06,
|
|
"loss": 0.8216,
|
|
"num_input_tokens_seen": 583270400,
|
|
"step": 71200
|
|
},
|
|
{
|
|
"epoch": 2.283938753283362,
|
|
"grad_norm": 2.4880526065826416,
|
|
"learning_rate": 6.8438422716168595e-06,
|
|
"loss": 0.829,
|
|
"num_input_tokens_seen": 584089600,
|
|
"step": 71300
|
|
},
|
|
{
|
|
"epoch": 2.2871420334422448,
|
|
"grad_norm": 2.184436798095703,
|
|
"learning_rate": 6.785675132118638e-06,
|
|
"loss": 0.8557,
|
|
"num_input_tokens_seen": 584908800,
|
|
"step": 71400
|
|
},
|
|
{
|
|
"epoch": 2.2903453136011276,
|
|
"grad_norm": 0.6513957977294922,
|
|
"learning_rate": 6.72771739618073e-06,
|
|
"loss": 0.8199,
|
|
"num_input_tokens_seen": 585728000,
|
|
"step": 71500
|
|
},
|
|
{
|
|
"epoch": 2.2935485937600104,
|
|
"grad_norm": 2.187042713165283,
|
|
"learning_rate": 6.6699697301224214e-06,
|
|
"loss": 0.876,
|
|
"num_input_tokens_seen": 586547200,
|
|
"step": 71600
|
|
},
|
|
{
|
|
"epoch": 2.296751873918893,
|
|
"grad_norm": 0.6848201751708984,
|
|
"learning_rate": 6.612432797847937e-06,
|
|
"loss": 0.8013,
|
|
"num_input_tokens_seen": 587366400,
|
|
"step": 71700
|
|
},
|
|
{
|
|
"epoch": 2.2999551540777756,
|
|
"grad_norm": 0.9538524150848389,
|
|
"learning_rate": 6.55510726083873e-06,
|
|
"loss": 0.7922,
|
|
"num_input_tokens_seen": 588185600,
|
|
"step": 71800
|
|
},
|
|
{
|
|
"epoch": 2.3031584342366584,
|
|
"grad_norm": 0.6234622597694397,
|
|
"learning_rate": 6.4979937781459586e-06,
|
|
"loss": 0.7617,
|
|
"num_input_tokens_seen": 589004800,
|
|
"step": 71900
|
|
},
|
|
{
|
|
"epoch": 2.306361714395541,
|
|
"grad_norm": 0.7952730655670166,
|
|
"learning_rate": 6.441093006382831e-06,
|
|
"loss": 0.8744,
|
|
"num_input_tokens_seen": 589824000,
|
|
"step": 72000
|
|
},
|
|
{
|
|
"epoch": 2.3095649945544237,
|
|
"grad_norm": 0.6471823453903198,
|
|
"learning_rate": 6.384405599717125e-06,
|
|
"loss": 0.7952,
|
|
"num_input_tokens_seen": 590643200,
|
|
"step": 72100
|
|
},
|
|
{
|
|
"epoch": 2.3127682747133065,
|
|
"grad_norm": 0.713498592376709,
|
|
"learning_rate": 6.327932209863618e-06,
|
|
"loss": 0.817,
|
|
"num_input_tokens_seen": 591462400,
|
|
"step": 72200
|
|
},
|
|
{
|
|
"epoch": 2.3159715548721893,
|
|
"grad_norm": 0.8223375678062439,
|
|
"learning_rate": 6.271673486076629e-06,
|
|
"loss": 0.8127,
|
|
"num_input_tokens_seen": 592281600,
|
|
"step": 72300
|
|
},
|
|
{
|
|
"epoch": 2.3191748350310717,
|
|
"grad_norm": 2.696056842803955,
|
|
"learning_rate": 6.215630075142523e-06,
|
|
"loss": 0.8191,
|
|
"num_input_tokens_seen": 593100800,
|
|
"step": 72400
|
|
},
|
|
{
|
|
"epoch": 2.3223781151899545,
|
|
"grad_norm": 0.6731551885604858,
|
|
"learning_rate": 6.159802621372279e-06,
|
|
"loss": 0.831,
|
|
"num_input_tokens_seen": 593920000,
|
|
"step": 72500
|
|
},
|
|
{
|
|
"epoch": 2.3255813953488373,
|
|
"grad_norm": 0.6898087859153748,
|
|
"learning_rate": 6.1041917665941275e-06,
|
|
"loss": 0.8249,
|
|
"num_input_tokens_seen": 594739200,
|
|
"step": 72600
|
|
},
|
|
{
|
|
"epoch": 2.3287846755077197,
|
|
"grad_norm": 0.6532519459724426,
|
|
"learning_rate": 6.048798150146112e-06,
|
|
"loss": 0.7416,
|
|
"num_input_tokens_seen": 595558400,
|
|
"step": 72700
|
|
},
|
|
{
|
|
"epoch": 2.3319879556666026,
|
|
"grad_norm": 0.6760110259056091,
|
|
"learning_rate": 5.993622408868788e-06,
|
|
"loss": 0.8451,
|
|
"num_input_tokens_seen": 596377600,
|
|
"step": 72800
|
|
},
|
|
{
|
|
"epoch": 2.3351912358254854,
|
|
"grad_norm": 2.732374668121338,
|
|
"learning_rate": 5.9386651770978516e-06,
|
|
"loss": 0.8654,
|
|
"num_input_tokens_seen": 597196800,
|
|
"step": 72900
|
|
},
|
|
{
|
|
"epoch": 2.338394515984368,
|
|
"grad_norm": 0.6297926306724548,
|
|
"learning_rate": 5.8839270866568816e-06,
|
|
"loss": 0.8397,
|
|
"num_input_tokens_seen": 598016000,
|
|
"step": 73000
|
|
},
|
|
{
|
|
"epoch": 2.3415977961432506,
|
|
"grad_norm": 0.5178629755973816,
|
|
"learning_rate": 5.829408766850078e-06,
|
|
"loss": 0.833,
|
|
"num_input_tokens_seen": 598835200,
|
|
"step": 73100
|
|
},
|
|
{
|
|
"epoch": 2.3448010763021334,
|
|
"grad_norm": 0.5522879958152771,
|
|
"learning_rate": 5.7751108444550066e-06,
|
|
"loss": 0.8174,
|
|
"num_input_tokens_seen": 599654400,
|
|
"step": 73200
|
|
},
|
|
{
|
|
"epoch": 2.3480043564610162,
|
|
"grad_norm": 0.6307721734046936,
|
|
"learning_rate": 5.7210339437154175e-06,
|
|
"loss": 0.7809,
|
|
"num_input_tokens_seen": 600473600,
|
|
"step": 73300
|
|
},
|
|
{
|
|
"epoch": 2.3512076366198986,
|
|
"grad_norm": 0.6830965876579285,
|
|
"learning_rate": 5.667178686334037e-06,
|
|
"loss": 0.8243,
|
|
"num_input_tokens_seen": 601292800,
|
|
"step": 73400
|
|
},
|
|
{
|
|
"epoch": 2.3544109167787814,
|
|
"grad_norm": 2.0725910663604736,
|
|
"learning_rate": 5.613545691465438e-06,
|
|
"loss": 0.7868,
|
|
"num_input_tokens_seen": 602112000,
|
|
"step": 73500
|
|
},
|
|
{
|
|
"epoch": 2.3576141969376643,
|
|
"grad_norm": 0.994819700717926,
|
|
"learning_rate": 5.560135575708927e-06,
|
|
"loss": 0.8176,
|
|
"num_input_tokens_seen": 602931200,
|
|
"step": 73600
|
|
},
|
|
{
|
|
"epoch": 2.360817477096547,
|
|
"grad_norm": 0.7025684714317322,
|
|
"learning_rate": 5.506948953101454e-06,
|
|
"loss": 0.8417,
|
|
"num_input_tokens_seen": 603750400,
|
|
"step": 73700
|
|
},
|
|
{
|
|
"epoch": 2.3640207572554295,
|
|
"grad_norm": 0.6975109577178955,
|
|
"learning_rate": 5.45398643511055e-06,
|
|
"loss": 0.8552,
|
|
"num_input_tokens_seen": 604569600,
|
|
"step": 73800
|
|
},
|
|
{
|
|
"epoch": 2.3672240374143123,
|
|
"grad_norm": 0.6180407404899597,
|
|
"learning_rate": 5.401248630627282e-06,
|
|
"loss": 0.8423,
|
|
"num_input_tokens_seen": 605388800,
|
|
"step": 73900
|
|
},
|
|
{
|
|
"epoch": 2.370427317573195,
|
|
"grad_norm": 0.8194453716278076,
|
|
"learning_rate": 5.3487361459592626e-06,
|
|
"loss": 0.8278,
|
|
"num_input_tokens_seen": 606208000,
|
|
"step": 74000
|
|
},
|
|
{
|
|
"epoch": 2.3736305977320775,
|
|
"grad_norm": 0.6039137244224548,
|
|
"learning_rate": 5.296449584823707e-06,
|
|
"loss": 0.8354,
|
|
"num_input_tokens_seen": 607027200,
|
|
"step": 74100
|
|
},
|
|
{
|
|
"epoch": 2.3768338778909603,
|
|
"grad_norm": 0.6407757997512817,
|
|
"learning_rate": 5.244389548340456e-06,
|
|
"loss": 0.8292,
|
|
"num_input_tokens_seen": 607846400,
|
|
"step": 74200
|
|
},
|
|
{
|
|
"epoch": 2.380037158049843,
|
|
"grad_norm": 1.9735205173492432,
|
|
"learning_rate": 5.19255663502507e-06,
|
|
"loss": 0.8604,
|
|
"num_input_tokens_seen": 608665600,
|
|
"step": 74300
|
|
},
|
|
{
|
|
"epoch": 2.3832404382087256,
|
|
"grad_norm": 0.7297560572624207,
|
|
"learning_rate": 5.1409514407819745e-06,
|
|
"loss": 0.8464,
|
|
"num_input_tokens_seen": 609484800,
|
|
"step": 74400
|
|
},
|
|
{
|
|
"epoch": 2.3864437183676084,
|
|
"grad_norm": 0.641272246837616,
|
|
"learning_rate": 5.089574558897564e-06,
|
|
"loss": 0.8711,
|
|
"num_input_tokens_seen": 610304000,
|
|
"step": 74500
|
|
},
|
|
{
|
|
"epoch": 2.389646998526491,
|
|
"grad_norm": 0.5732747316360474,
|
|
"learning_rate": 5.038426580033431e-06,
|
|
"loss": 0.8357,
|
|
"num_input_tokens_seen": 611123200,
|
|
"step": 74600
|
|
},
|
|
{
|
|
"epoch": 2.3928502786853736,
|
|
"grad_norm": 0.7175111770629883,
|
|
"learning_rate": 4.98750809221955e-06,
|
|
"loss": 0.8782,
|
|
"num_input_tokens_seen": 611942400,
|
|
"step": 74700
|
|
},
|
|
{
|
|
"epoch": 2.3960535588442564,
|
|
"grad_norm": 0.6939539909362793,
|
|
"learning_rate": 4.936819680847499e-06,
|
|
"loss": 0.8051,
|
|
"num_input_tokens_seen": 612761600,
|
|
"step": 74800
|
|
},
|
|
{
|
|
"epoch": 2.3992568390031392,
|
|
"grad_norm": 0.9897929430007935,
|
|
"learning_rate": 4.886361928663779e-06,
|
|
"loss": 0.8208,
|
|
"num_input_tokens_seen": 613580800,
|
|
"step": 74900
|
|
},
|
|
{
|
|
"epoch": 2.402460119162022,
|
|
"grad_norm": 1.3492214679718018,
|
|
"learning_rate": 4.836135415763054e-06,
|
|
"loss": 0.8081,
|
|
"num_input_tokens_seen": 614400000,
|
|
"step": 75000
|
|
},
|
|
{
|
|
"epoch": 2.4056633993209044,
|
|
"grad_norm": 0.6165256500244141,
|
|
"learning_rate": 4.786140719581539e-06,
|
|
"loss": 0.8612,
|
|
"num_input_tokens_seen": 615219200,
|
|
"step": 75100
|
|
},
|
|
{
|
|
"epoch": 2.4088666794797873,
|
|
"grad_norm": 0.7315238118171692,
|
|
"learning_rate": 4.73637841489033e-06,
|
|
"loss": 0.8201,
|
|
"num_input_tokens_seen": 616038400,
|
|
"step": 75200
|
|
},
|
|
{
|
|
"epoch": 2.41206995963867,
|
|
"grad_norm": 0.5693472027778625,
|
|
"learning_rate": 4.686849073788782e-06,
|
|
"loss": 0.8319,
|
|
"num_input_tokens_seen": 616857600,
|
|
"step": 75300
|
|
},
|
|
{
|
|
"epoch": 2.4152732397975525,
|
|
"grad_norm": 1.28626549243927,
|
|
"learning_rate": 4.637553265697978e-06,
|
|
"loss": 0.8012,
|
|
"num_input_tokens_seen": 617676800,
|
|
"step": 75400
|
|
},
|
|
{
|
|
"epoch": 2.4184765199564353,
|
|
"grad_norm": 3.020348072052002,
|
|
"learning_rate": 4.5884915573541326e-06,
|
|
"loss": 0.8216,
|
|
"num_input_tokens_seen": 618496000,
|
|
"step": 75500
|
|
},
|
|
{
|
|
"epoch": 2.421679800115318,
|
|
"grad_norm": 1.7923747301101685,
|
|
"learning_rate": 4.539664512802125e-06,
|
|
"loss": 0.8269,
|
|
"num_input_tokens_seen": 619315200,
|
|
"step": 75600
|
|
},
|
|
{
|
|
"epoch": 2.424883080274201,
|
|
"grad_norm": 0.6749047636985779,
|
|
"learning_rate": 4.491072693388957e-06,
|
|
"loss": 0.7949,
|
|
"num_input_tokens_seen": 620134400,
|
|
"step": 75700
|
|
},
|
|
{
|
|
"epoch": 2.4280863604330833,
|
|
"grad_norm": 0.8918429613113403,
|
|
"learning_rate": 4.442716657757354e-06,
|
|
"loss": 0.8153,
|
|
"num_input_tokens_seen": 620953600,
|
|
"step": 75800
|
|
},
|
|
{
|
|
"epoch": 2.431289640591966,
|
|
"grad_norm": 0.8165135383605957,
|
|
"learning_rate": 4.3945969618393255e-06,
|
|
"loss": 0.8063,
|
|
"num_input_tokens_seen": 621772800,
|
|
"step": 75900
|
|
},
|
|
{
|
|
"epoch": 2.434492920750849,
|
|
"grad_norm": 2.7509946823120117,
|
|
"learning_rate": 4.346714158849744e-06,
|
|
"loss": 0.7779,
|
|
"num_input_tokens_seen": 622592000,
|
|
"step": 76000
|
|
},
|
|
{
|
|
"epoch": 2.4376962009097314,
|
|
"grad_norm": 1.2128119468688965,
|
|
"learning_rate": 4.299068799280032e-06,
|
|
"loss": 0.8322,
|
|
"num_input_tokens_seen": 623411200,
|
|
"step": 76100
|
|
},
|
|
{
|
|
"epoch": 2.440899481068614,
|
|
"grad_norm": 1.1851086616516113,
|
|
"learning_rate": 4.251661430891787e-06,
|
|
"loss": 0.8294,
|
|
"num_input_tokens_seen": 624230400,
|
|
"step": 76200
|
|
},
|
|
{
|
|
"epoch": 2.444102761227497,
|
|
"grad_norm": 0.7874124646186829,
|
|
"learning_rate": 4.20449259871053e-06,
|
|
"loss": 0.819,
|
|
"num_input_tokens_seen": 625049600,
|
|
"step": 76300
|
|
},
|
|
{
|
|
"epoch": 2.44730604138638,
|
|
"grad_norm": 0.6558551788330078,
|
|
"learning_rate": 4.157562845019405e-06,
|
|
"loss": 0.7969,
|
|
"num_input_tokens_seen": 625868800,
|
|
"step": 76400
|
|
},
|
|
{
|
|
"epoch": 2.4505093215452622,
|
|
"grad_norm": 0.7723847031593323,
|
|
"learning_rate": 4.1108727093529644e-06,
|
|
"loss": 0.8516,
|
|
"num_input_tokens_seen": 626688000,
|
|
"step": 76500
|
|
},
|
|
{
|
|
"epoch": 2.453712601704145,
|
|
"grad_norm": 0.6779108047485352,
|
|
"learning_rate": 4.064422728490946e-06,
|
|
"loss": 0.8471,
|
|
"num_input_tokens_seen": 627507200,
|
|
"step": 76600
|
|
},
|
|
{
|
|
"epoch": 2.456915881863028,
|
|
"grad_norm": 0.5954208970069885,
|
|
"learning_rate": 4.018213436452117e-06,
|
|
"loss": 0.84,
|
|
"num_input_tokens_seen": 628326400,
|
|
"step": 76700
|
|
},
|
|
{
|
|
"epoch": 2.4601191620219103,
|
|
"grad_norm": 2.6484439373016357,
|
|
"learning_rate": 3.972245364488136e-06,
|
|
"loss": 0.8224,
|
|
"num_input_tokens_seen": 629145600,
|
|
"step": 76800
|
|
},
|
|
{
|
|
"epoch": 2.463322442180793,
|
|
"grad_norm": 0.6489027142524719,
|
|
"learning_rate": 3.926519041077445e-06,
|
|
"loss": 0.8476,
|
|
"num_input_tokens_seen": 629964800,
|
|
"step": 76900
|
|
},
|
|
{
|
|
"epoch": 2.466525722339676,
|
|
"grad_norm": 2.0896570682525635,
|
|
"learning_rate": 3.8810349919191825e-06,
|
|
"loss": 0.8256,
|
|
"num_input_tokens_seen": 630784000,
|
|
"step": 77000
|
|
},
|
|
{
|
|
"epoch": 2.4697290024985588,
|
|
"grad_norm": 0.8174818158149719,
|
|
"learning_rate": 3.835793739927151e-06,
|
|
"loss": 0.8493,
|
|
"num_input_tokens_seen": 631603200,
|
|
"step": 77100
|
|
},
|
|
{
|
|
"epoch": 2.472932282657441,
|
|
"grad_norm": 0.7576190829277039,
|
|
"learning_rate": 3.7907958052237875e-06,
|
|
"loss": 0.8275,
|
|
"num_input_tokens_seen": 632422400,
|
|
"step": 77200
|
|
},
|
|
{
|
|
"epoch": 2.476135562816324,
|
|
"grad_norm": 1.7763944864273071,
|
|
"learning_rate": 3.746041705134215e-06,
|
|
"loss": 0.8628,
|
|
"num_input_tokens_seen": 633241600,
|
|
"step": 77300
|
|
},
|
|
{
|
|
"epoch": 2.479338842975207,
|
|
"grad_norm": 0.8131124973297119,
|
|
"learning_rate": 3.7015319541802708e-06,
|
|
"loss": 0.8246,
|
|
"num_input_tokens_seen": 634060800,
|
|
"step": 77400
|
|
},
|
|
{
|
|
"epoch": 2.482542123134089,
|
|
"grad_norm": 0.9916465282440186,
|
|
"learning_rate": 3.657267064074607e-06,
|
|
"loss": 0.806,
|
|
"num_input_tokens_seen": 634880000,
|
|
"step": 77500
|
|
},
|
|
{
|
|
"epoch": 2.485745403292972,
|
|
"grad_norm": 1.6239954233169556,
|
|
"learning_rate": 3.613247543714779e-06,
|
|
"loss": 0.8068,
|
|
"num_input_tokens_seen": 635699200,
|
|
"step": 77600
|
|
},
|
|
{
|
|
"epoch": 2.488948683451855,
|
|
"grad_norm": 1.0215014219284058,
|
|
"learning_rate": 3.5694738991774197e-06,
|
|
"loss": 0.7704,
|
|
"num_input_tokens_seen": 636518400,
|
|
"step": 77700
|
|
},
|
|
{
|
|
"epoch": 2.492151963610737,
|
|
"grad_norm": 0.6939218044281006,
|
|
"learning_rate": 3.5259466337124293e-06,
|
|
"loss": 0.8625,
|
|
"num_input_tokens_seen": 637337600,
|
|
"step": 77800
|
|
},
|
|
{
|
|
"epoch": 2.49535524376962,
|
|
"grad_norm": 0.7442044615745544,
|
|
"learning_rate": 3.4826662477371624e-06,
|
|
"loss": 0.8093,
|
|
"num_input_tokens_seen": 638156800,
|
|
"step": 77900
|
|
},
|
|
{
|
|
"epoch": 2.498558523928503,
|
|
"grad_norm": 0.5725979208946228,
|
|
"learning_rate": 3.4396332388307057e-06,
|
|
"loss": 0.8533,
|
|
"num_input_tokens_seen": 638976000,
|
|
"step": 78000
|
|
},
|
|
{
|
|
"epoch": 2.5017618040873852,
|
|
"grad_norm": 2.239358425140381,
|
|
"learning_rate": 3.3968481017281173e-06,
|
|
"loss": 0.8254,
|
|
"num_input_tokens_seen": 639795200,
|
|
"step": 78100
|
|
},
|
|
{
|
|
"epoch": 2.504965084246268,
|
|
"grad_norm": 0.6777194142341614,
|
|
"learning_rate": 3.3543113283147687e-06,
|
|
"loss": 0.8311,
|
|
"num_input_tokens_seen": 640614400,
|
|
"step": 78200
|
|
},
|
|
{
|
|
"epoch": 2.508168364405151,
|
|
"grad_norm": 0.9692057371139526,
|
|
"learning_rate": 3.3120234076206987e-06,
|
|
"loss": 0.8285,
|
|
"num_input_tokens_seen": 641433600,
|
|
"step": 78300
|
|
},
|
|
{
|
|
"epoch": 2.5113716445640337,
|
|
"grad_norm": 0.8157410621643066,
|
|
"learning_rate": 3.2699848258149617e-06,
|
|
"loss": 0.8276,
|
|
"num_input_tokens_seen": 642252800,
|
|
"step": 78400
|
|
},
|
|
{
|
|
"epoch": 2.514574924722916,
|
|
"grad_norm": 1.9688010215759277,
|
|
"learning_rate": 3.228196066200051e-06,
|
|
"loss": 0.7989,
|
|
"num_input_tokens_seen": 643072000,
|
|
"step": 78500
|
|
},
|
|
{
|
|
"epoch": 2.517778204881799,
|
|
"grad_norm": 2.142247200012207,
|
|
"learning_rate": 3.186657609206353e-06,
|
|
"loss": 0.8165,
|
|
"num_input_tokens_seen": 643891200,
|
|
"step": 78600
|
|
},
|
|
{
|
|
"epoch": 2.5209814850406818,
|
|
"grad_norm": 0.7529670596122742,
|
|
"learning_rate": 3.1453699323866047e-06,
|
|
"loss": 0.8476,
|
|
"num_input_tokens_seen": 644710400,
|
|
"step": 78700
|
|
},
|
|
{
|
|
"epoch": 2.524184765199564,
|
|
"grad_norm": 0.5978514552116394,
|
|
"learning_rate": 3.1043335104104233e-06,
|
|
"loss": 0.8386,
|
|
"num_input_tokens_seen": 645529600,
|
|
"step": 78800
|
|
},
|
|
{
|
|
"epoch": 2.527388045358447,
|
|
"grad_norm": 0.7615718841552734,
|
|
"learning_rate": 3.0635488150588338e-06,
|
|
"loss": 0.8198,
|
|
"num_input_tokens_seen": 646348800,
|
|
"step": 78900
|
|
},
|
|
{
|
|
"epoch": 2.53059132551733,
|
|
"grad_norm": 0.7568325400352478,
|
|
"learning_rate": 3.0230163152188463e-06,
|
|
"loss": 0.8364,
|
|
"num_input_tokens_seen": 647168000,
|
|
"step": 79000
|
|
},
|
|
{
|
|
"epoch": 2.5337946056762126,
|
|
"grad_norm": 0.5773870944976807,
|
|
"learning_rate": 2.9827364768780814e-06,
|
|
"loss": 0.7922,
|
|
"num_input_tokens_seen": 647987200,
|
|
"step": 79100
|
|
},
|
|
{
|
|
"epoch": 2.536997885835095,
|
|
"grad_norm": 4.734196662902832,
|
|
"learning_rate": 2.942709763119386e-06,
|
|
"loss": 0.7829,
|
|
"num_input_tokens_seen": 648806400,
|
|
"step": 79200
|
|
},
|
|
{
|
|
"epoch": 2.540201165993978,
|
|
"grad_norm": 0.7763670682907104,
|
|
"learning_rate": 2.9029366341155356e-06,
|
|
"loss": 0.8196,
|
|
"num_input_tokens_seen": 649625600,
|
|
"step": 79300
|
|
},
|
|
{
|
|
"epoch": 2.5434044461528607,
|
|
"grad_norm": 0.6776308417320251,
|
|
"learning_rate": 2.863417547123934e-06,
|
|
"loss": 0.788,
|
|
"num_input_tokens_seen": 650444800,
|
|
"step": 79400
|
|
},
|
|
{
|
|
"epoch": 2.546607726311743,
|
|
"grad_norm": 0.7068803906440735,
|
|
"learning_rate": 2.8241529564813434e-06,
|
|
"loss": 0.8413,
|
|
"num_input_tokens_seen": 651264000,
|
|
"step": 79500
|
|
},
|
|
{
|
|
"epoch": 2.549811006470626,
|
|
"grad_norm": 1.1894068717956543,
|
|
"learning_rate": 2.7851433135986843e-06,
|
|
"loss": 0.851,
|
|
"num_input_tokens_seen": 652083200,
|
|
"step": 79600
|
|
},
|
|
{
|
|
"epoch": 2.5530142866295087,
|
|
"grad_norm": 1.9698837995529175,
|
|
"learning_rate": 2.7463890669558263e-06,
|
|
"loss": 0.8379,
|
|
"num_input_tokens_seen": 652902400,
|
|
"step": 79700
|
|
},
|
|
{
|
|
"epoch": 2.5562175667883915,
|
|
"grad_norm": 1.8066941499710083,
|
|
"learning_rate": 2.707890662096452e-06,
|
|
"loss": 0.7906,
|
|
"num_input_tokens_seen": 653721600,
|
|
"step": 79800
|
|
},
|
|
{
|
|
"epoch": 2.559420846947274,
|
|
"grad_norm": 0.824046790599823,
|
|
"learning_rate": 2.6696485416228987e-06,
|
|
"loss": 0.8011,
|
|
"num_input_tokens_seen": 654540800,
|
|
"step": 79900
|
|
},
|
|
{
|
|
"epoch": 2.5626241271061567,
|
|
"grad_norm": 0.7096015214920044,
|
|
"learning_rate": 2.6316631451911213e-06,
|
|
"loss": 0.8328,
|
|
"num_input_tokens_seen": 655360000,
|
|
"step": 80000
|
|
},
|
|
{
|
|
"epoch": 2.5658274072650396,
|
|
"grad_norm": 0.5634686350822449,
|
|
"learning_rate": 2.593934909505602e-06,
|
|
"loss": 0.8896,
|
|
"num_input_tokens_seen": 656179200,
|
|
"step": 80100
|
|
},
|
|
{
|
|
"epoch": 2.569030687423922,
|
|
"grad_norm": 0.7022582292556763,
|
|
"learning_rate": 2.5564642683143263e-06,
|
|
"loss": 0.8405,
|
|
"num_input_tokens_seen": 656998400,
|
|
"step": 80200
|
|
},
|
|
{
|
|
"epoch": 2.5722339675828048,
|
|
"grad_norm": 0.010020343586802483,
|
|
"learning_rate": 2.51925165240382e-06,
|
|
"loss": 0.8639,
|
|
"num_input_tokens_seen": 657817600,
|
|
"step": 80300
|
|
},
|
|
{
|
|
"epoch": 2.5754372477416876,
|
|
"grad_norm": 0.7010151147842407,
|
|
"learning_rate": 2.482297489594182e-06,
|
|
"loss": 0.813,
|
|
"num_input_tokens_seen": 658636800,
|
|
"step": 80400
|
|
},
|
|
{
|
|
"epoch": 2.5786405279005704,
|
|
"grad_norm": 1.0606889724731445,
|
|
"learning_rate": 2.4456022047341653e-06,
|
|
"loss": 0.8494,
|
|
"num_input_tokens_seen": 659456000,
|
|
"step": 80500
|
|
},
|
|
{
|
|
"epoch": 2.581843808059453,
|
|
"grad_norm": 0.5736305713653564,
|
|
"learning_rate": 2.4091662196963014e-06,
|
|
"loss": 0.8748,
|
|
"num_input_tokens_seen": 660275200,
|
|
"step": 80600
|
|
},
|
|
{
|
|
"epoch": 2.5850470882183356,
|
|
"grad_norm": 0.6299107074737549,
|
|
"learning_rate": 2.3729899533720485e-06,
|
|
"loss": 0.8254,
|
|
"num_input_tokens_seen": 661094400,
|
|
"step": 80700
|
|
},
|
|
{
|
|
"epoch": 2.588250368377218,
|
|
"grad_norm": 0.8091995120048523,
|
|
"learning_rate": 2.3370738216669574e-06,
|
|
"loss": 0.8373,
|
|
"num_input_tokens_seen": 661913600,
|
|
"step": 80800
|
|
},
|
|
{
|
|
"epoch": 2.591453648536101,
|
|
"grad_norm": 0.7887117862701416,
|
|
"learning_rate": 2.3014182374959116e-06,
|
|
"loss": 0.7675,
|
|
"num_input_tokens_seen": 662732800,
|
|
"step": 80900
|
|
},
|
|
{
|
|
"epoch": 2.5946569286949837,
|
|
"grad_norm": 0.7341217994689941,
|
|
"learning_rate": 2.2660236107783783e-06,
|
|
"loss": 0.8264,
|
|
"num_input_tokens_seen": 663552000,
|
|
"step": 81000
|
|
},
|
|
{
|
|
"epoch": 2.5978602088538665,
|
|
"grad_norm": 0.7887162566184998,
|
|
"learning_rate": 2.230890348433684e-06,
|
|
"loss": 0.8579,
|
|
"num_input_tokens_seen": 664371200,
|
|
"step": 81100
|
|
},
|
|
{
|
|
"epoch": 2.6010634890127493,
|
|
"grad_norm": 0.8627157807350159,
|
|
"learning_rate": 2.1960188543763526e-06,
|
|
"loss": 0.8412,
|
|
"num_input_tokens_seen": 665190400,
|
|
"step": 81200
|
|
},
|
|
{
|
|
"epoch": 2.6042667691716317,
|
|
"grad_norm": 2.6676676273345947,
|
|
"learning_rate": 2.161409529511438e-06,
|
|
"loss": 0.7985,
|
|
"num_input_tokens_seen": 666009600,
|
|
"step": 81300
|
|
},
|
|
{
|
|
"epoch": 2.6074700493305145,
|
|
"grad_norm": 0.6035804748535156,
|
|
"learning_rate": 2.127062771729929e-06,
|
|
"loss": 0.8033,
|
|
"num_input_tokens_seen": 666828800,
|
|
"step": 81400
|
|
},
|
|
{
|
|
"epoch": 2.610673329489397,
|
|
"grad_norm": 2.14854097366333,
|
|
"learning_rate": 2.092978975904189e-06,
|
|
"loss": 0.8538,
|
|
"num_input_tokens_seen": 667648000,
|
|
"step": 81500
|
|
},
|
|
{
|
|
"epoch": 2.6138766096482797,
|
|
"grad_norm": 1.651636004447937,
|
|
"learning_rate": 2.059158533883393e-06,
|
|
"loss": 0.8805,
|
|
"num_input_tokens_seen": 668467200,
|
|
"step": 81600
|
|
},
|
|
{
|
|
"epoch": 2.6170798898071626,
|
|
"grad_norm": 2.1014175415039062,
|
|
"learning_rate": 2.025601834489038e-06,
|
|
"loss": 0.8837,
|
|
"num_input_tokens_seen": 669286400,
|
|
"step": 81700
|
|
},
|
|
{
|
|
"epoch": 2.6202831699660454,
|
|
"grad_norm": 0.741468071937561,
|
|
"learning_rate": 1.9923092635104557e-06,
|
|
"loss": 0.7892,
|
|
"num_input_tokens_seen": 670105600,
|
|
"step": 81800
|
|
},
|
|
{
|
|
"epoch": 2.6234864501249278,
|
|
"grad_norm": 1.3246105909347534,
|
|
"learning_rate": 1.9592812037003918e-06,
|
|
"loss": 0.774,
|
|
"num_input_tokens_seen": 670924800,
|
|
"step": 81900
|
|
},
|
|
{
|
|
"epoch": 2.6266897302838106,
|
|
"grad_norm": 0.6697006225585938,
|
|
"learning_rate": 1.9265180347706053e-06,
|
|
"loss": 0.8393,
|
|
"num_input_tokens_seen": 671744000,
|
|
"step": 82000
|
|
},
|
|
{
|
|
"epoch": 2.6298930104426934,
|
|
"grad_norm": 0.5421914458274841,
|
|
"learning_rate": 1.894020133387503e-06,
|
|
"loss": 0.8398,
|
|
"num_input_tokens_seen": 672563200,
|
|
"step": 82100
|
|
},
|
|
{
|
|
"epoch": 2.633096290601576,
|
|
"grad_norm": 2.6112563610076904,
|
|
"learning_rate": 1.8617878731678e-06,
|
|
"loss": 0.8031,
|
|
"num_input_tokens_seen": 673382400,
|
|
"step": 82200
|
|
},
|
|
{
|
|
"epoch": 2.6362995707604586,
|
|
"grad_norm": 0.7507239580154419,
|
|
"learning_rate": 1.8298216246742329e-06,
|
|
"loss": 0.831,
|
|
"num_input_tokens_seen": 674201600,
|
|
"step": 82300
|
|
},
|
|
{
|
|
"epoch": 2.6395028509193414,
|
|
"grad_norm": 2.156158685684204,
|
|
"learning_rate": 1.798121755411289e-06,
|
|
"loss": 0.8778,
|
|
"num_input_tokens_seen": 675020800,
|
|
"step": 82400
|
|
},
|
|
{
|
|
"epoch": 2.6427061310782243,
|
|
"grad_norm": 0.5693337917327881,
|
|
"learning_rate": 1.7666886298210006e-06,
|
|
"loss": 0.7904,
|
|
"num_input_tokens_seen": 675840000,
|
|
"step": 82500
|
|
},
|
|
{
|
|
"epoch": 2.6459094112371067,
|
|
"grad_norm": 0.9597682356834412,
|
|
"learning_rate": 1.735522609278742e-06,
|
|
"loss": 0.8547,
|
|
"num_input_tokens_seen": 676659200,
|
|
"step": 82600
|
|
},
|
|
{
|
|
"epoch": 2.6491126913959895,
|
|
"grad_norm": 0.8956586122512817,
|
|
"learning_rate": 1.7046240520890655e-06,
|
|
"loss": 0.8395,
|
|
"num_input_tokens_seen": 677478400,
|
|
"step": 82700
|
|
},
|
|
{
|
|
"epoch": 2.6523159715548723,
|
|
"grad_norm": 0.918878436088562,
|
|
"learning_rate": 1.6739933134816117e-06,
|
|
"loss": 0.8106,
|
|
"num_input_tokens_seen": 678297600,
|
|
"step": 82800
|
|
},
|
|
{
|
|
"epoch": 2.6555192517137547,
|
|
"grad_norm": 0.6460690498352051,
|
|
"learning_rate": 1.6436307456069832e-06,
|
|
"loss": 0.8427,
|
|
"num_input_tokens_seen": 679116800,
|
|
"step": 82900
|
|
},
|
|
{
|
|
"epoch": 2.6587225318726375,
|
|
"grad_norm": 0.7876623868942261,
|
|
"learning_rate": 1.6135366975327442e-06,
|
|
"loss": 0.8306,
|
|
"num_input_tokens_seen": 679936000,
|
|
"step": 83000
|
|
},
|
|
{
|
|
"epoch": 2.6619258120315203,
|
|
"grad_norm": 0.7109478712081909,
|
|
"learning_rate": 1.5837115152393695e-06,
|
|
"loss": 0.8785,
|
|
"num_input_tokens_seen": 680755200,
|
|
"step": 83100
|
|
},
|
|
{
|
|
"epoch": 2.665129092190403,
|
|
"grad_norm": 0.6864702701568604,
|
|
"learning_rate": 1.5541555416162784e-06,
|
|
"loss": 0.7719,
|
|
"num_input_tokens_seen": 681574400,
|
|
"step": 83200
|
|
},
|
|
{
|
|
"epoch": 2.6683323723492856,
|
|
"grad_norm": 0.5490867495536804,
|
|
"learning_rate": 1.5248691164579054e-06,
|
|
"loss": 0.7945,
|
|
"num_input_tokens_seen": 682393600,
|
|
"step": 83300
|
|
},
|
|
{
|
|
"epoch": 2.6715356525081684,
|
|
"grad_norm": 0.7371602654457092,
|
|
"learning_rate": 1.4958525764597719e-06,
|
|
"loss": 0.8751,
|
|
"num_input_tokens_seen": 683212800,
|
|
"step": 83400
|
|
},
|
|
{
|
|
"epoch": 2.674738932667051,
|
|
"grad_norm": 3.058120012283325,
|
|
"learning_rate": 1.4671062552146342e-06,
|
|
"loss": 0.807,
|
|
"num_input_tokens_seen": 684032000,
|
|
"step": 83500
|
|
},
|
|
{
|
|
"epoch": 2.6779422128259336,
|
|
"grad_norm": 2.8297903537750244,
|
|
"learning_rate": 1.4386304832086333e-06,
|
|
"loss": 0.8519,
|
|
"num_input_tokens_seen": 684851200,
|
|
"step": 83600
|
|
},
|
|
{
|
|
"epoch": 2.6811454929848164,
|
|
"grad_norm": 0.5840158462524414,
|
|
"learning_rate": 1.4104255878175099e-06,
|
|
"loss": 0.7911,
|
|
"num_input_tokens_seen": 685670400,
|
|
"step": 83700
|
|
},
|
|
{
|
|
"epoch": 2.6843487731436992,
|
|
"grad_norm": 0.5358206629753113,
|
|
"learning_rate": 1.382491893302837e-06,
|
|
"loss": 0.85,
|
|
"num_input_tokens_seen": 686489600,
|
|
"step": 83800
|
|
},
|
|
{
|
|
"epoch": 2.687552053302582,
|
|
"grad_norm": 0.5446909666061401,
|
|
"learning_rate": 1.3548297208082678e-06,
|
|
"loss": 0.7469,
|
|
"num_input_tokens_seen": 687308800,
|
|
"step": 83900
|
|
},
|
|
{
|
|
"epoch": 2.6907553334614644,
|
|
"grad_norm": 0.7376157641410828,
|
|
"learning_rate": 1.3274393883558916e-06,
|
|
"loss": 0.815,
|
|
"num_input_tokens_seen": 688128000,
|
|
"step": 84000
|
|
},
|
|
{
|
|
"epoch": 2.6939586136203473,
|
|
"grad_norm": 2.3603358268737793,
|
|
"learning_rate": 1.3003212108425256e-06,
|
|
"loss": 0.8195,
|
|
"num_input_tokens_seen": 688947200,
|
|
"step": 84100
|
|
},
|
|
{
|
|
"epoch": 2.6971618937792297,
|
|
"grad_norm": 2.3444812297821045,
|
|
"learning_rate": 1.2734755000361393e-06,
|
|
"loss": 0.8265,
|
|
"num_input_tokens_seen": 689766400,
|
|
"step": 84200
|
|
},
|
|
{
|
|
"epoch": 2.7003651739381125,
|
|
"grad_norm": 0.7536035776138306,
|
|
"learning_rate": 1.2469025645722333e-06,
|
|
"loss": 0.8382,
|
|
"num_input_tokens_seen": 690585600,
|
|
"step": 84300
|
|
},
|
|
{
|
|
"epoch": 2.7035684540969953,
|
|
"grad_norm": 0.7054631114006042,
|
|
"learning_rate": 1.2206027099503275e-06,
|
|
"loss": 0.7791,
|
|
"num_input_tokens_seen": 691404800,
|
|
"step": 84400
|
|
},
|
|
{
|
|
"epoch": 2.706771734255878,
|
|
"grad_norm": 0.7819291353225708,
|
|
"learning_rate": 1.1945762385304122e-06,
|
|
"loss": 0.8321,
|
|
"num_input_tokens_seen": 692224000,
|
|
"step": 84500
|
|
},
|
|
{
|
|
"epoch": 2.709975014414761,
|
|
"grad_norm": 0.7501091361045837,
|
|
"learning_rate": 1.168823449529488e-06,
|
|
"loss": 0.8494,
|
|
"num_input_tokens_seen": 693043200,
|
|
"step": 84600
|
|
},
|
|
{
|
|
"epoch": 2.7131782945736433,
|
|
"grad_norm": 0.566743016242981,
|
|
"learning_rate": 1.1433446390181402e-06,
|
|
"loss": 0.8685,
|
|
"num_input_tokens_seen": 693862400,
|
|
"step": 84700
|
|
},
|
|
{
|
|
"epoch": 2.716381574732526,
|
|
"grad_norm": 2.204374313354492,
|
|
"learning_rate": 1.1181400999171144e-06,
|
|
"loss": 0.8147,
|
|
"num_input_tokens_seen": 694681600,
|
|
"step": 84800
|
|
},
|
|
{
|
|
"epoch": 2.7195848548914086,
|
|
"grad_norm": 2.641223192214966,
|
|
"learning_rate": 1.0932101219939594e-06,
|
|
"loss": 0.8259,
|
|
"num_input_tokens_seen": 695500800,
|
|
"step": 84900
|
|
},
|
|
{
|
|
"epoch": 2.7227881350502914,
|
|
"grad_norm": 0.747035562992096,
|
|
"learning_rate": 1.0685549918596882e-06,
|
|
"loss": 0.8737,
|
|
"num_input_tokens_seen": 696320000,
|
|
"step": 85000
|
|
},
|
|
{
|
|
"epoch": 2.725991415209174,
|
|
"grad_norm": 0.9778177738189697,
|
|
"learning_rate": 1.0441749929654827e-06,
|
|
"loss": 0.8358,
|
|
"num_input_tokens_seen": 697139200,
|
|
"step": 85100
|
|
},
|
|
{
|
|
"epoch": 2.729194695368057,
|
|
"grad_norm": 2.0086069107055664,
|
|
"learning_rate": 1.0200704055994548e-06,
|
|
"loss": 0.8231,
|
|
"num_input_tokens_seen": 697958400,
|
|
"step": 85200
|
|
},
|
|
{
|
|
"epoch": 2.73239797552694,
|
|
"grad_norm": 0.7290952801704407,
|
|
"learning_rate": 9.962415068833968e-07,
|
|
"loss": 0.8211,
|
|
"num_input_tokens_seen": 698777600,
|
|
"step": 85300
|
|
},
|
|
{
|
|
"epoch": 2.7356012556858222,
|
|
"grad_norm": 0.6520437598228455,
|
|
"learning_rate": 9.726885707696114e-07,
|
|
"loss": 0.8776,
|
|
"num_input_tokens_seen": 699596800,
|
|
"step": 85400
|
|
},
|
|
{
|
|
"epoch": 2.738804535844705,
|
|
"grad_norm": 0.5633389353752136,
|
|
"learning_rate": 9.494118680377612e-07,
|
|
"loss": 0.8198,
|
|
"num_input_tokens_seen": 700416000,
|
|
"step": 85500
|
|
},
|
|
{
|
|
"epoch": 2.7420078160035875,
|
|
"grad_norm": 0.8410841822624207,
|
|
"learning_rate": 9.264116662917405e-07,
|
|
"loss": 0.8894,
|
|
"num_input_tokens_seen": 701235200,
|
|
"step": 85600
|
|
},
|
|
{
|
|
"epoch": 2.7452110961624703,
|
|
"grad_norm": 2.9148612022399902,
|
|
"learning_rate": 9.036882299566229e-07,
|
|
"loss": 0.8259,
|
|
"num_input_tokens_seen": 702054400,
|
|
"step": 85700
|
|
},
|
|
{
|
|
"epoch": 2.748414376321353,
|
|
"grad_norm": 0.5637199878692627,
|
|
"learning_rate": 8.812418202756107e-07,
|
|
"loss": 0.7636,
|
|
"num_input_tokens_seen": 702873600,
|
|
"step": 85800
|
|
},
|
|
{
|
|
"epoch": 2.751617656480236,
|
|
"grad_norm": 0.5929956436157227,
|
|
"learning_rate": 8.590726953070228e-07,
|
|
"loss": 0.8448,
|
|
"num_input_tokens_seen": 703692800,
|
|
"step": 85900
|
|
},
|
|
{
|
|
"epoch": 2.7548209366391183,
|
|
"grad_norm": 0.5491350889205933,
|
|
"learning_rate": 8.371811099213394e-07,
|
|
"loss": 0.8467,
|
|
"num_input_tokens_seen": 704512000,
|
|
"step": 86000
|
|
},
|
|
{
|
|
"epoch": 2.758024216798001,
|
|
"grad_norm": 1.0223699808120728,
|
|
"learning_rate": 8.155673157982601e-07,
|
|
"loss": 0.8133,
|
|
"num_input_tokens_seen": 705331200,
|
|
"step": 86100
|
|
},
|
|
{
|
|
"epoch": 2.761227496956884,
|
|
"grad_norm": 1.5225611925125122,
|
|
"learning_rate": 7.942315614238277e-07,
|
|
"loss": 0.8109,
|
|
"num_input_tokens_seen": 706150400,
|
|
"step": 86200
|
|
},
|
|
{
|
|
"epoch": 2.7644307771157663,
|
|
"grad_norm": 0.8148054480552673,
|
|
"learning_rate": 7.731740920875613e-07,
|
|
"loss": 0.821,
|
|
"num_input_tokens_seen": 706969600,
|
|
"step": 86300
|
|
},
|
|
{
|
|
"epoch": 2.767634057274649,
|
|
"grad_norm": 0.7864372730255127,
|
|
"learning_rate": 7.523951498796283e-07,
|
|
"loss": 0.8135,
|
|
"num_input_tokens_seen": 707788800,
|
|
"step": 86400
|
|
},
|
|
{
|
|
"epoch": 2.770837337433532,
|
|
"grad_norm": 2.5619330406188965,
|
|
"learning_rate": 7.318949736880798e-07,
|
|
"loss": 0.7905,
|
|
"num_input_tokens_seen": 708608000,
|
|
"step": 86500
|
|
},
|
|
{
|
|
"epoch": 2.774040617592415,
|
|
"grad_norm": 1.5780519247055054,
|
|
"learning_rate": 7.116737991960831e-07,
|
|
"loss": 0.8608,
|
|
"num_input_tokens_seen": 709427200,
|
|
"step": 86600
|
|
},
|
|
{
|
|
"epoch": 2.777243897751297,
|
|
"grad_norm": 0.666118323802948,
|
|
"learning_rate": 6.917318588792299e-07,
|
|
"loss": 0.8586,
|
|
"num_input_tokens_seen": 710246400,
|
|
"step": 86700
|
|
},
|
|
{
|
|
"epoch": 2.78044717791018,
|
|
"grad_norm": 0.5050229430198669,
|
|
"learning_rate": 6.720693820028629e-07,
|
|
"loss": 0.8473,
|
|
"num_input_tokens_seen": 711065600,
|
|
"step": 86800
|
|
},
|
|
{
|
|
"epoch": 2.783650458069063,
|
|
"grad_norm": 0.5586540699005127,
|
|
"learning_rate": 6.526865946194172e-07,
|
|
"loss": 0.8182,
|
|
"num_input_tokens_seen": 711884800,
|
|
"step": 86900
|
|
},
|
|
{
|
|
"epoch": 2.7868537382279452,
|
|
"grad_norm": 0.6938973665237427,
|
|
"learning_rate": 6.335837195658528e-07,
|
|
"loss": 0.8493,
|
|
"num_input_tokens_seen": 712704000,
|
|
"step": 87000
|
|
},
|
|
{
|
|
"epoch": 2.790057018386828,
|
|
"grad_norm": 0.8710479736328125,
|
|
"learning_rate": 6.147609764610707e-07,
|
|
"loss": 0.8134,
|
|
"num_input_tokens_seen": 713523200,
|
|
"step": 87100
|
|
},
|
|
{
|
|
"epoch": 2.793260298545711,
|
|
"grad_norm": 2.5295767784118652,
|
|
"learning_rate": 5.962185817034005e-07,
|
|
"loss": 0.7893,
|
|
"num_input_tokens_seen": 714342400,
|
|
"step": 87200
|
|
},
|
|
{
|
|
"epoch": 2.7964635787045937,
|
|
"grad_norm": 0.5434448719024658,
|
|
"learning_rate": 5.779567484681032e-07,
|
|
"loss": 0.7896,
|
|
"num_input_tokens_seen": 715161600,
|
|
"step": 87300
|
|
},
|
|
{
|
|
"epoch": 2.799666858863476,
|
|
"grad_norm": 2.833872079849243,
|
|
"learning_rate": 5.599756867049221e-07,
|
|
"loss": 0.8185,
|
|
"num_input_tokens_seen": 715980800,
|
|
"step": 87400
|
|
},
|
|
{
|
|
"epoch": 2.802870139022359,
|
|
"grad_norm": 0.5753843784332275,
|
|
"learning_rate": 5.422756031356779e-07,
|
|
"loss": 0.8188,
|
|
"num_input_tokens_seen": 716800000,
|
|
"step": 87500
|
|
},
|
|
{
|
|
"epoch": 2.8060734191812418,
|
|
"grad_norm": 0.6721400022506714,
|
|
"learning_rate": 5.248567012518857e-07,
|
|
"loss": 0.8303,
|
|
"num_input_tokens_seen": 717619200,
|
|
"step": 87600
|
|
},
|
|
{
|
|
"epoch": 2.809276699340124,
|
|
"grad_norm": 0.7175859808921814,
|
|
"learning_rate": 5.077191813124105e-07,
|
|
"loss": 0.7866,
|
|
"num_input_tokens_seen": 718438400,
|
|
"step": 87700
|
|
},
|
|
{
|
|
"epoch": 2.812479979499007,
|
|
"grad_norm": 0.9649165868759155,
|
|
"learning_rate": 4.90863240341169e-07,
|
|
"loss": 0.8269,
|
|
"num_input_tokens_seen": 719257600,
|
|
"step": 87800
|
|
},
|
|
{
|
|
"epoch": 2.81568325965789,
|
|
"grad_norm": 0.5693693161010742,
|
|
"learning_rate": 4.742890721248755e-07,
|
|
"loss": 0.7737,
|
|
"num_input_tokens_seen": 720076800,
|
|
"step": 87900
|
|
},
|
|
{
|
|
"epoch": 2.8188865398167726,
|
|
"grad_norm": 0.6442407369613647,
|
|
"learning_rate": 4.579968672107943e-07,
|
|
"loss": 0.8196,
|
|
"num_input_tokens_seen": 720896000,
|
|
"step": 88000
|
|
},
|
|
{
|
|
"epoch": 2.822089819975655,
|
|
"grad_norm": 0.72199547290802,
|
|
"learning_rate": 4.419868129045629e-07,
|
|
"loss": 0.7998,
|
|
"num_input_tokens_seen": 721715200,
|
|
"step": 88100
|
|
},
|
|
{
|
|
"epoch": 2.825293100134538,
|
|
"grad_norm": 1.2243154048919678,
|
|
"learning_rate": 4.2625909326803325e-07,
|
|
"loss": 0.8534,
|
|
"num_input_tokens_seen": 722534400,
|
|
"step": 88200
|
|
},
|
|
{
|
|
"epoch": 2.82849638029342,
|
|
"grad_norm": 0.8224316835403442,
|
|
"learning_rate": 4.1081388911715645e-07,
|
|
"loss": 0.8262,
|
|
"num_input_tokens_seen": 723353600,
|
|
"step": 88300
|
|
},
|
|
{
|
|
"epoch": 2.831699660452303,
|
|
"grad_norm": 0.7001350522041321,
|
|
"learning_rate": 3.9565137801990395e-07,
|
|
"loss": 0.8323,
|
|
"num_input_tokens_seen": 724172800,
|
|
"step": 88400
|
|
},
|
|
{
|
|
"epoch": 2.834902940611186,
|
|
"grad_norm": 0.7441889643669128,
|
|
"learning_rate": 3.807717342942302e-07,
|
|
"loss": 0.8116,
|
|
"num_input_tokens_seen": 724992000,
|
|
"step": 88500
|
|
},
|
|
{
|
|
"epoch": 2.8381062207700687,
|
|
"grad_norm": 0.6325407028198242,
|
|
"learning_rate": 3.661751290060633e-07,
|
|
"loss": 0.8481,
|
|
"num_input_tokens_seen": 725811200,
|
|
"step": 88600
|
|
},
|
|
{
|
|
"epoch": 2.8413095009289515,
|
|
"grad_norm": 0.9763919711112976,
|
|
"learning_rate": 3.5186172996733714e-07,
|
|
"loss": 0.8084,
|
|
"num_input_tokens_seen": 726630400,
|
|
"step": 88700
|
|
},
|
|
{
|
|
"epoch": 2.844512781087834,
|
|
"grad_norm": 0.6528813242912292,
|
|
"learning_rate": 3.3783170173406764e-07,
|
|
"loss": 0.7923,
|
|
"num_input_tokens_seen": 727449600,
|
|
"step": 88800
|
|
},
|
|
{
|
|
"epoch": 2.8477160612467167,
|
|
"grad_norm": 0.8190716505050659,
|
|
"learning_rate": 3.2408520560445463e-07,
|
|
"loss": 0.8397,
|
|
"num_input_tokens_seen": 728268800,
|
|
"step": 88900
|
|
},
|
|
{
|
|
"epoch": 2.850919341405599,
|
|
"grad_norm": 0.6821821928024292,
|
|
"learning_rate": 3.10622399617036e-07,
|
|
"loss": 0.7856,
|
|
"num_input_tokens_seen": 729088000,
|
|
"step": 89000
|
|
},
|
|
{
|
|
"epoch": 2.854122621564482,
|
|
"grad_norm": 0.9017992615699768,
|
|
"learning_rate": 2.9744343854886393e-07,
|
|
"loss": 0.8271,
|
|
"num_input_tokens_seen": 729907200,
|
|
"step": 89100
|
|
},
|
|
{
|
|
"epoch": 2.8573259017233648,
|
|
"grad_norm": 0.6816012263298035,
|
|
"learning_rate": 2.8454847391372886e-07,
|
|
"loss": 0.8334,
|
|
"num_input_tokens_seen": 730726400,
|
|
"step": 89200
|
|
},
|
|
{
|
|
"epoch": 2.8605291818822476,
|
|
"grad_norm": 1.0822001695632935,
|
|
"learning_rate": 2.719376539604107e-07,
|
|
"loss": 0.8198,
|
|
"num_input_tokens_seen": 731545600,
|
|
"step": 89300
|
|
},
|
|
{
|
|
"epoch": 2.86373246204113,
|
|
"grad_norm": 0.782041072845459,
|
|
"learning_rate": 2.5961112367098306e-07,
|
|
"loss": 0.8199,
|
|
"num_input_tokens_seen": 732364800,
|
|
"step": 89400
|
|
},
|
|
{
|
|
"epoch": 2.866935742200013,
|
|
"grad_norm": 1.8875998258590698,
|
|
"learning_rate": 2.4756902475914777e-07,
|
|
"loss": 0.7963,
|
|
"num_input_tokens_seen": 733184000,
|
|
"step": 89500
|
|
},
|
|
{
|
|
"epoch": 2.8701390223588956,
|
|
"grad_norm": 0.549452543258667,
|
|
"learning_rate": 2.358114956685975e-07,
|
|
"loss": 0.8353,
|
|
"num_input_tokens_seen": 734003200,
|
|
"step": 89600
|
|
},
|
|
{
|
|
"epoch": 2.873342302517778,
|
|
"grad_norm": 1.3322216272354126,
|
|
"learning_rate": 2.243386715714224e-07,
|
|
"loss": 0.8547,
|
|
"num_input_tokens_seen": 734822400,
|
|
"step": 89700
|
|
},
|
|
{
|
|
"epoch": 2.876545582676661,
|
|
"grad_norm": 0.8102174997329712,
|
|
"learning_rate": 2.1315068436656983e-07,
|
|
"loss": 0.8233,
|
|
"num_input_tokens_seen": 735641600,
|
|
"step": 89800
|
|
},
|
|
{
|
|
"epoch": 2.8797488628355437,
|
|
"grad_norm": 0.6969431042671204,
|
|
"learning_rate": 2.0224766267831207e-07,
|
|
"loss": 0.8622,
|
|
"num_input_tokens_seen": 736460800,
|
|
"step": 89900
|
|
},
|
|
{
|
|
"epoch": 2.8829521429944265,
|
|
"grad_norm": 1.4771400690078735,
|
|
"learning_rate": 1.9162973185478383e-07,
|
|
"loss": 0.789,
|
|
"num_input_tokens_seen": 737280000,
|
|
"step": 90000
|
|
},
|
|
{
|
|
"epoch": 2.886155423153309,
|
|
"grad_norm": 0.6978898048400879,
|
|
"learning_rate": 1.8129701396652487e-07,
|
|
"loss": 0.8723,
|
|
"num_input_tokens_seen": 738099200,
|
|
"step": 90100
|
|
},
|
|
{
|
|
"epoch": 2.8893587033121917,
|
|
"grad_norm": 0.838759183883667,
|
|
"learning_rate": 1.7124962780508957e-07,
|
|
"loss": 0.8136,
|
|
"num_input_tokens_seen": 738918400,
|
|
"step": 90200
|
|
},
|
|
{
|
|
"epoch": 2.8925619834710745,
|
|
"grad_norm": 0.6396787762641907,
|
|
"learning_rate": 1.6148768888166744e-07,
|
|
"loss": 0.8263,
|
|
"num_input_tokens_seen": 739737600,
|
|
"step": 90300
|
|
},
|
|
{
|
|
"epoch": 2.895765263629957,
|
|
"grad_norm": 0.7068443298339844,
|
|
"learning_rate": 1.5201130942577578e-07,
|
|
"loss": 0.8388,
|
|
"num_input_tokens_seen": 740556800,
|
|
"step": 90400
|
|
},
|
|
{
|
|
"epoch": 2.8989685437888397,
|
|
"grad_norm": 0.5743166208267212,
|
|
"learning_rate": 1.4282059838394701e-07,
|
|
"loss": 0.8284,
|
|
"num_input_tokens_seen": 741376000,
|
|
"step": 90500
|
|
},
|
|
{
|
|
"epoch": 2.9021718239477226,
|
|
"grad_norm": 0.5627537369728088,
|
|
"learning_rate": 1.3391566141848778e-07,
|
|
"loss": 0.834,
|
|
"num_input_tokens_seen": 742195200,
|
|
"step": 90600
|
|
},
|
|
{
|
|
"epoch": 2.9053751041066054,
|
|
"grad_norm": 2.069951057434082,
|
|
"learning_rate": 1.2529660090626894e-07,
|
|
"loss": 0.8798,
|
|
"num_input_tokens_seen": 743014400,
|
|
"step": 90700
|
|
},
|
|
{
|
|
"epoch": 2.9085783842654878,
|
|
"grad_norm": 0.5723984241485596,
|
|
"learning_rate": 1.1696351593753485e-07,
|
|
"loss": 0.8443,
|
|
"num_input_tokens_seen": 743833600,
|
|
"step": 90800
|
|
},
|
|
{
|
|
"epoch": 2.9117816644243706,
|
|
"grad_norm": 0.5584101676940918,
|
|
"learning_rate": 1.0891650231477646e-07,
|
|
"loss": 0.7991,
|
|
"num_input_tokens_seen": 744652800,
|
|
"step": 90900
|
|
},
|
|
{
|
|
"epoch": 2.9149849445832534,
|
|
"grad_norm": 0.8929557800292969,
|
|
"learning_rate": 1.0115565255162107e-07,
|
|
"loss": 0.8134,
|
|
"num_input_tokens_seen": 745472000,
|
|
"step": 91000
|
|
},
|
|
{
|
|
"epoch": 2.918188224742136,
|
|
"grad_norm": 0.5613967776298523,
|
|
"learning_rate": 9.368105587177767e-08,
|
|
"loss": 0.855,
|
|
"num_input_tokens_seen": 746291200,
|
|
"step": 91100
|
|
},
|
|
{
|
|
"epoch": 2.9213915049010186,
|
|
"grad_norm": 0.5235220193862915,
|
|
"learning_rate": 8.649279820800161e-08,
|
|
"loss": 0.7894,
|
|
"num_input_tokens_seen": 747110400,
|
|
"step": 91200
|
|
},
|
|
{
|
|
"epoch": 2.9245947850599014,
|
|
"grad_norm": 2.220933198928833,
|
|
"learning_rate": 7.959096220111206e-08,
|
|
"loss": 0.8311,
|
|
"num_input_tokens_seen": 747929600,
|
|
"step": 91300
|
|
},
|
|
{
|
|
"epoch": 2.9277980652187843,
|
|
"grad_norm": 2.264698028564453,
|
|
"learning_rate": 7.297562719904561e-08,
|
|
"loss": 0.7856,
|
|
"num_input_tokens_seen": 748748800,
|
|
"step": 91400
|
|
},
|
|
{
|
|
"epoch": 2.9310013453776667,
|
|
"grad_norm": 0.6808698773384094,
|
|
"learning_rate": 6.664686925593188e-08,
|
|
"loss": 0.8379,
|
|
"num_input_tokens_seen": 749568000,
|
|
"step": 91500
|
|
},
|
|
{
|
|
"epoch": 2.9342046255365495,
|
|
"grad_norm": 2.1781809329986572,
|
|
"learning_rate": 6.060476113123603e-08,
|
|
"loss": 0.7529,
|
|
"num_input_tokens_seen": 750387200,
|
|
"step": 91600
|
|
},
|
|
{
|
|
"epoch": 2.937407905695432,
|
|
"grad_norm": 0.6591463685035706,
|
|
"learning_rate": 5.4849372288903744e-08,
|
|
"loss": 0.8836,
|
|
"num_input_tokens_seen": 751206400,
|
|
"step": 91700
|
|
},
|
|
{
|
|
"epoch": 2.9406111858543147,
|
|
"grad_norm": 0.5385074019432068,
|
|
"learning_rate": 4.9380768896578614e-08,
|
|
"loss": 0.8253,
|
|
"num_input_tokens_seen": 752025600,
|
|
"step": 91800
|
|
},
|
|
{
|
|
"epoch": 2.9438144660131975,
|
|
"grad_norm": 0.7810553312301636,
|
|
"learning_rate": 4.419901382483327e-08,
|
|
"loss": 0.7867,
|
|
"num_input_tokens_seen": 752844800,
|
|
"step": 91900
|
|
},
|
|
{
|
|
"epoch": 2.9470177461720803,
|
|
"grad_norm": 1.6066702604293823,
|
|
"learning_rate": 3.930416664644498e-08,
|
|
"loss": 0.8089,
|
|
"num_input_tokens_seen": 753664000,
|
|
"step": 92000
|
|
},
|
|
{
|
|
"epoch": 2.950221026330963,
|
|
"grad_norm": 0.8969001173973083,
|
|
"learning_rate": 3.469628363571564e-08,
|
|
"loss": 0.8324,
|
|
"num_input_tokens_seen": 754483200,
|
|
"step": 92100
|
|
},
|
|
{
|
|
"epoch": 2.9534243064898456,
|
|
"grad_norm": 0.6381150484085083,
|
|
"learning_rate": 3.037541776782782e-08,
|
|
"loss": 0.8199,
|
|
"num_input_tokens_seen": 755302400,
|
|
"step": 92200
|
|
},
|
|
{
|
|
"epoch": 2.9566275866487284,
|
|
"grad_norm": 0.8189881443977356,
|
|
"learning_rate": 2.6341618718223048e-08,
|
|
"loss": 0.8282,
|
|
"num_input_tokens_seen": 756121600,
|
|
"step": 92300
|
|
},
|
|
{
|
|
"epoch": 2.9598308668076108,
|
|
"grad_norm": 0.744215190410614,
|
|
"learning_rate": 2.2594932862041173e-08,
|
|
"loss": 0.823,
|
|
"num_input_tokens_seen": 756940800,
|
|
"step": 92400
|
|
},
|
|
{
|
|
"epoch": 2.9630341469664936,
|
|
"grad_norm": 0.6979692578315735,
|
|
"learning_rate": 1.91354032735902e-08,
|
|
"loss": 0.7854,
|
|
"num_input_tokens_seen": 757760000,
|
|
"step": 92500
|
|
},
|
|
{
|
|
"epoch": 2.9662374271253764,
|
|
"grad_norm": 0.6506592035293579,
|
|
"learning_rate": 1.5963069725838385e-08,
|
|
"loss": 0.8654,
|
|
"num_input_tokens_seen": 758579200,
|
|
"step": 92600
|
|
},
|
|
{
|
|
"epoch": 2.9694407072842592,
|
|
"grad_norm": 0.7221033573150635,
|
|
"learning_rate": 1.3077968689964582e-08,
|
|
"loss": 0.7966,
|
|
"num_input_tokens_seen": 759398400,
|
|
"step": 92700
|
|
},
|
|
{
|
|
"epoch": 2.9726439874431416,
|
|
"grad_norm": 0.5663209557533264,
|
|
"learning_rate": 1.0480133334947462e-08,
|
|
"loss": 0.8375,
|
|
"num_input_tokens_seen": 760217600,
|
|
"step": 92800
|
|
},
|
|
{
|
|
"epoch": 2.9758472676020244,
|
|
"grad_norm": 0.7616459131240845,
|
|
"learning_rate": 8.169593527160291e-09,
|
|
"loss": 0.8056,
|
|
"num_input_tokens_seen": 761036800,
|
|
"step": 92900
|
|
},
|
|
{
|
|
"epoch": 2.9790505477609073,
|
|
"grad_norm": 0.7259778380393982,
|
|
"learning_rate": 6.146375830054507e-09,
|
|
"loss": 0.8026,
|
|
"num_input_tokens_seen": 761856000,
|
|
"step": 93000
|
|
},
|
|
{
|
|
"epoch": 2.9822538279197897,
|
|
"grad_norm": 0.6411218643188477,
|
|
"learning_rate": 4.410503503840535e-09,
|
|
"loss": 0.8472,
|
|
"num_input_tokens_seen": 762675200,
|
|
"step": 93100
|
|
},
|
|
{
|
|
"epoch": 2.9854571080786725,
|
|
"grad_norm": 0.6619647741317749,
|
|
"learning_rate": 2.961996505213005e-09,
|
|
"loss": 0.8558,
|
|
"num_input_tokens_seen": 763494400,
|
|
"step": 93200
|
|
},
|
|
{
|
|
"epoch": 2.9886603882375553,
|
|
"grad_norm": 0.7283292412757874,
|
|
"learning_rate": 1.8008714871453613e-09,
|
|
"loss": 0.8321,
|
|
"num_input_tokens_seen": 764313600,
|
|
"step": 93300
|
|
},
|
|
{
|
|
"epoch": 2.991863668396438,
|
|
"grad_norm": 0.7489187717437744,
|
|
"learning_rate": 9.271417986705943e-10,
|
|
"loss": 0.8264,
|
|
"num_input_tokens_seen": 765132800,
|
|
"step": 93400
|
|
},
|
|
{
|
|
"epoch": 2.9950669485553205,
|
|
"grad_norm": 2.186750888824463,
|
|
"learning_rate": 3.408174847480128e-10,
|
|
"loss": 0.7796,
|
|
"num_input_tokens_seen": 765952000,
|
|
"step": 93500
|
|
},
|
|
{
|
|
"epoch": 2.9982702287142033,
|
|
"grad_norm": 2.5423426628112793,
|
|
"learning_rate": 4.1905286135568434e-11,
|
|
"loss": 0.7863,
|
|
"num_input_tokens_seen": 766771200,
|
|
"step": 93600
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"num_input_tokens_seen": 767213568,
|
|
"step": 93654,
|
|
"total_flos": 3.49334314435121e+19,
|
|
"train_loss": 0.04966252789047391,
|
|
"train_runtime": 28761.9651,
|
|
"train_samples_per_second": 3.256,
|
|
"train_steps_per_second": 3.256
|
|
}
|
|
],
|
|
"logging_steps": 100,
|
|
"max_steps": 93654,
|
|
"num_input_tokens_seen": 767213568,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.49334314435121e+19,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|