Qwen1.5-0.5Bfinetuning / trainer_state.json
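A minimal sketch (not part of the original file) of how the log_history entries in a trainer_state.json like the one below might be loaded and plotted. It assumes the JSON has been saved locally as "trainer_state.json"; the file path and the use of matplotlib are illustrative choices, not something specified by this repository.

# Load a Hugging Face Trainer state file and plot training loss vs. global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Each logging entry carries epoch, grad_norm, learning_rate, loss, and step;
# keep only entries that actually report a training loss.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("Qwen1.5-0.5B finetuning loss curve")
plt.show()

The same pattern extends to the other logged fields (for example, plotting grad_norm or learning_rate against step) by swapping the key used when building the lists.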
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9324922169424874,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 2.323194980621338,
"learning_rate": 4.9999946882250004e-05,
"loss": 0.2105,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 3.090125799179077,
"learning_rate": 4.999978752922572e-05,
"loss": 0.2656,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 2.8763957023620605,
"learning_rate": 4.999952194160431e-05,
"loss": 0.3075,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 2.714475631713867,
"learning_rate": 4.999915012051437e-05,
"loss": 0.3164,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 2.6384477615356445,
"learning_rate": 4.999867206753593e-05,
"loss": 0.3167,
"step": 25
},
{
"epoch": 0.04,
"grad_norm": 3.147310733795166,
"learning_rate": 4.9998087784700426e-05,
"loss": 0.3379,
"step": 30
},
{
"epoch": 0.05,
"grad_norm": 2.4968576431274414,
"learning_rate": 4.9997397274490725e-05,
"loss": 0.3289,
"step": 35
},
{
"epoch": 0.05,
"grad_norm": 2.5513858795166016,
"learning_rate": 4.9996600539841096e-05,
"loss": 0.3304,
"step": 40
},
{
"epoch": 0.06,
"grad_norm": 2.6501898765563965,
"learning_rate": 4.99956975841372e-05,
"loss": 0.3316,
"step": 45
},
{
"epoch": 0.07,
"grad_norm": 3.1661622524261475,
"learning_rate": 4.9994688411216076e-05,
"loss": 0.3352,
"step": 50
},
{
"epoch": 0.07,
"grad_norm": 3.1451425552368164,
"learning_rate": 4.9993573025366124e-05,
"loss": 0.3337,
"step": 55
},
{
"epoch": 0.08,
"grad_norm": 3.058828115463257,
"learning_rate": 4.999235143132708e-05,
"loss": 0.3283,
"step": 60
},
{
"epoch": 0.09,
"grad_norm": 2.6413097381591797,
"learning_rate": 4.999102363429002e-05,
"loss": 0.312,
"step": 65
},
{
"epoch": 0.09,
"grad_norm": 2.8983662128448486,
"learning_rate": 4.99895896398973e-05,
"loss": 0.3184,
"step": 70
},
{
"epoch": 0.1,
"grad_norm": 2.6056504249572754,
"learning_rate": 4.998804945424258e-05,
"loss": 0.3464,
"step": 75
},
{
"epoch": 0.1,
"grad_norm": 2.5964763164520264,
"learning_rate": 4.998640308387074e-05,
"loss": 0.335,
"step": 80
},
{
"epoch": 0.11,
"grad_norm": 2.594703435897827,
"learning_rate": 4.9984650535777896e-05,
"loss": 0.3487,
"step": 85
},
{
"epoch": 0.12,
"grad_norm": 2.996779203414917,
"learning_rate": 4.9982791817411386e-05,
"loss": 0.346,
"step": 90
},
{
"epoch": 0.12,
"grad_norm": 2.9310555458068848,
"learning_rate": 4.998082693666966e-05,
"loss": 0.3203,
"step": 95
},
{
"epoch": 0.13,
"grad_norm": 3.1505908966064453,
"learning_rate": 4.997875590190233e-05,
"loss": 0.3766,
"step": 100
},
{
"epoch": 0.14,
"grad_norm": 2.8678457736968994,
"learning_rate": 4.9976578721910106e-05,
"loss": 0.3404,
"step": 105
},
{
"epoch": 0.14,
"grad_norm": 2.8068525791168213,
"learning_rate": 4.9974295405944714e-05,
"loss": 0.3249,
"step": 110
},
{
"epoch": 0.15,
"grad_norm": 2.8355133533477783,
"learning_rate": 4.9971905963708946e-05,
"loss": 0.3226,
"step": 115
},
{
"epoch": 0.16,
"grad_norm": 3.0205864906311035,
"learning_rate": 4.996941040535653e-05,
"loss": 0.3613,
"step": 120
},
{
"epoch": 0.16,
"grad_norm": 2.9807019233703613,
"learning_rate": 4.9966808741492153e-05,
"loss": 0.3284,
"step": 125
},
{
"epoch": 0.17,
"grad_norm": 3.070850372314453,
"learning_rate": 4.996410098317137e-05,
"loss": 0.3217,
"step": 130
},
{
"epoch": 0.18,
"grad_norm": 3.1543736457824707,
"learning_rate": 4.996128714190058e-05,
"loss": 0.3636,
"step": 135
},
{
"epoch": 0.18,
"grad_norm": 2.80784273147583,
"learning_rate": 4.995836722963699e-05,
"loss": 0.3379,
"step": 140
},
{
"epoch": 0.19,
"grad_norm": 2.6561925411224365,
"learning_rate": 4.9955341258788526e-05,
"loss": 0.3442,
"step": 145
},
{
"epoch": 0.2,
"grad_norm": 2.814857006072998,
"learning_rate": 4.99522092422138e-05,
"loss": 0.3456,
"step": 150
},
{
"epoch": 0.2,
"grad_norm": 2.924438238143921,
"learning_rate": 4.9948971193222086e-05,
"loss": 0.3436,
"step": 155
},
{
"epoch": 0.21,
"grad_norm": 2.7895913124084473,
"learning_rate": 4.994562712557319e-05,
"loss": 0.3319,
"step": 160
},
{
"epoch": 0.22,
"grad_norm": 2.878596544265747,
"learning_rate": 4.9942177053477474e-05,
"loss": 0.342,
"step": 165
},
{
"epoch": 0.22,
"grad_norm": 3.0157933235168457,
"learning_rate": 4.993862099159574e-05,
"loss": 0.3335,
"step": 170
},
{
"epoch": 0.23,
"grad_norm": 2.585909366607666,
"learning_rate": 4.99349589550392e-05,
"loss": 0.3373,
"step": 175
},
{
"epoch": 0.24,
"grad_norm": 2.992539167404175,
"learning_rate": 4.993119095936937e-05,
"loss": 0.3318,
"step": 180
},
{
"epoch": 0.24,
"grad_norm": 2.995790719985962,
"learning_rate": 4.992731702059805e-05,
"loss": 0.3289,
"step": 185
},
{
"epoch": 0.25,
"grad_norm": 3.0879945755004883,
"learning_rate": 4.9923337155187235e-05,
"loss": 0.3309,
"step": 190
},
{
"epoch": 0.26,
"grad_norm": 2.9949581623077393,
"learning_rate": 4.991925138004905e-05,
"loss": 0.3471,
"step": 195
},
{
"epoch": 0.26,
"grad_norm": 2.873623847961426,
"learning_rate": 4.991505971254566e-05,
"loss": 0.3463,
"step": 200
},
{
"epoch": 0.27,
"grad_norm": 3.0350987911224365,
"learning_rate": 4.9910762170489226e-05,
"loss": 0.33,
"step": 205
},
{
"epoch": 0.28,
"grad_norm": 2.794652223587036,
"learning_rate": 4.99063587721418e-05,
"loss": 0.3723,
"step": 210
},
{
"epoch": 0.28,
"grad_norm": 2.4460439682006836,
"learning_rate": 4.990184953621528e-05,
"loss": 0.3512,
"step": 215
},
{
"epoch": 0.29,
"grad_norm": 2.772210121154785,
"learning_rate": 4.989723448187131e-05,
"loss": 0.3232,
"step": 220
},
{
"epoch": 0.29,
"grad_norm": 3.0419492721557617,
"learning_rate": 4.989251362872119e-05,
"loss": 0.3364,
"step": 225
},
{
"epoch": 0.3,
"grad_norm": 3.0731663703918457,
"learning_rate": 4.988768699682579e-05,
"loss": 0.355,
"step": 230
},
{
"epoch": 0.31,
"grad_norm": 2.81030535697937,
"learning_rate": 4.9882754606695524e-05,
"loss": 0.3158,
"step": 235
},
{
"epoch": 0.31,
"grad_norm": 2.7734286785125732,
"learning_rate": 4.9877716479290174e-05,
"loss": 0.3286,
"step": 240
},
{
"epoch": 0.32,
"grad_norm": 2.918984889984131,
"learning_rate": 4.987257263601885e-05,
"loss": 0.3314,
"step": 245
},
{
"epoch": 0.33,
"grad_norm": 2.953944206237793,
"learning_rate": 4.986732309873992e-05,
"loss": 0.3179,
"step": 250
},
{
"epoch": 0.33,
"grad_norm": 2.8582749366760254,
"learning_rate": 4.986196788976086e-05,
"loss": 0.3238,
"step": 255
},
{
"epoch": 0.34,
"grad_norm": 2.757632255554199,
"learning_rate": 4.985650703183822e-05,
"loss": 0.3413,
"step": 260
},
{
"epoch": 0.35,
"grad_norm": 2.9017035961151123,
"learning_rate": 4.985094054817746e-05,
"loss": 0.3335,
"step": 265
},
{
"epoch": 0.35,
"grad_norm": 3.1360483169555664,
"learning_rate": 4.9845268462432916e-05,
"loss": 0.3474,
"step": 270
},
{
"epoch": 0.36,
"grad_norm": 2.847700834274292,
"learning_rate": 4.983949079870765e-05,
"loss": 0.3471,
"step": 275
},
{
"epoch": 0.37,
"grad_norm": 2.8749804496765137,
"learning_rate": 4.983360758155341e-05,
"loss": 0.3389,
"step": 280
},
{
"epoch": 0.37,
"grad_norm": 2.9671127796173096,
"learning_rate": 4.9827618835970426e-05,
"loss": 0.3379,
"step": 285
},
{
"epoch": 0.38,
"grad_norm": 2.869534730911255,
"learning_rate": 4.982152458740741e-05,
"loss": 0.328,
"step": 290
},
{
"epoch": 0.39,
"grad_norm": 2.9593312740325928,
"learning_rate": 4.981532486176138e-05,
"loss": 0.348,
"step": 295
},
{
"epoch": 0.39,
"grad_norm": 3.288499593734741,
"learning_rate": 4.980901968537758e-05,
"loss": 0.3691,
"step": 300
},
{
"epoch": 0.4,
"grad_norm": 3.329684257507324,
"learning_rate": 4.980260908504934e-05,
"loss": 0.3426,
"step": 305
},
{
"epoch": 0.41,
"grad_norm": 2.8230020999908447,
"learning_rate": 4.9796093088018e-05,
"loss": 0.3367,
"step": 310
},
{
"epoch": 0.41,
"grad_norm": 2.743234157562256,
"learning_rate": 4.978947172197277e-05,
"loss": 0.3594,
"step": 315
},
{
"epoch": 0.42,
"grad_norm": 2.874333620071411,
"learning_rate": 4.978274501505061e-05,
"loss": 0.3394,
"step": 320
},
{
"epoch": 0.43,
"grad_norm": 3.2279603481292725,
"learning_rate": 4.9775912995836136e-05,
"loss": 0.3307,
"step": 325
},
{
"epoch": 0.43,
"grad_norm": 2.608811378479004,
"learning_rate": 4.9768975693361454e-05,
"loss": 0.3431,
"step": 330
},
{
"epoch": 0.44,
"grad_norm": 2.868130683898926,
"learning_rate": 4.976193313710608e-05,
"loss": 0.3273,
"step": 335
},
{
"epoch": 0.45,
"grad_norm": 3.2972702980041504,
"learning_rate": 4.9754785356996787e-05,
"loss": 0.3453,
"step": 340
},
{
"epoch": 0.45,
"grad_norm": 3.2560746669769287,
"learning_rate": 4.9747532383407504e-05,
"loss": 0.3831,
"step": 345
},
{
"epoch": 0.46,
"grad_norm": 3.038130760192871,
"learning_rate": 4.9740174247159156e-05,
"loss": 0.3916,
"step": 350
},
{
"epoch": 0.47,
"grad_norm": 3.453185796737671,
"learning_rate": 4.973271097951956e-05,
"loss": 0.3661,
"step": 355
},
{
"epoch": 0.47,
"grad_norm": 2.8996922969818115,
"learning_rate": 4.9725142612203265e-05,
"loss": 0.3685,
"step": 360
},
{
"epoch": 0.48,
"grad_norm": 3.0452466011047363,
"learning_rate": 4.971746917737146e-05,
"loss": 0.3723,
"step": 365
},
{
"epoch": 0.49,
"grad_norm": 3.064406156539917,
"learning_rate": 4.970969070763177e-05,
"loss": 0.4086,
"step": 370
},
{
"epoch": 0.49,
"grad_norm": 3.1569948196411133,
"learning_rate": 4.9701807236038204e-05,
"loss": 0.4095,
"step": 375
},
{
"epoch": 0.5,
"grad_norm": 3.0515644550323486,
"learning_rate": 4.9693818796090927e-05,
"loss": 0.4156,
"step": 380
},
{
"epoch": 0.5,
"grad_norm": 3.2472422122955322,
"learning_rate": 4.968572542173617e-05,
"loss": 0.4684,
"step": 385
},
{
"epoch": 0.51,
"grad_norm": 3.417625904083252,
"learning_rate": 4.96775271473661e-05,
"loss": 0.4493,
"step": 390
},
{
"epoch": 0.52,
"grad_norm": 3.272538185119629,
"learning_rate": 4.9669224007818623e-05,
"loss": 0.4514,
"step": 395
},
{
"epoch": 0.52,
"grad_norm": 2.8853087425231934,
"learning_rate": 4.966081603837725e-05,
"loss": 0.4629,
"step": 400
},
{
"epoch": 0.53,
"grad_norm": 3.1684935092926025,
"learning_rate": 4.965230327477099e-05,
"loss": 0.4347,
"step": 405
},
{
"epoch": 0.54,
"grad_norm": 3.0273630619049072,
"learning_rate": 4.964368575317415e-05,
"loss": 0.4532,
"step": 410
},
{
"epoch": 0.54,
"grad_norm": 2.8964247703552246,
"learning_rate": 4.963496351020619e-05,
"loss": 0.4514,
"step": 415
},
{
"epoch": 0.55,
"grad_norm": 3.0990092754364014,
"learning_rate": 4.962613658293158e-05,
"loss": 0.4611,
"step": 420
},
{
"epoch": 0.56,
"grad_norm": 3.376248836517334,
"learning_rate": 4.961720500885967e-05,
"loss": 0.4585,
"step": 425
},
{
"epoch": 0.56,
"grad_norm": 3.2961933612823486,
"learning_rate": 4.960816882594443e-05,
"loss": 0.4574,
"step": 430
},
{
"epoch": 0.57,
"grad_norm": 3.159632682800293,
"learning_rate": 4.959902807258443e-05,
"loss": 0.4567,
"step": 435
},
{
"epoch": 0.58,
"grad_norm": 3.271243095397949,
"learning_rate": 4.958978278762255e-05,
"loss": 0.4709,
"step": 440
},
{
"epoch": 0.58,
"grad_norm": 2.813108205795288,
"learning_rate": 4.958043301034589e-05,
"loss": 0.477,
"step": 445
},
{
"epoch": 0.59,
"grad_norm": 3.1648154258728027,
"learning_rate": 4.95709787804856e-05,
"loss": 0.4489,
"step": 450
},
{
"epoch": 0.6,
"grad_norm": 3.2871389389038086,
"learning_rate": 4.9561420138216645e-05,
"loss": 0.4604,
"step": 455
},
{
"epoch": 0.6,
"grad_norm": 3.215829849243164,
"learning_rate": 4.955175712415773e-05,
"loss": 0.4703,
"step": 460
},
{
"epoch": 0.61,
"grad_norm": 3.0727405548095703,
"learning_rate": 4.954198977937106e-05,
"loss": 0.4745,
"step": 465
},
{
"epoch": 0.62,
"grad_norm": 3.3414416313171387,
"learning_rate": 4.953211814536217e-05,
"loss": 0.4481,
"step": 470
},
{
"epoch": 0.62,
"grad_norm": 3.058262348175049,
"learning_rate": 4.9522142264079794e-05,
"loss": 0.4765,
"step": 475
},
{
"epoch": 0.63,
"grad_norm": 2.8146088123321533,
"learning_rate": 4.951206217791564e-05,
"loss": 0.4682,
"step": 480
},
{
"epoch": 0.64,
"grad_norm": 3.241665840148926,
"learning_rate": 4.9501877929704215e-05,
"loss": 0.4803,
"step": 485
},
{
"epoch": 0.64,
"grad_norm": 3.362031936645508,
"learning_rate": 4.949158956272268e-05,
"loss": 0.5213,
"step": 490
},
{
"epoch": 0.65,
"grad_norm": 3.4168310165405273,
"learning_rate": 4.948119712069062e-05,
"loss": 0.5243,
"step": 495
},
{
"epoch": 0.66,
"grad_norm": 3.469191312789917,
"learning_rate": 4.9470700647769904e-05,
"loss": 0.5824,
"step": 500
},
{
"epoch": 0.66,
"grad_norm": 3.592074394226074,
"learning_rate": 4.9460100188564426e-05,
"loss": 0.5777,
"step": 505
},
{
"epoch": 0.67,
"grad_norm": 3.4984512329101562,
"learning_rate": 4.944939578812001e-05,
"loss": 0.6011,
"step": 510
},
{
"epoch": 0.68,
"grad_norm": 3.6021430492401123,
"learning_rate": 4.943858749192414e-05,
"loss": 0.6145,
"step": 515
},
{
"epoch": 0.68,
"grad_norm": 3.7211990356445312,
"learning_rate": 4.942767534590581e-05,
"loss": 0.6159,
"step": 520
},
{
"epoch": 0.69,
"grad_norm": 3.5600759983062744,
"learning_rate": 4.9416659396435304e-05,
"loss": 0.5823,
"step": 525
},
{
"epoch": 0.69,
"grad_norm": 3.6605124473571777,
"learning_rate": 4.940553969032403e-05,
"loss": 0.6421,
"step": 530
},
{
"epoch": 0.7,
"grad_norm": 3.654963493347168,
"learning_rate": 4.9394316274824284e-05,
"loss": 0.6296,
"step": 535
},
{
"epoch": 0.71,
"grad_norm": 3.279911518096924,
"learning_rate": 4.938298919762907e-05,
"loss": 0.6206,
"step": 540
},
{
"epoch": 0.71,
"grad_norm": 3.3684518337249756,
"learning_rate": 4.9371558506871893e-05,
"loss": 0.618,
"step": 545
},
{
"epoch": 0.72,
"grad_norm": 3.6144015789031982,
"learning_rate": 4.936002425112657e-05,
"loss": 0.6063,
"step": 550
},
{
"epoch": 0.73,
"grad_norm": 3.5420987606048584,
"learning_rate": 4.934838647940699e-05,
"loss": 0.6417,
"step": 555
},
{
"epoch": 0.73,
"grad_norm": 3.549086570739746,
"learning_rate": 4.933664524116694e-05,
"loss": 0.6196,
"step": 560
},
{
"epoch": 0.74,
"grad_norm": 3.43263840675354,
"learning_rate": 4.9324800586299854e-05,
"loss": 0.6224,
"step": 565
},
{
"epoch": 0.75,
"grad_norm": 3.745143175125122,
"learning_rate": 4.931285256513868e-05,
"loss": 0.6052,
"step": 570
},
{
"epoch": 0.75,
"grad_norm": 3.4879541397094727,
"learning_rate": 4.9300801228455536e-05,
"loss": 0.6168,
"step": 575
},
{
"epoch": 0.76,
"grad_norm": 3.449970245361328,
"learning_rate": 4.9288646627461645e-05,
"loss": 0.6278,
"step": 580
},
{
"epoch": 0.77,
"grad_norm": 3.4467875957489014,
"learning_rate": 4.9276388813807e-05,
"loss": 0.5972,
"step": 585
},
{
"epoch": 0.77,
"grad_norm": 3.513766288757324,
"learning_rate": 4.92640278395802e-05,
"loss": 0.6289,
"step": 590
},
{
"epoch": 0.78,
"grad_norm": 3.6418793201446533,
"learning_rate": 4.925156375730822e-05,
"loss": 0.6228,
"step": 595
},
{
"epoch": 0.79,
"grad_norm": 3.486773729324341,
"learning_rate": 4.923899661995617e-05,
"loss": 0.5931,
"step": 600
},
{
"epoch": 0.79,
"grad_norm": 3.476663112640381,
"learning_rate": 4.92263264809271e-05,
"loss": 0.615,
"step": 605
},
{
"epoch": 0.8,
"grad_norm": 3.6010472774505615,
"learning_rate": 4.9213553394061754e-05,
"loss": 0.6221,
"step": 610
},
{
"epoch": 0.81,
"grad_norm": 3.489880323410034,
"learning_rate": 4.920067741363835e-05,
"loss": 0.6008,
"step": 615
},
{
"epoch": 0.81,
"grad_norm": 3.6153078079223633,
"learning_rate": 4.918769859437232e-05,
"loss": 0.6362,
"step": 620
},
{
"epoch": 0.82,
"grad_norm": 3.6381516456604004,
"learning_rate": 4.9174616991416136e-05,
"loss": 0.6391,
"step": 625
},
{
"epoch": 0.83,
"grad_norm": 3.5741944313049316,
"learning_rate": 4.916143266035901e-05,
"loss": 0.617,
"step": 630
},
{
"epoch": 0.83,
"grad_norm": 3.613692283630371,
"learning_rate": 4.914814565722671e-05,
"loss": 0.628,
"step": 635
},
{
"epoch": 0.84,
"grad_norm": 3.7904045581817627,
"learning_rate": 4.913475603848129e-05,
"loss": 0.6157,
"step": 640
},
{
"epoch": 0.85,
"grad_norm": 3.4816792011260986,
"learning_rate": 4.912126386102086e-05,
"loss": 0.6457,
"step": 645
},
{
"epoch": 0.85,
"grad_norm": 3.371680974960327,
"learning_rate": 4.910766918217935e-05,
"loss": 0.6304,
"step": 650
},
{
"epoch": 0.86,
"grad_norm": 3.502263069152832,
"learning_rate": 4.909397205972627e-05,
"loss": 0.6057,
"step": 655
},
{
"epoch": 0.87,
"grad_norm": 3.6000306606292725,
"learning_rate": 4.908017255186643e-05,
"loss": 0.6629,
"step": 660
},
{
"epoch": 0.87,
"grad_norm": 3.747457265853882,
"learning_rate": 4.906627071723975e-05,
"loss": 0.659,
"step": 665
},
{
"epoch": 0.88,
"grad_norm": 3.356635808944702,
"learning_rate": 4.905226661492095e-05,
"loss": 0.6263,
"step": 670
},
{
"epoch": 0.88,
"grad_norm": 3.6806087493896484,
"learning_rate": 4.903816030441935e-05,
"loss": 0.6128,
"step": 675
},
{
"epoch": 0.89,
"grad_norm": 3.5806305408477783,
"learning_rate": 4.902395184567859e-05,
"loss": 0.6538,
"step": 680
},
{
"epoch": 0.9,
"grad_norm": 3.4562089443206787,
"learning_rate": 4.900964129907638e-05,
"loss": 0.6271,
"step": 685
},
{
"epoch": 0.9,
"grad_norm": 3.561217784881592,
"learning_rate": 4.8995228725424235e-05,
"loss": 0.6683,
"step": 690
},
{
"epoch": 0.91,
"grad_norm": 3.419334650039673,
"learning_rate": 4.898071418596724e-05,
"loss": 0.6503,
"step": 695
},
{
"epoch": 0.92,
"grad_norm": 3.7222039699554443,
"learning_rate": 4.8966097742383765e-05,
"loss": 0.6211,
"step": 700
},
{
"epoch": 0.92,
"grad_norm": 3.767538070678711,
"learning_rate": 4.895137945678522e-05,
"loss": 0.6252,
"step": 705
},
{
"epoch": 0.93,
"grad_norm": 3.1880886554718018,
"learning_rate": 4.893655939171578e-05,
"loss": 0.6403,
"step": 710
},
{
"epoch": 0.94,
"grad_norm": 3.655524492263794,
"learning_rate": 4.892163761015214e-05,
"loss": 0.6344,
"step": 715
},
{
"epoch": 0.94,
"grad_norm": 3.342782735824585,
"learning_rate": 4.890661417550319e-05,
"loss": 0.6339,
"step": 720
},
{
"epoch": 0.95,
"grad_norm": 3.661858320236206,
"learning_rate": 4.889148915160984e-05,
"loss": 0.6554,
"step": 725
},
{
"epoch": 0.96,
"grad_norm": 3.906249761581421,
"learning_rate": 4.887626260274465e-05,
"loss": 0.6478,
"step": 730
},
{
"epoch": 0.96,
"grad_norm": 3.5664069652557373,
"learning_rate": 4.886093459361163e-05,
"loss": 0.652,
"step": 735
},
{
"epoch": 0.97,
"grad_norm": 3.413325786590576,
"learning_rate": 4.8845505189345934e-05,
"loss": 0.6491,
"step": 740
},
{
"epoch": 0.98,
"grad_norm": 3.2417361736297607,
"learning_rate": 4.8829974455513564e-05,
"loss": 0.6344,
"step": 745
},
{
"epoch": 0.98,
"grad_norm": 4.055202960968018,
"learning_rate": 4.881434245811115e-05,
"loss": 0.6458,
"step": 750
},
{
"epoch": 0.99,
"grad_norm": 3.9207661151885986,
"learning_rate": 4.87986092635656e-05,
"loss": 0.6493,
"step": 755
},
{
"epoch": 1.0,
"grad_norm": 3.7753255367279053,
"learning_rate": 4.878277493873388e-05,
"loss": 0.6141,
"step": 760
},
{
"epoch": 1.0,
"grad_norm": 2.760272979736328,
"learning_rate": 4.876683955090267e-05,
"loss": 0.4732,
"step": 765
},
{
"epoch": 1.01,
"grad_norm": 2.363640069961548,
"learning_rate": 4.8750803167788136e-05,
"loss": 0.2479,
"step": 770
},
{
"epoch": 1.02,
"grad_norm": 2.5096426010131836,
"learning_rate": 4.87346658575356e-05,
"loss": 0.2297,
"step": 775
},
{
"epoch": 1.02,
"grad_norm": 2.936283826828003,
"learning_rate": 4.871842768871928e-05,
"loss": 0.231,
"step": 780
},
{
"epoch": 1.03,
"grad_norm": 2.782768964767456,
"learning_rate": 4.8702088730341965e-05,
"loss": 0.2195,
"step": 785
},
{
"epoch": 1.04,
"grad_norm": 2.589066982269287,
"learning_rate": 4.868564905183476e-05,
"loss": 0.2205,
"step": 790
},
{
"epoch": 1.04,
"grad_norm": 2.7772037982940674,
"learning_rate": 4.866910872305675e-05,
"loss": 0.2144,
"step": 795
},
{
"epoch": 1.05,
"grad_norm": 2.887826919555664,
"learning_rate": 4.865246781429476e-05,
"loss": 0.2244,
"step": 800
},
{
"epoch": 1.06,
"grad_norm": 2.558298349380493,
"learning_rate": 4.8635726396262996e-05,
"loss": 0.2422,
"step": 805
},
{
"epoch": 1.06,
"grad_norm": 2.575716972351074,
"learning_rate": 4.861888454010275e-05,
"loss": 0.2223,
"step": 810
},
{
"epoch": 1.07,
"grad_norm": 2.8698527812957764,
"learning_rate": 4.860194231738216e-05,
"loss": 0.2164,
"step": 815
},
{
"epoch": 1.07,
"grad_norm": 2.723947763442993,
"learning_rate": 4.8584899800095864e-05,
"loss": 0.2332,
"step": 820
},
{
"epoch": 1.08,
"grad_norm": 2.9089255332946777,
"learning_rate": 4.8567757060664644e-05,
"loss": 0.2419,
"step": 825
},
{
"epoch": 1.09,
"grad_norm": 2.4928860664367676,
"learning_rate": 4.8550514171935214e-05,
"loss": 0.2268,
"step": 830
},
{
"epoch": 1.09,
"grad_norm": 2.4811110496520996,
"learning_rate": 4.853317120717985e-05,
"loss": 0.2137,
"step": 835
},
{
"epoch": 1.1,
"grad_norm": 3.3581888675689697,
"learning_rate": 4.85157282400961e-05,
"loss": 0.2416,
"step": 840
},
{
"epoch": 1.11,
"grad_norm": 2.5201902389526367,
"learning_rate": 4.849818534480645e-05,
"loss": 0.2263,
"step": 845
},
{
"epoch": 1.11,
"grad_norm": 2.51816463470459,
"learning_rate": 4.8480542595858025e-05,
"loss": 0.2346,
"step": 850
},
{
"epoch": 1.12,
"grad_norm": 2.437803030014038,
"learning_rate": 4.846280006822228e-05,
"loss": 0.2311,
"step": 855
},
{
"epoch": 1.13,
"grad_norm": 2.6363041400909424,
"learning_rate": 4.844495783729467e-05,
"loss": 0.2364,
"step": 860
},
{
"epoch": 1.13,
"grad_norm": 2.600130796432495,
"learning_rate": 4.842701597889432e-05,
"loss": 0.2292,
"step": 865
},
{
"epoch": 1.14,
"grad_norm": 2.5062355995178223,
"learning_rate": 4.840897456926373e-05,
"loss": 0.253,
"step": 870
},
{
"epoch": 1.15,
"grad_norm": 2.7811343669891357,
"learning_rate": 4.8390833685068424e-05,
"loss": 0.2347,
"step": 875
},
{
"epoch": 1.15,
"grad_norm": 2.58297061920166,
"learning_rate": 4.837259340339665e-05,
"loss": 0.2313,
"step": 880
},
{
"epoch": 1.16,
"grad_norm": 2.850160598754883,
"learning_rate": 4.8354253801759e-05,
"loss": 0.2433,
"step": 885
},
{
"epoch": 1.17,
"grad_norm": 2.6711654663085938,
"learning_rate": 4.8335814958088166e-05,
"loss": 0.2384,
"step": 890
},
{
"epoch": 1.17,
"grad_norm": 2.596914291381836,
"learning_rate": 4.8317276950738525e-05,
"loss": 0.2411,
"step": 895
},
{
"epoch": 1.18,
"grad_norm": 2.9930615425109863,
"learning_rate": 4.829863985848587e-05,
"loss": 0.2381,
"step": 900
},
{
"epoch": 1.19,
"grad_norm": 2.8596436977386475,
"learning_rate": 4.827990376052702e-05,
"loss": 0.2409,
"step": 905
},
{
"epoch": 1.19,
"grad_norm": 2.607304573059082,
"learning_rate": 4.826106873647953e-05,
"loss": 0.2387,
"step": 910
},
{
"epoch": 1.2,
"grad_norm": 2.747140645980835,
"learning_rate": 4.824213486638133e-05,
"loss": 0.2552,
"step": 915
},
{
"epoch": 1.21,
"grad_norm": 2.7441892623901367,
"learning_rate": 4.822310223069039e-05,
"loss": 0.2414,
"step": 920
},
{
"epoch": 1.21,
"grad_norm": 2.480534791946411,
"learning_rate": 4.820397091028436e-05,
"loss": 0.2451,
"step": 925
},
{
"epoch": 1.22,
"grad_norm": 2.65727162361145,
"learning_rate": 4.818474098646026e-05,
"loss": 0.2271,
"step": 930
},
{
"epoch": 1.23,
"grad_norm": 2.657895088195801,
"learning_rate": 4.8165412540934116e-05,
"loss": 0.2463,
"step": 935
},
{
"epoch": 1.23,
"grad_norm": 2.8118693828582764,
"learning_rate": 4.814598565584062e-05,
"loss": 0.2586,
"step": 940
},
{
"epoch": 1.24,
"grad_norm": 2.7916576862335205,
"learning_rate": 4.812646041373275e-05,
"loss": 0.2487,
"step": 945
},
{
"epoch": 1.25,
"grad_norm": 2.8312575817108154,
"learning_rate": 4.810683689758147e-05,
"loss": 0.2448,
"step": 950
},
{
"epoch": 1.25,
"grad_norm": 2.789888620376587,
"learning_rate": 4.808711519077534e-05,
"loss": 0.25,
"step": 955
},
{
"epoch": 1.26,
"grad_norm": 2.660008192062378,
"learning_rate": 4.806729537712017e-05,
"loss": 0.2592,
"step": 960
},
{
"epoch": 1.26,
"grad_norm": 2.80081844329834,
"learning_rate": 4.8047377540838676e-05,
"loss": 0.2633,
"step": 965
},
{
"epoch": 1.27,
"grad_norm": 2.5701184272766113,
"learning_rate": 4.8027361766570117e-05,
"loss": 0.2345,
"step": 970
},
{
"epoch": 1.28,
"grad_norm": 2.6467089653015137,
"learning_rate": 4.8007248139369915e-05,
"loss": 0.2421,
"step": 975
},
{
"epoch": 1.28,
"grad_norm": 2.8026981353759766,
"learning_rate": 4.7987036744709326e-05,
"loss": 0.2462,
"step": 980
},
{
"epoch": 1.29,
"grad_norm": 2.9150643348693848,
"learning_rate": 4.7966727668475044e-05,
"loss": 0.2516,
"step": 985
},
{
"epoch": 1.3,
"grad_norm": 2.872527837753296,
"learning_rate": 4.794632099696888e-05,
"loss": 0.2581,
"step": 990
},
{
"epoch": 1.3,
"grad_norm": 2.764134168624878,
"learning_rate": 4.792581681690734e-05,
"loss": 0.2707,
"step": 995
},
{
"epoch": 1.31,
"grad_norm": 2.886357069015503,
"learning_rate": 4.790521521542129e-05,
"loss": 0.2573,
"step": 1000
},
{
"epoch": 1.32,
"grad_norm": 2.990485429763794,
"learning_rate": 4.788451628005561e-05,
"loss": 0.2634,
"step": 1005
},
{
"epoch": 1.32,
"grad_norm": 2.758971691131592,
"learning_rate": 4.786372009876876e-05,
"loss": 0.2439,
"step": 1010
},
{
"epoch": 1.33,
"grad_norm": 2.70831561088562,
"learning_rate": 4.784282675993245e-05,
"loss": 0.241,
"step": 1015
},
{
"epoch": 1.34,
"grad_norm": 2.6341211795806885,
"learning_rate": 4.782183635233124e-05,
"loss": 0.2652,
"step": 1020
},
{
"epoch": 1.34,
"grad_norm": 2.7551965713500977,
"learning_rate": 4.780074896516219e-05,
"loss": 0.244,
"step": 1025
},
{
"epoch": 1.35,
"grad_norm": 3.252516508102417,
"learning_rate": 4.7779564688034476e-05,
"loss": 0.2594,
"step": 1030
},
{
"epoch": 1.36,
"grad_norm": 2.93808913230896,
"learning_rate": 4.7758283610968985e-05,
"loss": 0.2594,
"step": 1035
},
{
"epoch": 1.36,
"grad_norm": 2.767031192779541,
"learning_rate": 4.773690582439795e-05,
"loss": 0.2506,
"step": 1040
},
{
"epoch": 1.37,
"grad_norm": 2.6166746616363525,
"learning_rate": 4.7715431419164566e-05,
"loss": 0.2624,
"step": 1045
},
{
"epoch": 1.38,
"grad_norm": 2.9592745304107666,
"learning_rate": 4.7693860486522604e-05,
"loss": 0.2735,
"step": 1050
},
{
"epoch": 1.38,
"grad_norm": 2.8421945571899414,
"learning_rate": 4.7672193118136e-05,
"loss": 0.2693,
"step": 1055
},
{
"epoch": 1.39,
"grad_norm": 3.0941479206085205,
"learning_rate": 4.7650429406078525e-05,
"loss": 0.2563,
"step": 1060
},
{
"epoch": 1.4,
"grad_norm": 2.8086464405059814,
"learning_rate": 4.762856944283331e-05,
"loss": 0.2627,
"step": 1065
},
{
"epoch": 1.4,
"grad_norm": 2.981468439102173,
"learning_rate": 4.760661332129254e-05,
"loss": 0.2739,
"step": 1070
},
{
"epoch": 1.41,
"grad_norm": 2.7119858264923096,
"learning_rate": 4.758456113475699e-05,
"loss": 0.2697,
"step": 1075
},
{
"epoch": 1.42,
"grad_norm": 2.955040454864502,
"learning_rate": 4.756241297693566e-05,
"loss": 0.2713,
"step": 1080
},
{
"epoch": 1.42,
"grad_norm": 2.78459095954895,
"learning_rate": 4.7540168941945376e-05,
"loss": 0.2659,
"step": 1085
},
{
"epoch": 1.43,
"grad_norm": 2.754824161529541,
"learning_rate": 4.751782912431038e-05,
"loss": 0.2527,
"step": 1090
},
{
"epoch": 1.44,
"grad_norm": 2.916003465652466,
"learning_rate": 4.749539361896195e-05,
"loss": 0.2554,
"step": 1095
},
{
"epoch": 1.44,
"grad_norm": 2.9990346431732178,
"learning_rate": 4.747286252123797e-05,
"loss": 0.2449,
"step": 1100
},
{
"epoch": 1.45,
"grad_norm": 2.68816876411438,
"learning_rate": 4.7450235926882524e-05,
"loss": 0.2539,
"step": 1105
},
{
"epoch": 1.46,
"grad_norm": 2.7783591747283936,
"learning_rate": 4.742751393204553e-05,
"loss": 0.2673,
"step": 1110
},
{
"epoch": 1.46,
"grad_norm": 3.041889190673828,
"learning_rate": 4.740469663328228e-05,
"loss": 0.2692,
"step": 1115
},
{
"epoch": 1.47,
"grad_norm": 3.2789931297302246,
"learning_rate": 4.738178412755306e-05,
"loss": 0.2691,
"step": 1120
},
{
"epoch": 1.47,
"grad_norm": 2.8584647178649902,
"learning_rate": 4.7358776512222737e-05,
"loss": 0.2722,
"step": 1125
},
{
"epoch": 1.48,
"grad_norm": 2.982015371322632,
"learning_rate": 4.7335673885060316e-05,
"loss": 0.2721,
"step": 1130
},
{
"epoch": 1.49,
"grad_norm": 2.9325811862945557,
"learning_rate": 4.731247634423858e-05,
"loss": 0.2791,
"step": 1135
},
{
"epoch": 1.49,
"grad_norm": 2.9873268604278564,
"learning_rate": 4.728918398833361e-05,
"loss": 0.2805,
"step": 1140
},
{
"epoch": 1.5,
"grad_norm": 2.8286678791046143,
"learning_rate": 4.726579691632442e-05,
"loss": 0.2628,
"step": 1145
},
{
"epoch": 1.51,
"grad_norm": 2.6870853900909424,
"learning_rate": 4.7242315227592496e-05,
"loss": 0.2697,
"step": 1150
},
{
"epoch": 1.51,
"grad_norm": 2.881246566772461,
"learning_rate": 4.721873902192139e-05,
"loss": 0.2786,
"step": 1155
},
{
"epoch": 1.52,
"grad_norm": 2.676746129989624,
"learning_rate": 4.719506839949631e-05,
"loss": 0.2795,
"step": 1160
},
{
"epoch": 1.53,
"grad_norm": 2.8064475059509277,
"learning_rate": 4.717130346090368e-05,
"loss": 0.2729,
"step": 1165
},
{
"epoch": 1.53,
"grad_norm": 2.7660868167877197,
"learning_rate": 4.7147444307130686e-05,
"loss": 0.2752,
"step": 1170
},
{
"epoch": 1.54,
"grad_norm": 2.8748722076416016,
"learning_rate": 4.71234910395649e-05,
"loss": 0.2772,
"step": 1175
},
{
"epoch": 1.55,
"grad_norm": 2.691197633743286,
"learning_rate": 4.7099443759993837e-05,
"loss": 0.256,
"step": 1180
},
{
"epoch": 1.55,
"grad_norm": 2.8552544116973877,
"learning_rate": 4.707530257060445e-05,
"loss": 0.2758,
"step": 1185
},
{
"epoch": 1.56,
"grad_norm": 2.7499427795410156,
"learning_rate": 4.705106757398282e-05,
"loss": 0.2628,
"step": 1190
},
{
"epoch": 1.57,
"grad_norm": 2.6907596588134766,
"learning_rate": 4.702673887311362e-05,
"loss": 0.2662,
"step": 1195
},
{
"epoch": 1.57,
"grad_norm": 2.7225170135498047,
"learning_rate": 4.7002316571379715e-05,
"loss": 0.2709,
"step": 1200
},
{
"epoch": 1.58,
"grad_norm": 3.2904715538024902,
"learning_rate": 4.697780077256172e-05,
"loss": 0.2853,
"step": 1205
},
{
"epoch": 1.59,
"grad_norm": 2.7764620780944824,
"learning_rate": 4.695319158083756e-05,
"loss": 0.2623,
"step": 1210
},
{
"epoch": 1.59,
"grad_norm": 3.36917781829834,
"learning_rate": 4.6928489100782046e-05,
"loss": 0.2806,
"step": 1215
},
{
"epoch": 1.6,
"grad_norm": 3.3074262142181396,
"learning_rate": 4.690369343736636e-05,
"loss": 0.2834,
"step": 1220
},
{
"epoch": 1.61,
"grad_norm": 2.958819627761841,
"learning_rate": 4.6878804695957716e-05,
"loss": 0.2787,
"step": 1225
},
{
"epoch": 1.61,
"grad_norm": 2.8270795345306396,
"learning_rate": 4.6853822982318816e-05,
"loss": 0.2737,
"step": 1230
},
{
"epoch": 1.62,
"grad_norm": 2.6642744541168213,
"learning_rate": 4.682874840260746e-05,
"loss": 0.2872,
"step": 1235
},
{
"epoch": 1.63,
"grad_norm": 3.0754623413085938,
"learning_rate": 4.680358106337607e-05,
"loss": 0.2674,
"step": 1240
},
{
"epoch": 1.63,
"grad_norm": 3.076148271560669,
"learning_rate": 4.6778321071571224e-05,
"loss": 0.2769,
"step": 1245
},
{
"epoch": 1.64,
"grad_norm": 2.8592352867126465,
"learning_rate": 4.675296853453326e-05,
"loss": 0.2799,
"step": 1250
},
{
"epoch": 1.65,
"grad_norm": 3.153860330581665,
"learning_rate": 4.6727523559995734e-05,
"loss": 0.2812,
"step": 1255
},
{
"epoch": 1.65,
"grad_norm": 3.1477208137512207,
"learning_rate": 4.6701986256085046e-05,
"loss": 0.2818,
"step": 1260
},
{
"epoch": 1.66,
"grad_norm": 3.040626049041748,
"learning_rate": 4.667635673131992e-05,
"loss": 0.2832,
"step": 1265
},
{
"epoch": 1.66,
"grad_norm": 3.204580307006836,
"learning_rate": 4.665063509461097e-05,
"loss": 0.3009,
"step": 1270
},
{
"epoch": 1.67,
"grad_norm": 2.8025059700012207,
"learning_rate": 4.662482145526024e-05,
"loss": 0.2776,
"step": 1275
},
{
"epoch": 1.68,
"grad_norm": 3.0659685134887695,
"learning_rate": 4.659891592296071e-05,
"loss": 0.291,
"step": 1280
},
{
"epoch": 1.68,
"grad_norm": 2.9462106227874756,
"learning_rate": 4.6572918607795876e-05,
"loss": 0.287,
"step": 1285
},
{
"epoch": 1.69,
"grad_norm": 3.0103273391723633,
"learning_rate": 4.6546829620239265e-05,
"loss": 0.3025,
"step": 1290
},
{
"epoch": 1.7,
"grad_norm": 2.912851095199585,
"learning_rate": 4.6520649071153916e-05,
"loss": 0.2675,
"step": 1295
},
{
"epoch": 1.7,
"grad_norm": 3.1437137126922607,
"learning_rate": 4.6494377071791996e-05,
"loss": 0.2896,
"step": 1300
},
{
"epoch": 1.71,
"grad_norm": 2.8913474082946777,
"learning_rate": 4.646801373379425e-05,
"loss": 0.3142,
"step": 1305
},
{
"epoch": 1.72,
"grad_norm": 3.0581839084625244,
"learning_rate": 4.644155916918959e-05,
"loss": 0.293,
"step": 1310
},
{
"epoch": 1.72,
"grad_norm": 2.8686771392822266,
"learning_rate": 4.641501349039456e-05,
"loss": 0.273,
"step": 1315
},
{
"epoch": 1.73,
"grad_norm": 2.914700984954834,
"learning_rate": 4.6388376810212905e-05,
"loss": 0.2837,
"step": 1320
},
{
"epoch": 1.74,
"grad_norm": 3.2269139289855957,
"learning_rate": 4.6361649241835056e-05,
"loss": 0.2849,
"step": 1325
},
{
"epoch": 1.74,
"grad_norm": 3.0138943195343018,
"learning_rate": 4.633483089883769e-05,
"loss": 0.2854,
"step": 1330
},
{
"epoch": 1.75,
"grad_norm": 3.2977559566497803,
"learning_rate": 4.63079218951832e-05,
"loss": 0.2922,
"step": 1335
},
{
"epoch": 1.76,
"grad_norm": 3.0085713863372803,
"learning_rate": 4.6280922345219255e-05,
"loss": 0.2838,
"step": 1340
},
{
"epoch": 1.76,
"grad_norm": 3.183983087539673,
"learning_rate": 4.625383236367827e-05,
"loss": 0.282,
"step": 1345
},
{
"epoch": 1.77,
"grad_norm": 2.8702237606048584,
"learning_rate": 4.6226652065676974e-05,
"loss": 0.2786,
"step": 1350
},
{
"epoch": 1.78,
"grad_norm": 3.479321241378784,
"learning_rate": 4.619938156671584e-05,
"loss": 0.2904,
"step": 1355
},
{
"epoch": 1.78,
"grad_norm": 2.9285452365875244,
"learning_rate": 4.61720209826787e-05,
"loss": 0.2861,
"step": 1360
},
{
"epoch": 1.79,
"grad_norm": 3.244591236114502,
"learning_rate": 4.6144570429832144e-05,
"loss": 0.2928,
"step": 1365
},
{
"epoch": 1.8,
"grad_norm": 2.8110570907592773,
"learning_rate": 4.6117030024825114e-05,
"loss": 0.2904,
"step": 1370
},
{
"epoch": 1.8,
"grad_norm": 3.049492359161377,
"learning_rate": 4.6089399884688356e-05,
"loss": 0.2739,
"step": 1375
},
{
"epoch": 1.81,
"grad_norm": 2.960361957550049,
"learning_rate": 4.606168012683394e-05,
"loss": 0.3031,
"step": 1380
},
{
"epoch": 1.82,
"grad_norm": 3.257373571395874,
"learning_rate": 4.603387086905475e-05,
"loss": 0.2993,
"step": 1385
},
{
"epoch": 1.82,
"grad_norm": 3.0115904808044434,
"learning_rate": 4.600597222952402e-05,
"loss": 0.2915,
"step": 1390
},
{
"epoch": 1.83,
"grad_norm": 3.111074209213257,
"learning_rate": 4.597798432679477e-05,
"loss": 0.2948,
"step": 1395
},
{
"epoch": 1.84,
"grad_norm": 3.1926794052124023,
"learning_rate": 4.594990727979937e-05,
"loss": 0.2971,
"step": 1400
},
{
"epoch": 1.84,
"grad_norm": 2.913715362548828,
"learning_rate": 4.5921741207848966e-05,
"loss": 0.2844,
"step": 1405
},
{
"epoch": 1.85,
"grad_norm": 2.8652007579803467,
"learning_rate": 4.5893486230633037e-05,
"loss": 0.2687,
"step": 1410
},
{
"epoch": 1.85,
"grad_norm": 2.927306890487671,
"learning_rate": 4.586514246821885e-05,
"loss": 0.2984,
"step": 1415
},
{
"epoch": 1.86,
"grad_norm": 3.2218594551086426,
"learning_rate": 4.583671004105096e-05,
"loss": 0.2928,
"step": 1420
},
{
"epoch": 1.87,
"grad_norm": 3.1091806888580322,
"learning_rate": 4.580818906995068e-05,
"loss": 0.3024,
"step": 1425
},
{
"epoch": 1.87,
"grad_norm": 3.152013063430786,
"learning_rate": 4.5779579676115604e-05,
"loss": 0.2898,
"step": 1430
},
{
"epoch": 1.88,
"grad_norm": 3.037785053253174,
"learning_rate": 4.575088198111905e-05,
"loss": 0.3012,
"step": 1435
},
{
"epoch": 1.89,
"grad_norm": 3.125337600708008,
"learning_rate": 4.5722096106909595e-05,
"loss": 0.2982,
"step": 1440
},
{
"epoch": 1.89,
"grad_norm": 3.1015219688415527,
"learning_rate": 4.56932221758105e-05,
"loss": 0.3014,
"step": 1445
},
{
"epoch": 1.9,
"grad_norm": 3.0641446113586426,
"learning_rate": 4.566426031051922e-05,
"loss": 0.3057,
"step": 1450
},
{
"epoch": 1.91,
"grad_norm": 3.1846718788146973,
"learning_rate": 4.56352106341069e-05,
"loss": 0.2941,
"step": 1455
},
{
"epoch": 1.91,
"grad_norm": 2.9871373176574707,
"learning_rate": 4.56060732700178e-05,
"loss": 0.2902,
"step": 1460
},
{
"epoch": 1.92,
"grad_norm": 2.941716194152832,
"learning_rate": 4.5576848342068826e-05,
"loss": 0.2999,
"step": 1465
},
{
"epoch": 1.93,
"grad_norm": 2.8153445720672607,
"learning_rate": 4.554753597444896e-05,
"loss": 0.2855,
"step": 1470
},
{
"epoch": 1.93,
"grad_norm": 3.2046408653259277,
"learning_rate": 4.551813629171878e-05,
"loss": 0.3167,
"step": 1475
},
{
"epoch": 1.94,
"grad_norm": 3.2123496532440186,
"learning_rate": 4.548864941880988e-05,
"loss": 0.2929,
"step": 1480
},
{
"epoch": 1.95,
"grad_norm": 2.81064772605896,
"learning_rate": 4.545907548102436e-05,
"loss": 0.3059,
"step": 1485
},
{
"epoch": 1.95,
"grad_norm": 3.07346248626709,
"learning_rate": 4.5429414604034307e-05,
"loss": 0.2902,
"step": 1490
},
{
"epoch": 1.96,
"grad_norm": 2.8002560138702393,
"learning_rate": 4.539966691388125e-05,
"loss": 0.2918,
"step": 1495
},
{
"epoch": 1.97,
"grad_norm": 3.3515923023223877,
"learning_rate": 4.536983253697561e-05,
"loss": 0.304,
"step": 1500
},
{
"epoch": 1.97,
"grad_norm": 3.050218105316162,
"learning_rate": 4.53399116000962e-05,
"loss": 0.3163,
"step": 1505
},
{
"epoch": 1.98,
"grad_norm": 3.1914007663726807,
"learning_rate": 4.530990423038962e-05,
"loss": 0.3071,
"step": 1510
},
{
"epoch": 1.99,
"grad_norm": 3.180460214614868,
"learning_rate": 4.527981055536982e-05,
"loss": 0.3023,
"step": 1515
},
{
"epoch": 1.99,
"grad_norm": 3.2100706100463867,
"learning_rate": 4.524963070291744e-05,
"loss": 0.3219,
"step": 1520
},
{
"epoch": 2.0,
"grad_norm": 2.9520275592803955,
"learning_rate": 4.5219364801279356e-05,
"loss": 0.2968,
"step": 1525
},
{
"epoch": 2.01,
"grad_norm": 2.4291439056396484,
"learning_rate": 4.51890129790681e-05,
"loss": 0.17,
"step": 1530
},
{
"epoch": 2.01,
"grad_norm": 1.9606090784072876,
"learning_rate": 4.5158575365261305e-05,
"loss": 0.1316,
"step": 1535
},
{
"epoch": 2.02,
"grad_norm": 2.126908779144287,
"learning_rate": 4.512805208920118e-05,
"loss": 0.1281,
"step": 1540
},
{
"epoch": 2.03,
"grad_norm": 2.0146312713623047,
"learning_rate": 4.509744328059395e-05,
"loss": 0.1234,
"step": 1545
},
{
"epoch": 2.03,
"grad_norm": 1.9698853492736816,
"learning_rate": 4.506674906950929e-05,
"loss": 0.1341,
"step": 1550
},
{
"epoch": 2.04,
"grad_norm": 2.1764025688171387,
"learning_rate": 4.5035969586379804e-05,
"loss": 0.1331,
"step": 1555
},
{
"epoch": 2.04,
"grad_norm": 2.2242555618286133,
"learning_rate": 4.5005104962000436e-05,
"loss": 0.1325,
"step": 1560
},
{
"epoch": 2.05,
"grad_norm": 2.019362449645996,
"learning_rate": 4.4974155327527926e-05,
"loss": 0.1219,
"step": 1565
},
{
"epoch": 2.06,
"grad_norm": 2.3239810466766357,
"learning_rate": 4.494312081448029e-05,
"loss": 0.1304,
"step": 1570
},
{
"epoch": 2.06,
"grad_norm": 2.2973790168762207,
"learning_rate": 4.4912001554736205e-05,
"loss": 0.1316,
"step": 1575
},
{
"epoch": 2.07,
"grad_norm": 2.4513959884643555,
"learning_rate": 4.488079768053447e-05,
"loss": 0.133,
"step": 1580
},
{
"epoch": 2.08,
"grad_norm": 2.789614200592041,
"learning_rate": 4.484950932447345e-05,
"loss": 0.1378,
"step": 1585
},
{
"epoch": 2.08,
"grad_norm": 2.2913756370544434,
"learning_rate": 4.481813661951052e-05,
"loss": 0.1287,
"step": 1590
},
{
"epoch": 2.09,
"grad_norm": 2.1334588527679443,
"learning_rate": 4.4786679698961476e-05,
"loss": 0.1304,
"step": 1595
},
{
"epoch": 2.1,
"grad_norm": 2.3002805709838867,
"learning_rate": 4.475513869649998e-05,
"loss": 0.134,
"step": 1600
},
{
"epoch": 2.1,
"grad_norm": 2.2173187732696533,
"learning_rate": 4.4723513746157004e-05,
"loss": 0.1359,
"step": 1605
},
{
"epoch": 2.11,
"grad_norm": 1.9922655820846558,
"learning_rate": 4.469180498232024e-05,
"loss": 0.1403,
"step": 1610
},
{
"epoch": 2.12,
"grad_norm": 2.208549737930298,
"learning_rate": 4.466001253973355e-05,
"loss": 0.1316,
"step": 1615
},
{
"epoch": 2.12,
"grad_norm": 2.4228994846343994,
"learning_rate": 4.4628136553496375e-05,
"loss": 0.1336,
"step": 1620
},
{
"epoch": 2.13,
"grad_norm": 2.2046756744384766,
"learning_rate": 4.459617715906316e-05,
"loss": 0.1389,
"step": 1625
},
{
"epoch": 2.14,
"grad_norm": 2.3668532371520996,
"learning_rate": 4.4564134492242805e-05,
"loss": 0.1374,
"step": 1630
},
{
"epoch": 2.14,
"grad_norm": 2.3358521461486816,
"learning_rate": 4.4532008689198056e-05,
"loss": 0.1339,
"step": 1635
},
{
"epoch": 2.15,
"grad_norm": 2.4201912879943848,
"learning_rate": 4.449979988644494e-05,
"loss": 0.1324,
"step": 1640
},
{
"epoch": 2.16,
"grad_norm": 2.356771230697632,
"learning_rate": 4.446750822085218e-05,
"loss": 0.1496,
"step": 1645
},
{
"epoch": 2.16,
"grad_norm": 2.5749542713165283,
"learning_rate": 4.4435133829640645e-05,
"loss": 0.1446,
"step": 1650
},
{
"epoch": 2.17,
"grad_norm": 2.313682794570923,
"learning_rate": 4.440267685038271e-05,
"loss": 0.1417,
"step": 1655
},
{
"epoch": 2.18,
"grad_norm": 2.3327279090881348,
"learning_rate": 4.437013742100171e-05,
"loss": 0.1341,
"step": 1660
},
{
"epoch": 2.18,
"grad_norm": 2.482767105102539,
"learning_rate": 4.4337515679771345e-05,
"loss": 0.1402,
"step": 1665
},
{
"epoch": 2.19,
"grad_norm": 2.6034271717071533,
"learning_rate": 4.4304811765315105e-05,
"loss": 0.1498,
"step": 1670
},
{
"epoch": 2.2,
"grad_norm": 2.2677841186523438,
"learning_rate": 4.427202581660565e-05,
"loss": 0.1414,
"step": 1675
},
{
"epoch": 2.2,
"grad_norm": 2.3339622020721436,
"learning_rate": 4.423915797296425e-05,
"loss": 0.1377,
"step": 1680
},
{
"epoch": 2.21,
"grad_norm": 2.1083145141601562,
"learning_rate": 4.420620837406018e-05,
"loss": 0.1416,
"step": 1685
},
{
"epoch": 2.22,
"grad_norm": 2.400583267211914,
"learning_rate": 4.4173177159910106e-05,
"loss": 0.1383,
"step": 1690
},
{
"epoch": 2.22,
"grad_norm": 2.1524839401245117,
"learning_rate": 4.414006447087755e-05,
"loss": 0.1366,
"step": 1695
},
{
"epoch": 2.23,
"grad_norm": 2.1756019592285156,
"learning_rate": 4.410687044767223e-05,
"loss": 0.1402,
"step": 1700
},
{
"epoch": 2.23,
"grad_norm": 2.5507566928863525,
"learning_rate": 4.407359523134949e-05,
"loss": 0.1514,
"step": 1705
},
{
"epoch": 2.24,
"grad_norm": 2.152941942214966,
"learning_rate": 4.4040238963309696e-05,
"loss": 0.1451,
"step": 1710
},
{
"epoch": 2.25,
"grad_norm": 2.3613879680633545,
"learning_rate": 4.400680178529765e-05,
"loss": 0.1407,
"step": 1715
},
{
"epoch": 2.25,
"grad_norm": 2.624096393585205,
"learning_rate": 4.397328383940196e-05,
"loss": 0.1428,
"step": 1720
},
{
"epoch": 2.26,
"grad_norm": 2.358207941055298,
"learning_rate": 4.393968526805447e-05,
"loss": 0.1443,
"step": 1725
},
{
"epoch": 2.27,
"grad_norm": 2.758371353149414,
"learning_rate": 4.3906006214029585e-05,
"loss": 0.1568,
"step": 1730
},
{
"epoch": 2.27,
"grad_norm": 2.099876642227173,
"learning_rate": 4.387224682044378e-05,
"loss": 0.157,
"step": 1735
},
{
"epoch": 2.28,
"grad_norm": 2.3019683361053467,
"learning_rate": 4.3838407230754885e-05,
"loss": 0.1404,
"step": 1740
},
{
"epoch": 2.29,
"grad_norm": 2.589655637741089,
"learning_rate": 4.3804487588761544e-05,
"loss": 0.156,
"step": 1745
},
{
"epoch": 2.29,
"grad_norm": 2.4104435443878174,
"learning_rate": 4.3770488038602555e-05,
"loss": 0.1467,
"step": 1750
},
{
"epoch": 2.3,
"grad_norm": 2.6529500484466553,
"learning_rate": 4.373640872475627e-05,
"loss": 0.1475,
"step": 1755
},
{
"epoch": 2.31,
"grad_norm": 2.272524833679199,
"learning_rate": 4.370224979204003e-05,
"loss": 0.1423,
"step": 1760
},
{
"epoch": 2.31,
"grad_norm": 2.421292781829834,
"learning_rate": 4.366801138560948e-05,
"loss": 0.149,
"step": 1765
},
{
"epoch": 2.32,
"grad_norm": 2.280380964279175,
"learning_rate": 4.3633693650957976e-05,
"loss": 0.1468,
"step": 1770
},
{
"epoch": 2.33,
"grad_norm": 2.0802671909332275,
"learning_rate": 4.3599296733916004e-05,
"loss": 0.157,
"step": 1775
},
{
"epoch": 2.33,
"grad_norm": 2.234787940979004,
"learning_rate": 4.3564820780650496e-05,
"loss": 0.1428,
"step": 1780
},
{
"epoch": 2.34,
"grad_norm": 2.337618589401245,
"learning_rate": 4.353026593766427e-05,
"loss": 0.1459,
"step": 1785
},
{
"epoch": 2.35,
"grad_norm": 2.529278516769409,
"learning_rate": 4.3495632351795367e-05,
"loss": 0.1617,
"step": 1790
},
{
"epoch": 2.35,
"grad_norm": 2.2081286907196045,
"learning_rate": 4.3460920170216425e-05,
"loss": 0.1487,
"step": 1795
},
{
"epoch": 2.36,
"grad_norm": 2.180853843688965,
"learning_rate": 4.34261295404341e-05,
"loss": 0.139,
"step": 1800
},
{
"epoch": 2.37,
"grad_norm": 2.5588650703430176,
"learning_rate": 4.339126061028837e-05,
"loss": 0.1489,
"step": 1805
},
{
"epoch": 2.37,
"grad_norm": 2.441371202468872,
"learning_rate": 4.335631352795199e-05,
"loss": 0.1544,
"step": 1810
},
{
"epoch": 2.38,
"grad_norm": 2.429845094680786,
"learning_rate": 4.332128844192977e-05,
"loss": 0.151,
"step": 1815
},
{
"epoch": 2.39,
"grad_norm": 2.3163018226623535,
"learning_rate": 4.328618550105802e-05,
"loss": 0.1521,
"step": 1820
},
{
"epoch": 2.39,
"grad_norm": 2.4328958988189697,
"learning_rate": 4.325100485450389e-05,
"loss": 0.1581,
"step": 1825
},
{
"epoch": 2.4,
"grad_norm": 2.343770980834961,
"learning_rate": 4.3215746651764686e-05,
"loss": 0.1544,
"step": 1830
},
{
"epoch": 2.41,
"grad_norm": 2.4985294342041016,
"learning_rate": 4.3180411042667354e-05,
"loss": 0.1557,
"step": 1835
},
{
"epoch": 2.41,
"grad_norm": 2.6652395725250244,
"learning_rate": 4.314499817736773e-05,
"loss": 0.1465,
"step": 1840
},
{
"epoch": 2.42,
"grad_norm": 2.50243878364563,
"learning_rate": 4.3109508206349945e-05,
"loss": 0.1514,
"step": 1845
},
{
"epoch": 2.43,
"grad_norm": 2.537421703338623,
"learning_rate": 4.30739412804258e-05,
"loss": 0.155,
"step": 1850
},
{
"epoch": 2.43,
"grad_norm": 2.244147539138794,
"learning_rate": 4.3038297550734096e-05,
"loss": 0.15,
"step": 1855
},
{
"epoch": 2.44,
"grad_norm": 2.4972686767578125,
"learning_rate": 4.300257716874001e-05,
"loss": 0.1559,
"step": 1860
},
{
"epoch": 2.44,
"grad_norm": 2.495651960372925,
"learning_rate": 4.296678028623446e-05,
"loss": 0.1589,
"step": 1865
},
{
"epoch": 2.45,
"grad_norm": 2.649902582168579,
"learning_rate": 4.293090705533342e-05,
"loss": 0.1528,
"step": 1870
},
{
"epoch": 2.46,
"grad_norm": 2.281095266342163,
"learning_rate": 4.2894957628477316e-05,
"loss": 0.1639,
"step": 1875
},
{
"epoch": 2.46,
"grad_norm": 2.5233304500579834,
"learning_rate": 4.285893215843036e-05,
"loss": 0.1528,
"step": 1880
},
{
"epoch": 2.47,
"grad_norm": 2.6843202114105225,
"learning_rate": 4.282283079827993e-05,
"loss": 0.1623,
"step": 1885
},
{
"epoch": 2.48,
"grad_norm": 2.4476354122161865,
"learning_rate": 4.278665370143583e-05,
"loss": 0.1562,
"step": 1890
},
{
"epoch": 2.48,
"grad_norm": 2.337167501449585,
"learning_rate": 4.2750401021629765e-05,
"loss": 0.165,
"step": 1895
},
{
"epoch": 2.49,
"grad_norm": 2.610464096069336,
"learning_rate": 4.271407291291459e-05,
"loss": 0.1591,
"step": 1900
},
{
"epoch": 2.5,
"grad_norm": 2.4951589107513428,
"learning_rate": 4.267766952966369e-05,
"loss": 0.1587,
"step": 1905
},
{
"epoch": 2.5,
"grad_norm": 2.0912039279937744,
"learning_rate": 4.2641191026570336e-05,
"loss": 0.1529,
"step": 1910
},
{
"epoch": 2.51,
"grad_norm": 2.724330425262451,
"learning_rate": 4.260463755864702e-05,
"loss": 0.1693,
"step": 1915
},
{
"epoch": 2.52,
"grad_norm": 2.3671672344207764,
"learning_rate": 4.256800928122475e-05,
"loss": 0.157,
"step": 1920
},
{
"epoch": 2.52,
"grad_norm": 2.536565065383911,
"learning_rate": 4.2531306349952496e-05,
"loss": 0.1697,
"step": 1925
},
{
"epoch": 2.53,
"grad_norm": 2.5092501640319824,
"learning_rate": 4.2494528920796406e-05,
"loss": 0.1655,
"step": 1930
},
{
"epoch": 2.54,
"grad_norm": 2.6707546710968018,
"learning_rate": 4.2457677150039224e-05,
"loss": 0.1604,
"step": 1935
},
{
"epoch": 2.54,
"grad_norm": 2.4832890033721924,
"learning_rate": 4.242075119427961e-05,
"loss": 0.1504,
"step": 1940
},
{
"epoch": 2.55,
"grad_norm": 2.4126479625701904,
"learning_rate": 4.238375121043145e-05,
"loss": 0.1552,
"step": 1945
},
{
"epoch": 2.56,
"grad_norm": 2.3602805137634277,
"learning_rate": 4.234667735572323e-05,
"loss": 0.1556,
"step": 1950
},
{
"epoch": 2.56,
"grad_norm": 2.4358716011047363,
"learning_rate": 4.230952978769731e-05,
"loss": 0.1569,
"step": 1955
},
{
"epoch": 2.57,
"grad_norm": 2.6005828380584717,
"learning_rate": 4.227230866420932e-05,
"loss": 0.158,
"step": 1960
},
{
"epoch": 2.58,
"grad_norm": 2.054624557495117,
"learning_rate": 4.223501414342745e-05,
"loss": 0.1644,
"step": 1965
},
{
"epoch": 2.58,
"grad_norm": 2.5402703285217285,
"learning_rate": 4.219764638383177e-05,
"loss": 0.1587,
"step": 1970
},
{
"epoch": 2.59,
"grad_norm": 2.09084153175354,
"learning_rate": 4.216020554421359e-05,
"loss": 0.1561,
"step": 1975
},
{
"epoch": 2.6,
"grad_norm": 2.529383659362793,
"learning_rate": 4.2122691783674786e-05,
"loss": 0.1656,
"step": 1980
},
{
"epoch": 2.6,
"grad_norm": 2.9956157207489014,
"learning_rate": 4.208510526162704e-05,
"loss": 0.1649,
"step": 1985
},
{
"epoch": 2.61,
"grad_norm": 2.585899591445923,
"learning_rate": 4.20474461377913e-05,
"loss": 0.1635,
"step": 1990
},
{
"epoch": 2.62,
"grad_norm": 2.6515862941741943,
"learning_rate": 4.200971457219699e-05,
"loss": 0.1713,
"step": 1995
},
{
"epoch": 2.62,
"grad_norm": 2.22086501121521,
"learning_rate": 4.197191072518139e-05,
"loss": 0.151,
"step": 2000
},
{
"epoch": 2.63,
"grad_norm": 2.543677568435669,
"learning_rate": 4.19340347573889e-05,
"loss": 0.1756,
"step": 2005
},
{
"epoch": 2.63,
"grad_norm": 2.5006985664367676,
"learning_rate": 4.1896086829770445e-05,
"loss": 0.152,
"step": 2010
},
{
"epoch": 2.64,
"grad_norm": 2.534740924835205,
"learning_rate": 4.185806710358268e-05,
"loss": 0.1681,
"step": 2015
},
{
"epoch": 2.65,
"grad_norm": 2.562382459640503,
"learning_rate": 4.181997574038741e-05,
"loss": 0.162,
"step": 2020
},
{
"epoch": 2.65,
"grad_norm": 2.5193183422088623,
"learning_rate": 4.178181290205082e-05,
"loss": 0.1663,
"step": 2025
},
{
"epoch": 2.66,
"grad_norm": 2.6807899475097656,
"learning_rate": 4.174357875074285e-05,
"loss": 0.1636,
"step": 2030
},
{
"epoch": 2.67,
"grad_norm": 2.598508834838867,
"learning_rate": 4.170527344893647e-05,
"loss": 0.1704,
"step": 2035
},
{
"epoch": 2.67,
"grad_norm": 2.301255464553833,
"learning_rate": 4.1666897159406984e-05,
"loss": 0.1644,
"step": 2040
},
{
"epoch": 2.68,
"grad_norm": 2.4087393283843994,
"learning_rate": 4.162845004523137e-05,
"loss": 0.1739,
"step": 2045
},
{
"epoch": 2.69,
"grad_norm": 2.3460421562194824,
"learning_rate": 4.158993226978757e-05,
"loss": 0.1658,
"step": 2050
},
{
"epoch": 2.69,
"grad_norm": 2.640719175338745,
"learning_rate": 4.155134399675378e-05,
"loss": 0.1529,
"step": 2055
},
{
"epoch": 2.7,
"grad_norm": 2.6366817951202393,
"learning_rate": 4.151268539010777e-05,
"loss": 0.176,
"step": 2060
},
{
"epoch": 2.71,
"grad_norm": 2.4204182624816895,
"learning_rate": 4.1473956614126225e-05,
"loss": 0.1579,
"step": 2065
},
{
"epoch": 2.71,
"grad_norm": 2.7791054248809814,
"learning_rate": 4.1435157833383955e-05,
"loss": 0.1604,
"step": 2070
},
{
"epoch": 2.72,
"grad_norm": 2.4026386737823486,
"learning_rate": 4.139628921275329e-05,
"loss": 0.164,
"step": 2075
},
{
"epoch": 2.73,
"grad_norm": 2.740560531616211,
"learning_rate": 4.1357350917403314e-05,
"loss": 0.1791,
"step": 2080
},
{
"epoch": 2.73,
"grad_norm": 2.6298422813415527,
"learning_rate": 4.131834311279919e-05,
"loss": 0.1691,
"step": 2085
},
{
"epoch": 2.74,
"grad_norm": 2.610245704650879,
"learning_rate": 4.12792659647015e-05,
"loss": 0.1694,
"step": 2090
},
{
"epoch": 2.75,
"grad_norm": 2.5160694122314453,
"learning_rate": 4.124011963916541e-05,
"loss": 0.1712,
"step": 2095
},
{
"epoch": 2.75,
"grad_norm": 2.4107940196990967,
"learning_rate": 4.1200904302540136e-05,
"loss": 0.1587,
"step": 2100
},
{
"epoch": 2.76,
"grad_norm": 2.5999083518981934,
"learning_rate": 4.116162012146809e-05,
"loss": 0.1683,
"step": 2105
},
{
"epoch": 2.77,
"grad_norm": 2.592486619949341,
"learning_rate": 4.112226726288427e-05,
"loss": 0.1673,
"step": 2110
},
{
"epoch": 2.77,
"grad_norm": 2.6168549060821533,
"learning_rate": 4.1082845894015495e-05,
"loss": 0.1573,
"step": 2115
},
{
"epoch": 2.78,
"grad_norm": 2.690314769744873,
"learning_rate": 4.104335618237972e-05,
"loss": 0.1763,
"step": 2120
},
{
"epoch": 2.79,
"grad_norm": 2.612140417098999,
"learning_rate": 4.1003798295785325e-05,
"loss": 0.1671,
"step": 2125
},
{
"epoch": 2.79,
"grad_norm": 2.6909706592559814,
"learning_rate": 4.096417240233036e-05,
"loss": 0.1653,
"step": 2130
},
{
"epoch": 2.8,
"grad_norm": 2.353872299194336,
"learning_rate": 4.092447867040191e-05,
"loss": 0.1721,
"step": 2135
},
{
"epoch": 2.81,
"grad_norm": 2.776252508163452,
"learning_rate": 4.088471726867531e-05,
"loss": 0.1792,
"step": 2140
},
{
"epoch": 2.81,
"grad_norm": 2.5471363067626953,
"learning_rate": 4.084488836611346e-05,
"loss": 0.1728,
"step": 2145
},
{
"epoch": 2.82,
"grad_norm": 2.5439553260803223,
"learning_rate": 4.080499213196607e-05,
"loss": 0.1734,
"step": 2150
},
{
"epoch": 2.82,
"grad_norm": 2.695373773574829,
"learning_rate": 4.076502873576903e-05,
"loss": 0.1625,
"step": 2155
},
{
"epoch": 2.83,
"grad_norm": 2.5151877403259277,
"learning_rate": 4.072499834734357e-05,
"loss": 0.1598,
"step": 2160
},
{
"epoch": 2.84,
"grad_norm": 2.4009640216827393,
"learning_rate": 4.068490113679563e-05,
"loss": 0.1574,
"step": 2165
},
{
"epoch": 2.84,
"grad_norm": 2.4583699703216553,
"learning_rate": 4.06447372745151e-05,
"loss": 0.1689,
"step": 2170
},
{
"epoch": 2.85,
"grad_norm": 2.4071240425109863,
"learning_rate": 4.060450693117511e-05,
"loss": 0.1722,
"step": 2175
},
{
"epoch": 2.86,
"grad_norm": 2.36995267868042,
"learning_rate": 4.056421027773126e-05,
"loss": 0.1709,
"step": 2180
},
{
"epoch": 2.86,
"grad_norm": 2.5631325244903564,
"learning_rate": 4.0523847485420984e-05,
"loss": 0.173,
"step": 2185
},
{
"epoch": 2.87,
"grad_norm": 2.7295174598693848,
"learning_rate": 4.048341872576272e-05,
"loss": 0.173,
"step": 2190
},
{
"epoch": 2.88,
"grad_norm": 2.5564053058624268,
"learning_rate": 4.044292417055525e-05,
"loss": 0.1684,
"step": 2195
},
{
"epoch": 2.88,
"grad_norm": 2.627962589263916,
"learning_rate": 4.040236399187696e-05,
"loss": 0.1717,
"step": 2200
},
{
"epoch": 2.89,
"grad_norm": 2.5658953189849854,
"learning_rate": 4.0361738362085064e-05,
"loss": 0.1719,
"step": 2205
},
{
"epoch": 2.9,
"grad_norm": 2.4268243312835693,
"learning_rate": 4.032104745381494e-05,
"loss": 0.1612,
"step": 2210
},
{
"epoch": 2.9,
"grad_norm": 2.6990509033203125,
"learning_rate": 4.028029143997935e-05,
"loss": 0.1671,
"step": 2215
},
{
"epoch": 2.91,
"grad_norm": 2.4805703163146973,
"learning_rate": 4.0239470493767704e-05,
"loss": 0.1735,
"step": 2220
},
{
"epoch": 2.92,
"grad_norm": 2.5650622844696045,
"learning_rate": 4.019858478864534e-05,
"loss": 0.1662,
"step": 2225
},
{
"epoch": 2.92,
"grad_norm": 2.4036471843719482,
"learning_rate": 4.015763449835281e-05,
"loss": 0.1571,
"step": 2230
},
{
"epoch": 2.93,
"grad_norm": 2.5735135078430176,
"learning_rate": 4.0116619796905104e-05,
"loss": 0.1676,
"step": 2235
},
{
"epoch": 2.94,
"grad_norm": 2.7664101123809814,
"learning_rate": 4.0075540858590883e-05,
"loss": 0.1825,
"step": 2240
},
{
"epoch": 2.94,
"grad_norm": 2.384687900543213,
"learning_rate": 4.003439785797183e-05,
"loss": 0.169,
"step": 2245
},
{
"epoch": 2.95,
"grad_norm": 2.6205379962921143,
"learning_rate": 3.999319096988183e-05,
"loss": 0.1745,
"step": 2250
},
{
"epoch": 2.96,
"grad_norm": 2.52746319770813,
"learning_rate": 3.995192036942625e-05,
"loss": 0.166,
"step": 2255
},
{
"epoch": 2.96,
"grad_norm": 2.5834269523620605,
"learning_rate": 3.991058623198123e-05,
"loss": 0.1758,
"step": 2260
},
{
"epoch": 2.97,
"grad_norm": 2.5553784370422363,
"learning_rate": 3.9869188733192846e-05,
"loss": 0.1755,
"step": 2265
},
{
"epoch": 2.98,
"grad_norm": 2.5766592025756836,
"learning_rate": 3.982772804897649e-05,
"loss": 0.1687,
"step": 2270
},
{
"epoch": 2.98,
"grad_norm": 2.7371819019317627,
"learning_rate": 3.978620435551599e-05,
"loss": 0.1705,
"step": 2275
},
{
"epoch": 2.99,
"grad_norm": 2.321173906326294,
"learning_rate": 3.974461782926299e-05,
"loss": 0.162,
"step": 2280
},
{
"epoch": 3.0,
"grad_norm": 2.64091420173645,
"learning_rate": 3.970296864693609e-05,
"loss": 0.1652,
"step": 2285
},
{
"epoch": 3.0,
"grad_norm": 1.3675230741500854,
"learning_rate": 3.9661256985520156e-05,
"loss": 0.1358,
"step": 2290
},
{
"epoch": 3.01,
"grad_norm": 1.4976637363433838,
"learning_rate": 3.961948302226557e-05,
"loss": 0.0672,
"step": 2295
},
{
"epoch": 3.01,
"grad_norm": 1.758554220199585,
"learning_rate": 3.957764693468743e-05,
"loss": 0.066,
"step": 2300
},
{
"epoch": 3.02,
"grad_norm": 1.4874210357666016,
"learning_rate": 3.953574890056485e-05,
"loss": 0.0629,
"step": 2305
},
{
"epoch": 3.03,
"grad_norm": 1.7129358053207397,
"learning_rate": 3.9493789097940185e-05,
"loss": 0.0642,
"step": 2310
},
{
"epoch": 3.03,
"grad_norm": 1.8975974321365356,
"learning_rate": 3.9451767705118246e-05,
"loss": 0.0679,
"step": 2315
},
{
"epoch": 3.04,
"grad_norm": 1.5627552270889282,
"learning_rate": 3.940968490066559e-05,
"loss": 0.0642,
"step": 2320
},
{
"epoch": 3.05,
"grad_norm": 1.6444036960601807,
"learning_rate": 3.9367540863409714e-05,
"loss": 0.0691,
"step": 2325
},
{
"epoch": 3.05,
"grad_norm": 1.6043704748153687,
"learning_rate": 3.932533577243835e-05,
"loss": 0.0644,
"step": 2330
},
{
"epoch": 3.06,
"grad_norm": 1.6515140533447266,
"learning_rate": 3.9283069807098636e-05,
"loss": 0.0729,
"step": 2335
},
{
"epoch": 3.07,
"grad_norm": 1.6297287940979004,
"learning_rate": 3.9240743146996425e-05,
"loss": 0.068,
"step": 2340
},
{
"epoch": 3.07,
"grad_norm": 1.557881236076355,
"learning_rate": 3.919835597199548e-05,
"loss": 0.0688,
"step": 2345
},
{
"epoch": 3.08,
"grad_norm": 1.708101511001587,
"learning_rate": 3.915590846221669e-05,
"loss": 0.0673,
"step": 2350
},
{
"epoch": 3.09,
"grad_norm": 1.6620030403137207,
"learning_rate": 3.911340079803736e-05,
"loss": 0.0702,
"step": 2355
},
{
"epoch": 3.09,
"grad_norm": 1.938750982284546,
"learning_rate": 3.9070833160090415e-05,
"loss": 0.0695,
"step": 2360
},
{
"epoch": 3.1,
"grad_norm": 1.7759830951690674,
"learning_rate": 3.902820572926362e-05,
"loss": 0.0732,
"step": 2365
},
{
"epoch": 3.11,
"grad_norm": 1.846912145614624,
"learning_rate": 3.898551868669883e-05,
"loss": 0.0668,
"step": 2370
},
{
"epoch": 3.11,
"grad_norm": 1.79542076587677,
"learning_rate": 3.8942772213791224e-05,
"loss": 0.0714,
"step": 2375
},
{
"epoch": 3.12,
"grad_norm": 1.5793654918670654,
"learning_rate": 3.889996649218852e-05,
"loss": 0.0682,
"step": 2380
},
{
"epoch": 3.13,
"grad_norm": 1.8609654903411865,
"learning_rate": 3.8857101703790196e-05,
"loss": 0.0738,
"step": 2385
},
{
"epoch": 3.13,
"grad_norm": 1.681381344795227,
"learning_rate": 3.881417803074676e-05,
"loss": 0.0747,
"step": 2390
},
{
"epoch": 3.14,
"grad_norm": 2.0434184074401855,
"learning_rate": 3.877119565545891e-05,
"loss": 0.0806,
"step": 2395
},
{
"epoch": 3.15,
"grad_norm": 1.8694220781326294,
"learning_rate": 3.8728154760576817e-05,
"loss": 0.0884,
"step": 2400
},
{
"epoch": 3.15,
"grad_norm": 1.9645694494247437,
"learning_rate": 3.868505552899931e-05,
"loss": 0.0875,
"step": 2405
},
{
"epoch": 3.16,
"grad_norm": 1.8987983465194702,
"learning_rate": 3.8641898143873155e-05,
"loss": 0.0917,
"step": 2410
},
{
"epoch": 3.17,
"grad_norm": 2.0280187129974365,
"learning_rate": 3.859868278859218e-05,
"loss": 0.0878,
"step": 2415
},
{
"epoch": 3.17,
"grad_norm": 2.094468832015991,
"learning_rate": 3.855540964679658e-05,
"loss": 0.0877,
"step": 2420
},
{
"epoch": 3.18,
"grad_norm": 1.9860283136367798,
"learning_rate": 3.851207890237213e-05,
"loss": 0.0915,
"step": 2425
},
{
"epoch": 3.19,
"grad_norm": 2.1709773540496826,
"learning_rate": 3.846869073944934e-05,
"loss": 0.095,
"step": 2430
},
{
"epoch": 3.19,
"grad_norm": 1.8917242288589478,
"learning_rate": 3.842524534240276e-05,
"loss": 0.0895,
"step": 2435
},
{
"epoch": 3.2,
"grad_norm": 1.9368691444396973,
"learning_rate": 3.8381742895850106e-05,
"loss": 0.0921,
"step": 2440
},
{
"epoch": 3.2,
"grad_norm": 2.072715997695923,
"learning_rate": 3.8338183584651554e-05,
"loss": 0.0905,
"step": 2445
},
{
"epoch": 3.21,
"grad_norm": 2.0285987854003906,
"learning_rate": 3.8294567593908915e-05,
"loss": 0.0941,
"step": 2450
},
{
"epoch": 3.22,
"grad_norm": 2.015815019607544,
"learning_rate": 3.825089510896485e-05,
"loss": 0.0918,
"step": 2455
},
{
"epoch": 3.22,
"grad_norm": 2.0444390773773193,
"learning_rate": 3.820716631540209e-05,
"loss": 0.0938,
"step": 2460
},
{
"epoch": 3.23,
"grad_norm": 2.241682291030884,
"learning_rate": 3.816338139904265e-05,
"loss": 0.0981,
"step": 2465
},
{
"epoch": 3.24,
"grad_norm": 2.1586482524871826,
"learning_rate": 3.811954054594702e-05,
"loss": 0.0916,
"step": 2470
},
{
"epoch": 3.24,
"grad_norm": 1.968621850013733,
"learning_rate": 3.807564394241341e-05,
"loss": 0.0886,
"step": 2475
},
{
"epoch": 3.25,
"grad_norm": 2.180476427078247,
"learning_rate": 3.8031691774976904e-05,
"loss": 0.0955,
"step": 2480
},
{
"epoch": 3.26,
"grad_norm": 1.9769107103347778,
"learning_rate": 3.7987684230408735e-05,
"loss": 0.0933,
"step": 2485
},
{
"epoch": 3.26,
"grad_norm": 1.7934843301773071,
"learning_rate": 3.794362149571545e-05,
"loss": 0.087,
"step": 2490
},
{
"epoch": 3.27,
"grad_norm": 2.203385591506958,
"learning_rate": 3.7899503758138114e-05,
"loss": 0.0927,
"step": 2495
},
{
"epoch": 3.28,
"grad_norm": 2.1554219722747803,
"learning_rate": 3.78553312051515e-05,
"loss": 0.0917,
"step": 2500
},
{
"epoch": 3.28,
"grad_norm": 2.0353715419769287,
"learning_rate": 3.781110402446337e-05,
"loss": 0.0961,
"step": 2505
},
{
"epoch": 3.29,
"grad_norm": 2.0579957962036133,
"learning_rate": 3.776682240401357e-05,
"loss": 0.1026,
"step": 2510
},
{
"epoch": 3.3,
"grad_norm": 2.3313910961151123,
"learning_rate": 3.772248653197331e-05,
"loss": 0.0908,
"step": 2515
},
{
"epoch": 3.3,
"grad_norm": 2.0414376258850098,
"learning_rate": 3.767809659674433e-05,
"loss": 0.0909,
"step": 2520
},
{
"epoch": 3.31,
"grad_norm": 2.0286598205566406,
"learning_rate": 3.7633652786958105e-05,
"loss": 0.0968,
"step": 2525
},
{
"epoch": 3.32,
"grad_norm": 2.367244005203247,
"learning_rate": 3.758915529147506e-05,
"loss": 0.0923,
"step": 2530
},
{
"epoch": 3.32,
"grad_norm": 1.8567143678665161,
"learning_rate": 3.754460429938373e-05,
"loss": 0.092,
"step": 2535
},
{
"epoch": 3.33,
"grad_norm": 2.1443471908569336,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0926,
"step": 2540
},
{
"epoch": 3.34,
"grad_norm": 2.019869089126587,
"learning_rate": 3.745534258286627e-05,
"loss": 0.0851,
"step": 2545
},
{
"epoch": 3.34,
"grad_norm": 2.1114661693573,
"learning_rate": 3.741063223775066e-05,
"loss": 0.0867,
"step": 2550
},
{
"epoch": 3.35,
"grad_norm": 2.078768253326416,
"learning_rate": 3.736586915464621e-05,
"loss": 0.0949,
"step": 2555
},
{
"epoch": 3.36,
"grad_norm": 1.929516315460205,
"learning_rate": 3.732105352377004e-05,
"loss": 0.0931,
"step": 2560
},
{
"epoch": 3.36,
"grad_norm": 2.1101810932159424,
"learning_rate": 3.727618553556262e-05,
"loss": 0.0943,
"step": 2565
},
{
"epoch": 3.37,
"grad_norm": 2.1217589378356934,
"learning_rate": 3.723126538068686e-05,
"loss": 0.1018,
"step": 2570
},
{
"epoch": 3.38,
"grad_norm": 2.1733384132385254,
"learning_rate": 3.718629325002736e-05,
"loss": 0.0931,
"step": 2575
},
{
"epoch": 3.38,
"grad_norm": 1.986570119857788,
"learning_rate": 3.714126933468959e-05,
"loss": 0.0977,
"step": 2580
},
{
"epoch": 3.39,
"grad_norm": 2.1583731174468994,
"learning_rate": 3.709619382599909e-05,
"loss": 0.0959,
"step": 2585
},
{
"epoch": 3.4,
"grad_norm": 2.0934367179870605,
"learning_rate": 3.705106691550063e-05,
"loss": 0.093,
"step": 2590
},
{
"epoch": 3.4,
"grad_norm": 2.1203556060791016,
"learning_rate": 3.700588879495739e-05,
"loss": 0.0969,
"step": 2595
},
{
"epoch": 3.41,
"grad_norm": 1.69106924533844,
"learning_rate": 3.6960659656350186e-05,
"loss": 0.0935,
"step": 2600
},
{
"epoch": 3.41,
"grad_norm": 2.283950090408325,
"learning_rate": 3.6915379691876615e-05,
"loss": 0.0961,
"step": 2605
},
{
"epoch": 3.42,
"grad_norm": 2.030233383178711,
"learning_rate": 3.6870049093950284e-05,
"loss": 0.0968,
"step": 2610
},
{
"epoch": 3.43,
"grad_norm": 2.127016544342041,
"learning_rate": 3.682466805519992e-05,
"loss": 0.0986,
"step": 2615
},
{
"epoch": 3.43,
"grad_norm": 2.0715529918670654,
"learning_rate": 3.677923676846864e-05,
"loss": 0.0908,
"step": 2620
},
{
"epoch": 3.44,
"grad_norm": 2.094383955001831,
"learning_rate": 3.673375542681305e-05,
"loss": 0.0971,
"step": 2625
},
{
"epoch": 3.45,
"grad_norm": 2.1275360584259033,
"learning_rate": 3.668822422350247e-05,
"loss": 0.1002,
"step": 2630
},
{
"epoch": 3.45,
"grad_norm": 2.068857192993164,
"learning_rate": 3.6642643352018116e-05,
"loss": 0.0893,
"step": 2635
},
{
"epoch": 3.46,
"grad_norm": 1.982419490814209,
"learning_rate": 3.659701300605224e-05,
"loss": 0.097,
"step": 2640
},
{
"epoch": 3.47,
"grad_norm": 2.1440236568450928,
"learning_rate": 3.6551333379507346e-05,
"loss": 0.1063,
"step": 2645
},
{
"epoch": 3.47,
"grad_norm": 1.987056016921997,
"learning_rate": 3.650560466649538e-05,
"loss": 0.0935,
"step": 2650
},
{
"epoch": 3.48,
"grad_norm": 1.9351013898849487,
"learning_rate": 3.645982706133682e-05,
"loss": 0.0901,
"step": 2655
},
{
"epoch": 3.49,
"grad_norm": 2.3136351108551025,
"learning_rate": 3.641400075855995e-05,
"loss": 0.0992,
"step": 2660
},
{
"epoch": 3.49,
"grad_norm": 2.232473611831665,
"learning_rate": 3.636812595289998e-05,
"loss": 0.104,
"step": 2665
},
{
"epoch": 3.5,
"grad_norm": 2.2792115211486816,
"learning_rate": 3.632220283929822e-05,
"loss": 0.1002,
"step": 2670
},
{
"epoch": 3.51,
"grad_norm": 2.2950987815856934,
"learning_rate": 3.627623161290127e-05,
"loss": 0.1014,
"step": 2675
},
{
"epoch": 3.51,
"grad_norm": 1.9094067811965942,
"learning_rate": 3.623021246906018e-05,
"loss": 0.1012,
"step": 2680
},
{
"epoch": 3.52,
"grad_norm": 1.9479122161865234,
"learning_rate": 3.618414560332962e-05,
"loss": 0.0971,
"step": 2685
},
{
"epoch": 3.53,
"grad_norm": 1.8091012239456177,
"learning_rate": 3.6138031211467044e-05,
"loss": 0.103,
"step": 2690
},
{
"epoch": 3.53,
"grad_norm": 2.095745086669922,
"learning_rate": 3.609186948943188e-05,
"loss": 0.0953,
"step": 2695
},
{
"epoch": 3.54,
"grad_norm": 2.011467218399048,
"learning_rate": 3.604566063338467e-05,
"loss": 0.1009,
"step": 2700
},
{
"epoch": 3.55,
"grad_norm": 2.2293827533721924,
"learning_rate": 3.599940483968625e-05,
"loss": 0.0942,
"step": 2705
},
{
"epoch": 3.55,
"grad_norm": 1.9385508298873901,
"learning_rate": 3.595310230489692e-05,
"loss": 0.0961,
"step": 2710
},
{
"epoch": 3.56,
"grad_norm": 2.123690366744995,
"learning_rate": 3.5906753225775586e-05,
"loss": 0.0982,
"step": 2715
},
{
"epoch": 3.57,
"grad_norm": 2.051839828491211,
"learning_rate": 3.586035779927896e-05,
"loss": 0.1023,
"step": 2720
},
{
"epoch": 3.57,
"grad_norm": 2.269162654876709,
"learning_rate": 3.581391622256069e-05,
"loss": 0.0995,
"step": 2725
},
{
"epoch": 3.58,
"grad_norm": 1.9228086471557617,
"learning_rate": 3.576742869297056e-05,
"loss": 0.0998,
"step": 2730
},
{
"epoch": 3.59,
"grad_norm": 1.946722149848938,
"learning_rate": 3.5720895408053574e-05,
"loss": 0.0968,
"step": 2735
},
{
"epoch": 3.59,
"grad_norm": 2.1756057739257812,
"learning_rate": 3.567431656554923e-05,
"loss": 0.0912,
"step": 2740
},
{
"epoch": 3.6,
"grad_norm": 1.7844388484954834,
"learning_rate": 3.562769236339058e-05,
"loss": 0.0957,
"step": 2745
},
{
"epoch": 3.6,
"grad_norm": 2.02689528465271,
"learning_rate": 3.5581022999703464e-05,
"loss": 0.0926,
"step": 2750
},
{
"epoch": 3.61,
"grad_norm": 2.160264730453491,
"learning_rate": 3.553430867280557e-05,
"loss": 0.0974,
"step": 2755
},
{
"epoch": 3.62,
"grad_norm": 1.962109923362732,
"learning_rate": 3.548754958120573e-05,
"loss": 0.0969,
"step": 2760
},
{
"epoch": 3.62,
"grad_norm": 1.9709469079971313,
"learning_rate": 3.544074592360294e-05,
"loss": 0.0969,
"step": 2765
},
{
"epoch": 3.63,
"grad_norm": 1.968109130859375,
"learning_rate": 3.5393897898885606e-05,
"loss": 0.1024,
"step": 2770
},
{
"epoch": 3.64,
"grad_norm": 1.9209555387496948,
"learning_rate": 3.534700570613067e-05,
"loss": 0.1017,
"step": 2775
},
{
"epoch": 3.64,
"grad_norm": 2.151937961578369,
"learning_rate": 3.530006954460274e-05,
"loss": 0.1007,
"step": 2780
},
{
"epoch": 3.65,
"grad_norm": 1.9381475448608398,
"learning_rate": 3.525308961375329e-05,
"loss": 0.0947,
"step": 2785
},
{
"epoch": 3.66,
"grad_norm": 1.9445606470108032,
"learning_rate": 3.520606611321976e-05,
"loss": 0.1005,
"step": 2790
},
{
"epoch": 3.66,
"grad_norm": 2.063396692276001,
"learning_rate": 3.515899924282478e-05,
"loss": 0.1041,
"step": 2795
},
{
"epoch": 3.67,
"grad_norm": 2.594733715057373,
"learning_rate": 3.511188920257523e-05,
"loss": 0.0985,
"step": 2800
},
{
"epoch": 3.68,
"grad_norm": 2.2250747680664062,
"learning_rate": 3.506473619266146e-05,
"loss": 0.0956,
"step": 2805
},
{
"epoch": 3.68,
"grad_norm": 2.2515833377838135,
"learning_rate": 3.501754041345643e-05,
"loss": 0.097,
"step": 2810
},
{
"epoch": 3.69,
"grad_norm": 2.0630807876586914,
"learning_rate": 3.497030206551481e-05,
"loss": 0.1029,
"step": 2815
},
{
"epoch": 3.7,
"grad_norm": 2.0855114459991455,
"learning_rate": 3.492302134957218e-05,
"loss": 0.1018,
"step": 2820
},
{
"epoch": 3.7,
"grad_norm": 2.0847525596618652,
"learning_rate": 3.487569846654417e-05,
"loss": 0.0974,
"step": 2825
},
{
"epoch": 3.71,
"grad_norm": 2.245652675628662,
"learning_rate": 3.4828333617525586e-05,
"loss": 0.0982,
"step": 2830
},
{
"epoch": 3.72,
"grad_norm": 2.2418930530548096,
"learning_rate": 3.4780927003789556e-05,
"loss": 0.0984,
"step": 2835
},
{
"epoch": 3.72,
"grad_norm": 2.1843297481536865,
"learning_rate": 3.47334788267867e-05,
"loss": 0.0971,
"step": 2840
},
{
"epoch": 3.73,
"grad_norm": 2.1401710510253906,
"learning_rate": 3.468598928814425e-05,
"loss": 0.0983,
"step": 2845
},
{
"epoch": 3.74,
"grad_norm": 2.237949848175049,
"learning_rate": 3.4638458589665194e-05,
"loss": 0.1012,
"step": 2850
},
{
"epoch": 3.74,
"grad_norm": 2.101795196533203,
"learning_rate": 3.459088693332743e-05,
"loss": 0.0957,
"step": 2855
},
{
"epoch": 3.75,
"grad_norm": 2.1468005180358887,
"learning_rate": 3.454327452128292e-05,
"loss": 0.1016,
"step": 2860
},
{
"epoch": 3.76,
"grad_norm": 2.139878034591675,
"learning_rate": 3.449562155585679e-05,
"loss": 0.0956,
"step": 2865
},
{
"epoch": 3.76,
"grad_norm": 2.2107348442077637,
"learning_rate": 3.444792823954651e-05,
"loss": 0.1002,
"step": 2870
},
{
"epoch": 3.77,
"grad_norm": 2.1431825160980225,
"learning_rate": 3.440019477502101e-05,
"loss": 0.0979,
"step": 2875
},
{
"epoch": 3.78,
"grad_norm": 2.027465581893921,
"learning_rate": 3.435242136511984e-05,
"loss": 0.0988,
"step": 2880
},
{
"epoch": 3.78,
"grad_norm": 1.9341384172439575,
"learning_rate": 3.430460821285225e-05,
"loss": 0.0945,
"step": 2885
},
{
"epoch": 3.79,
"grad_norm": 2.29193377494812,
"learning_rate": 3.425675552139645e-05,
"loss": 0.0993,
"step": 2890
},
{
"epoch": 3.79,
"grad_norm": 2.169417381286621,
"learning_rate": 3.4208863494098586e-05,
"loss": 0.1008,
"step": 2895
},
{
"epoch": 3.8,
"grad_norm": 2.211463212966919,
"learning_rate": 3.416093233447201e-05,
"loss": 0.0955,
"step": 2900
},
{
"epoch": 3.81,
"grad_norm": 2.137601137161255,
"learning_rate": 3.411296224619635e-05,
"loss": 0.1063,
"step": 2905
},
{
"epoch": 3.81,
"grad_norm": 2.2494797706604004,
"learning_rate": 3.4064953433116675e-05,
"loss": 0.1026,
"step": 2910
},
{
"epoch": 3.82,
"grad_norm": 2.046558141708374,
"learning_rate": 3.401690609924258e-05,
"loss": 0.1007,
"step": 2915
},
{
"epoch": 3.83,
"grad_norm": 2.0194621086120605,
"learning_rate": 3.396882044874736e-05,
"loss": 0.0924,
"step": 2920
},
{
"epoch": 3.83,
"grad_norm": 2.130725145339966,
"learning_rate": 3.392069668596716e-05,
"loss": 0.0976,
"step": 2925
},
{
"epoch": 3.84,
"grad_norm": 2.6385340690612793,
"learning_rate": 3.3872535015400035e-05,
"loss": 0.1062,
"step": 2930
},
{
"epoch": 3.85,
"grad_norm": 1.9961224794387817,
"learning_rate": 3.382433564170517e-05,
"loss": 0.1025,
"step": 2935
},
{
"epoch": 3.85,
"grad_norm": 2.336229085922241,
"learning_rate": 3.377609876970194e-05,
"loss": 0.0954,
"step": 2940
},
{
"epoch": 3.86,
"grad_norm": 2.0952861309051514,
"learning_rate": 3.372782460436908e-05,
"loss": 0.0983,
"step": 2945
},
{
"epoch": 3.87,
"grad_norm": 2.1143643856048584,
"learning_rate": 3.367951335084379e-05,
"loss": 0.1025,
"step": 2950
},
{
"epoch": 3.87,
"grad_norm": 2.1523244380950928,
"learning_rate": 3.363116521442087e-05,
"loss": 0.1022,
"step": 2955
},
{
"epoch": 3.88,
"grad_norm": 2.2812914848327637,
"learning_rate": 3.3582780400551864e-05,
"loss": 0.1058,
"step": 2960
},
{
"epoch": 3.89,
"grad_norm": 2.106767177581787,
"learning_rate": 3.353435911484417e-05,
"loss": 0.0975,
"step": 2965
},
{
"epoch": 3.89,
"grad_norm": 1.8392610549926758,
"learning_rate": 3.348590156306017e-05,
"loss": 0.1002,
"step": 2970
},
{
"epoch": 3.9,
"grad_norm": 2.386420726776123,
"learning_rate": 3.343740795111634e-05,
"loss": 0.1028,
"step": 2975
},
{
"epoch": 3.91,
"grad_norm": 2.3647854328155518,
"learning_rate": 3.338887848508242e-05,
"loss": 0.098,
"step": 2980
},
{
"epoch": 3.91,
"grad_norm": 2.3350307941436768,
"learning_rate": 3.334031337118048e-05,
"loss": 0.1101,
"step": 2985
},
{
"epoch": 3.92,
"grad_norm": 2.1085422039031982,
"learning_rate": 3.3291712815784104e-05,
"loss": 0.1061,
"step": 2990
},
{
"epoch": 3.93,
"grad_norm": 2.0244526863098145,
"learning_rate": 3.3243077025417443e-05,
"loss": 0.1001,
"step": 2995
},
{
"epoch": 3.93,
"grad_norm": 2.0625123977661133,
"learning_rate": 3.319440620675442e-05,
"loss": 0.0924,
"step": 3000
}
],
"logging_steps": 5,
"max_steps": 7620,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 1.0787435279725363e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}