{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9324922169424874,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.01, "grad_norm": 2.323194980621338, "learning_rate": 4.9999946882250004e-05, "loss": 0.2105, "step": 5 },
    { "epoch": 0.01, "grad_norm": 3.090125799179077, "learning_rate": 4.999978752922572e-05, "loss": 0.2656, "step": 10 },
    { "epoch": 0.02, "grad_norm": 2.8763957023620605, "learning_rate": 4.999952194160431e-05, "loss": 0.3075, "step": 15 },
    { "epoch": 0.03, "grad_norm": 2.714475631713867, "learning_rate": 4.999915012051437e-05, "loss": 0.3164, "step": 20 },
    { "epoch": 0.03, "grad_norm": 2.6384477615356445, "learning_rate": 4.999867206753593e-05, "loss": 0.3167, "step": 25 },
    { "epoch": 0.04, "grad_norm": 3.147310733795166, "learning_rate": 4.9998087784700426e-05, "loss": 0.3379, "step": 30 },
    { "epoch": 0.05, "grad_norm": 2.4968576431274414, "learning_rate": 4.9997397274490725e-05, "loss": 0.3289, "step": 35 },
    { "epoch": 0.05, "grad_norm": 2.5513858795166016, "learning_rate": 4.9996600539841096e-05, "loss": 0.3304, "step": 40 },
    { "epoch": 0.06, "grad_norm": 2.6501898765563965, "learning_rate": 4.99956975841372e-05, "loss": 0.3316, "step": 45 },
    { "epoch": 0.07, "grad_norm": 3.1661622524261475, "learning_rate": 4.9994688411216076e-05, "loss": 0.3352, "step": 50 },
    { "epoch": 0.07, "grad_norm": 3.1451425552368164, "learning_rate": 4.9993573025366124e-05, "loss": 0.3337, "step": 55 },
    { "epoch": 0.08, "grad_norm": 3.058828115463257, "learning_rate": 4.999235143132708e-05, "loss": 0.3283, "step": 60 },
    { "epoch": 0.09, "grad_norm": 2.6413097381591797, "learning_rate": 4.999102363429002e-05, "loss": 0.312, "step": 65 },
    { "epoch": 0.09, "grad_norm": 2.8983662128448486, "learning_rate": 4.99895896398973e-05, "loss": 0.3184, "step": 70 },
    { "epoch": 0.1, "grad_norm": 2.6056504249572754, "learning_rate": 4.998804945424258e-05, "loss": 0.3464, "step": 75 },
    { "epoch": 0.1, "grad_norm": 2.5964763164520264, "learning_rate": 4.998640308387074e-05, "loss": 0.335, "step": 80 },
    { "epoch": 0.11, "grad_norm": 2.594703435897827, "learning_rate": 4.9984650535777896e-05, "loss": 0.3487, "step": 85 },
    { "epoch": 0.12, "grad_norm": 2.996779203414917, "learning_rate": 4.9982791817411386e-05, "loss": 0.346, "step": 90 },
    { "epoch": 0.12, "grad_norm": 2.9310555458068848, "learning_rate": 4.998082693666966e-05, "loss": 0.3203, "step": 95 },
    { "epoch": 0.13, "grad_norm": 3.1505908966064453, "learning_rate": 4.997875590190233e-05, "loss": 0.3766, "step": 100 },
    { "epoch": 0.14, "grad_norm": 2.8678457736968994, "learning_rate": 4.9976578721910106e-05, "loss": 0.3404, "step": 105 },
    { "epoch": 0.14, "grad_norm": 2.8068525791168213, "learning_rate": 4.9974295405944714e-05, "loss": 0.3249, "step": 110 },
    { "epoch": 0.15, "grad_norm": 2.8355133533477783, "learning_rate": 4.9971905963708946e-05, "loss": 0.3226, "step": 115 },
    { "epoch": 0.16, "grad_norm": 3.0205864906311035, "learning_rate": 4.996941040535653e-05, "loss": 0.3613, "step": 120 },
    { "epoch": 0.16, "grad_norm": 2.9807019233703613, "learning_rate": 4.9966808741492153e-05, "loss": 0.3284, "step": 125 },
    { "epoch": 0.17, "grad_norm": 3.070850372314453, "learning_rate": 4.996410098317137e-05, "loss": 0.3217, "step": 130 },
    { "epoch": 0.18, "grad_norm": 3.1543736457824707, "learning_rate": 4.996128714190058e-05, "loss": 0.3636, "step": 135 },
    { "epoch": 0.18, "grad_norm": 2.80784273147583, "learning_rate": 4.995836722963699e-05, "loss": 0.3379, "step": 140 },
    { "epoch": 0.19, "grad_norm": 2.6561925411224365, "learning_rate": 4.9955341258788526e-05, "loss": 0.3442, "step": 145 },
    { "epoch": 0.2, "grad_norm": 2.814857006072998, "learning_rate": 4.99522092422138e-05, "loss": 0.3456, "step": 150 },
    { "epoch": 0.2, "grad_norm": 2.924438238143921, "learning_rate": 4.9948971193222086e-05, "loss": 0.3436, "step": 155 },
    { "epoch": 0.21, "grad_norm": 2.7895913124084473, "learning_rate": 4.994562712557319e-05, "loss": 0.3319, "step": 160 },
    { "epoch": 0.22, "grad_norm": 2.878596544265747, "learning_rate": 4.9942177053477474e-05, "loss": 0.342, "step": 165 },
    { "epoch": 0.22, "grad_norm": 3.0157933235168457, "learning_rate": 4.993862099159574e-05, "loss": 0.3335, "step": 170 },
    { "epoch": 0.23, "grad_norm": 2.585909366607666, "learning_rate": 4.99349589550392e-05, "loss": 0.3373, "step": 175 },
    { "epoch": 0.24, "grad_norm": 2.992539167404175, "learning_rate": 4.993119095936937e-05, "loss": 0.3318, "step": 180 },
    { "epoch": 0.24, "grad_norm": 2.995790719985962, "learning_rate": 4.992731702059805e-05, "loss": 0.3289, "step": 185 },
    { "epoch": 0.25, "grad_norm": 3.0879945755004883, "learning_rate": 4.9923337155187235e-05, "loss": 0.3309, "step": 190 },
    { "epoch": 0.26, "grad_norm": 2.9949581623077393, "learning_rate": 4.991925138004905e-05, "loss": 0.3471, "step": 195 },
    { "epoch": 0.26, "grad_norm": 2.873623847961426, "learning_rate": 4.991505971254566e-05, "loss": 0.3463, "step": 200 },
    { "epoch": 0.27, "grad_norm": 3.0350987911224365, "learning_rate": 4.9910762170489226e-05, "loss": 0.33, "step": 205 },
    { "epoch": 0.28, "grad_norm": 2.794652223587036, "learning_rate": 4.99063587721418e-05, "loss": 0.3723, "step": 210 },
    { "epoch": 0.28, "grad_norm": 2.4460439682006836, "learning_rate": 4.990184953621528e-05, "loss": 0.3512, "step": 215 },
    { "epoch": 0.29, "grad_norm": 2.772210121154785, "learning_rate": 4.989723448187131e-05, "loss": 0.3232, "step": 220 },
    { "epoch": 0.29, "grad_norm": 3.0419492721557617, "learning_rate": 4.989251362872119e-05, "loss": 0.3364, "step": 225 },
    { "epoch": 0.3, "grad_norm": 3.0731663703918457, "learning_rate": 4.988768699682579e-05, "loss": 0.355, "step": 230 },
    { "epoch": 0.31, "grad_norm": 2.81030535697937, "learning_rate": 4.9882754606695524e-05, "loss": 0.3158, "step": 235 },
    { "epoch": 0.31, "grad_norm": 2.7734286785125732, "learning_rate": 4.9877716479290174e-05, "loss": 0.3286, "step": 240 },
    { "epoch": 0.32, "grad_norm": 2.918984889984131, "learning_rate": 4.987257263601885e-05, "loss": 0.3314, "step": 245 },
    { "epoch": 0.33, "grad_norm": 2.953944206237793, "learning_rate": 4.986732309873992e-05, "loss": 0.3179, "step": 250 },
    { "epoch": 0.33, "grad_norm": 2.8582749366760254, "learning_rate": 4.986196788976086e-05, "loss": 0.3238, "step": 255 },
    { "epoch": 0.34, "grad_norm": 2.757632255554199, "learning_rate": 4.985650703183822e-05, "loss": 0.3413, "step": 260 },
    { "epoch": 0.35, "grad_norm": 2.9017035961151123, "learning_rate": 4.985094054817746e-05, "loss": 0.3335, "step": 265 },
    { "epoch": 0.35, "grad_norm": 3.1360483169555664, "learning_rate": 4.9845268462432916e-05, "loss": 0.3474, "step": 270 },
    { "epoch": 0.36, "grad_norm": 2.847700834274292, "learning_rate": 4.983949079870765e-05, "loss": 0.3471, "step": 275 },
    { "epoch": 0.37, "grad_norm": 2.8749804496765137, "learning_rate": 4.983360758155341e-05, "loss": 0.3389, "step": 280 },
    { "epoch": 0.37, "grad_norm": 2.9671127796173096, "learning_rate": 4.9827618835970426e-05, "loss": 0.3379, "step": 285 },
    { "epoch": 0.38, "grad_norm": 2.869534730911255, "learning_rate": 4.982152458740741e-05, "loss": 0.328, "step": 290 },
    { "epoch": 0.39, "grad_norm": 2.9593312740325928, "learning_rate": 4.981532486176138e-05, "loss": 0.348, "step": 295 },
    { "epoch": 0.39, "grad_norm": 3.288499593734741, "learning_rate": 4.980901968537758e-05, "loss": 0.3691, "step": 300 },
    { "epoch": 0.4, "grad_norm": 3.329684257507324, "learning_rate": 4.980260908504934e-05, "loss": 0.3426, "step": 305 },
    { "epoch": 0.41, "grad_norm": 2.8230020999908447, "learning_rate": 4.9796093088018e-05, "loss": 0.3367, "step": 310 },
    { "epoch": 0.41, "grad_norm": 2.743234157562256, "learning_rate": 4.978947172197277e-05, "loss": 0.3594, "step": 315 },
    { "epoch": 0.42, "grad_norm": 2.874333620071411, "learning_rate": 4.978274501505061e-05, "loss": 0.3394, "step": 320 },
    { "epoch": 0.43, "grad_norm": 3.2279603481292725, "learning_rate": 4.9775912995836136e-05, "loss": 0.3307, "step": 325 },
    { "epoch": 0.43, "grad_norm": 2.608811378479004, "learning_rate": 4.9768975693361454e-05, "loss": 0.3431, "step": 330 },
    { "epoch": 0.44, "grad_norm": 2.868130683898926, "learning_rate": 4.976193313710608e-05, "loss": 0.3273, "step": 335 },
    { "epoch": 0.45, "grad_norm": 3.2972702980041504, "learning_rate": 4.9754785356996787e-05, "loss": 0.3453, "step": 340 },
    { "epoch": 0.45, "grad_norm": 3.2560746669769287, "learning_rate": 4.9747532383407504e-05, "loss": 0.3831, "step": 345 },
    { "epoch": 0.46, "grad_norm": 3.038130760192871, "learning_rate": 4.9740174247159156e-05, "loss": 0.3916, "step": 350 },
    { "epoch": 0.47, "grad_norm": 3.453185796737671, "learning_rate": 4.973271097951956e-05, "loss": 0.3661, "step": 355 },
    { "epoch": 0.47, "grad_norm": 2.8996922969818115, "learning_rate": 4.9725142612203265e-05, "loss": 0.3685, "step": 360 },
    { "epoch": 0.48, "grad_norm": 3.0452466011047363, "learning_rate": 4.971746917737146e-05, "loss": 0.3723, "step": 365 },
    { "epoch": 0.49, "grad_norm": 3.064406156539917, "learning_rate": 4.970969070763177e-05, "loss": 0.4086, "step": 370 },
    { "epoch": 0.49, "grad_norm": 3.1569948196411133, "learning_rate": 4.9701807236038204e-05, "loss": 0.4095, "step": 375 },
    { "epoch": 0.5, "grad_norm": 3.0515644550323486, "learning_rate": 4.9693818796090927e-05, "loss": 0.4156, "step": 380 },
    { "epoch": 0.5, "grad_norm": 3.2472422122955322, "learning_rate": 4.968572542173617e-05, "loss": 0.4684, "step": 385 },
    { "epoch": 0.51, "grad_norm": 3.417625904083252, "learning_rate": 4.96775271473661e-05, "loss": 0.4493, "step": 390 },
    { "epoch": 0.52, "grad_norm": 3.272538185119629, "learning_rate": 4.9669224007818623e-05, "loss": 0.4514, "step": 395 },
    { "epoch": 0.52, "grad_norm": 2.8853087425231934, "learning_rate": 4.966081603837725e-05, "loss": 0.4629, "step": 400 },
    { "epoch": 0.53, "grad_norm": 3.1684935092926025, "learning_rate": 4.965230327477099e-05, "loss": 0.4347, "step": 405 },
    { "epoch": 0.54, "grad_norm": 3.0273630619049072, "learning_rate": 4.964368575317415e-05, "loss": 0.4532, "step": 410 },
    { "epoch": 0.54, "grad_norm": 2.8964247703552246, "learning_rate": 4.963496351020619e-05, "loss": 0.4514, "step": 415 },
    { "epoch": 0.55, "grad_norm": 3.0990092754364014, "learning_rate": 4.962613658293158e-05, "loss": 0.4611, "step": 420 },
    { "epoch": 0.56, "grad_norm": 3.376248836517334, "learning_rate": 4.961720500885967e-05, "loss": 0.4585, "step": 425 },
    { "epoch": 0.56, "grad_norm": 3.2961933612823486, "learning_rate": 4.960816882594443e-05, "loss": 0.4574, "step": 430 },
    { "epoch": 0.57, "grad_norm": 3.159632682800293, "learning_rate": 4.959902807258443e-05, "loss": 0.4567, "step": 435 },
    { "epoch": 0.58, "grad_norm": 3.271243095397949, "learning_rate": 4.958978278762255e-05, "loss": 0.4709, "step": 440 },
    { "epoch": 0.58, "grad_norm": 2.813108205795288, "learning_rate": 4.958043301034589e-05, "loss": 0.477, "step": 445 },
    { "epoch": 0.59, "grad_norm": 3.1648154258728027, "learning_rate": 4.95709787804856e-05, "loss": 0.4489, "step": 450 },
    { "epoch": 0.6, "grad_norm": 3.2871389389038086, "learning_rate": 4.9561420138216645e-05, "loss": 0.4604, "step": 455 },
    { "epoch": 0.6, "grad_norm": 3.215829849243164, "learning_rate": 4.955175712415773e-05, "loss": 0.4703, "step": 460 },
    { "epoch": 0.61, "grad_norm": 3.0727405548095703, "learning_rate": 4.954198977937106e-05, "loss": 0.4745, "step": 465 },
    { "epoch": 0.62, "grad_norm": 3.3414416313171387, "learning_rate": 4.953211814536217e-05, "loss": 0.4481, "step": 470 },
    { "epoch": 0.62, "grad_norm": 3.058262348175049, "learning_rate": 4.9522142264079794e-05, "loss": 0.4765, "step": 475 },
    { "epoch": 0.63, "grad_norm": 2.8146088123321533, "learning_rate": 4.951206217791564e-05, "loss": 0.4682, "step": 480 },
    { "epoch": 0.64, "grad_norm": 3.241665840148926, "learning_rate": 4.9501877929704215e-05, "loss": 0.4803, "step": 485 },
    { "epoch": 0.64, "grad_norm": 3.362031936645508, "learning_rate": 4.949158956272268e-05, "loss": 0.5213, "step": 490 },
    { "epoch": 0.65, "grad_norm": 3.4168310165405273, "learning_rate": 4.948119712069062e-05, "loss": 0.5243, "step": 495 },
    { "epoch": 0.66, "grad_norm": 3.469191312789917, "learning_rate": 4.9470700647769904e-05, "loss": 0.5824, "step": 500 },
    { "epoch": 0.66, "grad_norm": 3.592074394226074, "learning_rate": 4.9460100188564426e-05, "loss": 0.5777, "step": 505 },
    { "epoch": 0.67, "grad_norm": 3.4984512329101562, "learning_rate": 4.944939578812001e-05, "loss": 0.6011, "step": 510 },
    { "epoch": 0.68, "grad_norm": 3.6021430492401123, "learning_rate": 4.943858749192414e-05, "loss": 0.6145, "step": 515 },
    { "epoch": 0.68, "grad_norm": 3.7211990356445312, "learning_rate": 4.942767534590581e-05, "loss": 0.6159, "step": 520 },
    { "epoch": 0.69, "grad_norm": 3.5600759983062744, "learning_rate": 4.9416659396435304e-05, "loss": 0.5823, "step": 525 },
    { "epoch": 0.69, "grad_norm": 3.6605124473571777, "learning_rate": 4.940553969032403e-05, "loss": 0.6421, "step": 530 },
    { "epoch": 0.7, "grad_norm": 3.654963493347168, "learning_rate": 4.9394316274824284e-05, "loss": 0.6296, "step": 535 },
    { "epoch": 0.71, "grad_norm": 3.279911518096924, "learning_rate": 4.938298919762907e-05, "loss": 0.6206, "step": 540 },
    { "epoch": 0.71, "grad_norm": 3.3684518337249756, "learning_rate": 4.9371558506871893e-05, "loss": 0.618, "step": 545 },
    { "epoch": 0.72, "grad_norm": 3.6144015789031982, "learning_rate": 4.936002425112657e-05, "loss": 0.6063, "step": 550 },
    { "epoch": 0.73, "grad_norm": 3.5420987606048584, "learning_rate": 4.934838647940699e-05, "loss": 0.6417, "step": 555 },
    { "epoch": 0.73, "grad_norm": 3.549086570739746, "learning_rate": 4.933664524116694e-05, "loss": 0.6196, "step": 560 },
    { "epoch": 0.74, "grad_norm": 3.43263840675354, "learning_rate": 4.9324800586299854e-05, "loss": 0.6224, "step": 565 },
    { "epoch": 0.75, "grad_norm": 3.745143175125122, "learning_rate": 4.931285256513868e-05, "loss": 0.6052, "step": 570 },
    { "epoch": 0.75, "grad_norm": 3.4879541397094727, "learning_rate": 4.9300801228455536e-05, "loss": 0.6168, "step": 575 },
    { "epoch": 0.76, "grad_norm": 3.449970245361328, "learning_rate": 4.9288646627461645e-05, "loss": 0.6278, "step": 580 },
    { "epoch": 0.77, "grad_norm": 3.4467875957489014, "learning_rate": 4.9276388813807e-05, "loss": 0.5972, "step": 585 },
    { "epoch": 0.77, "grad_norm": 3.513766288757324, "learning_rate": 4.92640278395802e-05, "loss": 0.6289, "step": 590 },
    { "epoch": 0.78, "grad_norm": 3.6418793201446533, "learning_rate": 4.925156375730822e-05, "loss": 0.6228, "step": 595 },
    { "epoch": 0.79, "grad_norm": 3.486773729324341, "learning_rate": 4.923899661995617e-05, "loss": 0.5931, "step": 600 },
    { "epoch": 0.79, "grad_norm": 3.476663112640381, "learning_rate": 4.92263264809271e-05, "loss": 0.615, "step": 605 },
    { "epoch": 0.8, "grad_norm": 3.6010472774505615, "learning_rate": 4.9213553394061754e-05, "loss": 0.6221, "step": 610 },
    { "epoch": 0.81, "grad_norm": 3.489880323410034, "learning_rate": 4.920067741363835e-05, "loss": 0.6008, "step": 615 },
    { "epoch": 0.81, "grad_norm": 3.6153078079223633, "learning_rate": 4.918769859437232e-05, "loss": 0.6362, "step": 620 },
    { "epoch": 0.82, "grad_norm": 3.6381516456604004, "learning_rate": 4.9174616991416136e-05, "loss": 0.6391, "step": 625 },
    { "epoch": 0.83, "grad_norm": 3.5741944313049316, "learning_rate": 4.916143266035901e-05, "loss": 0.617, "step": 630 },
    { "epoch": 0.83, "grad_norm": 3.613692283630371, "learning_rate": 4.914814565722671e-05, "loss": 0.628, "step": 635 },
    { "epoch": 0.84, "grad_norm": 3.7904045581817627, "learning_rate": 4.913475603848129e-05, "loss": 0.6157, "step": 640 },
    { "epoch": 0.85, "grad_norm": 3.4816792011260986, "learning_rate": 4.912126386102086e-05, "loss": 0.6457, "step": 645 },
    { "epoch": 0.85, "grad_norm": 3.371680974960327, "learning_rate": 4.910766918217935e-05, "loss": 0.6304, "step": 650 },
    { "epoch": 0.86, "grad_norm": 3.502263069152832, "learning_rate": 4.909397205972627e-05, "loss": 0.6057, "step": 655 },
    { "epoch": 0.87, "grad_norm": 3.6000306606292725, "learning_rate": 4.908017255186643e-05, "loss": 0.6629, "step": 660 },
    { "epoch": 0.87, "grad_norm": 3.747457265853882, "learning_rate": 4.906627071723975e-05, "loss": 0.659, "step": 665 },
    { "epoch": 0.88, "grad_norm": 3.356635808944702, "learning_rate": 4.905226661492095e-05, "loss": 0.6263, "step": 670 },
    { "epoch": 0.88, "grad_norm": 3.6806087493896484, "learning_rate": 4.903816030441935e-05, "loss": 0.6128, "step": 675 },
    { "epoch": 0.89, "grad_norm": 3.5806305408477783, "learning_rate": 4.902395184567859e-05, "loss": 0.6538, "step": 680 },
    { "epoch": 0.9, "grad_norm": 3.4562089443206787, "learning_rate": 4.900964129907638e-05, "loss": 0.6271, "step": 685 },
    { "epoch": 0.9, "grad_norm": 3.561217784881592, "learning_rate": 4.8995228725424235e-05, "loss": 0.6683, "step": 690 },
    { "epoch": 0.91, "grad_norm": 3.419334650039673, "learning_rate": 4.898071418596724e-05, "loss": 0.6503, "step": 695 },
    { "epoch": 0.92, "grad_norm": 3.7222039699554443, "learning_rate": 4.8966097742383765e-05, "loss": 0.6211, "step": 700 },
    { "epoch": 0.92, "grad_norm": 3.767538070678711, "learning_rate": 4.895137945678522e-05, "loss": 0.6252, "step": 705 },
    { "epoch": 0.93, "grad_norm": 3.1880886554718018, "learning_rate": 4.893655939171578e-05, "loss": 0.6403, "step": 710 },
    { "epoch": 0.94, "grad_norm": 3.655524492263794, "learning_rate": 4.892163761015214e-05, "loss": 0.6344, "step": 715 },
    { "epoch": 0.94, "grad_norm": 3.342782735824585, "learning_rate": 4.890661417550319e-05, "loss": 0.6339, "step": 720 },
    { "epoch": 0.95, "grad_norm": 3.661858320236206, "learning_rate": 4.889148915160984e-05, "loss": 0.6554, "step": 725 },
    { "epoch": 0.96, "grad_norm": 3.906249761581421, "learning_rate": 4.887626260274465e-05, "loss": 0.6478, "step": 730 },
    { "epoch": 0.96, "grad_norm": 3.5664069652557373, "learning_rate": 4.886093459361163e-05, "loss": 0.652, "step": 735 },
    { "epoch": 0.97, "grad_norm": 3.413325786590576, "learning_rate": 4.8845505189345934e-05, "loss": 0.6491, "step": 740 },
    { "epoch": 0.98, "grad_norm": 3.2417361736297607, "learning_rate": 4.8829974455513564e-05, "loss": 0.6344, "step": 745 },
    { "epoch": 0.98, "grad_norm": 4.055202960968018, "learning_rate": 4.881434245811115e-05, "loss": 0.6458, "step": 750 },
    { "epoch": 0.99, "grad_norm": 3.9207661151885986, "learning_rate": 4.87986092635656e-05, "loss": 0.6493, "step": 755 },
    { "epoch": 1.0, "grad_norm": 3.7753255367279053, "learning_rate": 4.878277493873388e-05, "loss": 0.6141, "step": 760 },
    { "epoch": 1.0, "grad_norm": 2.760272979736328, "learning_rate": 4.876683955090267e-05, "loss": 0.4732, "step": 765 },
    { "epoch": 1.01, "grad_norm": 2.363640069961548, "learning_rate": 4.8750803167788136e-05, "loss": 0.2479, "step": 770 },
    { "epoch": 1.02, "grad_norm": 2.5096426010131836, "learning_rate": 4.87346658575356e-05, "loss": 0.2297, "step": 775 },
    { "epoch": 1.02, "grad_norm": 2.936283826828003, "learning_rate": 4.871842768871928e-05, "loss": 0.231, "step": 780 },
    { "epoch": 1.03, "grad_norm": 2.782768964767456, "learning_rate": 4.8702088730341965e-05, "loss": 0.2195, "step": 785 },
    { "epoch": 1.04, "grad_norm": 2.589066982269287, "learning_rate": 4.868564905183476e-05, "loss": 0.2205, "step": 790 },
    { "epoch": 1.04, "grad_norm": 2.7772037982940674, "learning_rate": 4.866910872305675e-05, "loss": 0.2144, "step": 795 },
    { "epoch": 1.05, "grad_norm": 2.887826919555664, "learning_rate": 4.865246781429476e-05, "loss": 0.2244, "step": 800 },
    { "epoch": 1.06, "grad_norm": 2.558298349380493, "learning_rate": 4.8635726396262996e-05, "loss": 0.2422, "step": 805 },
    { "epoch": 1.06, "grad_norm": 2.575716972351074, "learning_rate": 4.861888454010275e-05, "loss": 0.2223, "step": 810 },
    { "epoch": 1.07, "grad_norm": 2.8698527812957764, "learning_rate": 4.860194231738216e-05, "loss": 0.2164, "step": 815 },
    { "epoch": 1.07, "grad_norm": 2.723947763442993, "learning_rate": 4.8584899800095864e-05, "loss": 0.2332, "step": 820 },
    { "epoch": 1.08, "grad_norm": 2.9089255332946777, "learning_rate": 4.8567757060664644e-05, "loss": 0.2419, "step": 825 },
    { "epoch": 1.09, "grad_norm": 2.4928860664367676, "learning_rate": 4.8550514171935214e-05, "loss": 0.2268, "step": 830 },
    { "epoch": 1.09, "grad_norm": 2.4811110496520996, "learning_rate": 4.853317120717985e-05, "loss": 0.2137, "step": 835 },
    { "epoch": 1.1, "grad_norm": 3.3581888675689697, "learning_rate": 4.85157282400961e-05, "loss": 0.2416, "step": 840 },
    { "epoch": 1.11, "grad_norm": 2.5201902389526367, "learning_rate": 4.849818534480645e-05, "loss": 0.2263, "step": 845 },
    { "epoch": 1.11, "grad_norm": 2.51816463470459, "learning_rate": 4.8480542595858025e-05, "loss": 0.2346, "step": 850 },
    { "epoch": 1.12, "grad_norm": 2.437803030014038, "learning_rate": 4.846280006822228e-05, "loss": 0.2311, "step": 855 },
    { "epoch": 1.13, "grad_norm": 2.6363041400909424, "learning_rate": 4.844495783729467e-05, "loss": 0.2364, "step": 860 },
    { "epoch": 1.13, "grad_norm": 2.600130796432495, "learning_rate": 4.842701597889432e-05, "loss": 0.2292, "step": 865 },
    { "epoch": 1.14, "grad_norm": 2.5062355995178223, "learning_rate": 4.840897456926373e-05, "loss": 0.253, "step": 870 },
    { "epoch": 1.15, "grad_norm": 2.7811343669891357, "learning_rate": 4.8390833685068424e-05, "loss": 0.2347, "step": 875 },
    { "epoch": 1.15, "grad_norm": 2.58297061920166, "learning_rate": 4.837259340339665e-05, "loss": 0.2313, "step": 880 },
    { "epoch": 1.16, "grad_norm": 2.850160598754883, "learning_rate": 4.8354253801759e-05, "loss": 0.2433, "step": 885 },
    { "epoch": 1.17, "grad_norm": 2.6711654663085938, "learning_rate": 4.8335814958088166e-05, "loss": 0.2384, "step": 890 },
    { "epoch": 1.17, "grad_norm": 2.596914291381836, "learning_rate": 4.8317276950738525e-05, "loss": 0.2411, "step": 895 },
    { "epoch": 1.18, "grad_norm": 2.9930615425109863, "learning_rate": 4.829863985848587e-05, "loss": 0.2381, "step": 900 },
    { "epoch": 1.19, "grad_norm": 2.8596436977386475, "learning_rate": 4.827990376052702e-05, "loss": 0.2409, "step": 905 },
    { "epoch": 1.19, "grad_norm": 2.607304573059082, "learning_rate": 4.826106873647953e-05, "loss": 0.2387, "step": 910 },
    { "epoch": 1.2, "grad_norm": 2.747140645980835, "learning_rate": 4.824213486638133e-05, "loss": 0.2552, "step": 915 },
    { "epoch": 1.21, "grad_norm": 2.7441892623901367, "learning_rate": 4.822310223069039e-05, "loss": 0.2414, "step": 920 },
    { "epoch": 1.21, "grad_norm": 2.480534791946411, "learning_rate": 4.820397091028436e-05, "loss": 0.2451, "step": 925 },
    { "epoch": 1.22, "grad_norm": 2.65727162361145, "learning_rate": 4.818474098646026e-05, "loss": 0.2271, "step": 930 },
    { "epoch": 1.23, "grad_norm": 2.657895088195801, "learning_rate": 4.8165412540934116e-05, "loss": 0.2463, "step": 935 },
    { "epoch": 1.23, "grad_norm": 2.8118693828582764, "learning_rate": 4.814598565584062e-05, "loss": 0.2586, "step": 940 },
    { "epoch": 1.24, "grad_norm": 2.7916576862335205, "learning_rate": 4.812646041373275e-05, "loss": 0.2487, "step": 945 },
    { "epoch": 1.25, "grad_norm": 2.8312575817108154, "learning_rate": 4.810683689758147e-05, "loss": 0.2448, "step": 950 },
    { "epoch": 1.25, "grad_norm": 2.789888620376587, "learning_rate": 4.808711519077534e-05, "loss": 0.25, "step": 955 },
    { "epoch": 1.26, "grad_norm": 2.660008192062378, "learning_rate": 4.806729537712017e-05, "loss": 0.2592, "step": 960 },
    { "epoch": 1.26, "grad_norm": 2.80081844329834, "learning_rate": 4.8047377540838676e-05, "loss": 0.2633, "step": 965 },
    { "epoch": 1.27, "grad_norm": 2.5701184272766113, "learning_rate": 4.8027361766570117e-05, "loss": 0.2345, "step": 970 },
    { "epoch": 1.28, "grad_norm": 2.6467089653015137, "learning_rate": 4.8007248139369915e-05, "loss": 0.2421, "step": 975 },
    { "epoch": 1.28, "grad_norm": 2.8026981353759766, "learning_rate": 4.7987036744709326e-05, "loss": 0.2462, "step": 980 },
    { "epoch": 1.29, "grad_norm": 2.9150643348693848, "learning_rate": 4.7966727668475044e-05, "loss": 0.2516, "step": 985 },
    { "epoch": 1.3, "grad_norm": 2.872527837753296, "learning_rate": 4.794632099696888e-05, "loss": 0.2581, "step": 990 },
    { "epoch": 1.3, "grad_norm": 2.764134168624878, "learning_rate": 4.792581681690734e-05, "loss": 0.2707, "step": 995 },
    { "epoch": 1.31, "grad_norm": 2.886357069015503, "learning_rate": 4.790521521542129e-05, "loss": 0.2573, "step": 1000 },
    { "epoch": 1.32, "grad_norm": 2.990485429763794, "learning_rate": 4.788451628005561e-05, "loss": 0.2634, "step": 1005 },
    { "epoch": 1.32, "grad_norm": 2.758971691131592, "learning_rate": 4.786372009876876e-05, "loss": 0.2439, "step": 1010 },
    { "epoch": 1.33, "grad_norm": 2.70831561088562, "learning_rate": 4.784282675993245e-05, "loss": 0.241, "step": 1015 },
    { "epoch": 1.34, "grad_norm": 2.6341211795806885, "learning_rate": 4.782183635233124e-05, "loss": 0.2652, "step": 1020 },
    { "epoch": 1.34, "grad_norm": 2.7551965713500977, "learning_rate": 4.780074896516219e-05, "loss": 0.244, "step": 1025 },
    { "epoch": 1.35, "grad_norm": 3.252516508102417, "learning_rate": 4.7779564688034476e-05, "loss": 0.2594, "step": 1030 },
    { "epoch": 1.36, "grad_norm": 2.93808913230896, "learning_rate": 4.7758283610968985e-05, "loss": 0.2594, "step": 1035 },
    { "epoch": 1.36, "grad_norm": 2.767031192779541, "learning_rate": 4.773690582439795e-05, "loss": 0.2506, "step": 1040 },
    { "epoch": 1.37, "grad_norm": 2.6166746616363525, "learning_rate": 4.7715431419164566e-05, "loss": 0.2624, "step": 1045 },
    { "epoch": 1.38, "grad_norm": 2.9592745304107666, "learning_rate": 4.7693860486522604e-05, "loss": 0.2735, "step": 1050 },
    { "epoch": 1.38, "grad_norm": 2.8421945571899414, "learning_rate": 4.7672193118136e-05, "loss": 0.2693, "step": 1055 },
    { "epoch": 1.39, "grad_norm": 3.0941479206085205, "learning_rate": 4.7650429406078525e-05, "loss": 0.2563, "step": 1060 },
    { "epoch": 1.4, "grad_norm": 2.8086464405059814, "learning_rate": 4.762856944283331e-05, "loss": 0.2627, "step": 1065 },
    { "epoch": 1.4, "grad_norm": 2.981468439102173, "learning_rate": 4.760661332129254e-05, "loss": 0.2739, "step": 1070 },
    { "epoch": 1.41, "grad_norm": 2.7119858264923096, "learning_rate": 4.758456113475699e-05, "loss": 0.2697, "step": 1075 },
    { "epoch": 1.42, "grad_norm": 2.955040454864502, "learning_rate": 4.756241297693566e-05, "loss": 0.2713, "step": 1080 },
    { "epoch": 1.42, "grad_norm": 2.78459095954895, "learning_rate": 4.7540168941945376e-05, "loss": 0.2659, "step": 1085 },
    { "epoch": 1.43, "grad_norm": 2.754824161529541, "learning_rate": 4.751782912431038e-05, "loss": 0.2527, "step": 1090 },
    { "epoch": 1.44, "grad_norm": 2.916003465652466, "learning_rate": 4.749539361896195e-05, "loss": 0.2554, "step": 1095 },
    { "epoch": 1.44, "grad_norm": 2.9990346431732178, "learning_rate": 4.747286252123797e-05, "loss": 0.2449, "step": 1100 },
    { "epoch": 1.45, "grad_norm": 2.68816876411438, "learning_rate": 4.7450235926882524e-05, "loss": 0.2539, "step": 1105 },
    { "epoch": 1.46, "grad_norm": 2.7783591747283936, "learning_rate": 4.742751393204553e-05, "loss": 0.2673, "step": 1110 },
    { "epoch": 1.46, "grad_norm": 3.041889190673828, "learning_rate": 4.740469663328228e-05, "loss": 0.2692, "step": 1115 },
    { "epoch": 1.47, "grad_norm": 3.2789931297302246, "learning_rate": 4.738178412755306e-05, "loss": 0.2691, "step": 1120 },
    { "epoch": 1.47, "grad_norm": 2.8584647178649902, "learning_rate": 4.7358776512222737e-05, "loss": 0.2722, "step": 1125 },
    { "epoch": 1.48, "grad_norm": 2.982015371322632, "learning_rate": 4.7335673885060316e-05, "loss": 0.2721, "step": 1130 },
    { "epoch": 1.49, "grad_norm": 2.9325811862945557, "learning_rate": 4.731247634423858e-05, "loss": 0.2791, "step": 1135 },
    { "epoch": 1.49, "grad_norm": 2.9873268604278564, "learning_rate": 4.728918398833361e-05, "loss": 0.2805, "step": 1140 },
    { "epoch": 1.5, "grad_norm": 2.8286678791046143, "learning_rate": 4.726579691632442e-05, "loss": 0.2628, "step": 1145 },
    { "epoch": 1.51, "grad_norm": 2.6870853900909424, "learning_rate": 4.7242315227592496e-05, "loss": 0.2697, "step": 1150 },
    { "epoch": 1.51, "grad_norm": 2.881246566772461, "learning_rate": 4.721873902192139e-05, "loss": 0.2786, "step": 1155 },
    { "epoch": 1.52, "grad_norm": 2.676746129989624, "learning_rate": 4.719506839949631e-05, "loss": 0.2795, "step": 1160 },
    { "epoch": 1.53, "grad_norm": 2.8064475059509277, "learning_rate": 4.717130346090368e-05, "loss": 0.2729, "step": 1165 },
    { "epoch": 1.53, "grad_norm": 2.7660868167877197, "learning_rate": 4.7147444307130686e-05, "loss": 0.2752, "step": 1170 },
    { "epoch": 1.54, "grad_norm": 2.8748722076416016, "learning_rate": 4.71234910395649e-05, "loss": 0.2772, "step": 1175 },
    { "epoch": 1.55, "grad_norm": 2.691197633743286, "learning_rate": 4.7099443759993837e-05, "loss": 0.256, "step": 1180 },
    { "epoch": 1.55, "grad_norm": 2.8552544116973877, "learning_rate": 4.707530257060445e-05, "loss": 0.2758, "step": 1185 },
    { "epoch": 1.56, "grad_norm": 2.7499427795410156, "learning_rate": 4.705106757398282e-05, "loss": 0.2628, "step": 1190 },
    { "epoch": 1.57, "grad_norm": 2.6907596588134766, "learning_rate": 4.702673887311362e-05, "loss": 0.2662, "step": 1195 },
    { "epoch": 1.57, "grad_norm": 2.7225170135498047, "learning_rate": 4.7002316571379715e-05, "loss": 0.2709, "step": 1200 },
    { "epoch": 1.58, "grad_norm": 3.2904715538024902, "learning_rate": 4.697780077256172e-05, "loss": 0.2853, "step": 1205 },
    { "epoch": 1.59, "grad_norm": 2.7764620780944824, "learning_rate": 4.695319158083756e-05, "loss": 0.2623, "step": 1210 },
    { "epoch": 1.59, "grad_norm": 3.36917781829834, "learning_rate": 4.6928489100782046e-05, "loss": 0.2806, "step": 1215 },
    { "epoch": 1.6, "grad_norm": 3.3074262142181396, "learning_rate": 4.690369343736636e-05, "loss": 0.2834, "step": 1220 },
    { "epoch": 1.61, "grad_norm": 2.958819627761841, "learning_rate": 4.6878804695957716e-05, "loss": 0.2787, "step": 1225 },
    { "epoch": 1.61, "grad_norm": 2.8270795345306396, "learning_rate": 4.6853822982318816e-05, "loss": 0.2737, "step": 1230 },
    { "epoch": 1.62, "grad_norm": 2.6642744541168213, "learning_rate": 4.682874840260746e-05, "loss": 0.2872, "step": 1235 },
    { "epoch": 1.63, "grad_norm": 3.0754623413085938, "learning_rate": 4.680358106337607e-05, "loss": 0.2674, "step": 1240 },
    { "epoch": 1.63, "grad_norm": 3.076148271560669, "learning_rate": 4.6778321071571224e-05, "loss": 0.2769, "step": 1245 },
    { "epoch": 1.64, "grad_norm": 2.8592352867126465, "learning_rate": 4.675296853453326e-05, "loss": 0.2799, "step": 1250 },
    { "epoch": 1.65, "grad_norm": 3.153860330581665, "learning_rate": 4.6727523559995734e-05, "loss": 0.2812, "step": 1255 },
    { "epoch": 1.65, "grad_norm": 3.1477208137512207, "learning_rate": 4.6701986256085046e-05, "loss": 0.2818, "step": 1260 },
    { "epoch": 1.66, "grad_norm": 3.040626049041748, "learning_rate": 4.667635673131992e-05, "loss": 0.2832, "step": 1265 },
    { "epoch": 1.66, "grad_norm": 3.204580307006836, "learning_rate": 4.665063509461097e-05, "loss": 0.3009, "step": 1270 },
    { "epoch": 1.67, "grad_norm": 2.8025059700012207, "learning_rate": 4.662482145526024e-05, "loss": 0.2776, "step": 1275 },
    { "epoch": 1.68, "grad_norm": 3.0659685134887695, "learning_rate": 4.659891592296071e-05, "loss": 0.291, "step": 1280 },
    { "epoch": 1.68, "grad_norm": 2.9462106227874756, "learning_rate": 4.6572918607795876e-05, "loss": 0.287, "step": 1285 },
    { "epoch": 1.69, "grad_norm": 3.0103273391723633, "learning_rate": 4.6546829620239265e-05, "loss": 0.3025, "step": 1290 },
    { "epoch": 1.7, "grad_norm": 2.912851095199585, "learning_rate": 4.6520649071153916e-05, "loss": 0.2675, "step": 1295 },
    { "epoch": 1.7, "grad_norm": 3.1437137126922607, "learning_rate": 4.6494377071791996e-05, "loss": 0.2896, "step": 1300 },
    { "epoch": 1.71, "grad_norm": 2.8913474082946777, "learning_rate": 4.646801373379425e-05, "loss": 0.3142, "step": 1305 },
    { "epoch": 1.72, "grad_norm": 3.0581839084625244, "learning_rate": 4.644155916918959e-05, "loss": 0.293, "step": 1310 },
    { "epoch": 1.72, "grad_norm": 2.8686771392822266, "learning_rate": 4.641501349039456e-05, "loss": 0.273, "step": 1315 },
    { "epoch": 1.73, "grad_norm": 2.914700984954834, "learning_rate": 4.6388376810212905e-05, "loss": 0.2837, "step": 1320 },
    { "epoch": 1.74, "grad_norm": 3.2269139289855957, "learning_rate": 4.6361649241835056e-05, "loss": 0.2849, "step": 1325 },
    { "epoch": 1.74, "grad_norm": 3.0138943195343018, "learning_rate": 4.633483089883769e-05, "loss": 0.2854, "step": 1330 },
    { "epoch": 1.75, "grad_norm": 3.2977559566497803, "learning_rate": 4.63079218951832e-05, "loss": 0.2922, "step": 1335 },
    { "epoch": 1.76, "grad_norm": 3.0085713863372803, "learning_rate": 4.6280922345219255e-05, "loss": 0.2838, "step": 1340 },
    { "epoch": 1.76, "grad_norm": 3.183983087539673, "learning_rate": 4.625383236367827e-05, "loss": 0.282, "step": 1345 },
    { "epoch": 1.77, "grad_norm": 2.8702237606048584, "learning_rate": 4.6226652065676974e-05, "loss": 0.2786, "step": 1350 },
    { "epoch": 1.78, "grad_norm": 3.479321241378784, "learning_rate": 4.619938156671584e-05, "loss": 0.2904, "step": 1355 },
    { "epoch": 1.78, "grad_norm": 2.9285452365875244, "learning_rate": 4.61720209826787e-05, "loss": 0.2861, "step": 1360 },
    { "epoch": 1.79, "grad_norm": 3.244591236114502, "learning_rate": 4.6144570429832144e-05, "loss": 0.2928, "step": 1365 },
    { "epoch": 1.8, "grad_norm": 2.8110570907592773, "learning_rate": 4.6117030024825114e-05, "loss": 0.2904, "step": 1370 },
    { "epoch": 1.8, "grad_norm": 3.049492359161377, "learning_rate": 4.6089399884688356e-05, "loss": 0.2739, "step": 1375 },
    { "epoch": 1.81, "grad_norm": 2.960361957550049, "learning_rate": 4.606168012683394e-05, "loss": 0.3031, "step": 1380 },
    { "epoch": 1.82, "grad_norm": 3.257373571395874, "learning_rate": 4.603387086905475e-05, "loss": 0.2993, "step": 1385 },
    { "epoch": 1.82, "grad_norm": 3.0115904808044434, "learning_rate": 4.600597222952402e-05, "loss": 0.2915, "step": 1390 },
    { "epoch": 1.83, "grad_norm": 3.111074209213257, "learning_rate": 4.597798432679477e-05, "loss": 0.2948, "step": 1395 },
    { "epoch": 1.84, "grad_norm": 3.1926794052124023, "learning_rate": 4.594990727979937e-05, "loss": 0.2971, "step": 1400 },
    { "epoch": 1.84, "grad_norm": 2.913715362548828, "learning_rate": 4.5921741207848966e-05, "loss": 0.2844, "step": 1405 },
    { "epoch": 1.85, "grad_norm": 2.8652007579803467, "learning_rate": 4.5893486230633037e-05, "loss": 0.2687, "step": 1410 },
    { "epoch": 1.85, "grad_norm": 2.927306890487671, "learning_rate": 4.586514246821885e-05, "loss": 0.2984, "step": 1415 },
    { "epoch": 1.86, "grad_norm": 3.2218594551086426, "learning_rate": 4.583671004105096e-05, "loss": 0.2928, "step": 1420 },
    { "epoch": 1.87, "grad_norm": 3.1091806888580322, "learning_rate": 4.580818906995068e-05, "loss": 0.3024, "step": 1425 },
    { "epoch": 1.87, "grad_norm": 3.152013063430786, "learning_rate": 4.5779579676115604e-05, "loss": 0.2898, "step": 1430 },
    { "epoch": 1.88, "grad_norm": 3.037785053253174, "learning_rate": 4.575088198111905e-05, "loss": 0.3012, "step": 1435 },
    { "epoch": 1.89, "grad_norm": 3.125337600708008, "learning_rate": 4.5722096106909595e-05, "loss": 0.2982, "step": 1440 },
    { "epoch": 1.89, "grad_norm": 3.1015219688415527, "learning_rate": 4.56932221758105e-05, "loss": 0.3014, "step": 1445 },
    { "epoch": 1.9, "grad_norm": 3.0641446113586426, "learning_rate": 4.566426031051922e-05, "loss": 0.3057, "step": 1450 },
    { "epoch": 1.91, "grad_norm": 3.1846718788146973, "learning_rate": 4.56352106341069e-05, "loss": 0.2941, "step": 1455 },
    { "epoch": 1.91, "grad_norm": 2.9871373176574707, "learning_rate": 4.56060732700178e-05, "loss": 0.2902, "step": 1460 },
    { "epoch": 1.92, "grad_norm": 2.941716194152832, "learning_rate": 4.5576848342068826e-05, "loss": 0.2999, "step": 1465 },
    { "epoch": 1.93, "grad_norm": 2.8153445720672607, "learning_rate": 4.554753597444896e-05, "loss": 0.2855, "step": 1470 },
    { "epoch": 1.93, "grad_norm": 3.2046408653259277, "learning_rate": 4.551813629171878e-05, "loss": 0.3167, "step": 1475 },
    { "epoch": 1.94, "grad_norm": 3.2123496532440186, "learning_rate": 4.548864941880988e-05, "loss": 0.2929, "step": 1480 },
    { "epoch": 1.95, "grad_norm": 2.81064772605896, "learning_rate": 4.545907548102436e-05, "loss": 0.3059, "step": 1485 },
    { "epoch": 1.95, "grad_norm": 3.07346248626709, "learning_rate": 4.5429414604034307e-05, "loss": 0.2902, "step": 1490 },
    { "epoch": 1.96, "grad_norm": 2.8002560138702393, "learning_rate": 4.539966691388125e-05, "loss": 0.2918, "step": 1495 },
    { "epoch": 1.97, "grad_norm": 3.3515923023223877, "learning_rate": 4.536983253697561e-05, "loss": 0.304, "step": 1500 },
    { "epoch": 1.97, "grad_norm": 3.050218105316162, "learning_rate": 4.53399116000962e-05, "loss": 0.3163, "step": 1505 },
    { "epoch": 1.98, "grad_norm": 3.1914007663726807, "learning_rate": 4.530990423038962e-05, "loss": 0.3071, "step": 1510 },
    { "epoch": 1.99, "grad_norm": 3.180460214614868, "learning_rate": 4.527981055536982e-05, "loss": 0.3023, "step": 1515 },
    { "epoch": 1.99, "grad_norm": 3.2100706100463867, "learning_rate": 4.524963070291744e-05, "loss": 0.3219, "step": 1520 },
    { "epoch": 2.0, "grad_norm": 2.9520275592803955, "learning_rate": 4.5219364801279356e-05, "loss": 0.2968, "step": 1525 },
    { "epoch": 2.01, "grad_norm": 2.4291439056396484, "learning_rate": 4.51890129790681e-05, "loss": 0.17, "step": 1530 },
    { "epoch": 2.01, "grad_norm": 1.9606090784072876, "learning_rate": 4.5158575365261305e-05, "loss": 0.1316, "step": 1535 },
    { "epoch": 2.02, "grad_norm": 2.126908779144287, "learning_rate": 4.512805208920118e-05, "loss": 0.1281, "step": 1540 },
    { "epoch": 2.03, "grad_norm": 2.0146312713623047, "learning_rate": 4.509744328059395e-05, "loss": 0.1234, "step": 1545 },
    { "epoch": 2.03, "grad_norm": 1.9698853492736816, "learning_rate": 4.506674906950929e-05, "loss": 0.1341, "step": 1550 },
    { "epoch": 2.04, "grad_norm": 2.1764025688171387, "learning_rate": 4.5035969586379804e-05, "loss": 0.1331, "step": 1555 },
    { "epoch": 2.04, "grad_norm": 2.2242555618286133, "learning_rate": 4.5005104962000436e-05, "loss": 0.1325, "step": 1560 },
    { "epoch": 2.05, "grad_norm": 2.019362449645996, "learning_rate": 4.4974155327527926e-05, "loss": 0.1219, "step": 1565 },
    { "epoch": 2.06, "grad_norm": 2.3239810466766357, "learning_rate": 4.494312081448029e-05, "loss": 0.1304, "step": 1570 },
    { "epoch": 2.06, "grad_norm": 2.2973790168762207, "learning_rate": 4.4912001554736205e-05, "loss": 0.1316, "step": 1575 },
    { "epoch": 2.07, "grad_norm": 2.4513959884643555, "learning_rate": 4.488079768053447e-05, "loss": 0.133, "step": 1580 },
    { "epoch": 2.08, "grad_norm": 2.789614200592041, "learning_rate": 4.484950932447345e-05, "loss": 0.1378, "step": 1585 },
    { "epoch": 2.08, "grad_norm": 2.2913756370544434, "learning_rate": 4.481813661951052e-05, "loss": 0.1287, "step": 1590 },
    { "epoch": 2.09, "grad_norm": 2.1334588527679443, "learning_rate": 4.4786679698961476e-05, "loss": 0.1304, "step": 1595 },
    { "epoch": 2.1, "grad_norm": 2.3002805709838867, "learning_rate": 4.475513869649998e-05, "loss": 0.134, "step": 1600 },
    { "epoch": 2.1, "grad_norm": 2.2173187732696533, "learning_rate": 4.4723513746157004e-05, "loss": 0.1359, "step": 1605 },
    { "epoch": 2.11, "grad_norm": 1.9922655820846558, "learning_rate": 4.469180498232024e-05, "loss": 0.1403, "step": 1610 },
    { "epoch": 2.12, "grad_norm": 2.208549737930298, "learning_rate": 4.466001253973355e-05, "loss": 0.1316, "step": 1615 },
    { "epoch": 2.12, "grad_norm": 2.4228994846343994, "learning_rate": 4.4628136553496375e-05, "loss": 0.1336, "step": 1620 },
    { "epoch": 2.13, "grad_norm": 2.2046756744384766, "learning_rate": 4.459617715906316e-05, "loss": 0.1389, "step": 1625 },
    { "epoch": 2.14, "grad_norm": 2.3668532371520996, "learning_rate": 4.4564134492242805e-05, "loss": 0.1374, "step": 1630 },
    { "epoch": 2.14, "grad_norm": 2.3358521461486816, "learning_rate": 4.4532008689198056e-05, "loss": 0.1339, "step": 1635 },
    { "epoch": 2.15, "grad_norm": 2.4201912879943848, "learning_rate": 4.449979988644494e-05, "loss": 0.1324, "step": 1640 },
    { "epoch": 2.16, "grad_norm": 2.356771230697632, "learning_rate": 4.446750822085218e-05, "loss": 0.1496, "step": 1645 },
    { "epoch": 2.16, "grad_norm": 2.5749542713165283, "learning_rate": 4.4435133829640645e-05, "loss": 0.1446, "step": 1650 },
    { "epoch": 2.17, "grad_norm": 2.313682794570923, "learning_rate": 4.440267685038271e-05, "loss": 0.1417, "step": 1655 },
    { "epoch": 2.18, "grad_norm": 2.3327279090881348, "learning_rate": 4.437013742100171e-05, "loss": 0.1341, "step": 1660 },
    { "epoch": 2.18, "grad_norm": 2.482767105102539, "learning_rate": 4.4337515679771345e-05, "loss": 0.1402, "step": 1665 },
    { "epoch": 2.19, "grad_norm": 2.6034271717071533, "learning_rate": 4.4304811765315105e-05, "loss": 0.1498, "step": 1670 },
    { "epoch": 2.2, "grad_norm": 2.2677841186523438, "learning_rate": 4.427202581660565e-05, "loss": 0.1414, "step": 1675 },
    { "epoch": 2.2, "grad_norm": 2.3339622020721436, "learning_rate": 4.423915797296425e-05, "loss": 0.1377, "step": 1680 },
    { "epoch": 2.21, "grad_norm": 2.1083145141601562, "learning_rate": 4.420620837406018e-05, "loss": 0.1416, "step": 1685 },
    { "epoch": 2.22, "grad_norm": 2.400583267211914, "learning_rate": 4.4173177159910106e-05, "loss": 0.1383, "step": 1690 },
    { "epoch": 2.22, "grad_norm": 2.1524839401245117, "learning_rate": 4.414006447087755e-05, "loss": 0.1366, "step": 1695 },
    { "epoch": 2.23, "grad_norm": 2.1756019592285156, "learning_rate": 4.410687044767223e-05, "loss": 0.1402, "step": 1700 },
    { "epoch": 2.23, "grad_norm": 2.5507566928863525, "learning_rate": 4.407359523134949e-05, "loss": 0.1514, "step": 1705 },
    { "epoch": 2.24, "grad_norm": 2.152941942214966, "learning_rate": 4.4040238963309696e-05, "loss": 0.1451, "step": 1710 },
    { "epoch": 2.25, "grad_norm": 2.3613879680633545, "learning_rate": 4.400680178529765e-05, "loss": 0.1407, "step": 1715 },
    { "epoch": 2.25, "grad_norm": 2.624096393585205, "learning_rate": 4.397328383940196e-05, "loss": 0.1428, "step": 1720 },
    { "epoch": 2.26, "grad_norm": 2.358207941055298, "learning_rate": 4.393968526805447e-05, "loss": 0.1443, "step": 1725 },
    { "epoch": 2.27, "grad_norm": 2.758371353149414, "learning_rate": 4.3906006214029585e-05, "loss": 0.1568, "step": 1730 },
    { "epoch": 2.27, "grad_norm": 2.099876642227173, "learning_rate": 4.387224682044378e-05, "loss": 0.157, "step": 1735 },
    { "epoch": 2.28, "grad_norm": 2.3019683361053467, "learning_rate": 4.3838407230754885e-05, "loss": 0.1404, "step": 1740 },
    { "epoch": 2.29, "grad_norm": 2.589655637741089, "learning_rate": 4.3804487588761544e-05, "loss": 0.156, "step": 1745 },
    { "epoch": 2.29, "grad_norm": 2.4104435443878174, "learning_rate": 4.3770488038602555e-05, "loss": 0.1467, "step": 1750 },
    { "epoch": 2.3, "grad_norm": 2.6529500484466553, "learning_rate": 4.373640872475627e-05, "loss": 0.1475, "step": 1755 },
    { "epoch": 2.31, "grad_norm": 2.272524833679199, "learning_rate": 4.370224979204003e-05, "loss": 0.1423, "step": 1760 },
    { "epoch": 2.31, "grad_norm": 2.421292781829834, "learning_rate": 4.366801138560948e-05, "loss": 0.149, "step": 1765 },
    { "epoch": 2.32, "grad_norm": 2.280380964279175, "learning_rate": 4.3633693650957976e-05, "loss": 0.1468, "step": 1770 },
    { "epoch": 2.33, "grad_norm": 2.0802671909332275, "learning_rate": 4.3599296733916004e-05, "loss": 0.157, "step": 1775 },
    { "epoch": 2.33, "grad_norm": 2.234787940979004, "learning_rate": 4.3564820780650496e-05, "loss": 0.1428, "step": 1780 },
    { "epoch": 2.34, "grad_norm": 2.337618589401245, "learning_rate": 4.353026593766427e-05, "loss": 0.1459, "step": 1785 },
    { "epoch": 2.35, "grad_norm": 2.529278516769409, "learning_rate": 4.3495632351795367e-05, "loss": 0.1617, "step": 1790 },
    { "epoch": 2.35, "grad_norm": 2.2081286907196045, "learning_rate": 4.3460920170216425e-05, "loss": 0.1487, "step": 1795 },
    { "epoch": 2.36, "grad_norm": 2.180853843688965, "learning_rate": 4.34261295404341e-05, "loss": 0.139, "step": 1800 },
    { "epoch": 2.37, "grad_norm": 2.5588650703430176, "learning_rate": 4.339126061028837e-05, "loss": 0.1489, "step": 1805 },
    { "epoch": 2.37, "grad_norm": 2.441371202468872, "learning_rate": 4.335631352795199e-05, "loss": 0.1544, "step": 1810 },
    { "epoch": 2.38, "grad_norm": 2.429845094680786, "learning_rate": 4.332128844192977e-05, "loss": 0.151, "step": 1815 },
    { "epoch": 2.39, "grad_norm": 2.3163018226623535, "learning_rate": 4.328618550105802e-05, "loss": 0.1521, "step": 1820 },
    { "epoch": 2.39, "grad_norm": 2.4328958988189697, "learning_rate": 4.325100485450389e-05, "loss": 0.1581, "step": 1825 },
    { "epoch": 2.4, "grad_norm": 2.343770980834961, "learning_rate": 4.3215746651764686e-05, "loss": 0.1544, "step": 1830 },
    { "epoch": 2.41, "grad_norm": 2.4985294342041016, "learning_rate": 4.3180411042667354e-05, "loss": 0.1557, "step": 1835 },
    { "epoch": 2.41, "grad_norm": 2.6652395725250244, "learning_rate": 4.314499817736773e-05, "loss": 0.1465, "step": 1840 },
    { "epoch": 2.42, "grad_norm": 2.50243878364563, "learning_rate": 4.3109508206349945e-05, "loss": 0.1514, "step": 1845 },
    { "epoch": 2.43, "grad_norm": 2.537421703338623, "learning_rate": 4.30739412804258e-05, "loss": 0.155, "step": 1850 },
    { "epoch": 2.43, "grad_norm": 2.244147539138794, "learning_rate": 4.3038297550734096e-05, "loss": 0.15, "step": 1855 },
    { "epoch": 2.44, "grad_norm": 2.4972686767578125, "learning_rate": 4.300257716874001e-05, "loss": 0.1559, "step": 1860 },
    { "epoch": 2.44, "grad_norm": 2.495651960372925, "learning_rate": 4.296678028623446e-05, "loss": 0.1589, "step": 1865 },
    { "epoch": 2.45, "grad_norm": 2.649902582168579, "learning_rate": 4.293090705533342e-05, "loss": 0.1528, "step": 1870 },
    { "epoch": 2.46, "grad_norm": 2.281095266342163, "learning_rate": 4.2894957628477316e-05, "loss": 0.1639, "step": 1875 },
    { "epoch": 2.46, "grad_norm": 2.5233304500579834, "learning_rate": 4.285893215843036e-05, "loss": 0.1528, "step": 1880 },
    { "epoch": 2.47, "grad_norm": 2.6843202114105225, "learning_rate": 4.282283079827993e-05, "loss": 0.1623, "step": 1885 },
    { "epoch": 2.48, "grad_norm": 2.4476354122161865, "learning_rate": 4.278665370143583e-05, "loss": 0.1562, "step": 1890 },
    { "epoch": 2.48, "grad_norm": 2.337167501449585, "learning_rate": 4.2750401021629765e-05, "loss": 0.165, "step": 1895 },
    { "epoch": 2.49, "grad_norm": 2.610464096069336, "learning_rate": 4.271407291291459e-05, "loss": 0.1591, "step": 1900 },
    { "epoch": 2.5, "grad_norm": 2.4951589107513428, "learning_rate": 4.267766952966369e-05, "loss": 0.1587, "step": 1905 },
    { "epoch": 2.5, "grad_norm": 2.0912039279937744, "learning_rate": 4.2641191026570336e-05, "loss": 0.1529, "step": 1910 },
    { "epoch": 2.51, "grad_norm": 2.724330425262451, "learning_rate": 4.260463755864702e-05, "loss": 0.1693, "step": 1915 },
    { "epoch": 2.52, "grad_norm": 2.3671672344207764, "learning_rate": 4.256800928122475e-05, "loss": 0.157, "step": 1920 },
    { "epoch": 2.52, "grad_norm": 2.536565065383911, "learning_rate": 4.2531306349952496e-05, "loss": 0.1697, "step": 1925 },
    { "epoch": 2.53, "grad_norm": 2.5092501640319824, "learning_rate": 4.2494528920796406e-05, "loss": 0.1655, "step": 1930 },
    { "epoch": 2.54, "grad_norm": 2.6707546710968018, "learning_rate": 4.2457677150039224e-05, "loss": 0.1604, "step": 1935 },
    { "epoch": 2.54, "grad_norm": 2.4832890033721924, "learning_rate": 4.242075119427961e-05, "loss": 0.1504, "step": 1940 },
    { "epoch": 2.55, "grad_norm": 2.4126479625701904, "learning_rate": 4.238375121043145e-05, "loss": 0.1552, "step": 1945 },
    { "epoch": 2.56, "grad_norm": 2.3602805137634277, "learning_rate": 4.234667735572323e-05, "loss": 0.1556, "step": 1950 },
    { "epoch": 2.56, "grad_norm": 2.4358716011047363, "learning_rate": 4.230952978769731e-05, "loss": 0.1569, "step": 1955 },
    { "epoch": 2.57, "grad_norm": 2.6005828380584717, "learning_rate": 4.227230866420932e-05, "loss": 0.158, "step": 1960 },
    { "epoch": 2.58, "grad_norm": 2.054624557495117, "learning_rate": 4.223501414342745e-05, "loss": 0.1644, "step": 1965 },
    { "epoch": 2.58, "grad_norm": 2.5402703285217285, "learning_rate": 4.219764638383177e-05, "loss": 0.1587, "step": 1970 },
    { "epoch": 2.59, "grad_norm": 2.09084153175354, "learning_rate": 4.216020554421359e-05, "loss": 0.1561, "step": 1975 },
    { "epoch": 2.6, "grad_norm": 2.529383659362793, "learning_rate": 4.2122691783674786e-05, "loss": 0.1656, "step": 1980 },
    { "epoch": 2.6, "grad_norm": 2.9956157207489014, "learning_rate": 4.208510526162704e-05, "loss": 0.1649, "step": 1985 },
    { "epoch": 2.61, "grad_norm": 2.585899591445923, "learning_rate": 4.20474461377913e-05, "loss": 0.1635, "step": 1990 },
    { "epoch": 2.62, "grad_norm": 2.6515862941741943, "learning_rate": 4.200971457219699e-05, "loss": 0.1713, "step": 1995 },
    { "epoch": 2.62, "grad_norm": 2.22086501121521, "learning_rate": 4.197191072518139e-05, "loss": 0.151, "step": 2000 },
    { "epoch": 2.63, "grad_norm": 2.543677568435669, "learning_rate": 4.19340347573889e-05, "loss": 0.1756, "step": 2005 },
    { "epoch": 2.63, "grad_norm": 2.5006985664367676, "learning_rate": 4.1896086829770445e-05, "loss": 0.152, "step": 2010 },
    { "epoch": 2.64, "grad_norm": 2.534740924835205, "learning_rate": 4.185806710358268e-05, "loss": 0.1681, "step": 2015 },
    { "epoch": 2.65, "grad_norm": 2.562382459640503, "learning_rate": 4.181997574038741e-05, "loss": 0.162,
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.5193183422088623, |
|
"learning_rate": 4.178181290205082e-05, |
|
"loss": 0.1663, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.6807899475097656, |
|
"learning_rate": 4.174357875074285e-05, |
|
"loss": 0.1636, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.598508834838867, |
|
"learning_rate": 4.170527344893647e-05, |
|
"loss": 0.1704, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.301255464553833, |
|
"learning_rate": 4.1666897159406984e-05, |
|
"loss": 0.1644, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.4087393283843994, |
|
"learning_rate": 4.162845004523137e-05, |
|
"loss": 0.1739, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.3460421562194824, |
|
"learning_rate": 4.158993226978757e-05, |
|
"loss": 0.1658, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.640719175338745, |
|
"learning_rate": 4.155134399675378e-05, |
|
"loss": 0.1529, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.6366817951202393, |
|
"learning_rate": 4.151268539010777e-05, |
|
"loss": 0.176, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.4204182624816895, |
|
"learning_rate": 4.1473956614126225e-05, |
|
"loss": 0.1579, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.7791054248809814, |
|
"learning_rate": 4.1435157833383955e-05, |
|
"loss": 0.1604, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.4026386737823486, |
|
"learning_rate": 4.139628921275329e-05, |
|
"loss": 0.164, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.740560531616211, |
|
"learning_rate": 4.1357350917403314e-05, |
|
"loss": 0.1791, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.6298422813415527, |
|
"learning_rate": 4.131834311279919e-05, |
|
"loss": 0.1691, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.610245704650879, |
|
"learning_rate": 4.12792659647015e-05, |
|
"loss": 0.1694, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.5160694122314453, |
|
"learning_rate": 4.124011963916541e-05, |
|
"loss": 0.1712, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.4107940196990967, |
|
"learning_rate": 4.1200904302540136e-05, |
|
"loss": 0.1587, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.5999083518981934, |
|
"learning_rate": 4.116162012146809e-05, |
|
"loss": 0.1683, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.592486619949341, |
|
"learning_rate": 4.112226726288427e-05, |
|
"loss": 0.1673, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.6168549060821533, |
|
"learning_rate": 4.1082845894015495e-05, |
|
"loss": 0.1573, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.690314769744873, |
|
"learning_rate": 4.104335618237972e-05, |
|
"loss": 0.1763, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.612140417098999, |
|
"learning_rate": 4.1003798295785325e-05, |
|
"loss": 0.1671, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.6909706592559814, |
|
"learning_rate": 4.096417240233036e-05, |
|
"loss": 0.1653, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.353872299194336, |
|
"learning_rate": 4.092447867040191e-05, |
|
"loss": 0.1721, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.776252508163452, |
|
"learning_rate": 4.088471726867531e-05, |
|
"loss": 0.1792, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.5471363067626953, |
|
"learning_rate": 4.084488836611346e-05, |
|
"loss": 0.1728, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.5439553260803223, |
|
"learning_rate": 4.080499213196607e-05, |
|
"loss": 0.1734, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.695373773574829, |
|
"learning_rate": 4.076502873576903e-05, |
|
"loss": 0.1625, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 2.5151877403259277, |
|
"learning_rate": 4.072499834734357e-05, |
|
"loss": 0.1598, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.4009640216827393, |
|
"learning_rate": 4.068490113679563e-05, |
|
"loss": 0.1574, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.4583699703216553, |
|
"learning_rate": 4.06447372745151e-05, |
|
"loss": 0.1689, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.4071240425109863, |
|
"learning_rate": 4.060450693117511e-05, |
|
"loss": 0.1722, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.36995267868042, |
|
"learning_rate": 4.056421027773126e-05, |
|
"loss": 0.1709, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.5631325244903564, |
|
"learning_rate": 4.0523847485420984e-05, |
|
"loss": 0.173, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.7295174598693848, |
|
"learning_rate": 4.048341872576272e-05, |
|
"loss": 0.173, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.5564053058624268, |
|
"learning_rate": 4.044292417055525e-05, |
|
"loss": 0.1684, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.627962589263916, |
|
"learning_rate": 4.040236399187696e-05, |
|
"loss": 0.1717, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.5658953189849854, |
|
"learning_rate": 4.0361738362085064e-05, |
|
"loss": 0.1719, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 2.4268243312835693, |
|
"learning_rate": 4.032104745381494e-05, |
|
"loss": 0.1612, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 2.6990509033203125, |
|
"learning_rate": 4.028029143997935e-05, |
|
"loss": 0.1671, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.4805703163146973, |
|
"learning_rate": 4.0239470493767704e-05, |
|
"loss": 0.1735, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.5650622844696045, |
|
"learning_rate": 4.019858478864534e-05, |
|
"loss": 0.1662, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.4036471843719482, |
|
"learning_rate": 4.015763449835281e-05, |
|
"loss": 0.1571, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.5735135078430176, |
|
"learning_rate": 4.0116619796905104e-05, |
|
"loss": 0.1676, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.7664101123809814, |
|
"learning_rate": 4.0075540858590883e-05, |
|
"loss": 0.1825, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.384687900543213, |
|
"learning_rate": 4.003439785797183e-05, |
|
"loss": 0.169, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 2.6205379962921143, |
|
"learning_rate": 3.999319096988183e-05, |
|
"loss": 0.1745, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.52746319770813, |
|
"learning_rate": 3.995192036942625e-05, |
|
"loss": 0.166, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.5834269523620605, |
|
"learning_rate": 3.991058623198123e-05, |
|
"loss": 0.1758, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.5553784370422363, |
|
"learning_rate": 3.9869188733192846e-05, |
|
"loss": 0.1755, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.5766592025756836, |
|
"learning_rate": 3.982772804897649e-05, |
|
"loss": 0.1687, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.7371819019317627, |
|
"learning_rate": 3.978620435551599e-05, |
|
"loss": 0.1705, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.321173906326294, |
|
"learning_rate": 3.974461782926299e-05, |
|
"loss": 0.162, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.64091420173645, |
|
"learning_rate": 3.970296864693609e-05, |
|
"loss": 0.1652, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.3675230741500854, |
|
"learning_rate": 3.9661256985520156e-05, |
|
"loss": 0.1358, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.4976637363433838, |
|
"learning_rate": 3.961948302226557e-05, |
|
"loss": 0.0672, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.758554220199585, |
|
"learning_rate": 3.957764693468743e-05, |
|
"loss": 0.066, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 1.4874210357666016, |
|
"learning_rate": 3.953574890056485e-05, |
|
"loss": 0.0629, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.7129358053207397, |
|
"learning_rate": 3.9493789097940185e-05, |
|
"loss": 0.0642, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.8975974321365356, |
|
"learning_rate": 3.9451767705118246e-05, |
|
"loss": 0.0679, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.5627552270889282, |
|
"learning_rate": 3.940968490066559e-05, |
|
"loss": 0.0642, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.6444036960601807, |
|
"learning_rate": 3.9367540863409714e-05, |
|
"loss": 0.0691, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.6043704748153687, |
|
"learning_rate": 3.932533577243835e-05, |
|
"loss": 0.0644, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 1.6515140533447266, |
|
"learning_rate": 3.9283069807098636e-05, |
|
"loss": 0.0729, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.6297287940979004, |
|
"learning_rate": 3.9240743146996425e-05, |
|
"loss": 0.068, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.557881236076355, |
|
"learning_rate": 3.919835597199548e-05, |
|
"loss": 0.0688, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.708101511001587, |
|
"learning_rate": 3.915590846221669e-05, |
|
"loss": 0.0673, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.6620030403137207, |
|
"learning_rate": 3.911340079803736e-05, |
|
"loss": 0.0702, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.938750982284546, |
|
"learning_rate": 3.9070833160090415e-05, |
|
"loss": 0.0695, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.7759830951690674, |
|
"learning_rate": 3.902820572926362e-05, |
|
"loss": 0.0732, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 1.846912145614624, |
|
"learning_rate": 3.898551868669883e-05, |
|
"loss": 0.0668, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 1.79542076587677, |
|
"learning_rate": 3.8942772213791224e-05, |
|
"loss": 0.0714, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.5793654918670654, |
|
"learning_rate": 3.889996649218852e-05, |
|
"loss": 0.0682, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.8609654903411865, |
|
"learning_rate": 3.8857101703790196e-05, |
|
"loss": 0.0738, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.681381344795227, |
|
"learning_rate": 3.881417803074676e-05, |
|
"loss": 0.0747, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 2.0434184074401855, |
|
"learning_rate": 3.877119565545891e-05, |
|
"loss": 0.0806, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.8694220781326294, |
|
"learning_rate": 3.8728154760576817e-05, |
|
"loss": 0.0884, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.9645694494247437, |
|
"learning_rate": 3.868505552899931e-05, |
|
"loss": 0.0875, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.8987983465194702, |
|
"learning_rate": 3.8641898143873155e-05, |
|
"loss": 0.0917, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.0280187129974365, |
|
"learning_rate": 3.859868278859218e-05, |
|
"loss": 0.0878, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.094468832015991, |
|
"learning_rate": 3.855540964679658e-05, |
|
"loss": 0.0877, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.9860283136367798, |
|
"learning_rate": 3.851207890237213e-05, |
|
"loss": 0.0915, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.1709773540496826, |
|
"learning_rate": 3.846869073944934e-05, |
|
"loss": 0.095, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.8917242288589478, |
|
"learning_rate": 3.842524534240276e-05, |
|
"loss": 0.0895, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.9368691444396973, |
|
"learning_rate": 3.8381742895850106e-05, |
|
"loss": 0.0921, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 2.072715997695923, |
|
"learning_rate": 3.8338183584651554e-05, |
|
"loss": 0.0905, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 2.0285987854003906, |
|
"learning_rate": 3.8294567593908915e-05, |
|
"loss": 0.0941, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 2.015815019607544, |
|
"learning_rate": 3.825089510896485e-05, |
|
"loss": 0.0918, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 2.0444390773773193, |
|
"learning_rate": 3.820716631540209e-05, |
|
"loss": 0.0938, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 2.241682291030884, |
|
"learning_rate": 3.816338139904265e-05, |
|
"loss": 0.0981, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 2.1586482524871826, |
|
"learning_rate": 3.811954054594702e-05, |
|
"loss": 0.0916, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.968621850013733, |
|
"learning_rate": 3.807564394241341e-05, |
|
"loss": 0.0886, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.180476427078247, |
|
"learning_rate": 3.8031691774976904e-05, |
|
"loss": 0.0955, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.9769107103347778, |
|
"learning_rate": 3.7987684230408735e-05, |
|
"loss": 0.0933, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.7934843301773071, |
|
"learning_rate": 3.794362149571545e-05, |
|
"loss": 0.087, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 2.203385591506958, |
|
"learning_rate": 3.7899503758138114e-05, |
|
"loss": 0.0927, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 2.1554219722747803, |
|
"learning_rate": 3.78553312051515e-05, |
|
"loss": 0.0917, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 2.0353715419769287, |
|
"learning_rate": 3.781110402446337e-05, |
|
"loss": 0.0961, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.0579957962036133, |
|
"learning_rate": 3.776682240401357e-05, |
|
"loss": 0.1026, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 2.3313910961151123, |
|
"learning_rate": 3.772248653197331e-05, |
|
"loss": 0.0908, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 2.0414376258850098, |
|
"learning_rate": 3.767809659674433e-05, |
|
"loss": 0.0909, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 2.0286598205566406, |
|
"learning_rate": 3.7633652786958105e-05, |
|
"loss": 0.0968, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 2.367244005203247, |
|
"learning_rate": 3.758915529147506e-05, |
|
"loss": 0.0923, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.8567143678665161, |
|
"learning_rate": 3.754460429938373e-05, |
|
"loss": 0.092, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 2.1443471908569336, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0926, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.019869089126587, |
|
"learning_rate": 3.745534258286627e-05, |
|
"loss": 0.0851, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.1114661693573, |
|
"learning_rate": 3.741063223775066e-05, |
|
"loss": 0.0867, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 2.078768253326416, |
|
"learning_rate": 3.736586915464621e-05, |
|
"loss": 0.0949, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.929516315460205, |
|
"learning_rate": 3.732105352377004e-05, |
|
"loss": 0.0931, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 2.1101810932159424, |
|
"learning_rate": 3.727618553556262e-05, |
|
"loss": 0.0943, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 2.1217589378356934, |
|
"learning_rate": 3.723126538068686e-05, |
|
"loss": 0.1018, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.1733384132385254, |
|
"learning_rate": 3.718629325002736e-05, |
|
"loss": 0.0931, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.986570119857788, |
|
"learning_rate": 3.714126933468959e-05, |
|
"loss": 0.0977, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 2.1583731174468994, |
|
"learning_rate": 3.709619382599909e-05, |
|
"loss": 0.0959, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.0934367179870605, |
|
"learning_rate": 3.705106691550063e-05, |
|
"loss": 0.093, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.1203556060791016, |
|
"learning_rate": 3.700588879495739e-05, |
|
"loss": 0.0969, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 1.69106924533844, |
|
"learning_rate": 3.6960659656350186e-05, |
|
"loss": 0.0935, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 2.283950090408325, |
|
"learning_rate": 3.6915379691876615e-05, |
|
"loss": 0.0961, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 2.030233383178711, |
|
"learning_rate": 3.6870049093950284e-05, |
|
"loss": 0.0968, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 2.127016544342041, |
|
"learning_rate": 3.682466805519992e-05, |
|
"loss": 0.0986, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 2.0715529918670654, |
|
"learning_rate": 3.677923676846864e-05, |
|
"loss": 0.0908, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 2.094383955001831, |
|
"learning_rate": 3.673375542681305e-05, |
|
"loss": 0.0971, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 2.1275360584259033, |
|
"learning_rate": 3.668822422350247e-05, |
|
"loss": 0.1002, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 2.068857192993164, |
|
"learning_rate": 3.6642643352018116e-05, |
|
"loss": 0.0893, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.982419490814209, |
|
"learning_rate": 3.659701300605224e-05, |
|
"loss": 0.097, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 2.1440236568450928, |
|
"learning_rate": 3.6551333379507346e-05, |
|
"loss": 0.1063, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 1.987056016921997, |
|
"learning_rate": 3.650560466649538e-05, |
|
"loss": 0.0935, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.9351013898849487, |
|
"learning_rate": 3.645982706133682e-05, |
|
"loss": 0.0901, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 2.3136351108551025, |
|
"learning_rate": 3.641400075855995e-05, |
|
"loss": 0.0992, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 2.232473611831665, |
|
"learning_rate": 3.636812595289998e-05, |
|
"loss": 0.104, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.2792115211486816, |
|
"learning_rate": 3.632220283929822e-05, |
|
"loss": 0.1002, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 2.2950987815856934, |
|
"learning_rate": 3.627623161290127e-05, |
|
"loss": 0.1014, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.9094067811965942, |
|
"learning_rate": 3.623021246906018e-05, |
|
"loss": 0.1012, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.9479122161865234, |
|
"learning_rate": 3.618414560332962e-05, |
|
"loss": 0.0971, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.8091012239456177, |
|
"learning_rate": 3.6138031211467044e-05, |
|
"loss": 0.103, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 2.095745086669922, |
|
"learning_rate": 3.609186948943188e-05, |
|
"loss": 0.0953, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 2.011467218399048, |
|
"learning_rate": 3.604566063338467e-05, |
|
"loss": 0.1009, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 2.2293827533721924, |
|
"learning_rate": 3.599940483968625e-05, |
|
"loss": 0.0942, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 1.9385508298873901, |
|
"learning_rate": 3.595310230489692e-05, |
|
"loss": 0.0961, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 2.123690366744995, |
|
"learning_rate": 3.5906753225775586e-05, |
|
"loss": 0.0982, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 2.051839828491211, |
|
"learning_rate": 3.586035779927896e-05, |
|
"loss": 0.1023, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 2.269162654876709, |
|
"learning_rate": 3.581391622256069e-05, |
|
"loss": 0.0995, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.9228086471557617, |
|
"learning_rate": 3.576742869297056e-05, |
|
"loss": 0.0998, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 1.946722149848938, |
|
"learning_rate": 3.5720895408053574e-05, |
|
"loss": 0.0968, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 2.1756057739257812, |
|
"learning_rate": 3.567431656554923e-05, |
|
"loss": 0.0912, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.7844388484954834, |
|
"learning_rate": 3.562769236339058e-05, |
|
"loss": 0.0957, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 2.02689528465271, |
|
"learning_rate": 3.5581022999703464e-05, |
|
"loss": 0.0926, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 2.160264730453491, |
|
"learning_rate": 3.553430867280557e-05, |
|
"loss": 0.0974, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.962109923362732, |
|
"learning_rate": 3.548754958120573e-05, |
|
"loss": 0.0969, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.9709469079971313, |
|
"learning_rate": 3.544074592360294e-05, |
|
"loss": 0.0969, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.968109130859375, |
|
"learning_rate": 3.5393897898885606e-05, |
|
"loss": 0.1024, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.9209555387496948, |
|
"learning_rate": 3.534700570613067e-05, |
|
"loss": 0.1017, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 2.151937961578369, |
|
"learning_rate": 3.530006954460274e-05, |
|
"loss": 0.1007, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 1.9381475448608398, |
|
"learning_rate": 3.525308961375329e-05, |
|
"loss": 0.0947, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 1.9445606470108032, |
|
"learning_rate": 3.520606611321976e-05, |
|
"loss": 0.1005, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 2.063396692276001, |
|
"learning_rate": 3.515899924282478e-05, |
|
"loss": 0.1041, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.594733715057373, |
|
"learning_rate": 3.511188920257523e-05, |
|
"loss": 0.0985, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 2.2250747680664062, |
|
"learning_rate": 3.506473619266146e-05, |
|
"loss": 0.0956, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 2.2515833377838135, |
|
"learning_rate": 3.501754041345643e-05, |
|
"loss": 0.097, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 2.0630807876586914, |
|
"learning_rate": 3.497030206551481e-05, |
|
"loss": 0.1029, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 2.0855114459991455, |
|
"learning_rate": 3.492302134957218e-05, |
|
"loss": 0.1018, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 2.0847525596618652, |
|
"learning_rate": 3.487569846654417e-05, |
|
"loss": 0.0974, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 2.245652675628662, |
|
"learning_rate": 3.4828333617525586e-05, |
|
"loss": 0.0982, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 2.2418930530548096, |
|
"learning_rate": 3.4780927003789556e-05, |
|
"loss": 0.0984, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 2.1843297481536865, |
|
"learning_rate": 3.47334788267867e-05, |
|
"loss": 0.0971, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 2.1401710510253906, |
|
"learning_rate": 3.468598928814425e-05, |
|
"loss": 0.0983, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 2.237949848175049, |
|
"learning_rate": 3.4638458589665194e-05, |
|
"loss": 0.1012, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 2.101795196533203, |
|
"learning_rate": 3.459088693332743e-05, |
|
"loss": 0.0957, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 2.1468005180358887, |
|
"learning_rate": 3.454327452128292e-05, |
|
"loss": 0.1016, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 2.139878034591675, |
|
"learning_rate": 3.449562155585679e-05, |
|
"loss": 0.0956, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 2.2107348442077637, |
|
"learning_rate": 3.444792823954651e-05, |
|
"loss": 0.1002, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 2.1431825160980225, |
|
"learning_rate": 3.440019477502101e-05, |
|
"loss": 0.0979, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 2.027465581893921, |
|
"learning_rate": 3.435242136511984e-05, |
|
"loss": 0.0988, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.9341384172439575, |
|
"learning_rate": 3.430460821285225e-05, |
|
"loss": 0.0945, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 2.29193377494812, |
|
"learning_rate": 3.425675552139645e-05, |
|
"loss": 0.0993, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 2.169417381286621, |
|
"learning_rate": 3.4208863494098586e-05, |
|
"loss": 0.1008, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 2.211463212966919, |
|
"learning_rate": 3.416093233447201e-05, |
|
"loss": 0.0955, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.137601137161255, |
|
"learning_rate": 3.411296224619635e-05, |
|
"loss": 0.1063, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.2494797706604004, |
|
"learning_rate": 3.4064953433116675e-05, |
|
"loss": 0.1026, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 2.046558141708374, |
|
"learning_rate": 3.401690609924258e-05, |
|
"loss": 0.1007, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 2.0194621086120605, |
|
"learning_rate": 3.396882044874736e-05, |
|
"loss": 0.0924, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 2.130725145339966, |
|
"learning_rate": 3.392069668596716e-05, |
|
"loss": 0.0976, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 2.6385340690612793, |
|
"learning_rate": 3.3872535015400035e-05, |
|
"loss": 0.1062, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.9961224794387817, |
|
"learning_rate": 3.382433564170517e-05, |
|
"loss": 0.1025, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 2.336229085922241, |
|
"learning_rate": 3.377609876970194e-05, |
|
"loss": 0.0954, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 2.0952861309051514, |
|
"learning_rate": 3.372782460436908e-05, |
|
"loss": 0.0983, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 2.1143643856048584, |
|
"learning_rate": 3.367951335084379e-05, |
|
"loss": 0.1025, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 2.1523244380950928, |
|
"learning_rate": 3.363116521442087e-05, |
|
"loss": 0.1022, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.2812914848327637, |
|
"learning_rate": 3.3582780400551864e-05, |
|
"loss": 0.1058, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 2.106767177581787, |
|
"learning_rate": 3.353435911484417e-05, |
|
"loss": 0.0975, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 1.8392610549926758, |
|
"learning_rate": 3.348590156306017e-05, |
|
"loss": 0.1002, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 2.386420726776123, |
|
"learning_rate": 3.343740795111634e-05, |
|
"loss": 0.1028, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 2.3647854328155518, |
|
"learning_rate": 3.338887848508242e-05, |
|
"loss": 0.098, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 2.3350307941436768, |
|
"learning_rate": 3.334031337118048e-05, |
|
"loss": 0.1101, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 2.1085422039031982, |
|
"learning_rate": 3.3291712815784104e-05, |
|
"loss": 0.1061, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 2.0244526863098145, |
|
"learning_rate": 3.3243077025417443e-05, |
|
"loss": 0.1001, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 2.0625123977661133, |
|
"learning_rate": 3.319440620675442e-05, |
|
"loss": 0.0924, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 7620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 1.0787435279725363e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |