|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.24991175432403812,
  "eval_steps": 500,
  "global_step": 531,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0004706436051300153, "grad_norm": 0.6595008969306946, "learning_rate": 2.5000000000000004e-07, "loss": 9.6515, "step": 1},
    {"epoch": 0.0009412872102600306, "grad_norm": 0.7391405701637268, "learning_rate": 5.000000000000001e-07, "loss": 9.5434, "step": 2},
    {"epoch": 0.0014119308153900459, "grad_norm": 0.8721428513526917, "learning_rate": 7.5e-07, "loss": 9.0645, "step": 3},
    {"epoch": 0.0018825744205200612, "grad_norm": 0.9540417790412903, "learning_rate": 1.0000000000000002e-06, "loss": 9.0978, "step": 4},
    {"epoch": 0.0023532180256500765, "grad_norm": 1.0068703889846802, "learning_rate": 1.25e-06, "loss": 8.8096, "step": 5},
    {"epoch": 0.0028238616307800918, "grad_norm": 0.7046281695365906, "learning_rate": 1.5e-06, "loss": 9.5863, "step": 6},
    {"epoch": 0.003294505235910107, "grad_norm": 1.027761459350586, "learning_rate": 1.7500000000000002e-06, "loss": 9.3746, "step": 7},
    {"epoch": 0.0037651488410401224, "grad_norm": 0.7785173058509827, "learning_rate": 2.0000000000000003e-06, "loss": 9.1443, "step": 8},
    {"epoch": 0.004235792446170138, "grad_norm": 0.8485608696937561, "learning_rate": 2.25e-06, "loss": 9.2293, "step": 9},
    {"epoch": 0.004706436051300153, "grad_norm": 0.8275871872901917, "learning_rate": 2.5e-06, "loss": 8.7838, "step": 10},
    {"epoch": 0.005177079656430168, "grad_norm": 0.5895422101020813, "learning_rate": 2.7500000000000004e-06, "loss": 9.5706, "step": 11},
    {"epoch": 0.0056477232615601836, "grad_norm": 0.9113247394561768, "learning_rate": 3e-06, "loss": 8.9198, "step": 12},
    {"epoch": 0.006118366866690199, "grad_norm": 0.7459664940834045, "learning_rate": 3.2500000000000002e-06, "loss": 9.2372, "step": 13},
    {"epoch": 0.006589010471820214, "grad_norm": 0.6556370854377747, "learning_rate": 3.5000000000000004e-06, "loss": 9.1809, "step": 14},
    {"epoch": 0.0070596540769502295, "grad_norm": 0.719078540802002, "learning_rate": 3.75e-06, "loss": 9.2422, "step": 15},
    {"epoch": 0.007530297682080245, "grad_norm": 0.8138344287872314, "learning_rate": 4.000000000000001e-06, "loss": 9.271, "step": 16},
    {"epoch": 0.00800094128721026, "grad_norm": 0.7246189713478088, "learning_rate": 4.250000000000001e-06, "loss": 9.5405, "step": 17},
    {"epoch": 0.008471584892340275, "grad_norm": 0.8132815361022949, "learning_rate": 4.5e-06, "loss": 9.7983, "step": 18},
    {"epoch": 0.00894222849747029, "grad_norm": 0.5946951508522034, "learning_rate": 4.75e-06, "loss": 9.733, "step": 19},
    {"epoch": 0.009412872102600306, "grad_norm": 0.5157704949378967, "learning_rate": 5e-06, "loss": 9.6086, "step": 20},
    {"epoch": 0.009883515707730321, "grad_norm": 0.5629891157150269, "learning_rate": 5.25e-06, "loss": 9.2102, "step": 21},
    {"epoch": 0.010354159312860337, "grad_norm": 0.48590287566185, "learning_rate": 5.500000000000001e-06, "loss": 9.7732, "step": 22},
    {"epoch": 0.010824802917990352, "grad_norm": 0.5960127711296082, "learning_rate": 5.750000000000001e-06, "loss": 9.3421, "step": 23},
    {"epoch": 0.011295446523120367, "grad_norm": 0.48235076665878296, "learning_rate": 6e-06, "loss": 9.5374, "step": 24},
    {"epoch": 0.011766090128250382, "grad_norm": 0.4856416881084442, "learning_rate": 6.25e-06, "loss": 9.2162, "step": 25},
    {"epoch": 0.012236733733380398, "grad_norm": 0.45604783296585083, "learning_rate": 6.5000000000000004e-06, "loss": 9.3802, "step": 26},
    {"epoch": 0.012707377338510413, "grad_norm": 0.4940997064113617, "learning_rate": 6.750000000000001e-06, "loss": 8.9352, "step": 27},
    {"epoch": 0.013178020943640428, "grad_norm": 0.5067102909088135, "learning_rate": 7.000000000000001e-06, "loss": 9.6871, "step": 28},
    {"epoch": 0.013648664548770444, "grad_norm": 0.5070438385009766, "learning_rate": 7.25e-06, "loss": 9.1244, "step": 29},
    {"epoch": 0.014119308153900459, "grad_norm": 0.47256559133529663, "learning_rate": 7.5e-06, "loss": 9.6139, "step": 30},
    {"epoch": 0.014589951759030474, "grad_norm": 0.6668869853019714, "learning_rate": 7.75e-06, "loss": 8.9173, "step": 31},
    {"epoch": 0.01506059536416049, "grad_norm": 0.7926103472709656, "learning_rate": 8.000000000000001e-06, "loss": 8.8604, "step": 32},
    {"epoch": 0.015531238969290505, "grad_norm": 0.4389215409755707, "learning_rate": 8.25e-06, "loss": 9.42, "step": 33},
    {"epoch": 0.01600188257442052, "grad_norm": 0.527125895023346, "learning_rate": 8.500000000000002e-06, "loss": 9.5552, "step": 34},
    {"epoch": 0.016472526179550535, "grad_norm": 0.5376142263412476, "learning_rate": 8.75e-06, "loss": 9.1412, "step": 35},
    {"epoch": 0.01694316978468055, "grad_norm": 0.4762144386768341, "learning_rate": 9e-06, "loss": 9.2153, "step": 36},
    {"epoch": 0.017413813389810566, "grad_norm": 0.46567338705062866, "learning_rate": 9.25e-06, "loss": 9.3836, "step": 37},
    {"epoch": 0.01788445699494058, "grad_norm": 0.4322827458381653, "learning_rate": 9.5e-06, "loss": 8.9984, "step": 38},
    {"epoch": 0.018355100600070597, "grad_norm": 0.42570286989212036, "learning_rate": 9.750000000000002e-06, "loss": 9.0916, "step": 39},
    {"epoch": 0.018825744205200612, "grad_norm": 0.43363815546035767, "learning_rate": 1e-05, "loss": 9.0663, "step": 40},
    {"epoch": 0.019296387810330627, "grad_norm": 0.3969482481479645, "learning_rate": 1.025e-05, "loss": 9.4064, "step": 41},
    {"epoch": 0.019767031415460642, "grad_norm": 0.4335750639438629, "learning_rate": 1.05e-05, "loss": 9.262, "step": 42},
    {"epoch": 0.020237675020590658, "grad_norm": 0.4210178852081299, "learning_rate": 1.075e-05, "loss": 9.4898, "step": 43},
    {"epoch": 0.020708318625720673, "grad_norm": 0.39311668276786804, "learning_rate": 1.1000000000000001e-05, "loss": 9.7063, "step": 44},
    {"epoch": 0.02117896223085069, "grad_norm": 0.39521753787994385, "learning_rate": 1.125e-05, "loss": 9.3065, "step": 45},
    {"epoch": 0.021649605835980704, "grad_norm": 0.42978909611701965, "learning_rate": 1.1500000000000002e-05, "loss": 8.9722, "step": 46},
    {"epoch": 0.02212024944111072, "grad_norm": 0.47351160645484924, "learning_rate": 1.175e-05, "loss": 8.9028, "step": 47},
    {"epoch": 0.022590893046240734, "grad_norm": 0.4192260801792145, "learning_rate": 1.2e-05, "loss": 8.913, "step": 48},
    {"epoch": 0.02306153665137075, "grad_norm": 0.42306703329086304, "learning_rate": 1.225e-05, "loss": 9.3223, "step": 49},
    {"epoch": 0.023532180256500765, "grad_norm": 0.40158239006996155, "learning_rate": 1.25e-05, "loss": 9.5922, "step": 50},
    {"epoch": 0.02400282386163078, "grad_norm": 0.5165021419525146, "learning_rate": 1.2750000000000002e-05, "loss": 9.24, "step": 51},
    {"epoch": 0.024473467466760795, "grad_norm": 0.3930136263370514, "learning_rate": 1.3000000000000001e-05, "loss": 8.7955, "step": 52},
    {"epoch": 0.02494411107189081, "grad_norm": 0.3975488543510437, "learning_rate": 1.3250000000000002e-05, "loss": 8.7474, "step": 53},
    {"epoch": 0.025414754677020826, "grad_norm": 0.46201732754707336, "learning_rate": 1.3500000000000001e-05, "loss": 9.1239, "step": 54},
    {"epoch": 0.02588539828215084, "grad_norm": 0.42599615454673767, "learning_rate": 1.3750000000000002e-05, "loss": 9.2889, "step": 55},
    {"epoch": 0.026356041887280857, "grad_norm": 0.3889259994029999, "learning_rate": 1.4000000000000001e-05, "loss": 9.5315, "step": 56},
    {"epoch": 0.026826685492410872, "grad_norm": 0.3762259781360626, "learning_rate": 1.4249999999999999e-05, "loss": 9.3968, "step": 57},
    {"epoch": 0.027297329097540887, "grad_norm": 0.4486519396305084, "learning_rate": 1.45e-05, "loss": 9.2345, "step": 58},
    {"epoch": 0.027767972702670903, "grad_norm": 0.43613263964653015, "learning_rate": 1.475e-05, "loss": 9.293, "step": 59},
    {"epoch": 0.028238616307800918, "grad_norm": 0.40770891308784485, "learning_rate": 1.5e-05, "loss": 8.9544, "step": 60},
    {"epoch": 0.028709259912930933, "grad_norm": 0.36603429913520813, "learning_rate": 1.525e-05, "loss": 9.5768, "step": 61},
    {"epoch": 0.02917990351806095, "grad_norm": 0.41165047883987427, "learning_rate": 1.55e-05, "loss": 9.0203, "step": 62},
    {"epoch": 0.029650547123190964, "grad_norm": 0.4514125883579254, "learning_rate": 1.575e-05, "loss": 9.2653, "step": 63},
    {"epoch": 0.03012119072832098, "grad_norm": 0.41333243250846863, "learning_rate": 1.6000000000000003e-05, "loss": 8.9577, "step": 64},
    {"epoch": 0.030591834333450994, "grad_norm": 0.42950087785720825, "learning_rate": 1.6250000000000002e-05, "loss": 9.341, "step": 65},
    {"epoch": 0.03106247793858101, "grad_norm": 0.4158640205860138, "learning_rate": 1.65e-05, "loss": 9.6118, "step": 66},
    {"epoch": 0.031533121543711025, "grad_norm": 0.39954355359077454, "learning_rate": 1.675e-05, "loss": 9.0818, "step": 67},
    {"epoch": 0.03200376514884104, "grad_norm": 0.38233450055122375, "learning_rate": 1.7000000000000003e-05, "loss": 9.3953, "step": 68},
    {"epoch": 0.032474408753971055, "grad_norm": 0.37950408458709717, "learning_rate": 1.725e-05, "loss": 9.3594, "step": 69},
    {"epoch": 0.03294505235910107, "grad_norm": 0.475953608751297, "learning_rate": 1.75e-05, "loss": 9.0956, "step": 70},
    {"epoch": 0.033415695964231086, "grad_norm": 0.4252181947231293, "learning_rate": 1.775e-05, "loss": 9.1928, "step": 71},
    {"epoch": 0.0338863395693611, "grad_norm": 0.3946019411087036, "learning_rate": 1.8e-05, "loss": 9.1933, "step": 72},
    {"epoch": 0.03435698317449112, "grad_norm": 0.4342809021472931, "learning_rate": 1.825e-05, "loss": 9.2859, "step": 73},
    {"epoch": 0.03482762677962113, "grad_norm": 0.3921419084072113, "learning_rate": 1.85e-05, "loss": 9.1214, "step": 74},
    {"epoch": 0.03529827038475115, "grad_norm": 0.3992595374584198, "learning_rate": 1.8750000000000002e-05, "loss": 9.332, "step": 75},
    {"epoch": 0.03576891398988116, "grad_norm": 0.40269696712493896, "learning_rate": 1.9e-05, "loss": 9.4244, "step": 76},
    {"epoch": 0.03623955759501118, "grad_norm": 0.41852205991744995, "learning_rate": 1.925e-05, "loss": 9.3765, "step": 77},
    {"epoch": 0.03671020120014119, "grad_norm": 0.5162649750709534, "learning_rate": 1.9500000000000003e-05, "loss": 8.3471, "step": 78},
    {"epoch": 0.03718084480527121, "grad_norm": 0.4802299737930298, "learning_rate": 1.9750000000000002e-05, "loss": 9.3251, "step": 79},
    {"epoch": 0.037651488410401224, "grad_norm": 0.4261873960494995, "learning_rate": 2e-05, "loss": 9.5181, "step": 80},
    {"epoch": 0.03812213201553124, "grad_norm": 0.4193435311317444, "learning_rate": 2.025e-05, "loss": 9.4217, "step": 81},
    {"epoch": 0.038592775620661254, "grad_norm": 0.4148464798927307, "learning_rate": 2.05e-05, "loss": 8.7618, "step": 82},
    {"epoch": 0.03906341922579127, "grad_norm": 0.4396406412124634, "learning_rate": 2.075e-05, "loss": 9.4059, "step": 83},
    {"epoch": 0.039534062830921285, "grad_norm": 0.43215858936309814, "learning_rate": 2.1e-05, "loss": 9.0061, "step": 84},
    {"epoch": 0.0400047064360513, "grad_norm": 0.4347785711288452, "learning_rate": 2.125e-05, "loss": 8.5384, "step": 85},
    {"epoch": 0.040475350041181316, "grad_norm": 0.47068068385124207, "learning_rate": 2.15e-05, "loss": 9.2299, "step": 86},
    {"epoch": 0.04094599364631133, "grad_norm": 0.44863706827163696, "learning_rate": 2.175e-05, "loss": 8.7932, "step": 87},
    {"epoch": 0.041416637251441346, "grad_norm": 0.4525277316570282, "learning_rate": 2.2000000000000003e-05, "loss": 9.1699, "step": 88},
    {"epoch": 0.04188728085657136, "grad_norm": 0.41207849979400635, "learning_rate": 2.2250000000000002e-05, "loss": 9.4979, "step": 89},
    {"epoch": 0.04235792446170138, "grad_norm": 0.4179534912109375, "learning_rate": 2.25e-05, "loss": 9.1519, "step": 90},
    {"epoch": 0.04282856806683139, "grad_norm": 0.472789466381073, "learning_rate": 2.275e-05, "loss": 9.1048, "step": 91},
    {"epoch": 0.04329921167196141, "grad_norm": 0.44435739517211914, "learning_rate": 2.3000000000000003e-05, "loss": 9.2816, "step": 92},
    {"epoch": 0.04376985527709142, "grad_norm": 0.41012299060821533, "learning_rate": 2.3250000000000003e-05, "loss": 9.4546, "step": 93},
    {"epoch": 0.04424049888222144, "grad_norm": 0.4100490212440491, "learning_rate": 2.35e-05, "loss": 9.4397, "step": 94},
    {"epoch": 0.04471114248735145, "grad_norm": 0.4229314923286438, "learning_rate": 2.375e-05, "loss": 8.9033, "step": 95},
    {"epoch": 0.04518178609248147, "grad_norm": 0.39841172099113464, "learning_rate": 2.4e-05, "loss": 9.3391, "step": 96},
    {"epoch": 0.045652429697611484, "grad_norm": 0.4041540324687958, "learning_rate": 2.425e-05, "loss": 9.3347, "step": 97},
    {"epoch": 0.0461230733027415, "grad_norm": 0.4046013653278351, "learning_rate": 2.45e-05, "loss": 9.4645, "step": 98},
    {"epoch": 0.046593716907871514, "grad_norm": 0.3989504277706146, "learning_rate": 2.4750000000000002e-05, "loss": 9.2343, "step": 99},
    {"epoch": 0.04706436051300153, "grad_norm": 0.41768062114715576, "learning_rate": 2.5e-05, "loss": 9.6114, "step": 100},
    {"epoch": 0.047535004118131545, "grad_norm": 0.4360901713371277, "learning_rate": 2.525e-05, "loss": 9.3584, "step": 101},
    {"epoch": 0.04800564772326156, "grad_norm": 0.5093626976013184, "learning_rate": 2.5500000000000003e-05, "loss": 9.3969, "step": 102},
    {"epoch": 0.048476291328391576, "grad_norm": 0.5148160457611084, "learning_rate": 2.5750000000000002e-05, "loss": 9.3607, "step": 103},
    {"epoch": 0.04894693493352159, "grad_norm": 0.4556065797805786, "learning_rate": 2.6000000000000002e-05, "loss": 8.6494, "step": 104},
    {"epoch": 0.049417578538651606, "grad_norm": 0.48136287927627563, "learning_rate": 2.625e-05, "loss": 8.8816, "step": 105},
    {"epoch": 0.04988822214378162, "grad_norm": 0.4007977247238159, "learning_rate": 2.6500000000000004e-05, "loss": 9.0173, "step": 106},
    {"epoch": 0.05035886574891164, "grad_norm": 0.5088827610015869, "learning_rate": 2.6750000000000003e-05, "loss": 9.4898, "step": 107},
    {"epoch": 0.05082950935404165, "grad_norm": 0.4222247898578644, "learning_rate": 2.7000000000000002e-05, "loss": 9.5039, "step": 108},
    {"epoch": 0.05130015295917167, "grad_norm": 0.42676958441734314, "learning_rate": 2.725e-05, "loss": 9.3007, "step": 109},
    {"epoch": 0.05177079656430168, "grad_norm": 0.4315201938152313, "learning_rate": 2.7500000000000004e-05, "loss": 9.1473, "step": 110},
    {"epoch": 0.0522414401694317, "grad_norm": 0.5586130619049072, "learning_rate": 2.7750000000000004e-05, "loss": 9.486, "step": 111},
    {"epoch": 0.05271208377456171, "grad_norm": 0.4153185486793518, "learning_rate": 2.8000000000000003e-05, "loss": 9.2632, "step": 112},
    {"epoch": 0.05318272737969173, "grad_norm": 0.47736650705337524, "learning_rate": 2.825e-05, "loss": 8.9582, "step": 113},
    {"epoch": 0.053653370984821744, "grad_norm": 0.4127710163593292, "learning_rate": 2.8499999999999998e-05, "loss": 9.3019, "step": 114},
    {"epoch": 0.05412401458995176, "grad_norm": 0.44509121775627136, "learning_rate": 2.8749999999999997e-05, "loss": 9.1081, "step": 115},
    {"epoch": 0.054594658195081774, "grad_norm": 0.4519471526145935, "learning_rate": 2.9e-05, "loss": 9.4795, "step": 116},
    {"epoch": 0.05506530180021179, "grad_norm": 0.4292161464691162, "learning_rate": 2.925e-05, "loss": 9.2027, "step": 117},
    {"epoch": 0.055535945405341805, "grad_norm": 0.46465009450912476, "learning_rate": 2.95e-05, "loss": 9.081, "step": 118},
    {"epoch": 0.05600658901047182, "grad_norm": 0.4395250976085663, "learning_rate": 2.975e-05, "loss": 9.4345, "step": 119},
    {"epoch": 0.056477232615601836, "grad_norm": 0.4673008918762207, "learning_rate": 3e-05, "loss": 9.3435, "step": 120},
    {"epoch": 0.05694787622073185, "grad_norm": 0.4328051209449768, "learning_rate": 3.025e-05, "loss": 8.7147, "step": 121},
    {"epoch": 0.057418519825861866, "grad_norm": 0.444002240896225, "learning_rate": 3.05e-05, "loss": 8.8049, "step": 122},
    {"epoch": 0.05788916343099188, "grad_norm": 0.4078370928764343, "learning_rate": 3.075e-05, "loss": 9.1032, "step": 123},
    {"epoch": 0.0583598070361219, "grad_norm": 0.4445233941078186, "learning_rate": 3.1e-05, "loss": 9.279, "step": 124},
    {"epoch": 0.05883045064125191, "grad_norm": 0.4282757639884949, "learning_rate": 3.125e-05, "loss": 9.4163, "step": 125},
    {"epoch": 0.05930109424638193, "grad_norm": 0.41878628730773926, "learning_rate": 3.15e-05, "loss": 8.9876, "step": 126},
    {"epoch": 0.05977173785151194, "grad_norm": 0.6357080340385437, "learning_rate": 3.175e-05, "loss": 8.4245, "step": 127},
    {"epoch": 0.06024238145664196, "grad_norm": 0.4595104455947876, "learning_rate": 3.2000000000000005e-05, "loss": 9.1227, "step": 128},
    {"epoch": 0.06071302506177197, "grad_norm": 1.0947221517562866, "learning_rate": 3.2250000000000005e-05, "loss": 8.6819, "step": 129},
    {"epoch": 0.06118366866690199, "grad_norm": 0.43211594223976135, "learning_rate": 3.2500000000000004e-05, "loss": 9.1862, "step": 130},
    {"epoch": 0.061654312272032004, "grad_norm": 0.4080043137073517, "learning_rate": 3.275e-05, "loss": 9.0489, "step": 131},
    {"epoch": 0.06212495587716202, "grad_norm": 0.48265427350997925, "learning_rate": 3.3e-05, "loss": 9.257, "step": 132},
    {"epoch": 0.06259559948229203, "grad_norm": 0.45756152272224426, "learning_rate": 3.325e-05, "loss": 8.9598, "step": 133},
    {"epoch": 0.06306624308742205, "grad_norm": 0.3848661780357361, "learning_rate": 3.35e-05, "loss": 9.5542, "step": 134},
    {"epoch": 0.06353688669255206, "grad_norm": 0.43142908811569214, "learning_rate": 3.375000000000001e-05, "loss": 9.0434, "step": 135},
    {"epoch": 0.06400753029768208, "grad_norm": 0.39845573902130127, "learning_rate": 3.4000000000000007e-05, "loss": 9.7228, "step": 136},
    {"epoch": 0.06447817390281209, "grad_norm": 0.4854653775691986, "learning_rate": 3.4250000000000006e-05, "loss": 8.9226, "step": 137},
    {"epoch": 0.06494881750794211, "grad_norm": 0.41691291332244873, "learning_rate": 3.45e-05, "loss": 9.4588, "step": 138},
    {"epoch": 0.06541946111307212, "grad_norm": 0.41709139943122864, "learning_rate": 3.475e-05, "loss": 8.9146, "step": 139},
    {"epoch": 0.06589010471820214, "grad_norm": 0.3843998312950134, "learning_rate": 3.5e-05, "loss": 8.9889, "step": 140},
    {"epoch": 0.06636074832333215, "grad_norm": 0.4418933391571045, "learning_rate": 3.525e-05, "loss": 9.3688, "step": 141},
    {"epoch": 0.06683139192846217, "grad_norm": 0.3844826817512512, "learning_rate": 3.55e-05, "loss": 9.2518, "step": 142},
    {"epoch": 0.06730203553359218, "grad_norm": 0.4951348900794983, "learning_rate": 3.575e-05, "loss": 8.9785, "step": 143},
    {"epoch": 0.0677726791387222, "grad_norm": 0.475685179233551, "learning_rate": 3.6e-05, "loss": 9.0013, "step": 144},
    {"epoch": 0.06824332274385221, "grad_norm": 0.5578158497810364, "learning_rate": 3.625e-05, "loss": 8.9177, "step": 145},
    {"epoch": 0.06871396634898223, "grad_norm": 0.6955916881561279, "learning_rate": 3.65e-05, "loss": 8.9298, "step": 146},
    {"epoch": 0.06918460995411224, "grad_norm": 0.4071875810623169, "learning_rate": 3.675e-05, "loss": 9.1422, "step": 147},
    {"epoch": 0.06965525355924226, "grad_norm": 0.49543336033821106, "learning_rate": 3.7e-05, "loss": 9.4138, "step": 148},
    {"epoch": 0.07012589716437227, "grad_norm": 0.4391457438468933, "learning_rate": 3.7250000000000004e-05, "loss": 9.3566, "step": 149},
    {"epoch": 0.0705965407695023, "grad_norm": 0.4311358630657196, "learning_rate": 3.7500000000000003e-05, "loss": 8.6678, "step": 150},
    {"epoch": 0.0710671843746323, "grad_norm": 0.4233754873275757, "learning_rate": 3.775e-05, "loss": 8.9541, "step": 151},
    {"epoch": 0.07153782797976233, "grad_norm": 0.4653347432613373, "learning_rate": 3.8e-05, "loss": 8.953, "step": 152},
    {"epoch": 0.07200847158489233, "grad_norm": 0.4828343689441681, "learning_rate": 3.825e-05, "loss": 8.9577, "step": 153},
    {"epoch": 0.07247911519002236, "grad_norm": 0.43757960200309753, "learning_rate": 3.85e-05, "loss": 9.2349, "step": 154},
    {"epoch": 0.07294975879515236, "grad_norm": 0.4094442129135132, "learning_rate": 3.875e-05, "loss": 9.424, "step": 155},
    {"epoch": 0.07342040240028239, "grad_norm": 0.536808967590332, "learning_rate": 3.9000000000000006e-05, "loss": 8.9437, "step": 156},
    {"epoch": 0.0738910460054124, "grad_norm": 0.4084169268608093, "learning_rate": 3.9250000000000005e-05, "loss": 9.5204, "step": 157},
    {"epoch": 0.07436168961054242, "grad_norm": 0.4906410574913025, "learning_rate": 3.9500000000000005e-05, "loss": 9.0682, "step": 158},
    {"epoch": 0.07483233321567243, "grad_norm": 0.42850637435913086, "learning_rate": 3.9750000000000004e-05, "loss": 9.0241, "step": 159},
    {"epoch": 0.07530297682080245, "grad_norm": 0.3832900822162628, "learning_rate": 4e-05, "loss": 9.4956, "step": 160},
    {"epoch": 0.07577362042593246, "grad_norm": 0.39132505655288696, "learning_rate": 4.025e-05, "loss": 9.4623, "step": 161},
    {"epoch": 0.07624426403106248, "grad_norm": 0.44959893822669983, "learning_rate": 4.05e-05, "loss": 9.0518, "step": 162},
    {"epoch": 0.07671490763619249, "grad_norm": 0.41552799940109253, "learning_rate": 4.075e-05, "loss": 9.1268, "step": 163},
    {"epoch": 0.07718555124132251, "grad_norm": 0.42259296774864197, "learning_rate": 4.1e-05, "loss": 9.1533, "step": 164},
    {"epoch": 0.07765619484645252, "grad_norm": 0.4441682994365692, "learning_rate": 4.125e-05, "loss": 8.7568, "step": 165},
    {"epoch": 0.07812683845158254, "grad_norm": 0.42241615056991577, "learning_rate": 4.15e-05, "loss": 9.3366, "step": 166},
    {"epoch": 0.07859748205671255, "grad_norm": 0.3997664153575897, "learning_rate": 4.175e-05, "loss": 8.855, "step": 167},
    {"epoch": 0.07906812566184257, "grad_norm": 0.4293980300426483, "learning_rate": 4.2e-05, "loss": 8.9744, "step": 168},
    {"epoch": 0.07953876926697258, "grad_norm": 0.4279899001121521, "learning_rate": 4.2250000000000004e-05, "loss": 9.0692, "step": 169},
    {"epoch": 0.0800094128721026, "grad_norm": 0.4207955002784729, "learning_rate": 4.25e-05, "loss": 8.8506, "step": 170},
    {"epoch": 0.08048005647723261, "grad_norm": 0.41057008504867554, "learning_rate": 4.275e-05, "loss": 9.2402, "step": 171},
    {"epoch": 0.08095070008236263, "grad_norm": 0.4556719660758972, "learning_rate": 4.3e-05, "loss": 9.3806, "step": 172},
    {"epoch": 0.08142134368749264, "grad_norm": 0.4468841850757599, "learning_rate": 4.325e-05, "loss": 9.0331, "step": 173},
    {"epoch": 0.08189198729262266, "grad_norm": 0.4206986725330353, "learning_rate": 4.35e-05, "loss": 8.6767, "step": 174},
    {"epoch": 0.08236263089775267, "grad_norm": 0.42576491832733154, "learning_rate": 4.375e-05, "loss": 8.7183, "step": 175},
    {"epoch": 0.08283327450288269, "grad_norm": 0.4180700182914734, "learning_rate": 4.4000000000000006e-05, "loss": 8.8461, "step": 176},
    {"epoch": 0.0833039181080127, "grad_norm": 0.3981553614139557, "learning_rate": 4.4250000000000005e-05, "loss": 8.9324, "step": 177},
    {"epoch": 0.08377456171314272, "grad_norm": 0.4038431942462921, "learning_rate": 4.4500000000000004e-05, "loss": 8.7611, "step": 178},
    {"epoch": 0.08424520531827273, "grad_norm": 0.4555639326572418, "learning_rate": 4.4750000000000004e-05, "loss": 8.4839, "step": 179},
    {"epoch": 0.08471584892340275, "grad_norm": 0.39343494176864624, "learning_rate": 4.5e-05, "loss": 9.0263, "step": 180},
    {"epoch": 0.08518649252853276, "grad_norm": 0.4226400852203369, "learning_rate": 4.525e-05, "loss": 8.9829, "step": 181},
    {"epoch": 0.08565713613366278, "grad_norm": 0.3735749125480652, "learning_rate": 4.55e-05, "loss": 9.6609, "step": 182},
    {"epoch": 0.08612777973879279, "grad_norm": 0.4413192868232727, "learning_rate": 4.575e-05, "loss": 9.0126, "step": 183},
    {"epoch": 0.08659842334392281, "grad_norm": 0.3925839364528656, "learning_rate": 4.600000000000001e-05, "loss": 9.2048, "step": 184},
    {"epoch": 0.08706906694905282, "grad_norm": 0.3941839933395386, "learning_rate": 4.6250000000000006e-05, "loss": 9.2662, "step": 185},
    {"epoch": 0.08753971055418285, "grad_norm": 0.47577032446861267, "learning_rate": 4.6500000000000005e-05, "loss": 8.9474, "step": 186},
    {"epoch": 0.08801035415931285, "grad_norm": 0.4306804835796356, "learning_rate": 4.6750000000000005e-05, "loss": 8.8199, "step": 187},
    {"epoch": 0.08848099776444288, "grad_norm": 0.4680851995944977, "learning_rate": 4.7e-05, "loss": 8.7651, "step": 188},
    {"epoch": 0.08895164136957288, "grad_norm": 0.4325461983680725, "learning_rate": 4.7249999999999997e-05, "loss": 9.1391, "step": 189},
    {"epoch": 0.0894222849747029, "grad_norm": 0.7051356434822083, "learning_rate": 4.75e-05, "loss": 8.8018, "step": 190},
    {"epoch": 0.08989292857983291, "grad_norm": 0.37214136123657227, "learning_rate": 4.775e-05, "loss": 9.4374, "step": 191},
    {"epoch": 0.09036357218496294, "grad_norm": 0.4161190688610077, "learning_rate": 4.8e-05, "loss": 9.0213, "step": 192},
    {"epoch": 0.09083421579009295, "grad_norm": 0.39017942547798157, "learning_rate": 4.825e-05, "loss": 9.4081, "step": 193},
    {"epoch": 0.09130485939522297, "grad_norm": 0.3661479353904724, "learning_rate": 4.85e-05, "loss": 9.5162, "step": 194},
    {"epoch": 0.09177550300035298, "grad_norm": 0.4220457077026367, "learning_rate": 4.875e-05, "loss": 8.8268, "step": 195},
    {"epoch": 0.092246146605483, "grad_norm": 0.4123201370239258, "learning_rate": 4.9e-05, "loss": 9.1464, "step": 196},
    {"epoch": 0.092716790210613, "grad_norm": 0.3835439383983612, "learning_rate": 4.9250000000000004e-05, "loss": 9.2391, "step": 197},
    {"epoch": 0.09318743381574303, "grad_norm": 0.3718632459640503, "learning_rate": 4.9500000000000004e-05, "loss": 9.2759, "step": 198},
    {"epoch": 0.09365807742087304, "grad_norm": 0.5267420411109924, "learning_rate": 4.975e-05, "loss": 9.0097, "step": 199},
    {"epoch": 0.09412872102600306, "grad_norm": 0.3542408049106598, "learning_rate": 5e-05, "loss": 9.5282, "step": 200},
    {"epoch": 0.09459936463113307, "grad_norm": 0.40344443917274475, "learning_rate": 4.999999247114854e-05, "loss": 9.3784, "step": 201},
    {"epoch": 0.09507000823626309, "grad_norm": 0.41083309054374695, "learning_rate": 4.999996988459869e-05, "loss": 9.4365, "step": 202},
    {"epoch": 0.0955406518413931, "grad_norm": 0.369400292634964, "learning_rate": 4.9999932240364054e-05, "loss": 9.3167, "step": 203},
    {"epoch": 0.09601129544652312, "grad_norm": 0.36150887608528137, "learning_rate": 4.9999879538467306e-05, "loss": 9.5957, "step": 204},
    {"epoch": 0.09648193905165313, "grad_norm": 0.44035205245018005, "learning_rate": 4.99998117789402e-05, "loss": 8.8501, "step": 205},
    {"epoch": 0.09695258265678315, "grad_norm": 0.42898210883140564, "learning_rate": 4.999972896182352e-05, "loss": 8.8283, "step": 206},
    {"epoch": 0.09742322626191316, "grad_norm": 0.3809720277786255, "learning_rate": 4.999963108716718e-05, "loss": 9.3219, "step": 207},
    {"epoch": 0.09789386986704318, "grad_norm": 0.38228464126586914, "learning_rate": 4.999951815503011e-05, "loss": 9.2669, "step": 208},
    {"epoch": 0.09836451347217319, "grad_norm": 0.3908674120903015, "learning_rate": 4.9999390165480335e-05, "loss": 8.9417, "step": 209},
    {"epoch": 0.09883515707730321, "grad_norm": 0.34623146057128906, "learning_rate": 4.999924711859495e-05, "loss": 9.6014, "step": 210},
    {"epoch": 0.09930580068243322, "grad_norm": 0.3909365236759186, "learning_rate": 4.99990890144601e-05, "loss": 9.1546, "step": 211},
    {"epoch": 0.09977644428756324, "grad_norm": 0.3888709843158722, "learning_rate": 4.999891585317103e-05, "loss": 9.3649, "step": 212},
    {"epoch": 0.10024708789269325, "grad_norm": 0.45398378372192383, "learning_rate": 4.9998727634832024e-05, "loss": 8.9172, "step": 213},
    {"epoch": 0.10071773149782327, "grad_norm": 0.36648306250572205, "learning_rate": 4.9998524359556445e-05, "loss": 9.0638, "step": 214},
    {"epoch": 0.10118837510295328, "grad_norm": 0.37433892488479614, "learning_rate": 4.999830602746673e-05, "loss": 9.3322, "step": 215},
    {"epoch": 0.1016590187080833, "grad_norm": 0.38904431462287903, "learning_rate": 4.99980726386944e-05, "loss": 9.322, "step": 216},
    {"epoch": 0.10212966231321331, "grad_norm": 0.38138681650161743, "learning_rate": 4.9997824193380004e-05, "loss": 9.6177, "step": 217},
    {"epoch": 0.10260030591834333, "grad_norm": 0.39529645442962646, "learning_rate": 4.9997560691673194e-05, "loss": 9.054, "step": 218},
    {"epoch": 0.10307094952347334, "grad_norm": 0.4126908481121063, "learning_rate": 4.999728213373267e-05, "loss": 9.4406, "step": 219},
    {"epoch": 0.10354159312860337, "grad_norm": 0.4137309491634369, "learning_rate": 4.999698851972622e-05, "loss": 9.0403, "step": 220},
    {"epoch": 0.10401223673373337, "grad_norm": 0.4086442291736603, "learning_rate": 4.999667984983069e-05, "loss": 9.3006, "step": 221},
    {"epoch": 0.1044828803388634, "grad_norm": 0.5080444812774658, "learning_rate": 4.999635612423198e-05, "loss": 9.1856, "step": 222},
    {"epoch": 0.1049535239439934, "grad_norm": 0.36199596524238586, "learning_rate": 4.9996017343125085e-05, "loss": 9.3119, "step": 223},
    {"epoch": 0.10542416754912343, "grad_norm": 0.4086923897266388, "learning_rate": 4.9995663506714054e-05, "loss": 9.1335, "step": 224},
    {"epoch": 0.10589481115425343, "grad_norm": 0.42041823267936707, "learning_rate": 4.9995294615212006e-05, "loss": 8.9113, "step": 225},
    {"epoch": 0.10636545475938346, "grad_norm": 0.35369089245796204, "learning_rate": 4.999491066884113e-05, "loss": 9.4732, "step": 226},
    {"epoch": 0.10683609836451347, "grad_norm": 0.8479387164115906, "learning_rate": 4.9994511667832665e-05, "loss": 9.1135, "step": 227},
    {"epoch": 0.10730674196964349, "grad_norm": 0.38847988843917847, "learning_rate": 4.999409761242696e-05, "loss": 9.3632, "step": 228},
    {"epoch": 0.1077773855747735, "grad_norm": 0.43660977482795715, "learning_rate": 4.999366850287337e-05, "loss": 8.6279, "step": 229},
    {"epoch": 0.10824802917990352, "grad_norm": 0.6459296345710754, "learning_rate": 4.999322433943038e-05, "loss": 9.1736, "step": 230},
    {"epoch": 0.10871867278503353, "grad_norm": 0.453952819108963, "learning_rate": 4.99927651223655e-05, "loss": 8.7847, "step": 231},
    {"epoch": 0.10918931639016355, "grad_norm": 0.3641432821750641, "learning_rate": 4.9992290851955325e-05, "loss": 9.1591, "step": 232},
    {"epoch": 0.10965995999529356, "grad_norm": 0.43097686767578125, "learning_rate": 4.999180152848551e-05, "loss": 8.8475, "step": 233},
    {"epoch": 0.11013060360042358, "grad_norm": 0.40101760625839233, "learning_rate": 4.999129715225077e-05, "loss": 9.3003, "step": 234},
    {"epoch": 0.11060124720555359, "grad_norm": 0.38456395268440247, "learning_rate": 4.99907777235549e-05, "loss": 9.0397, "step": 235},
    {"epoch": 0.11107189081068361, "grad_norm": 0.3518768846988678, "learning_rate": 4.9990243242710764e-05, "loss": 9.3619, "step": 236},
    {"epoch": 0.11154253441581362, "grad_norm": 0.43492040038108826, "learning_rate": 4.9989693710040284e-05, "loss": 8.9691, "step": 237},
    {"epoch": 0.11201317802094364, "grad_norm": 0.4434773325920105, "learning_rate": 4.998912912587444e-05, "loss": 8.6355, "step": 238},
    {"epoch": 0.11248382162607365, "grad_norm": 0.4103478193283081, "learning_rate": 4.998854949055328e-05, "loss": 9.0966, "step": 239},
    {"epoch": 0.11295446523120367, "grad_norm": 0.409065842628479, "learning_rate": 4.998795480442595e-05, "loss": 8.9825, "step": 240},
    {"epoch": 0.11342510883633368, "grad_norm": 0.3709560036659241, "learning_rate": 4.9987345067850596e-05, "loss": 9.383, "step": 241},
    {"epoch": 0.1138957524414637, "grad_norm": 0.4049656391143799, "learning_rate": 4.9986720281194496e-05, "loss": 8.8382, "step": 242},
    {"epoch": 0.11436639604659371, "grad_norm": 0.40016597509384155, "learning_rate": 4.998608044483396e-05, "loss": 9.0227, "step": 243},
    {"epoch": 0.11483703965172373, "grad_norm": 0.41628897190093994, "learning_rate": 4.998542555915435e-05, "loss": 9.1208, "step": 244},
    {"epoch": 0.11530768325685374, "grad_norm": 0.37839028239250183, "learning_rate": 4.998475562455013e-05, "loss": 9.2952, "step": 245},
    {"epoch": 0.11577832686198376, "grad_norm": 0.37010782957077026, "learning_rate": 4.99840706414248e-05, "loss": 8.8903, "step": 246},
    {"epoch": 0.11624897046711377, "grad_norm": 0.40624648332595825, "learning_rate": 4.998337061019092e-05, "loss": 9.1322, "step": 247},
    {"epoch": 0.1167196140722438, "grad_norm": 0.330285906791687, "learning_rate": 4.998265553127013e-05, "loss": 9.3509, "step": 248},
    {"epoch": 0.1171902576773738, "grad_norm": 0.4315396249294281, "learning_rate": 4.9981925405093146e-05, "loss": 8.5941, "step": 249},
    {"epoch": 0.11766090128250382, "grad_norm": 0.46557149291038513, "learning_rate": 4.99811802320997e-05, "loss": 8.7841, "step": 250},
    {"epoch": 0.11813154488763383, "grad_norm": 0.40763556957244873, "learning_rate": 4.998042001273864e-05, "loss": 9.0945, "step": 251},
    {"epoch": 0.11860218849276385, "grad_norm": 0.38328826427459717, "learning_rate": 4.9979644747467835e-05, "loss": 9.5115, "step": 252},
    {"epoch": 0.11907283209789386, "grad_norm": 0.3737850487232208, "learning_rate": 4.997885443675424e-05, "loss": 8.6629, "step": 253},
    {"epoch": 0.11954347570302389, "grad_norm": 0.38939982652664185, "learning_rate": 4.997804908107387e-05, "loss": 9.1315, "step": 254},
    {"epoch": 0.1200141193081539, "grad_norm": 0.41033586859703064, "learning_rate": 4.997722868091179e-05, "loss": 8.9948, "step": 255},
    {"epoch": 0.12048476291328392, "grad_norm": 0.4496087431907654, "learning_rate": 4.997639323676214e-05, "loss": 8.7967, "step": 256},
    {"epoch": 0.12095540651841392, "grad_norm": 0.4463037848472595, "learning_rate": 4.997554274912811e-05, "loss": 8.6575, "step": 257},
    {"epoch": 0.12142605012354395, "grad_norm": 0.447477251291275, "learning_rate": 4.997467721852196e-05, "loss": 9.4086, "step": 258},
    {"epoch": 0.12189669372867395, "grad_norm": 0.40504494309425354, "learning_rate": 4.9973796645465e-05, "loss": 9.6567, "step": 259},
    {"epoch": 0.12236733733380398, "grad_norm": 0.4193851351737976, "learning_rate": 4.9972901030487616e-05, "loss": 9.415, "step": 260},
    {"epoch": 0.12283798093893399, "grad_norm": 0.37490740418434143, "learning_rate": 4.997199037412923e-05, "loss": 9.094, "step": 261},
    {"epoch": 0.12330862454406401, "grad_norm": 0.4043318033218384, "learning_rate": 4.997106467693835e-05, "loss": 9.1566, "step": 262},
    {"epoch": 0.12377926814919402, "grad_norm": 0.3795372247695923, "learning_rate": 4.997012393947253e-05, "loss": 9.5975, "step": 263},
    {"epoch": 0.12424991175432404, "grad_norm": 0.38997772336006165, "learning_rate": 4.996916816229837e-05, "loss": 9.3275, "step": 264},
    {"epoch": 0.12472055535945405, "grad_norm": 0.41787171363830566, "learning_rate": 4.9968197345991565e-05, "loss": 8.9184, "step": 265},
    {"epoch": 0.12519119896458406, "grad_norm": 0.4403538703918457, "learning_rate": 4.996721149113682e-05, "loss": 9.0055, "step": 266},
    {"epoch": 0.12566184256971408, "grad_norm": 0.44756266474723816, "learning_rate": 4.996621059832795e-05, "loss": 9.0517, "step": 267},
    {"epoch": 0.1261324861748441, "grad_norm": 0.3958662748336792, "learning_rate": 4.996519466816778e-05, "loss": 9.1983, "step": 268},
    {"epoch": 0.12660312977997412, "grad_norm": 0.5548920035362244, "learning_rate": 4.9964163701268224e-05, "loss": 9.0239, "step": 269},
    {"epoch": 0.12707377338510412, "grad_norm": 0.38231074810028076, "learning_rate": 4.996311769825024e-05, "loss": 9.4057, "step": 270},
    {"epoch": 0.12754441699023414, "grad_norm": 0.37411412596702576, "learning_rate": 4.996205665974384e-05, "loss": 9.147, "step": 271},
    {"epoch": 0.12801506059536416, "grad_norm": 0.36638572812080383, "learning_rate": 4.996098058638809e-05, "loss": 9.3312, "step": 272},
    {"epoch": 0.12848570420049418, "grad_norm": 0.36364972591400146, "learning_rate": 4.995988947883114e-05, "loss": 9.4873, "step": 273},
    {"epoch": 0.12895634780562418, "grad_norm": 0.415054053068161, "learning_rate": 4.9958783337730156e-05, "loss": 9.0241, "step": 274},
    {"epoch": 0.1294269914107542, "grad_norm": 0.616145133972168, "learning_rate": 4.995766216375137e-05, "loss": 9.1209, "step": 275},
    {"epoch": 0.12989763501588422, "grad_norm": 0.3728233575820923, "learning_rate": 4.9956525957570086e-05, "loss": 9.5214, "step": 276},
    {"epoch": 0.13036827862101424, "grad_norm": 0.4377942681312561, "learning_rate": 4.995537471987066e-05, "loss": 8.7668, "step": 277},
    {"epoch": 0.13083892222614424, "grad_norm": 0.4865539073944092, "learning_rate": 4.9954208451346465e-05, "loss": 8.8752, "step": 278},
    {"epoch": 0.13130956583127426, "grad_norm": 0.4728136658668518, "learning_rate": 4.995302715269997e-05, "loss": 9.0947, "step": 279},
    {"epoch": 0.13178020943640428, "grad_norm": 0.40794286131858826, "learning_rate": 4.995183082464269e-05, "loss": 8.9566, "step": 280},
    {"epoch": 0.1322508530415343, "grad_norm": 0.35321590304374695, "learning_rate": 4.995061946789516e-05, "loss": 9.4166, "step": 281},
    {"epoch": 0.1327214966466643, "grad_norm": 0.41053611040115356, "learning_rate": 4.9949393083187005e-05, "loss": 9.0913, "step": 282},
    {"epoch": 0.13319214025179432, "grad_norm": 0.4475056231021881, "learning_rate": 4.9948151671256883e-05, "loss": 8.422, "step": 283},
    {"epoch": 0.13366278385692434, "grad_norm": 0.34866318106651306, "learning_rate": 4.994689523285251e-05, "loss": 9.2168, "step": 284},
    {"epoch": 0.13413342746205437, "grad_norm": 0.4374255836009979, "learning_rate": 4.994562376873064e-05, "loss": 8.9508, "step": 285},
    {"epoch": 0.13460407106718436, "grad_norm": 0.38839930295944214, "learning_rate": 4.9944337279657106e-05, "loss": 8.8695, "step": 286},
    {"epoch": 0.13507471467231438, "grad_norm": 0.4352591335773468, "learning_rate": 4.994303576640674e-05, "loss": 8.7637, "step": 287},
    {"epoch": 0.1355453582774444, "grad_norm": 0.36577296257019043, "learning_rate": 4.994171922976348e-05, "loss": 9.4622, "step": 288},
    {"epoch": 0.13601600188257443, "grad_norm": 0.3764691650867462, "learning_rate": 4.994038767052028e-05, "loss": 9.3536, "step": 289},
    {"epoch": 0.13648664548770442, "grad_norm": 0.3795958161354065, "learning_rate": 4.993904108947914e-05, "loss": 8.9066, "step": 290},
    {"epoch": 0.13695728909283444, "grad_norm": 0.42235082387924194, "learning_rate": 4.993767948745113e-05, "loss": 9.168, "step": 291},
    {"epoch": 0.13742793269796447, "grad_norm": 0.41240936517715454, "learning_rate": 4.993630286525634e-05, "loss": 8.8015, "step": 292},
    {"epoch": 0.1378985763030945, "grad_norm": 0.40508440136909485, "learning_rate": 4.993491122372394e-05, "loss": 8.9218, "step": 293},
    {"epoch": 0.13836921990822448, "grad_norm": 0.44761571288108826, "learning_rate": 4.99335045636921e-05, "loss": 8.9542, "step": 294},
    {"epoch": 0.1388398635133545, "grad_norm": 0.35136064887046814, "learning_rate": 4.993208288600808e-05, "loss": 9.0036, "step": 295},
    {"epoch": 0.13931050711848453, "grad_norm": 0.3560550808906555, "learning_rate": 4.9930646191528175e-05, "loss": 9.5513, "step": 296},
    {"epoch": 0.13978115072361455, "grad_norm": 0.40760746598243713, "learning_rate": 4.99291944811177e-05, "loss": 9.1574, "step": 297},
    {"epoch": 0.14025179432874454, "grad_norm": 0.4152514338493347, "learning_rate": 4.992772775565104e-05, "loss": 8.9221, "step": 298},
    {"epoch": 0.14072243793387457, "grad_norm": 0.36200031638145447, "learning_rate": 4.992624601601162e-05, "loss": 9.2766, "step": 299},
    {"epoch": 0.1411930815390046, "grad_norm": 0.3931048512458801, "learning_rate": 4.992474926309191e-05, "loss": 9.0796, "step": 300},
    {"epoch": 0.1416637251441346, "grad_norm": 0.3852521777153015, "learning_rate": 4.992323749779339e-05, "loss": 8.9804, "step": 301},
    {"epoch": 0.1421343687492646, "grad_norm": 0.42558741569519043, "learning_rate": 4.992171072102663e-05, "loss": 8.6188, "step": 302},
    {"epoch": 0.14260501235439463, "grad_norm": 0.40560707449913025, "learning_rate": 4.992016893371122e-05, "loss": 9.2215, "step": 303},
    {"epoch": 0.14307565595952465, "grad_norm": 0.3654381334781647, "learning_rate": 4.9918612136775776e-05, "loss": 9.6141, "step": 304},
    {"epoch": 0.14354629956465467, "grad_norm": 0.3547174632549286, "learning_rate": 4.9917040331157986e-05, "loss": 9.4322, "step": 305},
    {"epoch": 0.14401694316978467, "grad_norm": 0.3975953161716461, "learning_rate": 4.9915453517804554e-05, "loss": 9.0455, "step": 306},
    {"epoch": 0.1444875867749147, "grad_norm": 0.4045639932155609, "learning_rate": 4.991385169767123e-05, "loss": 8.6646, "step": 307},
    {"epoch": 0.1449582303800447, "grad_norm": 0.39949241280555725, "learning_rate": 4.9912234871722805e-05, "loss": 8.9656, "step": 308},
    {"epoch": 0.14542887398517473, "grad_norm": 0.38490548729896545, "learning_rate": 4.9910603040933116e-05, "loss": 9.2289, "step": 309},
    {"epoch": 0.14589951759030473, "grad_norm": 0.38393279910087585, "learning_rate": 4.9908956206285e-05, "loss": 9.5308, "step": 310},
    {"epoch": 0.14637016119543475, "grad_norm": 0.41801533102989197, "learning_rate": 4.990729436877038e-05, "loss": 9.179, "step": 311},
    {"epoch": 0.14684080480056477, "grad_norm": 0.3734685182571411, "learning_rate": 4.9905617529390203e-05, "loss": 9.4323, "step": 312},
    {"epoch": 0.1473114484056948, "grad_norm": 0.38498827815055847, "learning_rate": 4.9903925689154425e-05, "loss": 8.7253, "step": 313},
    {"epoch": 0.1477820920108248, "grad_norm": 0.4148082435131073, "learning_rate": 4.990221884908206e-05, "loss": 9.5291, "step": 314},
    {"epoch": 0.1482527356159548, "grad_norm": 0.3645360469818115, "learning_rate": 4.990049701020115e-05, "loss": 9.3854, "step": 315},
    {"epoch": 0.14872337922108483, "grad_norm": 0.39119553565979004, "learning_rate": 4.989876017354878e-05, "loss": 8.8417, "step": 316},
    {"epoch": 0.14919402282621486, "grad_norm": 0.40799564123153687, "learning_rate": 4.989700834017105e-05, "loss": 9.1028, "step": 317},
    {"epoch": 0.14966466643134485, "grad_norm": 0.36694031953811646, "learning_rate": 4.9895241511123114e-05, "loss": 9.26, "step": 318},
    {"epoch": 0.15013531003647487, "grad_norm": 0.4914778769016266, "learning_rate": 4.989345968746914e-05, "loss": 9.3256, "step": 319},
    {"epoch": 0.1506059536416049, "grad_norm": 0.43579304218292236, "learning_rate": 4.989166287028234e-05, "loss": 8.7753, "step": 320},
    {"epoch": 0.15107659724673492, "grad_norm": 0.37302032113075256, "learning_rate": 4.988985106064495e-05, "loss": 9.3832, "step": 321},
    {"epoch": 0.1515472408518649, "grad_norm": 0.3695763945579529, "learning_rate": 4.988802425964824e-05, "loss": 8.7549, "step": 322},
    {"epoch": 0.15201788445699493, "grad_norm": 0.4146966338157654, "learning_rate": 4.98861824683925e-05, "loss": 8.8819, "step": 323},
    {"epoch": 0.15248852806212496, "grad_norm": 0.36729514598846436, "learning_rate": 4.9884325687987056e-05, "loss": 8.9922, "step": 324},
    {"epoch": 0.15295917166725498, "grad_norm": 0.3997980058193207, "learning_rate": 4.9882453919550264e-05, "loss": 9.0574, "step": 325},
    {"epoch": 0.15342981527238497, "grad_norm": 0.31628280878067017, "learning_rate": 4.9880567164209515e-05, "loss": 9.7555, "step": 326},
    {"epoch": 0.153900458877515, "grad_norm": 0.3956843316555023, "learning_rate": 4.98786654231012e-05, "loss": 9.2441, "step": 327},
    {"epoch": 0.15437110248264502, "grad_norm": 0.399984747171402, "learning_rate": 4.987674869737077e-05, "loss": 9.0811, "step": 328},
    {"epoch": 0.15484174608777504, "grad_norm": 0.40124884247779846, "learning_rate": 4.987481698817268e-05, "loss": 8.7801, "step": 329},
    {"epoch": 0.15531238969290503, "grad_norm": 0.36277976632118225, "learning_rate": 4.98728702966704e-05, "loss": 9.1685, "step": 330},
    {"epoch": 0.15578303329803506, "grad_norm": 0.4415287375450134, "learning_rate": 4.987090862403646e-05, "loss": 8.6159, "step": 331},
    {"epoch": 0.15625367690316508, "grad_norm": 0.4005844295024872, "learning_rate": 4.986893197145237e-05, "loss": 8.7962, "step": 332},
    {"epoch": 0.1567243205082951, "grad_norm": 0.4147176742553711, "learning_rate": 4.9866940340108704e-05, "loss": 9.1667, "step": 333},
    {"epoch": 0.1571949641134251, "grad_norm": 0.5922366976737976, "learning_rate": 4.986493373120502e-05, "loss": 9.1685, "step": 334},
    {"epoch": 0.15766560771855512, "grad_norm": 0.42389023303985596, "learning_rate": 4.986291214594992e-05, "loss": 8.9005, "step": 335},
    {"epoch": 0.15813625132368514, "grad_norm": 3.3356659412384033, "learning_rate": 4.986087558556104e-05, "loss": 8.8868, "step": 336},
    {"epoch": 0.15860689492881516, "grad_norm": 0.3584047853946686, "learning_rate": 4.9858824051264985e-05, "loss": 9.3012, "step": 337},
    {"epoch": 0.15907753853394516, "grad_norm": 0.432365357875824, "learning_rate": 4.985675754429744e-05, "loss": 8.6683, "step": 338},
    {"epoch": 0.15954818213907518, "grad_norm": 0.4141758680343628, "learning_rate": 4.985467606590305e-05, "loss": 8.8902, "step": 339},
    {"epoch": 0.1600188257442052, "grad_norm": 0.5318158268928528, "learning_rate": 4.985257961733553e-05, "loss": 9.3213, "step": 340},
    {"epoch": 0.16048946934933522, "grad_norm": 0.4039144814014435, "learning_rate": 4.985046819985758e-05, "loss": 9.3521, "step": 341},
    {"epoch": 0.16096011295446522, "grad_norm": 0.4055419862270355, "learning_rate": 4.984834181474093e-05, "loss": 9.032, "step": 342},
    {"epoch": 0.16143075655959524, "grad_norm": 0.47234630584716797, "learning_rate": 4.9846200463266304e-05, "loss": 8.9415, "step": 343},
    {"epoch": 0.16190140016472526, "grad_norm": 0.3458828628063202, "learning_rate": 4.984404414672346e-05, "loss": 9.3418, "step": 344},
    {"epoch": 0.16237204376985528, "grad_norm": 0.4208340048789978, "learning_rate": 4.9841872866411175e-05, "loss": 8.5468, "step": 345},
    {"epoch": 0.16284268737498528, "grad_norm": 0.4632960855960846, "learning_rate": 4.983968662363723e-05, "loss": 8.357, "step": 346},
    {"epoch": 0.1633133309801153, "grad_norm": 0.3957667946815491, "learning_rate": 4.98374854197184e-05, "loss": 9.5873, "step": 347},
    {"epoch": 0.16378397458524532, "grad_norm": 0.45077890157699585, "learning_rate": 4.98352692559805e-05, "loss": 8.6973, "step": 348},
    {"epoch": 0.16425461819037535, "grad_norm": 0.36463478207588196, "learning_rate": 4.983303813375833e-05, "loss": 9.1421, "step": 349},
    {"epoch": 0.16472526179550534, "grad_norm": 0.4010748565196991, "learning_rate": 4.983079205439574e-05, "loss": 9.1377, "step": 350},
    {"epoch": 0.16519590540063536, "grad_norm": 0.39440232515335083, "learning_rate": 4.982853101924554e-05, "loss": 8.9753, "step": 351},
    {"epoch": 0.16566654900576538, "grad_norm": 0.4520394504070282, "learning_rate": 4.9826255029669577e-05, "loss": 8.7352, "step": 352},
    {"epoch": 0.1661371926108954, "grad_norm": 0.4330653250217438, "learning_rate": 4.98239640870387e-05, "loss": 9.0555, "step": 353},
    {"epoch": 0.1666078362160254, "grad_norm": 0.47660115361213684, "learning_rate": 4.982165819273275e-05, "loss": 8.6404, "step": 354},
    {"epoch": 0.16707847982115542, "grad_norm": 0.4233279228210449, "learning_rate": 4.98193373481406e-05, "loss": 8.9099, "step": 355},
    {"epoch": 0.16754912342628545, "grad_norm": 0.43518248200416565, "learning_rate": 4.98170015546601e-05, "loss": 8.6882, "step": 356},
    {"epoch": 0.16801976703141547, "grad_norm": 0.3644963800907135, "learning_rate": 4.981465081369814e-05, "loss": 9.2448, "step": 357},
    {"epoch": 0.16849041063654546, "grad_norm": 0.38815975189208984, "learning_rate": 4.981228512667057e-05, "loss": 9.558, "step": 358},
    {"epoch": 0.16896105424167548, "grad_norm": 0.4271330237388611, "learning_rate": 4.980990449500227e-05, "loss": 8.4688, "step": 359},
    {"epoch": 0.1694316978468055, "grad_norm": 0.4300340712070465, "learning_rate": 4.980750892012711e-05, "loss": 8.5112, "step": 360},
    {"epoch": 0.16990234145193553, "grad_norm": 0.3674795627593994, "learning_rate": 4.980509840348796e-05, "loss": 9.1979, "step": 361},
    {"epoch": 0.17037298505706552, "grad_norm": 0.39522647857666016, "learning_rate": 4.980267294653671e-05, "loss": 9.3743, "step": 362},
    {"epoch": 0.17084362866219555, "grad_norm": 0.4358430504798889, "learning_rate": 4.980023255073422e-05, "loss": 9.1216, "step": 363},
    {"epoch": 0.17131427226732557, "grad_norm": 0.40390607714653015, "learning_rate": 4.9797777217550367e-05, "loss": 8.9767, "step": 364},
    {"epoch": 0.1717849158724556, "grad_norm": 0.3644031584262848, "learning_rate": 4.9795306948464e-05, "loss": 9.2284, "step": 365},
    {"epoch": 0.17225555947758558, "grad_norm": 0.41837140917778015, "learning_rate": 4.979282174496302e-05, "loss": 8.8997, "step": 366},
    {"epoch": 0.1727262030827156, "grad_norm": 0.38197219371795654, "learning_rate": 4.979032160854424e-05, "loss": 9.1135, "step": 367},
    {"epoch": 0.17319684668784563, "grad_norm": 0.3703914284706116, "learning_rate": 4.9787806540713546e-05, "loss": 9.499, "step": 368},
    {"epoch": 0.17366749029297565, "grad_norm": 0.5900145769119263, "learning_rate": 4.978527654298576e-05, "loss": 9.6679, "step": 369},
    {"epoch": 0.17413813389810565, "grad_norm": 0.4443458318710327, "learning_rate": 4.9782731616884736e-05, "loss": 8.4039, "step": 370},
    {"epoch": 0.17460877750323567, "grad_norm": 0.31717589497566223, "learning_rate": 4.978017176394331e-05, "loss": 9.7594, "step": 371},
    {"epoch": 0.1750794211083657, "grad_norm": 0.3682294189929962, "learning_rate": 4.977759698570328e-05, "loss": 9.3738, "step": 372},
    {"epoch": 0.1755500647134957, "grad_norm": 0.36333027482032776, "learning_rate": 4.977500728371547e-05, "loss": 9.4728, "step": 373},
    {"epoch": 0.1760207083186257, "grad_norm": 0.38923901319503784, "learning_rate": 4.9772402659539674e-05, "loss": 9.0362, "step": 374},
    {"epoch": 0.17649135192375573, "grad_norm": 0.3548789620399475, "learning_rate": 4.9769783114744686e-05, "loss": 9.4734, "step": 375},
    {"epoch": 0.17696199552888575, "grad_norm": 0.3727724552154541, "learning_rate": 4.976714865090827e-05, "loss": 8.9019, "step": 376},
    {"epoch": 0.17743263913401577, "grad_norm": 0.3825220763683319,
|
"learning_rate": 4.976449926961719e-05, |
|
"loss": 9.4008, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.17790328273914577, |
|
"grad_norm": 0.36432167887687683, |
|
"learning_rate": 4.9761834972467185e-05, |
|
"loss": 9.4614, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.1783739263442758, |
|
"grad_norm": 0.4360719621181488, |
|
"learning_rate": 4.975915576106299e-05, |
|
"loss": 8.9864, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.1788445699494058, |
|
"grad_norm": 0.36198675632476807, |
|
"learning_rate": 4.975646163701831e-05, |
|
"loss": 9.3858, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.17931521355453583, |
|
"grad_norm": 0.3615058362483978, |
|
"learning_rate": 4.9753752601955836e-05, |
|
"loss": 9.4513, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.17978585715966583, |
|
"grad_norm": 0.38385000824928284, |
|
"learning_rate": 4.975102865750725e-05, |
|
"loss": 9.0129, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.18025650076479585, |
|
"grad_norm": 0.42161351442337036, |
|
"learning_rate": 4.9748289805313196e-05, |
|
"loss": 8.8066, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.18072714436992587, |
|
"grad_norm": 0.3863692879676819, |
|
"learning_rate": 4.9745536047023324e-05, |
|
"loss": 9.0613, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.1811977879750559, |
|
"grad_norm": 0.35685333609580994, |
|
"learning_rate": 4.9742767384296216e-05, |
|
"loss": 9.1823, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.1816684315801859, |
|
"grad_norm": 0.4146454930305481, |
|
"learning_rate": 4.973998381879949e-05, |
|
"loss": 9.0627, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.1821390751853159, |
|
"grad_norm": 0.40701958537101746, |
|
"learning_rate": 4.973718535220969e-05, |
|
"loss": 9.4653, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.18260971879044594, |
|
"grad_norm": 0.5105063915252686, |
|
"learning_rate": 4.973437198621237e-05, |
|
"loss": 9.1349, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.18308036239557596, |
|
"grad_norm": 0.3464662730693817, |
|
"learning_rate": 4.973154372250203e-05, |
|
"loss": 9.3152, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.18355100600070595, |
|
"grad_norm": 0.3519923985004425, |
|
"learning_rate": 4.972870056278216e-05, |
|
"loss": 9.6833, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18402164960583597, |
|
"grad_norm": 0.3777810037136078, |
|
"learning_rate": 4.972584250876522e-05, |
|
"loss": 8.9543, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.184492293210966, |
|
"grad_norm": 0.45620018243789673, |
|
"learning_rate": 4.972296956217265e-05, |
|
"loss": 8.5477, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.18496293681609602, |
|
"grad_norm": 0.3768126368522644, |
|
"learning_rate": 4.972008172473483e-05, |
|
"loss": 9.2837, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.185433580421226, |
|
"grad_norm": 0.37716034054756165, |
|
"learning_rate": 4.971717899819113e-05, |
|
"loss": 9.0821, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.18590422402635604, |
|
"grad_norm": 0.40171629190444946, |
|
"learning_rate": 4.9714261384289896e-05, |
|
"loss": 9.0963, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.18637486763148606, |
|
"grad_norm": 0.41346555948257446, |
|
"learning_rate": 4.9711328884788434e-05, |
|
"loss": 8.6835, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.18684551123661608, |
|
"grad_norm": 0.3882580101490021, |
|
"learning_rate": 4.970838150145299e-05, |
|
"loss": 8.998, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.18731615484174607, |
|
"grad_norm": 0.40618547797203064, |
|
"learning_rate": 4.9705419236058825e-05, |
|
"loss": 8.8586, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.1877867984468761, |
|
"grad_norm": 0.4610426127910614, |
|
"learning_rate": 4.970244209039012e-05, |
|
"loss": 8.5731, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.18825744205200612, |
|
"grad_norm": 0.3799988329410553, |
|
"learning_rate": 4.969945006624003e-05, |
|
"loss": 8.9463, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18872808565713614, |
|
"grad_norm": 0.37528830766677856, |
|
"learning_rate": 4.969644316541068e-05, |
|
"loss": 8.9402, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.18919872926226614, |
|
"grad_norm": 0.3422936201095581, |
|
"learning_rate": 4.9693421389713156e-05, |
|
"loss": 9.3497, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.18966937286739616, |
|
"grad_norm": 0.35784366726875305, |
|
"learning_rate": 4.969038474096749e-05, |
|
"loss": 9.1984, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.19014001647252618, |
|
"grad_norm": 0.36203494668006897, |
|
"learning_rate": 4.96873332210027e-05, |
|
"loss": 9.5096, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.1906106600776562, |
|
"grad_norm": 0.3657507598400116, |
|
"learning_rate": 4.9684266831656706e-05, |
|
"loss": 9.4901, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.1910813036827862, |
|
"grad_norm": 0.3886093199253082, |
|
"learning_rate": 4.9681185574776446e-05, |
|
"loss": 9.2492, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.19155194728791622, |
|
"grad_norm": 0.4091348350048065, |
|
"learning_rate": 4.967808945221778e-05, |
|
"loss": 8.9341, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.19202259089304624, |
|
"grad_norm": 0.45772606134414673, |
|
"learning_rate": 4.967497846584552e-05, |
|
"loss": 9.1159, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.19249323449817626, |
|
"grad_norm": 0.4274662733078003, |
|
"learning_rate": 4.967185261753345e-05, |
|
"loss": 9.0557, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.19296387810330626, |
|
"grad_norm": 0.3963877558708191, |
|
"learning_rate": 4.96687119091643e-05, |
|
"loss": 9.2221, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19343452170843628, |
|
"grad_norm": 0.3958019018173218, |
|
"learning_rate": 4.966555634262972e-05, |
|
"loss": 8.7826, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.1939051653135663, |
|
"grad_norm": 0.3447028398513794, |
|
"learning_rate": 4.9662385919830347e-05, |
|
"loss": 9.5672, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.19437580891869632, |
|
"grad_norm": 0.41687721014022827, |
|
"learning_rate": 4.965920064267575e-05, |
|
"loss": 8.7692, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.19484645252382632, |
|
"grad_norm": 0.40204861760139465, |
|
"learning_rate": 4.9656000513084455e-05, |
|
"loss": 8.9861, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.19531709612895634, |
|
"grad_norm": 0.3969802260398865, |
|
"learning_rate": 4.965278553298392e-05, |
|
"loss": 8.7663, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.19578773973408636, |
|
"grad_norm": 0.3831544518470764, |
|
"learning_rate": 4.964955570431055e-05, |
|
"loss": 9.1338, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.19625838333921639, |
|
"grad_norm": 0.40865185856819153, |
|
"learning_rate": 4.96463110290097e-05, |
|
"loss": 8.7582, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.19672902694434638, |
|
"grad_norm": 0.36668238043785095, |
|
"learning_rate": 4.964305150903566e-05, |
|
"loss": 9.185, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.1971996705494764, |
|
"grad_norm": 0.4229344129562378, |
|
"learning_rate": 4.963977714635168e-05, |
|
"loss": 9.0629, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.19767031415460642, |
|
"grad_norm": 0.36557090282440186, |
|
"learning_rate": 4.963648794292992e-05, |
|
"loss": 9.2807, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19814095775973645, |
|
"grad_norm": 0.36382701992988586, |
|
"learning_rate": 4.9633183900751504e-05, |
|
"loss": 9.3589, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.19861160136486644, |
|
"grad_norm": 0.34733355045318604, |
|
"learning_rate": 4.962986502180648e-05, |
|
"loss": 9.246, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.19908224496999646, |
|
"grad_norm": 0.39794841408729553, |
|
"learning_rate": 4.962653130809383e-05, |
|
"loss": 8.8009, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.19955288857512649, |
|
"grad_norm": 1.290969967842102, |
|
"learning_rate": 4.962318276162148e-05, |
|
"loss": 8.8199, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.2000235321802565, |
|
"grad_norm": 0.41390761733055115, |
|
"learning_rate": 4.961981938440629e-05, |
|
"loss": 8.8504, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2004941757853865, |
|
"grad_norm": 0.4563705623149872, |
|
"learning_rate": 4.9616441178474044e-05, |
|
"loss": 8.4598, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.20096481939051652, |
|
"grad_norm": 0.41248825192451477, |
|
"learning_rate": 4.9613048145859465e-05, |
|
"loss": 8.9862, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.20143546299564655, |
|
"grad_norm": 0.3711670935153961, |
|
"learning_rate": 4.9609640288606205e-05, |
|
"loss": 9.1376, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.20190610660077657, |
|
"grad_norm": 0.3998201787471771, |
|
"learning_rate": 4.960621760876686e-05, |
|
"loss": 8.8631, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.20237675020590656, |
|
"grad_norm": 0.39512693881988525, |
|
"learning_rate": 4.96027801084029e-05, |
|
"loss": 8.6108, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.20284739381103659, |
|
"grad_norm": 0.40403223037719727, |
|
"learning_rate": 4.95993277895848e-05, |
|
"loss": 8.9947, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.2033180374161666, |
|
"grad_norm": 0.37190157175064087, |
|
"learning_rate": 4.959586065439189e-05, |
|
"loss": 9.0393, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.20378868102129663, |
|
"grad_norm": 0.49797308444976807, |
|
"learning_rate": 4.959237870491247e-05, |
|
"loss": 8.4229, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.20425932462642662, |
|
"grad_norm": 0.4093763828277588, |
|
"learning_rate": 4.958888194324374e-05, |
|
"loss": 9.2132, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.20472996823155665, |
|
"grad_norm": 0.4164353609085083, |
|
"learning_rate": 4.958537037149183e-05, |
|
"loss": 9.3971, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.20520061183668667, |
|
"grad_norm": 0.4578768312931061, |
|
"learning_rate": 4.958184399177178e-05, |
|
"loss": 8.8712, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.2056712554418167, |
|
"grad_norm": 0.3586215674877167, |
|
"learning_rate": 4.957830280620758e-05, |
|
"loss": 9.3741, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.20614189904694669, |
|
"grad_norm": 0.4265285134315491, |
|
"learning_rate": 4.9574746816932084e-05, |
|
"loss": 9.5791, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.2066125426520767, |
|
"grad_norm": 0.4029577672481537, |
|
"learning_rate": 4.9571176026087116e-05, |
|
"loss": 8.7589, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.20708318625720673, |
|
"grad_norm": 0.38180944323539734, |
|
"learning_rate": 4.9567590435823383e-05, |
|
"loss": 9.0139, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20755382986233675, |
|
"grad_norm": 0.39456745982170105, |
|
"learning_rate": 4.9563990048300524e-05, |
|
"loss": 9.1201, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.20802447346746675, |
|
"grad_norm": 0.5495271682739258, |
|
"learning_rate": 4.956037486568706e-05, |
|
"loss": 8.5788, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.20849511707259677, |
|
"grad_norm": 0.4691711366176605, |
|
"learning_rate": 4.9556744890160477e-05, |
|
"loss": 8.6122, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.2089657606777268, |
|
"grad_norm": 0.42626431584358215, |
|
"learning_rate": 4.955310012390711e-05, |
|
"loss": 9.0031, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.20943640428285681, |
|
"grad_norm": 0.3541715145111084, |
|
"learning_rate": 4.954944056912224e-05, |
|
"loss": 9.3784, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.2099070478879868, |
|
"grad_norm": 0.3353878855705261, |
|
"learning_rate": 4.954576622801006e-05, |
|
"loss": 9.2536, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.21037769149311683, |
|
"grad_norm": 0.45526987314224243, |
|
"learning_rate": 4.954207710278364e-05, |
|
"loss": 8.8725, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.21084833509824685, |
|
"grad_norm": 0.3993997275829315, |
|
"learning_rate": 4.953837319566497e-05, |
|
"loss": 8.7531, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.21131897870337688, |
|
"grad_norm": 0.4544302821159363, |
|
"learning_rate": 4.953465450888495e-05, |
|
"loss": 8.6906, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.21178962230850687, |
|
"grad_norm": 0.35516420006752014, |
|
"learning_rate": 4.9530921044683374e-05, |
|
"loss": 9.0749, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2122602659136369, |
|
"grad_norm": 1.6792665719985962, |
|
"learning_rate": 4.9527172805308944e-05, |
|
"loss": 9.2437, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.21273090951876691, |
|
"grad_norm": 0.46345287561416626, |
|
"learning_rate": 4.952340979301924e-05, |
|
"loss": 9.0281, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.21320155312389694, |
|
"grad_norm": 0.447298139333725, |
|
"learning_rate": 4.951963201008076e-05, |
|
"loss": 8.9642, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.21367219672902693, |
|
"grad_norm": 0.4767840504646301, |
|
"learning_rate": 4.9515839458768905e-05, |
|
"loss": 8.359, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.21414284033415695, |
|
"grad_norm": 0.4263994097709656, |
|
"learning_rate": 4.9512032141367946e-05, |
|
"loss": 9.1196, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.21461348393928698, |
|
"grad_norm": 0.4342626929283142, |
|
"learning_rate": 4.950821006017107e-05, |
|
"loss": 8.6583, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.215084127544417, |
|
"grad_norm": 0.3934561610221863, |
|
"learning_rate": 4.950437321748034e-05, |
|
"loss": 9.0519, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.215554771149547, |
|
"grad_norm": 0.4860813319683075, |
|
"learning_rate": 4.9500521615606716e-05, |
|
"loss": 8.5634, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.21602541475467701, |
|
"grad_norm": 0.35411691665649414, |
|
"learning_rate": 4.949665525687005e-05, |
|
"loss": 9.1898, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.21649605835980704, |
|
"grad_norm": 0.4290132224559784, |
|
"learning_rate": 4.94927741435991e-05, |
|
"loss": 8.9995, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21696670196493706, |
|
"grad_norm": 0.3373097777366638, |
|
"learning_rate": 4.948887827813147e-05, |
|
"loss": 9.3386, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.21743734557006705, |
|
"grad_norm": 0.42341887950897217, |
|
"learning_rate": 4.948496766281368e-05, |
|
"loss": 9.3743, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.21790798917519708, |
|
"grad_norm": 0.3915397524833679, |
|
"learning_rate": 4.9481042300001124e-05, |
|
"loss": 9.1503, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.2183786327803271, |
|
"grad_norm": 0.4155285954475403, |
|
"learning_rate": 4.947710219205808e-05, |
|
"loss": 9.0803, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.21884927638545712, |
|
"grad_norm": 0.4009873867034912, |
|
"learning_rate": 4.94731473413577e-05, |
|
"loss": 8.8088, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.21931991999058711, |
|
"grad_norm": 0.3694516122341156, |
|
"learning_rate": 4.946917775028204e-05, |
|
"loss": 9.6886, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.21979056359571714, |
|
"grad_norm": 0.4301382899284363, |
|
"learning_rate": 4.946519342122199e-05, |
|
"loss": 8.8388, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.22026120720084716, |
|
"grad_norm": 0.3725178837776184, |
|
"learning_rate": 4.946119435657738e-05, |
|
"loss": 9.3083, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.22073185080597718, |
|
"grad_norm": 0.34573477506637573, |
|
"learning_rate": 4.945718055875684e-05, |
|
"loss": 9.3972, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.22120249441110718, |
|
"grad_norm": 0.4900851845741272, |
|
"learning_rate": 4.945315203017795e-05, |
|
"loss": 8.8847, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2216731380162372, |
|
"grad_norm": 0.3375721871852875, |
|
"learning_rate": 4.944910877326709e-05, |
|
"loss": 9.3369, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.22214378162136722, |
|
"grad_norm": 0.38274478912353516, |
|
"learning_rate": 4.944505079045958e-05, |
|
"loss": 9.2587, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.22261442522649724, |
|
"grad_norm": 0.45915624499320984, |
|
"learning_rate": 4.944097808419955e-05, |
|
"loss": 8.6162, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.22308506883162724, |
|
"grad_norm": 0.4436270296573639, |
|
"learning_rate": 4.9436890656940045e-05, |
|
"loss": 8.9692, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.22355571243675726, |
|
"grad_norm": 0.44073861837387085, |
|
"learning_rate": 4.943278851114293e-05, |
|
"loss": 8.6524, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.22402635604188728, |
|
"grad_norm": 0.37401431798934937, |
|
"learning_rate": 4.942867164927899e-05, |
|
"loss": 9.3269, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.2244969996470173, |
|
"grad_norm": 0.36092767119407654, |
|
"learning_rate": 4.942454007382782e-05, |
|
"loss": 9.0893, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.2249676432521473, |
|
"grad_norm": 0.46312302350997925, |
|
"learning_rate": 4.9420393787277917e-05, |
|
"loss": 9.3986, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.22543828685727732, |
|
"grad_norm": 0.339429646730423, |
|
"learning_rate": 4.9416232792126615e-05, |
|
"loss": 9.3501, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.22590893046240734, |
|
"grad_norm": 0.4019092917442322, |
|
"learning_rate": 4.941205709088011e-05, |
|
"loss": 8.8818, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22637957406753736, |
|
"grad_norm": 0.4025574028491974, |
|
"learning_rate": 4.940786668605348e-05, |
|
"loss": 9.0087, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.22685021767266736, |
|
"grad_norm": 0.41925379633903503, |
|
"learning_rate": 4.9403661580170626e-05, |
|
"loss": 9.0019, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.22732086127779738, |
|
"grad_norm": 0.38912633061408997, |
|
"learning_rate": 4.939944177576432e-05, |
|
"loss": 9.4554, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.2277915048829274, |
|
"grad_norm": 0.3775523602962494, |
|
"learning_rate": 4.9395207275376175e-05, |
|
"loss": 8.911, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.22826214848805743, |
|
"grad_norm": 0.37626808881759644, |
|
"learning_rate": 4.939095808155668e-05, |
|
"loss": 8.9951, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.22873279209318742, |
|
"grad_norm": 0.4059127867221832, |
|
"learning_rate": 4.938669419686516e-05, |
|
"loss": 9.0841, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.22920343569831744, |
|
"grad_norm": 0.35881519317626953, |
|
"learning_rate": 4.938241562386977e-05, |
|
"loss": 9.2341, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.22967407930344746, |
|
"grad_norm": 0.42100849747657776, |
|
"learning_rate": 4.9378122365147536e-05, |
|
"loss": 9.0711, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.2301447229085775, |
|
"grad_norm": 0.4081602394580841, |
|
"learning_rate": 4.9373814423284336e-05, |
|
"loss": 9.0102, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.23061536651370748, |
|
"grad_norm": 0.3893739581108093, |
|
"learning_rate": 4.936949180087486e-05, |
|
"loss": 9.1481, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2310860101188375, |
|
"grad_norm": 0.38784539699554443, |
|
"learning_rate": 4.936515450052267e-05, |
|
"loss": 9.2699, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.23155665372396753, |
|
"grad_norm": 0.39232099056243896, |
|
"learning_rate": 4.9360802524840156e-05, |
|
"loss": 9.1015, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.23202729732909755, |
|
"grad_norm": 0.4174420237541199, |
|
"learning_rate": 4.935643587644855e-05, |
|
"loss": 8.8689, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.23249794093422754, |
|
"grad_norm": 0.3970744013786316, |
|
"learning_rate": 4.9352054557977905e-05, |
|
"loss": 9.134, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.23296858453935756, |
|
"grad_norm": 0.34588709473609924, |
|
"learning_rate": 4.934765857206715e-05, |
|
"loss": 9.1163, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2334392281444876, |
|
"grad_norm": 0.38045328855514526, |
|
"learning_rate": 4.934324792136399e-05, |
|
"loss": 9.2736, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.2339098717496176, |
|
"grad_norm": 0.3795531094074249, |
|
"learning_rate": 4.9338822608525027e-05, |
|
"loss": 9.2326, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.2343805153547476, |
|
"grad_norm": 0.3959232270717621, |
|
"learning_rate": 4.9334382636215646e-05, |
|
"loss": 9.2973, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.23485115895987763, |
|
"grad_norm": 0.40320464968681335, |
|
"learning_rate": 4.932992800711009e-05, |
|
"loss": 8.8766, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.23532180256500765, |
|
"grad_norm": 0.35472753643989563, |
|
"learning_rate": 4.9325458723891405e-05, |
|
"loss": 9.2191, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23579244617013767, |
|
"grad_norm": 0.40472298860549927, |
|
"learning_rate": 4.932097478925148e-05, |
|
"loss": 8.8783, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.23626308977526767, |
|
"grad_norm": 0.4293891489505768, |
|
"learning_rate": 4.931647620589104e-05, |
|
"loss": 8.4516, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.2367337333803977, |
|
"grad_norm": 0.3897256851196289, |
|
"learning_rate": 4.9311962976519586e-05, |
|
"loss": 9.2541, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.2372043769855277, |
|
"grad_norm": 0.36981016397476196, |
|
"learning_rate": 4.9307435103855507e-05, |
|
"loss": 9.0664, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.23767502059065773, |
|
"grad_norm": 0.4339733421802521, |
|
"learning_rate": 4.930289259062596e-05, |
|
"loss": 9.2965, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.23814566419578773, |
|
"grad_norm": 0.4204358756542206, |
|
"learning_rate": 4.9298335439566946e-05, |
|
"loss": 9.0738, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.23861630780091775, |
|
"grad_norm": 0.3759208023548126, |
|
"learning_rate": 4.929376365342326e-05, |
|
"loss": 9.5119, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.23908695140604777, |
|
"grad_norm": 0.3684697151184082, |
|
"learning_rate": 4.9289177234948535e-05, |
|
"loss": 9.338, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.2395575950111778, |
|
"grad_norm": 0.40956175327301025, |
|
"learning_rate": 4.928457618690522e-05, |
|
"loss": 9.0164, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.2400282386163078, |
|
"grad_norm": 0.4373653829097748, |
|
"learning_rate": 4.927996051206454e-05, |
|
"loss": 8.4385, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2404988822214378, |
|
"grad_norm": 0.3845258951187134, |
|
"learning_rate": 4.927533021320657e-05, |
|
"loss": 9.3247, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.24096952582656783, |
|
"grad_norm": 0.3763442039489746, |
|
"learning_rate": 4.9270685293120164e-05, |
|
"loss": 9.357, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.24144016943169785, |
|
"grad_norm": 0.4450169503688812, |
|
"learning_rate": 4.9266025754603005e-05, |
|
"loss": 8.5107, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.24191081303682785, |
|
"grad_norm": 0.41103556752204895, |
|
"learning_rate": 4.926135160046157e-05, |
|
"loss": 9.3063, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.24238145664195787, |
|
"grad_norm": 0.4856661856174469, |
|
"learning_rate": 4.925666283351114e-05, |
|
"loss": 8.7831, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.2428521002470879, |
|
"grad_norm": 0.3764643371105194, |
|
"learning_rate": 4.92519594565758e-05, |
|
"loss": 9.0384, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.24332274385221792, |
|
"grad_norm": 0.3988141417503357, |
|
"learning_rate": 4.924724147248841e-05, |
|
"loss": 9.1045, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.2437933874573479, |
|
"grad_norm": 0.3450901210308075, |
|
"learning_rate": 4.924250888409069e-05, |
|
"loss": 9.3091, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.24426403106247793, |
|
"grad_norm": 0.4347275495529175, |
|
"learning_rate": 4.923776169423309e-05, |
|
"loss": 9.115, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.24473467466760795, |
|
"grad_norm": 0.36428380012512207, |
|
"learning_rate": 4.923299990577488e-05, |
|
"loss": 9.0645, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.24520531827273798, |
|
"grad_norm": 0.4311101734638214, |
|
"learning_rate": 4.922822352158412e-05, |
|
"loss": 8.7247, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.24567596187786797, |
|
"grad_norm": 0.4824456572532654, |
|
"learning_rate": 4.922343254453768e-05, |
|
"loss": 8.7448, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.246146605482998, |
|
"grad_norm": 0.5465502738952637, |
|
"learning_rate": 4.9218626977521206e-05, |
|
"loss": 8.471, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.24661724908812802, |
|
"grad_norm": 0.4191696047782898, |
|
"learning_rate": 4.921380682342912e-05, |
|
"loss": 8.4572, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.24708789269325804, |
|
"grad_norm": 0.40454065799713135, |
|
"learning_rate": 4.920897208516464e-05, |
|
"loss": 9.3254, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.24755853629838803, |
|
"grad_norm": 0.36362919211387634, |
|
"learning_rate": 4.920412276563977e-05, |
|
"loss": 9.4725, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.24802917990351805, |
|
"grad_norm": 0.38239118456840515, |
|
"learning_rate": 4.91992588677753e-05, |
|
"loss": 8.5503, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.24849982350864808, |
|
"grad_norm": 0.3423115015029907, |
|
"learning_rate": 4.919438039450078e-05, |
|
"loss": 9.294, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.2489704671137781, |
|
"grad_norm": 0.3812299966812134, |
|
"learning_rate": 4.918948734875457e-05, |
|
"loss": 9.374, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.2494411107189081, |
|
"grad_norm": 0.5085097551345825, |
|
"learning_rate": 4.9184579733483796e-05, |
|
"loss": 8.5979, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.24991175432403812, |
|
"grad_norm": 0.34993723034858704, |
|
"learning_rate": 4.917965755164433e-05, |
|
"loss": 9.4077, |
|
"step": 531 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 4248, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 531, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.093280422836306e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|