{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997697620874904,
  "eval_steps": 500,
  "global_step": 1953,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0, "grad_norm": 72.47212998792584, "learning_rate": 1.020408163265306e-06, "loss": 17.0139, "step": 1},
    {"epoch": 0.01, "grad_norm": 72.19709441394318, "learning_rate": 5.102040816326531e-06, "loss": 16.8099, "step": 5},
    {"epoch": 0.02, "grad_norm": 74.11072818499085, "learning_rate": 1.0204081632653061e-05, "loss": 16.7768, "step": 10},
    {"epoch": 0.02, "grad_norm": 67.88747859998395, "learning_rate": 1.5306122448979594e-05, "loss": 15.8745, "step": 15},
    {"epoch": 0.03, "grad_norm": 57.15585576506464, "learning_rate": 2.0408163265306123e-05, "loss": 13.4197, "step": 20},
    {"epoch": 0.04, "grad_norm": 52.16705016348157, "learning_rate": 2.5510204081632654e-05, "loss": 10.8782, "step": 25},
    {"epoch": 0.05, "grad_norm": 34.872891212782, "learning_rate": 3.061224489795919e-05, "loss": 8.0236, "step": 30},
    {"epoch": 0.05, "grad_norm": 20.96608544121856, "learning_rate": 3.571428571428572e-05, "loss": 5.7602, "step": 35},
    {"epoch": 0.06, "grad_norm": 14.39165011334111, "learning_rate": 4.0816326530612245e-05, "loss": 4.4129, "step": 40},
    {"epoch": 0.07, "grad_norm": 9.537119159153542, "learning_rate": 4.591836734693878e-05, "loss": 3.4241, "step": 45},
    {"epoch": 0.08, "grad_norm": 10.051565364855378, "learning_rate": 5.102040816326531e-05, "loss": 2.7711, "step": 50},
    {"epoch": 0.08, "grad_norm": 5.902644991920862, "learning_rate": 5.6122448979591836e-05, "loss": 2.1132, "step": 55},
    {"epoch": 0.09, "grad_norm": 5.282533867200368, "learning_rate": 6.122448979591838e-05, "loss": 1.7932, "step": 60},
    {"epoch": 0.1, "grad_norm": 2.849440118853736, "learning_rate": 6.63265306122449e-05, "loss": 1.5357, "step": 65},
    {"epoch": 0.11, "grad_norm": 6.395981446361856, "learning_rate": 7.142857142857143e-05, "loss": 1.4488, "step": 70},
    {"epoch": 0.12, "grad_norm": 1.9290320550828686, "learning_rate": 7.653061224489796e-05, "loss": 1.3654, "step": 75},
    {"epoch": 0.12, "grad_norm": 1.9509560155111505, "learning_rate": 8.163265306122449e-05, "loss": 1.2287, "step": 80},
    {"epoch": 0.13, "grad_norm": 1.9800497930938712, "learning_rate": 8.673469387755102e-05, "loss": 1.2863, "step": 85},
    {"epoch": 0.14, "grad_norm": 1.7120992753385207, "learning_rate": 9.183673469387756e-05, "loss": 1.1712, "step": 90},
    {"epoch": 0.15, "grad_norm": 2.2653861656665453, "learning_rate": 9.693877551020408e-05, "loss": 1.2174, "step": 95},
    {"epoch": 0.15, "grad_norm": 3.453167617907916, "learning_rate": 0.00010204081632653062, "loss": 1.1714, "step": 100},
    {"epoch": 0.16, "grad_norm": 1.547620666252296, "learning_rate": 0.00010714285714285715, "loss": 1.1168, "step": 105},
    {"epoch": 0.17, "grad_norm": 1.6713272237258214, "learning_rate": 0.00011224489795918367, "loss": 1.0868, "step": 110},
    {"epoch": 0.18, "grad_norm": 1.5402309108002492, "learning_rate": 0.00011734693877551022, "loss": 1.1351, "step": 115},
    {"epoch": 0.18, "grad_norm": 1.5970274628335959, "learning_rate": 0.00012244897959183676, "loss": 1.0946, "step": 120},
    {"epoch": 0.19, "grad_norm": 1.6342350847388514, "learning_rate": 0.00012755102040816328, "loss": 1.1211, "step": 125},
    {"epoch": 0.2, "grad_norm": 1.6632080501970268, "learning_rate": 0.0001326530612244898, "loss": 1.0501, "step": 130},
    {"epoch": 0.21, "grad_norm": 1.4060120288881937, "learning_rate": 0.00013775510204081635, "loss": 1.0956, "step": 135},
    {"epoch": 0.21, "grad_norm": 1.801531788644497, "learning_rate": 0.00014285714285714287, "loss": 1.072, "step": 140},
    {"epoch": 0.22, "grad_norm": 1.520734386032582, "learning_rate": 0.0001479591836734694, "loss": 1.1119, "step": 145},
    {"epoch": 0.23, "grad_norm": 1.840843283748014, "learning_rate": 0.0001530612244897959, "loss": 0.9969, "step": 150},
    {"epoch": 0.24, "grad_norm": 2.07792070441396, "learning_rate": 0.00015816326530612246, "loss": 1.0208, "step": 155},
    {"epoch": 0.25, "grad_norm": 1.9723613912374163, "learning_rate": 0.00016326530612244898, "loss": 0.9975, "step": 160},
    {"epoch": 0.25, "grad_norm": 2.8108507027898844, "learning_rate": 0.00016836734693877553, "loss": 1.0388, "step": 165},
    {"epoch": 0.26, "grad_norm": 2.049204774598262, "learning_rate": 0.00017346938775510205, "loss": 0.9837, "step": 170},
    {"epoch": 0.27, "grad_norm": 2.013771271987093, "learning_rate": 0.0001785714285714286, "loss": 0.956, "step": 175},
    {"epoch": 0.28, "grad_norm": 1.6734633410801107, "learning_rate": 0.00018367346938775512, "loss": 1.0388, "step": 180},
    {"epoch": 0.28, "grad_norm": 2.3629530195645043, "learning_rate": 0.00018877551020408164, "loss": 1.0407, "step": 185},
    {"epoch": 0.29, "grad_norm": 2.0061060540694267, "learning_rate": 0.00019387755102040816, "loss": 1.0528, "step": 190},
    {"epoch": 0.3, "grad_norm": 1.8957278214013087, "learning_rate": 0.0001989795918367347, "loss": 1.0637, "step": 195},
    {"epoch": 0.31, "grad_norm": 1.659431161796715, "learning_rate": 0.00019999744233089168, "loss": 1.0197, "step": 200},
    {"epoch": 0.31, "grad_norm": 1.9623902009306882, "learning_rate": 0.00019998705202436978, "loss": 1.0171, "step": 205},
    {"epoch": 0.32, "grad_norm": 1.666768664331948, "learning_rate": 0.0001999686700559419, "loss": 0.9411, "step": 210},
    {"epoch": 0.33, "grad_norm": 1.6263956387135465, "learning_rate": 0.00019994229789482308, "loss": 1.0452, "step": 215},
    {"epoch": 0.34, "grad_norm": 2.104534739444239, "learning_rate": 0.00019990793764886012, "loss": 1.0318, "step": 220},
    {"epoch": 0.35, "grad_norm": 1.5223676518998526, "learning_rate": 0.0001998655920643634, "loss": 0.9393, "step": 225},
    {"epoch": 0.35, "grad_norm": 1.399019073099987, "learning_rate": 0.000199815264525887, "loss": 0.97, "step": 230},
    {"epoch": 0.36, "grad_norm": 1.8651896389488705, "learning_rate": 0.00019975695905595855, "loss": 1.0187, "step": 235},
    {"epoch": 0.37, "grad_norm": 1.536205961219932, "learning_rate": 0.00019969068031475744, "loss": 0.9716, "step": 240},
    {"epoch": 0.38, "grad_norm": 1.7078218228495357, "learning_rate": 0.0001996164335997425, "loss": 0.9959, "step": 245},
    {"epoch": 0.38, "grad_norm": 1.3467510953394168, "learning_rate": 0.0001995342248452285, "loss": 0.9602, "step": 250},
    {"epoch": 0.39, "grad_norm": 1.9084544882836239, "learning_rate": 0.00019944406062191204, "loss": 0.9775, "step": 255},
    {"epoch": 0.4, "grad_norm": 1.5574610066347279, "learning_rate": 0.000199345948136346, "loss": 1.0177, "step": 260},
    {"epoch": 0.41, "grad_norm": 2.109450536237562, "learning_rate": 0.00019923989523036394, "loss": 0.9819, "step": 265},
    {"epoch": 0.41, "grad_norm": 2.7828828000504573, "learning_rate": 0.00019912591038045307, "loss": 0.9707, "step": 270},
    {"epoch": 0.42, "grad_norm": 1.3138848135299093, "learning_rate": 0.0001990040026970768, "loss": 0.9909, "step": 275},
    {"epoch": 0.43, "grad_norm": 1.5708872027230352, "learning_rate": 0.0001988741819239467, "loss": 0.9744, "step": 280},
    {"epoch": 0.44, "grad_norm": 1.7915877912980807, "learning_rate": 0.0001987364584372435, "loss": 0.9629, "step": 285},
    {"epoch": 0.45, "grad_norm": 1.7688040681479107, "learning_rate": 0.00019859084324478791, "loss": 1.0066, "step": 290},
    {"epoch": 0.45, "grad_norm": 1.2087916028360217, "learning_rate": 0.00019843734798516077, "loss": 0.9273, "step": 295},
    {"epoch": 0.46, "grad_norm": 1.7217374396550518, "learning_rate": 0.00019827598492677283, "loss": 0.9599, "step": 300},
    {"epoch": 0.47, "grad_norm": 1.1827063296813014, "learning_rate": 0.000198106766966884, "loss": 0.9526, "step": 305},
    {"epoch": 0.48, "grad_norm": 1.275288932081838, "learning_rate": 0.0001979297076305728, "loss": 0.9351, "step": 310},
    {"epoch": 0.48, "grad_norm": 1.4572732094413907, "learning_rate": 0.00019774482106965513, "loss": 0.9916, "step": 315},
    {"epoch": 0.49, "grad_norm": 1.3548000574556105, "learning_rate": 0.00019755212206155318, "loss": 0.9894, "step": 320},
    {"epoch": 0.5, "grad_norm": 1.4221295720770253, "learning_rate": 0.00019735162600811447, "loss": 0.9147, "step": 325},
    {"epoch": 0.51, "grad_norm": 1.1147927135270754, "learning_rate": 0.00019714334893438062, "loss": 0.9173, "step": 330},
    {"epoch": 0.51, "grad_norm": 1.5740508551311372, "learning_rate": 0.00019692730748730662, "loss": 1.0049, "step": 335},
    {"epoch": 0.52, "grad_norm": 1.4361894716708776, "learning_rate": 0.0001967035189344303, "loss": 0.9772, "step": 340},
    {"epoch": 0.53, "grad_norm": 1.5979099481664976, "learning_rate": 0.00019647200116249214, "loss": 0.9734, "step": 345},
    {"epoch": 0.54, "grad_norm": 1.1416579629890438, "learning_rate": 0.00019623277267600574, "loss": 0.9695, "step": 350},
    {"epoch": 0.54, "grad_norm": 1.2859502643647958, "learning_rate": 0.0001959858525957786, "loss": 0.9726, "step": 355},
    {"epoch": 0.55, "grad_norm": 1.654528585364595, "learning_rate": 0.00019573126065738415, "loss": 0.9099, "step": 360},
    {"epoch": 0.56, "grad_norm": 1.6432235938460993, "learning_rate": 0.00019546901720958405, "loss": 0.993, "step": 365},
    {"epoch": 0.57, "grad_norm": 1.184408195362666, "learning_rate": 0.00019519914321270196, "loss": 0.983, "step": 370},
    {"epoch": 0.58, "grad_norm": 1.364894780646226, "learning_rate": 0.00019492166023694823, "loss": 0.9385, "step": 375},
    {"epoch": 0.58, "grad_norm": 1.2140596195120288, "learning_rate": 0.0001946365904606957, "loss": 0.928, "step": 380},
    {"epoch": 0.59, "grad_norm": 1.2023189029574775, "learning_rate": 0.00019434395666870734, "loss": 0.9497, "step": 385},
    {"epoch": 0.6, "grad_norm": 1.5293781545066814, "learning_rate": 0.00019404378225031482, "loss": 0.9845, "step": 390},
    {"epoch": 0.61, "grad_norm": 1.2719601216605032, "learning_rate": 0.00019373609119754926, "loss": 0.9535, "step": 395},
    {"epoch": 0.61, "grad_norm": 1.1767287730617413, "learning_rate": 0.00019342090810322361, "loss": 0.9669, "step": 400},
    {"epoch": 0.62, "grad_norm": 1.5092663681250142, "learning_rate": 0.00019309825815896697, "loss": 0.9097, "step": 405},
    {"epoch": 0.63, "grad_norm": 1.065331569108817, "learning_rate": 0.00019276816715321107, "loss": 0.9257, "step": 410},
    {"epoch": 0.64, "grad_norm": 1.0551712975999572, "learning_rate": 0.00019243066146912914, "loss": 0.9179, "step": 415},
    {"epoch": 0.64, "grad_norm": 1.280851911643696, "learning_rate": 0.00019208576808252726, "loss": 0.9322, "step": 420},
    {"epoch": 0.65, "grad_norm": 1.1957795064970969, "learning_rate": 0.00019173351455968805, "loss": 0.9472, "step": 425},
    {"epoch": 0.66, "grad_norm": 1.6691262118179164, "learning_rate": 0.00019137392905516757, "loss": 0.9833, "step": 430},
    {"epoch": 0.67, "grad_norm": 1.2087456453830114, "learning_rate": 0.0001910070403095449, "loss": 0.9554, "step": 435},
    {"epoch": 0.68, "grad_norm": 1.2175939573659607, "learning_rate": 0.00019063287764712513, "loss": 0.9844, "step": 440},
    {"epoch": 0.68, "grad_norm": 1.4498433195544413, "learning_rate": 0.00019025147097359528, "loss": 0.9467, "step": 445},
    {"epoch": 0.69, "grad_norm": 1.429864892305334, "learning_rate": 0.00018986285077363446, "loss": 0.9309, "step": 450},
    {"epoch": 0.7, "grad_norm": 1.170651014792718, "learning_rate": 0.00018946704810847689, "loss": 0.9234, "step": 455},
    {"epoch": 0.71, "grad_norm": 1.3282215747364223, "learning_rate": 0.00018906409461342952, "loss": 0.9536, "step": 460},
    {"epoch": 0.71, "grad_norm": 1.1845632567557303, "learning_rate": 0.00018865402249534347, "loss": 0.9772, "step": 465},
    {"epoch": 0.72, "grad_norm": 1.2127271540342577, "learning_rate": 0.00018823686453003973, "loss": 0.9523, "step": 470},
    {"epoch": 0.73, "grad_norm": 1.2360135960506637, "learning_rate": 0.00018781265405968972, "loss": 0.9135, "step": 475},
    {"epoch": 0.74, "grad_norm": 1.5041667778892749, "learning_rate": 0.0001873814249901501, "loss": 0.9625, "step": 480},
    {"epoch": 0.74, "grad_norm": 1.1504429942546481, "learning_rate": 0.00018694321178825286, "loss": 0.9363, "step": 485},
    {"epoch": 0.75, "grad_norm": 1.0319491329942774, "learning_rate": 0.00018649804947905055, "loss": 0.9054, "step": 490},
    {"epoch": 0.76, "grad_norm": 1.2204274089754297, "learning_rate": 0.0001860459736430169, "loss": 0.9635, "step": 495},
    {"epoch": 0.77, "grad_norm": 1.0611368485238843, "learning_rate": 0.00018558702041320273, "loss": 0.9445, "step": 500},
    {"epoch": 0.78, "grad_norm": 1.1018739389426906, "learning_rate": 0.00018512122647234812, "loss": 0.9289, "step": 505},
    {"epoch": 0.78, "grad_norm": 0.9882389573927103, "learning_rate": 0.0001846486290499505, "loss": 0.9911, "step": 510},
    {"epoch": 0.79, "grad_norm": 1.2994396811323112, "learning_rate": 0.0001841692659192889, "loss": 0.9264, "step": 515},
    {"epoch": 0.8, "grad_norm": 1.5236972647383147, "learning_rate": 0.00018368317539440492, "loss": 0.9563, "step": 520},
    {"epoch": 0.81, "grad_norm": 1.1383444262695794, "learning_rate": 0.0001831903963270404, "loss": 0.977, "step": 525},
    {"epoch": 0.81, "grad_norm": 1.110789605109835, "learning_rate": 0.00018269096810353205, "loss": 0.9388, "step": 530},
    {"epoch": 0.82, "grad_norm": 1.6690032269651018, "learning_rate": 0.00018218493064166353, "loss": 0.923, "step": 535},
    {"epoch": 0.83, "grad_norm": 1.2741603156202803, "learning_rate": 0.00018167232438747485, "loss": 0.959, "step": 540},
    {"epoch": 0.84, "grad_norm": 1.1359315426783947, "learning_rate": 0.00018115319031202965, "loss": 0.958, "step": 545},
    {"epoch": 0.84, "grad_norm": 1.1462965424102658, "learning_rate": 0.00018062756990814058, "loss": 0.9206, "step": 550},
    {"epoch": 0.85, "grad_norm": 0.9795818536509971, "learning_rate": 0.00018009550518705285, "loss": 0.9027, "step": 555},
    {"epoch": 0.86, "grad_norm": 0.9867688236460216, "learning_rate": 0.00017955703867508633, "loss": 0.9283, "step": 560},
    {"epoch": 0.87, "grad_norm": 1.0077012911846719, "learning_rate": 0.00017901221341023673, "loss": 0.9516, "step": 565},
    {"epoch": 0.87, "grad_norm": 1.3421028406940887, "learning_rate": 0.00017846107293873555, "loss": 0.9121, "step": 570},
    {"epoch": 0.88, "grad_norm": 1.4547843706488663, "learning_rate": 0.0001779036613115696, "loss": 0.8875, "step": 575},
    {"epoch": 0.89, "grad_norm": 1.1060535265201301, "learning_rate": 0.00017734002308096014, "loss": 0.9554, "step": 580},
    {"epoch": 0.9, "grad_norm": 1.5055847688809278, "learning_rate": 0.00017677020329680203, "loss": 0.9173, "step": 585},
    {"epoch": 0.91, "grad_norm": 1.1611583032421355, "learning_rate": 0.00017619424750306287, "loss": 0.9086, "step": 590},
    {"epoch": 0.91, "grad_norm": 1.1353445717142476, "learning_rate": 0.00017561220173414297, "loss": 0.967, "step": 595},
    {"epoch": 0.92, "grad_norm": 0.9563792857101335, "learning_rate": 0.00017502411251119586, "loss": 0.9155, "step": 600},
    {"epoch": 0.93, "grad_norm": 1.1337314999879342, "learning_rate": 0.00017443002683841002, "loss": 0.8905, "step": 605},
    {"epoch": 0.94, "grad_norm": 1.0287944149688226, "learning_rate": 0.00017382999219925203, "loss": 0.9092, "step": 610},
    {"epoch": 0.94, "grad_norm": 1.2022884801513027, "learning_rate": 0.00017322405655267122, "loss": 0.8703, "step": 615},
    {"epoch": 0.95, "grad_norm": 1.2018380484271611, "learning_rate": 0.0001726122683292667, "loss": 0.9769, "step": 620},
    {"epoch": 0.96, "grad_norm": 1.2278521149338097, "learning_rate": 0.0001719946764274162, "loss": 0.9632, "step": 625},
    {"epoch": 0.97, "grad_norm": 1.132615620517034, "learning_rate": 0.00017137133020936782, "loss": 0.943, "step": 630},
    {"epoch": 0.97, "grad_norm": 1.0891584621905475, "learning_rate": 0.00017074227949729481, "loss": 0.9249, "step": 635},
    {"epoch": 0.98, "grad_norm": 1.1417202547948113, "learning_rate": 0.00017010757456931334, "loss": 0.9055, "step": 640},
    {"epoch": 0.99, "grad_norm": 0.9346881604970835, "learning_rate": 0.0001694672661554638, "loss": 0.9336, "step": 645},
    {"epoch": 1.0, "grad_norm": 1.1301994877879067, "learning_rate": 0.0001688214054336563, "loss": 0.9062, "step": 650},
    {"epoch": 1.0, "eval_loss": 1.2441989183425903, "eval_runtime": 252.9523, "eval_samples_per_second": 9.132, "eval_steps_per_second": 0.573, "step": 651},
    {"epoch": 1.01, "grad_norm": 1.179898601273253, "learning_rate": 0.00016817004402558012, "loss": 0.9027, "step": 655},
    {"epoch": 1.01, "grad_norm": 0.9417315541112381, "learning_rate": 0.0001675132339925776, "loss": 0.9119, "step": 660},
    {"epoch": 1.02, "grad_norm": 1.0837703721555252, "learning_rate": 0.0001668510278314833, "loss": 0.86, "step": 665},
    {"epoch": 1.03, "grad_norm": 1.1272174437541118, "learning_rate": 0.00016618347847042778, "loss": 0.8711, "step": 670},
    {"epoch": 1.04, "grad_norm": 1.1359544360875102, "learning_rate": 0.00016551063926460748, "loss": 0.8776, "step": 675},
    {"epoch": 1.04, "grad_norm": 1.1917117624970714, "learning_rate": 0.00016483256399202006, "loss": 0.9209, "step": 680},
    {"epoch": 1.05, "grad_norm": 1.0249074428213159, "learning_rate": 0.00016414930684916613, "loss": 0.8196, "step": 685},
    {"epoch": 1.06, "grad_norm": 1.1816776012865573, "learning_rate": 0.00016346092244671746, "loss": 0.8279, "step": 690},
    {"epoch": 1.07, "grad_norm": 1.077779310150086, "learning_rate": 0.00016276746580515218, "loss": 0.8997, "step": 695},
    {"epoch": 1.07, "grad_norm": 1.2009550281356633, "learning_rate": 0.00016206899235035702, "loss": 0.8938, "step": 700},
    {"epoch": 1.08, "grad_norm": 1.0925262506944138, "learning_rate": 0.00016136555790919748, "loss": 0.8856, "step": 705},
    {"epoch": 1.09, "grad_norm": 1.0198591870033458, "learning_rate": 0.0001606572187050556, "loss": 0.8978, "step": 710},
    {"epoch": 1.1, "grad_norm": 1.1119975974231693, "learning_rate": 0.0001599440313533363, "loss": 0.8876, "step": 715},
    {"epoch": 1.11, "grad_norm": 1.190419344020418, "learning_rate": 0.00015922605285694215, "loss": 0.904, "step": 720},
    {"epoch": 1.11, "grad_norm": 2.1639792062638574, "learning_rate": 0.0001585033406017175, "loss": 0.9021, "step": 725},
    {"epoch": 1.12, "grad_norm": 1.2040942925245026, "learning_rate": 0.0001577759523518616, "loss": 0.9092, "step": 730},
    {"epoch": 1.13, "grad_norm": 1.0704322799863495, "learning_rate": 0.00015704394624531184, "loss": 0.8274, "step": 735},
    {"epoch": 1.14, "grad_norm": 1.0627822260269795, "learning_rate": 0.00015630738078909685, "loss": 0.8776, "step": 740},
    {"epoch": 1.14, "grad_norm": 0.9905597794182393, "learning_rate": 0.00015556631485466027, "loss": 0.9246, "step": 745},
    {"epoch": 1.15, "grad_norm": 1.173456919024251, "learning_rate": 0.00015482080767315528, "loss": 0.9656, "step": 750},
    {"epoch": 1.16, "grad_norm": 1.0873012836166025, "learning_rate": 0.00015407091883071054, "loss": 0.9464, "step": 755},
    {"epoch": 1.17, "grad_norm": 1.1473043693064626, "learning_rate": 0.00015331670826366754, "loss": 0.8496, "step": 760},
    {"epoch": 1.17, "grad_norm": 0.985606297895901, "learning_rate": 0.00015255823625379017, "loss": 0.853, "step": 765},
    {"epoch": 1.18, "grad_norm": 1.231267953892478, "learning_rate": 0.00015179556342344644, "loss": 0.8652, "step": 770},
    {"epoch": 1.19, "grad_norm": 1.0503811328789359, "learning_rate": 0.00015102875073076324, "loss": 0.9447, "step": 775},
    {"epoch": 1.2, "grad_norm": 1.0561678099502803, "learning_rate": 0.00015025785946475408, "loss": 0.879, "step": 780},
    {"epoch": 1.2, "grad_norm": 0.9127623837677151, "learning_rate": 0.00014948295124042057, "loss": 0.9144, "step": 785},
    {"epoch": 1.21, "grad_norm": 1.147282495230838, "learning_rate": 0.00014870408799382752, "loss": 0.9404, "step": 790},
    {"epoch": 1.22, "grad_norm": 1.0608780257484842, "learning_rate": 0.00014792133197715266, "loss": 0.9021, "step": 795},
    {"epoch": 1.23, "grad_norm": 1.2163076965365305, "learning_rate": 0.0001471347457537111, "loss": 0.915, "step": 800},
    {"epoch": 1.24, "grad_norm": 1.1177869922249701, "learning_rate": 0.00014634439219295478, "loss": 0.8648, "step": 805},
    {"epoch": 1.24, "grad_norm": 1.194327396103283, "learning_rate": 0.0001455503344654474, "loss": 0.9526, "step": 810},
    {"epoch": 1.25, "grad_norm": 1.0905268007576805, "learning_rate": 0.00014475263603781554, "loss": 0.8757, "step": 815},
    {"epoch": 1.26, "grad_norm": 0.9152140993127001, "learning_rate": 0.0001439513606676759, "loss": 0.8722, "step": 820},
    {"epoch": 1.27, "grad_norm": 0.9900853991789728, "learning_rate": 0.00014314657239853927, "loss": 0.8669, "step": 825},
    {"epoch": 1.27, "grad_norm": 1.0767926512086485, "learning_rate": 0.000142338335554692, "loss": 0.8841, "step": 830},
    {"epoch": 1.28, "grad_norm": 0.8724281805811482, "learning_rate": 0.00014152671473605428, "loss": 0.826, "step": 835},
    {"epoch": 1.29, "grad_norm": 0.9929237657179188, "learning_rate": 0.0001407117748130174, "loss": 0.8765, "step": 840},
    {"epoch": 1.3, "grad_norm": 1.0329119732814056, "learning_rate": 0.00013989358092125843, "loss": 0.879, "step": 845},
    {"epoch": 1.3, "grad_norm": 0.8699115976888, "learning_rate": 0.00013907219845653442, "loss": 0.8871, "step": 850},
    {"epoch": 1.31, "grad_norm": 2.0311070826655615, "learning_rate": 0.00013824769306945532, "loss": 0.9038, "step": 855},
    {"epoch": 1.32, "grad_norm": 1.2265447591069667, "learning_rate": 0.00013742013066023678, "loss": 0.8918, "step": 860},
    {"epoch": 1.33, "grad_norm": 1.2487739691504016, "learning_rate": 0.00013658957737343298, "loss": 0.8986, "step": 865},
    {"epoch": 1.34, "grad_norm": 0.9999833105305884, "learning_rate": 0.00013575609959264994, "loss": 0.9054, "step": 870},
    {"epoch": 1.34, "grad_norm": 1.0265246633147944, "learning_rate": 0.0001349197639352395, "loss": 0.8781, "step": 875},
    {"epoch": 1.35, "grad_norm": 1.2752835605974824, "learning_rate": 0.00013408063724697499, "loss": 0.9096, "step": 880},
    {"epoch": 1.36, "grad_norm": 1.0841900048671298, "learning_rate": 0.00013323878659670836, "loss": 0.8954, "step": 885},
    {"epoch": 1.37, "grad_norm": 0.9822221386317229, "learning_rate": 0.00013239427927100964, "loss": 0.9197, "step": 890},
    {"epoch": 1.37, "grad_norm": 1.0664898243823369, "learning_rate": 0.00013154718276878872, "loss": 0.8101, "step": 895},
    {"epoch": 1.38, "grad_norm": 0.9728647259678721, "learning_rate": 0.00013069756479590065, "loss": 0.8808, "step": 900},
    {"epoch": 1.39, "grad_norm": 1.233908623068181, "learning_rate": 0.00012984549325973394, "loss": 0.8942, "step": 905},
    {"epoch": 1.4, "grad_norm": 1.0507024710570716, "learning_rate": 0.000128991036263783, "loss": 0.8719, "step": 910},
    {"epoch": 1.4, "grad_norm": 0.9728551705551435, "learning_rate": 0.0001281342621022048, "loss": 0.8734, "step": 915},
    {"epoch": 1.41, "grad_norm": 0.9127829023580948, "learning_rate": 0.00012727523925436026, "loss": 0.8641, "step": 920},
    {"epoch": 1.42, "grad_norm": 1.114858935845165, "learning_rate": 0.00012641403637934112, "loss": 0.8989, "step": 925},
    {"epoch": 1.43, "grad_norm": 1.0021048323600934, "learning_rate": 0.00012555072231048192, "loss": 0.8757, "step": 930},
    {"epoch": 1.44, "grad_norm": 0.9850639243326992, "learning_rate": 0.00012468536604985867, "loss": 0.8595, "step": 935},
    {"epoch": 1.44, "grad_norm": 0.8816979962904702, "learning_rate": 0.00012381803676277345, "loss": 0.8854, "step": 940},
    {"epoch": 1.45, "grad_norm": 1.052381113604825, "learning_rate": 0.00012294880377222649, "loss": 0.8966, "step": 945},
    {"epoch": 1.46, "grad_norm": 0.9471916607767124, "learning_rate": 0.0001220777365533751, "loss": 0.8977, "step": 950},
    {"epoch": 1.47, "grad_norm": 0.9648558401941475, "learning_rate": 0.00012120490472798112, "loss": 0.8837, "step": 955},
    {"epoch": 1.47, "grad_norm": 0.9430401800136761, "learning_rate": 0.0001203303780588458, "loss": 0.9009, "step": 960},
    {"epoch": 1.48, "grad_norm": 0.9624197502185033, "learning_rate": 0.00011945422644423425, "loss": 0.8645, "step": 965},
    {"epoch": 1.49, "grad_norm": 0.9260810941977449, "learning_rate": 0.00011857651991228855, "loss": 0.8243, "step": 970},
    {"epoch": 1.5, "grad_norm": 0.9362374564085868, "learning_rate": 0.00011769732861543057, "loss": 0.886, "step": 975},
    {"epoch": 1.5, "grad_norm": 0.9293220698915177, "learning_rate": 0.00011681672282475495, "loss": 0.9028, "step": 980},
    {"epoch": 1.51, "grad_norm": 0.9026227086147156, "learning_rate": 0.00011593477292441251, "loss": 0.8253, "step": 985},
    {"epoch": 1.52, "grad_norm": 0.9630673700134131, "learning_rate": 0.00011505154940598468, "loss": 0.8686, "step": 990},
    {"epoch": 1.53, "grad_norm": 0.8694549872905225, "learning_rate": 0.00011416712286284943, "loss": 0.8782, "step": 995},
    {"epoch": 1.53, "grad_norm": 1.007303227993832, "learning_rate": 0.00011328156398453864, "loss": 0.8633, "step": 1000},
    {"epoch": 1.54, "grad_norm": 0.9083011875988664, "learning_rate": 0.00011239494355108848, "loss": 0.9039, "step": 1005},
    {"epoch": 1.55, "grad_norm": 1.0177463749412752, "learning_rate": 0.00011150733242738198, "loss": 0.9029, "step": 1010},
    {"epoch": 1.56, "grad_norm": 0.8631790225704151, "learning_rate": 0.00011061880155748497, "loss": 0.8385, "step": 1015},
    {"epoch": 1.57, "grad_norm": 1.5230132007370771, "learning_rate": 0.00010972942195897582, "loss": 0.9055, "step": 1020},
    {"epoch": 1.57, "grad_norm": 0.8997463696925398, "learning_rate": 0.00010883926471726926, "loss": 0.8656, "step": 1025},
    {"epoch": 1.58, "grad_norm": 0.9377827912335661, "learning_rate": 0.00010794840097993466, "loss": 0.9163, "step": 1030},
    {"epoch": 1.59, "grad_norm": 0.9887953595089554, "learning_rate": 0.00010705690195100939, "loss": 0.8789, "step": 1035},
    {"epoch": 1.6, "grad_norm": 1.0514798236504352, "learning_rate": 0.00010616483888530781, "loss": 0.9027, "step": 1040},
    {"epoch": 1.6, "grad_norm": 0.9297004988976107, "learning_rate": 0.00010527228308272605, "loss": 0.9473, "step": 1045},
    {"epoch": 1.61, "grad_norm": 1.079573310483164, "learning_rate": 0.0001043793058825431, "loss": 0.8308, "step": 1050},
    {"epoch": 1.62, "grad_norm": 0.8437863342382683, "learning_rate": 0.00010348597865771909, "loss": 0.9183, "step": 1055},
    {"epoch": 1.63, "grad_norm": 1.0314459958872142, "learning_rate": 0.00010259237280919054, "loss": 0.8965, "step": 1060},
    {"epoch": 1.63, "grad_norm": 1.1300004513750828, "learning_rate": 0.00010169855976016345, "loss": 0.9058, "step": 1065},
    {"epoch": 1.64, "grad_norm": 0.915751139105661, "learning_rate": 0.00010080461095040476, "loss": 0.8522, "step": 1070},
    {"epoch": 1.65, "grad_norm": 0.9282214710118697, "learning_rate": 9.991059783053244e-05, "loss": 0.893, "step": 1075},
    {"epoch": 1.66, "grad_norm": 0.9267500154050946, "learning_rate": 9.901659185630445e-05, "loss": 0.9187, "step": 1080},
    {"epoch": 1.67, "grad_norm": 0.9851835947209288, "learning_rate": 9.812266448290767e-05, "loss": 0.8489, "step": 1085},
    {"epoch": 1.67, "grad_norm": 0.9072684185548759, "learning_rate": 9.722888715924664e-05, "loss": 0.8598, "step": 1090},
    {"epoch": 1.68, "grad_norm": 1.0793886242538702, "learning_rate": 9.633533132223293e-05, "loss": 0.9136, "step": 1095},
    {"epoch": 1.69, "grad_norm": 0.9100627615470746, "learning_rate": 9.54420683910753e-05, "loss": 0.894, "step": 1100},
    {"epoch": 1.7, "grad_norm": 1.053250149548252, "learning_rate": 9.454916976157144e-05, "loss": 0.8604, "step": 1105},
    {"epoch": 1.7, "grad_norm": 1.054945196161409, "learning_rate": 9.365670680040157e-05, "loss": 0.8875, "step": 1110},
    {"epoch": 1.71, "grad_norm": 1.0344061549877617, "learning_rate": 9.276475083942416e-05, "loss": 0.8612, "step": 1115},
    {"epoch": 1.72, "grad_norm": 1.034253018495178, "learning_rate": 9.187337316997476e-05, "loss": 0.8901, "step": 1120},
    {"epoch": 1.73, "grad_norm": 0.9350006681122276, "learning_rate": 9.09826450371678e-05, "loss": 0.8913, "step": 1125},
    {"epoch": 1.73, "grad_norm": 1.1854492776108583, "learning_rate": 9.009263763420228e-05, "loss": 0.9029, "step": 1130},
    {"epoch": 1.74, "grad_norm": 1.0313744595376304, "learning_rate": 8.920342209667136e-05, "loss": 0.8399, "step": 1135},
    {"epoch": 1.75, "grad_norm": 1.109169472722535, "learning_rate": 8.831506949687685e-05, "loss": 0.8517, "step": 1140},
    {"epoch": 1.76, "grad_norm": 0.9979711332157661, "learning_rate": 8.74276508381486e-05, "loss": 0.8773, "step": 1145},
    {"epoch": 1.77, "grad_norm": 0.9829724669057092, "learning_rate": 8.654123704916927e-05, "loss": 0.879, "step": 1150},
    {"epoch": 1.77, "grad_norm": 0.99994654751558, "learning_rate": 8.565589897830543e-05, "loss": 0.8523, "step": 1155},
    {"epoch": 1.78, "grad_norm": 0.9478833521675312, "learning_rate": 8.47717073879447e-05, "loss": 0.8541, "step": 1160},
    {"epoch": 1.79, "grad_norm": 0.9034620075081865, "learning_rate": 8.388873294884e-05, "loss": 0.8918, "step": 1165},
    {"epoch": 1.8, "grad_norm": 0.8723934369993729, "learning_rate": 8.300704623446111e-05, "loss": 0.9133, "step": 1170},
    {"epoch": 1.8, "grad_norm": 0.9525657563232377, "learning_rate": 8.212671771535379e-05, "loss": 0.8997, "step": 1175},
    {"epoch": 1.81, "grad_norm": 0.9812331418181283, "learning_rate": 8.124781775350741e-05, "loss": 0.8877, "step": 1180},
    {"epoch": 1.82, "grad_norm": 1.0880452788188242, "learning_rate": 8.037041659673105e-05, "loss": 0.9202, "step": 1185},
    {"epoch": 1.83, "grad_norm": 1.097063406936711, "learning_rate": 7.949458437303891e-05, "loss": 0.9068, "step": 1190},
    {"epoch": 1.83, "grad_norm": 0.816681343841031, "learning_rate": 7.862039108504513e-05, "loss": 0.8774, "step": 1195},
    {"epoch": 1.84, "grad_norm": 1.0107773305774448, "learning_rate": 7.774790660436858e-05, "loss": 0.8973, "step": 1200},
    {"epoch": 1.85, "grad_norm": 1.151927912306129, "learning_rate": 7.687720066604844e-05, "loss": 0.8857, "step": 1205},
    {"epoch": 1.86, "grad_norm": 0.7574607659244479, "learning_rate": 7.600834286297035e-05, "loss": 0.8681, "step": 1210},
    {"epoch": 1.86, "grad_norm": 0.993971472764409, "learning_rate": 7.514140264030413e-05, "loss": 0.9421, "step": 1215},
    {"epoch": 1.87, "grad_norm": 1.1317554618491943, "learning_rate": 7.427644928995326e-05, "loss": 0.9151, "step": 1220},
    {"epoch": 1.88, "grad_norm": 0.9946380695782382, "learning_rate": 7.341355194501638e-05, "loss": 0.9331, "step": 1225},
    {"epoch": 1.89, "grad_norm": 0.9307204785839222, "learning_rate": 7.2552779574262e-05, "loss": 0.9324, "step": 1230},
    {"epoch": 1.9, "grad_norm": 0.8165240509268185, "learning_rate": 7.16942009766159e-05, "loss": 0.8972, "step": 1235},
    {"epoch": 1.9, "grad_norm": 1.0160286848183477, "learning_rate": 7.083788477566206e-05, "loss": 0.888, "step": 1240},
    {"epoch": 1.91, "grad_norm": 0.9588077943326088, "learning_rate": 6.998389941415811e-05, "loss": 0.8776, "step": 1245},
    {"epoch": 1.92, "grad_norm": 1.0027507311971422, "learning_rate": 6.913231314856467e-05, "loss": 0.892, "step": 1250},
    {"epoch": 1.93, "grad_norm": 0.8157424560652131, "learning_rate": 6.828319404358998e-05, "loss": 0.8611, "step": 1255},
    {"epoch": 1.93, "grad_norm": 0.9346949667365426, "learning_rate": 6.74366099667495e-05, "loss": 0.9041, "step": 1260},
    {"epoch": 1.94, "grad_norm": 0.9310369545608014, "learning_rate": 6.659262858294167e-05, "loss": 0.8348, "step": 1265},
    {"epoch": 1.95, "grad_norm": 0.916679588029739, "learning_rate": 6.575131734903952e-05, "loss": 0.8665, "step": 1270},
    {"epoch": 1.96, "grad_norm": 0.8335818591855719, "learning_rate": 6.491274350849914e-05, "loss": 0.8892, "step": 1275},
    {"epoch": 1.96, "grad_norm": 0.9831047716703826, "learning_rate": 6.407697408598497e-05, "loss": 0.8944, "step": 1280},
    {"epoch": 1.97, "grad_norm": 0.9268335345851625, "learning_rate": 6.324407588201292e-05, "loss": 0.8536, "step": 1285},
    {"epoch": 1.98, "grad_norm": 1.0086597114412514, "learning_rate": 6.241411546761109e-05, "loss": 0.8983, "step": 1290},
    {"epoch": 1.99, "grad_norm": 0.9455374773689034, "learning_rate": 6.158715917899893e-05, "loss": 0.8638, "step": 1295},
    {"epoch": 2.0, "grad_norm": 0.9581394982958279, "learning_rate": 6.076327311228522e-05, "loss": 0.907, "step": 1300},
    {"epoch": 2.0, "eval_loss": 1.1707841157913208, "eval_runtime": 252.7994, "eval_samples_per_second": 9.138, "eval_steps_per_second": 0.574, "step": 1303},
    {"epoch": 2.0, "grad_norm": 1.0117168856081882, "learning_rate": 5.99425231181853e-05, "loss": 0.8658, "step": 1305},
    {"epoch": 2.01, "grad_norm": 0.8313758221887009, "learning_rate": 5.9124974796757614e-05, "loss": 0.803, "step": 1310},
    {"epoch": 2.02, "grad_norm": 1.1429742233636946, "learning_rate": 5.831069349216069e-05, "loss": 0.854, "step": 1315},
    {"epoch": 2.03, "grad_norm": 1.2005681342348582, "learning_rate": 5.7499744287430366e-05, "loss": 0.8209, "step": 1320},
    {"epoch": 2.03, "grad_norm": 1.0882362071151326, "learning_rate": 5.6692191999277614e-05, "loss": 0.8182, "step": 1325},
    {"epoch": 2.04, "grad_norm": 1.0316332870317524, "learning_rate": 5.588810117290843e-05, "loss": 0.888, "step": 1330},
    {"epoch": 2.05, "grad_norm": 0.8415700670907011, "learning_rate": 5.508753607686452e-05, "loss": 0.8274, "step": 1335},
    {"epoch": 2.06, "grad_norm": 1.19035695841182, "learning_rate": 5.429056069788663e-05, "loss": 0.8587, "step": 1340},
    {"epoch": 2.06, "grad_norm": 0.8282530747648139, "learning_rate": 5.3497238735800456e-05, "loss": 0.8582, "step": 1345},
    {"epoch": 2.07, "grad_norm": 0.8236642419616373, "learning_rate": 5.2707633598425023e-05, "loss": 0.8242, "step": 1350},
    {"epoch": 2.08, "grad_norm": 0.9499844942270589, "learning_rate": 5.192180839650482e-05, "loss": 0.8419, "step": 1355},
    {"epoch": 2.09, "grad_norm": 0.9665172203162608, "learning_rate": 5.1139825938665706e-05, "loss": 0.8168, "step": 1360},
    {"epoch": 2.1, "grad_norm": 0.9570953639965174, "learning_rate": 5.036174872639443e-05, "loss": 0.7975, "step": 1365},
    {"epoch": 2.1, "grad_norm": 1.1769715973472896, "learning_rate": 4.95876389490435e-05, "loss": 0.8803, "step": 1370},
    {"epoch": 2.11, "grad_norm": 1.0737767072312128, "learning_rate": 4.8817558478860316e-05, "loss": 0.8392, "step": 1375},
    {"epoch": 2.12, "grad_norm": 1.0356976294210105, "learning_rate": 4.805156886604192e-05, "loss": 0.8427, "step": 1380},
    {"epoch": 2.13, "grad_norm": 0.9731310972525462, "learning_rate": 4.728973133381557e-05, "loss": 0.8422, "step": 1385},
    {"epoch": 2.13, "grad_norm": 1.0915183222326856, "learning_rate": 4.6532106773545356e-05, "loss": 0.8002, "step": 1390},
    {"epoch": 2.14, "grad_norm": 1.0419392375432739, "learning_rate": 4.5778755739865234e-05, "loss": 0.8035, "step": 1395},
    {"epoch": 2.15, "grad_norm": 1.034845804276428, "learning_rate": 4.5029738445839143e-05, "loss": 0.8633, "step": 1400},
    {"epoch": 2.16, "grad_norm": 0.9808221912256501, "learning_rate": 4.4285114758148385e-05, "loss": 0.8238, "step": 1405},
    {"epoch": 2.16, "grad_norm": 0.9518518180660357, "learning_rate": 4.3544944192306536e-05, "loss": 0.821, "step": 1410},
    {"epoch": 2.17, "grad_norm": 1.0002459852548866, "learning_rate": 4.2809285907902804e-05, "loss": 0.8153, "step": 1415},
    {"epoch": 2.18, "grad_norm": 0.9090185258143434, "learning_rate": 4.207819870387331e-05, "loss": 0.8724, "step": 1420},
    {"epoch": 2.19, "grad_norm": 0.8794828030312869, "learning_rate": 4.135174101380154e-05, "loss": 0.7995, "step": 1425},
    {"epoch": 2.19, "grad_norm": 0.8426944935252626, "learning_rate": 4.0629970901248125e-05, "loss": 0.878, "step": 1430},
    {"epoch": 2.2, "grad_norm": 1.0954146157200644, "learning_rate": 3.991294605510969e-05, "loss": 0.8605, "step": 1435},
    {"epoch": 2.21, "grad_norm": 1.0887405499363092, "learning_rate": 3.920072378500814e-05, "loss": 0.853, "step": 1440},
    {"epoch": 2.22, "grad_norm": 1.0547395334098018, "learning_rate": 3.849336101671015e-05, "loss": 0.8921, "step": 1445},
    {"epoch": 2.23, "grad_norm": 1.069888150135721, "learning_rate": 3.779091428757692e-05, "loss": 0.8161, "step": 1450},
    {"epoch": 2.23, "grad_norm": 0.9353341753814841, "learning_rate": 3.709343974204577e-05, "loss": 0.8179, "step": 1455},
    {"epoch": 2.24, "grad_norm": 1.0108851490383193, "learning_rate": 3.640099312714235e-05, "loss": 0.8385, "step": 1460},
    {"epoch": 2.25, "grad_norm": 1.064940293841458, "learning_rate": 3.5713629788025036e-05, "loss": 0.8135, "step": 1465},
    {"epoch": 2.26, "grad_norm": 1.1487280639099855, "learning_rate": 3.503140466356151e-05, "loss": 0.8021, "step": 1470},
    {"epoch": 2.26, "grad_norm": 0.9701769325400604, "learning_rate": 3.435437228193741e-05, "loss": 0.8399, "step": 1475},
    {"epoch": 2.27, "grad_norm": 0.9942305006661968, "learning_rate": 3.3682586756298185e-05, "loss": 0.8427, "step": 1480},
    {"epoch": 2.28, "grad_norm": 1.0099321403910597, "learning_rate": 3.3016101780424146e-05, "loss": 0.8693, "step": 1485},
    {"epoch": 2.29, "grad_norm": 1.169305983316981, "learning_rate": 3.235497062443852e-05, "loss": 0.8545, "step": 1490},
    {"epoch": 2.29, "grad_norm": 0.9289433553993577, "learning_rate": 3.169924613055003e-05, "loss": 0.7912, "step": 1495},
    {"epoch": 2.3, "grad_norm": 1.1184033184773026, "learning_rate": 3.10489807088294e-05, "loss": 0.8488, "step": 1500},
    {"epoch": 2.31, "grad_norm": 0.8651320149450171, "learning_rate": 3.0404226333020114e-05, "loss": 0.8063, "step": 1505},
    {"epoch": 2.32, "grad_norm": 1.015380797479418, "learning_rate": 2.976503453638452e-05, "loss": 0.8281, "step": 1510},
    {"epoch": 2.33, "grad_norm": 0.9105715510091448, "learning_rate": 2.9131456407584912e-05, "loss": 0.8396, "step": 1515},
    {"epoch": 2.33, "grad_norm": 0.8953944449056994, "learning_rate": 2.8503542586600095e-05, "loss": 0.8059, "step": 1520},
    {"epoch": 2.34, "grad_norm": 0.9445583773531011, "learning_rate": 2.7881343260677938e-05, "loss": 0.7555, "step": 1525},
    {"epoch": 2.35, "grad_norm": 0.9760863299166253, "learning_rate": 2.7264908160324044e-05, "loss": 0.872, "step": 1530},
    {"epoch": 2.36, "grad_norm": 0.9217062841634305, "learning_rate": 2.66542865553269e-05, "loss": 0.8772, "step": 1535},
    {"epoch": 2.36, "grad_norm": 0.9862000270745448, "learning_rate": 2.6049527250820048e-05, "loss": 0.8042, "step": 1540},
    {"epoch": 2.37, "grad_norm": 0.9372789886762245, "learning_rate": 2.5450678583381037e-05, "loss": 0.8373, "step": 1545},
    {"epoch": 2.38, "grad_norm": 0.9849583322670422, "learning_rate": 2.4857788417168082e-05, "loss": 0.8449, "step": 1550},
    {"epoch": 2.39, "grad_norm": 0.9531252163149414, "learning_rate": 2.4270904140094597e-05, "loss": 0.8204, "step": 1555},
    {"epoch": 2.39, "grad_norm": 1.1160019622624047, "learning_rate": 2.3690072660041373e-05, "loss": 0.857, "step": 1560},
    {"epoch": 2.4, "grad_norm": 1.0574865907973916, "learning_rate": 2.3115340401107487e-05, "loss": 0.8154, "step": 1565},
    {"epoch": 2.41, "grad_norm": 1.0014445052570997, "learning_rate": 2.254675329989988e-05, "loss": 0.8526, "step": 1570},
    {"epoch": 2.42, "grad_norm": 1.1337662282318417, "learning_rate": 2.1984356801861506e-05, "loss": 0.7529, "step": 1575},
    {"epoch": 2.43, "grad_norm": 1.0240125708817747, "learning_rate": 2.1428195857639256e-05, "loss": 0.8252, "step": 1580},
    {"epoch": 2.43, "grad_norm": 0.8777879796341344, "learning_rate": 2.0878314919491183e-05, "loss": 0.8349, "step": 1585},
    {"epoch": 2.44, "grad_norm": 0.9749914994991105, "learning_rate": 2.0334757937733374e-05, "loss": 0.8147, "step": 1590},
    {"epoch": 2.45, "grad_norm": 1.0047182891795483, "learning_rate": 1.9797568357227293e-05, "loss": 0.8225, "step": 1595},
    {"epoch": 2.46, "grad_norm": 1.102505598943093, "learning_rate": 1.92667891139074e-05, "loss": 0.8356, "step": 1600},
    {"epoch": 2.46, "grad_norm": 0.9052644924404957, "learning_rate": 1.8742462631349246e-05, "loss": 0.8509, "step": 1605},
    {"epoch": 2.47, "grad_norm": 0.9927305294402076, "learning_rate": 1.822463081737883e-05, "loss": 0.8182, "step": 1610},
    {"epoch": 2.48, "grad_norm": 1.0563552318088962, "learning_rate": 1.7713335060722946e-05, "loss": 0.7578, "step": 1615},
    {"epoch": 2.49, "grad_norm": 1.0170016112678426, "learning_rate": 1.720861622770116e-05, "loss": 0.7562, "step": 1620},
    {"epoch": 2.49, "grad_norm": 0.9404587306406071, "learning_rate": 1.671051465895953e-05, "loss": 0.786, "step": 1625},
    {"epoch": 2.5, "grad_norm": 1.0171241452919628, "learning_rate": 1.6219070166246154e-05, "loss": 0.8616, "step": 1630},
    {"epoch": 2.51, "grad_norm": 1.0322115593813674, "learning_rate": 1.5734322029229253e-05, "loss": 0.8592, "step": 1635},
    {"epoch": 2.52, "grad_norm": 1.0192391783063095, "learning_rate": 1.5256308992357716e-05, "loss": 0.8372, "step": 1640},
    {"epoch": 2.52, "grad_norm": 1.17212669377524, "learning_rate": 1.4785069261764184e-05, "loss": 0.7713, "step": 1645},
    {"epoch": 2.53, "grad_norm": 1.0439043355220827, "learning_rate": 1.4320640502211536e-05, "loss": 0.8379, "step": 1650},
    {"epoch": 2.54, "grad_norm": 0.9140306646774222, "learning_rate": 1.386305983408236e-05, "loss": 0.8041, "step": 1655},
    {"epoch": 2.55, "grad_norm": 1.061796317725875, "learning_rate": 1.3412363830412078e-05, "loss": 0.8318, "step": 1660},
    {"epoch": 2.56, "grad_norm": 1.0834691203563807, "learning_rate": 1.2968588513965706e-05, "loss": 0.851, "step": 1665},
    {"epoch": 2.56, "grad_norm": 0.9887481512630522, "learning_rate": 1.2531769354358825e-05, "loss": 0.8728, "step": 1670},
    {"epoch": 2.57, "grad_norm": 1.058272033476656, "learning_rate": 1.2101941265222373e-05, "loss": 0.8016, "step": 1675},
    {"epoch": 2.58, "grad_norm": 1.072187651154863, "learning_rate": 1.1679138601412255e-05, "loss": 0.8505, "step": 1680},
    {"epoch": 2.59, "grad_norm": 1.2017961960493708, "learning_rate": 1.126339515626349e-05, "loss": 0.8601, "step": 1685},
    {"epoch": 2.59, "grad_norm": 0.9334846571871207, "learning_rate": 1.0854744158889085e-05, "loss": 0.8178, "step": 1690},
    {"epoch": 2.6, "grad_norm": 0.9385801565830063, "learning_rate": 1.0453218271524224e-05, "loss": 0.8155, "step": 1695},
    {"epoch": 2.61, "grad_norm": 1.0026372920490085, "learning_rate": 1.0058849586915653e-05, "loss": 0.8463, "step": 1700},
    {"epoch": 2.62, "grad_norm": 1.0094111622202961, "learning_rate": 9.671669625756574e-06, "loss": 0.8291, "step": 1705},
    {"epoch": 2.62, "grad_norm": 1.0498204906448678, "learning_rate": 9.291709334167397e-06, "loss": 0.8694, "step": 1710},
    {"epoch": 2.63, "grad_norm": 0.9108848375007711, "learning_rate": 8.918999081222156e-06, "loss": 0.8154, "step": 1715},
    {"epoch": 2.64, "grad_norm": 1.1045628537182532, "learning_rate": 8.553568656521293e-06, "loss": 0.8542, "step": 1720},
    {"epoch": 2.65, "grad_norm": 0.9187751707089163, "learning_rate": 8.195447267810686e-06, "loss": 0.8847, "step": 1725},
    {"epoch": 2.66, "grad_norm": 1.0976708579458017, "learning_rate": 7.844663538647101e-06, "loss": 0.8168, "step": 1730},
    {"epoch": 2.66, "grad_norm": 1.0277757622039416, "learning_rate": 7.501245506110433e-06, "loss": 0.824, "step": 1735},
    {"epoch": 2.67, "grad_norm": 0.9800736010664038, "learning_rate": 7.165220618562751e-06, "loss": 0.8499, "step": 1740},
    {"epoch": 2.68, "grad_norm": 1.1066605466255004, "learning_rate": 6.83661573345451e-06, "loss": 0.869, "step": 1745},
    {"epoch": 2.69, "grad_norm": 1.0217949991902664, "learning_rate": 6.515457115177803e-06, "loss": 0.8308, "step": 1750},
    {"epoch": 2.69, "grad_norm": 1.0285372086807927, "learning_rate": 6.20177043296728e-06, "loss": 0.8513, "step": 1755},
    {"epoch": 2.7, "grad_norm": 0.9228807374530837, "learning_rate": 5.895580758848318e-06, "loss": 0.8359, "step": 1760},
    {"epoch": 2.71, "grad_norm": 1.042562235261924, "learning_rate": 5.596912565633184e-06, "loss": 0.8144, "step": 1765},
    {"epoch": 2.72, "grad_norm": 1.098352136508281, "learning_rate": 5.305789724965038e-06, "loss": 0.8253, "step": 1770},
    {"epoch": 2.72, "grad_norm": 0.9667182261715308, "learning_rate": 5.022235505409823e-06, "loss": 0.8672, "step": 1775},
    {"epoch": 2.73, "grad_norm": 1.019562250529469, "learning_rate": 4.746272570596555e-06, "loss": 0.8214, "step": 1780},
    {"epoch": 2.74, "grad_norm": 1.2161564555157138, "learning_rate": 4.477922977405913e-06, "loss": 0.8113, "step": 1785},
    {"epoch": 2.75, "grad_norm": 1.0015900069881538, "learning_rate": 4.217208174207199e-06, "loss": 0.8484, "step": 1790},
    {"epoch": 2.76, "grad_norm": 0.9631542498199687, "learning_rate": 3.964148999144202e-06, "loss": 0.853, "step": 1795},
    {"epoch": 2.76, "grad_norm": 0.9926634902794105, "learning_rate": 3.71876567846946e-06, "loss": 0.8488, "step": 1800},
    {"epoch": 2.77, "grad_norm": 0.8438960246283821, "learning_rate": 3.481077824927792e-06, "loss": 0.829, "step": 1805},
    {"epoch": 2.78, "grad_norm": 1.0027991142986536, "learning_rate": 3.251104436188679e-06, "loss": 0.8416, "step": 1810},
    {"epoch": 2.79, "grad_norm": 1.0903635867609474, "learning_rate": 3.0288638933277934e-06, "loss": 0.8065, "step": 1815},
    {"epoch": 2.79, "grad_norm": 1.0073217221247301, "learning_rate": 2.8143739593578856e-06, "loss": 0.7876, "step": 1820},
    {"epoch": 2.8, "grad_norm": 0.8171096137144128, "learning_rate": 2.607651777809039e-06, "loss": 0.7574, "step": 1825},
    {"epoch": 2.81, "grad_norm": 0.9374489147765585, "learning_rate": 2.4087138713584367e-06, "loss": 0.8652, "step": 1830},
    {"epoch": 2.82, "grad_norm": 1.054601585067846, "learning_rate": 2.2175761405097584e-06, "loss": 0.8471, "step": 1835},
    {"epoch": 2.82, "grad_norm": 1.064632352850785, "learning_rate": 2.0342538623222997e-06, "loss": 0.8189, "step": 1840},
    {"epoch": 2.83, "grad_norm": 0.8546672110619797, "learning_rate": 1.8587616891899363e-06, "loss": 0.7818, "step": 1845},
    {"epoch": 2.84, "grad_norm": 0.9645255792031215, "learning_rate": 1.6911136476699508e-06, "loss": 0.7776, "step": 1850},
    {"epoch": 2.85, "grad_norm": 1.068610540574793, "learning_rate": 1.5313231373619952e-06, "loss": 0.7857, "step": 1855},
    {"epoch": 2.85, "grad_norm": 1.100889943747955, "learning_rate": 1.3794029298370814e-06, "loss": 0.8141, "step": 1860},
    {"epoch": 2.86, "grad_norm": 0.9783722379990912, "learning_rate": 1.2353651676167643e-06, "loss": 0.8358, "step": 1865},
    {"epoch": 2.87, "grad_norm": 0.8803200725105156, "learning_rate": 1.0992213632026517e-06, "loss": 0.8678, "step": 1870},
    {"epoch": 2.88, "grad_norm": 1.1249544133659088, "learning_rate": 9.709823981562282e-07, "loss": 0.8078, "step": 1875},
    {"epoch": 2.89, "grad_norm": 1.056395258554381, "learning_rate": 8.506585222291752e-07, "loss": 0.7749, "step": 1880},
    {"epoch": 2.89, "grad_norm": 0.9253933119169012, "learning_rate": 7.382593525440573e-07, "loss": 0.8008, "step": 1885},
    {"epoch": 2.9, "grad_norm": 0.9212088582083587, "learning_rate": 6.337938728257054e-07, "loss": 0.8333, "step": 1890},
    {"epoch": 2.91, "grad_norm": 1.100818174265163, "learning_rate": 5.372704326831901e-07, "loss": 0.8342, "step": 1895},
    {"epoch": 2.92, "grad_norm": 0.8164157006379008, "learning_rate": 4.486967469424008e-07, "loss": 0.8583, "step": 1900},
    {"epoch": 2.92, "grad_norm": 1.0674250153511942, "learning_rate": 3.6807989502949394e-07, "loss": 0.8213, "step": 1905},
    {"epoch": 2.93, "grad_norm": 0.9095567427952401, "learning_rate": 2.954263204050123e-07, "loss": 0.8252, "step": 1910},
    {"epoch": 2.94, "grad_norm": 1.0738638532567681, "learning_rate": 2.3074183004887505e-07, "loss": 0.8448, "step": 1915},
    {"epoch": 2.95, "grad_norm": 1.1019761677075681, "learning_rate": 1.7403159399629332e-07, "loss": 0.8088, "step": 1920},
    {"epoch": 2.95, "grad_norm": 0.9934974979812884, "learning_rate": 1.2530014492446728e-07, "loss": 0.8029, "step": 1925},
    {"epoch": 2.96, "grad_norm": 1.0433381527945313, "learning_rate": 8.455137779038724e-08, "loss": 0.8209, "step": 1930},
    {"epoch": 2.97, "grad_norm": 1.032700859857025, "learning_rate": 5.1788549519438124e-08, "loss": 0.7365, "step": 1935},
    {"epoch": 2.98, "grad_norm": 0.9408894102132038, "learning_rate": 2.7014278745163268e-08, "loss": 0.8451, "step": 1940},
    {"epoch": 2.99, "grad_norm": 1.0657773929136594, "learning_rate": 1.0230545599909658e-08, "loss": 0.8246, "step": 1945},
    {"epoch": 2.99, "grad_norm": 1.0107360164317165, "learning_rate": 1.438691556565619e-09, "loss": 0.8209, "step": 1950},
    {"epoch": 3.0, "eval_loss": 1.1822351217269897, "eval_runtime": 252.7889, "eval_samples_per_second": 9.138, "eval_steps_per_second": 0.574, "step": 1953},
    {"epoch": 3.0, "step": 1953, "total_flos": 4109935435055104.0, "train_loss": 1.1336081770219621, "train_runtime": 21652.8936, "train_samples_per_second": 2.888, "train_steps_per_second": 0.09}
  ],
  "logging_steps": 5,
  "max_steps": 1953,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 4109935435055104.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}