{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1730,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07,
      "grad_norm": 4.66322660446167,
      "learning_rate": 2.8901734104046245e-05,
      "loss": 2.8724,
      "step": 25
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.364796161651611,
      "learning_rate": 5.780346820809249e-05,
      "loss": 1.8669,
      "step": 50
    },
    {
      "epoch": 0.22,
      "grad_norm": 7.0544962882995605,
      "learning_rate": 8.670520231213874e-05,
      "loss": 1.0394,
      "step": 75
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.588165521621704,
      "learning_rate": 0.00011560693641618498,
      "loss": 0.8095,
      "step": 100
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.311723470687866,
      "learning_rate": 0.00014450867052023122,
      "loss": 0.7673,
      "step": 125
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.0632615089416504,
      "learning_rate": 0.00017341040462427748,
      "loss": 0.7738,
      "step": 150
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.7875710725784302,
      "learning_rate": 0.00019974309569685292,
      "loss": 0.7151,
      "step": 175
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.7314724922180176,
      "learning_rate": 0.00019653179190751445,
      "loss": 0.6782,
      "step": 200
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.1410841941833496,
      "learning_rate": 0.00019332048811817598,
      "loss": 0.6733,
      "step": 225
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6796475648880005,
      "learning_rate": 0.0001901091843288375,
      "loss": 0.711,
      "step": 250
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.9651137590408325,
      "learning_rate": 0.00018689788053949903,
      "loss": 0.6285,
      "step": 275
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.9334725141525269,
      "learning_rate": 0.00018368657675016056,
      "loss": 0.6387,
      "step": 300
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.9697507619857788,
      "learning_rate": 0.00018047527296082209,
      "loss": 0.6203,
      "step": 325
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.5927447080612183,
      "learning_rate": 0.0001772639691714836,
      "loss": 0.6036,
      "step": 350
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.5775573253631592,
      "learning_rate": 0.00017405266538214514,
      "loss": 0.5443,
      "step": 375
    },
    {
      "epoch": 1.16,
      "grad_norm": 2.265123128890991,
      "learning_rate": 0.0001708413615928067,
      "loss": 0.5864,
      "step": 400
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.9081721305847168,
      "learning_rate": 0.00016763005780346822,
      "loss": 0.497,
      "step": 425
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.7822768688201904,
      "learning_rate": 0.00016441875401412975,
      "loss": 0.5652,
      "step": 450
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.585552453994751,
      "learning_rate": 0.00016120745022479128,
      "loss": 0.5356,
      "step": 475
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.8936752080917358,
      "learning_rate": 0.0001579961464354528,
      "loss": 0.4897,
      "step": 500
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.7502835988998413,
      "learning_rate": 0.00015478484264611433,
      "loss": 0.5051,
      "step": 525
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.6106928586959839,
      "learning_rate": 0.00015157353885677586,
      "loss": 0.5034,
      "step": 550
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.6488401889801025,
      "learning_rate": 0.00014836223506743738,
      "loss": 0.4692,
      "step": 575
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.5491576194763184,
      "learning_rate": 0.0001451509312780989,
      "loss": 0.5016,
      "step": 600
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.7753480672836304,
      "learning_rate": 0.00014193962748876044,
      "loss": 0.4836,
      "step": 625
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.5605947971343994,
      "learning_rate": 0.00013872832369942197,
      "loss": 0.5059,
      "step": 650
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.4864176511764526,
      "learning_rate": 0.0001355170199100835,
      "loss": 0.4667,
      "step": 675
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.6282180547714233,
      "learning_rate": 0.00013230571612074502,
      "loss": 0.4208,
      "step": 700
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.9214680194854736,
      "learning_rate": 0.00012909441233140655,
      "loss": 0.3733,
      "step": 725
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.9027807712554932,
      "learning_rate": 0.00012588310854206808,
      "loss": 0.3831,
      "step": 750
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.756855845451355,
      "learning_rate": 0.0001226718047527296,
      "loss": 0.3876,
      "step": 775
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.5269505977630615,
      "learning_rate": 0.00011946050096339114,
      "loss": 0.3847,
      "step": 800
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.7476940155029297,
      "learning_rate": 0.00011624919717405267,
      "loss": 0.3604,
      "step": 825
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.8043824434280396,
      "learning_rate": 0.0001130378933847142,
      "loss": 0.384,
      "step": 850
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.4151132106781006,
      "learning_rate": 0.00010982658959537572,
      "loss": 0.3745,
      "step": 875
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.907926082611084,
      "learning_rate": 0.00010661528580603725,
      "loss": 0.372,
      "step": 900
    },
    {
      "epoch": 2.67,
      "grad_norm": 1.8263049125671387,
      "learning_rate": 0.00010340398201669879,
      "loss": 0.3972,
      "step": 925
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.8957775831222534,
      "learning_rate": 0.00010019267822736032,
      "loss": 0.3967,
      "step": 950
    },
    {
      "epoch": 2.82,
      "grad_norm": 1.772609829902649,
      "learning_rate": 9.698137443802185e-05,
      "loss": 0.3775,
      "step": 975
    },
    {
      "epoch": 2.89,
      "grad_norm": 1.976365566253662,
      "learning_rate": 9.377007064868337e-05,
      "loss": 0.4043,
      "step": 1000
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.689432978630066,
      "learning_rate": 9.05587668593449e-05,
      "loss": 0.3226,
      "step": 1025
    },
    {
      "epoch": 3.03,
      "grad_norm": 1.9092849493026733,
      "learning_rate": 8.734746307000643e-05,
      "loss": 0.3758,
      "step": 1050
    },
    {
      "epoch": 3.11,
      "grad_norm": 1.7606756687164307,
      "learning_rate": 8.413615928066796e-05,
      "loss": 0.3095,
      "step": 1075
    },
    {
      "epoch": 3.18,
      "grad_norm": 1.8509767055511475,
      "learning_rate": 8.092485549132948e-05,
      "loss": 0.2712,
      "step": 1100
    },
    {
      "epoch": 3.25,
      "grad_norm": 1.5414364337921143,
      "learning_rate": 7.771355170199101e-05,
      "loss": 0.2695,
      "step": 1125
    },
    {
      "epoch": 3.32,
      "grad_norm": 1.8055434226989746,
      "learning_rate": 7.450224791265255e-05,
      "loss": 0.2828,
      "step": 1150
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.0254344940185547,
      "learning_rate": 7.129094412331408e-05,
      "loss": 0.2729,
      "step": 1175
    },
    {
      "epoch": 3.47,
      "grad_norm": 2.2866463661193848,
      "learning_rate": 6.80796403339756e-05,
      "loss": 0.3022,
      "step": 1200
    },
    {
      "epoch": 3.54,
      "grad_norm": 1.5647475719451904,
      "learning_rate": 6.486833654463712e-05,
      "loss": 0.2893,
      "step": 1225
    },
    {
      "epoch": 3.61,
      "grad_norm": 1.9078121185302734,
      "learning_rate": 6.165703275529865e-05,
      "loss": 0.2751,
      "step": 1250
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.9310845136642456,
      "learning_rate": 5.844572896596018e-05,
      "loss": 0.2614,
      "step": 1275
    },
    {
      "epoch": 3.76,
      "grad_norm": 1.5173165798187256,
      "learning_rate": 5.523442517662171e-05,
      "loss": 0.2781,
      "step": 1300
    },
    {
      "epoch": 3.83,
      "grad_norm": 2.2908642292022705,
      "learning_rate": 5.2023121387283234e-05,
      "loss": 0.2889,
      "step": 1325
    },
    {
      "epoch": 3.9,
      "grad_norm": 2.0346012115478516,
      "learning_rate": 4.881181759794477e-05,
      "loss": 0.2997,
      "step": 1350
    },
    {
      "epoch": 3.97,
      "grad_norm": 1.6963484287261963,
      "learning_rate": 4.56005138086063e-05,
      "loss": 0.2415,
      "step": 1375
    },
    {
      "epoch": 4.05,
      "grad_norm": 1.552463173866272,
      "learning_rate": 4.238921001926782e-05,
      "loss": 0.229,
      "step": 1400
    },
    {
      "epoch": 4.12,
      "grad_norm": 2.3235974311828613,
      "learning_rate": 3.917790622992935e-05,
      "loss": 0.1927,
      "step": 1425
    },
    {
      "epoch": 4.19,
      "grad_norm": 1.6171674728393555,
      "learning_rate": 3.596660244059088e-05,
      "loss": 0.2116,
      "step": 1450
    },
    {
      "epoch": 4.26,
      "grad_norm": 1.9670432806015015,
      "learning_rate": 3.275529865125241e-05,
      "loss": 0.207,
      "step": 1475
    },
    {
      "epoch": 4.34,
      "grad_norm": 2.1716980934143066,
      "learning_rate": 2.9543994861913938e-05,
      "loss": 0.2023,
      "step": 1500
    },
    {
      "epoch": 4.41,
      "grad_norm": 2.0103349685668945,
      "learning_rate": 2.6332691072575465e-05,
      "loss": 0.2081,
      "step": 1525
    },
    {
      "epoch": 4.48,
      "grad_norm": 2.488182544708252,
      "learning_rate": 2.3121387283236996e-05,
      "loss": 0.2206,
      "step": 1550
    },
    {
      "epoch": 4.55,
      "grad_norm": 2.150543451309204,
      "learning_rate": 1.9910083493898523e-05,
      "loss": 0.1989,
      "step": 1575
    },
    {
      "epoch": 4.62,
      "grad_norm": 1.952592134475708,
      "learning_rate": 1.6698779704560053e-05,
      "loss": 0.2074,
      "step": 1600
    },
    {
      "epoch": 4.7,
      "grad_norm": 1.8111902475357056,
      "learning_rate": 1.348747591522158e-05,
      "loss": 0.2122,
      "step": 1625
    },
    {
      "epoch": 4.77,
      "grad_norm": 2.595923662185669,
      "learning_rate": 1.027617212588311e-05,
      "loss": 0.191,
      "step": 1650
    },
    {
      "epoch": 4.84,
      "grad_norm": 2.17901611328125,
      "learning_rate": 7.064868336544637e-06,
      "loss": 0.2205,
      "step": 1675
    },
    {
      "epoch": 4.91,
      "grad_norm": 2.284546136856079,
      "learning_rate": 3.853564547206165e-06,
      "loss": 0.2239,
      "step": 1700
    },
    {
      "epoch": 4.99,
      "grad_norm": 2.4415647983551025,
      "learning_rate": 6.422607578676943e-07,
      "loss": 0.1967,
      "step": 1725
    }
  ],
  "logging_steps": 25,
  "max_steps": 1730,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 7.5248089823232e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}