|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.262571103526735, |
|
"eval_steps": 5000, |
|
"global_step": 55048, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11376564277588168, |
|
"grad_norm": 4.343299865722656, |
|
"learning_rate": 7.960000000000001e-05, |
|
"loss": 4.3757, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22753128555176336, |
|
"grad_norm": 3.4980385303497314, |
|
"learning_rate": 0.0001596, |
|
"loss": 1.7997, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 3.350770950317383, |
|
"learning_rate": 0.0002396, |
|
"loss": 1.4879, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4550625711035267, |
|
"grad_norm": 3.362269163131714, |
|
"learning_rate": 0.0003196, |
|
"loss": 1.3834, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"grad_norm": 2.424733877182007, |
|
"learning_rate": 0.0003996, |
|
"loss": 1.3351, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"eval_accuracy": 0.668216, |
|
"eval_loss": 1.3343349695205688, |
|
"eval_runtime": 15.5029, |
|
"eval_samples_per_second": 16125.98, |
|
"eval_steps_per_second": 31.542, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 2.601680040359497, |
|
"learning_rate": 0.00047960000000000006, |
|
"loss": 1.3097, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7963594994311718, |
|
"grad_norm": 1.9989418983459473, |
|
"learning_rate": 0.0005596, |
|
"loss": 1.2866, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9101251422070534, |
|
"grad_norm": 1.8081185817718506, |
|
"learning_rate": 0.0006396, |
|
"loss": 1.2689, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.023890784982935, |
|
"grad_norm": 1.6405588388442993, |
|
"learning_rate": 0.00071952, |
|
"loss": 1.2487, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"grad_norm": 1.22613525390625, |
|
"learning_rate": 0.00079952, |
|
"loss": 1.2165, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"eval_accuracy": 0.686196, |
|
"eval_loss": 1.2481120824813843, |
|
"eval_runtime": 15.8446, |
|
"eval_samples_per_second": 15778.257, |
|
"eval_steps_per_second": 30.862, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.2514220705346986, |
|
"grad_norm": 1.2695167064666748, |
|
"learning_rate": 0.0007996786565611985, |
|
"loss": 1.2046, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.36518771331058, |
|
"grad_norm": 1.2521305084228516, |
|
"learning_rate": 0.0007987086748436788, |
|
"loss": 1.1849, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.4789533560864618, |
|
"grad_norm": 1.19619619846344, |
|
"learning_rate": 0.0007970896788508052, |
|
"loss": 1.1534, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.5927189988623436, |
|
"grad_norm": 1.0483107566833496, |
|
"learning_rate": 0.0007948275336376884, |
|
"loss": 1.1312, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"grad_norm": 1.3618515729904175, |
|
"learning_rate": 0.0007919213896323948, |
|
"loss": 1.112, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"eval_accuracy": 0.716556, |
|
"eval_loss": 1.1176625490188599, |
|
"eval_runtime": 15.2386, |
|
"eval_samples_per_second": 16405.688, |
|
"eval_steps_per_second": 32.09, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.820250284414107, |
|
"grad_norm": 0.9307771325111389, |
|
"learning_rate": 0.0007883817747762077, |
|
"loss": 1.0986, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.9340159271899886, |
|
"grad_norm": 0.9189246296882629, |
|
"learning_rate": 0.0007842073597303121, |
|
"loss": 1.0847, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.04778156996587, |
|
"grad_norm": 0.8016377687454224, |
|
"learning_rate": 0.0007794081581686037, |
|
"loss": 1.0506, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.161547212741752, |
|
"grad_norm": 0.9774219989776611, |
|
"learning_rate": 0.0007739919744091065, |
|
"loss": 1.0158, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"grad_norm": 0.8449124693870544, |
|
"learning_rate": 0.0007679676160878387, |
|
"loss": 1.0138, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"eval_accuracy": 0.732636, |
|
"eval_loss": 1.052935242652893, |
|
"eval_runtime": 15.2399, |
|
"eval_samples_per_second": 16404.275, |
|
"eval_steps_per_second": 32.087, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.3890784982935154, |
|
"grad_norm": 0.9211858510971069, |
|
"learning_rate": 0.0007613448798360993, |
|
"loss": 1.0113, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.502844141069397, |
|
"grad_norm": 0.7870326042175293, |
|
"learning_rate": 0.0007541345353494786, |
|
"loss": 1.0024, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.616609783845279, |
|
"grad_norm": 0.8683303594589233, |
|
"learning_rate": 0.0007463483078745015, |
|
"loss": 1.0032, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.73037542662116, |
|
"grad_norm": 1.031267523765564, |
|
"learning_rate": 0.000738007485475254, |
|
"loss": 0.9961, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"grad_norm": 0.8440726399421692, |
|
"learning_rate": 0.0007291089356699791, |
|
"loss": 0.9909, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"eval_accuracy": 0.741524, |
|
"eval_loss": 1.0143921375274658, |
|
"eval_runtime": 14.9299, |
|
"eval_samples_per_second": 16744.96, |
|
"eval_steps_per_second": 32.753, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.9579067121729237, |
|
"grad_norm": 0.708281934261322, |
|
"learning_rate": 0.0007196848947861554, |
|
"loss": 0.9832, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.0716723549488054, |
|
"grad_norm": 0.692997395992279, |
|
"learning_rate": 0.000709742030952583, |
|
"loss": 0.9383, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.185437997724687, |
|
"grad_norm": 0.8190609216690063, |
|
"learning_rate": 0.0006992857783851634, |
|
"loss": 0.9193, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.299203640500569, |
|
"grad_norm": 0.7791016697883606, |
|
"learning_rate": 0.0006883428362373026, |
|
"loss": 0.9197, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"grad_norm": 0.6834008693695068, |
|
"learning_rate": 0.0006769309995941914, |
|
"loss": 0.9236, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"eval_accuracy": 0.748324, |
|
"eval_loss": 0.9886829257011414, |
|
"eval_runtime": 14.8544, |
|
"eval_samples_per_second": 16830.07, |
|
"eval_steps_per_second": 32.92, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.526734926052332, |
|
"grad_norm": 0.7945353388786316, |
|
"learning_rate": 0.0006650809067991791, |
|
"loss": 0.9259, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.640500568828214, |
|
"grad_norm": 0.7771942019462585, |
|
"learning_rate": 0.000652788107427868, |
|
"loss": 0.924, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.7542662116040955, |
|
"grad_norm": 0.7232080101966858, |
|
"learning_rate": 0.0006400842315977677, |
|
"loss": 0.9149, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.868031854379977, |
|
"grad_norm": 0.6129056215286255, |
|
"learning_rate": 0.0006270032202430253, |
|
"loss": 0.9142, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"grad_norm": 0.7053471803665161, |
|
"learning_rate": 0.0006135401606551002, |
|
"loss": 0.914, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"eval_accuracy": 0.754936, |
|
"eval_loss": 0.9585933089256287, |
|
"eval_runtime": 15.1971, |
|
"eval_samples_per_second": 16450.463, |
|
"eval_steps_per_second": 32.177, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.09556313993174, |
|
"grad_norm": 0.7185536623001099, |
|
"learning_rate": 0.0005997438247807972, |
|
"loss": 0.8508, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.2093287827076225, |
|
"grad_norm": 0.7729761600494385, |
|
"learning_rate": 0.0005856090312640852, |
|
"loss": 0.8434, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 4.323094425483504, |
|
"grad_norm": 0.8405170440673828, |
|
"learning_rate": 0.0005711724058927512, |
|
"loss": 0.8442, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.436860068259386, |
|
"grad_norm": 0.6555745005607605, |
|
"learning_rate": 0.0005564574250751392, |
|
"loss": 0.852, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"grad_norm": 0.6500961184501648, |
|
"learning_rate": 0.0005415031062964693, |
|
"loss": 0.849, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"eval_accuracy": 0.758092, |
|
"eval_loss": 0.9504426121711731, |
|
"eval_runtime": 14.8523, |
|
"eval_samples_per_second": 16832.413, |
|
"eval_steps_per_second": 32.924, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.664391353811149, |
|
"grad_norm": 0.6345399618148804, |
|
"learning_rate": 0.0005263038333083039, |
|
"loss": 0.8493, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.778156996587031, |
|
"grad_norm": 0.7285176515579224, |
|
"learning_rate": 0.0005108991688044689, |
|
"loss": 0.8505, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.891922639362912, |
|
"grad_norm": 0.7443712949752808, |
|
"learning_rate": 0.000495314163389589, |
|
"loss": 0.8515, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.005688282138794, |
|
"grad_norm": 0.6659076809883118, |
|
"learning_rate": 0.0004795899698565036, |
|
"loss": 0.8403, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"grad_norm": 0.7172214984893799, |
|
"learning_rate": 0.00046372068282238195, |
|
"loss": 0.7614, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"eval_accuracy": 0.761088, |
|
"eval_loss": 0.9495302438735962, |
|
"eval_runtime": 15.3396, |
|
"eval_samples_per_second": 16297.69, |
|
"eval_steps_per_second": 31.878, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.233219567690558, |
|
"grad_norm": 0.8739346861839294, |
|
"learning_rate": 0.0004477477751198958, |
|
"loss": 0.767, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 5.346985210466439, |
|
"grad_norm": 0.8742613196372986, |
|
"learning_rate": 0.0004316972214137623, |
|
"loss": 0.7723, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 5.460750853242321, |
|
"grad_norm": 0.7512331604957581, |
|
"learning_rate": 0.00041559512263430705, |
|
"loss": 0.7738, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.5745164960182025, |
|
"grad_norm": 0.7592815160751343, |
|
"learning_rate": 0.00039948379493191056, |
|
"loss": 0.7735, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"grad_norm": 0.6751989722251892, |
|
"learning_rate": 0.00038335718753151784, |
|
"loss": 0.7726, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"eval_accuracy": 0.763736, |
|
"eval_loss": 0.9361330270767212, |
|
"eval_runtime": 17.5361, |
|
"eval_samples_per_second": 14256.306, |
|
"eval_steps_per_second": 27.885, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.802047781569966, |
|
"grad_norm": 0.7502247095108032, |
|
"learning_rate": 0.0003672737214802269, |
|
"loss": 0.7735, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.915813424345847, |
|
"grad_norm": 0.7880488038063049, |
|
"learning_rate": 0.000351227356466713, |
|
"loss": 0.7736, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 6.0295790671217295, |
|
"grad_norm": 0.7424056529998779, |
|
"learning_rate": 0.0003352762228480271, |
|
"loss": 0.7455, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 6.143344709897611, |
|
"grad_norm": 0.8073525428771973, |
|
"learning_rate": 0.0003194143300116524, |
|
"loss": 0.6797, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"grad_norm": 0.7268177270889282, |
|
"learning_rate": 0.00030369914003658996, |
|
"loss": 0.6867, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"eval_accuracy": 0.763196, |
|
"eval_loss": 0.9648858308792114, |
|
"eval_runtime": 15.5355, |
|
"eval_samples_per_second": 16092.147, |
|
"eval_steps_per_second": 31.476, |
|
"step": 55000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 87900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5000, |
|
"total_flos": 2.354556845700649e+18, |
|
"train_batch_size": 512, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|