|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.5752072334289551,
      "learning_rate": 0.0001993517017828201,
      "loss": 1.6434,
      "step": 10
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.7842408418655396,
      "learning_rate": 0.0001961102106969206,
      "loss": 1.2847,
      "step": 20
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.4593060612678528,
      "learning_rate": 0.0001928687196110211,
      "loss": 1.1227,
      "step": 30
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.45610150694847107,
      "learning_rate": 0.00018962722852512156,
      "loss": 1.0359,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.4743576645851135,
      "learning_rate": 0.00018638573743922206,
      "loss": 0.992,
      "step": 50
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5059588551521301,
      "learning_rate": 0.00018314424635332255,
      "loss": 0.9782,
      "step": 60
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.5296839475631714,
      "learning_rate": 0.00017990275526742302,
      "loss": 0.8997,
      "step": 70
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.6169719099998474,
      "learning_rate": 0.00017666126418152352,
      "loss": 0.887,
      "step": 80
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5174543261528015,
      "learning_rate": 0.00017341977309562402,
      "loss": 0.8388,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.5253883600234985,
      "learning_rate": 0.0001701782820097245,
      "loss": 0.875,
      "step": 100
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.5177808403968811,
      "learning_rate": 0.00016693679092382496,
      "loss": 0.8193,
      "step": 110
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.50547856092453,
      "learning_rate": 0.00016369529983792545,
      "loss": 0.8444,
      "step": 120
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.5066828727722168,
      "learning_rate": 0.00016045380875202592,
      "loss": 0.857,
      "step": 130
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.4495781362056732,
      "learning_rate": 0.00015721231766612642,
      "loss": 0.7897,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5362279415130615,
      "learning_rate": 0.00015429497568881685,
      "loss": 0.7992,
      "step": 150
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.5713534951210022,
      "learning_rate": 0.00015105348460291734,
      "loss": 0.8446,
      "step": 160
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.4843427240848541,
      "learning_rate": 0.00014781199351701784,
      "loss": 0.7906,
      "step": 170
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.4727051556110382,
      "learning_rate": 0.0001445705024311183,
      "loss": 0.8281,
      "step": 180
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.4981895685195923,
      "learning_rate": 0.0001413290113452188,
      "loss": 0.7966,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.4925459027290344,
      "learning_rate": 0.0001380875202593193,
      "loss": 0.7889,
      "step": 200
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.4470859467983246,
      "learning_rate": 0.00013484602917341977,
      "loss": 0.7588,
      "step": 210
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.46052438020706177,
      "learning_rate": 0.00013160453808752027,
      "loss": 0.7863,
      "step": 220
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.4708721935749054,
      "learning_rate": 0.00012836304700162077,
      "loss": 0.8244,
      "step": 230
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4670282304286957,
      "learning_rate": 0.00012512155591572124,
      "loss": 0.7391,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.4596056342124939,
      "learning_rate": 0.00012188006482982173,
      "loss": 0.8057,
      "step": 250
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.3906649053096771,
      "learning_rate": 0.0001186385737439222,
      "loss": 0.7594,
      "step": 260
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5942690372467041,
      "learning_rate": 0.0001153970826580227,
      "loss": 0.7615,
      "step": 270
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.40805208683013916,
      "learning_rate": 0.00011215559157212318,
      "loss": 0.7607,
      "step": 280
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.5320433974266052,
      "learning_rate": 0.00010891410048622365,
      "loss": 0.7666,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4362870752811432,
      "learning_rate": 0.00010567260940032415,
      "loss": 0.7683,
      "step": 300
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5106446146965027,
      "learning_rate": 0.00010243111831442465,
      "loss": 0.7777,
      "step": 310
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.506986677646637,
      "learning_rate": 9.918962722852513e-05,
      "loss": 0.7637,
      "step": 320
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.5665115118026733,
      "learning_rate": 9.594813614262561e-05,
      "loss": 0.7497,
      "step": 330
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.4510403275489807,
      "learning_rate": 9.27066450567261e-05,
      "loss": 0.7644,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.5009343028068542,
      "learning_rate": 8.946515397082659e-05,
      "loss": 0.7766,
      "step": 350
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.4533093273639679,
      "learning_rate": 8.622366288492708e-05,
      "loss": 0.7579,
      "step": 360
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.4576455056667328,
      "learning_rate": 8.298217179902756e-05,
      "loss": 0.7731,
      "step": 370
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.47114574909210205,
      "learning_rate": 7.974068071312804e-05,
      "loss": 0.7612,
      "step": 380
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5052211284637451,
      "learning_rate": 7.649918962722853e-05,
      "loss": 0.7529,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4952121675014496,
      "learning_rate": 7.325769854132901e-05,
      "loss": 0.7645,
      "step": 400
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.5584261417388916,
      "learning_rate": 7.00162074554295e-05,
      "loss": 0.7859,
      "step": 410
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5271150469779968,
      "learning_rate": 6.677471636952999e-05,
      "loss": 0.7662,
      "step": 420
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.6051105260848999,
      "learning_rate": 6.353322528363047e-05,
      "loss": 0.7217,
      "step": 430
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.5145427584648132,
      "learning_rate": 6.029173419773096e-05,
      "loss": 0.763,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.5100213885307312,
      "learning_rate": 5.7050243111831445e-05,
      "loss": 0.7493,
      "step": 450
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.6406119465827942,
      "learning_rate": 5.380875202593193e-05,
      "loss": 0.7856,
      "step": 460
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.5142606496810913,
      "learning_rate": 5.056726094003241e-05,
      "loss": 0.7439,
      "step": 470
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.6167362332344055,
      "learning_rate": 4.73257698541329e-05,
      "loss": 0.7615,
      "step": 480
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.5693132281303406,
      "learning_rate": 4.408427876823339e-05,
      "loss": 0.7535,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5026053190231323,
      "learning_rate": 4.0842787682333875e-05,
      "loss": 0.7397,
      "step": 500
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.5282934904098511,
      "learning_rate": 3.760129659643436e-05,
      "loss": 0.7519,
      "step": 510
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.5583475828170776,
      "learning_rate": 3.435980551053485e-05,
      "loss": 0.736,
      "step": 520
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.5823280215263367,
      "learning_rate": 3.111831442463534e-05,
      "loss": 0.7383,
      "step": 530
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.4686075747013092,
      "learning_rate": 2.7876823338735818e-05,
      "loss": 0.7272,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.5750771164894104,
      "learning_rate": 2.4635332252836304e-05,
      "loss": 0.7483,
      "step": 550
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.4452055096626282,
      "learning_rate": 2.1393841166936794e-05,
      "loss": 0.7412,
      "step": 560
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.48527467250823975,
      "learning_rate": 1.8152350081037278e-05,
      "loss": 0.7287,
      "step": 570
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.6023633480072021,
      "learning_rate": 1.4910858995137764e-05,
      "loss": 0.7856,
      "step": 580
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.40449661016464233,
      "learning_rate": 1.1669367909238251e-05,
      "loss": 0.7191,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.5225895643234253,
      "learning_rate": 8.427876823338736e-06,
      "loss": 0.7132,
      "step": 600
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.4504361152648926,
      "learning_rate": 5.186385737439222e-06,
      "loss": 0.7344,
      "step": 610
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.48884958028793335,
      "learning_rate": 1.9448946515397086e-06,
      "loss": 0.7086,
      "step": 620
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7055141925811768,
      "eval_runtime": 58.7356,
      "eval_samples_per_second": 1.703,
      "eval_steps_per_second": 0.426,
      "step": 625
    }
  ],
  "logging_steps": 10,
  "max_steps": 625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 4350923021058048.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|