{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.262571103526735, "eval_steps": 5000, "global_step": 55048, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11376564277588168, "grad_norm": 4.343299865722656, "learning_rate": 7.960000000000001e-05, "loss": 4.3757, "step": 1000 }, { "epoch": 0.22753128555176336, "grad_norm": 3.4980385303497314, "learning_rate": 0.0001596, "loss": 1.7997, "step": 2000 }, { "epoch": 0.3412969283276451, "grad_norm": 3.350770950317383, "learning_rate": 0.0002396, "loss": 1.4879, "step": 3000 }, { "epoch": 0.4550625711035267, "grad_norm": 3.362269163131714, "learning_rate": 0.0003196, "loss": 1.3834, "step": 4000 }, { "epoch": 0.5688282138794084, "grad_norm": 2.424733877182007, "learning_rate": 0.0003996, "loss": 1.3351, "step": 5000 }, { "epoch": 0.5688282138794084, "eval_accuracy": 0.668216, "eval_loss": 1.3343349695205688, "eval_runtime": 15.5029, "eval_samples_per_second": 16125.98, "eval_steps_per_second": 31.542, "step": 5000 }, { "epoch": 0.6825938566552902, "grad_norm": 2.601680040359497, "learning_rate": 0.00047960000000000006, "loss": 1.3097, "step": 6000 }, { "epoch": 0.7963594994311718, "grad_norm": 1.9989418983459473, "learning_rate": 0.0005596, "loss": 1.2866, "step": 7000 }, { "epoch": 0.9101251422070534, "grad_norm": 1.8081185817718506, "learning_rate": 0.0006396, "loss": 1.2689, "step": 8000 }, { "epoch": 1.023890784982935, "grad_norm": 1.6405588388442993, "learning_rate": 0.00071952, "loss": 1.2487, "step": 9000 }, { "epoch": 1.1376564277588168, "grad_norm": 1.22613525390625, "learning_rate": 0.00079952, "loss": 1.2165, "step": 10000 }, { "epoch": 1.1376564277588168, "eval_accuracy": 0.686196, "eval_loss": 1.2481120824813843, "eval_runtime": 15.8446, "eval_samples_per_second": 15778.257, "eval_steps_per_second": 30.862, "step": 10000 }, { "epoch": 1.2514220705346986, "grad_norm": 1.2695167064666748, "learning_rate": 0.0007996786565611985, "loss": 1.2046, "step": 11000 }, { "epoch": 1.36518771331058, "grad_norm": 1.2521305084228516, "learning_rate": 0.0007987086748436788, "loss": 1.1849, "step": 12000 }, { "epoch": 1.4789533560864618, "grad_norm": 1.19619619846344, "learning_rate": 0.0007970896788508052, "loss": 1.1534, "step": 13000 }, { "epoch": 1.5927189988623436, "grad_norm": 1.0483107566833496, "learning_rate": 0.0007948275336376884, "loss": 1.1312, "step": 14000 }, { "epoch": 1.7064846416382253, "grad_norm": 1.3618515729904175, "learning_rate": 0.0007919213896323948, "loss": 1.112, "step": 15000 }, { "epoch": 1.7064846416382253, "eval_accuracy": 0.716556, "eval_loss": 1.1176625490188599, "eval_runtime": 15.2386, "eval_samples_per_second": 16405.688, "eval_steps_per_second": 32.09, "step": 15000 }, { "epoch": 1.820250284414107, "grad_norm": 0.9307771325111389, "learning_rate": 0.0007883817747762077, "loss": 1.0986, "step": 16000 }, { "epoch": 1.9340159271899886, "grad_norm": 0.9189246296882629, "learning_rate": 0.0007842073597303121, "loss": 1.0847, "step": 17000 }, { "epoch": 2.04778156996587, "grad_norm": 0.8016377687454224, "learning_rate": 0.0007794081581686037, "loss": 1.0506, "step": 18000 }, { "epoch": 2.161547212741752, "grad_norm": 0.9774219989776611, "learning_rate": 0.0007739919744091065, "loss": 1.0158, "step": 19000 }, { "epoch": 2.2753128555176336, "grad_norm": 0.8449124693870544, "learning_rate": 0.0007679676160878387, "loss": 1.0138, "step": 20000 }, { "epoch": 2.2753128555176336, "eval_accuracy": 0.732636, "eval_loss": 1.052935242652893, "eval_runtime": 15.2399, "eval_samples_per_second": 16404.275, "eval_steps_per_second": 32.087, "step": 20000 }, { "epoch": 2.3890784982935154, "grad_norm": 0.9211858510971069, "learning_rate": 0.0007613448798360993, "loss": 1.0113, "step": 21000 }, { "epoch": 2.502844141069397, "grad_norm": 0.7870326042175293, "learning_rate": 0.0007541345353494786, "loss": 1.0024, "step": 22000 }, { "epoch": 2.616609783845279, "grad_norm": 0.8683303594589233, "learning_rate": 0.0007463483078745015, "loss": 1.0032, "step": 23000 }, { "epoch": 2.73037542662116, "grad_norm": 1.031267523765564, "learning_rate": 0.000738007485475254, "loss": 0.9961, "step": 24000 }, { "epoch": 2.8441410693970424, "grad_norm": 0.8440726399421692, "learning_rate": 0.0007291089356699791, "loss": 0.9909, "step": 25000 }, { "epoch": 2.8441410693970424, "eval_accuracy": 0.741524, "eval_loss": 1.0143921375274658, "eval_runtime": 14.9299, "eval_samples_per_second": 16744.96, "eval_steps_per_second": 32.753, "step": 25000 }, { "epoch": 2.9579067121729237, "grad_norm": 0.708281934261322, "learning_rate": 0.0007196848947861554, "loss": 0.9832, "step": 26000 }, { "epoch": 3.0716723549488054, "grad_norm": 0.692997395992279, "learning_rate": 0.000709742030952583, "loss": 0.9383, "step": 27000 }, { "epoch": 3.185437997724687, "grad_norm": 0.8190609216690063, "learning_rate": 0.0006992857783851634, "loss": 0.9193, "step": 28000 }, { "epoch": 3.299203640500569, "grad_norm": 0.7791016697883606, "learning_rate": 0.0006883428362373026, "loss": 0.9197, "step": 29000 }, { "epoch": 3.4129692832764507, "grad_norm": 0.6834008693695068, "learning_rate": 0.0006769309995941914, "loss": 0.9236, "step": 30000 }, { "epoch": 3.4129692832764507, "eval_accuracy": 0.748324, "eval_loss": 0.9886829257011414, "eval_runtime": 14.8544, "eval_samples_per_second": 16830.07, "eval_steps_per_second": 32.92, "step": 30000 }, { "epoch": 3.526734926052332, "grad_norm": 0.7945353388786316, "learning_rate": 0.0006650809067991791, "loss": 0.9259, "step": 31000 }, { "epoch": 3.640500568828214, "grad_norm": 0.7771942019462585, "learning_rate": 0.000652788107427868, "loss": 0.924, "step": 32000 }, { "epoch": 3.7542662116040955, "grad_norm": 0.7232080101966858, "learning_rate": 0.0006400842315977677, "loss": 0.9149, "step": 33000 }, { "epoch": 3.868031854379977, "grad_norm": 0.6129056215286255, "learning_rate": 0.0006270032202430253, "loss": 0.9142, "step": 34000 }, { "epoch": 3.981797497155859, "grad_norm": 0.7053471803665161, "learning_rate": 0.0006135401606551002, "loss": 0.914, "step": 35000 }, { "epoch": 3.981797497155859, "eval_accuracy": 0.754936, "eval_loss": 0.9585933089256287, "eval_runtime": 15.1971, "eval_samples_per_second": 16450.463, "eval_steps_per_second": 32.177, "step": 35000 }, { "epoch": 4.09556313993174, "grad_norm": 0.7185536623001099, "learning_rate": 0.0005997438247807972, "loss": 0.8508, "step": 36000 }, { "epoch": 4.2093287827076225, "grad_norm": 0.7729761600494385, "learning_rate": 0.0005856090312640852, "loss": 0.8434, "step": 37000 }, { "epoch": 4.323094425483504, "grad_norm": 0.8405170440673828, "learning_rate": 0.0005711724058927512, "loss": 0.8442, "step": 38000 }, { "epoch": 4.436860068259386, "grad_norm": 0.6555745005607605, "learning_rate": 0.0005564574250751392, "loss": 0.852, "step": 39000 }, { "epoch": 4.550625711035267, "grad_norm": 0.6500961184501648, "learning_rate": 0.0005415031062964693, "loss": 0.849, "step": 40000 }, { "epoch": 4.550625711035267, "eval_accuracy": 0.758092, "eval_loss": 0.9504426121711731, "eval_runtime": 14.8523, "eval_samples_per_second": 16832.413, "eval_steps_per_second": 32.924, "step": 40000 }, { "epoch": 4.664391353811149, "grad_norm": 0.6345399618148804, "learning_rate": 0.0005263038333083039, "loss": 0.8493, "step": 41000 }, { "epoch": 4.778156996587031, "grad_norm": 0.7285176515579224, "learning_rate": 0.0005108991688044689, "loss": 0.8505, "step": 42000 }, { "epoch": 4.891922639362912, "grad_norm": 0.7443712949752808, "learning_rate": 0.000495314163389589, "loss": 0.8515, "step": 43000 }, { "epoch": 5.005688282138794, "grad_norm": 0.6659076809883118, "learning_rate": 0.0004795899698565036, "loss": 0.8403, "step": 44000 }, { "epoch": 5.1194539249146755, "grad_norm": 0.7172214984893799, "learning_rate": 0.00046372068282238195, "loss": 0.7614, "step": 45000 }, { "epoch": 5.1194539249146755, "eval_accuracy": 0.761088, "eval_loss": 0.9495302438735962, "eval_runtime": 15.3396, "eval_samples_per_second": 16297.69, "eval_steps_per_second": 31.878, "step": 45000 }, { "epoch": 5.233219567690558, "grad_norm": 0.8739346861839294, "learning_rate": 0.0004477477751198958, "loss": 0.767, "step": 46000 }, { "epoch": 5.346985210466439, "grad_norm": 0.8742613196372986, "learning_rate": 0.0004316972214137623, "loss": 0.7723, "step": 47000 }, { "epoch": 5.460750853242321, "grad_norm": 0.7512331604957581, "learning_rate": 0.00041559512263430705, "loss": 0.7738, "step": 48000 }, { "epoch": 5.5745164960182025, "grad_norm": 0.7592815160751343, "learning_rate": 0.00039948379493191056, "loss": 0.7735, "step": 49000 }, { "epoch": 5.688282138794084, "grad_norm": 0.6751989722251892, "learning_rate": 0.00038335718753151784, "loss": 0.7726, "step": 50000 }, { "epoch": 5.688282138794084, "eval_accuracy": 0.763736, "eval_loss": 0.9361330270767212, "eval_runtime": 17.5361, "eval_samples_per_second": 14256.306, "eval_steps_per_second": 27.885, "step": 50000 }, { "epoch": 5.802047781569966, "grad_norm": 0.7502247095108032, "learning_rate": 0.0003672737214802269, "loss": 0.7735, "step": 51000 }, { "epoch": 5.915813424345847, "grad_norm": 0.7880488038063049, "learning_rate": 0.000351227356466713, "loss": 0.7736, "step": 52000 }, { "epoch": 6.0295790671217295, "grad_norm": 0.7424056529998779, "learning_rate": 0.0003352762228480271, "loss": 0.7455, "step": 53000 }, { "epoch": 6.143344709897611, "grad_norm": 0.8073525428771973, "learning_rate": 0.0003194143300116524, "loss": 0.6797, "step": 54000 }, { "epoch": 6.257110352673493, "grad_norm": 0.7268177270889282, "learning_rate": 0.00030369914003658996, "loss": 0.6867, "step": 55000 }, { "epoch": 6.257110352673493, "eval_accuracy": 0.763196, "eval_loss": 0.9648858308792114, "eval_runtime": 15.5355, "eval_samples_per_second": 16092.147, "eval_steps_per_second": 31.476, "step": 55000 } ], "logging_steps": 1000, "max_steps": 87900, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "total_flos": 2.354556845700649e+18, "train_batch_size": 512, "trial_name": null, "trial_params": null }