{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 35.146240234375, "learning_rate": 4.9004900490049e-05, "loss": 10.1867, "mean_token_accuracy": 0.544253178048879, "step": 1000 }, { "epoch": 0.032, "grad_norm": 32.855838775634766, "learning_rate": 4.8004800480048006e-05, "loss": 10.2308, "mean_token_accuracy": 0.5428702699318528, "step": 2000 }, { "epoch": 0.048, "grad_norm": 36.6450309753418, "learning_rate": 4.700470047004701e-05, "loss": 10.2215, "mean_token_accuracy": 0.5424384255111218, "step": 3000 }, { "epoch": 0.064, "grad_norm": 33.15324783325195, "learning_rate": 4.6004600460046006e-05, "loss": 10.2813, "mean_token_accuracy": 0.5411355452165008, "step": 4000 }, { "epoch": 0.08, "grad_norm": 31.203840255737305, "learning_rate": 4.500450045004501e-05, "loss": 10.2447, "mean_token_accuracy": 0.5425998609103262, "step": 5000 }, { "epoch": 0.096, "grad_norm": 29.603290557861328, "learning_rate": 4.4004400440044006e-05, "loss": 10.131, "mean_token_accuracy": 0.5463594534434378, "step": 6000 }, { "epoch": 0.112, "grad_norm": 31.61845588684082, "learning_rate": 4.3004300430043e-05, "loss": 10.0479, "mean_token_accuracy": 0.5483995637446641, "step": 7000 }, { "epoch": 0.128, "grad_norm": 31.28079605102539, "learning_rate": 4.2004200420042006e-05, "loss": 9.9802, "mean_token_accuracy": 0.5492958616241813, "step": 8000 }, { "epoch": 0.144, "grad_norm": 28.147220611572266, "learning_rate": 4.100410041004101e-05, "loss": 9.8445, "mean_token_accuracy": 0.5530450477898121, "step": 9000 }, { "epoch": 0.16, "grad_norm": 26.830713272094727, "learning_rate": 4.0004000400040005e-05, "loss": 9.8066, "mean_token_accuracy": 0.5538738285191357, "step": 10000 }, { "epoch": 0.176, "grad_norm": 28.744468688964844, "learning_rate": 3.900390039003901e-05, "loss": 9.7206, "mean_token_accuracy": 0.5572981600053608, "step": 11000 }, { "epoch": 0.192, "grad_norm": 25.372802734375, "learning_rate": 3.8003800380038005e-05, "loss": 9.6634, "mean_token_accuracy": 0.5570068260915577, "step": 12000 }, { "epoch": 0.208, "grad_norm": 31.25323486328125, "learning_rate": 3.7003700370037e-05, "loss": 9.5426, "mean_token_accuracy": 0.5603208757042885, "step": 13000 }, { "epoch": 0.224, "grad_norm": 33.54015350341797, "learning_rate": 3.6003600360036005e-05, "loss": 9.532, "mean_token_accuracy": 0.561353420805186, "step": 14000 }, { "epoch": 0.24, "grad_norm": 30.744821548461914, "learning_rate": 3.500350035003501e-05, "loss": 9.4778, "mean_token_accuracy": 0.5629392731450498, "step": 15000 }, { "epoch": 0.256, "grad_norm": 24.013673782348633, "learning_rate": 3.4003400340034005e-05, "loss": 9.477, "mean_token_accuracy": 0.5625265723504126, "step": 16000 }, { "epoch": 0.272, "grad_norm": 27.767776489257812, "learning_rate": 3.300330033003301e-05, "loss": 9.3889, "mean_token_accuracy": 0.5659195666387677, "step": 17000 }, { "epoch": 0.288, "grad_norm": 29.1698055267334, "learning_rate": 3.2003200320032004e-05, "loss": 9.3476, "mean_token_accuracy": 0.5666751223653554, "step": 18000 }, { "epoch": 0.304, "grad_norm": 29.41615867614746, "learning_rate": 3.1003100310031e-05, "loss": 9.3244, "mean_token_accuracy": 0.5674368364065886, "step": 19000 }, { "epoch": 0.32, "grad_norm": 23.839937210083008, "learning_rate": 3.0003000300030004e-05, "loss": 9.2717, "mean_token_accuracy": 0.5696731022559106, "step": 20000 }, { "epoch": 0.336, "grad_norm": 28.645061492919922, "learning_rate": 2.9002900290029007e-05, "loss": 9.2327, "mean_token_accuracy": 0.5706070831567049, "step": 21000 }, { "epoch": 0.352, "grad_norm": 26.104412078857422, "learning_rate": 2.8002800280028004e-05, "loss": 9.223, "mean_token_accuracy": 0.5696454518660903, "step": 22000 }, { "epoch": 0.368, "grad_norm": 28.753164291381836, "learning_rate": 2.7002700270027004e-05, "loss": 9.145, "mean_token_accuracy": 0.5734736853465437, "step": 23000 }, { "epoch": 0.384, "grad_norm": 21.9370174407959, "learning_rate": 2.6002600260026007e-05, "loss": 9.1538, "mean_token_accuracy": 0.5731835125163197, "step": 24000 }, { "epoch": 0.4, "grad_norm": 31.202007293701172, "learning_rate": 2.5002500250025003e-05, "loss": 9.1016, "mean_token_accuracy": 0.5749420530423522, "step": 25000 }, { "epoch": 0.416, "grad_norm": 24.899829864501953, "learning_rate": 2.4002400240024003e-05, "loss": 9.1086, "mean_token_accuracy": 0.5731981860995292, "step": 26000 }, { "epoch": 0.432, "grad_norm": 26.59105682373047, "learning_rate": 2.3002300230023003e-05, "loss": 9.0585, "mean_token_accuracy": 0.575087952144444, "step": 27000 }, { "epoch": 0.448, "grad_norm": 27.35274314880371, "learning_rate": 2.2002200220022003e-05, "loss": 9.0378, "mean_token_accuracy": 0.5757604394182563, "step": 28000 }, { "epoch": 0.464, "grad_norm": 23.581249237060547, "learning_rate": 2.1002100210021003e-05, "loss": 9.087, "mean_token_accuracy": 0.5731492869332433, "step": 29000 }, { "epoch": 0.48, "grad_norm": 26.905712127685547, "learning_rate": 2.0002000200020003e-05, "loss": 9.0834, "mean_token_accuracy": 0.5751373803690076, "step": 30000 }, { "epoch": 0.496, "grad_norm": 24.928512573242188, "learning_rate": 1.9001900190019003e-05, "loss": 9.0077, "mean_token_accuracy": 0.5775581553503871, "step": 31000 }, { "epoch": 0.512, "grad_norm": 28.373720169067383, "learning_rate": 1.8001800180018002e-05, "loss": 9.0328, "mean_token_accuracy": 0.575654436133802, "step": 32000 }, { "epoch": 0.528, "grad_norm": 26.213802337646484, "learning_rate": 1.7001700170017002e-05, "loss": 8.9223, "mean_token_accuracy": 0.5791815776266158, "step": 33000 }, { "epoch": 0.544, "grad_norm": 27.070953369140625, "learning_rate": 1.6001600160016002e-05, "loss": 8.9483, "mean_token_accuracy": 0.5792850709185005, "step": 34000 }, { "epoch": 0.56, "grad_norm": 22.90890884399414, "learning_rate": 1.5001500150015002e-05, "loss": 9.0419, "mean_token_accuracy": 0.5748572928607464, "step": 35000 }, { "epoch": 0.576, "grad_norm": 28.693235397338867, "learning_rate": 1.4001400140014002e-05, "loss": 8.951, "mean_token_accuracy": 0.5788668767511844, "step": 36000 }, { "epoch": 0.592, "grad_norm": 27.749176025390625, "learning_rate": 1.3001300130013003e-05, "loss": 8.9335, "mean_token_accuracy": 0.5779373695105314, "step": 37000 }, { "epoch": 0.608, "grad_norm": 25.057411193847656, "learning_rate": 1.2001200120012002e-05, "loss": 8.8612, "mean_token_accuracy": 0.5811370112374425, "step": 38000 }, { "epoch": 0.624, "grad_norm": 26.132497787475586, "learning_rate": 1.1001100110011001e-05, "loss": 8.8688, "mean_token_accuracy": 0.5816743801310659, "step": 39000 }, { "epoch": 0.64, "grad_norm": 26.350906372070312, "learning_rate": 1.0001000100010001e-05, "loss": 8.8358, "mean_token_accuracy": 0.5821756240203977, "step": 40000 }, { "epoch": 0.656, "grad_norm": 24.874052047729492, "learning_rate": 9.000900090009001e-06, "loss": 8.8207, "mean_token_accuracy": 0.5826298766359687, "step": 41000 }, { "epoch": 0.672, "grad_norm": 26.102046966552734, "learning_rate": 8.000800080008001e-06, "loss": 8.8275, "mean_token_accuracy": 0.5821652906313538, "step": 42000 }, { "epoch": 0.688, "grad_norm": 29.679323196411133, "learning_rate": 7.000700070007001e-06, "loss": 8.85, "mean_token_accuracy": 0.5809146241471171, "step": 43000 }, { "epoch": 0.704, "grad_norm": 26.106046676635742, "learning_rate": 6.000600060006001e-06, "loss": 8.8531, "mean_token_accuracy": 0.5821815392710269, "step": 44000 }, { "epoch": 0.72, "grad_norm": 22.304044723510742, "learning_rate": 5.000500050005001e-06, "loss": 8.8142, "mean_token_accuracy": 0.5825774453170598, "step": 45000 }, { "epoch": 0.736, "grad_norm": 28.982166290283203, "learning_rate": 4.0004000400040005e-06, "loss": 8.8434, "mean_token_accuracy": 0.5813510757684708, "step": 46000 }, { "epoch": 0.752, "grad_norm": 27.076814651489258, "learning_rate": 3.0003000300030004e-06, "loss": 8.7994, "mean_token_accuracy": 0.5838636282868683, "step": 47000 }, { "epoch": 0.768, "grad_norm": 26.112808227539062, "learning_rate": 2.0002000200020003e-06, "loss": 8.8055, "mean_token_accuracy": 0.5828906665407121, "step": 48000 }, { "epoch": 0.784, "grad_norm": 24.94652557373047, "learning_rate": 1.0001000100010001e-06, "loss": 8.8295, "mean_token_accuracy": 0.5821196795813739, "step": 49000 }, { "epoch": 0.8, "grad_norm": 28.15529441833496, "learning_rate": 0.0, "loss": 8.7714, "mean_token_accuracy": 0.5843258857652545, "step": 50000 } ], "logging_steps": 1000, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.68231960576e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }