{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9540481400437635, "eval_steps": 1.0, "global_step": 108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 6.122653587017292, "learning_rate": 0.0, "loss": 0.646, "step": 1 }, { "epoch": 0.05, "grad_norm": 4.716859801224774, "learning_rate": 5e-06, "loss": 0.5427, "step": 2 }, { "epoch": 0.08, "grad_norm": 5.019981578406485, "learning_rate": 7.924812503605782e-06, "loss": 0.549, "step": 3 }, { "epoch": 0.11, "grad_norm": 2.467488328063134, "learning_rate": 1e-05, "loss": 0.4574, "step": 4 }, { "epoch": 0.14, "grad_norm": 4.728992664952675, "learning_rate": 1e-05, "loss": 0.4952, "step": 5 }, { "epoch": 0.16, "grad_norm": 2.970029793004458, "learning_rate": 1e-05, "loss": 0.4373, "step": 6 }, { "epoch": 0.19, "grad_norm": 3.152396699018966, "learning_rate": 1e-05, "loss": 0.4491, "step": 7 }, { "epoch": 0.22, "grad_norm": 2.2898095220878196, "learning_rate": 1e-05, "loss": 0.421, "step": 8 }, { "epoch": 0.25, "grad_norm": 1.8312653886857113, "learning_rate": 1e-05, "loss": 0.3626, "step": 9 }, { "epoch": 0.27, "grad_norm": 1.8281463002677139, "learning_rate": 1e-05, "loss": 0.3628, "step": 10 }, { "epoch": 0.3, "grad_norm": 1.7367109671193546, "learning_rate": 1e-05, "loss": 0.3394, "step": 11 }, { "epoch": 0.33, "grad_norm": 1.6255181971686055, "learning_rate": 1e-05, "loss": 0.333, "step": 12 }, { "epoch": 0.36, "grad_norm": 1.6622699558938445, "learning_rate": 1e-05, "loss": 0.313, "step": 13 }, { "epoch": 0.38, "grad_norm": 1.4241097114069272, "learning_rate": 1e-05, "loss": 0.2917, "step": 14 }, { "epoch": 0.41, "grad_norm": 1.3259856150813727, "learning_rate": 1e-05, "loss": 0.2742, "step": 15 }, { "epoch": 0.44, "grad_norm": 1.363385021609024, "learning_rate": 1e-05, "loss": 0.2931, "step": 16 }, { "epoch": 0.46, "grad_norm": 1.612345679634704, "learning_rate": 1e-05, "loss": 0.2723, "step": 17 }, { "epoch": 0.49, "grad_norm": 1.394500889215576, "learning_rate": 1e-05, "loss": 0.2623, "step": 18 }, { "epoch": 0.52, "grad_norm": 1.2629355627249936, "learning_rate": 1e-05, "loss": 0.2719, "step": 19 }, { "epoch": 0.55, "grad_norm": 1.4121814773595716, "learning_rate": 1e-05, "loss": 0.2453, "step": 20 }, { "epoch": 0.57, "grad_norm": 1.2877132103814335, "learning_rate": 1e-05, "loss": 0.244, "step": 21 }, { "epoch": 0.6, "grad_norm": 1.2523378620255512, "learning_rate": 1e-05, "loss": 0.2349, "step": 22 }, { "epoch": 0.63, "grad_norm": 1.1902225037740863, "learning_rate": 1e-05, "loss": 0.2265, "step": 23 }, { "epoch": 0.66, "grad_norm": 1.1879613096031454, "learning_rate": 1e-05, "loss": 0.2144, "step": 24 }, { "epoch": 0.68, "grad_norm": 1.1464570881387057, "learning_rate": 1e-05, "loss": 0.2063, "step": 25 }, { "epoch": 0.71, "grad_norm": 1.2752585041011075, "learning_rate": 1e-05, "loss": 0.2157, "step": 26 }, { "epoch": 0.74, "grad_norm": 1.155543288413803, "learning_rate": 1e-05, "loss": 0.1916, "step": 27 }, { "epoch": 0.77, "grad_norm": 1.2042714749577323, "learning_rate": 1e-05, "loss": 0.2071, "step": 28 }, { "epoch": 0.79, "grad_norm": 1.1346405455010145, "learning_rate": 1e-05, "loss": 0.1991, "step": 29 }, { "epoch": 0.82, "grad_norm": 1.0793677741286372, "learning_rate": 1e-05, "loss": 0.1895, "step": 30 }, { "epoch": 0.85, "grad_norm": 1.0641597541926833, "learning_rate": 1e-05, "loss": 0.1968, "step": 31 }, { "epoch": 0.88, "grad_norm": 1.1159083684301505, "learning_rate": 1e-05, "loss": 0.1846, "step": 32 }, { "epoch": 0.9, "grad_norm": 1.0614368255136861, "learning_rate": 1e-05, "loss": 0.1849, "step": 33 }, { "epoch": 0.93, "grad_norm": 0.9659837421382899, "learning_rate": 1e-05, "loss": 0.1677, "step": 34 }, { "epoch": 0.96, "grad_norm": 0.9869347069695258, "learning_rate": 1e-05, "loss": 0.1789, "step": 35 }, { "epoch": 0.98, "grad_norm": 1.0555806958429526, "learning_rate": 1e-05, "loss": 0.1901, "step": 36 }, { "epoch": 1.01, "grad_norm": 1.0003790636897225, "learning_rate": 1e-05, "loss": 0.1317, "step": 37 }, { "epoch": 1.04, "grad_norm": 0.8755581193987241, "learning_rate": 1e-05, "loss": 0.109, "step": 38 }, { "epoch": 1.07, "grad_norm": 0.9600336243215675, "learning_rate": 1e-05, "loss": 0.1225, "step": 39 }, { "epoch": 1.09, "grad_norm": 0.983666193649008, "learning_rate": 1e-05, "loss": 0.1206, "step": 40 }, { "epoch": 1.12, "grad_norm": 0.9984047080313273, "learning_rate": 1e-05, "loss": 0.114, "step": 41 }, { "epoch": 1.15, "grad_norm": 1.0701560459785802, "learning_rate": 1e-05, "loss": 0.1143, "step": 42 }, { "epoch": 1.18, "grad_norm": 1.0722426369355342, "learning_rate": 1e-05, "loss": 0.124, "step": 43 }, { "epoch": 1.2, "grad_norm": 0.8817516131260538, "learning_rate": 1e-05, "loss": 0.1079, "step": 44 }, { "epoch": 1.23, "grad_norm": 0.9625731291493045, "learning_rate": 1e-05, "loss": 0.1222, "step": 45 }, { "epoch": 1.26, "grad_norm": 0.9812159017657305, "learning_rate": 1e-05, "loss": 0.1096, "step": 46 }, { "epoch": 1.29, "grad_norm": 0.8745591684992073, "learning_rate": 1e-05, "loss": 0.1149, "step": 47 }, { "epoch": 1.31, "grad_norm": 0.9507892298975904, "learning_rate": 1e-05, "loss": 0.1163, "step": 48 }, { "epoch": 1.34, "grad_norm": 0.8611093549236812, "learning_rate": 1e-05, "loss": 0.11, "step": 49 }, { "epoch": 1.37, "grad_norm": 0.8444613309525189, "learning_rate": 1e-05, "loss": 0.1054, "step": 50 }, { "epoch": 1.39, "grad_norm": 0.9868965033294682, "learning_rate": 1e-05, "loss": 0.1206, "step": 51 }, { "epoch": 1.42, "grad_norm": 0.7940733083936387, "learning_rate": 1e-05, "loss": 0.1072, "step": 52 }, { "epoch": 1.45, "grad_norm": 0.8572147743881433, "learning_rate": 1e-05, "loss": 0.1044, "step": 53 }, { "epoch": 1.48, "grad_norm": 0.8209526691122747, "learning_rate": 1e-05, "loss": 0.1009, "step": 54 }, { "epoch": 1.5, "grad_norm": 0.7779846619307967, "learning_rate": 1e-05, "loss": 0.1018, "step": 55 }, { "epoch": 1.53, "grad_norm": 0.8291999953627118, "learning_rate": 1e-05, "loss": 0.1025, "step": 56 }, { "epoch": 1.56, "grad_norm": 0.8321877301816655, "learning_rate": 1e-05, "loss": 0.1094, "step": 57 }, { "epoch": 1.59, "grad_norm": 0.8542389871893485, "learning_rate": 1e-05, "loss": 0.1079, "step": 58 }, { "epoch": 1.61, "grad_norm": 0.7737670400014411, "learning_rate": 1e-05, "loss": 0.0958, "step": 59 }, { "epoch": 1.64, "grad_norm": 0.8129322360573784, "learning_rate": 1e-05, "loss": 0.093, "step": 60 }, { "epoch": 1.67, "grad_norm": 0.8293838232530079, "learning_rate": 1e-05, "loss": 0.1054, "step": 61 }, { "epoch": 1.7, "grad_norm": 0.6810818138246434, "learning_rate": 1e-05, "loss": 0.0906, "step": 62 }, { "epoch": 1.72, "grad_norm": 0.7937807744835117, "learning_rate": 1e-05, "loss": 0.098, "step": 63 }, { "epoch": 1.75, "grad_norm": 0.8224807756832562, "learning_rate": 1e-05, "loss": 0.1101, "step": 64 }, { "epoch": 1.78, "grad_norm": 0.7304387601530952, "learning_rate": 1e-05, "loss": 0.1, "step": 65 }, { "epoch": 1.81, "grad_norm": 0.8142026342771219, "learning_rate": 1e-05, "loss": 0.0964, "step": 66 }, { "epoch": 1.83, "grad_norm": 0.7431609339195293, "learning_rate": 1e-05, "loss": 0.0936, "step": 67 }, { "epoch": 1.86, "grad_norm": 0.7512520528680077, "learning_rate": 1e-05, "loss": 0.0949, "step": 68 }, { "epoch": 1.89, "grad_norm": 0.7538760164866836, "learning_rate": 1e-05, "loss": 0.0989, "step": 69 }, { "epoch": 1.91, "grad_norm": 0.8103341693726498, "learning_rate": 1e-05, "loss": 0.1028, "step": 70 }, { "epoch": 1.94, "grad_norm": 0.8357385002863533, "learning_rate": 1e-05, "loss": 0.0966, "step": 71 }, { "epoch": 1.97, "grad_norm": 0.7944109386823767, "learning_rate": 1e-05, "loss": 0.1032, "step": 72 }, { "epoch": 2.0, "grad_norm": 0.8167924040143067, "learning_rate": 1e-05, "loss": 0.1003, "step": 73 }, { "epoch": 2.02, "grad_norm": 0.6224097456559627, "learning_rate": 1e-05, "loss": 0.0604, "step": 74 }, { "epoch": 2.05, "grad_norm": 0.6419960808589802, "learning_rate": 1e-05, "loss": 0.0652, "step": 75 }, { "epoch": 2.08, "grad_norm": 0.7787247503593108, "learning_rate": 1e-05, "loss": 0.0709, "step": 76 }, { "epoch": 2.11, "grad_norm": 0.7252667545531377, "learning_rate": 1e-05, "loss": 0.0571, "step": 77 }, { "epoch": 2.13, "grad_norm": 0.7217097658566882, "learning_rate": 1e-05, "loss": 0.0656, "step": 78 }, { "epoch": 2.16, "grad_norm": 0.7168372530155407, "learning_rate": 1e-05, "loss": 0.0527, "step": 79 }, { "epoch": 2.19, "grad_norm": 0.7418408777634922, "learning_rate": 1e-05, "loss": 0.0585, "step": 80 }, { "epoch": 2.22, "grad_norm": 0.7101096286468248, "learning_rate": 1e-05, "loss": 0.0509, "step": 81 }, { "epoch": 2.24, "grad_norm": 0.7006749099813174, "learning_rate": 1e-05, "loss": 0.0576, "step": 82 }, { "epoch": 2.27, "grad_norm": 0.7944497077223811, "learning_rate": 1e-05, "loss": 0.0545, "step": 83 }, { "epoch": 2.3, "grad_norm": 0.6722848545285588, "learning_rate": 1e-05, "loss": 0.0563, "step": 84 }, { "epoch": 2.32, "grad_norm": 0.7120414843684311, "learning_rate": 1e-05, "loss": 0.058, "step": 85 }, { "epoch": 2.35, "grad_norm": 0.7279580151325783, "learning_rate": 1e-05, "loss": 0.0552, "step": 86 }, { "epoch": 2.38, "grad_norm": 0.7029506482473885, "learning_rate": 1e-05, "loss": 0.0577, "step": 87 }, { "epoch": 2.41, "grad_norm": 0.6189514282541002, "learning_rate": 1e-05, "loss": 0.0547, "step": 88 }, { "epoch": 2.43, "grad_norm": 0.6229759439930223, "learning_rate": 1e-05, "loss": 0.0513, "step": 89 }, { "epoch": 2.46, "grad_norm": 0.6863028991803624, "learning_rate": 1e-05, "loss": 0.0613, "step": 90 }, { "epoch": 2.49, "grad_norm": 0.7364535429784711, "learning_rate": 1e-05, "loss": 0.0652, "step": 91 }, { "epoch": 2.52, "grad_norm": 0.7387032302781582, "learning_rate": 1e-05, "loss": 0.0638, "step": 92 }, { "epoch": 2.54, "grad_norm": 0.6757297547267043, "learning_rate": 1e-05, "loss": 0.0586, "step": 93 }, { "epoch": 2.57, "grad_norm": 0.6588502217604668, "learning_rate": 1e-05, "loss": 0.05, "step": 94 }, { "epoch": 2.6, "grad_norm": 0.6612243757810015, "learning_rate": 1e-05, "loss": 0.0565, "step": 95 }, { "epoch": 2.63, "grad_norm": 0.6510872422256165, "learning_rate": 1e-05, "loss": 0.0564, "step": 96 }, { "epoch": 2.65, "grad_norm": 0.6599878520531972, "learning_rate": 1e-05, "loss": 0.0584, "step": 97 }, { "epoch": 2.68, "grad_norm": 0.6723176479777001, "learning_rate": 1e-05, "loss": 0.0596, "step": 98 }, { "epoch": 2.71, "grad_norm": 0.6738851793824463, "learning_rate": 1e-05, "loss": 0.0568, "step": 99 }, { "epoch": 2.74, "grad_norm": 0.6730157693288188, "learning_rate": 1e-05, "loss": 0.0567, "step": 100 }, { "epoch": 2.76, "grad_norm": 0.6025834169032148, "learning_rate": 1e-05, "loss": 0.0543, "step": 101 }, { "epoch": 2.79, "grad_norm": 0.5662111947365751, "learning_rate": 1e-05, "loss": 0.0521, "step": 102 }, { "epoch": 2.82, "grad_norm": 0.6744169703896066, "learning_rate": 1e-05, "loss": 0.0589, "step": 103 }, { "epoch": 2.84, "grad_norm": 0.6312659616633817, "learning_rate": 1e-05, "loss": 0.0544, "step": 104 }, { "epoch": 2.87, "grad_norm": 0.6011739294981976, "learning_rate": 1e-05, "loss": 0.055, "step": 105 }, { "epoch": 2.9, "grad_norm": 0.6427838412250556, "learning_rate": 1e-05, "loss": 0.0582, "step": 106 }, { "epoch": 2.93, "grad_norm": 0.6537825081243189, "learning_rate": 1e-05, "loss": 0.0579, "step": 107 }, { "epoch": 2.95, "grad_norm": 0.6762138754041659, "learning_rate": 1e-05, "loss": 0.0588, "step": 108 }, { "epoch": 2.95, "step": 108, "total_flos": 90174812381184.0, "train_loss": 0.15581304762788392, "train_runtime": 6548.7943, "train_samples_per_second": 2.513, "train_steps_per_second": 0.016 } ], "logging_steps": 1.0, "max_steps": 108, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1.0, "total_flos": 90174812381184.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }