{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.995850622406639, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008298755186721992, "grad_norm": 0.80859375, "learning_rate": 8.695652173913044e-06, "loss": 2.3455, "step": 1 }, { "epoch": 0.016597510373443983, "grad_norm": 0.78125, "learning_rate": 1.739130434782609e-05, "loss": 2.2601, "step": 2 }, { "epoch": 0.024896265560165973, "grad_norm": 0.75390625, "learning_rate": 2.608695652173913e-05, "loss": 2.2494, "step": 3 }, { "epoch": 0.03319502074688797, "grad_norm": 0.8046875, "learning_rate": 3.478260869565218e-05, "loss": 2.2988, "step": 4 }, { "epoch": 0.04149377593360996, "grad_norm": 0.95703125, "learning_rate": 4.347826086956522e-05, "loss": 3.0938, "step": 5 }, { "epoch": 0.04979253112033195, "grad_norm": 0.7578125, "learning_rate": 5.217391304347826e-05, "loss": 2.1633, "step": 6 }, { "epoch": 0.058091286307053944, "grad_norm": 0.7421875, "learning_rate": 6.086956521739131e-05, "loss": 2.1488, "step": 7 }, { "epoch": 0.06639004149377593, "grad_norm": 0.62109375, "learning_rate": 6.956521739130436e-05, "loss": 2.0031, "step": 8 }, { "epoch": 0.07468879668049792, "grad_norm": 0.52734375, "learning_rate": 7.82608695652174e-05, "loss": 1.9347, "step": 9 }, { "epoch": 0.08298755186721991, "grad_norm": 0.369140625, "learning_rate": 8.695652173913044e-05, "loss": 1.8457, "step": 10 }, { "epoch": 0.0912863070539419, "grad_norm": 0.357421875, "learning_rate": 9.565217391304348e-05, "loss": 1.7273, "step": 11 }, { "epoch": 0.0995850622406639, "grad_norm": 0.376953125, "learning_rate": 0.00010434782608695653, "loss": 1.7584, "step": 12 }, { "epoch": 0.1078838174273859, "grad_norm": 0.39453125, "learning_rate": 0.00011304347826086956, "loss": 1.6387, "step": 13 }, { "epoch": 0.11618257261410789, "grad_norm": 0.458984375, "learning_rate": 0.00012173913043478263, "loss": 1.6614, "step": 14 }, { "epoch": 0.12448132780082988, "grad_norm": 0.482421875, "learning_rate": 0.00013043478260869567, "loss": 1.648, "step": 15 }, { "epoch": 0.13278008298755187, "grad_norm": 0.447265625, "learning_rate": 0.0001391304347826087, "loss": 1.6337, "step": 16 }, { "epoch": 0.14107883817427386, "grad_norm": 0.37109375, "learning_rate": 0.00014782608695652173, "loss": 1.6004, "step": 17 }, { "epoch": 0.14937759336099585, "grad_norm": 0.41796875, "learning_rate": 0.0001565217391304348, "loss": 1.6146, "step": 18 }, { "epoch": 0.15767634854771784, "grad_norm": 0.38671875, "learning_rate": 0.00016521739130434784, "loss": 1.4754, "step": 19 }, { "epoch": 0.16597510373443983, "grad_norm": 1.890625, "learning_rate": 0.00017391304347826088, "loss": 2.4256, "step": 20 }, { "epoch": 0.17427385892116182, "grad_norm": 0.296875, "learning_rate": 0.00018260869565217392, "loss": 1.4601, "step": 21 }, { "epoch": 0.1825726141078838, "grad_norm": 0.291015625, "learning_rate": 0.00019130434782608697, "loss": 1.4737, "step": 22 }, { "epoch": 0.1908713692946058, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4942, "step": 23 }, { "epoch": 0.1991701244813278, "grad_norm": 0.259765625, "learning_rate": 0.00019998952044849376, "loss": 1.42, "step": 24 }, { "epoch": 0.2074688796680498, "grad_norm": 0.255859375, "learning_rate": 0.00019995808399039496, "loss": 1.4399, "step": 25 }, { "epoch": 0.2157676348547718, "grad_norm": 0.41796875, "learning_rate": 0.00019990569721450326, "loss": 2.1457, "step": 26 }, { "epoch": 0.22406639004149378, "grad_norm": 0.302734375, "learning_rate": 0.00019983237110061697, "loss": 1.4462, "step": 27 }, { "epoch": 0.23236514522821577, "grad_norm": 0.2890625, "learning_rate": 0.00019973812101723188, "loss": 1.3847, "step": 28 }, { "epoch": 0.24066390041493776, "grad_norm": 0.2412109375, "learning_rate": 0.00019962296671832003, "loss": 1.4013, "step": 29 }, { "epoch": 0.24896265560165975, "grad_norm": 0.2578125, "learning_rate": 0.00019948693233918952, "loss": 1.3528, "step": 30 }, { "epoch": 0.2572614107883817, "grad_norm": 0.255859375, "learning_rate": 0.00019933004639142605, "loss": 1.3523, "step": 31 }, { "epoch": 0.26556016597510373, "grad_norm": 0.265625, "learning_rate": 0.000199152341756917, "loss": 1.36, "step": 32 }, { "epoch": 0.27385892116182575, "grad_norm": 0.26953125, "learning_rate": 0.00019895385568095982, "loss": 1.3316, "step": 33 }, { "epoch": 0.2821576763485477, "grad_norm": 0.265625, "learning_rate": 0.00019873462976445553, "loss": 1.371, "step": 34 }, { "epoch": 0.29045643153526973, "grad_norm": 0.2578125, "learning_rate": 0.00019849470995518992, "loss": 1.4089, "step": 35 }, { "epoch": 0.2987551867219917, "grad_norm": 0.255859375, "learning_rate": 0.0001982341465382029, "loss": 1.3663, "step": 36 }, { "epoch": 0.3070539419087137, "grad_norm": 0.267578125, "learning_rate": 0.00019795299412524945, "loss": 1.3573, "step": 37 }, { "epoch": 0.3153526970954357, "grad_norm": 0.24609375, "learning_rate": 0.00019765131164335345, "loss": 1.3048, "step": 38 }, { "epoch": 0.3236514522821577, "grad_norm": 0.7734375, "learning_rate": 0.000197329162322457, "loss": 2.0409, "step": 39 }, { "epoch": 0.33195020746887965, "grad_norm": 0.25390625, "learning_rate": 0.00019698661368216817, "loss": 1.3583, "step": 40 }, { "epoch": 0.34024896265560167, "grad_norm": 0.2490234375, "learning_rate": 0.00019662373751760934, "loss": 1.2915, "step": 41 }, { "epoch": 0.34854771784232363, "grad_norm": 0.2578125, "learning_rate": 0.00019624060988436966, "loss": 1.3238, "step": 42 }, { "epoch": 0.35684647302904565, "grad_norm": 0.2294921875, "learning_rate": 0.0001958373110825644, "loss": 1.3412, "step": 43 }, { "epoch": 0.3651452282157676, "grad_norm": 0.236328125, "learning_rate": 0.00019541392564000488, "loss": 1.2947, "step": 44 }, { "epoch": 0.37344398340248963, "grad_norm": 0.2451171875, "learning_rate": 0.00019497054229448223, "loss": 1.3453, "step": 45 }, { "epoch": 0.3817427385892116, "grad_norm": 0.248046875, "learning_rate": 0.0001945072539751685, "loss": 1.3186, "step": 46 }, { "epoch": 0.3900414937759336, "grad_norm": 0.2431640625, "learning_rate": 0.00019402415778313977, "loss": 1.3524, "step": 47 }, { "epoch": 0.3983402489626556, "grad_norm": 0.267578125, "learning_rate": 0.00019352135497102463, "loss": 1.3403, "step": 48 }, { "epoch": 0.4066390041493776, "grad_norm": 0.251953125, "learning_rate": 0.0001929989509217824, "loss": 1.3266, "step": 49 }, { "epoch": 0.4149377593360996, "grad_norm": 0.251953125, "learning_rate": 0.0001924570551266159, "loss": 1.3373, "step": 50 }, { "epoch": 0.42323651452282157, "grad_norm": 0.2412109375, "learning_rate": 0.00019189578116202307, "loss": 1.268, "step": 51 }, { "epoch": 0.4315352697095436, "grad_norm": 0.23828125, "learning_rate": 0.00019131524666599233, "loss": 1.2878, "step": 52 }, { "epoch": 0.43983402489626555, "grad_norm": 0.236328125, "learning_rate": 0.00019071557331334669, "loss": 1.2968, "step": 53 }, { "epoch": 0.44813278008298757, "grad_norm": 0.2353515625, "learning_rate": 0.0001900968867902419, "loss": 1.3174, "step": 54 }, { "epoch": 0.45643153526970953, "grad_norm": 0.2412109375, "learning_rate": 0.00018945931676782373, "loss": 1.283, "step": 55 }, { "epoch": 0.46473029045643155, "grad_norm": 0.248046875, "learning_rate": 0.0001888029968750498, "loss": 1.279, "step": 56 }, { "epoch": 0.4730290456431535, "grad_norm": 0.240234375, "learning_rate": 0.00018812806467068268, "loss": 1.3151, "step": 57 }, { "epoch": 0.48132780082987553, "grad_norm": 0.244140625, "learning_rate": 0.00018743466161445823, "loss": 1.2763, "step": 58 }, { "epoch": 0.4896265560165975, "grad_norm": 0.25390625, "learning_rate": 0.00018672293303743738, "loss": 1.3259, "step": 59 }, { "epoch": 0.4979253112033195, "grad_norm": 0.255859375, "learning_rate": 0.00018599302811154572, "loss": 1.287, "step": 60 }, { "epoch": 0.5062240663900415, "grad_norm": 0.267578125, "learning_rate": 0.00018524509981830852, "loss": 1.2794, "step": 61 }, { "epoch": 0.5145228215767634, "grad_norm": 0.255859375, "learning_rate": 0.00018447930491678733, "loss": 1.3154, "step": 62 }, { "epoch": 0.5228215767634855, "grad_norm": 0.248046875, "learning_rate": 0.00018369580391072433, "loss": 1.2979, "step": 63 }, { "epoch": 0.5311203319502075, "grad_norm": 0.2421875, "learning_rate": 0.00018289476101490256, "loss": 1.326, "step": 64 }, { "epoch": 0.5394190871369294, "grad_norm": 0.24609375, "learning_rate": 0.00018207634412072764, "loss": 1.2843, "step": 65 }, { "epoch": 0.5477178423236515, "grad_norm": 0.2490234375, "learning_rate": 0.00018124072476103956, "loss": 1.3233, "step": 66 }, { "epoch": 0.5560165975103735, "grad_norm": 0.248046875, "learning_rate": 0.00018038807807416068, "loss": 1.3208, "step": 67 }, { "epoch": 0.5643153526970954, "grad_norm": 0.2431640625, "learning_rate": 0.00017951858276718844, "loss": 1.2949, "step": 68 }, { "epoch": 0.5726141078838174, "grad_norm": 0.412109375, "learning_rate": 0.00017863242107853995, "loss": 1.9205, "step": 69 }, { "epoch": 0.5809128630705395, "grad_norm": 0.2578125, "learning_rate": 0.0001777297787397563, "loss": 1.2847, "step": 70 }, { "epoch": 0.5892116182572614, "grad_norm": 0.2578125, "learning_rate": 0.00017681084493657525, "loss": 1.298, "step": 71 }, { "epoch": 0.5975103734439834, "grad_norm": 0.255859375, "learning_rate": 0.0001758758122692791, "loss": 1.2805, "step": 72 }, { "epoch": 0.6058091286307054, "grad_norm": 0.25390625, "learning_rate": 0.00017492487671232784, "loss": 1.2829, "step": 73 }, { "epoch": 0.6141078838174274, "grad_norm": 0.2470703125, "learning_rate": 0.00017395823757328444, "loss": 1.2523, "step": 74 }, { "epoch": 0.6224066390041494, "grad_norm": 0.25, "learning_rate": 0.00017297609745104184, "loss": 1.2977, "step": 75 }, { "epoch": 0.6307053941908713, "grad_norm": 0.25, "learning_rate": 0.0001719786621933599, "loss": 1.3103, "step": 76 }, { "epoch": 0.6390041493775933, "grad_norm": 0.25390625, "learning_rate": 0.00017096614085372185, "loss": 1.3006, "step": 77 }, { "epoch": 0.6473029045643154, "grad_norm": 0.435546875, "learning_rate": 0.00016993874564751822, "loss": 1.8113, "step": 78 }, { "epoch": 0.6556016597510373, "grad_norm": 0.6953125, "learning_rate": 0.00016889669190756868, "loss": 2.3268, "step": 79 }, { "epoch": 0.6639004149377593, "grad_norm": 0.2734375, "learning_rate": 0.00016784019803899, "loss": 1.2574, "step": 80 }, { "epoch": 0.6721991701244814, "grad_norm": 0.259765625, "learning_rate": 0.0001667694854734204, "loss": 1.2365, "step": 81 }, { "epoch": 0.6804979253112033, "grad_norm": 0.26953125, "learning_rate": 0.0001656847786226095, "loss": 1.2974, "step": 82 }, { "epoch": 0.6887966804979253, "grad_norm": 0.267578125, "learning_rate": 0.00016458630483138356, "loss": 1.2225, "step": 83 }, { "epoch": 0.6970954356846473, "grad_norm": 0.2734375, "learning_rate": 0.00016347429432999602, "loss": 1.2477, "step": 84 }, { "epoch": 0.7053941908713693, "grad_norm": 0.28125, "learning_rate": 0.00016234898018587337, "loss": 1.2872, "step": 85 }, { "epoch": 0.7136929460580913, "grad_norm": 0.271484375, "learning_rate": 0.0001612105982547663, "loss": 1.2824, "step": 86 }, { "epoch": 0.7219917012448133, "grad_norm": 0.26171875, "learning_rate": 0.00016005938713131642, "loss": 1.3056, "step": 87 }, { "epoch": 0.7302904564315352, "grad_norm": 0.265625, "learning_rate": 0.00015889558809904902, "loss": 1.244, "step": 88 }, { "epoch": 0.7385892116182573, "grad_norm": 0.267578125, "learning_rate": 0.00015771944507980207, "loss": 1.2742, "step": 89 }, { "epoch": 0.7468879668049793, "grad_norm": 0.259765625, "learning_rate": 0.00015653120458260263, "loss": 1.3265, "step": 90 }, { "epoch": 0.7551867219917012, "grad_norm": 0.25, "learning_rate": 0.00015533111565200044, "loss": 1.2373, "step": 91 }, { "epoch": 0.7634854771784232, "grad_norm": 0.259765625, "learning_rate": 0.0001541194298158708, "loss": 1.3201, "step": 92 }, { "epoch": 0.7717842323651453, "grad_norm": 0.27734375, "learning_rate": 0.00015289640103269625, "loss": 1.2052, "step": 93 }, { "epoch": 0.7800829875518672, "grad_norm": 0.283203125, "learning_rate": 0.00015166228563833934, "loss": 1.2638, "step": 94 }, { "epoch": 0.7883817427385892, "grad_norm": 0.2578125, "learning_rate": 0.00015041734229231688, "loss": 1.2334, "step": 95 }, { "epoch": 0.7966804979253111, "grad_norm": 0.263671875, "learning_rate": 0.00014916183192358718, "loss": 1.2396, "step": 96 }, { "epoch": 0.8049792531120332, "grad_norm": 0.2578125, "learning_rate": 0.00014789601767586173, "loss": 1.2272, "step": 97 }, { "epoch": 0.8132780082987552, "grad_norm": 0.28125, "learning_rate": 0.00014662016485245274, "loss": 1.2784, "step": 98 }, { "epoch": 0.8215767634854771, "grad_norm": 0.275390625, "learning_rate": 0.00014533454086066772, "loss": 1.2512, "step": 99 }, { "epoch": 0.8298755186721992, "grad_norm": 0.275390625, "learning_rate": 0.00014403941515576344, "loss": 1.2562, "step": 100 }, { "epoch": 0.8381742738589212, "grad_norm": 0.267578125, "learning_rate": 0.00014273505918447054, "loss": 1.3054, "step": 101 }, { "epoch": 0.8464730290456431, "grad_norm": 0.263671875, "learning_rate": 0.00014142174632810072, "loss": 1.2831, "step": 102 }, { "epoch": 0.8547717842323651, "grad_norm": 0.279296875, "learning_rate": 0.0001400997518452484, "loss": 1.31, "step": 103 }, { "epoch": 0.8630705394190872, "grad_norm": 0.25, "learning_rate": 0.00013876935281409907, "loss": 1.2472, "step": 104 }, { "epoch": 0.8713692946058091, "grad_norm": 0.28515625, "learning_rate": 0.00013743082807435615, "loss": 1.2792, "step": 105 }, { "epoch": 0.8796680497925311, "grad_norm": 0.267578125, "learning_rate": 0.00013608445816879866, "loss": 1.2853, "step": 106 }, { "epoch": 0.8879668049792531, "grad_norm": 0.265625, "learning_rate": 0.00013473052528448201, "loss": 1.2923, "step": 107 }, { "epoch": 0.8962655601659751, "grad_norm": 0.275390625, "learning_rate": 0.00013336931319359426, "loss": 1.2768, "step": 108 }, { "epoch": 0.9045643153526971, "grad_norm": 0.267578125, "learning_rate": 0.00013200110719397968, "loss": 1.2701, "step": 109 }, { "epoch": 0.9128630705394191, "grad_norm": 0.25390625, "learning_rate": 0.00013062619404934317, "loss": 1.21, "step": 110 }, { "epoch": 0.921161825726141, "grad_norm": 0.255859375, "learning_rate": 0.00012924486192914705, "loss": 1.2808, "step": 111 }, { "epoch": 0.9294605809128631, "grad_norm": 0.26171875, "learning_rate": 0.00012785740034821329, "loss": 1.2616, "step": 112 }, { "epoch": 0.9377593360995851, "grad_norm": 0.265625, "learning_rate": 0.00012646410010604397, "loss": 1.2094, "step": 113 }, { "epoch": 0.946058091286307, "grad_norm": 0.267578125, "learning_rate": 0.00012506525322587207, "loss": 1.2128, "step": 114 }, { "epoch": 0.9543568464730291, "grad_norm": 0.267578125, "learning_rate": 0.0001236611528934562, "loss": 1.2573, "step": 115 }, { "epoch": 0.9626556016597511, "grad_norm": 0.259765625, "learning_rate": 0.00012225209339563145, "loss": 1.2367, "step": 116 }, { "epoch": 0.970954356846473, "grad_norm": 0.263671875, "learning_rate": 0.00012083837005862946, "loss": 1.2313, "step": 117 }, { "epoch": 0.979253112033195, "grad_norm": 0.265625, "learning_rate": 0.00011942027918618074, "loss": 1.2495, "step": 118 }, { "epoch": 0.9875518672199171, "grad_norm": 0.259765625, "learning_rate": 0.0001179981179974121, "loss": 1.2225, "step": 119 }, { "epoch": 0.995850622406639, "grad_norm": 0.28125, "learning_rate": 0.00011657218456455206, "loss": 1.2869, "step": 120 } ], "logging_steps": 1, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 120, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.97353639329792e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }