{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 2.61431884765625, "accuracy": 0.4765625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 42.59496307373047, "learning_rate": 1.5000000000000002e-07, "loss": 0.8092, "step": 1 }, { "Batch Mean": 2.574005126953125, "accuracy": 0.578125, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 46.709930419921875, "learning_rate": 3.0000000000000004e-07, "loss": 0.7799, "step": 2 }, { "Batch Mean": 2.560516357421875, "accuracy": 0.484375, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 44.51314163208008, "learning_rate": 4.5e-07, "loss": 0.8043, "step": 3 }, { "Batch Mean": 2.6197509765625, "accuracy": 0.5, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 42.86121368408203, "learning_rate": 6.000000000000001e-07, "loss": 0.8014, "step": 4 }, { "Batch Mean": 2.565338134765625, "accuracy": 0.53125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 44.409908294677734, "learning_rate": 7.5e-07, "loss": 0.7983, "step": 5 }, { "Batch Mean": 2.522857666015625, "accuracy": 0.484375, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 43.77534484863281, "learning_rate": 9e-07, "loss": 0.7975, "step": 6 }, { "Batch Mean": 2.3875732421875, "accuracy": 0.4921875, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 41.981407165527344, "learning_rate": 1.05e-06, "loss": 0.7951, "step": 7 }, { "Batch Mean": 2.317047119140625, "accuracy": 0.515625, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 40.53751754760742, "learning_rate": 1.2000000000000002e-06, "loss": 0.791, "step": 8 }, { "Batch Mean": 1.757080078125, "accuracy": 0.5078125, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 31.105939865112305, "learning_rate": 1.35e-06, "loss": 0.7571, "step": 9 }, { "Batch Mean": 1.6220855712890625, "accuracy": 0.578125, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 33.61161804199219, "learning_rate": 1.5e-06, "loss": 0.7234, "step": 10 }, { "Batch Mean": 1.400146484375, "accuracy": 0.53125, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 34.554622650146484, "learning_rate": 1.65e-06, "loss": 0.725, "step": 11 }, { "Batch Mean": -0.43697381019592285, "accuracy": 0.5234375, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 12.222326278686523, "learning_rate": 1.8e-06, "loss": 0.6986, "step": 12 }, { "Batch Mean": -0.8854951858520508, "accuracy": 0.5234375, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 16.398786544799805, "learning_rate": 1.95e-06, "loss": 0.7081, "step": 13 }, { "Batch Mean": -0.9508838653564453, "accuracy": 0.640625, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 19.590002059936523, "learning_rate": 2.1e-06, "loss": 0.6784, "step": 14 }, { "Batch Mean": -1.0560526847839355, "accuracy": 0.4921875, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 22.669742584228516, "learning_rate": 2.25e-06, "loss": 0.7235, "step": 15 }, { "Batch Mean": -1.1405725479125977, "accuracy": 0.625, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 22.189626693725586, "learning_rate": 2.4000000000000003e-06, "loss": 0.6714, "step": 16 }, { "Batch Mean": -1.1522831916809082, "accuracy": 0.640625, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 19.9060115814209, "learning_rate": 2.55e-06, "loss": 0.6757, "step": 17 }, { "Batch Mean": -1.1666946411132812, "accuracy": 0.53125, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 23.661216735839844, "learning_rate": 2.7e-06, "loss": 0.7002, "step": 18 }, { "Batch Mean": -0.999359130859375, "accuracy": 0.6640625, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 15.433859825134277, "learning_rate": 2.85e-06, "loss": 0.6471, "step": 19 }, { "Batch Mean": -0.6625549793243408, "accuracy": 0.6171875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 13.135390281677246, "learning_rate": 3e-06, "loss": 0.662, "step": 20 }, { "Batch Mean": -0.13487493991851807, "accuracy": 0.6171875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 7.336737632751465, "learning_rate": 2.992105263157895e-06, "loss": 0.629, "step": 21 }, { "Batch Mean": 0.2938040494918823, "accuracy": 0.6640625, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 9.586155891418457, "learning_rate": 2.9842105263157896e-06, "loss": 0.6225, "step": 22 }, { "Batch Mean": 0.6780391931533813, "accuracy": 0.7421875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 14.825817108154297, "learning_rate": 2.9763157894736843e-06, "loss": 0.607, "step": 23 }, { "Batch Mean": 0.9421095848083496, "accuracy": 0.609375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 16.7529354095459, "learning_rate": 2.968421052631579e-06, "loss": 0.6542, "step": 24 }, { "Batch Mean": 1.0722179412841797, "accuracy": 0.71875, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 18.940032958984375, "learning_rate": 2.960526315789474e-06, "loss": 0.5882, "step": 25 }, { "Batch Mean": 0.8396664261817932, "accuracy": 0.6484375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 14.680763244628906, "learning_rate": 2.9526315789473685e-06, "loss": 0.6463, "step": 26 }, { "Batch Mean": 0.2986793518066406, "accuracy": 0.671875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 7.3362345695495605, "learning_rate": 2.9447368421052633e-06, "loss": 0.5923, "step": 27 }, { "Batch Mean": -0.29196763038635254, "accuracy": 0.734375, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 8.053507804870605, "learning_rate": 2.936842105263158e-06, "loss": 0.5261, "step": 28 }, { "Batch Mean": -0.8161113262176514, "accuracy": 0.7109375, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 14.036270141601562, "learning_rate": 2.9289473684210528e-06, "loss": 0.5482, "step": 29 }, { "Batch Mean": -0.7824737429618835, "accuracy": 0.75, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 13.984892845153809, "learning_rate": 2.9210526315789475e-06, "loss": 0.5939, "step": 30 }, { "Batch Mean": -0.4927825927734375, "accuracy": 0.734375, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 10.751565933227539, "learning_rate": 2.9131578947368423e-06, "loss": 0.5901, "step": 31 }, { "Batch Mean": 0.15497040748596191, "accuracy": 0.6796875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 8.30102252960205, "learning_rate": 2.905263157894737e-06, "loss": 0.5731, "step": 32 }, { "Batch Mean": 0.16421844065189362, "accuracy": 0.7578125, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 9.087366104125977, "learning_rate": 2.8973684210526318e-06, "loss": 0.4958, "step": 33 }, { "Batch Mean": 0.3596491813659668, "accuracy": 0.859375, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 9.984371185302734, "learning_rate": 2.8894736842105265e-06, "loss": 0.4321, "step": 34 }, { "Batch Mean": 0.2128266543149948, "accuracy": 0.734375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 7.400077819824219, "learning_rate": 2.8815789473684213e-06, "loss": 0.5178, "step": 35 }, { "Batch Mean": 0.10449030995368958, "accuracy": 0.7578125, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 6.669457912445068, "learning_rate": 2.873684210526316e-06, "loss": 0.4914, "step": 36 }, { "Batch Mean": -0.24382781982421875, "accuracy": 0.6953125, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 10.831747055053711, "learning_rate": 2.8657894736842103e-06, "loss": 0.5305, "step": 37 }, { "Batch Mean": -0.29154396057128906, "accuracy": 0.7109375, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 11.060967445373535, "learning_rate": 2.857894736842105e-06, "loss": 0.5589, "step": 38 }, { "Batch Mean": -0.07830595970153809, "accuracy": 0.765625, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.426444053649902, "learning_rate": 2.85e-06, "loss": 0.4663, "step": 39 }, { "Batch Mean": -0.09856069087982178, "accuracy": 0.7890625, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 11.046717643737793, "learning_rate": 2.8421052631578946e-06, "loss": 0.455, "step": 40 }, { "Batch Mean": 0.29904642701148987, "accuracy": 0.8125, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 11.232060432434082, "learning_rate": 2.8342105263157897e-06, "loss": 0.437, "step": 41 }, { "Batch Mean": 0.43527090549468994, "accuracy": 0.78125, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 12.555906295776367, "learning_rate": 2.8263157894736845e-06, "loss": 0.5316, "step": 42 }, { "Batch Mean": 0.02639901638031006, "accuracy": 0.8046875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.500948905944824, "learning_rate": 2.8184210526315792e-06, "loss": 0.4874, "step": 43 }, { "Batch Mean": -0.3524761199951172, "accuracy": 0.7421875, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 10.417457580566406, "learning_rate": 2.810526315789474e-06, "loss": 0.5011, "step": 44 }, { "Batch Mean": -0.32391130924224854, "accuracy": 0.8125, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 9.867128372192383, "learning_rate": 2.8026315789473687e-06, "loss": 0.4908, "step": 45 }, { "Batch Mean": -0.42060422897338867, "accuracy": 0.8125, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 10.613035202026367, "learning_rate": 2.7947368421052635e-06, "loss": 0.4642, "step": 46 }, { "Batch Mean": 0.04145359992980957, "accuracy": 0.7890625, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 12.13752555847168, "learning_rate": 2.7868421052631578e-06, "loss": 0.4943, "step": 47 }, { "Batch Mean": 0.24553179740905762, "accuracy": 0.8046875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 11.048128128051758, "learning_rate": 2.7789473684210525e-06, "loss": 0.3946, "step": 48 }, { "Batch Mean": -0.05052506923675537, "accuracy": 0.796875, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 9.053565979003906, "learning_rate": 2.7710526315789473e-06, "loss": 0.4474, "step": 49 }, { "Batch Mean": -0.43336963653564453, "accuracy": 0.7265625, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 15.04668140411377, "learning_rate": 2.763157894736842e-06, "loss": 0.5496, "step": 50 }, { "Batch Mean": -0.29952335357666016, "accuracy": 0.765625, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 11.302435874938965, "learning_rate": 2.7552631578947368e-06, "loss": 0.4337, "step": 51 }, { "Batch Mean": -0.01873302459716797, "accuracy": 0.828125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 10.582301139831543, "learning_rate": 2.7473684210526315e-06, "loss": 0.4073, "step": 52 }, { "Batch Mean": 0.4117751717567444, "accuracy": 0.8046875, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 9.776220321655273, "learning_rate": 2.7394736842105263e-06, "loss": 0.4235, "step": 53 }, { "Batch Mean": 0.5990171432495117, "accuracy": 0.78125, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 12.647154808044434, "learning_rate": 2.7315789473684214e-06, "loss": 0.4858, "step": 54 }, { "Batch Mean": 0.2538492679595947, "accuracy": 0.7265625, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 11.564103126525879, "learning_rate": 2.723684210526316e-06, "loss": 0.5233, "step": 55 }, { "Batch Mean": 0.17059040069580078, "accuracy": 0.7578125, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 10.589515686035156, "learning_rate": 2.715789473684211e-06, "loss": 0.4762, "step": 56 }, { "Batch Mean": -0.3642357587814331, "accuracy": 0.78125, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 10.727502822875977, "learning_rate": 2.7078947368421052e-06, "loss": 0.5289, "step": 57 }, { "Batch Mean": -0.5980481505393982, "accuracy": 0.8203125, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 8.915611267089844, "learning_rate": 2.7e-06, "loss": 0.4038, "step": 58 }, { "Batch Mean": -0.46013855934143066, "accuracy": 0.859375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 9.178686141967773, "learning_rate": 2.6921052631578947e-06, "loss": 0.3708, "step": 59 }, { "Batch Mean": -0.1478586494922638, "accuracy": 0.84375, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.339688301086426, "learning_rate": 2.6842105263157895e-06, "loss": 0.4036, "step": 60 }, { "Batch Mean": 0.003389716148376465, "accuracy": 0.7578125, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 6.763300895690918, "learning_rate": 2.6763157894736842e-06, "loss": 0.5163, "step": 61 }, { "Batch Mean": 0.4018087387084961, "accuracy": 0.7734375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.631525039672852, "learning_rate": 2.668421052631579e-06, "loss": 0.4219, "step": 62 }, { "Batch Mean": 0.342004656791687, "accuracy": 0.734375, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 9.190362930297852, "learning_rate": 2.6605263157894737e-06, "loss": 0.4708, "step": 63 }, { "Batch Mean": 0.5178697109222412, "accuracy": 0.8125, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.207452774047852, "learning_rate": 2.6526315789473685e-06, "loss": 0.4458, "step": 64 }, { "Batch Mean": 0.03012150526046753, "accuracy": 0.8125, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 8.050507545471191, "learning_rate": 2.644736842105263e-06, "loss": 0.4344, "step": 65 }, { "Batch Mean": -0.3691895604133606, "accuracy": 0.78125, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 11.163555145263672, "learning_rate": 2.636842105263158e-06, "loss": 0.4276, "step": 66 }, { "Batch Mean": -0.2624788284301758, "accuracy": 0.8125, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 9.523059844970703, "learning_rate": 2.6289473684210527e-06, "loss": 0.4001, "step": 67 }, { "Batch Mean": -0.0055138468742370605, "accuracy": 0.8046875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 9.094910621643066, "learning_rate": 2.6210526315789474e-06, "loss": 0.4024, "step": 68 }, { "Batch Mean": 0.2572704553604126, "accuracy": 0.7734375, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.00645923614502, "learning_rate": 2.613157894736842e-06, "loss": 0.4802, "step": 69 }, { "Batch Mean": 0.5112218856811523, "accuracy": 0.8046875, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 11.776649475097656, "learning_rate": 2.605263157894737e-06, "loss": 0.4194, "step": 70 }, { "Batch Mean": 0.29661768674850464, "accuracy": 0.8125, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 9.701162338256836, "learning_rate": 2.5973684210526317e-06, "loss": 0.4356, "step": 71 }, { "Batch Mean": 0.049837589263916016, "accuracy": 0.859375, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 8.692646980285645, "learning_rate": 2.5894736842105264e-06, "loss": 0.2881, "step": 72 }, { "Batch Mean": -0.3786022663116455, "accuracy": 0.8125, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 10.834145545959473, "learning_rate": 2.581578947368421e-06, "loss": 0.4654, "step": 73 }, { "Batch Mean": -0.20977401733398438, "accuracy": 0.8515625, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.816598892211914, "learning_rate": 2.573684210526316e-06, "loss": 0.4101, "step": 74 }, { "Batch Mean": -0.6599991321563721, "accuracy": 0.8203125, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 13.089007377624512, "learning_rate": 2.5657894736842107e-06, "loss": 0.427, "step": 75 }, { "Batch Mean": -0.37617337703704834, "accuracy": 0.7734375, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 12.075210571289062, "learning_rate": 2.5578947368421054e-06, "loss": 0.4797, "step": 76 }, { "Batch Mean": 0.1281442642211914, "accuracy": 0.8046875, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 9.70462703704834, "learning_rate": 2.55e-06, "loss": 0.4425, "step": 77 }, { "Batch Mean": 0.6071650981903076, "accuracy": 0.828125, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 14.395353317260742, "learning_rate": 2.542105263157895e-06, "loss": 0.4016, "step": 78 }, { "Batch Mean": 0.34730714559555054, "accuracy": 0.8203125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 9.122536659240723, "learning_rate": 2.5342105263157892e-06, "loss": 0.3994, "step": 79 }, { "Batch Mean": 0.36483922600746155, "accuracy": 0.828125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 10.289307594299316, "learning_rate": 2.526315789473684e-06, "loss": 0.368, "step": 80 }, { "Batch Mean": -0.20039799809455872, "accuracy": 0.8125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 7.813342094421387, "learning_rate": 2.5184210526315787e-06, "loss": 0.3679, "step": 81 }, { "Batch Mean": -0.1629079282283783, "accuracy": 0.8203125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.504952430725098, "learning_rate": 2.510526315789474e-06, "loss": 0.3823, "step": 82 }, { "Batch Mean": -0.07863587141036987, "accuracy": 0.8046875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 7.721461296081543, "learning_rate": 2.5026315789473686e-06, "loss": 0.3967, "step": 83 }, { "Batch Mean": 0.17194491624832153, "accuracy": 0.8125, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 8.062063217163086, "learning_rate": 2.4947368421052634e-06, "loss": 0.4195, "step": 84 }, { "Batch Mean": 0.11748838424682617, "accuracy": 0.8359375, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 8.320355415344238, "learning_rate": 2.486842105263158e-06, "loss": 0.3864, "step": 85 }, { "Batch Mean": -0.40801382064819336, "accuracy": 0.7734375, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 14.8066987991333, "learning_rate": 2.478947368421053e-06, "loss": 0.4639, "step": 86 }, { "Batch Mean": -0.35811758041381836, "accuracy": 0.7734375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 14.074177742004395, "learning_rate": 2.4710526315789476e-06, "loss": 0.4999, "step": 87 }, { "Batch Mean": 0.278584361076355, "accuracy": 0.7734375, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 10.59004020690918, "learning_rate": 2.4631578947368424e-06, "loss": 0.404, "step": 88 }, { "Batch Mean": -0.21054387092590332, "accuracy": 0.84375, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 8.488351821899414, "learning_rate": 2.4552631578947367e-06, "loss": 0.3517, "step": 89 }, { "Batch Mean": -0.22967231273651123, "accuracy": 0.84375, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 10.347502708435059, "learning_rate": 2.4473684210526314e-06, "loss": 0.3382, "step": 90 }, { "Batch Mean": 0.1892259418964386, "accuracy": 0.7890625, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 10.092534065246582, "learning_rate": 2.439473684210526e-06, "loss": 0.4586, "step": 91 }, { "Batch Mean": 0.516247570514679, "accuracy": 0.8828125, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 13.944212913513184, "learning_rate": 2.431578947368421e-06, "loss": 0.3067, "step": 92 }, { "Batch Mean": 0.056681275367736816, "accuracy": 0.7890625, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 8.53646183013916, "learning_rate": 2.4236842105263157e-06, "loss": 0.3946, "step": 93 }, { "Batch Mean": -0.47129684686660767, "accuracy": 0.8359375, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 11.441537857055664, "learning_rate": 2.4157894736842104e-06, "loss": 0.3891, "step": 94 }, { "Batch Mean": -0.3262195587158203, "accuracy": 0.84375, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 10.751629829406738, "learning_rate": 2.4078947368421056e-06, "loss": 0.3578, "step": 95 }, { "Batch Mean": -0.6593484878540039, "accuracy": 0.828125, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 15.914350509643555, "learning_rate": 2.4000000000000003e-06, "loss": 0.4204, "step": 96 }, { "Batch Mean": -0.32147085666656494, "accuracy": 0.8359375, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 11.814691543579102, "learning_rate": 2.392105263157895e-06, "loss": 0.3892, "step": 97 }, { "Batch Mean": 0.49291136860847473, "accuracy": 0.875, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 11.86365795135498, "learning_rate": 2.38421052631579e-06, "loss": 0.3264, "step": 98 }, { "Batch Mean": 0.49174070358276367, "accuracy": 0.8515625, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 10.874544143676758, "learning_rate": 2.376315789473684e-06, "loss": 0.3847, "step": 99 }, { "Batch Mean": 0.6919900178909302, "accuracy": 0.796875, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 15.105185508728027, "learning_rate": 2.368421052631579e-06, "loss": 0.4066, "step": 100 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }