{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.75, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": -2.9549560546875, "accuracy": 0.5078125, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 38.43654251098633, "learning_rate": 1.5000000000000002e-07, "loss": 0.8271, "step": 1 }, { "Batch Mean": -2.9178466796875, "accuracy": 0.46875, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 38.60389709472656, "learning_rate": 3.0000000000000004e-07, "loss": 0.8459, "step": 2 }, { "Batch Mean": -2.95965576171875, "accuracy": 0.4453125, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 40.155521392822266, "learning_rate": 4.5e-07, "loss": 0.8471, "step": 3 }, { "Batch Mean": -2.93951416015625, "accuracy": 0.515625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 39.40812683105469, "learning_rate": 6.000000000000001e-07, "loss": 0.8381, "step": 4 }, { "Batch Mean": -2.9075927734375, "accuracy": 0.46875, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 38.975746154785156, "learning_rate": 7.5e-07, "loss": 0.8312, "step": 5 }, { "Batch Mean": -2.85748291015625, "accuracy": 0.3125, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 38.6825065612793, "learning_rate": 9e-07, "loss": 0.8466, "step": 6 }, { "Batch Mean": -2.689208984375, "accuracy": 0.390625, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 34.53704833984375, "learning_rate": 1.05e-06, "loss": 0.8319, "step": 7 }, { "Batch Mean": -2.65997314453125, "accuracy": 0.453125, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 31.157188415527344, "learning_rate": 1.2000000000000002e-06, "loss": 0.8268, "step": 8 }, { "Batch Mean": -2.0892333984375, "accuracy": 0.4921875, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 24.8486270904541, "learning_rate": 1.35e-06, "loss": 0.7703, "step": 9 }, { "Batch Mean": -1.9425048828125, "accuracy": 0.4140625, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 25.407278060913086, "learning_rate": 1.5e-06, "loss": 0.7765, "step": 10 }, { "Batch Mean": -1.772979736328125, "accuracy": 0.34375, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 24.18885040283203, "learning_rate": 1.65e-06, "loss": 0.7768, "step": 11 }, { "Batch Mean": 0.017196819186210632, "accuracy": 0.5, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 8.657893180847168, "learning_rate": 1.8e-06, "loss": 0.7086, "step": 12 }, { "Batch Mean": 0.37944215536117554, "accuracy": 0.4765625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 13.965994834899902, "learning_rate": 1.95e-06, "loss": 0.6999, "step": 13 }, { "Batch Mean": 0.5208501815795898, "accuracy": 0.6171875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 22.43783950805664, "learning_rate": 2.1e-06, "loss": 0.6639, "step": 14 }, { "Batch Mean": 0.7098770141601562, "accuracy": 0.5546875, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 12.728647232055664, "learning_rate": 2.25e-06, "loss": 0.6735, "step": 15 }, { "Batch Mean": 0.9353656768798828, "accuracy": 0.6953125, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 18.56553077697754, "learning_rate": 2.4000000000000003e-06, "loss": 0.6592, "step": 16 }, { "Batch Mean": 1.4734210968017578, "accuracy": 0.640625, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 27.16295051574707, "learning_rate": 2.55e-06, "loss": 0.6713, "step": 17 }, { "Batch Mean": 1.8671340942382812, "accuracy": 0.6484375, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 38.08174514770508, "learning_rate": 2.7e-06, "loss": 0.6832, "step": 18 }, { "Batch Mean": 1.8309574127197266, "accuracy": 0.671875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 43.57632827758789, "learning_rate": 2.85e-06, "loss": 0.6855, "step": 19 }, { "Batch Mean": 1.5924930572509766, "accuracy": 0.6015625, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 35.4539909362793, "learning_rate": 3e-06, "loss": 0.7016, "step": 20 }, { "Batch Mean": 1.2645306587219238, "accuracy": 0.7109375, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 33.7216796875, "learning_rate": 2.992105263157895e-06, "loss": 0.625, "step": 21 }, { "Batch Mean": 0.44832003116607666, "accuracy": 0.71875, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 22.181182861328125, "learning_rate": 2.9842105263157896e-06, "loss": 0.5738, "step": 22 }, { "Batch Mean": -0.5859236717224121, "accuracy": 0.6796875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 19.15841293334961, "learning_rate": 2.9763157894736843e-06, "loss": 0.6047, "step": 23 }, { "Batch Mean": -1.2775471210479736, "accuracy": 0.7421875, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 46.0495491027832, "learning_rate": 2.968421052631579e-06, "loss": 0.5853, "step": 24 }, { "Batch Mean": -1.3207778930664062, "accuracy": 0.6953125, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 44.94231033325195, "learning_rate": 2.960526315789474e-06, "loss": 0.6105, "step": 25 }, { "Batch Mean": -1.3301990032196045, "accuracy": 0.75, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 45.24757766723633, "learning_rate": 2.9526315789473685e-06, "loss": 0.5729, "step": 26 }, { "Batch Mean": -0.9073872566223145, "accuracy": 0.7421875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 30.45476531982422, "learning_rate": 2.9447368421052633e-06, "loss": 0.5707, "step": 27 }, { "Batch Mean": -0.3670217990875244, "accuracy": 0.7421875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 13.637907981872559, "learning_rate": 2.936842105263158e-06, "loss": 0.5046, "step": 28 }, { "Batch Mean": 0.32126128673553467, "accuracy": 0.71875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 11.712913513183594, "learning_rate": 2.9289473684210528e-06, "loss": 0.5753, "step": 29 }, { "Batch Mean": 0.5725572109222412, "accuracy": 0.65625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 13.215094566345215, "learning_rate": 2.9210526315789475e-06, "loss": 0.6093, "step": 30 }, { "Batch Mean": 0.6064486503601074, "accuracy": 0.71875, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 17.638071060180664, "learning_rate": 2.9131578947368423e-06, "loss": 0.5394, "step": 31 }, { "Batch Mean": 0.4119257926940918, "accuracy": 0.796875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 17.742324829101562, "learning_rate": 2.905263157894737e-06, "loss": 0.5049, "step": 32 }, { "Batch Mean": 0.31667208671569824, "accuracy": 0.8125, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 13.98446273803711, "learning_rate": 2.8973684210526318e-06, "loss": 0.466, "step": 33 }, { "Batch Mean": -0.4047205150127411, "accuracy": 0.78125, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 10.204410552978516, "learning_rate": 2.8894736842105265e-06, "loss": 0.4622, "step": 34 }, { "Batch Mean": -0.3730044364929199, "accuracy": 0.7890625, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 12.993864059448242, "learning_rate": 2.8815789473684213e-06, "loss": 0.4788, "step": 35 }, { "Batch Mean": -0.38590008020401, "accuracy": 0.7578125, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 10.724040031433105, "learning_rate": 2.873684210526316e-06, "loss": 0.4847, "step": 36 }, { "Batch Mean": -0.21977567672729492, "accuracy": 0.765625, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 12.729230880737305, "learning_rate": 2.8657894736842103e-06, "loss": 0.5396, "step": 37 }, { "Batch Mean": -0.044023871421813965, "accuracy": 0.7578125, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 12.698225975036621, "learning_rate": 2.857894736842105e-06, "loss": 0.5317, "step": 38 }, { "Batch Mean": 0.3004276752471924, "accuracy": 0.84375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 15.1500244140625, "learning_rate": 2.85e-06, "loss": 0.3625, "step": 39 }, { "Batch Mean": 0.4614996314048767, "accuracy": 0.78125, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 14.328938484191895, "learning_rate": 2.8421052631578946e-06, "loss": 0.4741, "step": 40 }, { "Batch Mean": 0.219995379447937, "accuracy": 0.734375, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 16.527143478393555, "learning_rate": 2.8342105263157897e-06, "loss": 0.5388, "step": 41 }, { "Batch Mean": 0.24077653884887695, "accuracy": 0.71875, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 15.48572826385498, "learning_rate": 2.8263157894736845e-06, "loss": 0.5193, "step": 42 }, { "Batch Mean": -0.29242193698883057, "accuracy": 0.7421875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 14.708074569702148, "learning_rate": 2.8184210526315792e-06, "loss": 0.5096, "step": 43 }, { "Batch Mean": -0.5579910278320312, "accuracy": 0.765625, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 18.473812103271484, "learning_rate": 2.810526315789474e-06, "loss": 0.5251, "step": 44 }, { "Batch Mean": -0.44131672382354736, "accuracy": 0.703125, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 24.496036529541016, "learning_rate": 2.8026315789473687e-06, "loss": 0.5768, "step": 45 }, { "Batch Mean": -0.2054896354675293, "accuracy": 0.8203125, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 16.322071075439453, "learning_rate": 2.7947368421052635e-06, "loss": 0.441, "step": 46 }, { "Batch Mean": 0.5062417984008789, "accuracy": 0.734375, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 14.525461196899414, "learning_rate": 2.7868421052631578e-06, "loss": 0.4942, "step": 47 }, { "Batch Mean": 0.2660309374332428, "accuracy": 0.8359375, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 9.259953498840332, "learning_rate": 2.7789473684210525e-06, "loss": 0.3931, "step": 48 }, { "Batch Mean": 0.34576767683029175, "accuracy": 0.7578125, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 11.19707202911377, "learning_rate": 2.7710526315789473e-06, "loss": 0.4824, "step": 49 }, { "Batch Mean": -0.02308782935142517, "accuracy": 0.796875, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 8.539365768432617, "learning_rate": 2.763157894736842e-06, "loss": 0.4374, "step": 50 }, { "Batch Mean": -0.11477279663085938, "accuracy": 0.796875, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 8.27216911315918, "learning_rate": 2.7552631578947368e-06, "loss": 0.4368, "step": 51 }, { "Batch Mean": -0.31789374351501465, "accuracy": 0.8203125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 10.260517120361328, "learning_rate": 2.7473684210526315e-06, "loss": 0.4347, "step": 52 }, { "Batch Mean": 0.09443974494934082, "accuracy": 0.8125, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 7.1499223709106445, "learning_rate": 2.7394736842105263e-06, "loss": 0.4101, "step": 53 }, { "Batch Mean": 0.10455203056335449, "accuracy": 0.8046875, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 8.759523391723633, "learning_rate": 2.7315789473684214e-06, "loss": 0.4678, "step": 54 }, { "Batch Mean": 0.3895939588546753, "accuracy": 0.8046875, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 12.878321647644043, "learning_rate": 2.723684210526316e-06, "loss": 0.3717, "step": 55 }, { "Batch Mean": 0.4579277038574219, "accuracy": 0.7421875, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 11.548985481262207, "learning_rate": 2.715789473684211e-06, "loss": 0.5196, "step": 56 }, { "Batch Mean": 0.0871274471282959, "accuracy": 0.8203125, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 9.4905424118042, "learning_rate": 2.7078947368421052e-06, "loss": 0.4155, "step": 57 }, { "Batch Mean": -0.27567076683044434, "accuracy": 0.8125, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 9.461136817932129, "learning_rate": 2.7e-06, "loss": 0.4003, "step": 58 }, { "Batch Mean": -0.5103405714035034, "accuracy": 0.8046875, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 13.972933769226074, "learning_rate": 2.6921052631578947e-06, "loss": 0.4395, "step": 59 }, { "Batch Mean": -0.47735142707824707, "accuracy": 0.734375, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 12.991920471191406, "learning_rate": 2.6842105263157895e-06, "loss": 0.5417, "step": 60 }, { "Batch Mean": -0.2846529483795166, "accuracy": 0.78125, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 10.198124885559082, "learning_rate": 2.6763157894736842e-06, "loss": 0.4312, "step": 61 }, { "Batch Mean": 0.15892720222473145, "accuracy": 0.828125, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.384869575500488, "learning_rate": 2.668421052631579e-06, "loss": 0.3634, "step": 62 }, { "Batch Mean": 0.16252660751342773, "accuracy": 0.796875, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 9.487040519714355, "learning_rate": 2.6605263157894737e-06, "loss": 0.4331, "step": 63 }, { "Batch Mean": 0.39125096797943115, "accuracy": 0.84375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.764354705810547, "learning_rate": 2.6526315789473685e-06, "loss": 0.3796, "step": 64 }, { "Batch Mean": 0.09885883331298828, "accuracy": 0.796875, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 8.251502990722656, "learning_rate": 2.644736842105263e-06, "loss": 0.4103, "step": 65 }, { "Batch Mean": -0.0615391731262207, "accuracy": 0.78125, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 7.180301189422607, "learning_rate": 2.636842105263158e-06, "loss": 0.4218, "step": 66 }, { "Batch Mean": -0.44274067878723145, "accuracy": 0.8359375, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 13.95067310333252, "learning_rate": 2.6289473684210527e-06, "loss": 0.4196, "step": 67 }, { "Batch Mean": -0.6870249509811401, "accuracy": 0.796875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 15.820197105407715, "learning_rate": 2.6210526315789474e-06, "loss": 0.4135, "step": 68 }, { "Batch Mean": -0.7148702144622803, "accuracy": 0.7421875, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 18.115154266357422, "learning_rate": 2.613157894736842e-06, "loss": 0.5175, "step": 69 }, { "Batch Mean": -0.1326247751712799, "accuracy": 0.7734375, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 8.709199905395508, "learning_rate": 2.605263157894737e-06, "loss": 0.4396, "step": 70 }, { "Batch Mean": 0.15152764320373535, "accuracy": 0.828125, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 7.756810665130615, "learning_rate": 2.5973684210526317e-06, "loss": 0.4127, "step": 71 }, { "Batch Mean": 0.47759437561035156, "accuracy": 0.875, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 11.938055038452148, "learning_rate": 2.5894736842105264e-06, "loss": 0.3728, "step": 72 }, { "Batch Mean": 0.6294691562652588, "accuracy": 0.8828125, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 16.090402603149414, "learning_rate": 2.581578947368421e-06, "loss": 0.3428, "step": 73 }, { "Batch Mean": 0.6011961698532104, "accuracy": 0.8203125, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 13.527336120605469, "learning_rate": 2.573684210526316e-06, "loss": 0.5287, "step": 74 }, { "Batch Mean": 0.009085655212402344, "accuracy": 0.7890625, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.578335285186768, "learning_rate": 2.5657894736842107e-06, "loss": 0.4145, "step": 75 }, { "Batch Mean": -0.12276363372802734, "accuracy": 0.78125, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 7.629185676574707, "learning_rate": 2.5578947368421054e-06, "loss": 0.4716, "step": 76 }, { "Batch Mean": -0.34308671951293945, "accuracy": 0.8125, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 8.904648780822754, "learning_rate": 2.55e-06, "loss": 0.3838, "step": 77 }, { "Batch Mean": -0.28826236724853516, "accuracy": 0.78125, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 9.088787078857422, "learning_rate": 2.542105263157895e-06, "loss": 0.4531, "step": 78 }, { "Batch Mean": -0.19193828105926514, "accuracy": 0.859375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 7.69492769241333, "learning_rate": 2.5342105263157892e-06, "loss": 0.3733, "step": 79 }, { "Batch Mean": 0.3690178394317627, "accuracy": 0.8203125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 15.70446491241455, "learning_rate": 2.526315789473684e-06, "loss": 0.3847, "step": 80 }, { "Batch Mean": 0.456756591796875, "accuracy": 0.8359375, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 13.275654792785645, "learning_rate": 2.5184210526315787e-06, "loss": 0.4081, "step": 81 }, { "Batch Mean": 0.486045241355896, "accuracy": 0.7578125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 14.960883140563965, "learning_rate": 2.510526315789474e-06, "loss": 0.45, "step": 82 }, { "Batch Mean": 0.007278919219970703, "accuracy": 0.7890625, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 9.315552711486816, "learning_rate": 2.5026315789473686e-06, "loss": 0.4603, "step": 83 }, { "Batch Mean": -0.3312312364578247, "accuracy": 0.7421875, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 18.19389533996582, "learning_rate": 2.4947368421052634e-06, "loss": 0.4748, "step": 84 }, { "Batch Mean": -0.023685932159423828, "accuracy": 0.8515625, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 7.646451473236084, "learning_rate": 2.486842105263158e-06, "loss": 0.3291, "step": 85 }, { "Batch Mean": -0.418212890625, "accuracy": 0.84375, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 11.397950172424316, "learning_rate": 2.478947368421053e-06, "loss": 0.3491, "step": 86 }, { "Batch Mean": -0.2105419635772705, "accuracy": 0.796875, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 16.71412467956543, "learning_rate": 2.4710526315789476e-06, "loss": 0.3686, "step": 87 }, { "Batch Mean": 0.18758773803710938, "accuracy": 0.78125, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 15.67270278930664, "learning_rate": 2.4631578947368424e-06, "loss": 0.4844, "step": 88 }, { "Batch Mean": 0.20205163955688477, "accuracy": 0.78125, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 12.556285858154297, "learning_rate": 2.4552631578947367e-06, "loss": 0.4631, "step": 89 }, { "Batch Mean": 0.2241612672805786, "accuracy": 0.84375, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 13.160058975219727, "learning_rate": 2.4473684210526314e-06, "loss": 0.3143, "step": 90 }, { "Batch Mean": 0.5450854301452637, "accuracy": 0.8203125, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 19.004138946533203, "learning_rate": 2.439473684210526e-06, "loss": 0.3814, "step": 91 }, { "Batch Mean": 0.2415924072265625, "accuracy": 0.78125, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 13.732966423034668, "learning_rate": 2.431578947368421e-06, "loss": 0.3925, "step": 92 }, { "Batch Mean": -0.015282154083251953, "accuracy": 0.8203125, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 11.732492446899414, "learning_rate": 2.4236842105263157e-06, "loss": 0.3839, "step": 93 }, { "Batch Mean": -0.3396167755126953, "accuracy": 0.8203125, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 15.799851417541504, "learning_rate": 2.4157894736842104e-06, "loss": 0.3893, "step": 94 }, { "Batch Mean": -0.3651285171508789, "accuracy": 0.8203125, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 12.945575714111328, "learning_rate": 2.4078947368421056e-06, "loss": 0.3942, "step": 95 }, { "Batch Mean": -0.24241328239440918, "accuracy": 0.828125, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 11.39241886138916, "learning_rate": 2.4000000000000003e-06, "loss": 0.3606, "step": 96 }, { "Batch Mean": -0.10072767734527588, "accuracy": 0.8203125, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 9.7492094039917, "learning_rate": 2.392105263157895e-06, "loss": 0.3818, "step": 97 }, { "Batch Mean": 0.03002488613128662, "accuracy": 0.8515625, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 6.9692864418029785, "learning_rate": 2.38421052631579e-06, "loss": 0.3336, "step": 98 }, { "Batch Mean": 0.3361194133758545, "accuracy": 0.8046875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 13.135473251342773, "learning_rate": 2.376315789473684e-06, "loss": 0.3912, "step": 99 }, { "Batch Mean": 0.4362337589263916, "accuracy": 0.796875, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 12.29136848449707, "learning_rate": 2.368421052631579e-06, "loss": 0.4137, "step": 100 }, { "Batch Mean": 0.4257016181945801, "accuracy": 0.875, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 13.732154846191406, "learning_rate": 2.3605263157894736e-06, "loss": 0.3721, "step": 101 }, { "Batch Mean": 0.06758689880371094, "accuracy": 0.8125, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 9.70384693145752, "learning_rate": 2.3526315789473684e-06, "loss": 0.4345, "step": 102 }, { "Batch Mean": -0.4097929000854492, "accuracy": 0.8125, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 12.665343284606934, "learning_rate": 2.344736842105263e-06, "loss": 0.3869, "step": 103 }, { "Batch Mean": -0.8724770545959473, "accuracy": 0.8203125, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 20.12427520751953, "learning_rate": 2.336842105263158e-06, "loss": 0.4037, "step": 104 }, { "Batch Mean": -0.4974844455718994, "accuracy": 0.7890625, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 17.464170455932617, "learning_rate": 2.3289473684210526e-06, "loss": 0.4492, "step": 105 }, { "Batch Mean": -0.35894912481307983, "accuracy": 0.828125, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 10.907158851623535, "learning_rate": 2.3210526315789473e-06, "loss": 0.3291, "step": 106 }, { "Batch Mean": 0.3651762008666992, "accuracy": 0.8125, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 11.427133560180664, "learning_rate": 2.313157894736842e-06, "loss": 0.4021, "step": 107 }, { "Batch Mean": 0.2751443386077881, "accuracy": 0.8203125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 8.229965209960938, "learning_rate": 2.305263157894737e-06, "loss": 0.3729, "step": 108 }, { "Batch Mean": 0.9130334854125977, "accuracy": 0.859375, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 22.57686996459961, "learning_rate": 2.2973684210526316e-06, "loss": 0.3397, "step": 109 }, { "Batch Mean": 0.7281837463378906, "accuracy": 0.7734375, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 16.920743942260742, "learning_rate": 2.2894736842105263e-06, "loss": 0.4643, "step": 110 }, { "Batch Mean": 0.4563823342323303, "accuracy": 0.8125, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 15.203326225280762, "learning_rate": 2.281578947368421e-06, "loss": 0.3964, "step": 111 }, { "Batch Mean": -0.07430100440979004, "accuracy": 0.8203125, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 13.103719711303711, "learning_rate": 2.273684210526316e-06, "loss": 0.4149, "step": 112 }, { "Batch Mean": -0.5845143795013428, "accuracy": 0.8125, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 24.87859535217285, "learning_rate": 2.2657894736842106e-06, "loss": 0.4353, "step": 113 }, { "Batch Mean": -1.1507502794265747, "accuracy": 0.8671875, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 27.043203353881836, "learning_rate": 2.2578947368421053e-06, "loss": 0.3758, "step": 114 }, { "Batch Mean": -1.2163114547729492, "accuracy": 0.8671875, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 24.969430923461914, "learning_rate": 2.25e-06, "loss": 0.341, "step": 115 }, { "Batch Mean": -0.6281991004943848, "accuracy": 0.828125, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 14.463933944702148, "learning_rate": 2.242105263157895e-06, "loss": 0.4114, "step": 116 }, { "Batch Mean": 0.09254121780395508, "accuracy": 0.84375, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 7.876263618469238, "learning_rate": 2.2342105263157895e-06, "loss": 0.36, "step": 117 }, { "Batch Mean": 0.4857252240180969, "accuracy": 0.7890625, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 10.79910659790039, "learning_rate": 2.2263157894736843e-06, "loss": 0.376, "step": 118 }, { "Batch Mean": 0.5527479648590088, "accuracy": 0.84375, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 14.145975112915039, "learning_rate": 2.218421052631579e-06, "loss": 0.3598, "step": 119 }, { "Batch Mean": 0.33617615699768066, "accuracy": 0.859375, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 9.850839614868164, "learning_rate": 2.2105263157894738e-06, "loss": 0.3558, "step": 120 }, { "Batch Mean": 0.37594175338745117, "accuracy": 0.765625, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 10.884904861450195, "learning_rate": 2.2026315789473685e-06, "loss": 0.4847, "step": 121 }, { "Batch Mean": -0.30231380462646484, "accuracy": 0.8203125, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 8.063970565795898, "learning_rate": 2.1947368421052633e-06, "loss": 0.3881, "step": 122 }, { "Batch Mean": -0.4219226837158203, "accuracy": 0.8046875, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 11.049520492553711, "learning_rate": 2.186842105263158e-06, "loss": 0.3564, "step": 123 }, { "Batch Mean": -0.015915870666503906, "accuracy": 0.796875, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 7.677197456359863, "learning_rate": 2.1789473684210528e-06, "loss": 0.3923, "step": 124 }, { "Batch Mean": 0.25323057174682617, "accuracy": 0.859375, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 8.646540641784668, "learning_rate": 2.1710526315789475e-06, "loss": 0.4231, "step": 125 }, { "Batch Mean": -0.45605993270874023, "accuracy": 0.8359375, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 10.95660400390625, "learning_rate": 2.1631578947368423e-06, "loss": 0.3905, "step": 126 }, { "Batch Mean": -0.09001016616821289, "accuracy": 0.859375, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 6.532036304473877, "learning_rate": 2.155263157894737e-06, "loss": 0.3104, "step": 127 }, { "Batch Mean": 0.001552581787109375, "accuracy": 0.78125, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 7.6803178787231445, "learning_rate": 2.1473684210526317e-06, "loss": 0.4176, "step": 128 }, { "Batch Mean": 0.24973249435424805, "accuracy": 0.84375, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 7.414150714874268, "learning_rate": 2.1394736842105265e-06, "loss": 0.3664, "step": 129 }, { "Batch Mean": -0.17685949802398682, "accuracy": 0.859375, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.062408447265625, "learning_rate": 2.1315789473684212e-06, "loss": 0.3671, "step": 130 }, { "Batch Mean": 0.3163696527481079, "accuracy": 0.8125, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 8.69417953491211, "learning_rate": 2.123684210526316e-06, "loss": 0.3946, "step": 131 }, { "Batch Mean": 0.13493728637695312, "accuracy": 0.8359375, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 7.916721343994141, "learning_rate": 2.1157894736842103e-06, "loss": 0.4373, "step": 132 }, { "Batch Mean": 0.032731056213378906, "accuracy": 0.8046875, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 6.909370422363281, "learning_rate": 2.107894736842105e-06, "loss": 0.3488, "step": 133 }, { "Batch Mean": -0.6435856819152832, "accuracy": 0.84375, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 13.454378128051758, "learning_rate": 2.1e-06, "loss": 0.4012, "step": 134 }, { "Batch Mean": -0.04024988412857056, "accuracy": 0.8125, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 6.9208502769470215, "learning_rate": 2.0921052631578945e-06, "loss": 0.4132, "step": 135 }, { "Batch Mean": 0.10650634765625, "accuracy": 0.7890625, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 7.637556552886963, "learning_rate": 2.0842105263157897e-06, "loss": 0.4497, "step": 136 }, { "Batch Mean": -0.2209153175354004, "accuracy": 0.7734375, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 8.982176780700684, "learning_rate": 2.0763157894736845e-06, "loss": 0.4867, "step": 137 }, { "Batch Mean": 0.0443209707736969, "accuracy": 0.8359375, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 6.714746475219727, "learning_rate": 2.068421052631579e-06, "loss": 0.3642, "step": 138 }, { "Batch Mean": 0.03779444098472595, "accuracy": 0.859375, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 6.019307613372803, "learning_rate": 2.060526315789474e-06, "loss": 0.3308, "step": 139 }, { "Batch Mean": 0.2002730369567871, "accuracy": 0.8046875, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 7.735702037811279, "learning_rate": 2.0526315789473687e-06, "loss": 0.4197, "step": 140 }, { "Batch Mean": 0.16213250160217285, "accuracy": 0.828125, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 7.075012683868408, "learning_rate": 2.0447368421052634e-06, "loss": 0.3772, "step": 141 }, { "Batch Mean": -0.05973696708679199, "accuracy": 0.859375, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 6.793128967285156, "learning_rate": 2.0368421052631578e-06, "loss": 0.3782, "step": 142 }, { "Batch Mean": -0.2323307991027832, "accuracy": 0.84375, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 8.474074363708496, "learning_rate": 2.0289473684210525e-06, "loss": 0.412, "step": 143 }, { "Batch Mean": -0.594693660736084, "accuracy": 0.8359375, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 11.406280517578125, "learning_rate": 2.0210526315789473e-06, "loss": 0.3999, "step": 144 }, { "Batch Mean": -0.34098196029663086, "accuracy": 0.7890625, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 8.726512908935547, "learning_rate": 2.013157894736842e-06, "loss": 0.4033, "step": 145 }, { "Batch Mean": 0.14231419563293457, "accuracy": 0.8359375, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 6.767786979675293, "learning_rate": 2.0052631578947367e-06, "loss": 0.3359, "step": 146 }, { "Batch Mean": 0.4020230770111084, "accuracy": 0.8203125, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 9.113909721374512, "learning_rate": 1.9973684210526315e-06, "loss": 0.4163, "step": 147 }, { "Batch Mean": 0.46575236320495605, "accuracy": 0.8046875, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 10.250436782836914, "learning_rate": 1.9894736842105262e-06, "loss": 0.4773, "step": 148 }, { "Batch Mean": 0.1646571159362793, "accuracy": 0.8125, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 8.450485229492188, "learning_rate": 1.9815789473684214e-06, "loss": 0.354, "step": 149 }, { "Batch Mean": 0.20849394798278809, "accuracy": 0.890625, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 8.217352867126465, "learning_rate": 1.973684210526316e-06, "loss": 0.3627, "step": 150 }, { "Batch Mean": -0.42828369140625, "accuracy": 0.8125, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 12.038201332092285, "learning_rate": 1.965789473684211e-06, "loss": 0.3728, "step": 151 }, { "Batch Mean": -0.11544227600097656, "accuracy": 0.78125, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 7.889610290527344, "learning_rate": 1.9578947368421052e-06, "loss": 0.3893, "step": 152 }, { "Batch Mean": -0.21321868896484375, "accuracy": 0.8046875, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 8.342473030090332, "learning_rate": 1.95e-06, "loss": 0.4122, "step": 153 }, { "Batch Mean": -0.15835070610046387, "accuracy": 0.8046875, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 7.926384449005127, "learning_rate": 1.9421052631578947e-06, "loss": 0.3922, "step": 154 }, { "Batch Mean": 0.010342597961425781, "accuracy": 0.8046875, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 8.17468547821045, "learning_rate": 1.9342105263157895e-06, "loss": 0.4333, "step": 155 }, { "Batch Mean": 0.3719363212585449, "accuracy": 0.8046875, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 10.360675811767578, "learning_rate": 1.926315789473684e-06, "loss": 0.4128, "step": 156 }, { "Batch Mean": 0.12261229753494263, "accuracy": 0.78125, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 9.016860961914062, "learning_rate": 1.918421052631579e-06, "loss": 0.443, "step": 157 }, { "Batch Mean": -0.06574655324220657, "accuracy": 0.84375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 6.0845255851745605, "learning_rate": 1.9105263157894737e-06, "loss": 0.3329, "step": 158 }, { "Batch Mean": -0.33632707595825195, "accuracy": 0.796875, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 9.574487686157227, "learning_rate": 1.9026315789473684e-06, "loss": 0.4283, "step": 159 }, { "Batch Mean": 0.016783952713012695, "accuracy": 0.84375, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 6.5984787940979, "learning_rate": 1.8947368421052632e-06, "loss": 0.3412, "step": 160 }, { "Batch Mean": 0.2222914844751358, "accuracy": 0.84375, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 7.398411273956299, "learning_rate": 1.8868421052631577e-06, "loss": 0.4231, "step": 161 }, { "Batch Mean": 0.136377215385437, "accuracy": 0.84375, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 8.897592544555664, "learning_rate": 1.8789473684210525e-06, "loss": 0.393, "step": 162 }, { "Batch Mean": 0.03430792689323425, "accuracy": 0.8125, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 6.535599231719971, "learning_rate": 1.8710526315789476e-06, "loss": 0.3837, "step": 163 }, { "Batch Mean": -0.09270691871643066, "accuracy": 0.8515625, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 6.2713823318481445, "learning_rate": 1.8631578947368424e-06, "loss": 0.3364, "step": 164 }, { "Batch Mean": -0.07146286964416504, "accuracy": 0.8203125, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 6.161220073699951, "learning_rate": 1.855263157894737e-06, "loss": 0.3677, "step": 165 }, { "Batch Mean": -0.20211690664291382, "accuracy": 0.78125, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 7.355227470397949, "learning_rate": 1.8473684210526317e-06, "loss": 0.4361, "step": 166 }, { "Batch Mean": -0.3163893222808838, "accuracy": 0.8203125, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 8.728869438171387, "learning_rate": 1.8394736842105264e-06, "loss": 0.3997, "step": 167 }, { "Batch Mean": 0.3005563020706177, "accuracy": 0.8515625, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 6.728272914886475, "learning_rate": 1.8315789473684211e-06, "loss": 0.329, "step": 168 }, { "Batch Mean": 0.1099938154220581, "accuracy": 0.828125, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 6.236492156982422, "learning_rate": 1.8236842105263159e-06, "loss": 0.3884, "step": 169 }, { "Batch Mean": 0.3710876703262329, "accuracy": 0.796875, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 8.478776931762695, "learning_rate": 1.8157894736842106e-06, "loss": 0.3932, "step": 170 }, { "Batch Mean": -0.22776031494140625, "accuracy": 0.8515625, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 7.591424942016602, "learning_rate": 1.8078947368421052e-06, "loss": 0.3521, "step": 171 }, { "Batch Mean": -0.3130757212638855, "accuracy": 0.890625, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 7.310946941375732, "learning_rate": 1.8e-06, "loss": 0.3295, "step": 172 }, { "Batch Mean": 0.1858811378479004, "accuracy": 0.890625, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 8.169261932373047, "learning_rate": 1.7921052631578947e-06, "loss": 0.313, "step": 173 }, { "Batch Mean": 0.2934098243713379, "accuracy": 0.84375, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 8.353775978088379, "learning_rate": 1.7842105263157894e-06, "loss": 0.3104, "step": 174 }, { "Batch Mean": 0.1525893211364746, "accuracy": 0.7890625, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 7.484781265258789, "learning_rate": 1.7763157894736842e-06, "loss": 0.4333, "step": 175 }, { "Batch Mean": 0.12224054336547852, "accuracy": 0.84375, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 7.87322473526001, "learning_rate": 1.768421052631579e-06, "loss": 0.3525, "step": 176 }, { "Batch Mean": -0.12131023406982422, "accuracy": 0.875, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 6.532097339630127, "learning_rate": 1.7605263157894739e-06, "loss": 0.3028, "step": 177 }, { "Batch Mean": -0.17700982093811035, "accuracy": 0.78125, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 10.696880340576172, "learning_rate": 1.7526315789473686e-06, "loss": 0.4192, "step": 178 }, { "Batch Mean": -0.20450687408447266, "accuracy": 0.859375, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 8.799153327941895, "learning_rate": 1.7447368421052633e-06, "loss": 0.3349, "step": 179 }, { "Batch Mean": 0.114410400390625, "accuracy": 0.828125, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 8.984070777893066, "learning_rate": 1.736842105263158e-06, "loss": 0.3446, "step": 180 }, { "Batch Mean": 0.1458113193511963, "accuracy": 0.8515625, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 9.16761302947998, "learning_rate": 1.7289473684210526e-06, "loss": 0.346, "step": 181 }, { "Batch Mean": -0.05868828296661377, "accuracy": 0.90625, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 6.830461025238037, "learning_rate": 1.7210526315789474e-06, "loss": 0.2781, "step": 182 }, { "Batch Mean": 0.0004514455795288086, "accuracy": 0.8515625, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 8.418164253234863, "learning_rate": 1.7131578947368421e-06, "loss": 0.3541, "step": 183 }, { "Batch Mean": -0.24701285362243652, "accuracy": 0.8203125, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 8.39130687713623, "learning_rate": 1.7052631578947369e-06, "loss": 0.3704, "step": 184 }, { "Batch Mean": 0.25158822536468506, "accuracy": 0.84375, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 8.080472946166992, "learning_rate": 1.6973684210526316e-06, "loss": 0.3565, "step": 185 }, { "Batch Mean": 0.044036865234375, "accuracy": 0.7578125, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 10.760668754577637, "learning_rate": 1.6894736842105264e-06, "loss": 0.515, "step": 186 }, { "Batch Mean": 0.0032491683959960938, "accuracy": 0.859375, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 7.858923435211182, "learning_rate": 1.6815789473684209e-06, "loss": 0.3053, "step": 187 }, { "Batch Mean": -0.0027008056640625, "accuracy": 0.859375, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 7.517134189605713, "learning_rate": 1.6736842105263156e-06, "loss": 0.2864, "step": 188 }, { "Batch Mean": 0.25811076164245605, "accuracy": 0.8203125, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 7.516969680786133, "learning_rate": 1.6657894736842104e-06, "loss": 0.3299, "step": 189 }, { "Batch Mean": 0.024303913116455078, "accuracy": 0.796875, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 9.072182655334473, "learning_rate": 1.6578947368421056e-06, "loss": 0.4424, "step": 190 }, { "Batch Mean": -0.38180357217788696, "accuracy": 0.7890625, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 9.461527824401855, "learning_rate": 1.65e-06, "loss": 0.3751, "step": 191 }, { "Batch Mean": -0.594372034072876, "accuracy": 0.84375, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 11.741211891174316, "learning_rate": 1.6421052631578948e-06, "loss": 0.3127, "step": 192 }, { "Batch Mean": -0.5924481153488159, "accuracy": 0.890625, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 10.474485397338867, "learning_rate": 1.6342105263157896e-06, "loss": 0.276, "step": 193 }, { "Batch Mean": -0.16649462282657623, "accuracy": 0.8125, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 8.504786491394043, "learning_rate": 1.6263157894736843e-06, "loss": 0.3619, "step": 194 }, { "Batch Mean": 0.15191316604614258, "accuracy": 0.8671875, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 6.697569370269775, "learning_rate": 1.618421052631579e-06, "loss": 0.3237, "step": 195 }, { "Batch Mean": 0.5648140907287598, "accuracy": 0.828125, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 11.56078815460205, "learning_rate": 1.6105263157894738e-06, "loss": 0.3372, "step": 196 }, { "Batch Mean": 0.6280609369277954, "accuracy": 0.8203125, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 12.10026741027832, "learning_rate": 1.6026315789473683e-06, "loss": 0.376, "step": 197 }, { "Batch Mean": 0.46126890182495117, "accuracy": 0.796875, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 9.432372093200684, "learning_rate": 1.594736842105263e-06, "loss": 0.3827, "step": 198 }, { "Batch Mean": 0.3517181873321533, "accuracy": 0.796875, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 9.432829856872559, "learning_rate": 1.5868421052631578e-06, "loss": 0.3766, "step": 199 }, { "Batch Mean": -0.18610143661499023, "accuracy": 0.7890625, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 8.5609130859375, "learning_rate": 1.5789473684210526e-06, "loss": 0.4419, "step": 200 }, { "Batch Mean": -0.3865816593170166, "accuracy": 0.890625, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 8.512102127075195, "learning_rate": 1.5710526315789473e-06, "loss": 0.2774, "step": 201 }, { "Batch Mean": -0.4919016361236572, "accuracy": 0.859375, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 9.126331329345703, "learning_rate": 1.563157894736842e-06, "loss": 0.3426, "step": 202 }, { "Batch Mean": -0.48360347747802734, "accuracy": 0.78125, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 10.955025672912598, "learning_rate": 1.5552631578947368e-06, "loss": 0.411, "step": 203 }, { "Batch Mean": -0.04683363437652588, "accuracy": 0.796875, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 7.002114295959473, "learning_rate": 1.5473684210526318e-06, "loss": 0.3729, "step": 204 }, { "Batch Mean": 0.03174877166748047, "accuracy": 0.890625, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 6.588340759277344, "learning_rate": 1.5394736842105265e-06, "loss": 0.32, "step": 205 }, { "Batch Mean": 0.31306135654449463, "accuracy": 0.8125, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 9.080763816833496, "learning_rate": 1.5315789473684213e-06, "loss": 0.4178, "step": 206 }, { "Batch Mean": 0.5678501129150391, "accuracy": 0.84375, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 10.213749885559082, "learning_rate": 1.5236842105263158e-06, "loss": 0.3523, "step": 207 }, { "Batch Mean": 0.2368319034576416, "accuracy": 0.84375, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 8.609210968017578, "learning_rate": 1.5157894736842105e-06, "loss": 0.3701, "step": 208 }, { "Batch Mean": 0.03226196765899658, "accuracy": 0.8046875, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 8.7013578414917, "learning_rate": 1.5078947368421053e-06, "loss": 0.4091, "step": 209 }, { "Batch Mean": -0.3510150909423828, "accuracy": 0.828125, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 9.209632873535156, "learning_rate": 1.5e-06, "loss": 0.4172, "step": 210 }, { "Batch Mean": 0.02189686894416809, "accuracy": 0.859375, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 7.103114128112793, "learning_rate": 1.4921052631578948e-06, "loss": 0.3525, "step": 211 }, { "Batch Mean": 0.08981943130493164, "accuracy": 0.8359375, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 8.323429107666016, "learning_rate": 1.4842105263157895e-06, "loss": 0.3985, "step": 212 }, { "Batch Mean": -0.0260312557220459, "accuracy": 0.859375, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 6.364190101623535, "learning_rate": 1.4763157894736843e-06, "loss": 0.3283, "step": 213 }, { "Batch Mean": -0.45758867263793945, "accuracy": 0.828125, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 10.124600410461426, "learning_rate": 1.468421052631579e-06, "loss": 0.4036, "step": 214 }, { "Batch Mean": -0.12888717651367188, "accuracy": 0.9140625, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 6.383907794952393, "learning_rate": 1.4605263157894738e-06, "loss": 0.26, "step": 215 }, { "Batch Mean": 0.06431245803833008, "accuracy": 0.8203125, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 6.595977783203125, "learning_rate": 1.4526315789473685e-06, "loss": 0.368, "step": 216 }, { "Batch Mean": 0.015802383422851562, "accuracy": 0.8359375, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 5.937958240509033, "learning_rate": 1.4447368421052633e-06, "loss": 0.3386, "step": 217 }, { "Batch Mean": 0.2596292495727539, "accuracy": 0.8671875, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 8.265673637390137, "learning_rate": 1.436842105263158e-06, "loss": 0.3517, "step": 218 }, { "Batch Mean": 0.06246137619018555, "accuracy": 0.8515625, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 7.670858860015869, "learning_rate": 1.4289473684210525e-06, "loss": 0.3248, "step": 219 }, { "Batch Mean": 0.49367237091064453, "accuracy": 0.84375, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 9.92224407196045, "learning_rate": 1.4210526315789473e-06, "loss": 0.4093, "step": 220 }, { "Batch Mean": -0.056406617164611816, "accuracy": 0.8359375, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 7.7244648933410645, "learning_rate": 1.4131578947368422e-06, "loss": 0.3588, "step": 221 }, { "Batch Mean": -0.31797313690185547, "accuracy": 0.8125, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 8.702153205871582, "learning_rate": 1.405263157894737e-06, "loss": 0.3758, "step": 222 }, { "Batch Mean": 0.1353154182434082, "accuracy": 0.8046875, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 8.6244535446167, "learning_rate": 1.3973684210526317e-06, "loss": 0.4032, "step": 223 }, { "Batch Mean": 0.02267169952392578, "accuracy": 0.890625, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 5.774755954742432, "learning_rate": 1.3894736842105263e-06, "loss": 0.2707, "step": 224 }, { "Batch Mean": -0.02749919891357422, "accuracy": 0.890625, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 5.9841532707214355, "learning_rate": 1.381578947368421e-06, "loss": 0.2991, "step": 225 }, { "Batch Mean": -0.05124187469482422, "accuracy": 0.8203125, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 7.033453464508057, "learning_rate": 1.3736842105263158e-06, "loss": 0.3352, "step": 226 }, { "Batch Mean": -0.4628112316131592, "accuracy": 0.828125, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 10.019220352172852, "learning_rate": 1.3657894736842107e-06, "loss": 0.3739, "step": 227 }, { "Batch Mean": 0.15478086471557617, "accuracy": 0.8203125, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 8.91198444366455, "learning_rate": 1.3578947368421055e-06, "loss": 0.3313, "step": 228 }, { "Batch Mean": 0.06676062941551208, "accuracy": 0.8828125, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 7.143804550170898, "learning_rate": 1.35e-06, "loss": 0.3072, "step": 229 }, { "Batch Mean": 0.47977638244628906, "accuracy": 0.8515625, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 11.133859634399414, "learning_rate": 1.3421052631578947e-06, "loss": 0.345, "step": 230 }, { "Batch Mean": 0.3305387496948242, "accuracy": 0.859375, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 10.80010986328125, "learning_rate": 1.3342105263157895e-06, "loss": 0.3141, "step": 231 }, { "Batch Mean": 0.43216419219970703, "accuracy": 0.84375, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 9.948732376098633, "learning_rate": 1.3263157894736842e-06, "loss": 0.3991, "step": 232 }, { "Batch Mean": 0.14983630180358887, "accuracy": 0.8203125, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 7.463159084320068, "learning_rate": 1.318421052631579e-06, "loss": 0.3243, "step": 233 }, { "Batch Mean": -0.5400023460388184, "accuracy": 0.7890625, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 11.068828582763672, "learning_rate": 1.3105263157894737e-06, "loss": 0.4177, "step": 234 }, { "Batch Mean": -0.42549943923950195, "accuracy": 0.8984375, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 11.707810401916504, "learning_rate": 1.3026315789473685e-06, "loss": 0.2911, "step": 235 }, { "Batch Mean": -0.5239953994750977, "accuracy": 0.875, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 10.632058143615723, "learning_rate": 1.2947368421052632e-06, "loss": 0.2737, "step": 236 }, { "Batch Mean": -0.3963519334793091, "accuracy": 0.7890625, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 11.079835891723633, "learning_rate": 1.286842105263158e-06, "loss": 0.4142, "step": 237 }, { "Batch Mean": -0.047173500061035156, "accuracy": 0.859375, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 7.266193389892578, "learning_rate": 1.2789473684210527e-06, "loss": 0.2802, "step": 238 }, { "Batch Mean": 0.07653871178627014, "accuracy": 0.828125, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 7.714227676391602, "learning_rate": 1.2710526315789474e-06, "loss": 0.3471, "step": 239 }, { "Batch Mean": 0.6806421279907227, "accuracy": 0.859375, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 14.054402351379395, "learning_rate": 1.263157894736842e-06, "loss": 0.371, "step": 240 }, { "Batch Mean": 0.5353469848632812, "accuracy": 0.890625, "epoch": 0.6, "step": 240 }, { "epoch": 0.6025, "grad_norm": 10.932038307189941, "learning_rate": 1.255263157894737e-06, "loss": 0.3251, "step": 241 }, { "Batch Mean": 0.4242100715637207, "accuracy": 0.84375, "epoch": 0.6025, "step": 241 }, { "epoch": 0.605, "grad_norm": 9.543030738830566, "learning_rate": 1.2473684210526317e-06, "loss": 0.366, "step": 242 }, { "Batch Mean": 0.1224508285522461, "accuracy": 0.8359375, "epoch": 0.605, "step": 242 }, { "epoch": 0.6075, "grad_norm": 7.535095691680908, "learning_rate": 1.2394736842105264e-06, "loss": 0.355, "step": 243 }, { "Batch Mean": -0.1669750213623047, "accuracy": 0.828125, "epoch": 0.6075, "step": 243 }, { "epoch": 0.61, "grad_norm": 7.886036396026611, "learning_rate": 1.2315789473684212e-06, "loss": 0.3262, "step": 244 }, { "Batch Mean": -1.217991828918457, "accuracy": 0.8359375, "epoch": 0.61, "step": 244 }, { "epoch": 0.6125, "grad_norm": 21.176101684570312, "learning_rate": 1.2236842105263157e-06, "loss": 0.4438, "step": 245 }, { "Batch Mean": -0.2878119945526123, "accuracy": 0.7578125, "epoch": 0.6125, "step": 245 }, { "epoch": 0.615, "grad_norm": 10.222801208496094, "learning_rate": 1.2157894736842105e-06, "loss": 0.439, "step": 246 }, { "Batch Mean": -0.626392126083374, "accuracy": 0.8984375, "epoch": 0.615, "step": 246 }, { "epoch": 0.6175, "grad_norm": 11.413473129272461, "learning_rate": 1.2078947368421052e-06, "loss": 0.2802, "step": 247 }, { "Batch Mean": -0.20267772674560547, "accuracy": 0.875, "epoch": 0.6175, "step": 247 }, { "epoch": 0.62, "grad_norm": 7.269984245300293, "learning_rate": 1.2000000000000002e-06, "loss": 0.3002, "step": 248 }, { "Batch Mean": -0.5709860324859619, "accuracy": 0.828125, "epoch": 0.62, "step": 248 }, { "epoch": 0.6225, "grad_norm": 12.802457809448242, "learning_rate": 1.192105263157895e-06, "loss": 0.3694, "step": 249 }, { "Batch Mean": -0.2514686584472656, "accuracy": 0.8203125, "epoch": 0.6225, "step": 249 }, { "epoch": 0.625, "grad_norm": 8.272834777832031, "learning_rate": 1.1842105263157894e-06, "loss": 0.3595, "step": 250 }, { "Batch Mean": 0.7458231449127197, "accuracy": 0.8125, "epoch": 0.625, "step": 250 }, { "epoch": 0.6275, "grad_norm": 13.081140518188477, "learning_rate": 1.1763157894736842e-06, "loss": 0.3856, "step": 251 }, { "Batch Mean": 0.17588496208190918, "accuracy": 0.8671875, "epoch": 0.6275, "step": 251 }, { "epoch": 0.63, "grad_norm": 7.108640193939209, "learning_rate": 1.168421052631579e-06, "loss": 0.332, "step": 252 }, { "Batch Mean": 0.5019898414611816, "accuracy": 0.84375, "epoch": 0.63, "step": 252 }, { "epoch": 0.6325, "grad_norm": 8.727738380432129, "learning_rate": 1.1605263157894737e-06, "loss": 0.34, "step": 253 }, { "Batch Mean": 0.30086028575897217, "accuracy": 0.890625, "epoch": 0.6325, "step": 253 }, { "epoch": 0.635, "grad_norm": 7.778132915496826, "learning_rate": 1.1526315789473684e-06, "loss": 0.3347, "step": 254 }, { "Batch Mean": 0.49059462547302246, "accuracy": 0.796875, "epoch": 0.635, "step": 254 }, { "epoch": 0.6375, "grad_norm": 10.36148738861084, "learning_rate": 1.1447368421052632e-06, "loss": 0.4268, "step": 255 }, { "Batch Mean": 0.26512861251831055, "accuracy": 0.859375, "epoch": 0.6375, "step": 255 }, { "epoch": 0.64, "grad_norm": 8.016122817993164, "learning_rate": 1.136842105263158e-06, "loss": 0.3719, "step": 256 }, { "Batch Mean": -0.6386747360229492, "accuracy": 0.8515625, "epoch": 0.64, "step": 256 }, { "epoch": 0.6425, "grad_norm": 10.996342658996582, "learning_rate": 1.1289473684210527e-06, "loss": 0.3249, "step": 257 }, { "Batch Mean": -0.41381216049194336, "accuracy": 0.828125, "epoch": 0.6425, "step": 257 }, { "epoch": 0.645, "grad_norm": 8.828882217407227, "learning_rate": 1.1210526315789474e-06, "loss": 0.4112, "step": 258 }, { "Batch Mean": -0.37595319747924805, "accuracy": 0.78125, "epoch": 0.645, "step": 258 }, { "epoch": 0.6475, "grad_norm": 9.125770568847656, "learning_rate": 1.1131578947368421e-06, "loss": 0.4207, "step": 259 }, { "Batch Mean": -0.22586441040039062, "accuracy": 0.875, "epoch": 0.6475, "step": 259 }, { "epoch": 0.65, "grad_norm": 7.365307807922363, "learning_rate": 1.1052631578947369e-06, "loss": 0.3451, "step": 260 }, { "Batch Mean": -0.028152525424957275, "accuracy": 0.8203125, "epoch": 0.65, "step": 260 }, { "epoch": 0.6525, "grad_norm": 6.994399070739746, "learning_rate": 1.0973684210526316e-06, "loss": 0.4034, "step": 261 }, { "Batch Mean": 0.17945003509521484, "accuracy": 0.8671875, "epoch": 0.6525, "step": 261 }, { "epoch": 0.655, "grad_norm": 7.0014519691467285, "learning_rate": 1.0894736842105264e-06, "loss": 0.3369, "step": 262 }, { "Batch Mean": -0.10725253820419312, "accuracy": 0.8515625, "epoch": 0.655, "step": 262 }, { "epoch": 0.6575, "grad_norm": 6.243435382843018, "learning_rate": 1.0815789473684211e-06, "loss": 0.3248, "step": 263 }, { "Batch Mean": 0.28885936737060547, "accuracy": 0.8359375, "epoch": 0.6575, "step": 263 }, { "epoch": 0.66, "grad_norm": 8.345162391662598, "learning_rate": 1.0736842105263159e-06, "loss": 0.3149, "step": 264 }, { "Batch Mean": 0.3201725482940674, "accuracy": 0.8515625, "epoch": 0.66, "step": 264 }, { "epoch": 0.6625, "grad_norm": 8.161121368408203, "learning_rate": 1.0657894736842106e-06, "loss": 0.3174, "step": 265 }, { "Batch Mean": 0.2368483543395996, "accuracy": 0.8671875, "epoch": 0.6625, "step": 265 }, { "epoch": 0.665, "grad_norm": 7.132945537567139, "learning_rate": 1.0578947368421052e-06, "loss": 0.3138, "step": 266 }, { "Batch Mean": 0.5943679809570312, "accuracy": 0.8984375, "epoch": 0.665, "step": 266 }, { "epoch": 0.6675, "grad_norm": 10.465624809265137, "learning_rate": 1.05e-06, "loss": 0.2962, "step": 267 }, { "Batch Mean": 0.4077954888343811, "accuracy": 0.890625, "epoch": 0.6675, "step": 267 }, { "epoch": 0.67, "grad_norm": 8.254660606384277, "learning_rate": 1.0421052631578949e-06, "loss": 0.2817, "step": 268 }, { "Batch Mean": -0.07658010721206665, "accuracy": 0.8203125, "epoch": 0.67, "step": 268 }, { "epoch": 0.6725, "grad_norm": 7.049543380737305, "learning_rate": 1.0342105263157896e-06, "loss": 0.3661, "step": 269 }, { "Batch Mean": -0.16821885108947754, "accuracy": 0.8203125, "epoch": 0.6725, "step": 269 }, { "epoch": 0.675, "grad_norm": 6.973754405975342, "learning_rate": 1.0263157894736843e-06, "loss": 0.3486, "step": 270 }, { "Batch Mean": -0.5536280870437622, "accuracy": 0.828125, "epoch": 0.675, "step": 270 }, { "epoch": 0.6775, "grad_norm": 10.284018516540527, "learning_rate": 1.0184210526315789e-06, "loss": 0.3327, "step": 271 }, { "Batch Mean": -0.12136930227279663, "accuracy": 0.859375, "epoch": 0.6775, "step": 271 }, { "epoch": 0.68, "grad_norm": 7.349569797515869, "learning_rate": 1.0105263157894736e-06, "loss": 0.3365, "step": 272 }, { "Batch Mean": -0.21032047271728516, "accuracy": 0.859375, "epoch": 0.68, "step": 272 }, { "epoch": 0.6825, "grad_norm": 8.34876823425293, "learning_rate": 1.0026315789473684e-06, "loss": 0.3146, "step": 273 }, { "Batch Mean": -0.41312098503112793, "accuracy": 0.8515625, "epoch": 0.6825, "step": 273 }, { "epoch": 0.685, "grad_norm": 8.684778213500977, "learning_rate": 9.947368421052631e-07, "loss": 0.3135, "step": 274 }, { "Batch Mean": -0.10464096069335938, "accuracy": 0.859375, "epoch": 0.685, "step": 274 }, { "epoch": 0.6875, "grad_norm": 6.903010845184326, "learning_rate": 9.86842105263158e-07, "loss": 0.3719, "step": 275 }, { "Batch Mean": 0.23662281036376953, "accuracy": 0.8515625, "epoch": 0.6875, "step": 275 }, { "epoch": 0.69, "grad_norm": 7.497840881347656, "learning_rate": 9.789473684210526e-07, "loss": 0.3371, "step": 276 }, { "Batch Mean": 0.13962489366531372, "accuracy": 0.8671875, "epoch": 0.69, "step": 276 }, { "epoch": 0.6925, "grad_norm": 6.886028289794922, "learning_rate": 9.710526315789474e-07, "loss": 0.3285, "step": 277 }, { "Batch Mean": 0.03725790977478027, "accuracy": 0.8671875, "epoch": 0.6925, "step": 277 }, { "epoch": 0.695, "grad_norm": 6.258803844451904, "learning_rate": 9.63157894736842e-07, "loss": 0.2595, "step": 278 }, { "Batch Mean": 0.39067840576171875, "accuracy": 0.8203125, "epoch": 0.695, "step": 278 }, { "epoch": 0.6975, "grad_norm": 8.779505729675293, "learning_rate": 9.552631578947368e-07, "loss": 0.3781, "step": 279 }, { "Batch Mean": 0.06871318817138672, "accuracy": 0.828125, "epoch": 0.6975, "step": 279 }, { "epoch": 0.7, "grad_norm": 7.97261905670166, "learning_rate": 9.473684210526316e-07, "loss": 0.3928, "step": 280 }, { "Batch Mean": 0.23102843761444092, "accuracy": 0.8515625, "epoch": 0.7, "step": 280 }, { "epoch": 0.7025, "grad_norm": 7.682852745056152, "learning_rate": 9.394736842105262e-07, "loss": 0.3187, "step": 281 }, { "Batch Mean": 0.061847686767578125, "accuracy": 0.8984375, "epoch": 0.7025, "step": 281 }, { "epoch": 0.705, "grad_norm": 6.796916961669922, "learning_rate": 9.315789473684212e-07, "loss": 0.3183, "step": 282 }, { "Batch Mean": -0.49417901039123535, "accuracy": 0.828125, "epoch": 0.705, "step": 282 }, { "epoch": 0.7075, "grad_norm": 11.345503807067871, "learning_rate": 9.236842105263158e-07, "loss": 0.3701, "step": 283 }, { "Batch Mean": -0.13564801216125488, "accuracy": 0.875, "epoch": 0.7075, "step": 283 }, { "epoch": 0.71, "grad_norm": 6.1998515129089355, "learning_rate": 9.157894736842106e-07, "loss": 0.2908, "step": 284 }, { "Batch Mean": -0.20754623413085938, "accuracy": 0.8046875, "epoch": 0.71, "step": 284 }, { "epoch": 0.7125, "grad_norm": 9.002010345458984, "learning_rate": 9.078947368421053e-07, "loss": 0.4074, "step": 285 }, { "Batch Mean": -0.48828125, "accuracy": 0.84375, "epoch": 0.7125, "step": 285 }, { "epoch": 0.715, "grad_norm": 11.633318901062012, "learning_rate": 9e-07, "loss": 0.4178, "step": 286 }, { "Batch Mean": -0.30758142471313477, "accuracy": 0.8046875, "epoch": 0.715, "step": 286 }, { "epoch": 0.7175, "grad_norm": 8.946621894836426, "learning_rate": 8.921052631578947e-07, "loss": 0.4023, "step": 287 }, { "Batch Mean": -0.24362659454345703, "accuracy": 0.8359375, "epoch": 0.7175, "step": 287 }, { "epoch": 0.72, "grad_norm": 8.48685359954834, "learning_rate": 8.842105263157895e-07, "loss": 0.3542, "step": 288 }, { "Batch Mean": 0.4370979368686676, "accuracy": 0.796875, "epoch": 0.72, "step": 288 }, { "epoch": 0.7225, "grad_norm": 9.334733009338379, "learning_rate": 8.763157894736843e-07, "loss": 0.3882, "step": 289 }, { "Batch Mean": 0.4301643371582031, "accuracy": 0.78125, "epoch": 0.7225, "step": 289 }, { "epoch": 0.725, "grad_norm": 9.78183650970459, "learning_rate": 8.68421052631579e-07, "loss": 0.4386, "step": 290 }, { "Batch Mean": 0.3213467597961426, "accuracy": 0.84375, "epoch": 0.725, "step": 290 }, { "epoch": 0.7275, "grad_norm": 7.349422454833984, "learning_rate": 8.605263157894737e-07, "loss": 0.3187, "step": 291 }, { "Batch Mean": 0.03937339782714844, "accuracy": 0.8984375, "epoch": 0.7275, "step": 291 }, { "epoch": 0.73, "grad_norm": 5.609205722808838, "learning_rate": 8.526315789473684e-07, "loss": 0.2493, "step": 292 }, { "Batch Mean": 0.48126816749572754, "accuracy": 0.890625, "epoch": 0.73, "step": 292 }, { "epoch": 0.7325, "grad_norm": 10.584858894348145, "learning_rate": 8.447368421052632e-07, "loss": 0.3036, "step": 293 }, { "Batch Mean": -0.08370530605316162, "accuracy": 0.78125, "epoch": 0.7325, "step": 293 }, { "epoch": 0.735, "grad_norm": 7.3887739181518555, "learning_rate": 8.368421052631578e-07, "loss": 0.4238, "step": 294 }, { "Batch Mean": -0.32436466217041016, "accuracy": 0.8515625, "epoch": 0.735, "step": 294 }, { "epoch": 0.7375, "grad_norm": 8.470144271850586, "learning_rate": 8.289473684210528e-07, "loss": 0.3559, "step": 295 }, { "Batch Mean": -0.09412622451782227, "accuracy": 0.875, "epoch": 0.7375, "step": 295 }, { "epoch": 0.74, "grad_norm": 5.89725923538208, "learning_rate": 8.210526315789474e-07, "loss": 0.3067, "step": 296 }, { "Batch Mean": -0.07551002502441406, "accuracy": 0.8828125, "epoch": 0.74, "step": 296 }, { "epoch": 0.7425, "grad_norm": 6.473114967346191, "learning_rate": 8.131578947368422e-07, "loss": 0.3266, "step": 297 }, { "Batch Mean": -0.17006683349609375, "accuracy": 0.78125, "epoch": 0.7425, "step": 297 }, { "epoch": 0.745, "grad_norm": 8.315503120422363, "learning_rate": 8.052631578947369e-07, "loss": 0.4281, "step": 298 }, { "Batch Mean": 0.375512957572937, "accuracy": 0.890625, "epoch": 0.745, "step": 298 }, { "epoch": 0.7475, "grad_norm": 7.217694282531738, "learning_rate": 7.973684210526315e-07, "loss": 0.2905, "step": 299 }, { "Batch Mean": 0.051157474517822266, "accuracy": 0.828125, "epoch": 0.7475, "step": 299 }, { "epoch": 0.75, "grad_norm": 6.537591934204102, "learning_rate": 7.894736842105263e-07, "loss": 0.3391, "step": 300 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }