{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "Batch Mean": -2.9549560546875,
      "accuracy": 0.5078125,
      "epoch": 0,
      "step": 0
    },
    {
      "epoch": 0.0025,
      "grad_norm": 38.43654251098633,
      "learning_rate": 1.5000000000000002e-07,
      "loss": 0.8271,
      "step": 1
    },
    {
      "Batch Mean": -2.9178466796875,
      "accuracy": 0.46875,
      "epoch": 0.0025,
      "step": 1
    },
    {
      "epoch": 0.005,
      "grad_norm": 38.60389709472656,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 0.8459,
      "step": 2
    },
    {
      "Batch Mean": -2.95965576171875,
      "accuracy": 0.4453125,
      "epoch": 0.005,
      "step": 2
    },
    {
      "epoch": 0.0075,
      "grad_norm": 40.155521392822266,
      "learning_rate": 4.5e-07,
      "loss": 0.8471,
      "step": 3
    },
    {
      "Batch Mean": -2.93951416015625,
      "accuracy": 0.515625,
      "epoch": 0.0075,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 39.40812683105469,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.8381,
      "step": 4
    },
    {
      "Batch Mean": -2.9075927734375,
      "accuracy": 0.46875,
      "epoch": 0.01,
      "step": 4
    },
    {
      "epoch": 0.0125,
      "grad_norm": 38.975746154785156,
      "learning_rate": 7.5e-07,
      "loss": 0.8312,
      "step": 5
    },
    {
      "Batch Mean": -2.85748291015625,
      "accuracy": 0.3125,
      "epoch": 0.0125,
      "step": 5
    },
    {
      "epoch": 0.015,
      "grad_norm": 38.6825065612793,
      "learning_rate": 9e-07,
      "loss": 0.8466,
      "step": 6
    },
    {
      "Batch Mean": -2.689208984375,
      "accuracy": 0.390625,
      "epoch": 0.015,
      "step": 6
    },
    {
      "epoch": 0.0175,
      "grad_norm": 34.53704833984375,
      "learning_rate": 1.05e-06,
      "loss": 0.8319,
      "step": 7
    },
    {
      "Batch Mean": -2.65997314453125,
      "accuracy": 0.453125,
      "epoch": 0.0175,
      "step": 7
    },
    {
      "epoch": 0.02,
      "grad_norm": 31.157188415527344,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.8268,
      "step": 8
    },
    {
      "Batch Mean": -2.0892333984375,
      "accuracy": 0.4921875,
      "epoch": 0.02,
      "step": 8
    },
    {
      "epoch": 0.0225,
      "grad_norm": 24.8486270904541,
      "learning_rate": 1.35e-06,
      "loss": 0.7703,
      "step": 9
    },
    {
      "Batch Mean": -1.9425048828125,
      "accuracy": 0.4140625,
      "epoch": 0.0225,
      "step": 9
    },
    {
      "epoch": 0.025,
      "grad_norm": 25.407278060913086,
      "learning_rate": 1.5e-06,
      "loss": 0.7765,
      "step": 10
    },
    {
      "Batch Mean": -1.772979736328125,
      "accuracy": 0.34375,
      "epoch": 0.025,
      "step": 10
    },
    {
      "epoch": 0.0275,
      "grad_norm": 24.18885040283203,
      "learning_rate": 1.65e-06,
      "loss": 0.7768,
      "step": 11
    },
    {
      "Batch Mean": 0.017196819186210632,
      "accuracy": 0.5,
      "epoch": 0.0275,
      "step": 11
    },
    {
      "epoch": 0.03,
      "grad_norm": 8.657893180847168,
      "learning_rate": 1.8e-06,
      "loss": 0.7086,
      "step": 12
    },
    {
      "Batch Mean": 0.37944215536117554,
      "accuracy": 0.4765625,
      "epoch": 0.03,
      "step": 12
    },
    {
      "epoch": 0.0325,
      "grad_norm": 13.965994834899902,
      "learning_rate": 1.95e-06,
      "loss": 0.6999,
      "step": 13
    },
    {
      "Batch Mean": 0.5208501815795898,
      "accuracy": 0.6171875,
      "epoch": 0.0325,
      "step": 13
    },
    {
      "epoch": 0.035,
      "grad_norm": 22.43783950805664,
      "learning_rate": 2.1e-06,
      "loss": 0.6639,
      "step": 14
    },
    {
      "Batch Mean": 0.7098770141601562,
      "accuracy": 0.5546875,
      "epoch": 0.035,
      "step": 14
    },
    {
      "epoch": 0.0375,
      "grad_norm": 12.728647232055664,
      "learning_rate": 2.25e-06,
      "loss": 0.6735,
      "step": 15
    },
    {
      "Batch Mean": 0.9353656768798828,
      "accuracy": 0.6953125,
      "epoch": 0.0375,
      "step": 15
    },
    {
      "epoch": 0.04,
      "grad_norm": 18.56553077697754,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.6592,
      "step": 16
    },
    {
      "Batch Mean": 1.4734210968017578,
      "accuracy": 0.640625,
      "epoch": 0.04,
      "step": 16
    },
    {
      "epoch": 0.0425,
      "grad_norm": 27.16295051574707,
      "learning_rate": 2.55e-06,
      "loss": 0.6713,
      "step": 17
    },
    {
      "Batch Mean": 1.8671340942382812,
      "accuracy": 0.6484375,
      "epoch": 0.0425,
      "step": 17
    },
    {
      "epoch": 0.045,
      "grad_norm": 38.08174514770508,
      "learning_rate": 2.7e-06,
      "loss": 0.6832,
      "step": 18
    },
    {
      "Batch Mean": 1.8309574127197266,
      "accuracy": 0.671875,
      "epoch": 0.045,
      "step": 18
    },
    {
      "epoch": 0.0475,
      "grad_norm": 43.57632827758789,
      "learning_rate": 2.85e-06,
      "loss": 0.6855,
      "step": 19
    },
    {
      "Batch Mean": 1.5924930572509766,
      "accuracy": 0.6015625,
      "epoch": 0.0475,
      "step": 19
    },
    {
      "epoch": 0.05,
      "grad_norm": 35.4539909362793,
      "learning_rate": 3e-06,
      "loss": 0.7016,
      "step": 20
    },
    {
      "Batch Mean": 1.2645306587219238,
      "accuracy": 0.7109375,
      "epoch": 0.05,
      "step": 20
    },
    {
      "epoch": 0.0525,
      "grad_norm": 33.7216796875,
      "learning_rate": 2.992105263157895e-06,
      "loss": 0.625,
      "step": 21
    },
    {
      "Batch Mean": 0.44832003116607666,
      "accuracy": 0.71875,
      "epoch": 0.0525,
      "step": 21
    },
    {
      "epoch": 0.055,
      "grad_norm": 22.181182861328125,
      "learning_rate": 2.9842105263157896e-06,
      "loss": 0.5738,
      "step": 22
    },
    {
      "Batch Mean": -0.5859236717224121,
      "accuracy": 0.6796875,
      "epoch": 0.055,
      "step": 22
    },
    {
      "epoch": 0.0575,
      "grad_norm": 19.15841293334961,
      "learning_rate": 2.9763157894736843e-06,
      "loss": 0.6047,
      "step": 23
    },
    {
      "Batch Mean": -1.2775471210479736,
      "accuracy": 0.7421875,
      "epoch": 0.0575,
      "step": 23
    },
    {
      "epoch": 0.06,
      "grad_norm": 46.0495491027832,
      "learning_rate": 2.968421052631579e-06,
      "loss": 0.5853,
      "step": 24
    },
    {
      "Batch Mean": -1.3207778930664062,
      "accuracy": 0.6953125,
      "epoch": 0.06,
      "step": 24
    },
    {
      "epoch": 0.0625,
      "grad_norm": 44.94231033325195,
      "learning_rate": 2.960526315789474e-06,
      "loss": 0.6105,
      "step": 25
    },
    {
      "Batch Mean": -1.3301990032196045,
      "accuracy": 0.75,
      "epoch": 0.0625,
      "step": 25
    },
    {
      "epoch": 0.065,
      "grad_norm": 45.24757766723633,
      "learning_rate": 2.9526315789473685e-06,
      "loss": 0.5729,
      "step": 26
    },
    {
      "Batch Mean": -0.9073872566223145,
      "accuracy": 0.7421875,
      "epoch": 0.065,
      "step": 26
    },
    {
      "epoch": 0.0675,
      "grad_norm": 30.45476531982422,
      "learning_rate": 2.9447368421052633e-06,
      "loss": 0.5707,
      "step": 27
    },
    {
      "Batch Mean": -0.3670217990875244,
      "accuracy": 0.7421875,
      "epoch": 0.0675,
      "step": 27
    },
    {
      "epoch": 0.07,
      "grad_norm": 13.637907981872559,
      "learning_rate": 2.936842105263158e-06,
      "loss": 0.5046,
      "step": 28
    },
    {
      "Batch Mean": 0.32126128673553467,
      "accuracy": 0.71875,
      "epoch": 0.07,
      "step": 28
    },
    {
      "epoch": 0.0725,
      "grad_norm": 11.712913513183594,
      "learning_rate": 2.9289473684210528e-06,
      "loss": 0.5753,
      "step": 29
    },
    {
      "Batch Mean": 0.5725572109222412,
      "accuracy": 0.65625,
      "epoch": 0.0725,
      "step": 29
    },
    {
      "epoch": 0.075,
      "grad_norm": 13.215094566345215,
      "learning_rate": 2.9210526315789475e-06,
      "loss": 0.6093,
      "step": 30
    },
    {
      "Batch Mean": 0.6064486503601074,
      "accuracy": 0.71875,
      "epoch": 0.075,
      "step": 30
    },
    {
      "epoch": 0.0775,
      "grad_norm": 17.638071060180664,
      "learning_rate": 2.9131578947368423e-06,
      "loss": 0.5394,
      "step": 31
    },
    {
      "Batch Mean": 0.4119257926940918,
      "accuracy": 0.796875,
      "epoch": 0.0775,
      "step": 31
    },
    {
      "epoch": 0.08,
      "grad_norm": 17.742324829101562,
      "learning_rate": 2.905263157894737e-06,
      "loss": 0.5049,
      "step": 32
    },
    {
      "Batch Mean": 0.31667208671569824,
      "accuracy": 0.8125,
      "epoch": 0.08,
      "step": 32
    },
    {
      "epoch": 0.0825,
      "grad_norm": 13.98446273803711,
      "learning_rate": 2.8973684210526318e-06,
      "loss": 0.466,
      "step": 33
    },
    {
      "Batch Mean": -0.4047205150127411,
      "accuracy": 0.78125,
      "epoch": 0.0825,
      "step": 33
    },
    {
      "epoch": 0.085,
      "grad_norm": 10.204410552978516,
      "learning_rate": 2.8894736842105265e-06,
      "loss": 0.4622,
      "step": 34
    },
    {
      "Batch Mean": -0.3730044364929199,
      "accuracy": 0.7890625,
      "epoch": 0.085,
      "step": 34
    },
    {
      "epoch": 0.0875,
      "grad_norm": 12.993864059448242,
      "learning_rate": 2.8815789473684213e-06,
      "loss": 0.4788,
      "step": 35
    },
    {
      "Batch Mean": -0.38590008020401,
      "accuracy": 0.7578125,
      "epoch": 0.0875,
      "step": 35
    },
    {
      "epoch": 0.09,
      "grad_norm": 10.724040031433105,
      "learning_rate": 2.873684210526316e-06,
      "loss": 0.4847,
      "step": 36
    },
    {
      "Batch Mean": -0.21977567672729492,
      "accuracy": 0.765625,
      "epoch": 0.09,
      "step": 36
    },
    {
      "epoch": 0.0925,
      "grad_norm": 12.729230880737305,
      "learning_rate": 2.8657894736842103e-06,
      "loss": 0.5396,
      "step": 37
    },
    {
      "Batch Mean": -0.044023871421813965,
      "accuracy": 0.7578125,
      "epoch": 0.0925,
      "step": 37
    },
    {
      "epoch": 0.095,
      "grad_norm": 12.698225975036621,
      "learning_rate": 2.857894736842105e-06,
      "loss": 0.5317,
      "step": 38
    },
    {
      "Batch Mean": 0.3004276752471924,
      "accuracy": 0.84375,
      "epoch": 0.095,
      "step": 38
    },
    {
      "epoch": 0.0975,
      "grad_norm": 15.1500244140625,
      "learning_rate": 2.85e-06,
      "loss": 0.3625,
      "step": 39
    },
    {
      "Batch Mean": 0.4614996314048767,
      "accuracy": 0.78125,
      "epoch": 0.0975,
      "step": 39
    },
    {
      "epoch": 0.1,
      "grad_norm": 14.328938484191895,
      "learning_rate": 2.8421052631578946e-06,
      "loss": 0.4741,
      "step": 40
    },
    {
      "Batch Mean": 0.219995379447937,
      "accuracy": 0.734375,
      "epoch": 0.1,
      "step": 40
    },
    {
      "epoch": 0.1025,
      "grad_norm": 16.527143478393555,
      "learning_rate": 2.8342105263157897e-06,
      "loss": 0.5388,
      "step": 41
    },
    {
      "Batch Mean": 0.24077653884887695,
      "accuracy": 0.71875,
      "epoch": 0.1025,
      "step": 41
    },
    {
      "epoch": 0.105,
      "grad_norm": 15.48572826385498,
      "learning_rate": 2.8263157894736845e-06,
      "loss": 0.5193,
      "step": 42
    },
    {
      "Batch Mean": -0.29242193698883057,
      "accuracy": 0.7421875,
      "epoch": 0.105,
      "step": 42
    },
    {
      "epoch": 0.1075,
      "grad_norm": 14.708074569702148,
      "learning_rate": 2.8184210526315792e-06,
      "loss": 0.5096,
      "step": 43
    },
    {
      "Batch Mean": -0.5579910278320312,
      "accuracy": 0.765625,
      "epoch": 0.1075,
      "step": 43
    },
    {
      "epoch": 0.11,
      "grad_norm": 18.473812103271484,
      "learning_rate": 2.810526315789474e-06,
      "loss": 0.5251,
      "step": 44
    },
    {
      "Batch Mean": -0.44131672382354736,
      "accuracy": 0.703125,
      "epoch": 0.11,
      "step": 44
    },
    {
      "epoch": 0.1125,
      "grad_norm": 24.496036529541016,
      "learning_rate": 2.8026315789473687e-06,
      "loss": 0.5768,
      "step": 45
    },
    {
      "Batch Mean": -0.2054896354675293,
      "accuracy": 0.8203125,
      "epoch": 0.1125,
      "step": 45
    },
    {
      "epoch": 0.115,
      "grad_norm": 16.322071075439453,
      "learning_rate": 2.7947368421052635e-06,
      "loss": 0.441,
      "step": 46
    },
    {
      "Batch Mean": 0.5062417984008789,
      "accuracy": 0.734375,
      "epoch": 0.115,
      "step": 46
    },
    {
      "epoch": 0.1175,
      "grad_norm": 14.525461196899414,
      "learning_rate": 2.7868421052631578e-06,
      "loss": 0.4942,
      "step": 47
    },
    {
      "Batch Mean": 0.2660309374332428,
      "accuracy": 0.8359375,
      "epoch": 0.1175,
      "step": 47
    },
    {
      "epoch": 0.12,
      "grad_norm": 9.259953498840332,
      "learning_rate": 2.7789473684210525e-06,
      "loss": 0.3931,
      "step": 48
    },
    {
      "Batch Mean": 0.34576767683029175,
      "accuracy": 0.7578125,
      "epoch": 0.12,
      "step": 48
    },
    {
      "epoch": 0.1225,
      "grad_norm": 11.19707202911377,
      "learning_rate": 2.7710526315789473e-06,
      "loss": 0.4824,
      "step": 49
    },
    {
      "Batch Mean": -0.02308782935142517,
      "accuracy": 0.796875,
      "epoch": 0.1225,
      "step": 49
    },
    {
      "epoch": 0.125,
      "grad_norm": 8.539365768432617,
      "learning_rate": 2.763157894736842e-06,
      "loss": 0.4374,
      "step": 50
    },
    {
      "Batch Mean": -0.11477279663085938,
      "accuracy": 0.796875,
      "epoch": 0.125,
      "step": 50
    },
    {
      "epoch": 0.1275,
      "grad_norm": 8.27216911315918,
      "learning_rate": 2.7552631578947368e-06,
      "loss": 0.4368,
      "step": 51
    },
    {
      "Batch Mean": -0.31789374351501465,
      "accuracy": 0.8203125,
      "epoch": 0.1275,
      "step": 51
    },
    {
      "epoch": 0.13,
      "grad_norm": 10.260517120361328,
      "learning_rate": 2.7473684210526315e-06,
      "loss": 0.4347,
      "step": 52
    },
    {
      "Batch Mean": 0.09443974494934082,
      "accuracy": 0.8125,
      "epoch": 0.13,
      "step": 52
    },
    {
      "epoch": 0.1325,
      "grad_norm": 7.1499223709106445,
      "learning_rate": 2.7394736842105263e-06,
      "loss": 0.4101,
      "step": 53
    },
    {
      "Batch Mean": 0.10455203056335449,
      "accuracy": 0.8046875,
      "epoch": 0.1325,
      "step": 53
    },
    {
      "epoch": 0.135,
      "grad_norm": 8.759523391723633,
      "learning_rate": 2.7315789473684214e-06,
      "loss": 0.4678,
      "step": 54
    },
    {
      "Batch Mean": 0.3895939588546753,
      "accuracy": 0.8046875,
      "epoch": 0.135,
      "step": 54
    },
    {
      "epoch": 0.1375,
      "grad_norm": 12.878321647644043,
      "learning_rate": 2.723684210526316e-06,
      "loss": 0.3717,
      "step": 55
    },
    {
      "Batch Mean": 0.4579277038574219,
      "accuracy": 0.7421875,
      "epoch": 0.1375,
      "step": 55
    },
    {
      "epoch": 0.14,
      "grad_norm": 11.548985481262207,
      "learning_rate": 2.715789473684211e-06,
      "loss": 0.5196,
      "step": 56
    },
    {
      "Batch Mean": 0.0871274471282959,
      "accuracy": 0.8203125,
      "epoch": 0.14,
      "step": 56
    },
    {
      "epoch": 0.1425,
      "grad_norm": 9.4905424118042,
      "learning_rate": 2.7078947368421052e-06,
      "loss": 0.4155,
      "step": 57
    },
    {
      "Batch Mean": -0.27567076683044434,
      "accuracy": 0.8125,
      "epoch": 0.1425,
      "step": 57
    },
    {
      "epoch": 0.145,
      "grad_norm": 9.461136817932129,
      "learning_rate": 2.7e-06,
      "loss": 0.4003,
      "step": 58
    },
    {
      "Batch Mean": -0.5103405714035034,
      "accuracy": 0.8046875,
      "epoch": 0.145,
      "step": 58
    },
    {
      "epoch": 0.1475,
      "grad_norm": 13.972933769226074,
      "learning_rate": 2.6921052631578947e-06,
      "loss": 0.4395,
      "step": 59
    },
    {
      "Batch Mean": -0.47735142707824707,
      "accuracy": 0.734375,
      "epoch": 0.1475,
      "step": 59
    },
    {
      "epoch": 0.15,
      "grad_norm": 12.991920471191406,
      "learning_rate": 2.6842105263157895e-06,
      "loss": 0.5417,
      "step": 60
    },
    {
      "Batch Mean": -0.2846529483795166,
      "accuracy": 0.78125,
      "epoch": 0.15,
      "step": 60
    },
    {
      "epoch": 0.1525,
      "grad_norm": 10.198124885559082,
      "learning_rate": 2.6763157894736842e-06,
      "loss": 0.4312,
      "step": 61
    },
    {
      "Batch Mean": 0.15892720222473145,
      "accuracy": 0.828125,
      "epoch": 0.1525,
      "step": 61
    },
    {
      "epoch": 0.155,
      "grad_norm": 8.384869575500488,
      "learning_rate": 2.668421052631579e-06,
      "loss": 0.3634,
      "step": 62
    },
    {
      "Batch Mean": 0.16252660751342773,
      "accuracy": 0.796875,
      "epoch": 0.155,
      "step": 62
    },
    {
      "epoch": 0.1575,
      "grad_norm": 9.487040519714355,
      "learning_rate": 2.6605263157894737e-06,
      "loss": 0.4331,
      "step": 63
    },
    {
      "Batch Mean": 0.39125096797943115,
      "accuracy": 0.84375,
      "epoch": 0.1575,
      "step": 63
    },
    {
      "epoch": 0.16,
      "grad_norm": 10.764354705810547,
      "learning_rate": 2.6526315789473685e-06,
      "loss": 0.3796,
      "step": 64
    },
    {
      "Batch Mean": 0.09885883331298828,
      "accuracy": 0.796875,
      "epoch": 0.16,
      "step": 64
    },
    {
      "epoch": 0.1625,
      "grad_norm": 8.251502990722656,
      "learning_rate": 2.644736842105263e-06,
      "loss": 0.4103,
      "step": 65
    },
    {
      "Batch Mean": -0.0615391731262207,
      "accuracy": 0.78125,
      "epoch": 0.1625,
      "step": 65
    },
    {
      "epoch": 0.165,
      "grad_norm": 7.180301189422607,
      "learning_rate": 2.636842105263158e-06,
      "loss": 0.4218,
      "step": 66
    },
    {
      "Batch Mean": -0.44274067878723145,
      "accuracy": 0.8359375,
      "epoch": 0.165,
      "step": 66
    },
    {
      "epoch": 0.1675,
      "grad_norm": 13.95067310333252,
      "learning_rate": 2.6289473684210527e-06,
      "loss": 0.4196,
      "step": 67
    },
    {
      "Batch Mean": -0.6870249509811401,
      "accuracy": 0.796875,
      "epoch": 0.1675,
      "step": 67
    },
    {
      "epoch": 0.17,
      "grad_norm": 15.820197105407715,
      "learning_rate": 2.6210526315789474e-06,
      "loss": 0.4135,
      "step": 68
    },
    {
      "Batch Mean": -0.7148702144622803,
      "accuracy": 0.7421875,
      "epoch": 0.17,
      "step": 68
    },
    {
      "epoch": 0.1725,
      "grad_norm": 18.115154266357422,
      "learning_rate": 2.613157894736842e-06,
      "loss": 0.5175,
      "step": 69
    },
    {
      "Batch Mean": -0.1326247751712799,
      "accuracy": 0.7734375,
      "epoch": 0.1725,
      "step": 69
    },
    {
      "epoch": 0.175,
      "grad_norm": 8.709199905395508,
      "learning_rate": 2.605263157894737e-06,
      "loss": 0.4396,
      "step": 70
    },
    {
      "Batch Mean": 0.15152764320373535,
      "accuracy": 0.828125,
      "epoch": 0.175,
      "step": 70
    },
    {
      "epoch": 0.1775,
      "grad_norm": 7.756810665130615,
      "learning_rate": 2.5973684210526317e-06,
      "loss": 0.4127,
      "step": 71
    },
    {
      "Batch Mean": 0.47759437561035156,
      "accuracy": 0.875,
      "epoch": 0.1775,
      "step": 71
    },
    {
      "epoch": 0.18,
      "grad_norm": 11.938055038452148,
      "learning_rate": 2.5894736842105264e-06,
      "loss": 0.3728,
      "step": 72
    },
    {
      "Batch Mean": 0.6294691562652588,
      "accuracy": 0.8828125,
      "epoch": 0.18,
      "step": 72
    },
    {
      "epoch": 0.1825,
      "grad_norm": 16.090402603149414,
      "learning_rate": 2.581578947368421e-06,
      "loss": 0.3428,
      "step": 73
    },
    {
      "Batch Mean": 0.6011961698532104,
      "accuracy": 0.8203125,
      "epoch": 0.1825,
      "step": 73
    },
    {
      "epoch": 0.185,
      "grad_norm": 13.527336120605469,
      "learning_rate": 2.573684210526316e-06,
      "loss": 0.5287,
      "step": 74
    },
    {
      "Batch Mean": 0.009085655212402344,
      "accuracy": 0.7890625,
      "epoch": 0.185,
      "step": 74
    },
    {
      "epoch": 0.1875,
      "grad_norm": 7.578335285186768,
      "learning_rate": 2.5657894736842107e-06,
      "loss": 0.4145,
      "step": 75
    },
    {
      "Batch Mean": -0.12276363372802734,
      "accuracy": 0.78125,
      "epoch": 0.1875,
      "step": 75
    },
    {
      "epoch": 0.19,
      "grad_norm": 7.629185676574707,
      "learning_rate": 2.5578947368421054e-06,
      "loss": 0.4716,
      "step": 76
    },
    {
      "Batch Mean": -0.34308671951293945,
      "accuracy": 0.8125,
      "epoch": 0.19,
      "step": 76
    },
    {
      "epoch": 0.1925,
      "grad_norm": 8.904648780822754,
      "learning_rate": 2.55e-06,
      "loss": 0.3838,
      "step": 77
    },
    {
      "Batch Mean": -0.28826236724853516,
      "accuracy": 0.78125,
      "epoch": 0.1925,
      "step": 77
    },
    {
      "epoch": 0.195,
      "grad_norm": 9.088787078857422,
      "learning_rate": 2.542105263157895e-06,
      "loss": 0.4531,
      "step": 78
    },
    {
      "Batch Mean": -0.19193828105926514,
      "accuracy": 0.859375,
      "epoch": 0.195,
      "step": 78
    },
    {
      "epoch": 0.1975,
      "grad_norm": 7.69492769241333,
      "learning_rate": 2.5342105263157892e-06,
      "loss": 0.3733,
      "step": 79
    },
    {
      "Batch Mean": 0.3690178394317627,
      "accuracy": 0.8203125,
      "epoch": 0.1975,
      "step": 79
    },
    {
      "epoch": 0.2,
      "grad_norm": 15.70446491241455,
      "learning_rate": 2.526315789473684e-06,
      "loss": 0.3847,
      "step": 80
    },
    {
      "Batch Mean": 0.456756591796875,
      "accuracy": 0.8359375,
      "epoch": 0.2,
      "step": 80
    },
    {
      "epoch": 0.2025,
      "grad_norm": 13.275654792785645,
      "learning_rate": 2.5184210526315787e-06,
      "loss": 0.4081,
      "step": 81
    },
    {
      "Batch Mean": 0.486045241355896,
      "accuracy": 0.7578125,
      "epoch": 0.2025,
      "step": 81
    },
    {
      "epoch": 0.205,
      "grad_norm": 14.960883140563965,
      "learning_rate": 2.510526315789474e-06,
      "loss": 0.45,
      "step": 82
    },
    {
      "Batch Mean": 0.007278919219970703,
      "accuracy": 0.7890625,
      "epoch": 0.205,
      "step": 82
    },
    {
      "epoch": 0.2075,
      "grad_norm": 9.315552711486816,
      "learning_rate": 2.5026315789473686e-06,
      "loss": 0.4603,
      "step": 83
    },
    {
      "Batch Mean": -0.3312312364578247,
      "accuracy": 0.7421875,
      "epoch": 0.2075,
      "step": 83
    },
    {
      "epoch": 0.21,
      "grad_norm": 18.19389533996582,
      "learning_rate": 2.4947368421052634e-06,
      "loss": 0.4748,
      "step": 84
    },
    {
      "Batch Mean": -0.023685932159423828,
      "accuracy": 0.8515625,
      "epoch": 0.21,
      "step": 84
    },
    {
      "epoch": 0.2125,
      "grad_norm": 7.646451473236084,
      "learning_rate": 2.486842105263158e-06,
      "loss": 0.3291,
      "step": 85
    },
    {
      "Batch Mean": -0.418212890625,
      "accuracy": 0.84375,
      "epoch": 0.2125,
      "step": 85
    },
    {
      "epoch": 0.215,
      "grad_norm": 11.397950172424316,
      "learning_rate": 2.478947368421053e-06,
      "loss": 0.3491,
      "step": 86
    },
    {
      "Batch Mean": -0.2105419635772705,
      "accuracy": 0.796875,
      "epoch": 0.215,
      "step": 86
    },
    {
      "epoch": 0.2175,
      "grad_norm": 16.71412467956543,
      "learning_rate": 2.4710526315789476e-06,
      "loss": 0.3686,
      "step": 87
    },
    {
      "Batch Mean": 0.18758773803710938,
      "accuracy": 0.78125,
      "epoch": 0.2175,
      "step": 87
    },
    {
      "epoch": 0.22,
      "grad_norm": 15.67270278930664,
      "learning_rate": 2.4631578947368424e-06,
      "loss": 0.4844,
      "step": 88
    },
    {
      "Batch Mean": 0.20205163955688477,
      "accuracy": 0.78125,
      "epoch": 0.22,
      "step": 88
    },
    {
      "epoch": 0.2225,
      "grad_norm": 12.556285858154297,
      "learning_rate": 2.4552631578947367e-06,
      "loss": 0.4631,
      "step": 89
    },
    {
      "Batch Mean": 0.2241612672805786,
      "accuracy": 0.84375,
      "epoch": 0.2225,
      "step": 89
    },
    {
      "epoch": 0.225,
      "grad_norm": 13.160058975219727,
      "learning_rate": 2.4473684210526314e-06,
      "loss": 0.3143,
      "step": 90
    },
    {
      "Batch Mean": 0.5450854301452637,
      "accuracy": 0.8203125,
      "epoch": 0.225,
      "step": 90
    },
    {
      "epoch": 0.2275,
      "grad_norm": 19.004138946533203,
      "learning_rate": 2.439473684210526e-06,
      "loss": 0.3814,
      "step": 91
    },
    {
      "Batch Mean": 0.2415924072265625,
      "accuracy": 0.78125,
      "epoch": 0.2275,
      "step": 91
    },
    {
      "epoch": 0.23,
      "grad_norm": 13.732966423034668,
      "learning_rate": 2.431578947368421e-06,
      "loss": 0.3925,
      "step": 92
    },
    {
      "Batch Mean": -0.015282154083251953,
      "accuracy": 0.8203125,
      "epoch": 0.23,
      "step": 92
    },
    {
      "epoch": 0.2325,
      "grad_norm": 11.732492446899414,
      "learning_rate": 2.4236842105263157e-06,
      "loss": 0.3839,
      "step": 93
    },
    {
      "Batch Mean": -0.3396167755126953,
      "accuracy": 0.8203125,
      "epoch": 0.2325,
      "step": 93
    },
    {
      "epoch": 0.235,
      "grad_norm": 15.799851417541504,
      "learning_rate": 2.4157894736842104e-06,
      "loss": 0.3893,
      "step": 94
    },
    {
      "Batch Mean": -0.3651285171508789,
      "accuracy": 0.8203125,
      "epoch": 0.235,
      "step": 94
    },
    {
      "epoch": 0.2375,
      "grad_norm": 12.945575714111328,
      "learning_rate": 2.4078947368421056e-06,
      "loss": 0.3942,
      "step": 95
    },
    {
      "Batch Mean": -0.24241328239440918,
      "accuracy": 0.828125,
      "epoch": 0.2375,
      "step": 95
    },
    {
      "epoch": 0.24,
      "grad_norm": 11.39241886138916,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.3606,
      "step": 96
    },
    {
      "Batch Mean": -0.10072767734527588,
      "accuracy": 0.8203125,
      "epoch": 0.24,
      "step": 96
    },
    {
      "epoch": 0.2425,
      "grad_norm": 9.7492094039917,
      "learning_rate": 2.392105263157895e-06,
      "loss": 0.3818,
      "step": 97
    },
    {
      "Batch Mean": 0.03002488613128662,
      "accuracy": 0.8515625,
      "epoch": 0.2425,
      "step": 97
    },
    {
      "epoch": 0.245,
      "grad_norm": 6.9692864418029785,
      "learning_rate": 2.38421052631579e-06,
      "loss": 0.3336,
      "step": 98
    },
    {
      "Batch Mean": 0.3361194133758545,
      "accuracy": 0.8046875,
      "epoch": 0.245,
      "step": 98
    },
    {
      "epoch": 0.2475,
      "grad_norm": 13.135473251342773,
      "learning_rate": 2.376315789473684e-06,
      "loss": 0.3912,
      "step": 99
    },
    {
      "Batch Mean": 0.4362337589263916,
      "accuracy": 0.796875,
      "epoch": 0.2475,
      "step": 99
    },
    {
      "epoch": 0.25,
      "grad_norm": 12.29136848449707,
      "learning_rate": 2.368421052631579e-06,
      "loss": 0.4137,
      "step": 100
    },
    {
      "Batch Mean": 0.4257016181945801,
      "accuracy": 0.875,
      "epoch": 0.25,
      "step": 100
    },
    {
      "epoch": 0.2525,
      "grad_norm": 13.732154846191406,
      "learning_rate": 2.3605263157894736e-06,
      "loss": 0.3721,
      "step": 101
    },
    {
      "Batch Mean": 0.06758689880371094,
      "accuracy": 0.8125,
      "epoch": 0.2525,
      "step": 101
    },
    {
      "epoch": 0.255,
      "grad_norm": 9.70384693145752,
      "learning_rate": 2.3526315789473684e-06,
      "loss": 0.4345,
      "step": 102
    },
    {
      "Batch Mean": -0.4097929000854492,
      "accuracy": 0.8125,
      "epoch": 0.255,
      "step": 102
    },
    {
      "epoch": 0.2575,
      "grad_norm": 12.665343284606934,
      "learning_rate": 2.344736842105263e-06,
      "loss": 0.3869,
      "step": 103
    },
    {
      "Batch Mean": -0.8724770545959473,
      "accuracy": 0.8203125,
      "epoch": 0.2575,
      "step": 103
    },
    {
      "epoch": 0.26,
      "grad_norm": 20.12427520751953,
      "learning_rate": 2.336842105263158e-06,
      "loss": 0.4037,
      "step": 104
    },
    {
      "Batch Mean": -0.4974844455718994,
      "accuracy": 0.7890625,
      "epoch": 0.26,
      "step": 104
    },
    {
      "epoch": 0.2625,
      "grad_norm": 17.464170455932617,
      "learning_rate": 2.3289473684210526e-06,
      "loss": 0.4492,
      "step": 105
    },
    {
      "Batch Mean": -0.35894912481307983,
      "accuracy": 0.828125,
      "epoch": 0.2625,
      "step": 105
    },
    {
      "epoch": 0.265,
      "grad_norm": 10.907158851623535,
      "learning_rate": 2.3210526315789473e-06,
      "loss": 0.3291,
      "step": 106
    },
    {
      "Batch Mean": 0.3651762008666992,
      "accuracy": 0.8125,
      "epoch": 0.265,
      "step": 106
    },
    {
      "epoch": 0.2675,
      "grad_norm": 11.427133560180664,
      "learning_rate": 2.313157894736842e-06,
      "loss": 0.4021,
      "step": 107
    },
    {
      "Batch Mean": 0.2751443386077881,
      "accuracy": 0.8203125,
      "epoch": 0.2675,
      "step": 107
    },
    {
      "epoch": 0.27,
      "grad_norm": 8.229965209960938,
      "learning_rate": 2.305263157894737e-06,
      "loss": 0.3729,
      "step": 108
    },
    {
      "Batch Mean": 0.9130334854125977,
      "accuracy": 0.859375,
      "epoch": 0.27,
      "step": 108
    },
    {
      "epoch": 0.2725,
      "grad_norm": 22.57686996459961,
      "learning_rate": 2.2973684210526316e-06,
      "loss": 0.3397,
      "step": 109
    },
    {
      "Batch Mean": 0.7281837463378906,
      "accuracy": 0.7734375,
      "epoch": 0.2725,
      "step": 109
    },
    {
      "epoch": 0.275,
      "grad_norm": 16.920743942260742,
      "learning_rate": 2.2894736842105263e-06,
      "loss": 0.4643,
      "step": 110
    },
    {
      "Batch Mean": 0.4563823342323303,
      "accuracy": 0.8125,
      "epoch": 0.275,
      "step": 110
    },
    {
      "epoch": 0.2775,
      "grad_norm": 15.203326225280762,
      "learning_rate": 2.281578947368421e-06,
      "loss": 0.3964,
      "step": 111
    },
    {
      "Batch Mean": -0.07430100440979004,
      "accuracy": 0.8203125,
      "epoch": 0.2775,
      "step": 111
    },
    {
      "epoch": 0.28,
      "grad_norm": 13.103719711303711,
      "learning_rate": 2.273684210526316e-06,
      "loss": 0.4149,
      "step": 112
    },
    {
      "Batch Mean": -0.5845143795013428,
      "accuracy": 0.8125,
      "epoch": 0.28,
      "step": 112
    },
    {
      "epoch": 0.2825,
      "grad_norm": 24.87859535217285,
      "learning_rate": 2.2657894736842106e-06,
      "loss": 0.4353,
      "step": 113
    },
    {
      "Batch Mean": -1.1507502794265747,
      "accuracy": 0.8671875,
      "epoch": 0.2825,
      "step": 113
    },
    {
      "epoch": 0.285,
      "grad_norm": 27.043203353881836,
      "learning_rate": 2.2578947368421053e-06,
      "loss": 0.3758,
      "step": 114
    },
    {
      "Batch Mean": -1.2163114547729492,
      "accuracy": 0.8671875,
      "epoch": 0.285,
      "step": 114
    },
    {
      "epoch": 0.2875,
      "grad_norm": 24.969430923461914,
      "learning_rate": 2.25e-06,
      "loss": 0.341,
      "step": 115
    },
    {
      "Batch Mean": -0.6281991004943848,
      "accuracy": 0.828125,
      "epoch": 0.2875,
      "step": 115
    },
    {
      "epoch": 0.29,
      "grad_norm": 14.463933944702148,
      "learning_rate": 2.242105263157895e-06,
      "loss": 0.4114,
      "step": 116
    },
    {
      "Batch Mean": 0.09254121780395508,
      "accuracy": 0.84375,
      "epoch": 0.29,
      "step": 116
    },
    {
      "epoch": 0.2925,
      "grad_norm": 7.876263618469238,
      "learning_rate": 2.2342105263157895e-06,
      "loss": 0.36,
      "step": 117
    },
    {
      "Batch Mean": 0.4857252240180969,
      "accuracy": 0.7890625,
      "epoch": 0.2925,
      "step": 117
    },
    {
      "epoch": 0.295,
      "grad_norm": 10.79910659790039,
      "learning_rate": 2.2263157894736843e-06,
      "loss": 0.376,
      "step": 118
    },
    {
      "Batch Mean": 0.5527479648590088,
      "accuracy": 0.84375,
      "epoch": 0.295,
      "step": 118
    },
    {
      "epoch": 0.2975,
      "grad_norm": 14.145975112915039,
      "learning_rate": 2.218421052631579e-06,
      "loss": 0.3598,
      "step": 119
    },
    {
      "Batch Mean": 0.33617615699768066,
      "accuracy": 0.859375,
      "epoch": 0.2975,
      "step": 119
    },
    {
      "epoch": 0.3,
      "grad_norm": 9.850839614868164,
      "learning_rate": 2.2105263157894738e-06,
      "loss": 0.3558,
      "step": 120
    },
    {
      "Batch Mean": 0.37594175338745117,
      "accuracy": 0.765625,
      "epoch": 0.3,
      "step": 120
    },
    {
      "epoch": 0.3025,
      "grad_norm": 10.884904861450195,
      "learning_rate": 2.2026315789473685e-06,
      "loss": 0.4847,
      "step": 121
    },
    {
      "Batch Mean": -0.30231380462646484,
      "accuracy": 0.8203125,
      "epoch": 0.3025,
      "step": 121
    },
    {
      "epoch": 0.305,
      "grad_norm": 8.063970565795898,
      "learning_rate": 2.1947368421052633e-06,
      "loss": 0.3881,
      "step": 122
    },
    {
      "Batch Mean": -0.4219226837158203,
      "accuracy": 0.8046875,
      "epoch": 0.305,
      "step": 122
    },
    {
      "epoch": 0.3075,
      "grad_norm": 11.049520492553711,
      "learning_rate": 2.186842105263158e-06,
      "loss": 0.3564,
      "step": 123
    },
    {
      "Batch Mean": -0.015915870666503906,
      "accuracy": 0.796875,
      "epoch": 0.3075,
      "step": 123
    },
    {
      "epoch": 0.31,
      "grad_norm": 7.677197456359863,
      "learning_rate": 2.1789473684210528e-06,
      "loss": 0.3923,
      "step": 124
    },
    {
      "Batch Mean": 0.25323057174682617,
      "accuracy": 0.859375,
      "epoch": 0.31,
      "step": 124
    },
    {
      "epoch": 0.3125,
      "grad_norm": 8.646540641784668,
      "learning_rate": 2.1710526315789475e-06,
      "loss": 0.4231,
      "step": 125
    },
    {
      "Batch Mean": -0.45605993270874023,
      "accuracy": 0.8359375,
      "epoch": 0.3125,
      "step": 125
    },
    {
      "epoch": 0.315,
      "grad_norm": 10.95660400390625,
      "learning_rate": 2.1631578947368423e-06,
      "loss": 0.3905,
      "step": 126
    },
    {
      "Batch Mean": -0.09001016616821289,
      "accuracy": 0.859375,
      "epoch": 0.315,
      "step": 126
    },
    {
      "epoch": 0.3175,
      "grad_norm": 6.532036304473877,
      "learning_rate": 2.155263157894737e-06,
      "loss": 0.3104,
      "step": 127
    },
    {
      "Batch Mean": 0.001552581787109375,
      "accuracy": 0.78125,
      "epoch": 0.3175,
      "step": 127
    },
    {
      "epoch": 0.32,
      "grad_norm": 7.6803178787231445,
      "learning_rate": 2.1473684210526317e-06,
      "loss": 0.4176,
      "step": 128
    },
    {
      "Batch Mean": 0.24973249435424805,
      "accuracy": 0.84375,
      "epoch": 0.32,
      "step": 128
    },
    {
      "epoch": 0.3225,
      "grad_norm": 7.414150714874268,
      "learning_rate": 2.1394736842105265e-06,
      "loss": 0.3664,
      "step": 129
    },
    {
      "Batch Mean": -0.17685949802398682,
      "accuracy": 0.859375,
      "epoch": 0.3225,
      "step": 129
    },
    {
      "epoch": 0.325,
      "grad_norm": 8.062408447265625,
      "learning_rate": 2.1315789473684212e-06,
      "loss": 0.3671,
      "step": 130
    },
    {
      "Batch Mean": 0.3163696527481079,
      "accuracy": 0.8125,
      "epoch": 0.325,
      "step": 130
    },
    {
      "epoch": 0.3275,
      "grad_norm": 8.69417953491211,
      "learning_rate": 2.123684210526316e-06,
      "loss": 0.3946,
      "step": 131
    },
    {
      "Batch Mean": 0.13493728637695312,
      "accuracy": 0.8359375,
      "epoch": 0.3275,
      "step": 131
    },
    {
      "epoch": 0.33,
      "grad_norm": 7.916721343994141,
      "learning_rate": 2.1157894736842103e-06,
      "loss": 0.4373,
      "step": 132
    },
    {
      "Batch Mean": 0.032731056213378906,
      "accuracy": 0.8046875,
      "epoch": 0.33,
      "step": 132
    },
    {
      "epoch": 0.3325,
      "grad_norm": 6.909370422363281,
      "learning_rate": 2.107894736842105e-06,
      "loss": 0.3488,
      "step": 133
    },
    {
      "Batch Mean": -0.6435856819152832,
      "accuracy": 0.84375,
      "epoch": 0.3325,
      "step": 133
    },
    {
      "epoch": 0.335,
      "grad_norm": 13.454378128051758,
      "learning_rate": 2.1e-06,
      "loss": 0.4012,
      "step": 134
    },
    {
      "Batch Mean": -0.04024988412857056,
      "accuracy": 0.8125,
      "epoch": 0.335,
      "step": 134
    },
    {
      "epoch": 0.3375,
      "grad_norm": 6.9208502769470215,
      "learning_rate": 2.0921052631578945e-06,
      "loss": 0.4132,
      "step": 135
    },
    {
      "Batch Mean": 0.10650634765625,
      "accuracy": 0.7890625,
      "epoch": 0.3375,
      "step": 135
    },
    {
      "epoch": 0.34,
      "grad_norm": 7.637556552886963,
      "learning_rate": 2.0842105263157897e-06,
      "loss": 0.4497,
      "step": 136
    },
    {
      "Batch Mean": -0.2209153175354004,
      "accuracy": 0.7734375,
      "epoch": 0.34,
      "step": 136
    },
    {
      "epoch": 0.3425,
      "grad_norm": 8.982176780700684,
      "learning_rate": 2.0763157894736845e-06,
      "loss": 0.4867,
      "step": 137
    },
    {
      "Batch Mean": 0.0443209707736969,
      "accuracy": 0.8359375,
      "epoch": 0.3425,
      "step": 137
    },
    {
      "epoch": 0.345,
      "grad_norm": 6.714746475219727,
      "learning_rate": 2.068421052631579e-06,
      "loss": 0.3642,
      "step": 138
    },
    {
      "Batch Mean": 0.03779444098472595,
      "accuracy": 0.859375,
      "epoch": 0.345,
      "step": 138
    },
    {
      "epoch": 0.3475,
      "grad_norm": 6.019307613372803,
      "learning_rate": 2.060526315789474e-06,
      "loss": 0.3308,
      "step": 139
    },
    {
      "Batch Mean": 0.2002730369567871,
      "accuracy": 0.8046875,
      "epoch": 0.3475,
      "step": 139
    },
    {
      "epoch": 0.35,
      "grad_norm": 7.735702037811279,
      "learning_rate": 2.0526315789473687e-06,
      "loss": 0.4197,
      "step": 140
    },
    {
      "Batch Mean": 0.16213250160217285,
      "accuracy": 0.828125,
      "epoch": 0.35,
      "step": 140
    },
    {
      "epoch": 0.3525,
      "grad_norm": 7.075012683868408,
      "learning_rate": 2.0447368421052634e-06,
      "loss": 0.3772,
      "step": 141
    },
    {
      "Batch Mean": -0.05973696708679199,
      "accuracy": 0.859375,
      "epoch": 0.3525,
      "step": 141
    },
    {
      "epoch": 0.355,
      "grad_norm": 6.793128967285156,
      "learning_rate": 2.0368421052631578e-06,
      "loss": 0.3782,
      "step": 142
    },
    {
      "Batch Mean": -0.2323307991027832,
      "accuracy": 0.84375,
      "epoch": 0.355,
      "step": 142
    },
    {
      "epoch": 0.3575,
      "grad_norm": 8.474074363708496,
      "learning_rate": 2.0289473684210525e-06,
      "loss": 0.412,
      "step": 143
    },
    {
      "Batch Mean": -0.594693660736084,
      "accuracy": 0.8359375,
      "epoch": 0.3575,
      "step": 143
    },
    {
      "epoch": 0.36,
      "grad_norm": 11.406280517578125,
      "learning_rate": 2.0210526315789473e-06,
      "loss": 0.3999,
      "step": 144
    },
    {
      "Batch Mean": -0.34098196029663086,
      "accuracy": 0.7890625,
      "epoch": 0.36,
      "step": 144
    },
    {
      "epoch": 0.3625,
      "grad_norm": 8.726512908935547,
      "learning_rate": 2.013157894736842e-06,
      "loss": 0.4033,
      "step": 145
    },
    {
      "Batch Mean": 0.14231419563293457,
      "accuracy": 0.8359375,
      "epoch": 0.3625,
      "step": 145
    },
    {
      "epoch": 0.365,
      "grad_norm": 6.767786979675293,
      "learning_rate": 2.0052631578947367e-06,
      "loss": 0.3359,
      "step": 146
    },
    {
      "Batch Mean": 0.4020230770111084,
      "accuracy": 0.8203125,
      "epoch": 0.365,
      "step": 146
    },
    {
      "epoch": 0.3675,
      "grad_norm": 9.113909721374512,
      "learning_rate": 1.9973684210526315e-06,
      "loss": 0.4163,
      "step": 147
    },
    {
      "Batch Mean": 0.46575236320495605,
      "accuracy": 0.8046875,
      "epoch": 0.3675,
      "step": 147
    },
    {
      "epoch": 0.37,
      "grad_norm": 10.250436782836914,
      "learning_rate": 1.9894736842105262e-06,
      "loss": 0.4773,
      "step": 148
    },
    {
      "Batch Mean": 0.1646571159362793,
      "accuracy": 0.8125,
      "epoch": 0.37,
      "step": 148
    },
    {
      "epoch": 0.3725,
      "grad_norm": 8.450485229492188,
      "learning_rate": 1.9815789473684214e-06,
      "loss": 0.354,
      "step": 149
    },
    {
      "Batch Mean": 0.20849394798278809,
      "accuracy": 0.890625,
      "epoch": 0.3725,
      "step": 149
    },
    {
      "epoch": 0.375,
      "grad_norm": 8.217352867126465,
      "learning_rate": 1.973684210526316e-06,
      "loss": 0.3627,
      "step": 150
    },
    {
      "Batch Mean": -0.42828369140625,
      "accuracy": 0.8125,
      "epoch": 0.375,
      "step": 150
    },
    {
      "epoch": 0.3775,
      "grad_norm": 12.038201332092285,
      "learning_rate": 1.965789473684211e-06,
      "loss": 0.3728,
      "step": 151
    },
    {
      "Batch Mean": -0.11544227600097656,
      "accuracy": 0.78125,
      "epoch": 0.3775,
      "step": 151
    },
    {
      "epoch": 0.38,
      "grad_norm": 7.889610290527344,
      "learning_rate": 1.9578947368421052e-06,
      "loss": 0.3893,
      "step": 152
    },
    {
      "Batch Mean": -0.21321868896484375,
      "accuracy": 0.8046875,
      "epoch": 0.38,
      "step": 152
    },
    {
      "epoch": 0.3825,
      "grad_norm": 8.342473030090332,
      "learning_rate": 1.95e-06,
      "loss": 0.4122,
      "step": 153
    },
    {
      "Batch Mean": -0.15835070610046387,
      "accuracy": 0.8046875,
      "epoch": 0.3825,
      "step": 153
    },
    {
      "epoch": 0.385,
      "grad_norm": 7.926384449005127,
      "learning_rate": 1.9421052631578947e-06,
      "loss": 0.3922,
      "step": 154
    },
    {
      "Batch Mean": 0.010342597961425781,
      "accuracy": 0.8046875,
      "epoch": 0.385,
      "step": 154
    },
    {
      "epoch": 0.3875,
      "grad_norm": 8.17468547821045,
      "learning_rate": 1.9342105263157895e-06,
      "loss": 0.4333,
      "step": 155
    },
    {
      "Batch Mean": 0.3719363212585449,
      "accuracy": 0.8046875,
      "epoch": 0.3875,
      "step": 155
    },
    {
      "epoch": 0.39,
      "grad_norm": 10.360675811767578,
      "learning_rate": 1.926315789473684e-06,
      "loss": 0.4128,
      "step": 156
    },
    {
      "Batch Mean": 0.12261229753494263,
      "accuracy": 0.78125,
      "epoch": 0.39,
      "step": 156
    },
    {
      "epoch": 0.3925,
      "grad_norm": 9.016860961914062,
      "learning_rate": 1.918421052631579e-06,
      "loss": 0.443,
      "step": 157
    },
    {
      "Batch Mean": -0.06574655324220657,
      "accuracy": 0.84375,
      "epoch": 0.3925,
      "step": 157
    },
    {
      "epoch": 0.395,
      "grad_norm": 6.0845255851745605,
      "learning_rate": 1.9105263157894737e-06,
      "loss": 0.3329,
      "step": 158
    },
    {
      "Batch Mean": -0.33632707595825195,
      "accuracy": 0.796875,
      "epoch": 0.395,
      "step": 158
    },
    {
      "epoch": 0.3975,
      "grad_norm": 9.574487686157227,
      "learning_rate": 1.9026315789473684e-06,
      "loss": 0.4283,
      "step": 159
    },
    {
      "Batch Mean": 0.016783952713012695,
      "accuracy": 0.84375,
      "epoch": 0.3975,
      "step": 159
    },
    {
      "epoch": 0.4,
      "grad_norm": 6.5984787940979,
      "learning_rate": 1.8947368421052632e-06,
      "loss": 0.3412,
      "step": 160
    },
    {
      "Batch Mean": 0.2222914844751358,
      "accuracy": 0.84375,
      "epoch": 0.4,
      "step": 160
    },
    {
      "epoch": 0.4025,
      "grad_norm": 7.398411273956299,
      "learning_rate": 1.8868421052631577e-06,
      "loss": 0.4231,
      "step": 161
    },
    {
      "Batch Mean": 0.136377215385437,
      "accuracy": 0.84375,
      "epoch": 0.4025,
      "step": 161
    },
    {
      "epoch": 0.405,
      "grad_norm": 8.897592544555664,
      "learning_rate": 1.8789473684210525e-06,
      "loss": 0.393,
      "step": 162
    },
    {
      "Batch Mean": 0.03430792689323425,
      "accuracy": 0.8125,
      "epoch": 0.405,
      "step": 162
    },
    {
      "epoch": 0.4075,
      "grad_norm": 6.535599231719971,
      "learning_rate": 1.8710526315789476e-06,
      "loss": 0.3837,
      "step": 163
    },
    {
      "Batch Mean": -0.09270691871643066,
      "accuracy": 0.8515625,
      "epoch": 0.4075,
      "step": 163
    },
    {
      "epoch": 0.41,
      "grad_norm": 6.2713823318481445,
      "learning_rate": 1.8631578947368424e-06,
      "loss": 0.3364,
      "step": 164
    },
    {
      "Batch Mean": -0.07146286964416504,
      "accuracy": 0.8203125,
      "epoch": 0.41,
      "step": 164
    },
    {
      "epoch": 0.4125,
      "grad_norm": 6.161220073699951,
      "learning_rate": 1.855263157894737e-06,
      "loss": 0.3677,
      "step": 165
    },
    {
      "Batch Mean": -0.20211690664291382,
      "accuracy": 0.78125,
      "epoch": 0.4125,
      "step": 165
    },
    {
      "epoch": 0.415,
      "grad_norm": 7.355227470397949,
      "learning_rate": 1.8473684210526317e-06,
      "loss": 0.4361,
      "step": 166
    },
    {
      "Batch Mean": -0.3163893222808838,
      "accuracy": 0.8203125,
      "epoch": 0.415,
      "step": 166
    },
    {
      "epoch": 0.4175,
      "grad_norm": 8.728869438171387,
      "learning_rate": 1.8394736842105264e-06,
      "loss": 0.3997,
      "step": 167
    },
    {
      "Batch Mean": 0.3005563020706177,
      "accuracy": 0.8515625,
      "epoch": 0.4175,
      "step": 167
    },
    {
      "epoch": 0.42,
      "grad_norm": 6.728272914886475,
      "learning_rate": 1.8315789473684211e-06,
      "loss": 0.329,
      "step": 168
    },
    {
      "Batch Mean": 0.1099938154220581,
      "accuracy": 0.828125,
      "epoch": 0.42,
      "step": 168
    },
    {
      "epoch": 0.4225,
      "grad_norm": 6.236492156982422,
      "learning_rate": 1.8236842105263159e-06,
      "loss": 0.3884,
      "step": 169
    },
    {
      "Batch Mean": 0.3710876703262329,
      "accuracy": 0.796875,
      "epoch": 0.4225,
      "step": 169
    },
    {
      "epoch": 0.425,
      "grad_norm": 8.478776931762695,
      "learning_rate": 1.8157894736842106e-06,
      "loss": 0.3932,
      "step": 170
    },
    {
      "Batch Mean": -0.22776031494140625,
      "accuracy": 0.8515625,
      "epoch": 0.425,
      "step": 170
    },
    {
      "epoch": 0.4275,
      "grad_norm": 7.591424942016602,
      "learning_rate": 1.8078947368421052e-06,
      "loss": 0.3521,
      "step": 171
    },
    {
      "Batch Mean": -0.3130757212638855,
      "accuracy": 0.890625,
      "epoch": 0.4275,
      "step": 171
    },
    {
      "epoch": 0.43,
      "grad_norm": 7.310946941375732,
      "learning_rate": 1.8e-06,
      "loss": 0.3295,
      "step": 172
    },
    {
      "Batch Mean": 0.1858811378479004,
      "accuracy": 0.890625,
      "epoch": 0.43,
      "step": 172
    },
    {
      "epoch": 0.4325,
      "grad_norm": 8.169261932373047,
      "learning_rate": 1.7921052631578947e-06,
      "loss": 0.313,
      "step": 173
    },
    {
      "Batch Mean": 0.2934098243713379,
      "accuracy": 0.84375,
      "epoch": 0.4325,
      "step": 173
    },
    {
      "epoch": 0.435,
      "grad_norm": 8.353775978088379,
      "learning_rate": 1.7842105263157894e-06,
      "loss": 0.3104,
      "step": 174
    },
    {
      "Batch Mean": 0.1525893211364746,
      "accuracy": 0.7890625,
      "epoch": 0.435,
      "step": 174
    },
    {
      "epoch": 0.4375,
      "grad_norm": 7.484781265258789,
      "learning_rate": 1.7763157894736842e-06,
      "loss": 0.4333,
      "step": 175
    },
    {
      "Batch Mean": 0.12224054336547852,
      "accuracy": 0.84375,
      "epoch": 0.4375,
      "step": 175
    },
    {
      "epoch": 0.44,
      "grad_norm": 7.87322473526001,
      "learning_rate": 1.768421052631579e-06,
      "loss": 0.3525,
      "step": 176
    },
    {
      "Batch Mean": -0.12131023406982422,
      "accuracy": 0.875,
      "epoch": 0.44,
      "step": 176
    },
    {
      "epoch": 0.4425,
      "grad_norm": 6.532097339630127,
      "learning_rate": 1.7605263157894739e-06,
      "loss": 0.3028,
      "step": 177
    },
    {
      "Batch Mean": -0.17700982093811035,
      "accuracy": 0.78125,
      "epoch": 0.4425,
      "step": 177
    },
    {
      "epoch": 0.445,
      "grad_norm": 10.696880340576172,
      "learning_rate": 1.7526315789473686e-06,
      "loss": 0.4192,
      "step": 178
    },
    {
      "Batch Mean": -0.20450687408447266,
      "accuracy": 0.859375,
      "epoch": 0.445,
      "step": 178
    },
    {
      "epoch": 0.4475,
      "grad_norm": 8.799153327941895,
      "learning_rate": 1.7447368421052633e-06,
      "loss": 0.3349,
      "step": 179
    },
    {
      "Batch Mean": 0.114410400390625,
      "accuracy": 0.828125,
      "epoch": 0.4475,
      "step": 179
    },
    {
      "epoch": 0.45,
      "grad_norm": 8.984070777893066,
      "learning_rate": 1.736842105263158e-06,
      "loss": 0.3446,
      "step": 180
    },
    {
      "Batch Mean": 0.1458113193511963,
      "accuracy": 0.8515625,
      "epoch": 0.45,
      "step": 180
    },
    {
      "epoch": 0.4525,
      "grad_norm": 9.16761302947998,
      "learning_rate": 1.7289473684210526e-06,
      "loss": 0.346,
      "step": 181
    },
    {
      "Batch Mean": -0.05868828296661377,
      "accuracy": 0.90625,
      "epoch": 0.4525,
      "step": 181
    },
    {
      "epoch": 0.455,
      "grad_norm": 6.830461025238037,
      "learning_rate": 1.7210526315789474e-06,
      "loss": 0.2781,
      "step": 182
    },
    {
      "Batch Mean": 0.0004514455795288086,
      "accuracy": 0.8515625,
      "epoch": 0.455,
      "step": 182
    },
    {
      "epoch": 0.4575,
      "grad_norm": 8.418164253234863,
      "learning_rate": 1.7131578947368421e-06,
      "loss": 0.3541,
      "step": 183
    },
    {
      "Batch Mean": -0.24701285362243652,
      "accuracy": 0.8203125,
      "epoch": 0.4575,
      "step": 183
    },
    {
      "epoch": 0.46,
      "grad_norm": 8.39130687713623,
      "learning_rate": 1.7052631578947369e-06,
      "loss": 0.3704,
      "step": 184
    },
    {
      "Batch Mean": 0.25158822536468506,
      "accuracy": 0.84375,
      "epoch": 0.46,
      "step": 184
    },
    {
      "epoch": 0.4625,
      "grad_norm": 8.080472946166992,
      "learning_rate": 1.6973684210526316e-06,
      "loss": 0.3565,
      "step": 185
    },
    {
      "Batch Mean": 0.044036865234375,
      "accuracy": 0.7578125,
      "epoch": 0.4625,
      "step": 185
    },
    {
      "epoch": 0.465,
      "grad_norm": 10.760668754577637,
      "learning_rate": 1.6894736842105264e-06,
      "loss": 0.515,
      "step": 186
    },
    {
      "Batch Mean": 0.0032491683959960938,
      "accuracy": 0.859375,
      "epoch": 0.465,
      "step": 186
    },
    {
      "epoch": 0.4675,
      "grad_norm": 7.858923435211182,
      "learning_rate": 1.6815789473684209e-06,
      "loss": 0.3053,
      "step": 187
    },
    {
      "Batch Mean": -0.0027008056640625,
      "accuracy": 0.859375,
      "epoch": 0.4675,
      "step": 187
    },
    {
      "epoch": 0.47,
      "grad_norm": 7.517134189605713,
      "learning_rate": 1.6736842105263156e-06,
      "loss": 0.2864,
      "step": 188
    },
    {
      "Batch Mean": 0.25811076164245605,
      "accuracy": 0.8203125,
      "epoch": 0.47,
      "step": 188
    },
    {
      "epoch": 0.4725,
      "grad_norm": 7.516969680786133,
      "learning_rate": 1.6657894736842104e-06,
      "loss": 0.3299,
      "step": 189
    },
    {
      "Batch Mean": 0.024303913116455078,
      "accuracy": 0.796875,
      "epoch": 0.4725,
      "step": 189
    },
    {
      "epoch": 0.475,
      "grad_norm": 9.072182655334473,
      "learning_rate": 1.6578947368421056e-06,
      "loss": 0.4424,
      "step": 190
    },
    {
      "Batch Mean": -0.38180357217788696,
      "accuracy": 0.7890625,
      "epoch": 0.475,
      "step": 190
    },
    {
      "epoch": 0.4775,
      "grad_norm": 9.461527824401855,
      "learning_rate": 1.65e-06,
      "loss": 0.3751,
      "step": 191
    },
    {
      "Batch Mean": -0.594372034072876,
      "accuracy": 0.84375,
      "epoch": 0.4775,
      "step": 191
    },
    {
      "epoch": 0.48,
      "grad_norm": 11.741211891174316,
      "learning_rate": 1.6421052631578948e-06,
      "loss": 0.3127,
      "step": 192
    },
    {
      "Batch Mean": -0.5924481153488159,
      "accuracy": 0.890625,
      "epoch": 0.48,
      "step": 192
    },
    {
      "epoch": 0.4825,
      "grad_norm": 10.474485397338867,
      "learning_rate": 1.6342105263157896e-06,
      "loss": 0.276,
      "step": 193
    },
    {
      "Batch Mean": -0.16649462282657623,
      "accuracy": 0.8125,
      "epoch": 0.4825,
      "step": 193
    },
    {
      "epoch": 0.485,
      "grad_norm": 8.504786491394043,
      "learning_rate": 1.6263157894736843e-06,
      "loss": 0.3619,
      "step": 194
    },
    {
      "Batch Mean": 0.15191316604614258,
      "accuracy": 0.8671875,
      "epoch": 0.485,
      "step": 194
    },
    {
      "epoch": 0.4875,
      "grad_norm": 6.697569370269775,
      "learning_rate": 1.618421052631579e-06,
      "loss": 0.3237,
      "step": 195
    },
    {
      "Batch Mean": 0.5648140907287598,
      "accuracy": 0.828125,
      "epoch": 0.4875,
      "step": 195
    },
    {
      "epoch": 0.49,
      "grad_norm": 11.56078815460205,
      "learning_rate": 1.6105263157894738e-06,
      "loss": 0.3372,
      "step": 196
    },
    {
      "Batch Mean": 0.6280609369277954,
      "accuracy": 0.8203125,
      "epoch": 0.49,
      "step": 196
    },
    {
      "epoch": 0.4925,
      "grad_norm": 12.10026741027832,
      "learning_rate": 1.6026315789473683e-06,
      "loss": 0.376,
      "step": 197
    },
    {
      "Batch Mean": 0.46126890182495117,
      "accuracy": 0.796875,
      "epoch": 0.4925,
      "step": 197
    },
    {
      "epoch": 0.495,
      "grad_norm": 9.432372093200684,
      "learning_rate": 1.594736842105263e-06,
      "loss": 0.3827,
      "step": 198
    },
    {
      "Batch Mean": 0.3517181873321533,
      "accuracy": 0.796875,
      "epoch": 0.495,
      "step": 198
    },
    {
      "epoch": 0.4975,
      "grad_norm": 9.432829856872559,
      "learning_rate": 1.5868421052631578e-06,
      "loss": 0.3766,
      "step": 199
    },
    {
      "Batch Mean": -0.18610143661499023,
      "accuracy": 0.7890625,
      "epoch": 0.4975,
      "step": 199
    },
    {
      "epoch": 0.5,
      "grad_norm": 8.5609130859375,
      "learning_rate": 1.5789473684210526e-06,
      "loss": 0.4419,
      "step": 200
    },
    {
      "Batch Mean": -0.3865816593170166,
      "accuracy": 0.890625,
      "epoch": 0.5,
      "step": 200
    },
    {
      "epoch": 0.5025,
      "grad_norm": 8.512102127075195,
      "learning_rate": 1.5710526315789473e-06,
      "loss": 0.2774,
      "step": 201
    },
    {
      "Batch Mean": -0.4919016361236572,
      "accuracy": 0.859375,
      "epoch": 0.5025,
      "step": 201
    },
    {
      "epoch": 0.505,
      "grad_norm": 9.126331329345703,
      "learning_rate": 1.563157894736842e-06,
      "loss": 0.3426,
      "step": 202
    },
    {
      "Batch Mean": -0.48360347747802734,
      "accuracy": 0.78125,
      "epoch": 0.505,
      "step": 202
    },
    {
      "epoch": 0.5075,
      "grad_norm": 10.955025672912598,
      "learning_rate": 1.5552631578947368e-06,
      "loss": 0.411,
      "step": 203
    },
    {
      "Batch Mean": -0.04683363437652588,
      "accuracy": 0.796875,
      "epoch": 0.5075,
      "step": 203
    },
    {
      "epoch": 0.51,
      "grad_norm": 7.002114295959473,
      "learning_rate": 1.5473684210526318e-06,
      "loss": 0.3729,
      "step": 204
    },
    {
      "Batch Mean": 0.03174877166748047,
      "accuracy": 0.890625,
      "epoch": 0.51,
      "step": 204
    },
    {
      "epoch": 0.5125,
      "grad_norm": 6.588340759277344,
      "learning_rate": 1.5394736842105265e-06,
      "loss": 0.32,
      "step": 205
    },
    {
      "Batch Mean": 0.31306135654449463,
      "accuracy": 0.8125,
      "epoch": 0.5125,
      "step": 205
    },
    {
      "epoch": 0.515,
      "grad_norm": 9.080763816833496,
      "learning_rate": 1.5315789473684213e-06,
      "loss": 0.4178,
      "step": 206
    },
    {
      "Batch Mean": 0.5678501129150391,
      "accuracy": 0.84375,
      "epoch": 0.515,
      "step": 206
    },
    {
      "epoch": 0.5175,
      "grad_norm": 10.213749885559082,
      "learning_rate": 1.5236842105263158e-06,
      "loss": 0.3523,
      "step": 207
    },
    {
      "Batch Mean": 0.2368319034576416,
      "accuracy": 0.84375,
      "epoch": 0.5175,
      "step": 207
    },
    {
      "epoch": 0.52,
      "grad_norm": 8.609210968017578,
      "learning_rate": 1.5157894736842105e-06,
      "loss": 0.3701,
      "step": 208
    },
    {
      "Batch Mean": 0.03226196765899658,
      "accuracy": 0.8046875,
      "epoch": 0.52,
      "step": 208
    },
    {
      "epoch": 0.5225,
      "grad_norm": 8.7013578414917,
      "learning_rate": 1.5078947368421053e-06,
      "loss": 0.4091,
      "step": 209
    },
    {
      "Batch Mean": -0.3510150909423828,
      "accuracy": 0.828125,
      "epoch": 0.5225,
      "step": 209
    },
    {
      "epoch": 0.525,
      "grad_norm": 9.209632873535156,
      "learning_rate": 1.5e-06,
      "loss": 0.4172,
      "step": 210
    },
    {
      "Batch Mean": 0.02189686894416809,
      "accuracy": 0.859375,
      "epoch": 0.525,
      "step": 210
    },
    {
      "epoch": 0.5275,
      "grad_norm": 7.103114128112793,
      "learning_rate": 1.4921052631578948e-06,
      "loss": 0.3525,
      "step": 211
    },
    {
      "Batch Mean": 0.08981943130493164,
      "accuracy": 0.8359375,
      "epoch": 0.5275,
      "step": 211
    },
    {
      "epoch": 0.53,
      "grad_norm": 8.323429107666016,
      "learning_rate": 1.4842105263157895e-06,
      "loss": 0.3985,
      "step": 212
    },
    {
      "Batch Mean": -0.0260312557220459,
      "accuracy": 0.859375,
      "epoch": 0.53,
      "step": 212
    },
    {
      "epoch": 0.5325,
      "grad_norm": 6.364190101623535,
      "learning_rate": 1.4763157894736843e-06,
      "loss": 0.3283,
      "step": 213
    },
    {
      "Batch Mean": -0.45758867263793945,
      "accuracy": 0.828125,
      "epoch": 0.5325,
      "step": 213
    },
    {
      "epoch": 0.535,
      "grad_norm": 10.124600410461426,
      "learning_rate": 1.468421052631579e-06,
      "loss": 0.4036,
      "step": 214
    },
    {
      "Batch Mean": -0.12888717651367188,
      "accuracy": 0.9140625,
      "epoch": 0.535,
      "step": 214
    },
    {
      "epoch": 0.5375,
      "grad_norm": 6.383907794952393,
      "learning_rate": 1.4605263157894738e-06,
      "loss": 0.26,
      "step": 215
    },
    {
      "Batch Mean": 0.06431245803833008,
      "accuracy": 0.8203125,
      "epoch": 0.5375,
      "step": 215
    },
    {
      "epoch": 0.54,
      "grad_norm": 6.595977783203125,
      "learning_rate": 1.4526315789473685e-06,
      "loss": 0.368,
      "step": 216
    },
    {
      "Batch Mean": 0.015802383422851562,
      "accuracy": 0.8359375,
      "epoch": 0.54,
      "step": 216
    },
    {
      "epoch": 0.5425,
      "grad_norm": 5.937958240509033,
      "learning_rate": 1.4447368421052633e-06,
      "loss": 0.3386,
      "step": 217
    },
    {
      "Batch Mean": 0.2596292495727539,
      "accuracy": 0.8671875,
      "epoch": 0.5425,
      "step": 217
    },
    {
      "epoch": 0.545,
      "grad_norm": 8.265673637390137,
      "learning_rate": 1.436842105263158e-06,
      "loss": 0.3517,
      "step": 218
    },
    {
      "Batch Mean": 0.06246137619018555,
      "accuracy": 0.8515625,
      "epoch": 0.545,
      "step": 218
    },
    {
      "epoch": 0.5475,
      "grad_norm": 7.670858860015869,
      "learning_rate": 1.4289473684210525e-06,
      "loss": 0.3248,
      "step": 219
    },
    {
      "Batch Mean": 0.49367237091064453,
      "accuracy": 0.84375,
      "epoch": 0.5475,
      "step": 219
    },
    {
      "epoch": 0.55,
      "grad_norm": 9.92224407196045,
      "learning_rate": 1.4210526315789473e-06,
      "loss": 0.4093,
      "step": 220
    },
    {
      "Batch Mean": -0.056406617164611816,
      "accuracy": 0.8359375,
      "epoch": 0.55,
      "step": 220
    },
    {
      "epoch": 0.5525,
      "grad_norm": 7.7244648933410645,
      "learning_rate": 1.4131578947368422e-06,
      "loss": 0.3588,
      "step": 221
    },
    {
      "Batch Mean": -0.31797313690185547,
      "accuracy": 0.8125,
      "epoch": 0.5525,
      "step": 221
    },
    {
      "epoch": 0.555,
      "grad_norm": 8.702153205871582,
      "learning_rate": 1.405263157894737e-06,
      "loss": 0.3758,
      "step": 222
    },
    {
      "Batch Mean": 0.1353154182434082,
      "accuracy": 0.8046875,
      "epoch": 0.555,
      "step": 222
    },
    {
      "epoch": 0.5575,
      "grad_norm": 8.6244535446167,
      "learning_rate": 1.3973684210526317e-06,
      "loss": 0.4032,
      "step": 223
    },
    {
      "Batch Mean": 0.02267169952392578,
      "accuracy": 0.890625,
      "epoch": 0.5575,
      "step": 223
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.774755954742432,
      "learning_rate": 1.3894736842105263e-06,
      "loss": 0.2707,
      "step": 224
    },
    {
      "Batch Mean": -0.02749919891357422,
      "accuracy": 0.890625,
      "epoch": 0.56,
      "step": 224
    },
    {
      "epoch": 0.5625,
      "grad_norm": 5.9841532707214355,
      "learning_rate": 1.381578947368421e-06,
      "loss": 0.2991,
      "step": 225
    },
    {
      "Batch Mean": -0.05124187469482422,
      "accuracy": 0.8203125,
      "epoch": 0.5625,
      "step": 225
    },
    {
      "epoch": 0.565,
      "grad_norm": 7.033453464508057,
      "learning_rate": 1.3736842105263158e-06,
      "loss": 0.3352,
      "step": 226
    },
    {
      "Batch Mean": -0.4628112316131592,
      "accuracy": 0.828125,
      "epoch": 0.565,
      "step": 226
    },
    {
      "epoch": 0.5675,
      "grad_norm": 10.019220352172852,
      "learning_rate": 1.3657894736842107e-06,
      "loss": 0.3739,
      "step": 227
    },
    {
      "Batch Mean": 0.15478086471557617,
      "accuracy": 0.8203125,
      "epoch": 0.5675,
      "step": 227
    },
    {
      "epoch": 0.57,
      "grad_norm": 8.91198444366455,
      "learning_rate": 1.3578947368421055e-06,
      "loss": 0.3313,
      "step": 228
    },
    {
      "Batch Mean": 0.06676062941551208,
      "accuracy": 0.8828125,
      "epoch": 0.57,
      "step": 228
    },
    {
      "epoch": 0.5725,
      "grad_norm": 7.143804550170898,
      "learning_rate": 1.35e-06,
      "loss": 0.3072,
      "step": 229
    },
    {
      "Batch Mean": 0.47977638244628906,
      "accuracy": 0.8515625,
      "epoch": 0.5725,
      "step": 229
    },
    {
      "epoch": 0.575,
      "grad_norm": 11.133859634399414,
      "learning_rate": 1.3421052631578947e-06,
      "loss": 0.345,
      "step": 230
    },
    {
      "Batch Mean": 0.3305387496948242,
|
"accuracy": 0.859375, |
|
"epoch": 0.575, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5775, |
|
"grad_norm": 10.80010986328125, |
|
"learning_rate": 1.3342105263157895e-06, |
|
"loss": 0.3141, |
|
"step": 231 |
|
}, |
|
{ |
|
"Batch Mean": 0.43216419219970703, |
|
"accuracy": 0.84375, |
|
"epoch": 0.5775, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 9.948732376098633, |
|
"learning_rate": 1.3263157894736842e-06, |
|
"loss": 0.3991, |
|
"step": 232 |
|
}, |
|
{ |
|
"Batch Mean": 0.14983630180358887, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.58, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5825, |
|
"grad_norm": 7.463159084320068, |
|
"learning_rate": 1.318421052631579e-06, |
|
"loss": 0.3243, |
|
"step": 233 |
|
}, |
|
{ |
|
"Batch Mean": -0.5400023460388184, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.5825, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 11.068828582763672, |
|
"learning_rate": 1.3105263157894737e-06, |
|
"loss": 0.4177, |
|
"step": 234 |
|
}, |
|
{ |
|
"Batch Mean": -0.42549943923950195, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.585, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 11.707810401916504, |
|
"learning_rate": 1.3026315789473685e-06, |
|
"loss": 0.2911, |
|
"step": 235 |
|
}, |
|
{ |
|
"Batch Mean": -0.5239953994750977, |
|
"accuracy": 0.875, |
|
"epoch": 0.5875, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 10.632058143615723, |
|
"learning_rate": 1.2947368421052632e-06, |
|
"loss": 0.2737, |
|
"step": 236 |
|
}, |
|
{ |
|
"Batch Mean": -0.3963519334793091, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.59, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5925, |
|
"grad_norm": 11.079835891723633, |
|
"learning_rate": 1.286842105263158e-06, |
|
"loss": 0.4142, |
|
"step": 237 |
|
}, |
|
{ |
|
"Batch Mean": -0.047173500061035156, |
|
"accuracy": 0.859375, |
|
"epoch": 0.5925, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 7.266193389892578, |
|
"learning_rate": 1.2789473684210527e-06, |
|
"loss": 0.2802, |
|
"step": 238 |
|
}, |
|
{ |
|
"Batch Mean": 0.07653871178627014, |
|
"accuracy": 0.828125, |
|
"epoch": 0.595, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5975, |
|
"grad_norm": 7.714227676391602, |
|
"learning_rate": 1.2710526315789474e-06, |
|
"loss": 0.3471, |
|
"step": 239 |
|
}, |
|
{ |
|
"Batch Mean": 0.6806421279907227, |
|
"accuracy": 0.859375, |
|
"epoch": 0.5975, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 14.054402351379395, |
|
"learning_rate": 1.263157894736842e-06, |
|
"loss": 0.371, |
|
"step": 240 |
|
}, |
|
{ |
|
"Batch Mean": 0.5353469848632812, |
|
"accuracy": 0.890625, |
|
"epoch": 0.6, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6025, |
|
"grad_norm": 10.932038307189941, |
|
"learning_rate": 1.255263157894737e-06, |
|
"loss": 0.3251, |
|
"step": 241 |
|
}, |
|
{ |
|
"Batch Mean": 0.4242100715637207, |
|
"accuracy": 0.84375, |
|
"epoch": 0.6025, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 9.543030738830566, |
|
"learning_rate": 1.2473684210526317e-06, |
|
"loss": 0.366, |
|
"step": 242 |
|
}, |
|
{ |
|
"Batch Mean": 0.1224508285522461, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.605, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6075, |
|
"grad_norm": 7.535095691680908, |
|
"learning_rate": 1.2394736842105264e-06, |
|
"loss": 0.355, |
|
"step": 243 |
|
}, |
|
{ |
|
"Batch Mean": -0.1669750213623047, |
|
"accuracy": 0.828125, |
|
"epoch": 0.6075, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 7.886036396026611, |
|
"learning_rate": 1.2315789473684212e-06, |
|
"loss": 0.3262, |
|
"step": 244 |
|
}, |
|
{ |
|
"Batch Mean": -1.217991828918457, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.61, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 21.176101684570312, |
|
"learning_rate": 1.2236842105263157e-06, |
|
"loss": 0.4438, |
|
"step": 245 |
|
}, |
|
{ |
|
"Batch Mean": -0.2878119945526123, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.6125, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 10.222801208496094, |
|
"learning_rate": 1.2157894736842105e-06, |
|
"loss": 0.439, |
|
"step": 246 |
|
}, |
|
{ |
|
"Batch Mean": -0.626392126083374, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.615, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6175, |
|
"grad_norm": 11.413473129272461, |
|
"learning_rate": 1.2078947368421052e-06, |
|
"loss": 0.2802, |
|
"step": 247 |
|
}, |
|
{ |
|
"Batch Mean": -0.20267772674560547, |
|
"accuracy": 0.875, |
|
"epoch": 0.6175, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 7.269984245300293, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.3002, |
|
"step": 248 |
|
}, |
|
{ |
|
"Batch Mean": -0.5709860324859619, |
|
"accuracy": 0.828125, |
|
"epoch": 0.62, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6225, |
|
"grad_norm": 12.802457809448242, |
|
"learning_rate": 1.192105263157895e-06, |
|
"loss": 0.3694, |
|
"step": 249 |
|
}, |
|
{ |
|
"Batch Mean": -0.2514686584472656, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.6225, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 8.272834777832031, |
|
"learning_rate": 1.1842105263157894e-06, |
|
"loss": 0.3595, |
|
"step": 250 |
|
}, |
|
{ |
|
"Batch Mean": 0.7458231449127197, |
|
"accuracy": 0.8125, |
|
"epoch": 0.625, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6275, |
|
"grad_norm": 13.081140518188477, |
|
"learning_rate": 1.1763157894736842e-06, |
|
"loss": 0.3856, |
|
"step": 251 |
|
}, |
|
{ |
|
"Batch Mean": 0.17588496208190918, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.6275, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 7.108640193939209, |
|
"learning_rate": 1.168421052631579e-06, |
|
"loss": 0.332, |
|
"step": 252 |
|
}, |
|
{ |
|
"Batch Mean": 0.5019898414611816, |
|
"accuracy": 0.84375, |
|
"epoch": 0.63, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6325, |
|
"grad_norm": 8.727738380432129, |
|
"learning_rate": 1.1605263157894737e-06, |
|
"loss": 0.34, |
|
"step": 253 |
|
}, |
|
{ |
|
"Batch Mean": 0.30086028575897217, |
|
"accuracy": 0.890625, |
|
"epoch": 0.6325, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 7.778132915496826, |
|
"learning_rate": 1.1526315789473684e-06, |
|
"loss": 0.3347, |
|
"step": 254 |
|
}, |
|
{ |
|
"Batch Mean": 0.49059462547302246, |
|
"accuracy": 0.796875, |
|
"epoch": 0.635, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 10.36148738861084, |
|
"learning_rate": 1.1447368421052632e-06, |
|
"loss": 0.4268, |
|
"step": 255 |
|
}, |
|
{ |
|
"Batch Mean": 0.26512861251831055, |
|
"accuracy": 0.859375, |
|
"epoch": 0.6375, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 8.016122817993164, |
|
"learning_rate": 1.136842105263158e-06, |
|
"loss": 0.3719, |
|
"step": 256 |
|
}, |
|
{ |
|
"Batch Mean": -0.6386747360229492, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.64, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6425, |
|
"grad_norm": 10.996342658996582, |
|
"learning_rate": 1.1289473684210527e-06, |
|
"loss": 0.3249, |
|
"step": 257 |
|
}, |
|
{ |
|
"Batch Mean": -0.41381216049194336, |
|
"accuracy": 0.828125, |
|
"epoch": 0.6425, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 8.828882217407227, |
|
"learning_rate": 1.1210526315789474e-06, |
|
"loss": 0.4112, |
|
"step": 258 |
|
}, |
|
{ |
|
"Batch Mean": -0.37595319747924805, |
|
"accuracy": 0.78125, |
|
"epoch": 0.645, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6475, |
|
"grad_norm": 9.125770568847656, |
|
"learning_rate": 1.1131578947368421e-06, |
|
"loss": 0.4207, |
|
"step": 259 |
|
}, |
|
{ |
|
"Batch Mean": -0.22586441040039062, |
|
"accuracy": 0.875, |
|
"epoch": 0.6475, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 7.365307807922363, |
|
"learning_rate": 1.1052631578947369e-06, |
|
"loss": 0.3451, |
|
"step": 260 |
|
}, |
|
{ |
|
"Batch Mean": -0.028152525424957275, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.65, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6525, |
|
"grad_norm": 6.994399070739746, |
|
"learning_rate": 1.0973684210526316e-06, |
|
"loss": 0.4034, |
|
"step": 261 |
|
}, |
|
{ |
|
"Batch Mean": 0.17945003509521484, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.6525, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 7.0014519691467285, |
|
"learning_rate": 1.0894736842105264e-06, |
|
"loss": 0.3369, |
|
"step": 262 |
|
}, |
|
{ |
|
"Batch Mean": -0.10725253820419312, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.655, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6575, |
|
"grad_norm": 6.243435382843018, |
|
"learning_rate": 1.0815789473684211e-06, |
|
"loss": 0.3248, |
|
"step": 263 |
|
}, |
|
{ |
|
"Batch Mean": 0.28885936737060547, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.6575, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 8.345162391662598, |
|
"learning_rate": 1.0736842105263159e-06, |
|
"loss": 0.3149, |
|
"step": 264 |
|
}, |
|
{ |
|
"Batch Mean": 0.3201725482940674, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.66, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 8.161121368408203, |
|
"learning_rate": 1.0657894736842106e-06, |
|
"loss": 0.3174, |
|
"step": 265 |
|
}, |
|
{ |
|
"Batch Mean": 0.2368483543395996, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.6625, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 7.132945537567139, |
|
"learning_rate": 1.0578947368421052e-06, |
|
"loss": 0.3138, |
|
"step": 266 |
|
}, |
|
{ |
|
"Batch Mean": 0.5943679809570312, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.665, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6675, |
|
"grad_norm": 10.465624809265137, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.2962, |
|
"step": 267 |
|
}, |
|
{ |
|
"Batch Mean": 0.4077954888343811, |
|
"accuracy": 0.890625, |
|
"epoch": 0.6675, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 8.254660606384277, |
|
"learning_rate": 1.0421052631578949e-06, |
|
"loss": 0.2817, |
|
"step": 268 |
|
}, |
|
{ |
|
"Batch Mean": -0.07658010721206665, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.67, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6725, |
|
"grad_norm": 7.049543380737305, |
|
"learning_rate": 1.0342105263157896e-06, |
|
"loss": 0.3661, |
|
"step": 269 |
|
}, |
|
{ |
|
"Batch Mean": -0.16821885108947754, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.6725, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 6.973754405975342, |
|
"learning_rate": 1.0263157894736843e-06, |
|
"loss": 0.3486, |
|
"step": 270 |
|
}, |
|
{ |
|
"Batch Mean": -0.5536280870437622, |
|
"accuracy": 0.828125, |
|
"epoch": 0.675, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6775, |
|
"grad_norm": 10.284018516540527, |
|
"learning_rate": 1.0184210526315789e-06, |
|
"loss": 0.3327, |
|
"step": 271 |
|
}, |
|
{ |
|
"Batch Mean": -0.12136930227279663, |
|
"accuracy": 0.859375, |
|
"epoch": 0.6775, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.349569797515869, |
|
"learning_rate": 1.0105263157894736e-06, |
|
"loss": 0.3365, |
|
"step": 272 |
|
}, |
|
{ |
|
"Batch Mean": -0.21032047271728516, |
|
"accuracy": 0.859375, |
|
"epoch": 0.68, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6825, |
|
"grad_norm": 8.34876823425293, |
|
"learning_rate": 1.0026315789473684e-06, |
|
"loss": 0.3146, |
|
"step": 273 |
|
}, |
|
{ |
|
"Batch Mean": -0.41312098503112793, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.6825, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 8.684778213500977, |
|
"learning_rate": 9.947368421052631e-07, |
|
"loss": 0.3135, |
|
"step": 274 |
|
}, |
|
{ |
|
"Batch Mean": -0.10464096069335938, |
|
"accuracy": 0.859375, |
|
"epoch": 0.685, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 6.903010845184326, |
|
"learning_rate": 9.86842105263158e-07, |
|
"loss": 0.3719, |
|
"step": 275 |
|
}, |
|
{ |
|
"Batch Mean": 0.23662281036376953, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.6875, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 7.497840881347656, |
|
"learning_rate": 9.789473684210526e-07, |
|
"loss": 0.3371, |
|
"step": 276 |
|
}, |
|
{ |
|
"Batch Mean": 0.13962489366531372, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.69, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6925, |
|
"grad_norm": 6.886028289794922, |
|
"learning_rate": 9.710526315789474e-07, |
|
"loss": 0.3285, |
|
"step": 277 |
|
}, |
|
{ |
|
"Batch Mean": 0.03725790977478027, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.6925, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 6.258803844451904, |
|
"learning_rate": 9.63157894736842e-07, |
|
"loss": 0.2595, |
|
"step": 278 |
|
}, |
|
{ |
|
"Batch Mean": 0.39067840576171875, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.695, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6975, |
|
"grad_norm": 8.779505729675293, |
|
"learning_rate": 9.552631578947368e-07, |
|
"loss": 0.3781, |
|
"step": 279 |
|
}, |
|
{ |
|
"Batch Mean": 0.06871318817138672, |
|
"accuracy": 0.828125, |
|
"epoch": 0.6975, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 7.97261905670166, |
|
"learning_rate": 9.473684210526316e-07, |
|
"loss": 0.3928, |
|
"step": 280 |
|
}, |
|
{ |
|
"Batch Mean": 0.23102843761444092, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.7, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7025, |
|
"grad_norm": 7.682852745056152, |
|
"learning_rate": 9.394736842105262e-07, |
|
"loss": 0.3187, |
|
"step": 281 |
|
}, |
|
{ |
|
"Batch Mean": 0.061847686767578125, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.7025, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 6.796916961669922, |
|
"learning_rate": 9.315789473684212e-07, |
|
"loss": 0.3183, |
|
"step": 282 |
|
}, |
|
{ |
|
"Batch Mean": -0.49417901039123535, |
|
"accuracy": 0.828125, |
|
"epoch": 0.705, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7075, |
|
"grad_norm": 11.345503807067871, |
|
"learning_rate": 9.236842105263158e-07, |
|
"loss": 0.3701, |
|
"step": 283 |
|
}, |
|
{ |
|
"Batch Mean": -0.13564801216125488, |
|
"accuracy": 0.875, |
|
"epoch": 0.7075, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.1998515129089355, |
|
"learning_rate": 9.157894736842106e-07, |
|
"loss": 0.2908, |
|
"step": 284 |
|
}, |
|
{ |
|
"Batch Mean": -0.20754623413085938, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.71, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 9.002010345458984, |
|
"learning_rate": 9.078947368421053e-07, |
|
"loss": 0.4074, |
|
"step": 285 |
|
}, |
|
{ |
|
"Batch Mean": -0.48828125, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7125, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 11.633318901062012, |
|
"learning_rate": 9e-07, |
|
"loss": 0.4178, |
|
"step": 286 |
|
}, |
|
{ |
|
"Batch Mean": -0.30758142471313477, |
|
"accuracy": 0.8046875, |
|
"epoch": 0.715, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7175, |
|
"grad_norm": 8.946621894836426, |
|
"learning_rate": 8.921052631578947e-07, |
|
"loss": 0.4023, |
|
"step": 287 |
|
}, |
|
{ |
|
"Batch Mean": -0.24362659454345703, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.7175, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 8.48685359954834, |
|
"learning_rate": 8.842105263157895e-07, |
|
"loss": 0.3542, |
|
"step": 288 |
|
}, |
|
{ |
|
"Batch Mean": 0.4370979368686676, |
|
"accuracy": 0.796875, |
|
"epoch": 0.72, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7225, |
|
"grad_norm": 9.334733009338379, |
|
"learning_rate": 8.763157894736843e-07, |
|
"loss": 0.3882, |
|
"step": 289 |
|
}, |
|
{ |
|
"Batch Mean": 0.4301643371582031, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7225, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 9.78183650970459, |
|
"learning_rate": 8.68421052631579e-07, |
|
"loss": 0.4386, |
|
"step": 290 |
|
}, |
|
{ |
|
"Batch Mean": 0.3213467597961426, |
|
"accuracy": 0.84375, |
|
"epoch": 0.725, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7275, |
|
"grad_norm": 7.349422454833984, |
|
"learning_rate": 8.605263157894737e-07, |
|
"loss": 0.3187, |
|
"step": 291 |
|
}, |
|
{ |
|
"Batch Mean": 0.03937339782714844, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.7275, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 5.609205722808838, |
|
"learning_rate": 8.526315789473684e-07, |
|
"loss": 0.2493, |
|
"step": 292 |
|
}, |
|
{ |
|
"Batch Mean": 0.48126816749572754, |
|
"accuracy": 0.890625, |
|
"epoch": 0.73, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7325, |
|
"grad_norm": 10.584858894348145, |
|
"learning_rate": 8.447368421052632e-07, |
|
"loss": 0.3036, |
|
"step": 293 |
|
}, |
|
{ |
|
"Batch Mean": -0.08370530605316162, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7325, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 7.3887739181518555, |
|
"learning_rate": 8.368421052631578e-07, |
|
"loss": 0.4238, |
|
"step": 294 |
|
}, |
|
{ |
|
"Batch Mean": -0.32436466217041016, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.735, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 8.470144271850586, |
|
"learning_rate": 8.289473684210528e-07, |
|
"loss": 0.3559, |
|
"step": 295 |
|
}, |
|
{ |
|
"Batch Mean": -0.09412622451782227, |
|
"accuracy": 0.875, |
|
"epoch": 0.7375, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 5.89725923538208, |
|
"learning_rate": 8.210526315789474e-07, |
|
"loss": 0.3067, |
|
"step": 296 |
|
}, |
|
{ |
|
"Batch Mean": -0.07551002502441406, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.74, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7425, |
|
"grad_norm": 6.473114967346191, |
|
"learning_rate": 8.131578947368422e-07, |
|
"loss": 0.3266, |
|
"step": 297 |
|
}, |
|
{ |
|
"Batch Mean": -0.17006683349609375, |
|
"accuracy": 0.78125, |
|
"epoch": 0.7425, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 8.315503120422363, |
|
"learning_rate": 8.052631578947369e-07, |
|
"loss": 0.4281, |
|
"step": 298 |
|
}, |
|
{ |
|
"Batch Mean": 0.375512957572937, |
|
"accuracy": 0.890625, |
|
"epoch": 0.745, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7475, |
|
"grad_norm": 7.217694282531738, |
|
"learning_rate": 7.973684210526315e-07, |
|
"loss": 0.2905, |
|
"step": 299 |
|
}, |
|
{ |
|
"Batch Mean": 0.051157474517822266, |
|
"accuracy": 0.828125, |
|
"epoch": 0.7475, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.537591934204102, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.3391, |
|
"step": 300 |
|
}, |
|
{ |
|
"Batch Mean": 0.12043190002441406, |
|
"accuracy": 0.765625, |
|
"epoch": 0.75, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7525, |
|
"grad_norm": 6.898367881774902, |
|
"learning_rate": 7.81578947368421e-07, |
|
"loss": 0.4138, |
|
"step": 301 |
|
}, |
|
{ |
|
"Batch Mean": 0.25075435638427734, |
|
"accuracy": 0.828125, |
|
"epoch": 0.7525, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 7.489468574523926, |
|
"learning_rate": 7.736842105263159e-07, |
|
"loss": 0.354, |
|
"step": 302 |
|
}, |
|
{ |
|
"Batch Mean": 0.03667092323303223, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.755, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7575, |
|
"grad_norm": 7.046087741851807, |
|
"learning_rate": 7.657894736842106e-07, |
|
"loss": 0.3718, |
|
"step": 303 |
|
}, |
|
{ |
|
"Batch Mean": -0.3230457305908203, |
|
"accuracy": 0.7890625, |
|
"epoch": 0.7575, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 8.91894245147705, |
|
"learning_rate": 7.578947368421053e-07, |
|
"loss": 0.4313, |
|
"step": 304 |
|
}, |
|
{ |
|
"Batch Mean": 0.05684959888458252, |
|
"accuracy": 0.921875, |
|
"epoch": 0.76, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 5.546080112457275, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.2463, |
|
"step": 305 |
|
}, |
|
{ |
|
"Batch Mean": -0.19317865371704102, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7625, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 6.150706768035889, |
|
"learning_rate": 7.421052631578948e-07, |
|
"loss": 0.2549, |
|
"step": 306 |
|
}, |
|
{ |
|
"Batch Mean": 0.09738802909851074, |
|
"accuracy": 0.828125, |
|
"epoch": 0.765, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7675, |
|
"grad_norm": 7.414944171905518, |
|
"learning_rate": 7.342105263157895e-07, |
|
"loss": 0.3891, |
|
"step": 307 |
|
}, |
|
{ |
|
"Batch Mean": -0.32265615463256836, |
|
"accuracy": 0.828125, |
|
"epoch": 0.7675, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 7.190647125244141, |
|
"learning_rate": 7.263157894736843e-07, |
|
"loss": 0.3282, |
|
"step": 308 |
|
}, |
|
{ |
|
"Batch Mean": 0.009046733379364014, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.77, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7725, |
|
"grad_norm": 8.073942184448242, |
|
"learning_rate": 7.18421052631579e-07, |
|
"loss": 0.3754, |
|
"step": 309 |
|
}, |
|
{ |
|
"Batch Mean": 0.047414422035217285, |
|
"accuracy": 0.90625, |
|
"epoch": 0.7725, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 5.672469615936279, |
|
"learning_rate": 7.105263157894736e-07, |
|
"loss": 0.2681, |
|
"step": 310 |
|
}, |
|
{ |
|
"Batch Mean": -0.12704157829284668, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.775, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7775, |
|
"grad_norm": 7.583930492401123, |
|
"learning_rate": 7.026315789473685e-07, |
|
"loss": 0.3454, |
|
"step": 311 |
|
}, |
|
{ |
|
"Batch Mean": 0.19763851165771484, |
|
"accuracy": 0.84375, |
|
"epoch": 0.7775, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 6.852874755859375, |
|
"learning_rate": 6.947368421052631e-07, |
|
"loss": 0.3431, |
|
"step": 312 |
|
}, |
|
{ |
|
"Batch Mean": 0.07733583450317383, |
|
"accuracy": 0.8125, |
|
"epoch": 0.78, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7825, |
|
"grad_norm": 7.628359794616699, |
|
"learning_rate": 6.868421052631579e-07, |
|
"loss": 0.4149, |
|
"step": 313 |
|
}, |
|
{ |
|
"Batch Mean": 0.18402791023254395, |
|
"accuracy": 0.765625, |
|
"epoch": 0.7825, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 8.48436450958252, |
|
"learning_rate": 6.789473684210527e-07, |
|
"loss": 0.4332, |
|
"step": 314 |
|
}, |
|
{ |
|
"Batch Mean": -0.02489006519317627, |
|
"accuracy": 0.828125, |
|
"epoch": 0.785, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 7.366763114929199, |
|
"learning_rate": 6.710526315789474e-07, |
|
"loss": 0.3156, |
|
"step": 315 |
|
}, |
|
{ |
|
"Batch Mean": 0.26845455169677734, |
|
"accuracy": 0.796875, |
|
"epoch": 0.7875, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 8.023492813110352, |
|
"learning_rate": 6.631578947368421e-07, |
|
"loss": 0.397, |
|
"step": 316 |
|
}, |
|
{ |
|
"Batch Mean": -0.23256540298461914, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.79, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7925, |
|
"grad_norm": 7.89279842376709, |
|
"learning_rate": 6.552631578947369e-07, |
|
"loss": 0.3337, |
|
"step": 317 |
|
}, |
|
{ |
|
"Batch Mean": -0.05113410949707031, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.7925, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 7.3378496170043945, |
|
"learning_rate": 6.473684210526316e-07, |
|
"loss": 0.3519, |
|
"step": 318 |
|
}, |
|
{ |
|
"Batch Mean": -0.11820012331008911, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.795, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7975, |
|
"grad_norm": 7.006772994995117, |
|
"learning_rate": 6.394736842105264e-07, |
|
"loss": 0.3382, |
|
"step": 319 |
|
}, |
|
{ |
|
"Batch Mean": 0.06811904907226562, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.7975, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.040895938873291, |
|
"learning_rate": 6.31578947368421e-07, |
|
"loss": 0.3181, |
|
"step": 320 |
|
}, |
|
{ |
|
"Batch Mean": -0.14085137844085693, |
|
"accuracy": 0.875, |
|
"epoch": 0.8, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8025, |
|
"grad_norm": 6.362124919891357, |
|
"learning_rate": 6.236842105263158e-07, |
|
"loss": 0.2963, |
|
"step": 321 |
|
}, |
|
{ |
|
"Batch Mean": -0.024917006492614746, |
|
"accuracy": 0.8125, |
|
"epoch": 0.8025, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.805, |
|
"grad_norm": 6.772774696350098, |
|
"learning_rate": 6.157894736842106e-07, |
|
"loss": 0.3363, |
|
"step": 322 |
|
}, |
|
{ |
|
"Batch Mean": -0.04024147987365723, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.805, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8075, |
|
"grad_norm": 8.158961296081543, |
|
"learning_rate": 6.078947368421052e-07, |
|
"loss": 0.3641, |
|
"step": 323 |
|
}, |
|
{ |
|
"Batch Mean": 0.0783843994140625, |
|
"accuracy": 0.84375, |
|
"epoch": 0.8075, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 6.623997211456299, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.3166, |
|
"step": 324 |
|
}, |
|
{ |
|
"Batch Mean": 0.15433627367019653, |
|
"accuracy": 0.890625, |
|
"epoch": 0.81, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 7.545943737030029, |
|
"learning_rate": 5.921052631578947e-07, |
|
"loss": 0.3395, |
|
"step": 325 |
|
}, |
|
{ |
|
"Batch Mean": -0.08928751945495605, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.8125, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.815, |
|
"grad_norm": 8.483359336853027, |
|
"learning_rate": 5.842105263157895e-07, |
|
"loss": 0.4117, |
|
"step": 326 |
|
}, |
|
{ |
|
"Batch Mean": -0.16211652755737305, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.815, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8175, |
|
"grad_norm": 6.594727993011475, |
|
"learning_rate": 5.763157894736842e-07, |
|
"loss": 0.3159, |
|
"step": 327 |
|
}, |
|
{ |
|
"Batch Mean": -0.07417866587638855, |
|
"accuracy": 0.890625, |
|
"epoch": 0.8175, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.430022716522217, |
|
"learning_rate": 5.68421052631579e-07, |
|
"loss": 0.2779, |
|
"step": 328 |
|
}, |
|
{ |
|
"Batch Mean": 0.08148670196533203, |
|
"accuracy": 0.7734375, |
|
"epoch": 0.82, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8225, |
|
"grad_norm": 7.70820951461792, |
|
"learning_rate": 5.605263157894737e-07, |
|
"loss": 0.4169, |
|
"step": 329 |
|
}, |
|
{ |
|
"Batch Mean": 0.21605968475341797, |
|
"accuracy": 0.84375, |
|
"epoch": 0.8225, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 8.158958435058594, |
|
"learning_rate": 5.526315789473684e-07, |
|
"loss": 0.3429, |
|
"step": 330 |
|
}, |
|
{ |
|
"Batch Mean": -0.37101975083351135, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.825, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8275, |
|
"grad_norm": 8.67979621887207, |
|
"learning_rate": 5.447368421052632e-07, |
|
"loss": 0.3413, |
|
"step": 331 |
|
}, |
|
{ |
|
"Batch Mean": 0.013836383819580078, |
|
"accuracy": 0.875, |
|
"epoch": 0.8275, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 6.170620441436768, |
|
"learning_rate": 5.368421052631579e-07, |
|
"loss": 0.3218, |
|
"step": 332 |
|
}, |
|
{ |
|
"Batch Mean": 0.11910724639892578, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.83, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8325, |
|
"grad_norm": 7.671329498291016, |
|
"learning_rate": 5.289473684210526e-07, |
|
"loss": 0.3497, |
|
"step": 333 |
|
}, |
|
{ |
|
"Batch Mean": -0.0868523120880127, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.8325, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.835, |
|
"grad_norm": 8.080835342407227, |
|
"learning_rate": 5.210526315789474e-07, |
|
"loss": 0.444, |
|
"step": 334 |
|
}, |
|
{ |
|
"Batch Mean": -0.2944847345352173, |
|
"accuracy": 0.890625, |
|
"epoch": 0.835, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8375, |
|
"grad_norm": 7.778637409210205, |
|
"learning_rate": 5.131578947368422e-07, |
|
"loss": 0.3101, |
|
"step": 335 |
|
}, |
|
{ |
|
"Batch Mean": -0.021279096603393555, |
|
"accuracy": 0.8125, |
|
"epoch": 0.8375, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 7.106449604034424, |
|
"learning_rate": 5.052631578947368e-07, |
|
"loss": 0.3444, |
|
"step": 336 |
|
}, |
|
{ |
|
"Batch Mean": -0.015690743923187256, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.84, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8425, |
|
"grad_norm": 6.589627742767334, |
|
"learning_rate": 4.973684210526316e-07, |
|
"loss": 0.3206, |
|
"step": 337 |
|
}, |
|
{ |
|
"Batch Mean": -0.2671794891357422, |
|
"accuracy": 0.859375, |
|
"epoch": 0.8425, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.845, |
|
"grad_norm": 7.155620098114014, |
|
"learning_rate": 4.894736842105263e-07, |
|
"loss": 0.284, |
|
"step": 338 |
|
}, |
|
{ |
|
"Batch Mean": 0.4332003593444824, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.845, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8475, |
|
"grad_norm": 8.876181602478027, |
|
"learning_rate": 4.81578947368421e-07, |
|
"loss": 0.3197, |
|
"step": 339 |
|
}, |
|
{ |
|
"Batch Mean": -0.24415969848632812, |
|
"accuracy": 0.84375, |
|
"epoch": 0.8475, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 7.213611602783203, |
|
"learning_rate": 4.736842105263158e-07, |
|
"loss": 0.3136, |
|
"step": 340 |
|
}, |
|
{ |
|
"Batch Mean": 0.1467573642730713, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.85, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8525, |
|
"grad_norm": 6.28555965423584, |
|
"learning_rate": 4.657894736842106e-07, |
|
"loss": 0.2883, |
|
"step": 341 |
|
}, |
|
{ |
|
"Batch Mean": 0.32343077659606934, |
|
"accuracy": 0.890625, |
|
"epoch": 0.8525, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.855, |
|
"grad_norm": 7.480923175811768, |
|
"learning_rate": 4.578947368421053e-07, |
|
"loss": 0.2804, |
|
"step": 342 |
|
}, |
|
{ |
|
"Batch Mean": 0.11205053329467773, |
|
"accuracy": 0.9140625, |
|
"epoch": 0.855, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8575, |
|
"grad_norm": 6.250729560852051, |
|
"learning_rate": 4.5e-07, |
|
"loss": 0.3269, |
|
"step": 343 |
|
}, |
|
{ |
|
"Batch Mean": 0.5144360661506653, |
|
"accuracy": 0.828125, |
|
"epoch": 0.8575, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 11.377030372619629, |
|
"learning_rate": 4.421052631578947e-07, |
|
"loss": 0.4329, |
|
"step": 344 |
|
}, |
|
{ |
|
"Batch Mean": 0.508669376373291, |
|
"accuracy": 0.8125, |
|
"epoch": 0.86, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8625, |
|
"grad_norm": 10.210086822509766, |
|
"learning_rate": 4.342105263157895e-07, |
|
"loss": 0.3719, |
|
"step": 345 |
|
}, |
|
{ |
|
"Batch Mean": 0.08904838562011719, |
|
"accuracy": 0.859375, |
|
"epoch": 0.8625, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.865, |
|
"grad_norm": 7.207253932952881, |
|
"learning_rate": 4.263157894736842e-07, |
|
"loss": 0.3348, |
|
"step": 346 |
|
}, |
|
{ |
|
"Batch Mean": -0.4642624855041504, |
|
"accuracy": 0.84375, |
|
"epoch": 0.865, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8675, |
|
"grad_norm": 11.001867294311523, |
|
"learning_rate": 4.184210526315789e-07, |
|
"loss": 0.3268, |
|
"step": 347 |
|
}, |
|
{ |
|
"Batch Mean": -0.29027581214904785, |
|
"accuracy": 0.8125, |
|
"epoch": 0.8675, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 9.372380256652832, |
|
"learning_rate": 4.105263157894737e-07, |
|
"loss": 0.4095, |
|
"step": 348 |
|
}, |
|
{ |
|
"Batch Mean": -0.41428279876708984, |
|
"accuracy": 0.84375, |
|
"epoch": 0.87, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8725, |
|
"grad_norm": 9.246240615844727, |
|
"learning_rate": 4.0263157894736845e-07, |
|
"loss": 0.3333, |
|
"step": 349 |
|
}, |
|
{ |
|
"Batch Mean": -0.2982146739959717, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.8725, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 7.75571346282959, |
|
"learning_rate": 3.9473684210526315e-07, |
|
"loss": 0.3629, |
|
"step": 350 |
|
}, |
|
{ |
|
"Batch Mean": -0.02528667449951172, |
|
"accuracy": 0.828125, |
|
"epoch": 0.875, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8775, |
|
"grad_norm": 8.56804084777832, |
|
"learning_rate": 3.8684210526315794e-07, |
|
"loss": 0.4156, |
|
"step": 351 |
|
}, |
|
{ |
|
"Batch Mean": -0.17065882682800293, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.8775, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 6.510917663574219, |
|
"learning_rate": 3.7894736842105264e-07, |
|
"loss": 0.3052, |
|
"step": 352 |
|
}, |
|
{ |
|
"Batch Mean": -0.3633178472518921, |
|
"accuracy": 0.84375, |
|
"epoch": 0.88, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8825, |
|
"grad_norm": 7.901680946350098, |
|
"learning_rate": 3.710526315789474e-07, |
|
"loss": 0.3647, |
|
"step": 353 |
|
}, |
|
{ |
|
"Batch Mean": 0.17653004825115204, |
|
"accuracy": 0.875, |
|
"epoch": 0.8825, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.885, |
|
"grad_norm": 6.570638179779053, |
|
"learning_rate": 3.6315789473684213e-07, |
|
"loss": 0.2676, |
|
"step": 354 |
|
}, |
|
{ |
|
"Batch Mean": 0.008600831031799316, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.885, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8875, |
|
"grad_norm": 7.484086036682129, |
|
"learning_rate": 3.552631578947368e-07, |
|
"loss": 0.3628, |
|
"step": 355 |
|
}, |
|
{ |
|
"Batch Mean": 0.4547309875488281, |
|
"accuracy": 0.7578125, |
|
"epoch": 0.8875, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 9.815321922302246, |
|
"learning_rate": 3.4736842105263157e-07, |
|
"loss": 0.4354, |
|
"step": 356 |
|
}, |
|
{ |
|
"Batch Mean": 0.03906369209289551, |
|
"accuracy": 0.828125, |
|
"epoch": 0.89, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8925, |
|
"grad_norm": 6.368960857391357, |
|
"learning_rate": 3.3947368421052636e-07, |
|
"loss": 0.3352, |
|
"step": 357 |
|
}, |
|
{ |
|
"Batch Mean": -0.0681946873664856, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.8925, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.895, |
|
"grad_norm": 5.644773006439209, |
|
"learning_rate": 3.3157894736842106e-07, |
|
"loss": 0.3015, |
|
"step": 358 |
|
}, |
|
{ |
|
"Batch Mean": 0.0739591121673584, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.895, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8975, |
|
"grad_norm": 6.563121795654297, |
|
"learning_rate": 3.236842105263158e-07, |
|
"loss": 0.3015, |
|
"step": 359 |
|
}, |
|
{ |
|
"Batch Mean": 0.2460918426513672, |
|
"accuracy": 0.859375, |
|
"epoch": 0.8975, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.774952411651611, |
|
"learning_rate": 3.157894736842105e-07, |
|
"loss": 0.2958, |
|
"step": 360 |
|
}, |
|
{ |
|
"Batch Mean": 0.34413933753967285, |
|
"accuracy": 0.859375, |
|
"epoch": 0.9, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9025, |
|
"grad_norm": 7.19598388671875, |
|
"learning_rate": 3.078947368421053e-07, |
|
"loss": 0.2959, |
|
"step": 361 |
|
}, |
|
{ |
|
"Batch Mean": -0.12102225422859192, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.9025, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.905, |
|
"grad_norm": 7.87446928024292, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 0.3272, |
|
"step": 362 |
|
}, |
|
{ |
|
"Batch Mean": -0.2931605577468872, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.905, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9075, |
|
"grad_norm": 6.900648593902588, |
|
"learning_rate": 2.9210526315789473e-07, |
|
"loss": 0.3052, |
|
"step": 363 |
|
}, |
|
{ |
|
"Batch Mean": 0.13887238502502441, |
|
"accuracy": 0.8125, |
|
"epoch": 0.9075, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 7.05329704284668, |
|
"learning_rate": 2.842105263157895e-07, |
|
"loss": 0.3621, |
|
"step": 364 |
|
}, |
|
{ |
|
"Batch Mean": -0.48413872718811035, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.91, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9125, |
|
"grad_norm": 9.932937622070312, |
|
"learning_rate": 2.763157894736842e-07, |
|
"loss": 0.3113, |
|
"step": 365 |
|
}, |
|
{ |
|
"Batch Mean": 0.14877915382385254, |
|
"accuracy": 0.859375, |
|
"epoch": 0.9125, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.915, |
|
"grad_norm": 6.157749176025391, |
|
"learning_rate": 2.6842105263157897e-07, |
|
"loss": 0.2868, |
|
"step": 366 |
|
}, |
|
{ |
|
"Batch Mean": -0.3388862609863281, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.915, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9175, |
|
"grad_norm": 8.218609809875488, |
|
"learning_rate": 2.605263157894737e-07, |
|
"loss": 0.3538, |
|
"step": 367 |
|
}, |
|
{ |
|
"Batch Mean": 0.21689987182617188, |
|
"accuracy": 0.90625, |
|
"epoch": 0.9175, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.768619537353516, |
|
"learning_rate": 2.526315789473684e-07, |
|
"loss": 0.2282, |
|
"step": 368 |
|
}, |
|
{ |
|
"Batch Mean": 0.23206400871276855, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.92, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9225, |
|
"grad_norm": 8.070509910583496, |
|
"learning_rate": 2.4473684210526315e-07, |
|
"loss": 0.3599, |
|
"step": 369 |
|
}, |
|
{ |
|
"Batch Mean": 0.37568068504333496, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.9225, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 8.64689826965332, |
|
"learning_rate": 2.368421052631579e-07, |
|
"loss": 0.3361, |
|
"step": 370 |
|
}, |
|
{ |
|
"Batch Mean": -0.009695947170257568, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.925, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9275, |
|
"grad_norm": 6.34818696975708, |
|
"learning_rate": 2.2894736842105264e-07, |
|
"loss": 0.3253, |
|
"step": 371 |
|
}, |
|
{ |
|
"Batch Mean": 0.21973037719726562, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.9275, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 7.155085563659668, |
|
"learning_rate": 2.2105263157894736e-07, |
|
"loss": 0.3446, |
|
"step": 372 |
|
}, |
|
{ |
|
"Batch Mean": -0.09836244583129883, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.93, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9325, |
|
"grad_norm": 5.654759407043457, |
|
"learning_rate": 2.131578947368421e-07, |
|
"loss": 0.2657, |
|
"step": 373 |
|
}, |
|
{ |
|
"Batch Mean": 0.13301897048950195, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.9325, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.935, |
|
"grad_norm": 6.158660411834717, |
|
"learning_rate": 2.0526315789473685e-07, |
|
"loss": 0.248, |
|
"step": 374 |
|
}, |
|
{ |
|
"Batch Mean": 0.45058536529541016, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.935, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 9.395992279052734, |
|
"learning_rate": 1.9736842105263157e-07, |
|
"loss": 0.3643, |
|
"step": 375 |
|
}, |
|
{ |
|
"Batch Mean": -0.34908556938171387, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.9375, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 8.39039134979248, |
|
"learning_rate": 1.8947368421052632e-07, |
|
"loss": 0.3645, |
|
"step": 376 |
|
}, |
|
{ |
|
"Batch Mean": -0.11708331108093262, |
|
"accuracy": 0.8125, |
|
"epoch": 0.94, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9425, |
|
"grad_norm": 8.546663284301758, |
|
"learning_rate": 1.8157894736842106e-07, |
|
"loss": 0.3989, |
|
"step": 377 |
|
}, |
|
{ |
|
"Batch Mean": -0.013531684875488281, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.9425, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.945, |
|
"grad_norm": 8.821385383605957, |
|
"learning_rate": 1.7368421052631578e-07, |
|
"loss": 0.3431, |
|
"step": 378 |
|
}, |
|
{ |
|
"Batch Mean": -0.2583411931991577, |
|
"accuracy": 0.8125, |
|
"epoch": 0.945, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9475, |
|
"grad_norm": 10.091469764709473, |
|
"learning_rate": 1.6578947368421053e-07, |
|
"loss": 0.4154, |
|
"step": 379 |
|
}, |
|
{ |
|
"Batch Mean": -0.21181374788284302, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.9475, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 6.982586860656738, |
|
"learning_rate": 1.5789473684210525e-07, |
|
"loss": 0.2711, |
|
"step": 380 |
|
}, |
|
{ |
|
"Batch Mean": -0.22420763969421387, |
|
"accuracy": 0.8828125, |
|
"epoch": 0.95, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9525, |
|
"grad_norm": 7.029630661010742, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 0.2851, |
|
"step": 381 |
|
}, |
|
{ |
|
"Batch Mean": 0.01129150390625, |
|
"accuracy": 0.875, |
|
"epoch": 0.9525, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.955, |
|
"grad_norm": 6.5657219886779785, |
|
"learning_rate": 1.4210526315789474e-07, |
|
"loss": 0.3011, |
|
"step": 382 |
|
}, |
|
{ |
|
"Batch Mean": -0.07816898822784424, |
|
"accuracy": 0.875, |
|
"epoch": 0.955, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9575, |
|
"grad_norm": 6.4461669921875, |
|
"learning_rate": 1.3421052631578948e-07, |
|
"loss": 0.3308, |
|
"step": 383 |
|
}, |
|
{ |
|
"Batch Mean": -0.06400156021118164, |
|
"accuracy": 0.84375, |
|
"epoch": 0.9575, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 7.643894195556641, |
|
"learning_rate": 1.263157894736842e-07, |
|
"loss": 0.3519, |
|
"step": 384 |
|
}, |
|
{ |
|
"Batch Mean": -0.05643463134765625, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.96, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9625, |
|
"grad_norm": 7.603409290313721, |
|
"learning_rate": 1.1842105263157895e-07, |
|
"loss": 0.356, |
|
"step": 385 |
|
}, |
|
{ |
|
"Batch Mean": -0.13699448108673096, |
|
"accuracy": 0.859375, |
|
"epoch": 0.9625, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.965, |
|
"grad_norm": 8.098373413085938, |
|
"learning_rate": 1.1052631578947368e-07, |
|
"loss": 0.3473, |
|
"step": 386 |
|
}, |
|
{ |
|
"Batch Mean": 0.1001596450805664, |
|
"accuracy": 0.828125, |
|
"epoch": 0.965, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9675, |
|
"grad_norm": 7.347716331481934, |
|
"learning_rate": 1.0263157894736843e-07, |
|
"loss": 0.3799, |
|
"step": 387 |
|
}, |
|
{ |
|
"Batch Mean": -0.14007353782653809, |
|
"accuracy": 0.9140625, |
|
"epoch": 0.9675, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 6.1965413093566895, |
|
"learning_rate": 9.473684210526316e-08, |
|
"loss": 0.2634, |
|
"step": 388 |
|
}, |
|
{ |
|
"Batch Mean": -0.0102081298828125, |
|
"accuracy": 0.890625, |
|
"epoch": 0.97, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9725, |
|
"grad_norm": 5.818850517272949, |
|
"learning_rate": 8.684210526315789e-08, |
|
"loss": 0.267, |
|
"step": 389 |
|
}, |
|
{ |
|
"Batch Mean": -0.10731267929077148, |
|
"accuracy": 0.84375, |
|
"epoch": 0.9725, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 8.648015975952148, |
|
"learning_rate": 7.894736842105262e-08, |
|
"loss": 0.4098, |
|
"step": 390 |
|
}, |
|
{ |
|
"Batch Mean": 0.2496967315673828, |
|
"accuracy": 0.8671875, |
|
"epoch": 0.975, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9775, |
|
"grad_norm": 7.774208068847656, |
|
"learning_rate": 7.105263157894737e-08, |
|
"loss": 0.3166, |
|
"step": 391 |
|
}, |
|
{ |
|
"Batch Mean": 0.21097368001937866, |
|
"accuracy": 0.8203125, |
|
"epoch": 0.9775, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 7.0467424392700195, |
|
"learning_rate": 6.31578947368421e-08, |
|
"loss": 0.3168, |
|
"step": 392 |
|
}, |
|
{ |
|
"Batch Mean": 0.15709495544433594, |
|
"accuracy": 0.8515625, |
|
"epoch": 0.98, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9825, |
|
"grad_norm": 7.612072944641113, |
|
"learning_rate": 5.526315789473684e-08, |
|
"loss": 0.3379, |
|
"step": 393 |
|
}, |
|
{ |
|
"Batch Mean": -0.05299580097198486, |
|
"accuracy": 0.796875, |
|
"epoch": 0.9825, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.985, |
|
"grad_norm": 7.588818550109863, |
|
"learning_rate": 4.736842105263158e-08, |
|
"loss": 0.409, |
|
"step": 394 |
|
}, |
|
{ |
|
"Batch Mean": 0.217301607131958, |
|
"accuracy": 0.796875, |
|
"epoch": 0.985, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9875, |
|
"grad_norm": 7.986033916473389, |
|
"learning_rate": 3.947368421052631e-08, |
|
"loss": 0.3834, |
|
"step": 395 |
|
}, |
|
{ |
|
"Batch Mean": -0.03413909673690796, |
|
"accuracy": 0.859375, |
|
"epoch": 0.9875, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 7.176063060760498, |
|
"learning_rate": 3.157894736842105e-08, |
|
"loss": 0.3217, |
|
"step": 396 |
|
}, |
|
{ |
|
"Batch Mean": -0.07313796877861023, |
|
"accuracy": 0.859375, |
|
"epoch": 0.99, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9925, |
|
"grad_norm": 7.412654399871826, |
|
"learning_rate": 2.368421052631579e-08, |
|
"loss": 0.3313, |
|
"step": 397 |
|
}, |
|
{ |
|
"Batch Mean": -0.1518573760986328, |
|
"accuracy": 0.8984375, |
|
"epoch": 0.9925, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.995, |
|
"grad_norm": 6.981944561004639, |
|
"learning_rate": 1.5789473684210525e-08, |
|
"loss": 0.2881, |
|
"step": 398 |
|
}, |
|
{ |
|
"Batch Mean": 0.06286072731018066, |
|
"accuracy": 0.8359375, |
|
"epoch": 0.995, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9975, |
|
"grad_norm": 7.32573127746582, |
|
"learning_rate": 7.894736842105263e-09, |
|
"loss": 0.3567, |
|
"step": 399 |
|
}, |
|
{ |
|
"Batch Mean": 0.34907403588294983, |
|
"accuracy": 0.828125, |
|
"epoch": 0.9975, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 8.953917503356934, |
|
"learning_rate": 0.0, |
|
"loss": 0.3223, |
|
"step": 400 |
|
} |
  ],
  "logging_steps": 1,
  "max_steps": 400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}