{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 3.64324951171875, "accuracy": 0.6015625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 6.657606601715088, "learning_rate": 2.5000000000000004e-07, "loss": 0.6742, "step": 1 }, { "Batch Mean": 3.58660888671875, "accuracy": 0.5, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 5.2591233253479, "learning_rate": 5.000000000000001e-07, "loss": 0.6808, "step": 2 }, { "Batch Mean": 3.56298828125, "accuracy": 0.546875, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 5.481376647949219, "learning_rate": 7.5e-07, "loss": 0.6846, "step": 3 }, { "Batch Mean": 3.60186767578125, "accuracy": 0.5625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 7.51620626449585, "learning_rate": 1.0000000000000002e-06, "loss": 0.6892, "step": 4 }, { "Batch Mean": 3.57855224609375, "accuracy": 0.453125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 8.899469375610352, "learning_rate": 1.25e-06, "loss": 0.6956, "step": 5 }, { "Batch Mean": 3.595703125, "accuracy": 0.5546875, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 6.726147174835205, "learning_rate": 1.5e-06, "loss": 0.6839, "step": 6 }, { "Batch Mean": 3.64105224609375, "accuracy": 0.453125, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 7.116271018981934, "learning_rate": 1.75e-06, "loss": 0.7154, "step": 7 }, { "Batch Mean": 3.6541748046875, "accuracy": 0.5546875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.190150260925293, "learning_rate": 2.0000000000000003e-06, "loss": 0.6844, "step": 8 }, { "Batch Mean": 3.6507568359375, "accuracy": 0.6015625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 7.176676273345947, "learning_rate": 2.25e-06, "loss": 0.665, "step": 9 }, { "Batch Mean": 3.6649169921875, "accuracy": 0.59375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 5.139154434204102, "learning_rate": 2.5e-06, "loss": 0.6626, "step": 10 }, { "Batch Mean": 3.70733642578125, "accuracy": 0.671875, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 5.621807098388672, "learning_rate": 2.7500000000000004e-06, "loss": 0.6299, "step": 11 }, { "Batch Mean": 3.74017333984375, "accuracy": 0.65625, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 5.459871768951416, "learning_rate": 3e-06, "loss": 0.6321, "step": 12 }, { "Batch Mean": 3.8514404296875, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 7.757749557495117, "learning_rate": 3.2500000000000002e-06, "loss": 0.6453, "step": 13 }, { "Batch Mean": 3.80059814453125, "accuracy": 0.703125, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 6.681384086608887, "learning_rate": 3.5e-06, "loss": 0.5769, "step": 14 }, { "Batch Mean": 3.8978271484375, "accuracy": 0.6640625, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 10.110990524291992, "learning_rate": 3.7500000000000005e-06, "loss": 0.6354, "step": 15 }, { "Batch Mean": 3.6146774291992188, "accuracy": 0.71875, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 10.233904838562012, "learning_rate": 4.000000000000001e-06, "loss": 0.6182, "step": 16 }, { "Batch Mean": 3.5331335067749023, "accuracy": 0.6328125, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 10.035528182983398, "learning_rate": 4.25e-06, "loss": 0.6252, "step": 17 }, { "Batch Mean": 3.0357871055603027, "accuracy": 0.625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 9.972918510437012, "learning_rate": 4.5e-06, "loss": 0.6225, "step": 18 }, { "Batch Mean": 2.6747031211853027, "accuracy": 0.6796875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 21.3759765625, "learning_rate": 4.75e-06, "loss": 0.6328, "step": 19 }, { "Batch Mean": 2.418215751647949, "accuracy": 0.6875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 10.712898254394531, "learning_rate": 5e-06, "loss": 0.6352, "step": 20 }, { "Batch Mean": 1.9824256896972656, "accuracy": 0.640625, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 10.833212852478027, "learning_rate": 4.986842105263158e-06, "loss": 0.6244, "step": 21 }, { "Batch Mean": 1.5567502975463867, "accuracy": 0.6484375, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 8.710196495056152, "learning_rate": 4.973684210526316e-06, "loss": 0.6002, "step": 22 }, { "Batch Mean": 1.262591004371643, "accuracy": 0.6484375, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 9.207337379455566, "learning_rate": 4.960526315789474e-06, "loss": 0.5754, "step": 23 }, { "Batch Mean": 0.9924072027206421, "accuracy": 0.7109375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 7.179128170013428, "learning_rate": 4.947368421052632e-06, "loss": 0.5446, "step": 24 }, { "Batch Mean": 0.7883305549621582, "accuracy": 0.6484375, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 7.777095794677734, "learning_rate": 4.9342105263157895e-06, "loss": 0.6383, "step": 25 }, { "Batch Mean": 0.7204087972640991, "accuracy": 0.7109375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 6.316252708435059, "learning_rate": 4.921052631578948e-06, "loss": 0.5416, "step": 26 }, { "Batch Mean": 0.7001075744628906, "accuracy": 0.75, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 6.2172017097473145, "learning_rate": 4.907894736842106e-06, "loss": 0.5603, "step": 27 }, { "Batch Mean": 0.4948960542678833, "accuracy": 0.6640625, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.071000576019287, "learning_rate": 4.894736842105264e-06, "loss": 0.5919, "step": 28 }, { "Batch Mean": 0.6193783283233643, "accuracy": 0.609375, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 7.123428821563721, "learning_rate": 4.881578947368422e-06, "loss": 0.6265, "step": 29 }, { "Batch Mean": 0.5463962554931641, "accuracy": 0.6640625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 8.785526275634766, "learning_rate": 4.8684210526315795e-06, "loss": 0.6251, "step": 30 }, { "Batch Mean": 0.6883676052093506, "accuracy": 0.7265625, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 6.22755241394043, "learning_rate": 4.855263157894737e-06, "loss": 0.5505, "step": 31 }, { "Batch Mean": 0.8646153211593628, "accuracy": 0.71875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 5.980678558349609, "learning_rate": 4.842105263157895e-06, "loss": 0.5817, "step": 32 }, { "Batch Mean": 0.9955297708511353, "accuracy": 0.7109375, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 5.80774450302124, "learning_rate": 4.828947368421053e-06, "loss": 0.5697, "step": 33 }, { "Batch Mean": 1.163506031036377, "accuracy": 0.7421875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 7.277930736541748, "learning_rate": 4.815789473684211e-06, "loss": 0.5299, "step": 34 }, { "Batch Mean": 1.314648151397705, "accuracy": 0.7109375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 6.664306163787842, "learning_rate": 4.802631578947369e-06, "loss": 0.5548, "step": 35 }, { "Batch Mean": 1.4656352996826172, "accuracy": 0.7421875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 6.886595249176025, "learning_rate": 4.789473684210527e-06, "loss": 0.5432, "step": 36 }, { "Batch Mean": 1.5227254629135132, "accuracy": 0.7109375, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 6.418576717376709, "learning_rate": 4.7763157894736844e-06, "loss": 0.5536, "step": 37 }, { "Batch Mean": 1.631667137145996, "accuracy": 0.71875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 8.560834884643555, "learning_rate": 4.763157894736842e-06, "loss": 0.569, "step": 38 }, { "Batch Mean": 1.6476902961730957, "accuracy": 0.734375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.121230125427246, "learning_rate": 4.75e-06, "loss": 0.5418, "step": 39 }, { "Batch Mean": 1.492204189300537, "accuracy": 0.71875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 7.698504447937012, "learning_rate": 4.736842105263158e-06, "loss": 0.5079, "step": 40 }, { "Batch Mean": 1.3397908210754395, "accuracy": 0.75, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 8.164704322814941, "learning_rate": 4.723684210526316e-06, "loss": 0.5291, "step": 41 }, { "Batch Mean": 1.4944896697998047, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.740182399749756, "learning_rate": 4.710526315789474e-06, "loss": 0.5375, "step": 42 }, { "Batch Mean": 1.322850227355957, "accuracy": 0.765625, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.922670841217041, "learning_rate": 4.697368421052632e-06, "loss": 0.4777, "step": 43 }, { "Batch Mean": 1.4683116674423218, "accuracy": 0.7578125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 8.393726348876953, "learning_rate": 4.68421052631579e-06, "loss": 0.4728, "step": 44 }, { "Batch Mean": 1.6116371154785156, "accuracy": 0.7109375, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 9.989266395568848, "learning_rate": 4.671052631578948e-06, "loss": 0.5428, "step": 45 }, { "Batch Mean": 1.52450692653656, "accuracy": 0.7890625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 9.131291389465332, "learning_rate": 4.657894736842106e-06, "loss": 0.4236, "step": 46 }, { "Batch Mean": 1.776949405670166, "accuracy": 0.71875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 11.912744522094727, "learning_rate": 4.6447368421052635e-06, "loss": 0.6295, "step": 47 }, { "Batch Mean": 2.3817920684814453, "accuracy": 0.6875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 10.917041778564453, "learning_rate": 4.631578947368421e-06, "loss": 0.5316, "step": 48 }, { "Batch Mean": 2.175340175628662, "accuracy": 0.7734375, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 8.514415740966797, "learning_rate": 4.618421052631579e-06, "loss": 0.4724, "step": 49 }, { "Batch Mean": 2.521066665649414, "accuracy": 0.65625, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 10.435226440429688, "learning_rate": 4.605263157894737e-06, "loss": 0.6361, "step": 50 }, { "Batch Mean": 2.40045166015625, "accuracy": 0.7578125, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.130827903747559, "learning_rate": 4.592105263157895e-06, "loss": 0.4949, "step": 51 }, { "Batch Mean": 2.7120556831359863, "accuracy": 0.6953125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.832071304321289, "learning_rate": 4.578947368421053e-06, "loss": 0.5509, "step": 52 }, { "Batch Mean": 2.6350326538085938, "accuracy": 0.7109375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 6.527810096740723, "learning_rate": 4.565789473684211e-06, "loss": 0.5334, "step": 53 }, { "Batch Mean": 2.8320467472076416, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 6.686796188354492, "learning_rate": 4.552631578947369e-06, "loss": 0.505, "step": 54 }, { "Batch Mean": 2.5793724060058594, "accuracy": 0.71875, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 6.586344242095947, "learning_rate": 4.539473684210527e-06, "loss": 0.5145, "step": 55 }, { "Batch Mean": 2.652639389038086, "accuracy": 0.7890625, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.232706069946289, "learning_rate": 4.526315789473685e-06, "loss": 0.4847, "step": 56 }, { "Batch Mean": 2.7974865436553955, "accuracy": 0.671875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 9.864455223083496, "learning_rate": 4.513157894736843e-06, "loss": 0.5848, "step": 57 }, { "Batch Mean": 2.7397356033325195, "accuracy": 0.7734375, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 6.654571056365967, "learning_rate": 4.5e-06, "loss": 0.5149, "step": 58 }, { "Batch Mean": 3.020786762237549, "accuracy": 0.7109375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 7.521582126617432, "learning_rate": 4.4868421052631584e-06, "loss": 0.532, "step": 59 }, { "Batch Mean": 3.0784976482391357, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 7.8287272453308105, "learning_rate": 4.473684210526316e-06, "loss": 0.5201, "step": 60 }, { "Batch Mean": 3.2929041385650635, "accuracy": 0.6875, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 7.036741733551025, "learning_rate": 4.460526315789474e-06, "loss": 0.5192, "step": 61 }, { "Batch Mean": 3.2737629413604736, "accuracy": 0.7734375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.064910888671875, "learning_rate": 4.447368421052632e-06, "loss": 0.5094, "step": 62 }, { "Batch Mean": 3.0026721954345703, "accuracy": 0.75, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.189529418945312, "learning_rate": 4.43421052631579e-06, "loss": 0.4838, "step": 63 }, { "Batch Mean": 3.0598621368408203, "accuracy": 0.7109375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 7.452142238616943, "learning_rate": 4.4210526315789476e-06, "loss": 0.5297, "step": 64 }, { "Batch Mean": 2.890104293823242, "accuracy": 0.734375, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 7.773412227630615, "learning_rate": 4.407894736842105e-06, "loss": 0.4938, "step": 65 }, { "Batch Mean": 2.8558566570281982, "accuracy": 0.7890625, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 7.888664722442627, "learning_rate": 4.394736842105263e-06, "loss": 0.5426, "step": 66 }, { "Batch Mean": 2.8415894508361816, "accuracy": 0.7265625, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.992714881896973, "learning_rate": 4.381578947368421e-06, "loss": 0.5278, "step": 67 }, { "Batch Mean": 2.5067033767700195, "accuracy": 0.71875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 7.569831848144531, "learning_rate": 4.368421052631579e-06, "loss": 0.497, "step": 68 }, { "Batch Mean": 2.690906047821045, "accuracy": 0.7890625, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 8.406615257263184, "learning_rate": 4.3552631578947375e-06, "loss": 0.5383, "step": 69 }, { "Batch Mean": 2.54428768157959, "accuracy": 0.7890625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 6.022793292999268, "learning_rate": 4.342105263157895e-06, "loss": 0.4619, "step": 70 }, { "Batch Mean": 2.539039134979248, "accuracy": 0.75, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 5.931001663208008, "learning_rate": 4.328947368421053e-06, "loss": 0.4643, "step": 71 }, { "Batch Mean": 2.3624706268310547, "accuracy": 0.75, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.277031898498535, "learning_rate": 4.315789473684211e-06, "loss": 0.5027, "step": 72 }, { "Batch Mean": 2.5795602798461914, "accuracy": 0.7734375, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.181740760803223, "learning_rate": 4.302631578947369e-06, "loss": 0.4652, "step": 73 }, { "Batch Mean": 2.328335762023926, "accuracy": 0.7265625, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 6.926029205322266, "learning_rate": 4.289473684210527e-06, "loss": 0.4554, "step": 74 }, { "Batch Mean": 2.195453643798828, "accuracy": 0.7109375, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.934150695800781, "learning_rate": 4.276315789473684e-06, "loss": 0.5458, "step": 75 }, { "Batch Mean": 2.7026095390319824, "accuracy": 0.7421875, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 7.034529209136963, "learning_rate": 4.2631578947368425e-06, "loss": 0.5041, "step": 76 }, { "Batch Mean": 2.611628532409668, "accuracy": 0.734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.741730213165283, "learning_rate": 4.25e-06, "loss": 0.4804, "step": 77 }, { "Batch Mean": 2.5273165702819824, "accuracy": 0.765625, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.251089572906494, "learning_rate": 4.236842105263158e-06, "loss": 0.4959, "step": 78 }, { "Batch Mean": 2.5558762550354004, "accuracy": 0.7109375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 7.240261077880859, "learning_rate": 4.223684210526316e-06, "loss": 0.5173, "step": 79 }, { "Batch Mean": 2.2066729068756104, "accuracy": 0.78125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 7.0953474044799805, "learning_rate": 4.210526315789474e-06, "loss": 0.5174, "step": 80 }, { "Batch Mean": 2.377552032470703, "accuracy": 0.796875, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 7.204663276672363, "learning_rate": 4.197368421052632e-06, "loss": 0.4628, "step": 81 }, { "Batch Mean": 2.3107597827911377, "accuracy": 0.7734375, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.396126747131348, "learning_rate": 4.18421052631579e-06, "loss": 0.5078, "step": 82 }, { "Batch Mean": 1.8419753313064575, "accuracy": 0.7421875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.316776275634766, "learning_rate": 4.171052631578948e-06, "loss": 0.4945, "step": 83 }, { "Batch Mean": 2.162795066833496, "accuracy": 0.859375, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 6.570835113525391, "learning_rate": 4.157894736842106e-06, "loss": 0.3966, "step": 84 }, { "Batch Mean": 2.2369089126586914, "accuracy": 0.765625, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 7.585580825805664, "learning_rate": 4.144736842105263e-06, "loss": 0.5321, "step": 85 }, { "Batch Mean": 2.4348106384277344, "accuracy": 0.7265625, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 7.9683098793029785, "learning_rate": 4.1315789473684216e-06, "loss": 0.5224, "step": 86 }, { "Batch Mean": 2.9450817108154297, "accuracy": 0.8125, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 7.830796718597412, "learning_rate": 4.118421052631579e-06, "loss": 0.4866, "step": 87 }, { "Batch Mean": 2.9916036128997803, "accuracy": 0.765625, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 9.743931770324707, "learning_rate": 4.105263157894737e-06, "loss": 0.5408, "step": 88 }, { "Batch Mean": 3.0171022415161133, "accuracy": 0.8046875, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 7.245960235595703, "learning_rate": 4.092105263157895e-06, "loss": 0.4232, "step": 89 }, { "Batch Mean": 3.2700085639953613, "accuracy": 0.75, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.346113681793213, "learning_rate": 4.078947368421053e-06, "loss": 0.4981, "step": 90 }, { "Batch Mean": 3.375551700592041, "accuracy": 0.7734375, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 8.201985359191895, "learning_rate": 4.065789473684211e-06, "loss": 0.4761, "step": 91 }, { "Batch Mean": 3.438584327697754, "accuracy": 0.7265625, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 8.22840690612793, "learning_rate": 4.052631578947368e-06, "loss": 0.5708, "step": 92 }, { "Batch Mean": 3.2276859283447266, "accuracy": 0.71875, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 7.885838508605957, "learning_rate": 4.0394736842105265e-06, "loss": 0.5382, "step": 93 }, { "Batch Mean": 3.2703161239624023, "accuracy": 0.765625, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 5.956949234008789, "learning_rate": 4.026315789473684e-06, "loss": 0.4753, "step": 94 }, { "Batch Mean": 2.999190330505371, "accuracy": 0.7578125, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 6.309754848480225, "learning_rate": 4.013157894736842e-06, "loss": 0.5308, "step": 95 }, { "Batch Mean": 2.9133434295654297, "accuracy": 0.7734375, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.293734073638916, "learning_rate": 4.000000000000001e-06, "loss": 0.4746, "step": 96 }, { "Batch Mean": 2.940767288208008, "accuracy": 0.7734375, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 5.970653533935547, "learning_rate": 3.986842105263158e-06, "loss": 0.4666, "step": 97 }, { "Batch Mean": 2.4930241107940674, "accuracy": 0.828125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 5.983125686645508, "learning_rate": 3.9736842105263165e-06, "loss": 0.4405, "step": 98 }, { "Batch Mean": 2.768970012664795, "accuracy": 0.6875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 7.222211837768555, "learning_rate": 3.960526315789474e-06, "loss": 0.5831, "step": 99 }, { "Batch Mean": 2.600813150405884, "accuracy": 0.8125, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 6.174572944641113, "learning_rate": 3.947368421052632e-06, "loss": 0.3996, "step": 100 }, { "Batch Mean": 2.7795987129211426, "accuracy": 0.7421875, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 6.83671236038208, "learning_rate": 3.93421052631579e-06, "loss": 0.5049, "step": 101 }, { "Batch Mean": 3.153959274291992, "accuracy": 0.7578125, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 7.497275352478027, "learning_rate": 3.921052631578947e-06, "loss": 0.4726, "step": 102 }, { "Batch Mean": 3.3440189361572266, "accuracy": 0.8125, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 6.329479217529297, "learning_rate": 3.907894736842106e-06, "loss": 0.3769, "step": 103 }, { "Batch Mean": 3.1725075244903564, "accuracy": 0.765625, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 7.60507869720459, "learning_rate": 3.894736842105263e-06, "loss": 0.4352, "step": 104 }, { "Batch Mean": 3.409212827682495, "accuracy": 0.7578125, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 9.034660339355469, "learning_rate": 3.8815789473684214e-06, "loss": 0.4505, "step": 105 }, { "Batch Mean": 3.7719802856445312, "accuracy": 0.765625, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 8.710013389587402, "learning_rate": 3.868421052631579e-06, "loss": 0.4257, "step": 106 }, { "Batch Mean": 4.079001426696777, "accuracy": 0.71875, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 11.614973068237305, "learning_rate": 3.855263157894737e-06, "loss": 0.487, "step": 107 }, { "Batch Mean": 4.675480842590332, "accuracy": 0.7578125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 11.474054336547852, "learning_rate": 3.842105263157895e-06, "loss": 0.5035, "step": 108 }, { "Batch Mean": 4.677275657653809, "accuracy": 0.765625, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 12.407071113586426, "learning_rate": 3.828947368421053e-06, "loss": 0.5013, "step": 109 }, { "Batch Mean": 5.106083869934082, "accuracy": 0.8125, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 10.379884719848633, "learning_rate": 3.815789473684211e-06, "loss": 0.3935, "step": 110 }, { "Batch Mean": 5.2830119132995605, "accuracy": 0.8359375, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 8.18240737915039, "learning_rate": 3.802631578947369e-06, "loss": 0.3212, "step": 111 }, { "Batch Mean": 4.785458564758301, "accuracy": 0.84375, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 8.109498977661133, "learning_rate": 3.789473684210527e-06, "loss": 0.325, "step": 112 }, { "Batch Mean": 4.754400730133057, "accuracy": 0.78125, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 11.198568344116211, "learning_rate": 3.7763157894736847e-06, "loss": 0.477, "step": 113 }, { "Batch Mean": 4.87004280090332, "accuracy": 0.796875, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 10.391942977905273, "learning_rate": 3.7631578947368426e-06, "loss": 0.4144, "step": 114 }, { "Batch Mean": 5.210759162902832, "accuracy": 0.6953125, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 13.044468879699707, "learning_rate": 3.7500000000000005e-06, "loss": 0.5755, "step": 115 }, { "Batch Mean": 4.891458511352539, "accuracy": 0.75, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 10.2129487991333, "learning_rate": 3.736842105263158e-06, "loss": 0.5067, "step": 116 }, { "Batch Mean": 4.567095756530762, "accuracy": 0.828125, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 7.831666469573975, "learning_rate": 3.723684210526316e-06, "loss": 0.3706, "step": 117 }, { "Batch Mean": 4.688891410827637, "accuracy": 0.8125, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 8.019044876098633, "learning_rate": 3.710526315789474e-06, "loss": 0.398, "step": 118 }, { "Batch Mean": 4.864707946777344, "accuracy": 0.7109375, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 9.05286979675293, "learning_rate": 3.6973684210526317e-06, "loss": 0.5245, "step": 119 }, { "Batch Mean": 4.6876678466796875, "accuracy": 0.7890625, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 8.225624084472656, "learning_rate": 3.6842105263157896e-06, "loss": 0.442, "step": 120 }, { "Batch Mean": 4.1729278564453125, "accuracy": 0.7734375, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 7.661060810089111, "learning_rate": 3.6710526315789476e-06, "loss": 0.4628, "step": 121 }, { "Batch Mean": 4.011091232299805, "accuracy": 0.8359375, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 7.079358100891113, "learning_rate": 3.657894736842106e-06, "loss": 0.4218, "step": 122 }, { "Batch Mean": 4.086966514587402, "accuracy": 0.7890625, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.512862682342529, "learning_rate": 3.644736842105264e-06, "loss": 0.4282, "step": 123 }, { "Batch Mean": 4.260967254638672, "accuracy": 0.796875, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 7.515966892242432, "learning_rate": 3.6315789473684217e-06, "loss": 0.4666, "step": 124 }, { "Batch Mean": 4.493681907653809, "accuracy": 0.71875, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 7.867684364318848, "learning_rate": 3.618421052631579e-06, "loss": 0.5071, "step": 125 }, { "Batch Mean": 4.890687942504883, "accuracy": 0.8125, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 7.532710552215576, "learning_rate": 3.605263157894737e-06, "loss": 0.4652, "step": 126 }, { "Batch Mean": 4.562954902648926, "accuracy": 0.765625, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 8.537458419799805, "learning_rate": 3.592105263157895e-06, "loss": 0.4776, "step": 127 }, { "Batch Mean": 4.587188720703125, "accuracy": 0.78125, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 6.799316883087158, "learning_rate": 3.578947368421053e-06, "loss": 0.4191, "step": 128 }, { "Batch Mean": 3.9915571212768555, "accuracy": 0.75, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 8.045788764953613, "learning_rate": 3.565789473684211e-06, "loss": 0.506, "step": 129 }, { "Batch Mean": 4.073477745056152, "accuracy": 0.7578125, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.907149314880371, "learning_rate": 3.5526315789473687e-06, "loss": 0.4877, "step": 130 }, { "Batch Mean": 4.1734771728515625, "accuracy": 0.75, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 9.162409782409668, "learning_rate": 3.5394736842105266e-06, "loss": 0.5122, "step": 131 }, { "Batch Mean": 3.731600761413574, "accuracy": 0.7890625, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 6.665674209594727, "learning_rate": 3.5263157894736846e-06, "loss": 0.4075, "step": 132 }, { "Batch Mean": 3.5332088470458984, "accuracy": 0.765625, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 7.407108783721924, "learning_rate": 3.513157894736842e-06, "loss": 0.4435, "step": 133 }, { "Batch Mean": 3.2184605598449707, "accuracy": 0.78125, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 8.552702903747559, "learning_rate": 3.5e-06, "loss": 0.4817, "step": 134 }, { "Batch Mean": 3.3773207664489746, "accuracy": 0.8046875, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 8.001891136169434, "learning_rate": 3.486842105263158e-06, "loss": 0.4724, "step": 135 }, { "Batch Mean": 2.997677803039551, "accuracy": 0.75, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 9.054019927978516, "learning_rate": 3.473684210526316e-06, "loss": 0.4988, "step": 136 }, { "Batch Mean": 2.929046154022217, "accuracy": 0.8125, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 8.104907989501953, "learning_rate": 3.460526315789474e-06, "loss": 0.4243, "step": 137 }, { "Batch Mean": 2.75551176071167, "accuracy": 0.8125, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 8.473715782165527, "learning_rate": 3.447368421052632e-06, "loss": 0.4427, "step": 138 }, { "Batch Mean": 2.5024139881134033, "accuracy": 0.7890625, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 9.045927047729492, "learning_rate": 3.43421052631579e-06, "loss": 0.4255, "step": 139 }, { "Batch Mean": 2.417640209197998, "accuracy": 0.796875, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 7.710657119750977, "learning_rate": 3.421052631578948e-06, "loss": 0.443, "step": 140 }, { "Batch Mean": 2.505678653717041, "accuracy": 0.7890625, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 7.663838863372803, "learning_rate": 3.4078947368421057e-06, "loss": 0.4049, "step": 141 }, { "Batch Mean": 3.0147056579589844, "accuracy": 0.75, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 8.560474395751953, "learning_rate": 3.3947368421052636e-06, "loss": 0.5149, "step": 142 }, { "Batch Mean": 2.588562488555908, "accuracy": 0.7578125, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 8.25622272491455, "learning_rate": 3.381578947368421e-06, "loss": 0.4934, "step": 143 }, { "Batch Mean": 2.6298677921295166, "accuracy": 0.828125, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 7.083525657653809, "learning_rate": 3.368421052631579e-06, "loss": 0.4115, "step": 144 }, { "Batch Mean": 2.7647485733032227, "accuracy": 0.796875, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 6.853006362915039, "learning_rate": 3.355263157894737e-06, "loss": 0.4469, "step": 145 }, { "Batch Mean": 2.769733190536499, "accuracy": 0.78125, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 7.704352855682373, "learning_rate": 3.342105263157895e-06, "loss": 0.4753, "step": 146 }, { "Batch Mean": 3.3755264282226562, "accuracy": 0.765625, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 7.600855350494385, "learning_rate": 3.3289473684210528e-06, "loss": 0.46, "step": 147 }, { "Batch Mean": 2.6412248611450195, "accuracy": 0.7578125, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 8.489302635192871, "learning_rate": 3.3157894736842107e-06, "loss": 0.5111, "step": 148 }, { "Batch Mean": 2.835446357727051, "accuracy": 0.7421875, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 7.343207359313965, "learning_rate": 3.302631578947369e-06, "loss": 0.5032, "step": 149 }, { "Batch Mean": 2.8000075817108154, "accuracy": 0.8203125, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.448639869689941, "learning_rate": 3.289473684210527e-06, "loss": 0.4456, "step": 150 }, { "Batch Mean": 2.8405256271362305, "accuracy": 0.7890625, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 6.841508865356445, "learning_rate": 3.276315789473685e-06, "loss": 0.4306, "step": 151 }, { "Batch Mean": 2.7483787536621094, "accuracy": 0.796875, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 7.562384128570557, "learning_rate": 3.2631578947368423e-06, "loss": 0.5062, "step": 152 }, { "Batch Mean": 2.8838186264038086, "accuracy": 0.7734375, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 6.867684841156006, "learning_rate": 3.2500000000000002e-06, "loss": 0.447, "step": 153 }, { "Batch Mean": 2.926253318786621, "accuracy": 0.7421875, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 8.004990577697754, "learning_rate": 3.236842105263158e-06, "loss": 0.464, "step": 154 }, { "Batch Mean": 3.1090264320373535, "accuracy": 0.8125, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 6.201688766479492, "learning_rate": 3.223684210526316e-06, "loss": 0.365, "step": 155 }, { "Batch Mean": 3.2874813079833984, "accuracy": 0.8515625, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 6.851440906524658, "learning_rate": 3.210526315789474e-06, "loss": 0.3668, "step": 156 }, { "Batch Mean": 3.5518083572387695, "accuracy": 0.8125, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 7.0716023445129395, "learning_rate": 3.197368421052632e-06, "loss": 0.3978, "step": 157 }, { "Batch Mean": 3.0985703468322754, "accuracy": 0.7734375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 8.339123725891113, "learning_rate": 3.1842105263157898e-06, "loss": 0.3998, "step": 158 }, { "Batch Mean": 3.281148910522461, "accuracy": 0.7890625, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.356124877929688, "learning_rate": 3.1710526315789477e-06, "loss": 0.4491, "step": 159 }, { "Batch Mean": 3.783808708190918, "accuracy": 0.796875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 9.633821487426758, "learning_rate": 3.157894736842105e-06, "loss": 0.432, "step": 160 }, { "Batch Mean": 3.9359283447265625, "accuracy": 0.78125, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 8.254521369934082, "learning_rate": 3.144736842105263e-06, "loss": 0.3891, "step": 161 }, { "Batch Mean": 4.607031345367432, "accuracy": 0.7265625, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 9.970298767089844, "learning_rate": 3.131578947368421e-06, "loss": 0.4982, "step": 162 }, { "Batch Mean": 4.176942825317383, "accuracy": 0.8125, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 12.374481201171875, "learning_rate": 3.1184210526315793e-06, "loss": 0.4349, "step": 163 }, { "Batch Mean": 4.015748977661133, "accuracy": 0.8046875, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 11.244656562805176, "learning_rate": 3.1052631578947372e-06, "loss": 0.421, "step": 164 }, { "Batch Mean": 4.22213888168335, "accuracy": 0.7578125, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 11.150482177734375, "learning_rate": 3.092105263157895e-06, "loss": 0.4978, "step": 165 }, { "Batch Mean": 3.7064056396484375, "accuracy": 0.78125, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 9.92993450164795, "learning_rate": 3.078947368421053e-06, "loss": 0.3748, "step": 166 }, { "Batch Mean": 4.073935508728027, "accuracy": 0.765625, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 9.726029396057129, "learning_rate": 3.065789473684211e-06, "loss": 0.3952, "step": 167 }, { "Batch Mean": 3.741079807281494, "accuracy": 0.796875, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 10.072457313537598, "learning_rate": 3.052631578947369e-06, "loss": 0.4475, "step": 168 }, { "Batch Mean": 3.423218011856079, "accuracy": 0.734375, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 9.74998950958252, "learning_rate": 3.0394736842105268e-06, "loss": 0.4507, "step": 169 }, { "Batch Mean": 3.3439035415649414, "accuracy": 0.78125, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 9.590579986572266, "learning_rate": 3.0263157894736843e-06, "loss": 0.4641, "step": 170 }, { "Batch Mean": 3.2388744354248047, "accuracy": 0.8828125, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 7.974471569061279, "learning_rate": 3.013157894736842e-06, "loss": 0.2888, "step": 171 }, { "Batch Mean": 2.912121295928955, "accuracy": 0.828125, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 8.418153762817383, "learning_rate": 3e-06, "loss": 0.4083, "step": 172 }, { "Batch Mean": 2.921308994293213, "accuracy": 0.7734375, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 9.274598121643066, "learning_rate": 2.986842105263158e-06, "loss": 0.4237, "step": 173 }, { "Batch Mean": 2.707150459289551, "accuracy": 0.765625, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 10.08448314666748, "learning_rate": 2.973684210526316e-06, "loss": 0.4084, "step": 174 }, { "Batch Mean": 3.289186477661133, "accuracy": 0.78125, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 10.216947555541992, "learning_rate": 2.960526315789474e-06, "loss": 0.4265, "step": 175 }, { "Batch Mean": 3.0034594535827637, "accuracy": 0.734375, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 9.921527862548828, "learning_rate": 2.9473684210526317e-06, "loss": 0.4555, "step": 176 }, { "Batch Mean": 3.1962521076202393, "accuracy": 0.796875, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 9.140130043029785, "learning_rate": 2.93421052631579e-06, "loss": 0.433, "step": 177 }, { "Batch Mean": 3.105267286300659, "accuracy": 0.8359375, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 8.577014923095703, "learning_rate": 2.921052631578948e-06, "loss": 0.3558, "step": 178 }, { "Batch Mean": 3.05171799659729, "accuracy": 0.8515625, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 9.902350425720215, "learning_rate": 2.907894736842106e-06, "loss": 0.3446, "step": 179 }, { "Batch Mean": 2.8749771118164062, "accuracy": 0.7734375, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 8.572197914123535, "learning_rate": 2.8947368421052634e-06, "loss": 0.4343, "step": 180 }, { "Batch Mean": 3.5201892852783203, "accuracy": 0.7734375, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 9.538017272949219, "learning_rate": 2.8815789473684213e-06, "loss": 0.4501, "step": 181 }, { "Batch Mean": 3.441112518310547, "accuracy": 0.8203125, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 8.626726150512695, "learning_rate": 2.868421052631579e-06, "loss": 0.4032, "step": 182 }, { "Batch Mean": 3.4753060340881348, "accuracy": 0.7109375, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 11.793852806091309, "learning_rate": 2.855263157894737e-06, "loss": 0.5783, "step": 183 }, { "Batch Mean": 3.088036060333252, "accuracy": 0.765625, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 9.752645492553711, "learning_rate": 2.842105263157895e-06, "loss": 0.4706, "step": 184 }, { "Batch Mean": 2.640962600708008, "accuracy": 0.71875, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 9.9888277053833, "learning_rate": 2.828947368421053e-06, "loss": 0.5268, "step": 185 }, { "Batch Mean": 2.834287405014038, "accuracy": 0.765625, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 8.2410306930542, "learning_rate": 2.815789473684211e-06, "loss": 0.4375, "step": 186 }, { "Batch Mean": 2.8364157676696777, "accuracy": 0.765625, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 7.653415679931641, "learning_rate": 2.8026315789473683e-06, "loss": 0.3908, "step": 187 }, { "Batch Mean": 2.0791072845458984, "accuracy": 0.78125, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 6.701351165771484, "learning_rate": 2.789473684210526e-06, "loss": 0.3761, "step": 188 }, { "Batch Mean": 2.188058853149414, "accuracy": 0.8125, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 7.052455425262451, "learning_rate": 2.776315789473684e-06, "loss": 0.4348, "step": 189 }, { "Batch Mean": 2.0506675243377686, "accuracy": 0.828125, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 7.033082962036133, "learning_rate": 2.7631578947368424e-06, "loss": 0.3651, "step": 190 }, { "Batch Mean": 2.3128931522369385, "accuracy": 0.8203125, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 7.838496685028076, "learning_rate": 2.7500000000000004e-06, "loss": 0.3981, "step": 191 }, { "Batch Mean": 2.032851219177246, "accuracy": 0.875, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 6.573253631591797, "learning_rate": 2.7368421052631583e-06, "loss": 0.3628, "step": 192 }, { "Batch Mean": 2.3109028339385986, "accuracy": 0.796875, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 8.022283554077148, "learning_rate": 2.723684210526316e-06, "loss": 0.4369, "step": 193 }, { "Batch Mean": 2.6557040214538574, "accuracy": 0.7734375, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 8.877519607543945, "learning_rate": 2.710526315789474e-06, "loss": 0.4504, "step": 194 }, { "Batch Mean": 2.9424595832824707, "accuracy": 0.75, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 7.829299449920654, "learning_rate": 2.697368421052632e-06, "loss": 0.4254, "step": 195 }, { "Batch Mean": 2.644308090209961, "accuracy": 0.7578125, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 9.070138931274414, "learning_rate": 2.68421052631579e-06, "loss": 0.4922, "step": 196 }, { "Batch Mean": 2.4906630516052246, "accuracy": 0.765625, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 9.63489818572998, "learning_rate": 2.6710526315789474e-06, "loss": 0.4602, "step": 197 }, { "Batch Mean": 2.2591946125030518, "accuracy": 0.796875, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 8.234044075012207, "learning_rate": 2.6578947368421053e-06, "loss": 0.398, "step": 198 }, { "Batch Mean": 2.6223649978637695, "accuracy": 0.8125, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 7.958437442779541, "learning_rate": 2.644736842105263e-06, "loss": 0.3909, "step": 199 }, { "Batch Mean": 2.5589590072631836, "accuracy": 0.875, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 7.556171417236328, "learning_rate": 2.631578947368421e-06, "loss": 0.3513, "step": 200 }, { "Batch Mean": 2.3716750144958496, "accuracy": 0.7265625, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 9.392867088317871, "learning_rate": 2.618421052631579e-06, "loss": 0.4831, "step": 201 }, { "Batch Mean": 2.27783203125, "accuracy": 0.8203125, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 8.140517234802246, "learning_rate": 2.605263157894737e-06, "loss": 0.3929, "step": 202 }, { "Batch Mean": 2.535848617553711, "accuracy": 0.84375, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 6.947089195251465, "learning_rate": 2.592105263157895e-06, "loss": 0.3478, "step": 203 }, { "Batch Mean": 2.651693344116211, "accuracy": 0.8046875, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 8.759857177734375, "learning_rate": 2.578947368421053e-06, "loss": 0.4076, "step": 204 }, { "Batch Mean": 2.571043014526367, "accuracy": 0.8046875, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 9.00507926940918, "learning_rate": 2.565789473684211e-06, "loss": 0.3996, "step": 205 }, { "Batch Mean": 2.7687175273895264, "accuracy": 0.75, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 10.412758827209473, "learning_rate": 2.552631578947369e-06, "loss": 0.5306, "step": 206 }, { "Batch Mean": 2.5285263061523438, "accuracy": 0.84375, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 7.771270751953125, "learning_rate": 2.5394736842105265e-06, "loss": 0.3507, "step": 207 }, { "Batch Mean": 2.688607692718506, "accuracy": 0.8359375, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 8.726874351501465, "learning_rate": 2.5263157894736844e-06, "loss": 0.4068, "step": 208 }, { "Batch Mean": 2.94492244720459, "accuracy": 0.8125, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 8.50943660736084, "learning_rate": 2.5131578947368423e-06, "loss": 0.3774, "step": 209 }, { "Batch Mean": 2.7862486839294434, "accuracy": 0.796875, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 8.98309326171875, "learning_rate": 2.5e-06, "loss": 0.4266, "step": 210 }, { "Batch Mean": 2.61392879486084, "accuracy": 0.765625, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 10.080408096313477, "learning_rate": 2.486842105263158e-06, "loss": 0.4663, "step": 211 }, { "Batch Mean": 2.9034132957458496, "accuracy": 0.8203125, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 8.575652122497559, "learning_rate": 2.473684210526316e-06, "loss": 0.3926, "step": 212 }, { "Batch Mean": 2.8661410808563232, "accuracy": 0.75, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 10.517654418945312, "learning_rate": 2.460526315789474e-06, "loss": 0.5594, "step": 213 }, { "Batch Mean": 3.062936782836914, "accuracy": 0.7578125, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 9.07347583770752, "learning_rate": 2.447368421052632e-06, "loss": 0.433, "step": 214 }, { "Batch Mean": 2.8995747566223145, "accuracy": 0.8125, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 9.181224822998047, "learning_rate": 2.4342105263157898e-06, "loss": 0.4336, "step": 215 }, { "Batch Mean": 2.7490530014038086, "accuracy": 0.7421875, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 9.431602478027344, "learning_rate": 2.4210526315789477e-06, "loss": 0.4583, "step": 216 }, { "Batch Mean": 2.801156520843506, "accuracy": 0.7734375, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 8.67341423034668, "learning_rate": 2.4078947368421056e-06, "loss": 0.4219, "step": 217 }, { "Batch Mean": 2.785233497619629, "accuracy": 0.8125, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 8.614130973815918, "learning_rate": 2.3947368421052635e-06, "loss": 0.4214, "step": 218 }, { "Batch Mean": 2.322535514831543, "accuracy": 0.8046875, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 8.403548240661621, "learning_rate": 2.381578947368421e-06, "loss": 0.425, "step": 219 }, { "Batch Mean": 2.722550392150879, "accuracy": 0.8515625, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 8.205376625061035, "learning_rate": 2.368421052631579e-06, "loss": 0.3821, "step": 220 }, { "Batch Mean": 2.360077142715454, "accuracy": 0.7734375, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 9.49571418762207, "learning_rate": 2.355263157894737e-06, "loss": 0.4661, "step": 221 }, { "Batch Mean": 3.0066745281219482, "accuracy": 0.7265625, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 9.534995079040527, "learning_rate": 2.342105263157895e-06, "loss": 0.5174, "step": 222 }, { "Batch Mean": 2.9200942516326904, "accuracy": 0.8359375, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 7.970925331115723, "learning_rate": 2.328947368421053e-06, "loss": 0.4063, "step": 223 }, { "Batch Mean": 3.1014528274536133, "accuracy": 0.8359375, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 7.537857532501221, "learning_rate": 2.3157894736842105e-06, "loss": 0.3926, "step": 224 }, { "Batch Mean": 2.5743775367736816, "accuracy": 0.7890625, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 8.670877456665039, "learning_rate": 2.3026315789473684e-06, "loss": 0.4287, "step": 225 }, { "Batch Mean": 3.103597640991211, "accuracy": 0.8359375, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 7.964511871337891, "learning_rate": 2.2894736842105263e-06, "loss": 0.3888, "step": 226 }, { "Batch Mean": 3.0492055416107178, "accuracy": 0.859375, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 7.54857873916626, "learning_rate": 2.2763157894736847e-06, "loss": 0.3208, "step": 227 }, { "Batch Mean": 3.119175434112549, "accuracy": 0.7734375, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 7.656146049499512, "learning_rate": 2.2631578947368426e-06, "loss": 0.4383, "step": 228 }, { "Batch Mean": 2.846890449523926, "accuracy": 0.75, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 9.830333709716797, "learning_rate": 2.25e-06, "loss": 0.5166, "step": 229 }, { "Batch Mean": 2.3330116271972656, "accuracy": 0.84375, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 9.917115211486816, "learning_rate": 2.236842105263158e-06, "loss": 0.4154, "step": 230 }, { "Batch Mean": 3.0601701736450195, "accuracy": 0.8046875, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 8.37639331817627, "learning_rate": 2.223684210526316e-06, "loss": 0.4456, "step": 231 }, { "Batch Mean": 2.553410530090332, "accuracy": 0.78125, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 8.588013648986816, "learning_rate": 2.2105263157894738e-06, "loss": 0.4204, "step": 232 }, { "Batch Mean": 2.790724754333496, "accuracy": 0.8359375, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 8.133465766906738, "learning_rate": 2.1973684210526317e-06, "loss": 0.3772, "step": 233 }, { "Batch Mean": 2.7968082427978516, "accuracy": 0.84375, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 8.12721061706543, "learning_rate": 2.1842105263157896e-06, "loss": 0.3624, "step": 234 }, { "Batch Mean": 3.189887523651123, "accuracy": 0.8046875, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 8.726601600646973, "learning_rate": 2.1710526315789475e-06, "loss": 0.4139, "step": 235 }, { "Batch Mean": 3.4596574306488037, "accuracy": 0.8515625, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 8.571916580200195, "learning_rate": 2.1578947368421054e-06, "loss": 0.4047, "step": 236 }, { "Batch Mean": 3.403088092803955, "accuracy": 0.8671875, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 8.29578971862793, "learning_rate": 2.1447368421052633e-06, "loss": 0.3556, "step": 237 }, { "Batch Mean": 2.97831654548645, "accuracy": 0.796875, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 9.06924819946289, "learning_rate": 2.1315789473684212e-06, "loss": 0.394, "step": 238 }, { "Batch Mean": 3.729241371154785, "accuracy": 0.8515625, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 9.479219436645508, "learning_rate": 2.118421052631579e-06, "loss": 0.3934, "step": 239 }, { "Batch Mean": 3.851926803588867, "accuracy": 0.78125, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 11.203536033630371, "learning_rate": 2.105263157894737e-06, "loss": 0.4106, "step": 240 }, { "Batch Mean": 3.493197441101074, "accuracy": 0.8046875, "epoch": 0.6, "step": 240 }, { "epoch": 0.6025, "grad_norm": 8.762469291687012, "learning_rate": 2.092105263157895e-06, "loss": 0.4018, "step": 241 }, { "Batch Mean": 3.858477830886841, "accuracy": 0.828125, "epoch": 0.6025, "step": 241 }, { "epoch": 0.605, "grad_norm": 8.721271514892578, "learning_rate": 2.078947368421053e-06, "loss": 0.4044, "step": 242 }, { "Batch Mean": 3.1943435668945312, "accuracy": 0.8828125, "epoch": 0.605, "step": 242 }, { "epoch": 0.6075, "grad_norm": 8.710159301757812, "learning_rate": 2.0657894736842108e-06, "loss": 0.3548, "step": 243 }, { "Batch Mean": 3.7947511672973633, "accuracy": 0.8125, "epoch": 0.6075, "step": 243 }, { "epoch": 0.61, "grad_norm": 9.211752891540527, "learning_rate": 2.0526315789473687e-06, "loss": 0.4265, "step": 244 }, { "Batch Mean": 3.470452308654785, "accuracy": 0.8046875, "epoch": 0.61, "step": 244 }, { "epoch": 0.6125, "grad_norm": 9.422188758850098, "learning_rate": 2.0394736842105266e-06, "loss": 0.4187, "step": 245 }, { "Batch Mean": 3.5571398735046387, "accuracy": 0.71875, "epoch": 0.6125, "step": 245 }, { "epoch": 0.615, "grad_norm": 13.342325210571289, "learning_rate": 2.026315789473684e-06, "loss": 0.5917, "step": 246 }, { "Batch Mean": 3.462411880493164, "accuracy": 0.8125, "epoch": 0.615, "step": 246 }, { "epoch": 0.6175, "grad_norm": 8.84332275390625, "learning_rate": 2.013157894736842e-06, "loss": 0.3884, "step": 247 }, { "Batch Mean": 3.284522533416748, "accuracy": 0.8203125, "epoch": 0.6175, "step": 247 }, { "epoch": 0.62, "grad_norm": 8.868474006652832, "learning_rate": 2.0000000000000003e-06, "loss": 0.3962, "step": 248 }, { "Batch Mean": 3.2781238555908203, "accuracy": 0.7421875, "epoch": 0.62, "step": 248 }, { "epoch": 0.6225, "grad_norm": 9.684708595275879, "learning_rate": 1.9868421052631582e-06, "loss": 0.4578, "step": 249 }, { "Batch Mean": 3.135952949523926, "accuracy": 0.796875, "epoch": 0.6225, "step": 249 }, { "epoch": 0.625, "grad_norm": 8.415375709533691, "learning_rate": 1.973684210526316e-06, "loss": 0.3912, "step": 250 }, { "Batch Mean": 3.135913848876953, "accuracy": 0.8203125, "epoch": 0.625, "step": 250 }, { "epoch": 0.6275, "grad_norm": 8.018439292907715, "learning_rate": 1.9605263157894736e-06, "loss": 0.3714, "step": 251 }, { "Batch Mean": 3.310336112976074, "accuracy": 0.7578125, "epoch": 0.6275, "step": 251 }, { "epoch": 0.63, "grad_norm": 8.3823881149292, "learning_rate": 1.9473684210526315e-06, "loss": 0.427, "step": 252 }, { "Batch Mean": 3.3237199783325195, "accuracy": 0.796875, "epoch": 0.63, "step": 252 }, { "epoch": 0.6325, "grad_norm": 8.449406623840332, "learning_rate": 1.9342105263157895e-06, "loss": 0.4142, "step": 253 }, { "Batch Mean": 3.327472686767578, "accuracy": 0.796875, "epoch": 0.6325, "step": 253 }, { "epoch": 0.635, "grad_norm": 8.559859275817871, "learning_rate": 1.9210526315789474e-06, "loss": 0.3855, "step": 254 }, { "Batch Mean": 3.2711634635925293, "accuracy": 0.6875, "epoch": 0.635, "step": 254 }, { "epoch": 0.6375, "grad_norm": 10.444122314453125, "learning_rate": 1.9078947368421057e-06, "loss": 0.5446, "step": 255 }, { "Batch Mean": 3.2955322265625, "accuracy": 0.734375, "epoch": 0.6375, "step": 255 }, { "epoch": 0.64, "grad_norm": 9.275879859924316, "learning_rate": 1.8947368421052634e-06, "loss": 0.4615, "step": 256 }, { "Batch Mean": 3.2368879318237305, "accuracy": 0.8046875, "epoch": 0.64, "step": 256 }, { "epoch": 0.6425, "grad_norm": 9.692377090454102, "learning_rate": 1.8815789473684213e-06, "loss": 0.4704, "step": 257 }, { "Batch Mean": 2.976322650909424, "accuracy": 0.8515625, "epoch": 0.6425, "step": 257 }, { "epoch": 0.645, "grad_norm": 8.025856018066406, "learning_rate": 1.868421052631579e-06, "loss": 0.3788, "step": 258 }, { "Batch Mean": 2.6816248893737793, "accuracy": 0.78125, "epoch": 0.645, "step": 258 }, { "epoch": 0.6475, "grad_norm": 8.220409393310547, "learning_rate": 1.855263157894737e-06, "loss": 0.4221, "step": 259 }, { "Batch Mean": 2.9761343002319336, "accuracy": 0.828125, "epoch": 0.6475, "step": 259 }, { "epoch": 0.65, "grad_norm": 7.726944923400879, "learning_rate": 1.8421052631578948e-06, "loss": 0.443, "step": 260 }, { "Batch Mean": 2.5574710369110107, "accuracy": 0.859375, "epoch": 0.65, "step": 260 }, { "epoch": 0.6525, "grad_norm": 7.004358768463135, "learning_rate": 1.828947368421053e-06, "loss": 0.3621, "step": 261 }, { "Batch Mean": 2.958151340484619, "accuracy": 0.8203125, "epoch": 0.6525, "step": 261 }, { "epoch": 0.655, "grad_norm": 7.445678234100342, "learning_rate": 1.8157894736842109e-06, "loss": 0.4151, "step": 262 }, { "Batch Mean": 2.1886234283447266, "accuracy": 0.78125, "epoch": 0.655, "step": 262 }, { "epoch": 0.6575, "grad_norm": 8.080365180969238, "learning_rate": 1.8026315789473685e-06, "loss": 0.462, "step": 263 }, { "Batch Mean": 2.1878843307495117, "accuracy": 0.765625, "epoch": 0.6575, "step": 263 }, { "epoch": 0.66, "grad_norm": 7.28476095199585, "learning_rate": 1.7894736842105265e-06, "loss": 0.4611, "step": 264 }, { "Batch Mean": 2.1238341331481934, "accuracy": 0.734375, "epoch": 0.66, "step": 264 }, { "epoch": 0.6625, "grad_norm": 7.6339216232299805, "learning_rate": 1.7763157894736844e-06, "loss": 0.449, "step": 265 }, { "Batch Mean": 2.631845474243164, "accuracy": 0.7265625, "epoch": 0.6625, "step": 265 }, { "epoch": 0.665, "grad_norm": 8.975438117980957, "learning_rate": 1.7631578947368423e-06, "loss": 0.5264, "step": 266 }, { "Batch Mean": 2.501516342163086, "accuracy": 0.8515625, "epoch": 0.665, "step": 266 }, { "epoch": 0.6675, "grad_norm": 8.181595802307129, "learning_rate": 1.75e-06, "loss": 0.3499, "step": 267 }, { "Batch Mean": 2.091623067855835, "accuracy": 0.78125, "epoch": 0.6675, "step": 267 }, { "epoch": 0.67, "grad_norm": 7.984980583190918, "learning_rate": 1.736842105263158e-06, "loss": 0.4639, "step": 268 }, { "Batch Mean": 3.048328399658203, "accuracy": 0.84375, "epoch": 0.67, "step": 268 }, { "epoch": 0.6725, "grad_norm": 7.468845844268799, "learning_rate": 1.723684210526316e-06, "loss": 0.3586, "step": 269 }, { "Batch Mean": 2.925644874572754, "accuracy": 0.8046875, "epoch": 0.6725, "step": 269 }, { "epoch": 0.675, "grad_norm": 8.669090270996094, "learning_rate": 1.710526315789474e-06, "loss": 0.4307, "step": 270 }, { "Batch Mean": 2.9262943267822266, "accuracy": 0.7890625, "epoch": 0.675, "step": 270 }, { "epoch": 0.6775, "grad_norm": 7.891000747680664, "learning_rate": 1.6973684210526318e-06, "loss": 0.4119, "step": 271 }, { "Batch Mean": 3.2007436752319336, "accuracy": 0.765625, "epoch": 0.6775, "step": 271 }, { "epoch": 0.68, "grad_norm": 10.188142776489258, "learning_rate": 1.6842105263157895e-06, "loss": 0.468, "step": 272 }, { "Batch Mean": 3.3460845947265625, "accuracy": 0.8203125, "epoch": 0.68, "step": 272 }, { "epoch": 0.6825, "grad_norm": 8.278077125549316, "learning_rate": 1.6710526315789474e-06, "loss": 0.3438, "step": 273 }, { "Batch Mean": 3.009209156036377, "accuracy": 0.78125, "epoch": 0.6825, "step": 273 }, { "epoch": 0.685, "grad_norm": 8.755949974060059, "learning_rate": 1.6578947368421053e-06, "loss": 0.4507, "step": 274 }, { "Batch Mean": 3.3076977729797363, "accuracy": 0.8125, "epoch": 0.685, "step": 274 }, { "epoch": 0.6875, "grad_norm": 8.441435813903809, "learning_rate": 1.6447368421052635e-06, "loss": 0.3988, "step": 275 }, { "Batch Mean": 3.170098304748535, "accuracy": 0.828125, "epoch": 0.6875, "step": 275 }, { "epoch": 0.69, "grad_norm": 9.062217712402344, "learning_rate": 1.6315789473684212e-06, "loss": 0.4722, "step": 276 }, { "Batch Mean": 3.1647844314575195, "accuracy": 0.7578125, "epoch": 0.69, "step": 276 }, { "epoch": 0.6925, "grad_norm": 9.467844009399414, "learning_rate": 1.618421052631579e-06, "loss": 0.503, "step": 277 }, { "Batch Mean": 3.3005905151367188, "accuracy": 0.78125, "epoch": 0.6925, "step": 277 }, { "epoch": 0.695, "grad_norm": 9.51032829284668, "learning_rate": 1.605263157894737e-06, "loss": 0.4315, "step": 278 }, { "Batch Mean": 3.296581983566284, "accuracy": 0.8203125, "epoch": 0.695, "step": 278 }, { "epoch": 0.6975, "grad_norm": 8.343262672424316, "learning_rate": 1.5921052631578949e-06, "loss": 0.3729, "step": 279 }, { "Batch Mean": 2.9995474815368652, "accuracy": 0.859375, "epoch": 0.6975, "step": 279 }, { "epoch": 0.7, "grad_norm": 7.270707130432129, "learning_rate": 1.5789473684210526e-06, "loss": 0.3277, "step": 280 }, { "Batch Mean": 3.0474696159362793, "accuracy": 0.796875, "epoch": 0.7, "step": 280 }, { "epoch": 0.7025, "grad_norm": 8.632308006286621, "learning_rate": 1.5657894736842105e-06, "loss": 0.4516, "step": 281 }, { "Batch Mean": 3.151510238647461, "accuracy": 0.828125, "epoch": 0.7025, "step": 281 }, { "epoch": 0.705, "grad_norm": 8.563387870788574, "learning_rate": 1.5526315789473686e-06, "loss": 0.3927, "step": 282 }, { "Batch Mean": 3.457594871520996, "accuracy": 0.7890625, "epoch": 0.705, "step": 282 }, { "epoch": 0.7075, "grad_norm": 8.905447006225586, "learning_rate": 1.5394736842105265e-06, "loss": 0.4408, "step": 283 }, { "Batch Mean": 3.3107197284698486, "accuracy": 0.8203125, "epoch": 0.7075, "step": 283 }, { "epoch": 0.71, "grad_norm": 8.482126235961914, "learning_rate": 1.5263157894736844e-06, "loss": 0.3398, "step": 284 }, { "Batch Mean": 3.2536354064941406, "accuracy": 0.8359375, "epoch": 0.71, "step": 284 }, { "epoch": 0.7125, "grad_norm": 8.584724426269531, "learning_rate": 1.5131578947368421e-06, "loss": 0.3777, "step": 285 }, { "Batch Mean": 3.492779493331909, "accuracy": 0.765625, "epoch": 0.7125, "step": 285 }, { "epoch": 0.715, "grad_norm": 9.266121864318848, "learning_rate": 1.5e-06, "loss": 0.4202, "step": 286 }, { "Batch Mean": 3.656635284423828, "accuracy": 0.828125, "epoch": 0.715, "step": 286 }, { "epoch": 0.7175, "grad_norm": 9.95947265625, "learning_rate": 1.486842105263158e-06, "loss": 0.3748, "step": 287 }, { "Batch Mean": 3.3843679428100586, "accuracy": 0.8125, "epoch": 0.7175, "step": 287 }, { "epoch": 0.72, "grad_norm": 8.991701126098633, "learning_rate": 1.4736842105263159e-06, "loss": 0.4783, "step": 288 }, { "Batch Mean": 3.573545455932617, "accuracy": 0.8359375, "epoch": 0.72, "step": 288 }, { "epoch": 0.7225, "grad_norm": 8.646162986755371, "learning_rate": 1.460526315789474e-06, "loss": 0.3234, "step": 289 }, { "Batch Mean": 3.7222461700439453, "accuracy": 0.7890625, "epoch": 0.7225, "step": 289 }, { "epoch": 0.725, "grad_norm": 9.578656196594238, "learning_rate": 1.4473684210526317e-06, "loss": 0.4172, "step": 290 }, { "Batch Mean": 4.053102493286133, "accuracy": 0.7734375, "epoch": 0.725, "step": 290 }, { "epoch": 0.7275, "grad_norm": 8.825078010559082, "learning_rate": 1.4342105263157896e-06, "loss": 0.433, "step": 291 }, { "Batch Mean": 3.838489532470703, "accuracy": 0.78125, "epoch": 0.7275, "step": 291 }, { "epoch": 0.73, "grad_norm": 10.636845588684082, "learning_rate": 1.4210526315789475e-06, "loss": 0.4634, "step": 292 }, { "Batch Mean": 3.395310401916504, "accuracy": 0.75, "epoch": 0.73, "step": 292 }, { "epoch": 0.7325, "grad_norm": 9.59333324432373, "learning_rate": 1.4078947368421054e-06, "loss": 0.5072, "step": 293 }, { "Batch Mean": 3.9789938926696777, "accuracy": 0.796875, "epoch": 0.7325, "step": 293 }, { "epoch": 0.735, "grad_norm": 10.251858711242676, "learning_rate": 1.394736842105263e-06, "loss": 0.4102, "step": 294 }, { "Batch Mean": 3.9914820194244385, "accuracy": 0.8046875, "epoch": 0.735, "step": 294 }, { "epoch": 0.7375, "grad_norm": 8.596433639526367, "learning_rate": 1.3815789473684212e-06, "loss": 0.3908, "step": 295 }, { "Batch Mean": 3.6803340911865234, "accuracy": 0.8046875, "epoch": 0.7375, "step": 295 }, { "epoch": 0.74, "grad_norm": 8.609084129333496, "learning_rate": 1.3684210526315791e-06, "loss": 0.3763, "step": 296 }, { "Batch Mean": 4.072849273681641, "accuracy": 0.8359375, "epoch": 0.74, "step": 296 }, { "epoch": 0.7425, "grad_norm": 9.057376861572266, "learning_rate": 1.355263157894737e-06, "loss": 0.4094, "step": 297 }, { "Batch Mean": 3.7248973846435547, "accuracy": 0.8203125, "epoch": 0.7425, "step": 297 }, { "epoch": 0.745, "grad_norm": 8.338115692138672, "learning_rate": 1.342105263157895e-06, "loss": 0.3808, "step": 298 }, { "Batch Mean": 3.901179313659668, "accuracy": 0.84375, "epoch": 0.745, "step": 298 }, { "epoch": 0.7475, "grad_norm": 7.799005508422852, "learning_rate": 1.3289473684210526e-06, "loss": 0.3699, "step": 299 }, { "Batch Mean": 3.617119789123535, "accuracy": 0.8046875, "epoch": 0.7475, "step": 299 }, { "epoch": 0.75, "grad_norm": 8.738370895385742, "learning_rate": 1.3157894736842106e-06, "loss": 0.3772, "step": 300 }, { "Batch Mean": 4.020530700683594, "accuracy": 0.8125, "epoch": 0.75, "step": 300 }, { "epoch": 0.7525, "grad_norm": 9.246763229370117, "learning_rate": 1.3026315789473685e-06, "loss": 0.4276, "step": 301 }, { "Batch Mean": 3.279233455657959, "accuracy": 0.8203125, "epoch": 0.7525, "step": 301 }, { "epoch": 0.755, "grad_norm": 9.489370346069336, "learning_rate": 1.2894736842105266e-06, "loss": 0.4182, "step": 302 }, { "Batch Mean": 3.3999757766723633, "accuracy": 0.7578125, "epoch": 0.755, "step": 302 }, { "epoch": 0.7575, "grad_norm": 10.291142463684082, "learning_rate": 1.2763157894736845e-06, "loss": 0.5, "step": 303 }, { "Batch Mean": 3.3867852687835693, "accuracy": 0.8515625, "epoch": 0.7575, "step": 303 }, { "epoch": 0.76, "grad_norm": 8.608928680419922, "learning_rate": 1.2631578947368422e-06, "loss": 0.3503, "step": 304 }, { "Batch Mean": 3.8447773456573486, "accuracy": 0.796875, "epoch": 0.76, "step": 304 }, { "epoch": 0.7625, "grad_norm": 9.144750595092773, "learning_rate": 1.25e-06, "loss": 0.3895, "step": 305 }, { "Batch Mean": 3.3639984130859375, "accuracy": 0.8203125, "epoch": 0.7625, "step": 305 }, { "epoch": 0.765, "grad_norm": 8.691869735717773, "learning_rate": 1.236842105263158e-06, "loss": 0.4406, "step": 306 }, { "Batch Mean": 3.6499900817871094, "accuracy": 0.7421875, "epoch": 0.765, "step": 306 }, { "epoch": 0.7675, "grad_norm": 10.274083137512207, "learning_rate": 1.223684210526316e-06, "loss": 0.5364, "step": 307 }, { "Batch Mean": 3.308014392852783, "accuracy": 0.8046875, "epoch": 0.7675, "step": 307 }, { "epoch": 0.77, "grad_norm": 8.63004207611084, "learning_rate": 1.2105263157894738e-06, "loss": 0.4308, "step": 308 }, { "Batch Mean": 3.2193620204925537, "accuracy": 0.7734375, "epoch": 0.77, "step": 308 }, { "epoch": 0.7725, "grad_norm": 8.222301483154297, "learning_rate": 1.1973684210526317e-06, "loss": 0.4106, "step": 309 }, { "Batch Mean": 3.185535430908203, "accuracy": 0.828125, "epoch": 0.7725, "step": 309 }, { "epoch": 0.775, "grad_norm": 8.892534255981445, "learning_rate": 1.1842105263157894e-06, "loss": 0.4403, "step": 310 }, { "Batch Mean": 3.350971221923828, "accuracy": 0.78125, "epoch": 0.775, "step": 310 }, { "epoch": 0.7775, "grad_norm": 8.056893348693848, "learning_rate": 1.1710526315789476e-06, "loss": 0.4403, "step": 311 }, { "Batch Mean": 3.1580140590667725, "accuracy": 0.8203125, "epoch": 0.7775, "step": 311 }, { "epoch": 0.78, "grad_norm": 8.281316757202148, "learning_rate": 1.1578947368421053e-06, "loss": 0.3817, "step": 312 }, { "Batch Mean": 3.2041165828704834, "accuracy": 0.796875, "epoch": 0.78, "step": 312 }, { "epoch": 0.7825, "grad_norm": 8.931872367858887, "learning_rate": 1.1447368421052632e-06, "loss": 0.4469, "step": 313 }, { "Batch Mean": 3.207402229309082, "accuracy": 0.859375, "epoch": 0.7825, "step": 313 }, { "epoch": 0.785, "grad_norm": 7.213859558105469, "learning_rate": 1.1315789473684213e-06, "loss": 0.3417, "step": 314 }, { "Batch Mean": 2.9738054275512695, "accuracy": 0.796875, "epoch": 0.785, "step": 314 }, { "epoch": 0.7875, "grad_norm": 8.266810417175293, "learning_rate": 1.118421052631579e-06, "loss": 0.4323, "step": 315 }, { "Batch Mean": 3.0165514945983887, "accuracy": 0.8046875, "epoch": 0.7875, "step": 315 }, { "epoch": 0.79, "grad_norm": 8.453869819641113, "learning_rate": 1.1052631578947369e-06, "loss": 0.4218, "step": 316 }, { "Batch Mean": 3.1318092346191406, "accuracy": 0.8359375, "epoch": 0.79, "step": 316 }, { "epoch": 0.7925, "grad_norm": 7.634886264801025, "learning_rate": 1.0921052631578948e-06, "loss": 0.3659, "step": 317 }, { "Batch Mean": 3.035651206970215, "accuracy": 0.796875, "epoch": 0.7925, "step": 317 }, { "epoch": 0.795, "grad_norm": 8.668999671936035, "learning_rate": 1.0789473684210527e-06, "loss": 0.4173, "step": 318 }, { "Batch Mean": 3.0223705768585205, "accuracy": 0.84375, "epoch": 0.795, "step": 318 }, { "epoch": 0.7975, "grad_norm": 8.644440650939941, "learning_rate": 1.0657894736842106e-06, "loss": 0.3693, "step": 319 }, { "Batch Mean": 3.43692684173584, "accuracy": 0.78125, "epoch": 0.7975, "step": 319 }, { "epoch": 0.8, "grad_norm": 9.56282901763916, "learning_rate": 1.0526315789473685e-06, "loss": 0.3924, "step": 320 }, { "Batch Mean": 3.548732280731201, "accuracy": 0.7578125, "epoch": 0.8, "step": 320 }, { "epoch": 0.8025, "grad_norm": 10.664633750915527, "learning_rate": 1.0394736842105264e-06, "loss": 0.441, "step": 321 }, { "Batch Mean": 3.7864296436309814, "accuracy": 0.8359375, "epoch": 0.8025, "step": 321 }, { "epoch": 0.805, "grad_norm": 8.181385040283203, "learning_rate": 1.0263157894736843e-06, "loss": 0.3418, "step": 322 }, { "Batch Mean": 3.4767401218414307, "accuracy": 0.7421875, "epoch": 0.805, "step": 322 }, { "epoch": 0.8075, "grad_norm": 11.198949813842773, "learning_rate": 1.013157894736842e-06, "loss": 0.522, "step": 323 }, { "Batch Mean": 3.49471116065979, "accuracy": 0.8203125, "epoch": 0.8075, "step": 323 }, { "epoch": 0.81, "grad_norm": 8.482062339782715, "learning_rate": 1.0000000000000002e-06, "loss": 0.3941, "step": 324 }, { "Batch Mean": 3.611042022705078, "accuracy": 0.796875, "epoch": 0.81, "step": 324 }, { "epoch": 0.8125, "grad_norm": 8.56822395324707, "learning_rate": 9.86842105263158e-07, "loss": 0.3957, "step": 325 }, { "Batch Mean": 3.8978066444396973, "accuracy": 0.8046875, "epoch": 0.8125, "step": 325 }, { "epoch": 0.815, "grad_norm": 8.866768836975098, "learning_rate": 9.736842105263158e-07, "loss": 0.3559, "step": 326 }, { "Batch Mean": 3.9494504928588867, "accuracy": 0.8125, "epoch": 0.815, "step": 326 }, { "epoch": 0.8175, "grad_norm": 9.350076675415039, "learning_rate": 9.605263157894737e-07, "loss": 0.3916, "step": 327 }, { "Batch Mean": 3.7682600021362305, "accuracy": 0.84375, "epoch": 0.8175, "step": 327 }, { "epoch": 0.82, "grad_norm": 8.726659774780273, "learning_rate": 9.473684210526317e-07, "loss": 0.3649, "step": 328 }, { "Batch Mean": 3.6107935905456543, "accuracy": 0.8203125, "epoch": 0.82, "step": 328 }, { "epoch": 0.8225, "grad_norm": 9.7432861328125, "learning_rate": 9.342105263157895e-07, "loss": 0.3862, "step": 329 }, { "Batch Mean": 3.9763107299804688, "accuracy": 0.875, "epoch": 0.8225, "step": 329 }, { "epoch": 0.825, "grad_norm": 9.048009872436523, "learning_rate": 9.210526315789474e-07, "loss": 0.3611, "step": 330 }, { "Batch Mean": 3.72268009185791, "accuracy": 0.78125, "epoch": 0.825, "step": 330 }, { "epoch": 0.8275, "grad_norm": 10.039306640625, "learning_rate": 9.078947368421054e-07, "loss": 0.4104, "step": 331 }, { "Batch Mean": 4.0824432373046875, "accuracy": 0.7734375, "epoch": 0.8275, "step": 331 }, { "epoch": 0.83, "grad_norm": 9.71216869354248, "learning_rate": 8.947368421052632e-07, "loss": 0.4676, "step": 332 }, { "Batch Mean": 4.328099250793457, "accuracy": 0.75, "epoch": 0.83, "step": 332 }, { "epoch": 0.8325, "grad_norm": 12.236089706420898, "learning_rate": 8.815789473684211e-07, "loss": 0.536, "step": 333 }, { "Batch Mean": 4.269603252410889, "accuracy": 0.8515625, "epoch": 0.8325, "step": 333 }, { "epoch": 0.835, "grad_norm": 8.661575317382812, "learning_rate": 8.68421052631579e-07, "loss": 0.3387, "step": 334 }, { "Batch Mean": 4.419895172119141, "accuracy": 0.796875, "epoch": 0.835, "step": 334 }, { "epoch": 0.8375, "grad_norm": 9.375877380371094, "learning_rate": 8.55263157894737e-07, "loss": 0.3868, "step": 335 }, { "Batch Mean": 4.138504981994629, "accuracy": 0.8125, "epoch": 0.8375, "step": 335 }, { "epoch": 0.84, "grad_norm": 10.736989974975586, "learning_rate": 8.421052631578948e-07, "loss": 0.4086, "step": 336 }, { "Batch Mean": 3.9145679473876953, "accuracy": 0.828125, "epoch": 0.84, "step": 336 }, { "epoch": 0.8425, "grad_norm": 9.300797462463379, "learning_rate": 8.289473684210527e-07, "loss": 0.3738, "step": 337 }, { "Batch Mean": 3.9161176681518555, "accuracy": 0.828125, "epoch": 0.8425, "step": 337 }, { "epoch": 0.845, "grad_norm": 8.363279342651367, "learning_rate": 8.157894736842106e-07, "loss": 0.35, "step": 338 }, { "Batch Mean": 4.362409591674805, "accuracy": 0.8359375, "epoch": 0.845, "step": 338 }, { "epoch": 0.8475, "grad_norm": 8.345438003540039, "learning_rate": 8.026315789473685e-07, "loss": 0.359, "step": 339 }, { "Batch Mean": 4.674463272094727, "accuracy": 0.7890625, "epoch": 0.8475, "step": 339 }, { "epoch": 0.85, "grad_norm": 10.6575345993042, "learning_rate": 7.894736842105263e-07, "loss": 0.4622, "step": 340 }, { "Batch Mean": 4.491846084594727, "accuracy": 0.7421875, "epoch": 0.85, "step": 340 }, { "epoch": 0.8525, "grad_norm": 10.247350692749023, "learning_rate": 7.763157894736843e-07, "loss": 0.4286, "step": 341 }, { "Batch Mean": 4.197032928466797, "accuracy": 0.8984375, "epoch": 0.8525, "step": 341 }, { "epoch": 0.855, "grad_norm": 7.982635974884033, "learning_rate": 7.631578947368422e-07, "loss": 0.2775, "step": 342 }, { "Batch Mean": 4.466989517211914, "accuracy": 0.796875, "epoch": 0.855, "step": 342 }, { "epoch": 0.8575, "grad_norm": 9.864999771118164, "learning_rate": 7.5e-07, "loss": 0.4152, "step": 343 }, { "Batch Mean": 4.683363914489746, "accuracy": 0.8203125, "epoch": 0.8575, "step": 343 }, { "epoch": 0.86, "grad_norm": 9.847129821777344, "learning_rate": 7.368421052631579e-07, "loss": 0.4182, "step": 344 }, { "Batch Mean": 4.1891889572143555, "accuracy": 0.765625, "epoch": 0.86, "step": 344 }, { "epoch": 0.8625, "grad_norm": 10.039787292480469, "learning_rate": 7.236842105263158e-07, "loss": 0.43, "step": 345 }, { "Batch Mean": 4.3862714767456055, "accuracy": 0.7578125, "epoch": 0.8625, "step": 345 }, { "epoch": 0.865, "grad_norm": 10.894869804382324, "learning_rate": 7.105263157894737e-07, "loss": 0.4852, "step": 346 }, { "Batch Mean": 4.637632369995117, "accuracy": 0.7109375, "epoch": 0.865, "step": 346 }, { "epoch": 0.8675, "grad_norm": 11.730110168457031, "learning_rate": 6.973684210526316e-07, "loss": 0.5448, "step": 347 }, { "Batch Mean": 4.121967315673828, "accuracy": 0.7890625, "epoch": 0.8675, "step": 347 }, { "epoch": 0.87, "grad_norm": 10.164827346801758, "learning_rate": 6.842105263157896e-07, "loss": 0.4483, "step": 348 }, { "Batch Mean": 4.091468811035156, "accuracy": 0.8046875, "epoch": 0.87, "step": 348 }, { "epoch": 0.8725, "grad_norm": 10.1940336227417, "learning_rate": 6.710526315789475e-07, "loss": 0.4399, "step": 349 }, { "Batch Mean": 3.927487373352051, "accuracy": 0.7890625, "epoch": 0.8725, "step": 349 }, { "epoch": 0.875, "grad_norm": 9.758942604064941, "learning_rate": 6.578947368421053e-07, "loss": 0.4649, "step": 350 }, { "Batch Mean": 3.7871651649475098, "accuracy": 0.796875, "epoch": 0.875, "step": 350 }, { "epoch": 0.8775, "grad_norm": 8.651691436767578, "learning_rate": 6.447368421052633e-07, "loss": 0.387, "step": 351 }, { "Batch Mean": 3.8574633598327637, "accuracy": 0.703125, "epoch": 0.8775, "step": 351 }, { "epoch": 0.88, "grad_norm": 11.066187858581543, "learning_rate": 6.315789473684211e-07, "loss": 0.5628, "step": 352 }, { "Batch Mean": 3.776508331298828, "accuracy": 0.78125, "epoch": 0.88, "step": 352 }, { "epoch": 0.8825, "grad_norm": 9.336889266967773, "learning_rate": 6.18421052631579e-07, "loss": 0.4243, "step": 353 }, { "Batch Mean": 3.574608564376831, "accuracy": 0.8046875, "epoch": 0.8825, "step": 353 }, { "epoch": 0.885, "grad_norm": 8.771798133850098, "learning_rate": 6.052631578947369e-07, "loss": 0.4037, "step": 354 }, { "Batch Mean": 3.912416934967041, "accuracy": 0.8203125, "epoch": 0.885, "step": 354 }, { "epoch": 0.8875, "grad_norm": 8.526622772216797, "learning_rate": 5.921052631578947e-07, "loss": 0.3547, "step": 355 }, { "Batch Mean": 3.8269214630126953, "accuracy": 0.8359375, "epoch": 0.8875, "step": 355 }, { "epoch": 0.89, "grad_norm": 7.636133193969727, "learning_rate": 5.789473684210526e-07, "loss": 0.3949, "step": 356 }, { "Batch Mean": 3.300734758377075, "accuracy": 0.7890625, "epoch": 0.89, "step": 356 }, { "epoch": 0.8925, "grad_norm": 7.370996475219727, "learning_rate": 5.657894736842106e-07, "loss": 0.3966, "step": 357 }, { "Batch Mean": 3.461149215698242, "accuracy": 0.828125, "epoch": 0.8925, "step": 357 }, { "epoch": 0.895, "grad_norm": 7.379176616668701, "learning_rate": 5.526315789473684e-07, "loss": 0.3687, "step": 358 }, { "Batch Mean": 3.815633773803711, "accuracy": 0.8359375, "epoch": 0.895, "step": 358 }, { "epoch": 0.8975, "grad_norm": 7.581010818481445, "learning_rate": 5.394736842105264e-07, "loss": 0.3686, "step": 359 }, { "Batch Mean": 3.77066707611084, "accuracy": 0.8203125, "epoch": 0.8975, "step": 359 }, { "epoch": 0.9, "grad_norm": 7.628429412841797, "learning_rate": 5.263157894736843e-07, "loss": 0.3584, "step": 360 }, { "Batch Mean": 3.6309967041015625, "accuracy": 0.8125, "epoch": 0.9, "step": 360 }, { "epoch": 0.9025, "grad_norm": 7.521507740020752, "learning_rate": 5.131578947368422e-07, "loss": 0.3737, "step": 361 }, { "Batch Mean": 3.9186553955078125, "accuracy": 0.8046875, "epoch": 0.9025, "step": 361 }, { "epoch": 0.905, "grad_norm": 8.632046699523926, "learning_rate": 5.000000000000001e-07, "loss": 0.4121, "step": 362 }, { "Batch Mean": 3.8692808151245117, "accuracy": 0.796875, "epoch": 0.905, "step": 362 }, { "epoch": 0.9075, "grad_norm": 7.9635844230651855, "learning_rate": 4.868421052631579e-07, "loss": 0.3815, "step": 363 }, { "Batch Mean": 3.5908093452453613, "accuracy": 0.8125, "epoch": 0.9075, "step": 363 }, { "epoch": 0.91, "grad_norm": 8.613454818725586, "learning_rate": 4.7368421052631585e-07, "loss": 0.4303, "step": 364 }, { "Batch Mean": 3.655755043029785, "accuracy": 0.8046875, "epoch": 0.91, "step": 364 }, { "epoch": 0.9125, "grad_norm": 7.534533500671387, "learning_rate": 4.605263157894737e-07, "loss": 0.4097, "step": 365 }, { "Batch Mean": 3.880359649658203, "accuracy": 0.8125, "epoch": 0.9125, "step": 365 }, { "epoch": 0.915, "grad_norm": 7.979903221130371, "learning_rate": 4.473684210526316e-07, "loss": 0.3947, "step": 366 }, { "Batch Mean": 3.6699132919311523, "accuracy": 0.8359375, "epoch": 0.915, "step": 366 }, { "epoch": 0.9175, "grad_norm": 7.939827919006348, "learning_rate": 4.342105263157895e-07, "loss": 0.4096, "step": 367 }, { "Batch Mean": 3.707386016845703, "accuracy": 0.8125, "epoch": 0.9175, "step": 367 }, { "epoch": 0.92, "grad_norm": 8.204850196838379, "learning_rate": 4.210526315789474e-07, "loss": 0.3746, "step": 368 }, { "Batch Mean": 3.640469551086426, "accuracy": 0.828125, "epoch": 0.92, "step": 368 }, { "epoch": 0.9225, "grad_norm": 8.378007888793945, "learning_rate": 4.078947368421053e-07, "loss": 0.3763, "step": 369 }, { "Batch Mean": 3.862360954284668, "accuracy": 0.828125, "epoch": 0.9225, "step": 369 }, { "epoch": 0.925, "grad_norm": 8.362809181213379, "learning_rate": 3.9473684210526315e-07, "loss": 0.3806, "step": 370 }, { "Batch Mean": 3.9354522228240967, "accuracy": 0.8125, "epoch": 0.925, "step": 370 }, { "epoch": 0.9275, "grad_norm": 8.232060432434082, "learning_rate": 3.815789473684211e-07, "loss": 0.3913, "step": 371 }, { "Batch Mean": 3.9101943969726562, "accuracy": 0.828125, "epoch": 0.9275, "step": 371 }, { "epoch": 0.93, "grad_norm": 9.147234916687012, "learning_rate": 3.6842105263157896e-07, "loss": 0.365, "step": 372 }, { "Batch Mean": 4.02094841003418, "accuracy": 0.8125, "epoch": 0.93, "step": 372 }, { "epoch": 0.9325, "grad_norm": 8.020033836364746, "learning_rate": 3.5526315789473687e-07, "loss": 0.3612, "step": 373 }, { "Batch Mean": 4.072836399078369, "accuracy": 0.7421875, "epoch": 0.9325, "step": 373 }, { "epoch": 0.935, "grad_norm": 10.13514232635498, "learning_rate": 3.421052631578948e-07, "loss": 0.4718, "step": 374 }, { "Batch Mean": 3.845078229904175, "accuracy": 0.84375, "epoch": 0.935, "step": 374 }, { "epoch": 0.9375, "grad_norm": 7.570011615753174, "learning_rate": 3.2894736842105264e-07, "loss": 0.3368, "step": 375 }, { "Batch Mean": 4.410332679748535, "accuracy": 0.859375, "epoch": 0.9375, "step": 375 }, { "epoch": 0.94, "grad_norm": 8.095381736755371, "learning_rate": 3.1578947368421055e-07, "loss": 0.3759, "step": 376 }, { "Batch Mean": 3.809807062149048, "accuracy": 0.84375, "epoch": 0.94, "step": 376 }, { "epoch": 0.9425, "grad_norm": 8.025097846984863, "learning_rate": 3.0263157894736846e-07, "loss": 0.3845, "step": 377 }, { "Batch Mean": 3.961945056915283, "accuracy": 0.8359375, "epoch": 0.9425, "step": 377 }, { "epoch": 0.945, "grad_norm": 7.539624214172363, "learning_rate": 2.894736842105263e-07, "loss": 0.3791, "step": 378 }, { "Batch Mean": 4.050400257110596, "accuracy": 0.78125, "epoch": 0.945, "step": 378 }, { "epoch": 0.9475, "grad_norm": 9.320887565612793, "learning_rate": 2.763157894736842e-07, "loss": 0.452, "step": 379 }, { "Batch Mean": 4.2832231521606445, "accuracy": 0.8671875, "epoch": 0.9475, "step": 379 }, { "epoch": 0.95, "grad_norm": 7.826888561248779, "learning_rate": 2.6315789473684213e-07, "loss": 0.3447, "step": 380 }, { "Batch Mean": 3.8477821350097656, "accuracy": 0.8046875, "epoch": 0.95, "step": 380 }, { "epoch": 0.9525, "grad_norm": 8.854127883911133, "learning_rate": 2.5000000000000004e-07, "loss": 0.4303, "step": 381 }, { "Batch Mean": 3.8454036712646484, "accuracy": 0.828125, "epoch": 0.9525, "step": 381 }, { "epoch": 0.955, "grad_norm": 8.916884422302246, "learning_rate": 2.3684210526315792e-07, "loss": 0.3829, "step": 382 }, { "Batch Mean": 4.330540657043457, "accuracy": 0.796875, "epoch": 0.955, "step": 382 }, { "epoch": 0.9575, "grad_norm": 8.480066299438477, "learning_rate": 2.236842105263158e-07, "loss": 0.3876, "step": 383 }, { "Batch Mean": 3.9993972778320312, "accuracy": 0.796875, "epoch": 0.9575, "step": 383 }, { "epoch": 0.96, "grad_norm": 8.800076484680176, "learning_rate": 2.105263157894737e-07, "loss": 0.4004, "step": 384 }, { "Batch Mean": 4.047536849975586, "accuracy": 0.7734375, "epoch": 0.96, "step": 384 }, { "epoch": 0.9625, "grad_norm": 10.25916862487793, "learning_rate": 1.9736842105263157e-07, "loss": 0.4702, "step": 385 }, { "Batch Mean": 3.8366403579711914, "accuracy": 0.8125, "epoch": 0.9625, "step": 385 }, { "epoch": 0.965, "grad_norm": 10.083771705627441, "learning_rate": 1.8421052631578948e-07, "loss": 0.4467, "step": 386 }, { "Batch Mean": 4.2874650955200195, "accuracy": 0.828125, "epoch": 0.965, "step": 386 }, { "epoch": 0.9675, "grad_norm": 8.051718711853027, "learning_rate": 1.710526315789474e-07, "loss": 0.4134, "step": 387 }, { "Batch Mean": 4.313050270080566, "accuracy": 0.8671875, "epoch": 0.9675, "step": 387 }, { "epoch": 0.97, "grad_norm": 7.1868462562561035, "learning_rate": 1.5789473684210527e-07, "loss": 0.2999, "step": 388 }, { "Batch Mean": 4.171314716339111, "accuracy": 0.84375, "epoch": 0.97, "step": 388 }, { "epoch": 0.9725, "grad_norm": 8.633392333984375, "learning_rate": 1.4473684210526316e-07, "loss": 0.3498, "step": 389 }, { "Batch Mean": 4.292914390563965, "accuracy": 0.8359375, "epoch": 0.9725, "step": 389 }, { "epoch": 0.975, "grad_norm": 8.538396835327148, "learning_rate": 1.3157894736842107e-07, "loss": 0.3814, "step": 390 }, { "Batch Mean": 4.413171768188477, "accuracy": 0.8203125, "epoch": 0.975, "step": 390 }, { "epoch": 0.9775, "grad_norm": 9.03473949432373, "learning_rate": 1.1842105263157896e-07, "loss": 0.408, "step": 391 }, { "Batch Mean": 4.147255897521973, "accuracy": 0.7734375, "epoch": 0.9775, "step": 391 }, { "epoch": 0.98, "grad_norm": 9.122368812561035, "learning_rate": 1.0526315789473685e-07, "loss": 0.4259, "step": 392 }, { "Batch Mean": 4.238164901733398, "accuracy": 0.78125, "epoch": 0.98, "step": 392 }, { "epoch": 0.9825, "grad_norm": 8.9277982711792, "learning_rate": 9.210526315789474e-08, "loss": 0.3768, "step": 393 }, { "Batch Mean": 4.0150299072265625, "accuracy": 0.7734375, "epoch": 0.9825, "step": 393 }, { "epoch": 0.985, "grad_norm": 9.743870735168457, "learning_rate": 7.894736842105264e-08, "loss": 0.4516, "step": 394 }, { "Batch Mean": 4.45374870300293, "accuracy": 0.796875, "epoch": 0.985, "step": 394 }, { "epoch": 0.9875, "grad_norm": 8.603337287902832, "learning_rate": 6.578947368421053e-08, "loss": 0.4033, "step": 395 }, { "Batch Mean": 4.043880462646484, "accuracy": 0.7578125, "epoch": 0.9875, "step": 395 }, { "epoch": 0.99, "grad_norm": 9.147683143615723, "learning_rate": 5.263157894736842e-08, "loss": 0.4357, "step": 396 }, { "Batch Mean": 3.763970375061035, "accuracy": 0.8046875, "epoch": 0.99, "step": 396 }, { "epoch": 0.9925, "grad_norm": 9.61628532409668, "learning_rate": 3.947368421052632e-08, "loss": 0.442, "step": 397 }, { "Batch Mean": 4.027120113372803, "accuracy": 0.8046875, "epoch": 0.9925, "step": 397 }, { "epoch": 0.995, "grad_norm": 8.325051307678223, "learning_rate": 2.631578947368421e-08, "loss": 0.3942, "step": 398 }, { "Batch Mean": 3.6679060459136963, "accuracy": 0.75, "epoch": 0.995, "step": 398 }, { "epoch": 0.9975, "grad_norm": 9.915172576904297, "learning_rate": 1.3157894736842106e-08, "loss": 0.4845, "step": 399 }, { "Batch Mean": 4.056756019592285, "accuracy": 0.78125, "epoch": 0.9975, "step": 399 }, { "epoch": 1.0, "grad_norm": 9.02087688446045, "learning_rate": 0.0, "loss": 0.3811, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }