{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 3.64324951171875, "accuracy": 0.6015625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 6.657606601715088, "learning_rate": 2.5000000000000004e-07, "loss": 0.6742, "step": 1 }, { "Batch Mean": 3.58660888671875, "accuracy": 0.5, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 5.2591233253479, "learning_rate": 5.000000000000001e-07, "loss": 0.6808, "step": 2 }, { "Batch Mean": 3.56298828125, "accuracy": 0.546875, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 5.481376647949219, "learning_rate": 7.5e-07, "loss": 0.6846, "step": 3 }, { "Batch Mean": 3.60186767578125, "accuracy": 0.5625, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 7.51620626449585, "learning_rate": 1.0000000000000002e-06, "loss": 0.6892, "step": 4 }, { "Batch Mean": 3.57855224609375, "accuracy": 0.453125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 8.899469375610352, "learning_rate": 1.25e-06, "loss": 0.6956, "step": 5 }, { "Batch Mean": 3.595703125, "accuracy": 0.5546875, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 6.726147174835205, "learning_rate": 1.5e-06, "loss": 0.6839, "step": 6 }, { "Batch Mean": 3.64105224609375, "accuracy": 0.453125, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 7.116271018981934, "learning_rate": 1.75e-06, "loss": 0.7154, "step": 7 }, { "Batch Mean": 3.6541748046875, "accuracy": 0.5546875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.190150260925293, "learning_rate": 2.0000000000000003e-06, "loss": 0.6844, "step": 8 }, { "Batch Mean": 3.6507568359375, "accuracy": 0.6015625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 7.176676273345947, "learning_rate": 2.25e-06, "loss": 0.665, "step": 9 }, { "Batch Mean": 3.6649169921875, "accuracy": 0.59375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 5.139154434204102, "learning_rate": 2.5e-06, "loss": 0.6626, "step": 10 }, { "Batch Mean": 3.70733642578125, "accuracy": 0.671875, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 5.621807098388672, "learning_rate": 2.7500000000000004e-06, "loss": 0.6299, "step": 11 }, { "Batch Mean": 3.74017333984375, "accuracy": 0.65625, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 5.459871768951416, "learning_rate": 3e-06, "loss": 0.6321, "step": 12 }, { "Batch Mean": 3.8514404296875, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 7.757749557495117, "learning_rate": 3.2500000000000002e-06, "loss": 0.6453, "step": 13 }, { "Batch Mean": 3.80059814453125, "accuracy": 0.703125, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 6.681384086608887, "learning_rate": 3.5e-06, "loss": 0.5769, "step": 14 }, { "Batch Mean": 3.8978271484375, "accuracy": 0.6640625, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 10.110990524291992, "learning_rate": 3.7500000000000005e-06, "loss": 0.6354, "step": 15 }, { "Batch Mean": 3.6146774291992188, "accuracy": 0.71875, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 10.233904838562012, "learning_rate": 4.000000000000001e-06, "loss": 0.6182, "step": 16 }, { "Batch Mean": 3.5331335067749023, "accuracy": 0.6328125, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 10.035528182983398, "learning_rate": 4.25e-06, "loss": 0.6252, "step": 17 }, { "Batch Mean": 3.0357871055603027, "accuracy": 0.625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 9.972918510437012, "learning_rate": 4.5e-06, "loss": 0.6225, "step": 18 }, { "Batch Mean": 2.6747031211853027, "accuracy": 0.6796875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 21.3759765625, "learning_rate": 4.75e-06, "loss": 0.6328, "step": 19 }, { "Batch Mean": 2.418215751647949, "accuracy": 0.6875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 10.712898254394531, "learning_rate": 5e-06, "loss": 0.6352, "step": 20 }, { "Batch Mean": 1.9824256896972656, "accuracy": 0.640625, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 10.833212852478027, "learning_rate": 4.986842105263158e-06, "loss": 0.6244, "step": 21 }, { "Batch Mean": 1.5567502975463867, "accuracy": 0.6484375, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 8.710196495056152, "learning_rate": 4.973684210526316e-06, "loss": 0.6002, "step": 22 }, { "Batch Mean": 1.262591004371643, "accuracy": 0.6484375, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 9.207337379455566, "learning_rate": 4.960526315789474e-06, "loss": 0.5754, "step": 23 }, { "Batch Mean": 0.9924072027206421, "accuracy": 0.7109375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 7.179128170013428, "learning_rate": 4.947368421052632e-06, "loss": 0.5446, "step": 24 }, { "Batch Mean": 0.7883305549621582, "accuracy": 0.6484375, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 7.777095794677734, "learning_rate": 4.9342105263157895e-06, "loss": 0.6383, "step": 25 }, { "Batch Mean": 0.7204087972640991, "accuracy": 0.7109375, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 6.316252708435059, "learning_rate": 4.921052631578948e-06, "loss": 0.5416, "step": 26 }, { "Batch Mean": 0.7001075744628906, "accuracy": 0.75, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 6.2172017097473145, "learning_rate": 4.907894736842106e-06, "loss": 0.5603, "step": 27 }, { "Batch Mean": 0.4948960542678833, "accuracy": 0.6640625, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.071000576019287, "learning_rate": 4.894736842105264e-06, "loss": 0.5919, "step": 28 }, { "Batch Mean": 0.6193783283233643, "accuracy": 0.609375, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 7.123428821563721, "learning_rate": 4.881578947368422e-06, "loss": 0.6265, "step": 29 }, { "Batch Mean": 0.5463962554931641, "accuracy": 0.6640625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 8.785526275634766, "learning_rate": 4.8684210526315795e-06, "loss": 0.6251, "step": 30 }, { "Batch Mean": 0.6883676052093506, "accuracy": 0.7265625, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 6.22755241394043, "learning_rate": 4.855263157894737e-06, "loss": 0.5505, "step": 31 }, { "Batch Mean": 0.8646153211593628, "accuracy": 0.71875, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 5.980678558349609, "learning_rate": 4.842105263157895e-06, "loss": 0.5817, "step": 32 }, { "Batch Mean": 0.9955297708511353, "accuracy": 0.7109375, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 5.80774450302124, "learning_rate": 4.828947368421053e-06, "loss": 0.5697, "step": 33 }, { "Batch Mean": 1.163506031036377, "accuracy": 0.7421875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 7.277930736541748, "learning_rate": 4.815789473684211e-06, "loss": 0.5299, "step": 34 }, { "Batch Mean": 1.314648151397705, "accuracy": 0.7109375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 6.664306163787842, "learning_rate": 4.802631578947369e-06, "loss": 0.5548, "step": 35 }, { "Batch Mean": 1.4656352996826172, "accuracy": 0.7421875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 6.886595249176025, "learning_rate": 4.789473684210527e-06, "loss": 0.5432, "step": 36 }, { "Batch Mean": 1.5227254629135132, "accuracy": 0.7109375, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 6.418576717376709, "learning_rate": 4.7763157894736844e-06, "loss": 0.5536, "step": 37 }, { "Batch Mean": 1.631667137145996, "accuracy": 0.71875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 8.560834884643555, "learning_rate": 4.763157894736842e-06, "loss": 0.569, "step": 38 }, { "Batch Mean": 1.6476902961730957, "accuracy": 0.734375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.121230125427246, "learning_rate": 4.75e-06, "loss": 0.5418, "step": 39 }, { "Batch Mean": 1.492204189300537, "accuracy": 0.71875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 7.698504447937012, "learning_rate": 4.736842105263158e-06, "loss": 0.5079, "step": 40 }, { "Batch Mean": 1.3397908210754395, "accuracy": 0.75, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 8.164704322814941, "learning_rate": 4.723684210526316e-06, "loss": 0.5291, "step": 41 }, { "Batch Mean": 1.4944896697998047, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.740182399749756, "learning_rate": 4.710526315789474e-06, "loss": 0.5375, "step": 42 }, { "Batch Mean": 1.322850227355957, "accuracy": 0.765625, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.922670841217041, "learning_rate": 4.697368421052632e-06, "loss": 0.4777, "step": 43 }, { "Batch Mean": 1.4683116674423218, "accuracy": 0.7578125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 8.393726348876953, "learning_rate": 4.68421052631579e-06, "loss": 0.4728, "step": 44 }, { "Batch Mean": 1.6116371154785156, "accuracy": 0.7109375, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 9.989266395568848, "learning_rate": 4.671052631578948e-06, "loss": 0.5428, "step": 45 }, { "Batch Mean": 1.52450692653656, "accuracy": 0.7890625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 9.131291389465332, "learning_rate": 4.657894736842106e-06, "loss": 0.4236, "step": 46 }, { "Batch Mean": 1.776949405670166, "accuracy": 0.71875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 11.912744522094727, "learning_rate": 4.6447368421052635e-06, "loss": 0.6295, "step": 47 }, { "Batch Mean": 2.3817920684814453, "accuracy": 0.6875, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 10.917041778564453, "learning_rate": 4.631578947368421e-06, "loss": 0.5316, "step": 48 }, { "Batch Mean": 2.175340175628662, "accuracy": 0.7734375, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 8.514415740966797, "learning_rate": 4.618421052631579e-06, "loss": 0.4724, "step": 49 }, { "Batch Mean": 2.521066665649414, "accuracy": 0.65625, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 10.435226440429688, "learning_rate": 4.605263157894737e-06, "loss": 0.6361, "step": 50 }, { "Batch Mean": 2.40045166015625, "accuracy": 0.7578125, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.130827903747559, "learning_rate": 4.592105263157895e-06, "loss": 0.4949, "step": 51 }, { "Batch Mean": 2.7120556831359863, "accuracy": 0.6953125, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.832071304321289, "learning_rate": 4.578947368421053e-06, "loss": 0.5509, "step": 52 }, { "Batch Mean": 2.6350326538085938, "accuracy": 0.7109375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 6.527810096740723, "learning_rate": 4.565789473684211e-06, "loss": 0.5334, "step": 53 }, { "Batch Mean": 2.8320467472076416, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 6.686796188354492, "learning_rate": 4.552631578947369e-06, "loss": 0.505, "step": 54 }, { "Batch Mean": 2.5793724060058594, "accuracy": 0.71875, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 6.586344242095947, "learning_rate": 4.539473684210527e-06, "loss": 0.5145, "step": 55 }, { "Batch Mean": 2.652639389038086, "accuracy": 0.7890625, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.232706069946289, "learning_rate": 4.526315789473685e-06, "loss": 0.4847, "step": 56 }, { "Batch Mean": 2.7974865436553955, "accuracy": 0.671875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 9.864455223083496, "learning_rate": 4.513157894736843e-06, "loss": 0.5848, "step": 57 }, { "Batch Mean": 2.7397356033325195, "accuracy": 0.7734375, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 6.654571056365967, "learning_rate": 4.5e-06, "loss": 0.5149, "step": 58 }, { "Batch Mean": 3.020786762237549, "accuracy": 0.7109375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 7.521582126617432, "learning_rate": 4.4868421052631584e-06, "loss": 0.532, "step": 59 }, { "Batch Mean": 3.0784976482391357, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 7.8287272453308105, "learning_rate": 4.473684210526316e-06, "loss": 0.5201, "step": 60 }, { "Batch Mean": 3.2929041385650635, "accuracy": 0.6875, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 7.036741733551025, "learning_rate": 4.460526315789474e-06, "loss": 0.5192, "step": 61 }, { "Batch Mean": 3.2737629413604736, "accuracy": 0.7734375, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.064910888671875, "learning_rate": 4.447368421052632e-06, "loss": 0.5094, "step": 62 }, { "Batch Mean": 3.0026721954345703, "accuracy": 0.75, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.189529418945312, "learning_rate": 4.43421052631579e-06, "loss": 0.4838, "step": 63 }, { "Batch Mean": 3.0598621368408203, "accuracy": 0.7109375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 7.452142238616943, "learning_rate": 4.4210526315789476e-06, "loss": 0.5297, "step": 64 }, { "Batch Mean": 2.890104293823242, "accuracy": 0.734375, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 7.773412227630615, "learning_rate": 4.407894736842105e-06, "loss": 0.4938, "step": 65 }, { "Batch Mean": 2.8558566570281982, "accuracy": 0.7890625, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 7.888664722442627, "learning_rate": 4.394736842105263e-06, "loss": 0.5426, "step": 66 }, { "Batch Mean": 2.8415894508361816, "accuracy": 0.7265625, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.992714881896973, "learning_rate": 4.381578947368421e-06, "loss": 0.5278, "step": 67 }, { "Batch Mean": 2.5067033767700195, "accuracy": 0.71875, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 7.569831848144531, "learning_rate": 4.368421052631579e-06, "loss": 0.497, "step": 68 }, { "Batch Mean": 2.690906047821045, "accuracy": 0.7890625, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 8.406615257263184, "learning_rate": 4.3552631578947375e-06, "loss": 0.5383, "step": 69 }, { "Batch Mean": 2.54428768157959, "accuracy": 0.7890625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 6.022793292999268, "learning_rate": 4.342105263157895e-06, "loss": 0.4619, "step": 70 }, { "Batch Mean": 2.539039134979248, "accuracy": 0.75, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 5.931001663208008, "learning_rate": 4.328947368421053e-06, "loss": 0.4643, "step": 71 }, { "Batch Mean": 2.3624706268310547, "accuracy": 0.75, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.277031898498535, "learning_rate": 4.315789473684211e-06, "loss": 0.5027, "step": 72 }, { "Batch Mean": 2.5795602798461914, "accuracy": 0.7734375, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.181740760803223, "learning_rate": 4.302631578947369e-06, "loss": 0.4652, "step": 73 }, { "Batch Mean": 2.328335762023926, "accuracy": 0.7265625, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 6.926029205322266, "learning_rate": 4.289473684210527e-06, "loss": 0.4554, "step": 74 }, { "Batch Mean": 2.195453643798828, "accuracy": 0.7109375, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.934150695800781, "learning_rate": 4.276315789473684e-06, "loss": 0.5458, "step": 75 }, { "Batch Mean": 2.7026095390319824, "accuracy": 0.7421875, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 7.034529209136963, "learning_rate": 4.2631578947368425e-06, "loss": 0.5041, "step": 76 }, { "Batch Mean": 2.611628532409668, "accuracy": 0.734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.741730213165283, "learning_rate": 4.25e-06, "loss": 0.4804, "step": 77 }, { "Batch Mean": 2.5273165702819824, "accuracy": 0.765625, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.251089572906494, "learning_rate": 4.236842105263158e-06, "loss": 0.4959, "step": 78 }, { "Batch Mean": 2.5558762550354004, "accuracy": 0.7109375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 7.240261077880859, "learning_rate": 4.223684210526316e-06, "loss": 0.5173, "step": 79 }, { "Batch Mean": 2.2066729068756104, "accuracy": 0.78125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 7.0953474044799805, "learning_rate": 4.210526315789474e-06, "loss": 0.5174, "step": 80 }, { "Batch Mean": 2.377552032470703, "accuracy": 0.796875, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 7.204663276672363, "learning_rate": 4.197368421052632e-06, "loss": 0.4628, "step": 81 }, { "Batch Mean": 2.3107597827911377, "accuracy": 0.7734375, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.396126747131348, "learning_rate": 4.18421052631579e-06, "loss": 0.5078, "step": 82 }, { "Batch Mean": 1.8419753313064575, "accuracy": 0.7421875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.316776275634766, "learning_rate": 4.171052631578948e-06, "loss": 0.4945, "step": 83 }, { "Batch Mean": 2.162795066833496, "accuracy": 0.859375, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 6.570835113525391, "learning_rate": 4.157894736842106e-06, "loss": 0.3966, "step": 84 }, { "Batch Mean": 2.2369089126586914, "accuracy": 0.765625, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 7.585580825805664, "learning_rate": 4.144736842105263e-06, "loss": 0.5321, "step": 85 }, { "Batch Mean": 2.4348106384277344, "accuracy": 0.7265625, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 7.9683098793029785, "learning_rate": 4.1315789473684216e-06, "loss": 0.5224, "step": 86 }, { "Batch Mean": 2.9450817108154297, "accuracy": 0.8125, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 7.830796718597412, "learning_rate": 4.118421052631579e-06, "loss": 0.4866, "step": 87 }, { "Batch Mean": 2.9916036128997803, "accuracy": 0.765625, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 9.743931770324707, "learning_rate": 4.105263157894737e-06, "loss": 0.5408, "step": 88 }, { "Batch Mean": 3.0171022415161133, "accuracy": 0.8046875, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 7.245960235595703, "learning_rate": 4.092105263157895e-06, "loss": 0.4232, "step": 89 }, { "Batch Mean": 3.2700085639953613, "accuracy": 0.75, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.346113681793213, "learning_rate": 4.078947368421053e-06, "loss": 0.4981, "step": 90 }, { "Batch Mean": 3.375551700592041, "accuracy": 0.7734375, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 8.201985359191895, "learning_rate": 4.065789473684211e-06, "loss": 0.4761, "step": 91 }, { "Batch Mean": 3.438584327697754, "accuracy": 0.7265625, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 8.22840690612793, "learning_rate": 4.052631578947368e-06, "loss": 0.5708, "step": 92 }, { "Batch Mean": 3.2276859283447266, "accuracy": 0.71875, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 7.885838508605957, "learning_rate": 4.0394736842105265e-06, "loss": 0.5382, "step": 93 }, { "Batch Mean": 3.2703161239624023, "accuracy": 0.765625, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 5.956949234008789, "learning_rate": 4.026315789473684e-06, "loss": 0.4753, "step": 94 }, { "Batch Mean": 2.999190330505371, "accuracy": 0.7578125, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 6.309754848480225, "learning_rate": 4.013157894736842e-06, "loss": 0.5308, "step": 95 }, { "Batch Mean": 2.9133434295654297, "accuracy": 0.7734375, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.293734073638916, "learning_rate": 4.000000000000001e-06, "loss": 0.4746, "step": 96 }, { "Batch Mean": 2.940767288208008, "accuracy": 0.7734375, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 5.970653533935547, "learning_rate": 3.986842105263158e-06, "loss": 0.4666, "step": 97 }, { "Batch Mean": 2.4930241107940674, "accuracy": 0.828125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 5.983125686645508, "learning_rate": 3.9736842105263165e-06, "loss": 0.4405, "step": 98 }, { "Batch Mean": 2.768970012664795, "accuracy": 0.6875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 7.222211837768555, "learning_rate": 3.960526315789474e-06, "loss": 0.5831, "step": 99 }, { "Batch Mean": 2.600813150405884, "accuracy": 0.8125, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 6.174572944641113, "learning_rate": 3.947368421052632e-06, "loss": 0.3996, "step": 100 }, { "Batch Mean": 2.7795987129211426, "accuracy": 0.7421875, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 6.83671236038208, "learning_rate": 3.93421052631579e-06, "loss": 0.5049, "step": 101 }, { "Batch Mean": 3.153959274291992, "accuracy": 0.7578125, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 7.497275352478027, "learning_rate": 3.921052631578947e-06, "loss": 0.4726, "step": 102 }, { "Batch Mean": 3.3440189361572266, "accuracy": 0.8125, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 6.329479217529297, "learning_rate": 3.907894736842106e-06, "loss": 0.3769, "step": 103 }, { "Batch Mean": 3.1725075244903564, "accuracy": 0.765625, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 7.60507869720459, "learning_rate": 3.894736842105263e-06, "loss": 0.4352, "step": 104 }, { "Batch Mean": 3.409212827682495, "accuracy": 0.7578125, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 9.034660339355469, "learning_rate": 3.8815789473684214e-06, "loss": 0.4505, "step": 105 }, { "Batch Mean": 3.7719802856445312, "accuracy": 0.765625, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 8.710013389587402, "learning_rate": 3.868421052631579e-06, "loss": 0.4257, "step": 106 }, { "Batch Mean": 4.079001426696777, "accuracy": 0.71875, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 11.614973068237305, "learning_rate": 3.855263157894737e-06, "loss": 0.487, "step": 107 }, { "Batch Mean": 4.675480842590332, "accuracy": 0.7578125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 11.474054336547852, "learning_rate": 3.842105263157895e-06, "loss": 0.5035, "step": 108 }, { "Batch Mean": 4.677275657653809, "accuracy": 0.765625, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 12.407071113586426, "learning_rate": 3.828947368421053e-06, "loss": 0.5013, "step": 109 }, { "Batch Mean": 5.106083869934082, "accuracy": 0.8125, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 10.379884719848633, "learning_rate": 3.815789473684211e-06, "loss": 0.3935, "step": 110 }, { "Batch Mean": 5.2830119132995605, "accuracy": 0.8359375, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 8.18240737915039, "learning_rate": 3.802631578947369e-06, "loss": 0.3212, "step": 111 }, { "Batch Mean": 4.785458564758301, "accuracy": 0.84375, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 8.109498977661133, "learning_rate": 3.789473684210527e-06, "loss": 0.325, "step": 112 }, { "Batch Mean": 4.754400730133057, "accuracy": 0.78125, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 11.198568344116211, "learning_rate": 3.7763157894736847e-06, "loss": 0.477, "step": 113 }, { "Batch Mean": 4.87004280090332, "accuracy": 0.796875, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 10.391942977905273, "learning_rate": 3.7631578947368426e-06, "loss": 0.4144, "step": 114 }, { "Batch Mean": 5.210759162902832, "accuracy": 0.6953125, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 13.044468879699707, "learning_rate": 3.7500000000000005e-06, "loss": 0.5755, "step": 115 }, { "Batch Mean": 4.891458511352539, "accuracy": 0.75, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 10.2129487991333, "learning_rate": 3.736842105263158e-06, "loss": 0.5067, "step": 116 }, { "Batch Mean": 4.567095756530762, "accuracy": 0.828125, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 7.831666469573975, "learning_rate": 3.723684210526316e-06, "loss": 0.3706, "step": 117 }, { "Batch Mean": 4.688891410827637, "accuracy": 0.8125, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 8.019044876098633, "learning_rate": 3.710526315789474e-06, "loss": 0.398, "step": 118 }, { "Batch Mean": 4.864707946777344, "accuracy": 0.7109375, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 9.05286979675293, "learning_rate": 3.6973684210526317e-06, "loss": 0.5245, "step": 119 }, { "Batch Mean": 4.6876678466796875, "accuracy": 0.7890625, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 8.225624084472656, "learning_rate": 3.6842105263157896e-06, "loss": 0.442, "step": 120 }, { "Batch Mean": 4.1729278564453125, "accuracy": 0.7734375, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 7.661060810089111, "learning_rate": 3.6710526315789476e-06, "loss": 0.4628, "step": 121 }, { "Batch Mean": 4.011091232299805, "accuracy": 0.8359375, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 7.079358100891113, "learning_rate": 3.657894736842106e-06, "loss": 0.4218, "step": 122 }, { "Batch Mean": 4.086966514587402, "accuracy": 0.7890625, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.512862682342529, "learning_rate": 3.644736842105264e-06, "loss": 0.4282, "step": 123 }, { "Batch Mean": 4.260967254638672, "accuracy": 0.796875, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 7.515966892242432, "learning_rate": 3.6315789473684217e-06, "loss": 0.4666, "step": 124 }, { "Batch Mean": 4.493681907653809, "accuracy": 0.71875, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 7.867684364318848, "learning_rate": 3.618421052631579e-06, "loss": 0.5071, "step": 125 }, { "Batch Mean": 4.890687942504883, "accuracy": 0.8125, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 7.532710552215576, "learning_rate": 3.605263157894737e-06, "loss": 0.4652, "step": 126 }, { "Batch Mean": 4.562954902648926, "accuracy": 0.765625, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 8.537458419799805, "learning_rate": 3.592105263157895e-06, "loss": 0.4776, "step": 127 }, { "Batch Mean": 4.587188720703125, "accuracy": 0.78125, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 6.799316883087158, "learning_rate": 3.578947368421053e-06, "loss": 0.4191, "step": 128 }, { "Batch Mean": 3.9915571212768555, "accuracy": 0.75, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 8.045788764953613, "learning_rate": 3.565789473684211e-06, "loss": 0.506, "step": 129 }, { "Batch Mean": 4.073477745056152, "accuracy": 0.7578125, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.907149314880371, "learning_rate": 3.5526315789473687e-06, "loss": 0.4877, "step": 130 }, { "Batch Mean": 4.1734771728515625, "accuracy": 0.75, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 9.162409782409668, "learning_rate": 3.5394736842105266e-06, "loss": 0.5122, "step": 131 }, { "Batch Mean": 3.731600761413574, "accuracy": 0.7890625, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 6.665674209594727, "learning_rate": 3.5263157894736846e-06, "loss": 0.4075, "step": 132 }, { "Batch Mean": 3.5332088470458984, "accuracy": 0.765625, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 7.407108783721924, "learning_rate": 3.513157894736842e-06, "loss": 0.4435, "step": 133 }, { "Batch Mean": 3.2184605598449707, "accuracy": 0.78125, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 8.552702903747559, "learning_rate": 3.5e-06, "loss": 0.4817, "step": 134 }, { "Batch Mean": 3.3773207664489746, "accuracy": 0.8046875, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 8.001891136169434, "learning_rate": 3.486842105263158e-06, "loss": 0.4724, "step": 135 }, { "Batch Mean": 2.997677803039551, "accuracy": 0.75, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 9.054019927978516, "learning_rate": 3.473684210526316e-06, "loss": 0.4988, "step": 136 }, { "Batch Mean": 2.929046154022217, "accuracy": 0.8125, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 8.104907989501953, "learning_rate": 3.460526315789474e-06, "loss": 0.4243, "step": 137 }, { "Batch Mean": 2.75551176071167, "accuracy": 0.8125, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 8.473715782165527, "learning_rate": 3.447368421052632e-06, "loss": 0.4427, "step": 138 }, { "Batch Mean": 2.5024139881134033, "accuracy": 0.7890625, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 9.045927047729492, "learning_rate": 3.43421052631579e-06, "loss": 0.4255, "step": 139 }, { "Batch Mean": 2.417640209197998, "accuracy": 0.796875, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 7.710657119750977, "learning_rate": 3.421052631578948e-06, "loss": 0.443, "step": 140 }, { "Batch Mean": 2.505678653717041, "accuracy": 0.7890625, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 7.663838863372803, "learning_rate": 3.4078947368421057e-06, "loss": 0.4049, "step": 141 }, { "Batch Mean": 3.0147056579589844, "accuracy": 0.75, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 8.560474395751953, "learning_rate": 3.3947368421052636e-06, "loss": 0.5149, "step": 142 }, { "Batch Mean": 2.588562488555908, "accuracy": 0.7578125, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 8.25622272491455, "learning_rate": 3.381578947368421e-06, "loss": 0.4934, "step": 143 }, { "Batch Mean": 2.6298677921295166, "accuracy": 0.828125, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 7.083525657653809, "learning_rate": 3.368421052631579e-06, "loss": 0.4115, "step": 144 }, { "Batch Mean": 2.7647485733032227, "accuracy": 0.796875, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 6.853006362915039, "learning_rate": 3.355263157894737e-06, "loss": 0.4469, "step": 145 }, { "Batch Mean": 2.769733190536499, "accuracy": 0.78125, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 7.704352855682373, "learning_rate": 3.342105263157895e-06, "loss": 0.4753, "step": 146 }, { "Batch Mean": 3.3755264282226562, "accuracy": 0.765625, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 7.600855350494385, "learning_rate": 3.3289473684210528e-06, "loss": 0.46, "step": 147 }, { "Batch Mean": 2.6412248611450195, "accuracy": 0.7578125, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 8.489302635192871, "learning_rate": 3.3157894736842107e-06, "loss": 0.5111, "step": 148 }, { "Batch Mean": 2.835446357727051, "accuracy": 0.7421875, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 7.343207359313965, "learning_rate": 3.302631578947369e-06, "loss": 0.5032, "step": 149 }, { "Batch Mean": 2.8000075817108154, "accuracy": 0.8203125, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.448639869689941, "learning_rate": 3.289473684210527e-06, "loss": 0.4456, "step": 150 }, { "Batch Mean": 2.8405256271362305, "accuracy": 0.7890625, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 6.841508865356445, "learning_rate": 3.276315789473685e-06, "loss": 0.4306, "step": 151 }, { "Batch Mean": 2.7483787536621094, "accuracy": 0.796875, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 7.562384128570557, "learning_rate": 3.2631578947368423e-06, "loss": 0.5062, "step": 152 }, { "Batch Mean": 2.8838186264038086, "accuracy": 0.7734375, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 6.867684841156006, "learning_rate": 3.2500000000000002e-06, "loss": 0.447, "step": 153 }, { "Batch Mean": 2.926253318786621, "accuracy": 0.7421875, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 8.004990577697754, "learning_rate": 3.236842105263158e-06, "loss": 0.464, "step": 154 }, { "Batch Mean": 3.1090264320373535, "accuracy": 0.8125, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 6.201688766479492, "learning_rate": 3.223684210526316e-06, "loss": 0.365, "step": 155 }, { "Batch Mean": 3.2874813079833984, "accuracy": 0.8515625, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 6.851440906524658, "learning_rate": 3.210526315789474e-06, "loss": 0.3668, "step": 156 }, { "Batch Mean": 3.5518083572387695, "accuracy": 0.8125, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 7.0716023445129395, "learning_rate": 3.197368421052632e-06, "loss": 0.3978, "step": 157 }, { "Batch Mean": 3.0985703468322754, "accuracy": 0.7734375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 8.339123725891113, "learning_rate": 3.1842105263157898e-06, "loss": 0.3998, "step": 158 }, { "Batch Mean": 3.281148910522461, "accuracy": 0.7890625, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.356124877929688, "learning_rate": 3.1710526315789477e-06, "loss": 0.4491, "step": 159 }, { "Batch Mean": 3.783808708190918, "accuracy": 0.796875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 9.633821487426758, "learning_rate": 3.157894736842105e-06, "loss": 0.432, "step": 160 }, { "Batch Mean": 3.9359283447265625, "accuracy": 0.78125, "epoch": 0.4, "step": 160 }, { "epoch": 0.4025, "grad_norm": 8.254521369934082, "learning_rate": 3.144736842105263e-06, "loss": 0.3891, "step": 161 }, { "Batch Mean": 4.607031345367432, "accuracy": 0.7265625, "epoch": 0.4025, "step": 161 }, { "epoch": 0.405, "grad_norm": 9.970298767089844, "learning_rate": 3.131578947368421e-06, "loss": 0.4982, "step": 162 }, { "Batch Mean": 4.176942825317383, "accuracy": 0.8125, "epoch": 0.405, "step": 162 }, { "epoch": 0.4075, "grad_norm": 12.374481201171875, "learning_rate": 3.1184210526315793e-06, "loss": 0.4349, "step": 163 }, { "Batch Mean": 4.015748977661133, "accuracy": 0.8046875, "epoch": 0.4075, "step": 163 }, { "epoch": 0.41, "grad_norm": 11.244656562805176, "learning_rate": 3.1052631578947372e-06, "loss": 0.421, "step": 164 }, { "Batch Mean": 4.22213888168335, "accuracy": 0.7578125, "epoch": 0.41, "step": 164 }, { "epoch": 0.4125, "grad_norm": 11.150482177734375, "learning_rate": 3.092105263157895e-06, "loss": 0.4978, "step": 165 }, { "Batch Mean": 3.7064056396484375, "accuracy": 0.78125, "epoch": 0.4125, "step": 165 }, { "epoch": 0.415, "grad_norm": 9.92993450164795, "learning_rate": 3.078947368421053e-06, "loss": 0.3748, "step": 166 }, { "Batch Mean": 4.073935508728027, "accuracy": 0.765625, "epoch": 0.415, "step": 166 }, { "epoch": 0.4175, "grad_norm": 9.726029396057129, "learning_rate": 3.065789473684211e-06, "loss": 0.3952, "step": 167 }, { "Batch Mean": 3.741079807281494, "accuracy": 0.796875, "epoch": 0.4175, "step": 167 }, { "epoch": 0.42, "grad_norm": 10.072457313537598, "learning_rate": 3.052631578947369e-06, "loss": 0.4475, "step": 168 }, { "Batch Mean": 3.423218011856079, "accuracy": 0.734375, "epoch": 0.42, "step": 168 }, { "epoch": 0.4225, "grad_norm": 9.74998950958252, "learning_rate": 3.0394736842105268e-06, "loss": 0.4507, "step": 169 }, { "Batch Mean": 3.3439035415649414, "accuracy": 0.78125, "epoch": 0.4225, "step": 169 }, { "epoch": 0.425, "grad_norm": 9.590579986572266, "learning_rate": 3.0263157894736843e-06, "loss": 0.4641, "step": 170 }, { "Batch Mean": 3.2388744354248047, "accuracy": 0.8828125, "epoch": 0.425, "step": 170 }, { "epoch": 0.4275, "grad_norm": 7.974471569061279, "learning_rate": 3.013157894736842e-06, "loss": 0.2888, "step": 171 }, { "Batch Mean": 2.912121295928955, "accuracy": 0.828125, "epoch": 0.4275, "step": 171 }, { "epoch": 0.43, "grad_norm": 8.418153762817383, "learning_rate": 3e-06, "loss": 0.4083, "step": 172 }, { "Batch Mean": 2.921308994293213, "accuracy": 0.7734375, "epoch": 0.43, "step": 172 }, { "epoch": 0.4325, "grad_norm": 9.274598121643066, "learning_rate": 2.986842105263158e-06, "loss": 0.4237, "step": 173 }, { "Batch Mean": 2.707150459289551, "accuracy": 0.765625, "epoch": 0.4325, "step": 173 }, { "epoch": 0.435, "grad_norm": 10.08448314666748, "learning_rate": 2.973684210526316e-06, "loss": 0.4084, "step": 174 }, { "Batch Mean": 3.289186477661133, "accuracy": 0.78125, "epoch": 0.435, "step": 174 }, { "epoch": 0.4375, "grad_norm": 10.216947555541992, "learning_rate": 2.960526315789474e-06, "loss": 0.4265, "step": 175 }, { "Batch Mean": 3.0034594535827637, "accuracy": 0.734375, "epoch": 0.4375, "step": 175 }, { "epoch": 0.44, "grad_norm": 9.921527862548828, "learning_rate": 2.9473684210526317e-06, "loss": 0.4555, "step": 176 }, { "Batch Mean": 3.1962521076202393, "accuracy": 0.796875, "epoch": 0.44, "step": 176 }, { "epoch": 0.4425, "grad_norm": 9.140130043029785, "learning_rate": 2.93421052631579e-06, "loss": 0.433, "step": 177 }, { "Batch Mean": 3.105267286300659, "accuracy": 0.8359375, "epoch": 0.4425, "step": 177 }, { "epoch": 0.445, "grad_norm": 8.577014923095703, "learning_rate": 2.921052631578948e-06, "loss": 0.3558, "step": 178 }, { "Batch Mean": 3.05171799659729, "accuracy": 0.8515625, "epoch": 0.445, "step": 178 }, { "epoch": 0.4475, "grad_norm": 9.902350425720215, "learning_rate": 2.907894736842106e-06, "loss": 0.3446, "step": 179 }, { "Batch Mean": 2.8749771118164062, "accuracy": 0.7734375, "epoch": 0.4475, "step": 179 }, { "epoch": 0.45, "grad_norm": 8.572197914123535, "learning_rate": 2.8947368421052634e-06, "loss": 0.4343, "step": 180 }, { "Batch Mean": 3.5201892852783203, "accuracy": 0.7734375, "epoch": 0.45, "step": 180 }, { "epoch": 0.4525, "grad_norm": 9.538017272949219, "learning_rate": 2.8815789473684213e-06, "loss": 0.4501, "step": 181 }, { "Batch Mean": 3.441112518310547, "accuracy": 0.8203125, "epoch": 0.4525, "step": 181 }, { "epoch": 0.455, "grad_norm": 8.626726150512695, "learning_rate": 2.868421052631579e-06, "loss": 0.4032, "step": 182 }, { "Batch Mean": 3.4753060340881348, "accuracy": 0.7109375, "epoch": 0.455, "step": 182 }, { "epoch": 0.4575, "grad_norm": 11.793852806091309, "learning_rate": 2.855263157894737e-06, "loss": 0.5783, "step": 183 }, { "Batch Mean": 3.088036060333252, "accuracy": 0.765625, "epoch": 0.4575, "step": 183 }, { "epoch": 0.46, "grad_norm": 9.752645492553711, "learning_rate": 2.842105263157895e-06, "loss": 0.4706, "step": 184 }, { "Batch Mean": 2.640962600708008, "accuracy": 0.71875, "epoch": 0.46, "step": 184 }, { "epoch": 0.4625, "grad_norm": 9.9888277053833, "learning_rate": 2.828947368421053e-06, "loss": 0.5268, "step": 185 }, { "Batch Mean": 2.834287405014038, "accuracy": 0.765625, "epoch": 0.4625, "step": 185 }, { "epoch": 0.465, "grad_norm": 8.2410306930542, "learning_rate": 2.815789473684211e-06, "loss": 0.4375, "step": 186 }, { "Batch Mean": 2.8364157676696777, "accuracy": 0.765625, "epoch": 0.465, "step": 186 }, { "epoch": 0.4675, "grad_norm": 7.653415679931641, "learning_rate": 2.8026315789473683e-06, "loss": 0.3908, "step": 187 }, { "Batch Mean": 2.0791072845458984, "accuracy": 0.78125, "epoch": 0.4675, "step": 187 }, { "epoch": 0.47, "grad_norm": 6.701351165771484, "learning_rate": 2.789473684210526e-06, "loss": 0.3761, "step": 188 }, { "Batch Mean": 2.188058853149414, "accuracy": 0.8125, "epoch": 0.47, "step": 188 }, { "epoch": 0.4725, "grad_norm": 7.052455425262451, "learning_rate": 2.776315789473684e-06, "loss": 0.4348, "step": 189 }, { "Batch Mean": 2.0506675243377686, "accuracy": 0.828125, "epoch": 0.4725, "step": 189 }, { "epoch": 0.475, "grad_norm": 7.033082962036133, "learning_rate": 2.7631578947368424e-06, "loss": 0.3651, "step": 190 }, { "Batch Mean": 2.3128931522369385, "accuracy": 0.8203125, "epoch": 0.475, "step": 190 }, { "epoch": 0.4775, "grad_norm": 7.838496685028076, "learning_rate": 2.7500000000000004e-06, "loss": 0.3981, "step": 191 }, { "Batch Mean": 2.032851219177246, "accuracy": 0.875, "epoch": 0.4775, "step": 191 }, { "epoch": 0.48, "grad_norm": 6.573253631591797, "learning_rate": 2.7368421052631583e-06, "loss": 0.3628, "step": 192 }, { "Batch Mean": 2.3109028339385986, "accuracy": 0.796875, "epoch": 0.48, "step": 192 }, { "epoch": 0.4825, "grad_norm": 8.022283554077148, "learning_rate": 2.723684210526316e-06, "loss": 0.4369, "step": 193 }, { "Batch Mean": 2.6557040214538574, "accuracy": 0.7734375, "epoch": 0.4825, "step": 193 }, { "epoch": 0.485, "grad_norm": 8.877519607543945, "learning_rate": 2.710526315789474e-06, "loss": 0.4504, "step": 194 }, { "Batch Mean": 2.9424595832824707, "accuracy": 0.75, "epoch": 0.485, "step": 194 }, { "epoch": 0.4875, "grad_norm": 7.829299449920654, "learning_rate": 2.697368421052632e-06, "loss": 0.4254, "step": 195 }, { "Batch Mean": 2.644308090209961, "accuracy": 0.7578125, "epoch": 0.4875, "step": 195 }, { "epoch": 0.49, "grad_norm": 9.070138931274414, "learning_rate": 2.68421052631579e-06, "loss": 0.4922, "step": 196 }, { "Batch Mean": 2.4906630516052246, "accuracy": 0.765625, "epoch": 0.49, "step": 196 }, { "epoch": 0.4925, "grad_norm": 9.63489818572998, "learning_rate": 2.6710526315789474e-06, "loss": 0.4602, "step": 197 }, { "Batch Mean": 2.2591946125030518, "accuracy": 0.796875, "epoch": 0.4925, "step": 197 }, { "epoch": 0.495, "grad_norm": 8.234044075012207, "learning_rate": 2.6578947368421053e-06, "loss": 0.398, "step": 198 }, { "Batch Mean": 2.6223649978637695, "accuracy": 0.8125, "epoch": 0.495, "step": 198 }, { "epoch": 0.4975, "grad_norm": 7.958437442779541, "learning_rate": 2.644736842105263e-06, "loss": 0.3909, "step": 199 }, { "Batch Mean": 2.5589590072631836, "accuracy": 0.875, "epoch": 0.4975, "step": 199 }, { "epoch": 0.5, "grad_norm": 7.556171417236328, "learning_rate": 2.631578947368421e-06, "loss": 0.3513, "step": 200 }, { "Batch Mean": 2.3716750144958496, "accuracy": 0.7265625, "epoch": 0.5, "step": 200 }, { "epoch": 0.5025, "grad_norm": 9.392867088317871, "learning_rate": 2.618421052631579e-06, "loss": 0.4831, "step": 201 }, { "Batch Mean": 2.27783203125, "accuracy": 0.8203125, "epoch": 0.5025, "step": 201 }, { "epoch": 0.505, "grad_norm": 8.140517234802246, "learning_rate": 2.605263157894737e-06, "loss": 0.3929, "step": 202 }, { "Batch Mean": 2.535848617553711, "accuracy": 0.84375, "epoch": 0.505, "step": 202 }, { "epoch": 0.5075, "grad_norm": 6.947089195251465, "learning_rate": 2.592105263157895e-06, "loss": 0.3478, "step": 203 }, { "Batch Mean": 2.651693344116211, "accuracy": 0.8046875, "epoch": 0.5075, "step": 203 }, { "epoch": 0.51, "grad_norm": 8.759857177734375, "learning_rate": 2.578947368421053e-06, "loss": 0.4076, "step": 204 }, { "Batch Mean": 2.571043014526367, "accuracy": 0.8046875, "epoch": 0.51, "step": 204 }, { "epoch": 0.5125, "grad_norm": 9.00507926940918, "learning_rate": 2.565789473684211e-06, "loss": 0.3996, "step": 205 }, { "Batch Mean": 2.7687175273895264, "accuracy": 0.75, "epoch": 0.5125, "step": 205 }, { "epoch": 0.515, "grad_norm": 10.412758827209473, "learning_rate": 2.552631578947369e-06, "loss": 0.5306, "step": 206 }, { "Batch Mean": 2.5285263061523438, "accuracy": 0.84375, "epoch": 0.515, "step": 206 }, { "epoch": 0.5175, "grad_norm": 7.771270751953125, "learning_rate": 2.5394736842105265e-06, "loss": 0.3507, "step": 207 }, { "Batch Mean": 2.688607692718506, "accuracy": 0.8359375, "epoch": 0.5175, "step": 207 }, { "epoch": 0.52, "grad_norm": 8.726874351501465, "learning_rate": 2.5263157894736844e-06, "loss": 0.4068, "step": 208 }, { "Batch Mean": 2.94492244720459, "accuracy": 0.8125, "epoch": 0.52, "step": 208 }, { "epoch": 0.5225, "grad_norm": 8.50943660736084, "learning_rate": 2.5131578947368423e-06, "loss": 0.3774, "step": 209 }, { "Batch Mean": 2.7862486839294434, "accuracy": 0.796875, "epoch": 0.5225, "step": 209 }, { "epoch": 0.525, "grad_norm": 8.98309326171875, "learning_rate": 2.5e-06, "loss": 0.4266, "step": 210 }, { "Batch Mean": 2.61392879486084, "accuracy": 0.765625, "epoch": 0.525, "step": 210 }, { "epoch": 0.5275, "grad_norm": 10.080408096313477, "learning_rate": 2.486842105263158e-06, "loss": 0.4663, "step": 211 }, { "Batch Mean": 2.9034132957458496, "accuracy": 0.8203125, "epoch": 0.5275, "step": 211 }, { "epoch": 0.53, "grad_norm": 8.575652122497559, "learning_rate": 2.473684210526316e-06, "loss": 0.3926, "step": 212 }, { "Batch Mean": 2.8661410808563232, "accuracy": 0.75, "epoch": 0.53, "step": 212 }, { "epoch": 0.5325, "grad_norm": 10.517654418945312, "learning_rate": 2.460526315789474e-06, "loss": 0.5594, "step": 213 }, { "Batch Mean": 3.062936782836914, "accuracy": 0.7578125, "epoch": 0.5325, "step": 213 }, { "epoch": 0.535, "grad_norm": 9.07347583770752, "learning_rate": 2.447368421052632e-06, "loss": 0.433, "step": 214 }, { "Batch Mean": 2.8995747566223145, "accuracy": 0.8125, "epoch": 0.535, "step": 214 }, { "epoch": 0.5375, "grad_norm": 9.181224822998047, "learning_rate": 2.4342105263157898e-06, "loss": 0.4336, "step": 215 }, { "Batch Mean": 2.7490530014038086, "accuracy": 0.7421875, "epoch": 0.5375, "step": 215 }, { "epoch": 0.54, "grad_norm": 9.431602478027344, "learning_rate": 2.4210526315789477e-06, "loss": 0.4583, "step": 216 }, { "Batch Mean": 2.801156520843506, "accuracy": 0.7734375, "epoch": 0.54, "step": 216 }, { "epoch": 0.5425, "grad_norm": 8.67341423034668, "learning_rate": 2.4078947368421056e-06, "loss": 0.4219, "step": 217 }, { "Batch Mean": 2.785233497619629, "accuracy": 0.8125, "epoch": 0.5425, "step": 217 }, { "epoch": 0.545, "grad_norm": 8.614130973815918, "learning_rate": 2.3947368421052635e-06, "loss": 0.4214, "step": 218 }, { "Batch Mean": 2.322535514831543, "accuracy": 0.8046875, "epoch": 0.545, "step": 218 }, { "epoch": 0.5475, "grad_norm": 8.403548240661621, "learning_rate": 2.381578947368421e-06, "loss": 0.425, "step": 219 }, { "Batch Mean": 2.722550392150879, "accuracy": 0.8515625, "epoch": 0.5475, "step": 219 }, { "epoch": 0.55, "grad_norm": 8.205376625061035, "learning_rate": 2.368421052631579e-06, "loss": 0.3821, "step": 220 }, { "Batch Mean": 2.360077142715454, "accuracy": 0.7734375, "epoch": 0.55, "step": 220 }, { "epoch": 0.5525, "grad_norm": 9.49571418762207, "learning_rate": 2.355263157894737e-06, "loss": 0.4661, "step": 221 }, { "Batch Mean": 3.0066745281219482, "accuracy": 0.7265625, "epoch": 0.5525, "step": 221 }, { "epoch": 0.555, "grad_norm": 9.534995079040527, "learning_rate": 2.342105263157895e-06, "loss": 0.5174, "step": 222 }, { "Batch Mean": 2.9200942516326904, "accuracy": 0.8359375, "epoch": 0.555, "step": 222 }, { "epoch": 0.5575, "grad_norm": 7.970925331115723, "learning_rate": 2.328947368421053e-06, "loss": 0.4063, "step": 223 }, { "Batch Mean": 3.1014528274536133, "accuracy": 0.8359375, "epoch": 0.5575, "step": 223 }, { "epoch": 0.56, "grad_norm": 7.537857532501221, "learning_rate": 2.3157894736842105e-06, "loss": 0.3926, "step": 224 }, { "Batch Mean": 2.5743775367736816, "accuracy": 0.7890625, "epoch": 0.56, "step": 224 }, { "epoch": 0.5625, "grad_norm": 8.670877456665039, "learning_rate": 2.3026315789473684e-06, "loss": 0.4287, "step": 225 }, { "Batch Mean": 3.103597640991211, "accuracy": 0.8359375, "epoch": 0.5625, "step": 225 }, { "epoch": 0.565, "grad_norm": 7.964511871337891, "learning_rate": 2.2894736842105263e-06, "loss": 0.3888, "step": 226 }, { "Batch Mean": 3.0492055416107178, "accuracy": 0.859375, "epoch": 0.565, "step": 226 }, { "epoch": 0.5675, "grad_norm": 7.54857873916626, "learning_rate": 2.2763157894736847e-06, "loss": 0.3208, "step": 227 }, { "Batch Mean": 3.119175434112549, "accuracy": 0.7734375, "epoch": 0.5675, "step": 227 }, { "epoch": 0.57, "grad_norm": 7.656146049499512, "learning_rate": 2.2631578947368426e-06, "loss": 0.4383, "step": 228 }, { "Batch Mean": 2.846890449523926, "accuracy": 0.75, "epoch": 0.57, "step": 228 }, { "epoch": 0.5725, "grad_norm": 9.830333709716797, "learning_rate": 2.25e-06, "loss": 0.5166, "step": 229 }, { "Batch Mean": 2.3330116271972656, "accuracy": 0.84375, "epoch": 0.5725, "step": 229 }, { "epoch": 0.575, "grad_norm": 9.917115211486816, "learning_rate": 2.236842105263158e-06, "loss": 0.4154, "step": 230 }, { "Batch Mean": 3.0601701736450195, "accuracy": 0.8046875, "epoch": 0.575, "step": 230 }, { "epoch": 0.5775, "grad_norm": 8.37639331817627, "learning_rate": 2.223684210526316e-06, "loss": 0.4456, "step": 231 }, { "Batch Mean": 2.553410530090332, "accuracy": 0.78125, "epoch": 0.5775, "step": 231 }, { "epoch": 0.58, "grad_norm": 8.588013648986816, "learning_rate": 2.2105263157894738e-06, "loss": 0.4204, "step": 232 }, { "Batch Mean": 2.790724754333496, "accuracy": 0.8359375, "epoch": 0.58, "step": 232 }, { "epoch": 0.5825, "grad_norm": 8.133465766906738, "learning_rate": 2.1973684210526317e-06, "loss": 0.3772, "step": 233 }, { "Batch Mean": 2.7968082427978516, "accuracy": 0.84375, "epoch": 0.5825, "step": 233 }, { "epoch": 0.585, "grad_norm": 8.12721061706543, "learning_rate": 2.1842105263157896e-06, "loss": 0.3624, "step": 234 }, { "Batch Mean": 3.189887523651123, "accuracy": 0.8046875, "epoch": 0.585, "step": 234 }, { "epoch": 0.5875, "grad_norm": 8.726601600646973, "learning_rate": 2.1710526315789475e-06, "loss": 0.4139, "step": 235 }, { "Batch Mean": 3.4596574306488037, "accuracy": 0.8515625, "epoch": 0.5875, "step": 235 }, { "epoch": 0.59, "grad_norm": 8.571916580200195, "learning_rate": 2.1578947368421054e-06, "loss": 0.4047, "step": 236 }, { "Batch Mean": 3.403088092803955, "accuracy": 0.8671875, "epoch": 0.59, "step": 236 }, { "epoch": 0.5925, "grad_norm": 8.29578971862793, "learning_rate": 2.1447368421052633e-06, "loss": 0.3556, "step": 237 }, { "Batch Mean": 2.97831654548645, "accuracy": 0.796875, "epoch": 0.5925, "step": 237 }, { "epoch": 0.595, "grad_norm": 9.06924819946289, "learning_rate": 2.1315789473684212e-06, "loss": 0.394, "step": 238 }, { "Batch Mean": 3.729241371154785, "accuracy": 0.8515625, "epoch": 0.595, "step": 238 }, { "epoch": 0.5975, "grad_norm": 9.479219436645508, "learning_rate": 2.118421052631579e-06, "loss": 0.3934, "step": 239 }, { "Batch Mean": 3.851926803588867, "accuracy": 0.78125, "epoch": 0.5975, "step": 239 }, { "epoch": 0.6, "grad_norm": 11.203536033630371, "learning_rate": 2.105263157894737e-06, "loss": 0.4106, "step": 240 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }