{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 1.9420166015625, "accuracy": 0.46875, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 9.666593551635742, "learning_rate": 2.5000000000000004e-07, "loss": 0.7076, "step": 1 }, { "Batch Mean": 1.9312744140625, "accuracy": 0.4375, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 7.076696395874023, "learning_rate": 5.000000000000001e-07, "loss": 0.7073, "step": 2 }, { "Batch Mean": 1.9347686767578125, "accuracy": 0.5, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 6.997435092926025, "learning_rate": 7.5e-07, "loss": 0.6918, "step": 3 }, { "Batch Mean": 1.94329833984375, "accuracy": 0.53125, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 7.863523006439209, "learning_rate": 1.0000000000000002e-06, "loss": 0.6877, "step": 4 }, { "Batch Mean": 1.94287109375, "accuracy": 0.5703125, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 6.760534286499023, "learning_rate": 1.25e-06, "loss": 0.6861, "step": 5 }, { "Batch Mean": 1.97332763671875, "accuracy": 0.53125, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 8.08924674987793, "learning_rate": 1.5e-06, "loss": 0.6869, "step": 6 }, { "Batch Mean": 2.018096923828125, "accuracy": 0.46875, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 6.236896991729736, "learning_rate": 1.75e-06, "loss": 0.7053, "step": 7 }, { "Batch Mean": 2.14398193359375, "accuracy": 0.5390625, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.7978620529174805, "learning_rate": 2.0000000000000003e-06, "loss": 0.6925, "step": 8 }, { "Batch Mean": 2.214202880859375, "accuracy": 0.4453125, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 7.25023889541626, "learning_rate": 2.25e-06, "loss": 0.7048, "step": 9 }, { "Batch Mean": 2.3272705078125, "accuracy": 0.5859375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 6.383249759674072, "learning_rate": 2.5e-06, "loss": 0.6815, "step": 10 }, { "Batch Mean": 2.49114990234375, "accuracy": 0.6640625, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 9.219279289245605, "learning_rate": 2.7500000000000004e-06, "loss": 0.6547, "step": 11 }, { "Batch Mean": 2.550567626953125, "accuracy": 0.6328125, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 7.704563617706299, "learning_rate": 3e-06, "loss": 0.6432, "step": 12 }, { "Batch Mean": 2.645782470703125, "accuracy": 0.671875, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 7.639222145080566, "learning_rate": 3.2500000000000002e-06, "loss": 0.6365, "step": 13 }, { "Batch Mean": 2.8451995849609375, "accuracy": 0.6171875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 13.006075859069824, "learning_rate": 3.5e-06, "loss": 0.659, "step": 14 }, { "Batch Mean": 3.0160064697265625, "accuracy": 0.609375, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 11.644806861877441, "learning_rate": 3.7500000000000005e-06, "loss": 0.6574, "step": 15 }, { "Batch Mean": 3.0444812774658203, "accuracy": 0.671875, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 12.809576034545898, "learning_rate": 4.000000000000001e-06, "loss": 0.6138, "step": 16 }, { "Batch Mean": 3.1425399780273438, "accuracy": 0.6875, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 10.398422241210938, "learning_rate": 4.25e-06, "loss": 0.5373, "step": 17 }, { "Batch Mean": 2.997763156890869, "accuracy": 0.6015625, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 9.611076354980469, "learning_rate": 4.5e-06, "loss": 0.5884, "step": 18 }, { "Batch Mean": 2.822673797607422, "accuracy": 0.6953125, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 11.101300239562988, "learning_rate": 4.75e-06, "loss": 0.5819, "step": 19 }, { "Batch Mean": 2.9509963989257812, "accuracy": 0.6796875, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 11.16808032989502, "learning_rate": 5e-06, "loss": 0.5849, "step": 20 }, { "Batch Mean": 2.7339065074920654, "accuracy": 0.609375, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 13.900754928588867, "learning_rate": 4.986842105263158e-06, "loss": 0.6193, "step": 21 }, { "Batch Mean": 2.5777530670166016, "accuracy": 0.71875, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 9.619379997253418, "learning_rate": 4.973684210526316e-06, "loss": 0.599, "step": 22 }, { "Batch Mean": 2.6994237899780273, "accuracy": 0.65625, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 9.464532852172852, "learning_rate": 4.960526315789474e-06, "loss": 0.6118, "step": 23 }, { "Batch Mean": 2.70928955078125, "accuracy": 0.6484375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 8.05068588256836, "learning_rate": 4.947368421052632e-06, "loss": 0.6214, "step": 24 }, { "Batch Mean": 2.8170089721679688, "accuracy": 0.6796875, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 9.21177864074707, "learning_rate": 4.9342105263157895e-06, "loss": 0.6165, "step": 25 }, { "Batch Mean": 2.7590713500976562, "accuracy": 0.6171875, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 6.078210353851318, "learning_rate": 4.921052631578948e-06, "loss": 0.6027, "step": 26 }, { "Batch Mean": 2.8484420776367188, "accuracy": 0.6796875, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 5.933135032653809, "learning_rate": 4.907894736842106e-06, "loss": 0.5721, "step": 27 }, { "Batch Mean": 2.9572601318359375, "accuracy": 0.6796875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.4003143310546875, "learning_rate": 4.894736842105264e-06, "loss": 0.5626, "step": 28 }, { "Batch Mean": 2.9868011474609375, "accuracy": 0.6796875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 4.7936859130859375, "learning_rate": 4.881578947368422e-06, "loss": 0.5809, "step": 29 }, { "Batch Mean": 3.106109619140625, "accuracy": 0.65625, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 5.339725017547607, "learning_rate": 4.8684210526315795e-06, "loss": 0.6132, "step": 30 }, { "Batch Mean": 3.29296875, "accuracy": 0.734375, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 5.850327491760254, "learning_rate": 4.855263157894737e-06, "loss": 0.584, "step": 31 }, { "Batch Mean": 3.51275634765625, "accuracy": 0.65625, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 6.158403396606445, "learning_rate": 4.842105263157895e-06, "loss": 0.5911, "step": 32 }, { "Batch Mean": 3.4871044158935547, "accuracy": 0.71875, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 5.687869071960449, "learning_rate": 4.828947368421053e-06, "loss": 0.5243, "step": 33 }, { "Batch Mean": 3.6311264038085938, "accuracy": 0.8203125, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 6.883860111236572, "learning_rate": 4.815789473684211e-06, "loss": 0.4525, "step": 34 }, { "Batch Mean": 3.747730255126953, "accuracy": 0.6875, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 8.108345031738281, "learning_rate": 4.802631578947369e-06, "loss": 0.5392, "step": 35 }, { "Batch Mean": 4.052231788635254, "accuracy": 0.671875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 8.877079963684082, "learning_rate": 4.789473684210527e-06, "loss": 0.5759, "step": 36 }, { "Batch Mean": 4.003006458282471, "accuracy": 0.7421875, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 9.152173042297363, "learning_rate": 4.7763157894736844e-06, "loss": 0.5118, "step": 37 }, { "Batch Mean": 4.063720703125, "accuracy": 0.6171875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 10.134416580200195, "learning_rate": 4.763157894736842e-06, "loss": 0.6653, "step": 38 }, { "Batch Mean": 4.406402587890625, "accuracy": 0.703125, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 7.997857570648193, "learning_rate": 4.75e-06, "loss": 0.525, "step": 39 }, { "Batch Mean": 4.049163818359375, "accuracy": 0.7578125, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 8.24782657623291, "learning_rate": 4.736842105263158e-06, "loss": 0.4568, "step": 40 }, { "Batch Mean": 4.293998718261719, "accuracy": 0.7890625, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 7.328802108764648, "learning_rate": 4.723684210526316e-06, "loss": 0.4618, "step": 41 }, { "Batch Mean": 4.420158386230469, "accuracy": 0.75, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 7.795598030090332, "learning_rate": 4.710526315789474e-06, "loss": 0.5121, "step": 42 }, { "Batch Mean": 4.374538421630859, "accuracy": 0.6953125, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.923321723937988, "learning_rate": 4.697368421052632e-06, "loss": 0.5919, "step": 43 }, { "Batch Mean": 4.3918304443359375, "accuracy": 0.78125, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 6.950074672698975, "learning_rate": 4.68421052631579e-06, "loss": 0.499, "step": 44 }, { "Batch Mean": 4.343505859375, "accuracy": 0.7578125, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 7.051249980926514, "learning_rate": 4.671052631578948e-06, "loss": 0.5362, "step": 45 }, { "Batch Mean": 4.268529891967773, "accuracy": 0.7265625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 7.27433443069458, "learning_rate": 4.657894736842106e-06, "loss": 0.527, "step": 46 }, { "Batch Mean": 4.0211334228515625, "accuracy": 0.796875, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 7.680591106414795, "learning_rate": 4.6447368421052635e-06, "loss": 0.5004, "step": 47 }, { "Batch Mean": 4.125762939453125, "accuracy": 0.7578125, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 6.570550441741943, "learning_rate": 4.631578947368421e-06, "loss": 0.4887, "step": 48 }, { "Batch Mean": 3.82733154296875, "accuracy": 0.6953125, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 6.966435432434082, "learning_rate": 4.618421052631579e-06, "loss": 0.5351, "step": 49 }, { "Batch Mean": 3.548530101776123, "accuracy": 0.6796875, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 9.178398132324219, "learning_rate": 4.605263157894737e-06, "loss": 0.6348, "step": 50 }, { "Batch Mean": 3.5271453857421875, "accuracy": 0.7421875, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 7.21920108795166, "learning_rate": 4.592105263157895e-06, "loss": 0.5138, "step": 51 }, { "Batch Mean": 3.3803443908691406, "accuracy": 0.75, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 6.866484642028809, "learning_rate": 4.578947368421053e-06, "loss": 0.5163, "step": 52 }, { "Batch Mean": 3.42218017578125, "accuracy": 0.734375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 5.488794803619385, "learning_rate": 4.565789473684211e-06, "loss": 0.5135, "step": 53 }, { "Batch Mean": 3.271286964416504, "accuracy": 0.6484375, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 6.306389808654785, "learning_rate": 4.552631578947369e-06, "loss": 0.5649, "step": 54 }, { "Batch Mean": 3.0698814392089844, "accuracy": 0.7578125, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 6.536716461181641, "learning_rate": 4.539473684210527e-06, "loss": 0.5415, "step": 55 }, { "Batch Mean": 3.088502883911133, "accuracy": 0.75, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.343382358551025, "learning_rate": 4.526315789473685e-06, "loss": 0.5206, "step": 56 }, { "Batch Mean": 3.2023324966430664, "accuracy": 0.6796875, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 7.171923637390137, "learning_rate": 4.513157894736843e-06, "loss": 0.5765, "step": 57 }, { "Batch Mean": 2.9620323181152344, "accuracy": 0.75, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 7.703383922576904, "learning_rate": 4.5e-06, "loss": 0.5144, "step": 58 }, { "Batch Mean": 2.995695114135742, "accuracy": 0.734375, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 8.559469223022461, "learning_rate": 4.4868421052631584e-06, "loss": 0.4926, "step": 59 }, { "Batch Mean": 3.2689762115478516, "accuracy": 0.7421875, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.663382530212402, "learning_rate": 4.473684210526316e-06, "loss": 0.4705, "step": 60 }, { "Batch Mean": 2.9887285232543945, "accuracy": 0.7265625, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 7.883590221405029, "learning_rate": 4.460526315789474e-06, "loss": 0.5447, "step": 61 }, { "Batch Mean": 2.901198387145996, "accuracy": 0.78125, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 7.087621688842773, "learning_rate": 4.447368421052632e-06, "loss": 0.4382, "step": 62 }, { "Batch Mean": 2.978053569793701, "accuracy": 0.7265625, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 8.361074447631836, "learning_rate": 4.43421052631579e-06, "loss": 0.5379, "step": 63 }, { "Batch Mean": 2.9929721355438232, "accuracy": 0.765625, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 10.397054672241211, "learning_rate": 4.4210526315789476e-06, "loss": 0.489, "step": 64 }, { "Batch Mean": 2.9623823165893555, "accuracy": 0.734375, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 11.552237510681152, "learning_rate": 4.407894736842105e-06, "loss": 0.5394, "step": 65 }, { "Batch Mean": 2.897951602935791, "accuracy": 0.734375, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 9.772311210632324, "learning_rate": 4.394736842105263e-06, "loss": 0.5274, "step": 66 }, { "Batch Mean": 2.8228421211242676, "accuracy": 0.75, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 8.786587715148926, "learning_rate": 4.381578947368421e-06, "loss": 0.5151, "step": 67 }, { "Batch Mean": 2.93172550201416, "accuracy": 0.8125, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 7.272209167480469, "learning_rate": 4.368421052631579e-06, "loss": 0.4259, "step": 68 }, { "Batch Mean": 3.0047380924224854, "accuracy": 0.6953125, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.469968795776367, "learning_rate": 4.3552631578947375e-06, "loss": 0.5409, "step": 69 }, { "Batch Mean": 3.2963790893554688, "accuracy": 0.75, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 8.683908462524414, "learning_rate": 4.342105263157895e-06, "loss": 0.4782, "step": 70 }, { "Batch Mean": 3.195692300796509, "accuracy": 0.7734375, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 7.932145118713379, "learning_rate": 4.328947368421053e-06, "loss": 0.4693, "step": 71 }, { "Batch Mean": 3.188863754272461, "accuracy": 0.8125, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.82938814163208, "learning_rate": 4.315789473684211e-06, "loss": 0.3917, "step": 72 }, { "Batch Mean": 3.1435837745666504, "accuracy": 0.7421875, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 6.974360942840576, "learning_rate": 4.302631578947369e-06, "loss": 0.49, "step": 73 }, { "Batch Mean": 3.5599288940429688, "accuracy": 0.78125, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.391234397888184, "learning_rate": 4.289473684210527e-06, "loss": 0.507, "step": 74 }, { "Batch Mean": 3.1217479705810547, "accuracy": 0.7421875, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 6.925199508666992, "learning_rate": 4.276315789473684e-06, "loss": 0.5203, "step": 75 }, { "Batch Mean": 3.3439712524414062, "accuracy": 0.7109375, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 8.055448532104492, "learning_rate": 4.2631578947368425e-06, "loss": 0.5173, "step": 76 }, { "Batch Mean": 3.372138023376465, "accuracy": 0.7734375, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.226714611053467, "learning_rate": 4.25e-06, "loss": 0.487, "step": 77 }, { "Batch Mean": 3.52044677734375, "accuracy": 0.8359375, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.280280113220215, "learning_rate": 4.236842105263158e-06, "loss": 0.4553, "step": 78 }, { "Batch Mean": 3.2147364616394043, "accuracy": 0.8359375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 6.3655781745910645, "learning_rate": 4.223684210526316e-06, "loss": 0.4328, "step": 79 }, { "Batch Mean": 3.4875216484069824, "accuracy": 0.8203125, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 6.6272759437561035, "learning_rate": 4.210526315789474e-06, "loss": 0.4119, "step": 80 }, { "Batch Mean": 3.427578926086426, "accuracy": 0.78125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 8.57567024230957, "learning_rate": 4.197368421052632e-06, "loss": 0.4891, "step": 81 }, { "Batch Mean": 3.7187185287475586, "accuracy": 0.7578125, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.171870708465576, "learning_rate": 4.18421052631579e-06, "loss": 0.439, "step": 82 }, { "Batch Mean": 3.990264892578125, "accuracy": 0.7734375, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.19045639038086, "learning_rate": 4.171052631578948e-06, "loss": 0.4541, "step": 83 }, { "Batch Mean": 4.509843826293945, "accuracy": 0.7265625, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 9.934694290161133, "learning_rate": 4.157894736842106e-06, "loss": 0.5258, "step": 84 }, { "Batch Mean": 4.694008827209473, "accuracy": 0.8125, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 7.037469863891602, "learning_rate": 4.144736842105263e-06, "loss": 0.4123, "step": 85 }, { "Batch Mean": 4.4774580001831055, "accuracy": 0.71875, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 9.059208869934082, "learning_rate": 4.1315789473684216e-06, "loss": 0.541, "step": 86 }, { "Batch Mean": 4.323337078094482, "accuracy": 0.7734375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 7.794407367706299, "learning_rate": 4.118421052631579e-06, "loss": 0.4797, "step": 87 }, { "Batch Mean": 4.874683380126953, "accuracy": 0.7578125, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 9.554245948791504, "learning_rate": 4.105263157894737e-06, "loss": 0.5024, "step": 88 }, { "Batch Mean": 4.5128173828125, "accuracy": 0.8046875, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 7.580309867858887, "learning_rate": 4.092105263157895e-06, "loss": 0.4127, "step": 89 }, { "Batch Mean": 4.122764587402344, "accuracy": 0.7890625, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.205588340759277, "learning_rate": 4.078947368421053e-06, "loss": 0.4207, "step": 90 }, { "Batch Mean": 4.157896041870117, "accuracy": 0.765625, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 9.179079055786133, "learning_rate": 4.065789473684211e-06, "loss": 0.505, "step": 91 }, { "Batch Mean": 4.137635231018066, "accuracy": 0.7890625, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 6.633444309234619, "learning_rate": 4.052631578947368e-06, "loss": 0.3927, "step": 92 }, { "Batch Mean": 3.7938804626464844, "accuracy": 0.828125, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 6.795446395874023, "learning_rate": 4.0394736842105265e-06, "loss": 0.4159, "step": 93 }, { "Batch Mean": 3.357052803039551, "accuracy": 0.8046875, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 7.158699989318848, "learning_rate": 4.026315789473684e-06, "loss": 0.4018, "step": 94 }, { "Batch Mean": 3.3779516220092773, "accuracy": 0.75, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 7.249587059020996, "learning_rate": 4.013157894736842e-06, "loss": 0.4374, "step": 95 }, { "Batch Mean": 3.2399396896362305, "accuracy": 0.8046875, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.230793476104736, "learning_rate": 4.000000000000001e-06, "loss": 0.4345, "step": 96 }, { "Batch Mean": 3.1136474609375, "accuracy": 0.8046875, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 8.101882934570312, "learning_rate": 3.986842105263158e-06, "loss": 0.4576, "step": 97 }, { "Batch Mean": 3.214869260787964, "accuracy": 0.8125, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 8.142618179321289, "learning_rate": 3.9736842105263165e-06, "loss": 0.4481, "step": 98 }, { "Batch Mean": 3.2699995040893555, "accuracy": 0.8046875, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 8.672248840332031, "learning_rate": 3.960526315789474e-06, "loss": 0.4518, "step": 99 }, { "Batch Mean": 3.287128448486328, "accuracy": 0.7578125, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 9.054008483886719, "learning_rate": 3.947368421052632e-06, "loss": 0.4934, "step": 100 }, { "Batch Mean": 2.6553401947021484, "accuracy": 0.7734375, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 9.374197006225586, "learning_rate": 3.93421052631579e-06, "loss": 0.5046, "step": 101 }, { "Batch Mean": 2.436018943786621, "accuracy": 0.78125, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 9.066594123840332, "learning_rate": 3.921052631578947e-06, "loss": 0.5216, "step": 102 }, { "Batch Mean": 2.2069954872131348, "accuracy": 0.7734375, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 6.94660758972168, "learning_rate": 3.907894736842106e-06, "loss": 0.4278, "step": 103 }, { "Batch Mean": 2.225872755050659, "accuracy": 0.7734375, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 6.518649101257324, "learning_rate": 3.894736842105263e-06, "loss": 0.4473, "step": 104 }, { "Batch Mean": 2.217318058013916, "accuracy": 0.796875, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 6.582048416137695, "learning_rate": 3.8815789473684214e-06, "loss": 0.4356, "step": 105 }, { "Batch Mean": 1.917119026184082, "accuracy": 0.828125, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 7.497337818145752, "learning_rate": 3.868421052631579e-06, "loss": 0.3825, "step": 106 }, { "Batch Mean": 1.5780539512634277, "accuracy": 0.7578125, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 8.559232711791992, "learning_rate": 3.855263157894737e-06, "loss": 0.4984, "step": 107 }, { "Batch Mean": 1.9931042194366455, "accuracy": 0.8203125, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 6.91691780090332, "learning_rate": 3.842105263157895e-06, "loss": 0.401, "step": 108 }, { "Batch Mean": 2.131106376647949, "accuracy": 0.8046875, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 6.739565849304199, "learning_rate": 3.828947368421053e-06, "loss": 0.3582, "step": 109 }, { "Batch Mean": 2.5897467136383057, "accuracy": 0.8671875, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 8.209200859069824, "learning_rate": 3.815789473684211e-06, "loss": 0.3216, "step": 110 }, { "Batch Mean": 2.6740541458129883, "accuracy": 0.75, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 10.885510444641113, "learning_rate": 3.802631578947369e-06, "loss": 0.5196, "step": 111 }, { "Batch Mean": 2.9480109214782715, "accuracy": 0.703125, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 13.55850887298584, "learning_rate": 3.789473684210527e-06, "loss": 0.5931, "step": 112 }, { "Batch Mean": 3.3308558464050293, "accuracy": 0.8515625, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 9.246851921081543, "learning_rate": 3.7763157894736847e-06, "loss": 0.3901, "step": 113 }, { "Batch Mean": 2.8297948837280273, "accuracy": 0.7890625, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 9.644515037536621, "learning_rate": 3.7631578947368426e-06, "loss": 0.4573, "step": 114 }, { "Batch Mean": 3.7451376914978027, "accuracy": 0.7265625, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 12.58398151397705, "learning_rate": 3.7500000000000005e-06, "loss": 0.5545, "step": 115 }, { "Batch Mean": 3.6171021461486816, "accuracy": 0.7890625, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 9.254029273986816, "learning_rate": 3.736842105263158e-06, "loss": 0.4333, "step": 116 }, { "Batch Mean": 2.7716548442840576, "accuracy": 0.8046875, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 10.669205665588379, "learning_rate": 3.723684210526316e-06, "loss": 0.4602, "step": 117 }, { "Batch Mean": 2.781661033630371, "accuracy": 0.7734375, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 10.094223976135254, "learning_rate": 3.710526315789474e-06, "loss": 0.528, "step": 118 }, { "Batch Mean": 2.198807716369629, "accuracy": 0.7578125, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 7.635989189147949, "learning_rate": 3.6973684210526317e-06, "loss": 0.4268, "step": 119 }, { "Batch Mean": 2.0033488273620605, "accuracy": 0.7734375, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 7.30623197555542, "learning_rate": 3.6842105263157896e-06, "loss": 0.4528, "step": 120 }, { "Batch Mean": 1.8473689556121826, "accuracy": 0.859375, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 6.51129674911499, "learning_rate": 3.6710526315789476e-06, "loss": 0.4022, "step": 121 }, { "Batch Mean": 1.650212287902832, "accuracy": 0.78125, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 7.191035270690918, "learning_rate": 3.657894736842106e-06, "loss": 0.4487, "step": 122 }, { "Batch Mean": 1.7975679636001587, "accuracy": 0.7890625, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.7035651206970215, "learning_rate": 3.644736842105264e-06, "loss": 0.4338, "step": 123 }, { "Batch Mean": 1.5073213577270508, "accuracy": 0.8125, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 6.955989837646484, "learning_rate": 3.6315789473684217e-06, "loss": 0.4144, "step": 124 }, { "Batch Mean": 1.7353585958480835, "accuracy": 0.75, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 8.312121391296387, "learning_rate": 3.618421052631579e-06, "loss": 0.5017, "step": 125 }, { "Batch Mean": 1.718948245048523, "accuracy": 0.734375, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 8.334877967834473, "learning_rate": 3.605263157894737e-06, "loss": 0.4731, "step": 126 }, { "Batch Mean": 1.7114816904067993, "accuracy": 0.8359375, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 7.221029281616211, "learning_rate": 3.592105263157895e-06, "loss": 0.3843, "step": 127 }, { "Batch Mean": 1.466775894165039, "accuracy": 0.7578125, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 9.671870231628418, "learning_rate": 3.578947368421053e-06, "loss": 0.517, "step": 128 }, { "Batch Mean": 2.0674805641174316, "accuracy": 0.8125, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 7.535254955291748, "learning_rate": 3.565789473684211e-06, "loss": 0.4002, "step": 129 }, { "Batch Mean": 1.9499692916870117, "accuracy": 0.8046875, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.19841480255127, "learning_rate": 3.5526315789473687e-06, "loss": 0.4101, "step": 130 }, { "Batch Mean": 1.8141679763793945, "accuracy": 0.7890625, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 8.877778053283691, "learning_rate": 3.5394736842105266e-06, "loss": 0.458, "step": 131 }, { "Batch Mean": 2.349834680557251, "accuracy": 0.8046875, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 7.818430423736572, "learning_rate": 3.5263157894736846e-06, "loss": 0.3948, "step": 132 }, { "Batch Mean": 2.4295411109924316, "accuracy": 0.765625, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 10.082527160644531, "learning_rate": 3.513157894736842e-06, "loss": 0.4616, "step": 133 }, { "Batch Mean": 2.609513282775879, "accuracy": 0.8359375, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 8.784564971923828, "learning_rate": 3.5e-06, "loss": 0.3703, "step": 134 }, { "Batch Mean": 2.7128183841705322, "accuracy": 0.7890625, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 9.766796112060547, "learning_rate": 3.486842105263158e-06, "loss": 0.3847, "step": 135 }, { "Batch Mean": 3.1417746543884277, "accuracy": 0.7265625, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 11.59825325012207, "learning_rate": 3.473684210526316e-06, "loss": 0.5143, "step": 136 }, { "Batch Mean": 2.894944190979004, "accuracy": 0.7421875, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 12.137927055358887, "learning_rate": 3.460526315789474e-06, "loss": 0.5355, "step": 137 }, { "Batch Mean": 2.7786073684692383, "accuracy": 0.7890625, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 9.075066566467285, "learning_rate": 3.447368421052632e-06, "loss": 0.4115, "step": 138 }, { "Batch Mean": 3.0002739429473877, "accuracy": 0.734375, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 10.028230667114258, "learning_rate": 3.43421052631579e-06, "loss": 0.4722, "step": 139 }, { "Batch Mean": 2.4786624908447266, "accuracy": 0.765625, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 9.266149520874023, "learning_rate": 3.421052631578948e-06, "loss": 0.4656, "step": 140 }, { "Batch Mean": 2.1704983711242676, "accuracy": 0.8125, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 6.836582660675049, "learning_rate": 3.4078947368421057e-06, "loss": 0.3686, "step": 141 }, { "Batch Mean": 2.178023338317871, "accuracy": 0.828125, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 7.482564449310303, "learning_rate": 3.3947368421052636e-06, "loss": 0.4434, "step": 142 }, { "Batch Mean": 2.207672595977783, "accuracy": 0.8359375, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 9.649080276489258, "learning_rate": 3.381578947368421e-06, "loss": 0.4042, "step": 143 }, { "Batch Mean": 2.477532386779785, "accuracy": 0.8046875, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 6.769596576690674, "learning_rate": 3.368421052631579e-06, "loss": 0.3742, "step": 144 }, { "Batch Mean": 2.3647336959838867, "accuracy": 0.7578125, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 9.769651412963867, "learning_rate": 3.355263157894737e-06, "loss": 0.4972, "step": 145 }, { "Batch Mean": 2.282806396484375, "accuracy": 0.765625, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 9.153217315673828, "learning_rate": 3.342105263157895e-06, "loss": 0.4581, "step": 146 }, { "Batch Mean": 2.2191848754882812, "accuracy": 0.7421875, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 7.448163032531738, "learning_rate": 3.3289473684210528e-06, "loss": 0.4751, "step": 147 }, { "Batch Mean": 2.034508228302002, "accuracy": 0.7109375, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 8.64769458770752, "learning_rate": 3.3157894736842107e-06, "loss": 0.55, "step": 148 }, { "Batch Mean": 1.7862319946289062, "accuracy": 0.859375, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 6.058819770812988, "learning_rate": 3.302631578947369e-06, "loss": 0.3536, "step": 149 }, { "Batch Mean": 1.8278677463531494, "accuracy": 0.8046875, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.812012672424316, "learning_rate": 3.289473684210527e-06, "loss": 0.4398, "step": 150 }, { "Batch Mean": 1.9891557693481445, "accuracy": 0.8203125, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 7.331231117248535, "learning_rate": 3.276315789473685e-06, "loss": 0.4227, "step": 151 }, { "Batch Mean": 1.683842658996582, "accuracy": 0.7578125, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 6.605899333953857, "learning_rate": 3.2631578947368423e-06, "loss": 0.4956, "step": 152 }, { "Batch Mean": 1.8491768836975098, "accuracy": 0.8203125, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 6.61679744720459, "learning_rate": 3.2500000000000002e-06, "loss": 0.3815, "step": 153 }, { "Batch Mean": 1.5500710010528564, "accuracy": 0.8203125, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 7.707108497619629, "learning_rate": 3.236842105263158e-06, "loss": 0.4151, "step": 154 }, { "Batch Mean": 1.4066014289855957, "accuracy": 0.7890625, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 7.169893741607666, "learning_rate": 3.223684210526316e-06, "loss": 0.4474, "step": 155 }, { "Batch Mean": 1.7346997261047363, "accuracy": 0.7734375, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 10.779695510864258, "learning_rate": 3.210526315789474e-06, "loss": 0.4299, "step": 156 }, { "Batch Mean": 1.810314655303955, "accuracy": 0.796875, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 7.573057174682617, "learning_rate": 3.197368421052632e-06, "loss": 0.4399, "step": 157 }, { "Batch Mean": 1.7872644662857056, "accuracy": 0.7734375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 7.816164493560791, "learning_rate": 3.1842105263157898e-06, "loss": 0.4131, "step": 158 }, { "Batch Mean": 1.804029941558838, "accuracy": 0.765625, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.655269622802734, "learning_rate": 3.1710526315789477e-06, "loss": 0.4737, "step": 159 }, { "Batch Mean": 1.565956711769104, "accuracy": 0.796875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 7.699905872344971, "learning_rate": 3.157894736842105e-06, "loss": 0.4022, "step": 160 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }