diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,89582 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 1000, + "global_step": 12776, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015654351909830932, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 10.2049, + "step": 1 + }, + { + "epoch": 0.00031308703819661864, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 10.1719, + "step": 2 + }, + { + "epoch": 0.000469630557294928, + "grad_norm": 10.289196014404297, + "learning_rate": 2.0000000000000002e-07, + "loss": 11.8275, + "step": 3 + }, + { + "epoch": 0.0006261740763932373, + "grad_norm": 8.432656288146973, + "learning_rate": 4.0000000000000003e-07, + "loss": 9.669, + "step": 4 + }, + { + "epoch": 0.0007827175954915466, + "grad_norm": 10.236390113830566, + "learning_rate": 6.000000000000001e-07, + "loss": 11.888, + "step": 5 + }, + { + "epoch": 0.000939261114589856, + "grad_norm": 9.02259635925293, + "learning_rate": 8.000000000000001e-07, + "loss": 11.0998, + "step": 6 + }, + { + "epoch": 0.0010958046336881652, + "grad_norm": 9.155373573303223, + "learning_rate": 1.0000000000000002e-06, + "loss": 11.0313, + "step": 7 + }, + { + "epoch": 0.0012523481527864746, + "grad_norm": 15.195652961730957, + "learning_rate": 1.2000000000000002e-06, + "loss": 18.5518, + "step": 8 + }, + { + "epoch": 0.001408891671884784, + "grad_norm": 8.246197700500488, + "learning_rate": 1.4000000000000001e-06, + "loss": 9.8139, + "step": 9 + }, + { + "epoch": 0.0015654351909830933, + "grad_norm": 8.51622200012207, + "learning_rate": 1.6000000000000001e-06, + "loss": 10.5818, + "step": 10 + }, + { + "epoch": 0.0017219787100814026, + "grad_norm": 8.064775466918945, + "learning_rate": 1.8e-06, + "loss": 9.6972, + "step": 11 + }, + { + "epoch": 0.001878522229179712, 
+ "grad_norm": 12.738836288452148, + "learning_rate": 2.0000000000000003e-06, + "loss": 15.5603, + "step": 12 + }, + { + "epoch": 0.002035065748278021, + "grad_norm": 14.75306224822998, + "learning_rate": 2.2e-06, + "loss": 21.199, + "step": 13 + }, + { + "epoch": 0.0021916092673763305, + "grad_norm": 14.606815338134766, + "learning_rate": 2.4000000000000003e-06, + "loss": 20.1541, + "step": 14 + }, + { + "epoch": 0.00234815278647464, + "grad_norm": Infinity, + "learning_rate": 2.4000000000000003e-06, + "loss": 25.9435, + "step": 15 + }, + { + "epoch": 0.002504696305572949, + "grad_norm": 7.288573265075684, + "learning_rate": 2.6e-06, + "loss": 9.0748, + "step": 16 + }, + { + "epoch": 0.0026612398246712585, + "grad_norm": 14.452120780944824, + "learning_rate": 2.8000000000000003e-06, + "loss": 19.6447, + "step": 17 + }, + { + "epoch": 0.002817783343769568, + "grad_norm": 24.08975601196289, + "learning_rate": 3e-06, + "loss": 24.9117, + "step": 18 + }, + { + "epoch": 0.002974326862867877, + "grad_norm": 10.497502326965332, + "learning_rate": 3.2000000000000003e-06, + "loss": 14.4203, + "step": 19 + }, + { + "epoch": 0.0031308703819661866, + "grad_norm": 8.929752349853516, + "learning_rate": 3.4000000000000005e-06, + "loss": 11.2469, + "step": 20 + }, + { + "epoch": 0.003287413901064496, + "grad_norm": 8.217044830322266, + "learning_rate": 3.6e-06, + "loss": 10.1311, + "step": 21 + }, + { + "epoch": 0.0034439574201628053, + "grad_norm": 9.737432479858398, + "learning_rate": 3.8e-06, + "loss": 12.5837, + "step": 22 + }, + { + "epoch": 0.0036005009392611146, + "grad_norm": 10.685907363891602, + "learning_rate": 4.000000000000001e-06, + "loss": 15.1975, + "step": 23 + }, + { + "epoch": 0.003757044458359424, + "grad_norm": 8.28918743133545, + "learning_rate": 4.2000000000000004e-06, + "loss": 11.9349, + "step": 24 + }, + { + "epoch": 0.003913587977457733, + "grad_norm": 11.126505851745605, + "learning_rate": 4.4e-06, + "loss": 16.8713, + "step": 25 + }, + { + "epoch": 
0.004070131496556042, + "grad_norm": 10.970772743225098, + "learning_rate": 4.6e-06, + "loss": 15.4774, + "step": 26 + }, + { + "epoch": 0.004226675015654352, + "grad_norm": 13.610772132873535, + "learning_rate": 4.800000000000001e-06, + "loss": 19.628, + "step": 27 + }, + { + "epoch": 0.004383218534752661, + "grad_norm": 8.59617805480957, + "learning_rate": 5e-06, + "loss": 13.1668, + "step": 28 + }, + { + "epoch": 0.004539762053850971, + "grad_norm": 9.875232696533203, + "learning_rate": 5.2e-06, + "loss": 14.8651, + "step": 29 + }, + { + "epoch": 0.00469630557294928, + "grad_norm": 9.437064170837402, + "learning_rate": 5.4e-06, + "loss": 13.7075, + "step": 30 + }, + { + "epoch": 0.004852849092047589, + "grad_norm": 10.477542877197266, + "learning_rate": 5.600000000000001e-06, + "loss": 16.8727, + "step": 31 + }, + { + "epoch": 0.005009392611145898, + "grad_norm": 13.334623336791992, + "learning_rate": 5.8e-06, + "loss": 19.5269, + "step": 32 + }, + { + "epoch": 0.005165936130244208, + "grad_norm": 11.546553611755371, + "learning_rate": 6e-06, + "loss": 17.6682, + "step": 33 + }, + { + "epoch": 0.005322479649342517, + "grad_norm": 9.285508155822754, + "learning_rate": 6.2e-06, + "loss": 15.027, + "step": 34 + }, + { + "epoch": 0.005479023168440827, + "grad_norm": 9.90186595916748, + "learning_rate": 6.4000000000000006e-06, + "loss": 15.7925, + "step": 35 + }, + { + "epoch": 0.005635566687539136, + "grad_norm": 7.72845458984375, + "learning_rate": 6.6e-06, + "loss": 13.447, + "step": 36 + }, + { + "epoch": 0.0057921102066374455, + "grad_norm": 9.977320671081543, + "learning_rate": 6.800000000000001e-06, + "loss": 15.7313, + "step": 37 + }, + { + "epoch": 0.005948653725735754, + "grad_norm": 10.13780403137207, + "learning_rate": 7.000000000000001e-06, + "loss": 16.8345, + "step": 38 + }, + { + "epoch": 0.006105197244834064, + "grad_norm": 7.814809322357178, + "learning_rate": 7.2e-06, + "loss": 12.8454, + "step": 39 + }, + { + "epoch": 0.006261740763932373, + 
"grad_norm": 8.81053638458252, + "learning_rate": 7.4e-06, + "loss": 13.9008, + "step": 40 + }, + { + "epoch": 0.006418284283030683, + "grad_norm": 6.512957572937012, + "learning_rate": 7.6e-06, + "loss": 12.1843, + "step": 41 + }, + { + "epoch": 0.006574827802128992, + "grad_norm": 7.509063243865967, + "learning_rate": 7.8e-06, + "loss": 13.6724, + "step": 42 + }, + { + "epoch": 0.006731371321227302, + "grad_norm": 7.068925380706787, + "learning_rate": 8.000000000000001e-06, + "loss": 12.5917, + "step": 43 + }, + { + "epoch": 0.0068879148403256105, + "grad_norm": 6.228107929229736, + "learning_rate": 8.200000000000001e-06, + "loss": 11.2285, + "step": 44 + }, + { + "epoch": 0.007044458359423919, + "grad_norm": 5.5484466552734375, + "learning_rate": 8.400000000000001e-06, + "loss": 10.3379, + "step": 45 + }, + { + "epoch": 0.007201001878522229, + "grad_norm": 6.9069037437438965, + "learning_rate": 8.599999999999999e-06, + "loss": 10.4253, + "step": 46 + }, + { + "epoch": 0.007357545397620538, + "grad_norm": 5.554916858673096, + "learning_rate": 8.8e-06, + "loss": 9.4062, + "step": 47 + }, + { + "epoch": 0.007514088916718848, + "grad_norm": 5.1056437492370605, + "learning_rate": 9e-06, + "loss": 9.0005, + "step": 48 + }, + { + "epoch": 0.007670632435817157, + "grad_norm": 4.115481853485107, + "learning_rate": 9.2e-06, + "loss": 7.7773, + "step": 49 + }, + { + "epoch": 0.007827175954915467, + "grad_norm": 3.3567614555358887, + "learning_rate": 9.4e-06, + "loss": 6.7583, + "step": 50 + }, + { + "epoch": 0.007983719474013776, + "grad_norm": 11.3549222946167, + "learning_rate": 9.600000000000001e-06, + "loss": 12.2524, + "step": 51 + }, + { + "epoch": 0.008140262993112084, + "grad_norm": 10.585112571716309, + "learning_rate": 9.800000000000001e-06, + "loss": 11.2391, + "step": 52 + }, + { + "epoch": 0.008296806512210394, + "grad_norm": 16.029098510742188, + "learning_rate": 1e-05, + "loss": 15.9319, + "step": 53 + }, + { + "epoch": 0.008453350031308704, + "grad_norm": 
8.521809577941895, + "learning_rate": 1.02e-05, + "loss": 9.1192, + "step": 54 + }, + { + "epoch": 0.008609893550407014, + "grad_norm": 9.090985298156738, + "learning_rate": 1.04e-05, + "loss": 9.9237, + "step": 55 + }, + { + "epoch": 0.008766437069505322, + "grad_norm": 9.114990234375, + "learning_rate": 1.06e-05, + "loss": 10.4122, + "step": 56 + }, + { + "epoch": 0.008922980588603632, + "grad_norm": 9.590527534484863, + "learning_rate": 1.08e-05, + "loss": 10.3857, + "step": 57 + }, + { + "epoch": 0.009079524107701941, + "grad_norm": 11.639906883239746, + "learning_rate": 1.1000000000000001e-05, + "loss": 13.073, + "step": 58 + }, + { + "epoch": 0.009236067626800251, + "grad_norm": 10.141020774841309, + "learning_rate": 1.1200000000000001e-05, + "loss": 11.9011, + "step": 59 + }, + { + "epoch": 0.00939261114589856, + "grad_norm": 9.746541976928711, + "learning_rate": 1.1400000000000001e-05, + "loss": 10.7365, + "step": 60 + }, + { + "epoch": 0.009549154664996869, + "grad_norm": 11.524857521057129, + "learning_rate": 1.16e-05, + "loss": 13.0425, + "step": 61 + }, + { + "epoch": 0.009705698184095179, + "grad_norm": 8.035857200622559, + "learning_rate": 1.18e-05, + "loss": 8.6194, + "step": 62 + }, + { + "epoch": 0.009862241703193489, + "grad_norm": 10.302777290344238, + "learning_rate": 1.2e-05, + "loss": 12.4105, + "step": 63 + }, + { + "epoch": 0.010018785222291797, + "grad_norm": 10.186317443847656, + "learning_rate": 1.22e-05, + "loss": 11.4829, + "step": 64 + }, + { + "epoch": 0.010175328741390106, + "grad_norm": 8.001344680786133, + "learning_rate": 1.24e-05, + "loss": 8.7528, + "step": 65 + }, + { + "epoch": 0.010331872260488416, + "grad_norm": 15.147337913513184, + "learning_rate": 1.2600000000000001e-05, + "loss": 18.1788, + "step": 66 + }, + { + "epoch": 0.010488415779586726, + "grad_norm": 9.064345359802246, + "learning_rate": 1.2800000000000001e-05, + "loss": 9.96, + "step": 67 + }, + { + "epoch": 0.010644959298685034, + "grad_norm": 
16.115659713745117, + "learning_rate": 1.3000000000000001e-05, + "loss": 20.5702, + "step": 68 + }, + { + "epoch": 0.010801502817783344, + "grad_norm": 14.07148551940918, + "learning_rate": 1.32e-05, + "loss": 12.3018, + "step": 69 + }, + { + "epoch": 0.010958046336881654, + "grad_norm": 7.789936542510986, + "learning_rate": 1.3400000000000002e-05, + "loss": 8.8257, + "step": 70 + }, + { + "epoch": 0.011114589855979962, + "grad_norm": 12.446857452392578, + "learning_rate": 1.3600000000000002e-05, + "loss": 16.0076, + "step": 71 + }, + { + "epoch": 0.011271133375078271, + "grad_norm": 15.518702507019043, + "learning_rate": 1.3800000000000002e-05, + "loss": 19.8583, + "step": 72 + }, + { + "epoch": 0.011427676894176581, + "grad_norm": 10.532947540283203, + "learning_rate": 1.4000000000000001e-05, + "loss": 12.3533, + "step": 73 + }, + { + "epoch": 0.011584220413274891, + "grad_norm": 9.899404525756836, + "learning_rate": 1.42e-05, + "loss": 12.4135, + "step": 74 + }, + { + "epoch": 0.011740763932373199, + "grad_norm": 17.51387596130371, + "learning_rate": 1.44e-05, + "loss": 22.1451, + "step": 75 + }, + { + "epoch": 0.011897307451471509, + "grad_norm": 9.365567207336426, + "learning_rate": 1.4599999999999999e-05, + "loss": 11.6745, + "step": 76 + }, + { + "epoch": 0.012053850970569819, + "grad_norm": 17.38188362121582, + "learning_rate": 1.48e-05, + "loss": 19.6323, + "step": 77 + }, + { + "epoch": 0.012210394489668128, + "grad_norm": 20.479537963867188, + "learning_rate": 1.5e-05, + "loss": 23.757, + "step": 78 + }, + { + "epoch": 0.012366938008766436, + "grad_norm": 11.466914176940918, + "learning_rate": 1.52e-05, + "loss": 14.9466, + "step": 79 + }, + { + "epoch": 0.012523481527864746, + "grad_norm": 15.431199073791504, + "learning_rate": 1.54e-05, + "loss": 18.5333, + "step": 80 + }, + { + "epoch": 0.012680025046963056, + "grad_norm": 20.146408081054688, + "learning_rate": 1.56e-05, + "loss": 25.5397, + "step": 81 + }, + { + "epoch": 0.012836568566061366, + 
"grad_norm": 16.584779739379883, + "learning_rate": 1.58e-05, + "loss": 19.3745, + "step": 82 + }, + { + "epoch": 0.012993112085159674, + "grad_norm": 11.672348022460938, + "learning_rate": 1.6000000000000003e-05, + "loss": 14.0746, + "step": 83 + }, + { + "epoch": 0.013149655604257984, + "grad_norm": 13.137657165527344, + "learning_rate": 1.62e-05, + "loss": 17.2705, + "step": 84 + }, + { + "epoch": 0.013306199123356293, + "grad_norm": 16.063127517700195, + "learning_rate": 1.6400000000000002e-05, + "loss": 11.3737, + "step": 85 + }, + { + "epoch": 0.013462742642454603, + "grad_norm": 12.471244812011719, + "learning_rate": 1.66e-05, + "loss": 14.7664, + "step": 86 + }, + { + "epoch": 0.013619286161552911, + "grad_norm": 8.394390106201172, + "learning_rate": 1.6800000000000002e-05, + "loss": 13.4235, + "step": 87 + }, + { + "epoch": 0.013775829680651221, + "grad_norm": 13.5564546585083, + "learning_rate": 1.7000000000000003e-05, + "loss": 14.7222, + "step": 88 + }, + { + "epoch": 0.01393237319974953, + "grad_norm": 8.611299514770508, + "learning_rate": 1.7199999999999998e-05, + "loss": 13.0431, + "step": 89 + }, + { + "epoch": 0.014088916718847839, + "grad_norm": 12.868847846984863, + "learning_rate": 1.74e-05, + "loss": 17.0878, + "step": 90 + }, + { + "epoch": 0.014245460237946149, + "grad_norm": 12.087831497192383, + "learning_rate": 1.76e-05, + "loss": 14.7459, + "step": 91 + }, + { + "epoch": 0.014402003757044458, + "grad_norm": 10.171838760375977, + "learning_rate": 1.78e-05, + "loss": 11.0744, + "step": 92 + }, + { + "epoch": 0.014558547276142768, + "grad_norm": 10.221874237060547, + "learning_rate": 1.8e-05, + "loss": 11.2593, + "step": 93 + }, + { + "epoch": 0.014715090795241076, + "grad_norm": 10.228421211242676, + "learning_rate": 1.8200000000000002e-05, + "loss": 10.4019, + "step": 94 + }, + { + "epoch": 0.014871634314339386, + "grad_norm": 10.988655090332031, + "learning_rate": 1.84e-05, + "loss": 10.0481, + "step": 95 + }, + { + "epoch": 
0.015028177833437696, + "grad_norm": 8.86600399017334, + "learning_rate": 1.86e-05, + "loss": 8.2492, + "step": 96 + }, + { + "epoch": 0.015184721352536006, + "grad_norm": 8.387210845947266, + "learning_rate": 1.88e-05, + "loss": 7.6949, + "step": 97 + }, + { + "epoch": 0.015341264871634314, + "grad_norm": 7.695061206817627, + "learning_rate": 1.9e-05, + "loss": 6.9783, + "step": 98 + }, + { + "epoch": 0.015497808390732623, + "grad_norm": 7.67902135848999, + "learning_rate": 1.9200000000000003e-05, + "loss": 6.571, + "step": 99 + }, + { + "epoch": 0.015654351909830933, + "grad_norm": 6.287416458129883, + "learning_rate": 1.94e-05, + "loss": 5.621, + "step": 100 + }, + { + "epoch": 0.01581089542892924, + "grad_norm": 8.696687698364258, + "learning_rate": 1.9600000000000002e-05, + "loss": 9.5936, + "step": 101 + }, + { + "epoch": 0.015967438948027553, + "grad_norm": 8.218658447265625, + "learning_rate": 1.9800000000000004e-05, + "loss": 9.2277, + "step": 102 + }, + { + "epoch": 0.01612398246712586, + "grad_norm": 7.567336559295654, + "learning_rate": 2e-05, + "loss": 8.8725, + "step": 103 + }, + { + "epoch": 0.01628052598622417, + "grad_norm": 27.14945411682129, + "learning_rate": 2.0200000000000003e-05, + "loss": 18.5167, + "step": 104 + }, + { + "epoch": 0.01643706950532248, + "grad_norm": 14.602490425109863, + "learning_rate": 2.04e-05, + "loss": 12.2964, + "step": 105 + }, + { + "epoch": 0.01659361302442079, + "grad_norm": 9.870688438415527, + "learning_rate": 2.06e-05, + "loss": 9.2905, + "step": 106 + }, + { + "epoch": 0.0167501565435191, + "grad_norm": 8.920544624328613, + "learning_rate": 2.08e-05, + "loss": 7.8249, + "step": 107 + }, + { + "epoch": 0.016906700062617408, + "grad_norm": 13.759035110473633, + "learning_rate": 2.1e-05, + "loss": 9.0049, + "step": 108 + }, + { + "epoch": 0.017063243581715716, + "grad_norm": 11.794614791870117, + "learning_rate": 2.12e-05, + "loss": 8.0616, + "step": 109 + }, + { + "epoch": 0.017219787100814028, + "grad_norm": 
16.095571517944336, + "learning_rate": 2.1400000000000002e-05, + "loss": 8.7168, + "step": 110 + }, + { + "epoch": 0.017376330619912336, + "grad_norm": 34.732295989990234, + "learning_rate": 2.16e-05, + "loss": 14.5265, + "step": 111 + }, + { + "epoch": 0.017532874139010644, + "grad_norm": 23.19559097290039, + "learning_rate": 2.18e-05, + "loss": 10.0108, + "step": 112 + }, + { + "epoch": 0.017689417658108955, + "grad_norm": 21.117944717407227, + "learning_rate": 2.2000000000000003e-05, + "loss": 9.1726, + "step": 113 + }, + { + "epoch": 0.017845961177207263, + "grad_norm": 19.646760940551758, + "learning_rate": 2.22e-05, + "loss": 8.1041, + "step": 114 + }, + { + "epoch": 0.01800250469630557, + "grad_norm": 19.7663631439209, + "learning_rate": 2.2400000000000002e-05, + "loss": 7.9443, + "step": 115 + }, + { + "epoch": 0.018159048215403883, + "grad_norm": 16.962858200073242, + "learning_rate": 2.26e-05, + "loss": 6.8379, + "step": 116 + }, + { + "epoch": 0.01831559173450219, + "grad_norm": 32.949920654296875, + "learning_rate": 2.2800000000000002e-05, + "loss": 11.155, + "step": 117 + }, + { + "epoch": 0.018472135253600502, + "grad_norm": 13.858436584472656, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.7732, + "step": 118 + }, + { + "epoch": 0.01862867877269881, + "grad_norm": 27.87127685546875, + "learning_rate": 2.32e-05, + "loss": 9.0885, + "step": 119 + }, + { + "epoch": 0.01878522229179712, + "grad_norm": 24.32346534729004, + "learning_rate": 2.3400000000000003e-05, + "loss": 7.9423, + "step": 120 + }, + { + "epoch": 0.01894176581089543, + "grad_norm": 16.088197708129883, + "learning_rate": 2.36e-05, + "loss": 5.9494, + "step": 121 + }, + { + "epoch": 0.019098309329993738, + "grad_norm": 21.5980167388916, + "learning_rate": 2.38e-05, + "loss": 7.0504, + "step": 122 + }, + { + "epoch": 0.019254852849092046, + "grad_norm": 29.333011627197266, + "learning_rate": 2.4e-05, + "loss": 8.4159, + "step": 123 + }, + { + "epoch": 0.019411396368190358, + 
"grad_norm": 21.223533630371094, + "learning_rate": 2.4200000000000002e-05, + "loss": 6.7414, + "step": 124 + }, + { + "epoch": 0.019567939887288666, + "grad_norm": 11.861080169677734, + "learning_rate": 2.44e-05, + "loss": 4.863, + "step": 125 + }, + { + "epoch": 0.019724483406386977, + "grad_norm": Infinity, + "learning_rate": 2.44e-05, + "loss": 9.5221, + "step": 126 + }, + { + "epoch": 0.019881026925485285, + "grad_norm": 39.67204666137695, + "learning_rate": 2.46e-05, + "loss": 9.9496, + "step": 127 + }, + { + "epoch": 0.020037570444583593, + "grad_norm": 24.057369232177734, + "learning_rate": 2.48e-05, + "loss": 6.9111, + "step": 128 + }, + { + "epoch": 0.020194113963681905, + "grad_norm": 40.45210266113281, + "learning_rate": 2.5e-05, + "loss": 9.8373, + "step": 129 + }, + { + "epoch": 0.020350657482780213, + "grad_norm": 31.353267669677734, + "learning_rate": 2.5200000000000003e-05, + "loss": 8.0255, + "step": 130 + }, + { + "epoch": 0.02050720100187852, + "grad_norm": 40.567840576171875, + "learning_rate": 2.54e-05, + "loss": 9.4022, + "step": 131 + }, + { + "epoch": 0.020663744520976832, + "grad_norm": 19.0320987701416, + "learning_rate": 2.5600000000000002e-05, + "loss": 5.8117, + "step": 132 + }, + { + "epoch": 0.02082028804007514, + "grad_norm": 29.481473922729492, + "learning_rate": 2.58e-05, + "loss": 7.3835, + "step": 133 + }, + { + "epoch": 0.020976831559173452, + "grad_norm": 25.981218338012695, + "learning_rate": 2.6000000000000002e-05, + "loss": 6.7594, + "step": 134 + }, + { + "epoch": 0.02113337507827176, + "grad_norm": 25.114015579223633, + "learning_rate": 2.6200000000000003e-05, + "loss": 6.6127, + "step": 135 + }, + { + "epoch": 0.021289918597370068, + "grad_norm": 23.675310134887695, + "learning_rate": 2.64e-05, + "loss": 6.2401, + "step": 136 + }, + { + "epoch": 0.02144646211646838, + "grad_norm": 23.078712463378906, + "learning_rate": 2.6600000000000003e-05, + "loss": 6.0089, + "step": 137 + }, + { + "epoch": 0.021603005635566688, + 
"grad_norm": 20.985675811767578, + "learning_rate": 2.6800000000000004e-05, + "loss": 5.7219, + "step": 138 + }, + { + "epoch": 0.021759549154664996, + "grad_norm": 28.904874801635742, + "learning_rate": 2.7000000000000002e-05, + "loss": 6.7695, + "step": 139 + }, + { + "epoch": 0.021916092673763307, + "grad_norm": 30.217592239379883, + "learning_rate": 2.7200000000000004e-05, + "loss": 6.9622, + "step": 140 + }, + { + "epoch": 0.022072636192861615, + "grad_norm": 27.330184936523438, + "learning_rate": 2.7400000000000002e-05, + "loss": 6.3655, + "step": 141 + }, + { + "epoch": 0.022229179711959923, + "grad_norm": 16.98777961730957, + "learning_rate": 2.7600000000000003e-05, + "loss": 5.0268, + "step": 142 + }, + { + "epoch": 0.022385723231058235, + "grad_norm": 13.369125366210938, + "learning_rate": 2.7800000000000005e-05, + "loss": 4.6184, + "step": 143 + }, + { + "epoch": 0.022542266750156543, + "grad_norm": 12.565454483032227, + "learning_rate": 2.8000000000000003e-05, + "loss": 4.4323, + "step": 144 + }, + { + "epoch": 0.022698810269254854, + "grad_norm": 14.671064376831055, + "learning_rate": 2.8199999999999998e-05, + "loss": 4.6675, + "step": 145 + }, + { + "epoch": 0.022855353788353162, + "grad_norm": 14.536849021911621, + "learning_rate": 2.84e-05, + "loss": 4.6707, + "step": 146 + }, + { + "epoch": 0.02301189730745147, + "grad_norm": 14.086499214172363, + "learning_rate": 2.86e-05, + "loss": 4.554, + "step": 147 + }, + { + "epoch": 0.023168440826549782, + "grad_norm": 9.147937774658203, + "learning_rate": 2.88e-05, + "loss": 4.051, + "step": 148 + }, + { + "epoch": 0.02332498434564809, + "grad_norm": 12.760455131530762, + "learning_rate": 2.9e-05, + "loss": 4.4031, + "step": 149 + }, + { + "epoch": 0.023481527864746398, + "grad_norm": 7.349221229553223, + "learning_rate": 2.9199999999999998e-05, + "loss": 3.822, + "step": 150 + }, + { + "epoch": 0.02363807138384471, + "grad_norm": 21.74277114868164, + "learning_rate": 2.94e-05, + "loss": 5.2668, + "step": 
151 + }, + { + "epoch": 0.023794614902943018, + "grad_norm": 19.71260643005371, + "learning_rate": 2.96e-05, + "loss": 4.9737, + "step": 152 + }, + { + "epoch": 0.02395115842204133, + "grad_norm": 14.87806224822998, + "learning_rate": 2.98e-05, + "loss": 4.3851, + "step": 153 + }, + { + "epoch": 0.024107701941139637, + "grad_norm": 22.1054744720459, + "learning_rate": 3e-05, + "loss": 5.2704, + "step": 154 + }, + { + "epoch": 0.024264245460237945, + "grad_norm": 12.699929237365723, + "learning_rate": 3.02e-05, + "loss": 4.1213, + "step": 155 + }, + { + "epoch": 0.024420788979336257, + "grad_norm": 13.654191970825195, + "learning_rate": 3.04e-05, + "loss": 4.2192, + "step": 156 + }, + { + "epoch": 0.024577332498434565, + "grad_norm": 9.101218223571777, + "learning_rate": 3.06e-05, + "loss": 3.7822, + "step": 157 + }, + { + "epoch": 0.024733876017532873, + "grad_norm": 16.10308265686035, + "learning_rate": 3.08e-05, + "loss": 4.5165, + "step": 158 + }, + { + "epoch": 0.024890419536631184, + "grad_norm": 14.330178260803223, + "learning_rate": 3.1e-05, + "loss": 4.213, + "step": 159 + }, + { + "epoch": 0.025046963055729492, + "grad_norm": 14.484580039978027, + "learning_rate": 3.12e-05, + "loss": 4.2953, + "step": 160 + }, + { + "epoch": 0.0252035065748278, + "grad_norm": 7.8323469161987305, + "learning_rate": 3.1400000000000004e-05, + "loss": 3.7, + "step": 161 + }, + { + "epoch": 0.025360050093926112, + "grad_norm": 11.545485496520996, + "learning_rate": 3.16e-05, + "loss": 3.9896, + "step": 162 + }, + { + "epoch": 0.02551659361302442, + "grad_norm": 51.33173751831055, + "learning_rate": 3.18e-05, + "loss": 8.5352, + "step": 163 + }, + { + "epoch": 0.02567313713212273, + "grad_norm": 8.228883743286133, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.6897, + "step": 164 + }, + { + "epoch": 0.02582968065122104, + "grad_norm": 6.990713596343994, + "learning_rate": 3.2200000000000003e-05, + "loss": 3.6186, + "step": 165 + }, + { + "epoch": 0.025986224170319348, + 
"grad_norm": 20.033954620361328, + "learning_rate": 3.24e-05, + "loss": 4.7724, + "step": 166 + }, + { + "epoch": 0.02614276768941766, + "grad_norm": 19.50939178466797, + "learning_rate": 3.26e-05, + "loss": 4.8395, + "step": 167 + }, + { + "epoch": 0.026299311208515967, + "grad_norm": 41.35626220703125, + "learning_rate": 3.2800000000000004e-05, + "loss": 7.1561, + "step": 168 + }, + { + "epoch": 0.026455854727614275, + "grad_norm": 11.095070838928223, + "learning_rate": 3.3e-05, + "loss": 3.8432, + "step": 169 + }, + { + "epoch": 0.026612398246712587, + "grad_norm": 14.025952339172363, + "learning_rate": 3.32e-05, + "loss": 4.1932, + "step": 170 + }, + { + "epoch": 0.026768941765810895, + "grad_norm": 15.360281944274902, + "learning_rate": 3.3400000000000005e-05, + "loss": 4.3251, + "step": 171 + }, + { + "epoch": 0.026925485284909206, + "grad_norm": 32.055580139160156, + "learning_rate": 3.3600000000000004e-05, + "loss": 5.893, + "step": 172 + }, + { + "epoch": 0.027082028804007514, + "grad_norm": 11.38963508605957, + "learning_rate": 3.38e-05, + "loss": 3.9364, + "step": 173 + }, + { + "epoch": 0.027238572323105822, + "grad_norm": 15.267151832580566, + "learning_rate": 3.4000000000000007e-05, + "loss": 4.3381, + "step": 174 + }, + { + "epoch": 0.027395115842204134, + "grad_norm": 30.341156005859375, + "learning_rate": 3.4200000000000005e-05, + "loss": 5.6901, + "step": 175 + }, + { + "epoch": 0.027551659361302442, + "grad_norm": 17.694860458374023, + "learning_rate": 3.4399999999999996e-05, + "loss": 4.5433, + "step": 176 + }, + { + "epoch": 0.02770820288040075, + "grad_norm": 14.969650268554688, + "learning_rate": 3.46e-05, + "loss": 4.1974, + "step": 177 + }, + { + "epoch": 0.02786474639949906, + "grad_norm": 16.058910369873047, + "learning_rate": 3.48e-05, + "loss": 4.4021, + "step": 178 + }, + { + "epoch": 0.02802128991859737, + "grad_norm": 25.310165405273438, + "learning_rate": 3.5e-05, + "loss": 5.2114, + "step": 179 + }, + { + "epoch": 
0.028177833437695678, + "grad_norm": 30.894515991210938, + "learning_rate": 3.52e-05, + "loss": 5.7932, + "step": 180 + }, + { + "epoch": 0.02833437695679399, + "grad_norm": 16.750886917114258, + "learning_rate": 3.54e-05, + "loss": 4.4193, + "step": 181 + }, + { + "epoch": 0.028490920475892297, + "grad_norm": 19.973085403442383, + "learning_rate": 3.56e-05, + "loss": 4.6592, + "step": 182 + }, + { + "epoch": 0.02864746399499061, + "grad_norm": 19.43536376953125, + "learning_rate": 3.58e-05, + "loss": 4.7873, + "step": 183 + }, + { + "epoch": 0.028804007514088917, + "grad_norm": 14.666500091552734, + "learning_rate": 3.6e-05, + "loss": 4.4556, + "step": 184 + }, + { + "epoch": 0.028960551033187225, + "grad_norm": 22.204801559448242, + "learning_rate": 3.62e-05, + "loss": 4.9843, + "step": 185 + }, + { + "epoch": 0.029117094552285536, + "grad_norm": 15.47268295288086, + "learning_rate": 3.6400000000000004e-05, + "loss": 4.2321, + "step": 186 + }, + { + "epoch": 0.029273638071383844, + "grad_norm": 15.948758125305176, + "learning_rate": 3.66e-05, + "loss": 4.2902, + "step": 187 + }, + { + "epoch": 0.029430181590482152, + "grad_norm": 16.723644256591797, + "learning_rate": 3.68e-05, + "loss": 4.4199, + "step": 188 + }, + { + "epoch": 0.029586725109580464, + "grad_norm": 14.253995895385742, + "learning_rate": 3.7e-05, + "loss": 4.2503, + "step": 189 + }, + { + "epoch": 0.029743268628678772, + "grad_norm": 10.968061447143555, + "learning_rate": 3.72e-05, + "loss": 4.0432, + "step": 190 + }, + { + "epoch": 0.029899812147777084, + "grad_norm": 13.674737930297852, + "learning_rate": 3.74e-05, + "loss": 4.155, + "step": 191 + }, + { + "epoch": 0.03005635566687539, + "grad_norm": 13.597500801086426, + "learning_rate": 3.76e-05, + "loss": 4.1802, + "step": 192 + }, + { + "epoch": 0.0302128991859737, + "grad_norm": 7.852551460266113, + "learning_rate": 3.7800000000000004e-05, + "loss": 3.766, + "step": 193 + }, + { + "epoch": 0.03036944270507201, + "grad_norm": 
9.920880317687988, + "learning_rate": 3.8e-05, + "loss": 3.9178, + "step": 194 + }, + { + "epoch": 0.03052598622417032, + "grad_norm": 7.1803412437438965, + "learning_rate": 3.82e-05, + "loss": 3.7227, + "step": 195 + }, + { + "epoch": 0.030682529743268627, + "grad_norm": 9.741308212280273, + "learning_rate": 3.8400000000000005e-05, + "loss": 3.8153, + "step": 196 + }, + { + "epoch": 0.03083907326236694, + "grad_norm": 7.21779203414917, + "learning_rate": 3.86e-05, + "loss": 3.6853, + "step": 197 + }, + { + "epoch": 0.030995616781465247, + "grad_norm": 6.697176933288574, + "learning_rate": 3.88e-05, + "loss": 3.6266, + "step": 198 + }, + { + "epoch": 0.03115216030056356, + "grad_norm": 4.710512161254883, + "learning_rate": 3.9000000000000006e-05, + "loss": 3.4917, + "step": 199 + }, + { + "epoch": 0.031308703819661866, + "grad_norm": 3.291118860244751, + "learning_rate": 3.9200000000000004e-05, + "loss": 3.4332, + "step": 200 + }, + { + "epoch": 0.031465247338760174, + "grad_norm": Infinity, + "learning_rate": 3.9200000000000004e-05, + "loss": 9.8534, + "step": 201 + }, + { + "epoch": 0.03162179085785848, + "grad_norm": 22.184192657470703, + "learning_rate": 3.94e-05, + "loss": 4.778, + "step": 202 + }, + { + "epoch": 0.03177833437695679, + "grad_norm": 5.981218338012695, + "learning_rate": 3.960000000000001e-05, + "loss": 3.3893, + "step": 203 + }, + { + "epoch": 0.031934877896055106, + "grad_norm": 5.971397876739502, + "learning_rate": 3.9800000000000005e-05, + "loss": 3.3625, + "step": 204 + }, + { + "epoch": 0.032091421415153414, + "grad_norm": 12.560132026672363, + "learning_rate": 4e-05, + "loss": 3.7725, + "step": 205 + }, + { + "epoch": 0.03224796493425172, + "grad_norm": 10.769306182861328, + "learning_rate": 4.02e-05, + "loss": 3.7446, + "step": 206 + }, + { + "epoch": 0.03240450845335003, + "grad_norm": 10.794076919555664, + "learning_rate": 4.0400000000000006e-05, + "loss": 3.7855, + "step": 207 + }, + { + "epoch": 0.03256105197244834, + "grad_norm": 
3.5326619148254395, + "learning_rate": 4.0600000000000004e-05, + "loss": 3.2544, + "step": 208 + }, + { + "epoch": 0.03271759549154665, + "grad_norm": 8.631234169006348, + "learning_rate": 4.08e-05, + "loss": 3.5642, + "step": 209 + }, + { + "epoch": 0.03287413901064496, + "grad_norm": 6.4292497634887695, + "learning_rate": 4.1e-05, + "loss": 3.4436, + "step": 210 + }, + { + "epoch": 0.03303068252974327, + "grad_norm": 4.76118278503418, + "learning_rate": 4.12e-05, + "loss": 3.3502, + "step": 211 + }, + { + "epoch": 0.03318722604884158, + "grad_norm": 9.673348426818848, + "learning_rate": 4.14e-05, + "loss": 3.6682, + "step": 212 + }, + { + "epoch": 0.033343769567939885, + "grad_norm": 9.921127319335938, + "learning_rate": 4.16e-05, + "loss": 3.6372, + "step": 213 + }, + { + "epoch": 0.0335003130870382, + "grad_norm": 7.6689605712890625, + "learning_rate": 4.18e-05, + "loss": 3.4827, + "step": 214 + }, + { + "epoch": 0.03365685660613651, + "grad_norm": 2.695232391357422, + "learning_rate": 4.2e-05, + "loss": 3.2387, + "step": 215 + }, + { + "epoch": 0.033813400125234816, + "grad_norm": 4.6286540031433105, + "learning_rate": 4.22e-05, + "loss": 3.3067, + "step": 216 + }, + { + "epoch": 0.033969943644333124, + "grad_norm": 34.55182647705078, + "learning_rate": 4.24e-05, + "loss": 5.7498, + "step": 217 + }, + { + "epoch": 0.03412648716343143, + "grad_norm": 13.464936256408691, + "learning_rate": 4.26e-05, + "loss": 3.9081, + "step": 218 + }, + { + "epoch": 0.03428303068252974, + "grad_norm": 13.176374435424805, + "learning_rate": 4.2800000000000004e-05, + "loss": 3.9945, + "step": 219 + }, + { + "epoch": 0.034439574201628055, + "grad_norm": 7.782674789428711, + "learning_rate": 4.3e-05, + "loss": 3.5641, + "step": 220 + }, + { + "epoch": 0.03459611772072636, + "grad_norm": 20.85167121887207, + "learning_rate": 4.32e-05, + "loss": 4.644, + "step": 221 + }, + { + "epoch": 0.03475266123982467, + "grad_norm": 9.268744468688965, + "learning_rate": 4.3400000000000005e-05, + 
"loss": 3.7557, + "step": 222 + }, + { + "epoch": 0.03490920475892298, + "grad_norm": 6.538976669311523, + "learning_rate": 4.36e-05, + "loss": 3.5416, + "step": 223 + }, + { + "epoch": 0.03506574827802129, + "grad_norm": 5.487597942352295, + "learning_rate": 4.38e-05, + "loss": 3.4296, + "step": 224 + }, + { + "epoch": 0.0352222917971196, + "grad_norm": 15.991875648498535, + "learning_rate": 4.4000000000000006e-05, + "loss": 4.2775, + "step": 225 + }, + { + "epoch": 0.03537883531621791, + "grad_norm": 18.148515701293945, + "learning_rate": 4.4200000000000004e-05, + "loss": 4.4786, + "step": 226 + }, + { + "epoch": 0.03553537883531622, + "grad_norm": 3.8178341388702393, + "learning_rate": 4.44e-05, + "loss": 3.3261, + "step": 227 + }, + { + "epoch": 0.035691922354414526, + "grad_norm": 9.934024810791016, + "learning_rate": 4.46e-05, + "loss": 3.8437, + "step": 228 + }, + { + "epoch": 0.035848465873512834, + "grad_norm": 5.05556058883667, + "learning_rate": 4.4800000000000005e-05, + "loss": 3.4722, + "step": 229 + }, + { + "epoch": 0.03600500939261114, + "grad_norm": 14.90172004699707, + "learning_rate": 4.5e-05, + "loss": 4.1091, + "step": 230 + }, + { + "epoch": 0.03616155291170946, + "grad_norm": 12.77217960357666, + "learning_rate": 4.52e-05, + "loss": 3.9494, + "step": 231 + }, + { + "epoch": 0.036318096430807766, + "grad_norm": 7.696062088012695, + "learning_rate": 4.5400000000000006e-05, + "loss": 3.6504, + "step": 232 + }, + { + "epoch": 0.036474639949906074, + "grad_norm": 8.367423057556152, + "learning_rate": 4.5600000000000004e-05, + "loss": 3.7716, + "step": 233 + }, + { + "epoch": 0.03663118346900438, + "grad_norm": 6.823204517364502, + "learning_rate": 4.58e-05, + "loss": 3.6397, + "step": 234 + }, + { + "epoch": 0.03678772698810269, + "grad_norm": 14.142634391784668, + "learning_rate": 4.600000000000001e-05, + "loss": 4.0811, + "step": 235 + }, + { + "epoch": 0.036944270507201005, + "grad_norm": 7.286068439483643, + "learning_rate": 
4.6200000000000005e-05, + "loss": 3.6792, + "step": 236 + }, + { + "epoch": 0.03710081402629931, + "grad_norm": 6.292399883270264, + "learning_rate": 4.64e-05, + "loss": 3.6734, + "step": 237 + }, + { + "epoch": 0.03725735754539762, + "grad_norm": 6.299436569213867, + "learning_rate": 4.660000000000001e-05, + "loss": 3.5537, + "step": 238 + }, + { + "epoch": 0.03741390106449593, + "grad_norm": 9.55305290222168, + "learning_rate": 4.6800000000000006e-05, + "loss": 3.8113, + "step": 239 + }, + { + "epoch": 0.03757044458359424, + "grad_norm": 8.143728256225586, + "learning_rate": 4.7e-05, + "loss": 3.6495, + "step": 240 + }, + { + "epoch": 0.03772698810269255, + "grad_norm": 7.797530174255371, + "learning_rate": 4.72e-05, + "loss": 3.6241, + "step": 241 + }, + { + "epoch": 0.03788353162179086, + "grad_norm": 6.819427967071533, + "learning_rate": 4.74e-05, + "loss": 3.5495, + "step": 242 + }, + { + "epoch": 0.03804007514088917, + "grad_norm": 5.199346542358398, + "learning_rate": 4.76e-05, + "loss": 3.5297, + "step": 243 + }, + { + "epoch": 0.038196618659987476, + "grad_norm": 2.913816213607788, + "learning_rate": 4.78e-05, + "loss": 3.4094, + "step": 244 + }, + { + "epoch": 0.038353162179085784, + "grad_norm": 6.832897663116455, + "learning_rate": 4.8e-05, + "loss": 3.5307, + "step": 245 + }, + { + "epoch": 0.03850970569818409, + "grad_norm": 2.835822582244873, + "learning_rate": 4.82e-05, + "loss": 3.298, + "step": 246 + }, + { + "epoch": 0.03866624921728241, + "grad_norm": 3.3336946964263916, + "learning_rate": 4.8400000000000004e-05, + "loss": 3.2857, + "step": 247 + }, + { + "epoch": 0.038822792736380715, + "grad_norm": 3.107677936553955, + "learning_rate": 4.86e-05, + "loss": 3.2357, + "step": 248 + }, + { + "epoch": 0.03897933625547902, + "grad_norm": 3.2189292907714844, + "learning_rate": 4.88e-05, + "loss": 3.1742, + "step": 249 + }, + { + "epoch": 0.03913587977457733, + "grad_norm": 3.81461763381958, + "learning_rate": 4.9e-05, + "loss": 3.0936, + "step": 250 
+ }, + { + "epoch": 0.03929242329367564, + "grad_norm": 4.165604591369629, + "learning_rate": 4.92e-05, + "loss": 3.1622, + "step": 251 + }, + { + "epoch": 0.039448966812773954, + "grad_norm": 3.632925510406494, + "learning_rate": 4.94e-05, + "loss": 3.1736, + "step": 252 + }, + { + "epoch": 0.03960551033187226, + "grad_norm": 8.786075592041016, + "learning_rate": 4.96e-05, + "loss": 3.4318, + "step": 253 + }, + { + "epoch": 0.03976205385097057, + "grad_norm": 4.034289836883545, + "learning_rate": 4.9800000000000004e-05, + "loss": 3.1604, + "step": 254 + }, + { + "epoch": 0.03991859737006888, + "grad_norm": 3.53901743888855, + "learning_rate": 5e-05, + "loss": 3.1726, + "step": 255 + }, + { + "epoch": 0.040075140889167186, + "grad_norm": 2.7627885341644287, + "learning_rate": 5.02e-05, + "loss": 3.0986, + "step": 256 + }, + { + "epoch": 0.040231684408265495, + "grad_norm": 1.7575182914733887, + "learning_rate": 5.0400000000000005e-05, + "loss": 3.0969, + "step": 257 + }, + { + "epoch": 0.04038822792736381, + "grad_norm": 1.823673963546753, + "learning_rate": 5.0600000000000003e-05, + "loss": 3.0894, + "step": 258 + }, + { + "epoch": 0.04054477144646212, + "grad_norm": 13.54356861114502, + "learning_rate": 5.08e-05, + "loss": 4.0668, + "step": 259 + }, + { + "epoch": 0.040701314965560426, + "grad_norm": 3.4412031173706055, + "learning_rate": 5.1000000000000006e-05, + "loss": 3.2036, + "step": 260 + }, + { + "epoch": 0.040857858484658734, + "grad_norm": 2.6128854751586914, + "learning_rate": 5.1200000000000004e-05, + "loss": 3.1316, + "step": 261 + }, + { + "epoch": 0.04101440200375704, + "grad_norm": 1.3837140798568726, + "learning_rate": 5.14e-05, + "loss": 3.042, + "step": 262 + }, + { + "epoch": 0.04117094552285536, + "grad_norm": 2.358736276626587, + "learning_rate": 5.16e-05, + "loss": 3.1616, + "step": 263 + }, + { + "epoch": 0.041327489041953665, + "grad_norm": 1.4322761297225952, + "learning_rate": 5.1800000000000005e-05, + "loss": 3.1052, + "step": 264 + }, 
+ { + "epoch": 0.04148403256105197, + "grad_norm": 2.254499912261963, + "learning_rate": 5.2000000000000004e-05, + "loss": 3.0974, + "step": 265 + }, + { + "epoch": 0.04164057608015028, + "grad_norm": 3.228977680206299, + "learning_rate": 5.22e-05, + "loss": 3.1535, + "step": 266 + }, + { + "epoch": 0.04179711959924859, + "grad_norm": 13.388189315795898, + "learning_rate": 5.2400000000000007e-05, + "loss": 4.0321, + "step": 267 + }, + { + "epoch": 0.041953663118346904, + "grad_norm": 4.086214065551758, + "learning_rate": 5.2600000000000005e-05, + "loss": 3.2214, + "step": 268 + }, + { + "epoch": 0.04211020663744521, + "grad_norm": 2.551823616027832, + "learning_rate": 5.28e-05, + "loss": 3.1118, + "step": 269 + }, + { + "epoch": 0.04226675015654352, + "grad_norm": 5.2355451583862305, + "learning_rate": 5.300000000000001e-05, + "loss": 3.2749, + "step": 270 + }, + { + "epoch": 0.04242329367564183, + "grad_norm": 18.21733856201172, + "learning_rate": 5.3200000000000006e-05, + "loss": 4.2814, + "step": 271 + }, + { + "epoch": 0.042579837194740136, + "grad_norm": 9.372861862182617, + "learning_rate": 5.3400000000000004e-05, + "loss": 3.6992, + "step": 272 + }, + { + "epoch": 0.042736380713838444, + "grad_norm": 2.8045504093170166, + "learning_rate": 5.360000000000001e-05, + "loss": 3.193, + "step": 273 + }, + { + "epoch": 0.04289292423293676, + "grad_norm": 11.04323673248291, + "learning_rate": 5.380000000000001e-05, + "loss": 3.6698, + "step": 274 + }, + { + "epoch": 0.04304946775203507, + "grad_norm": 1.2183548212051392, + "learning_rate": 5.4000000000000005e-05, + "loss": 3.2263, + "step": 275 + }, + { + "epoch": 0.043206011271133375, + "grad_norm": 1.4358868598937988, + "learning_rate": 5.420000000000001e-05, + "loss": 3.1167, + "step": 276 + }, + { + "epoch": 0.04336255479023168, + "grad_norm": 2.6909029483795166, + "learning_rate": 5.440000000000001e-05, + "loss": 3.1922, + "step": 277 + }, + { + "epoch": 0.04351909830932999, + "grad_norm": 16.534326553344727, + 
"learning_rate": 5.4600000000000006e-05, + "loss": 4.0462, + "step": 278 + }, + { + "epoch": 0.043675641828428306, + "grad_norm": 13.074368476867676, + "learning_rate": 5.4800000000000004e-05, + "loss": 3.8717, + "step": 279 + }, + { + "epoch": 0.043832185347526614, + "grad_norm": 5.982373237609863, + "learning_rate": 5.500000000000001e-05, + "loss": 3.3086, + "step": 280 + }, + { + "epoch": 0.04398872886662492, + "grad_norm": 2.4414570331573486, + "learning_rate": 5.520000000000001e-05, + "loss": 3.2709, + "step": 281 + }, + { + "epoch": 0.04414527238572323, + "grad_norm": 4.70213508605957, + "learning_rate": 5.5400000000000005e-05, + "loss": 3.3875, + "step": 282 + }, + { + "epoch": 0.04430181590482154, + "grad_norm": 5.329479694366455, + "learning_rate": 5.560000000000001e-05, + "loss": 3.3987, + "step": 283 + }, + { + "epoch": 0.044458359423919847, + "grad_norm": 6.294430255889893, + "learning_rate": 5.580000000000001e-05, + "loss": 3.5346, + "step": 284 + }, + { + "epoch": 0.04461490294301816, + "grad_norm": 6.935766220092773, + "learning_rate": 5.6000000000000006e-05, + "loss": 3.5978, + "step": 285 + }, + { + "epoch": 0.04477144646211647, + "grad_norm": 1.8166944980621338, + "learning_rate": 5.620000000000001e-05, + "loss": 3.1879, + "step": 286 + }, + { + "epoch": 0.04492798998121478, + "grad_norm": 2.6236770153045654, + "learning_rate": 5.6399999999999995e-05, + "loss": 3.2963, + "step": 287 + }, + { + "epoch": 0.045084533500313086, + "grad_norm": 8.652965545654297, + "learning_rate": 5.66e-05, + "loss": 3.5349, + "step": 288 + }, + { + "epoch": 0.045241077019411394, + "grad_norm": 6.5180559158325195, + "learning_rate": 5.68e-05, + "loss": 3.4941, + "step": 289 + }, + { + "epoch": 0.04539762053850971, + "grad_norm": 4.287855625152588, + "learning_rate": 5.6999999999999996e-05, + "loss": 3.3618, + "step": 290 + }, + { + "epoch": 0.04555416405760802, + "grad_norm": 5.221789360046387, + "learning_rate": 5.72e-05, + "loss": 3.39, + "step": 291 + }, + { + 
"epoch": 0.045710707576706325, + "grad_norm": 2.1194238662719727, + "learning_rate": 5.74e-05, + "loss": 3.1295, + "step": 292 + }, + { + "epoch": 0.04586725109580463, + "grad_norm": 1.5354821681976318, + "learning_rate": 5.76e-05, + "loss": 3.2514, + "step": 293 + }, + { + "epoch": 0.04602379461490294, + "grad_norm": 2.3909995555877686, + "learning_rate": 5.7799999999999995e-05, + "loss": 3.1719, + "step": 294 + }, + { + "epoch": 0.04618033813400125, + "grad_norm": 2.923290252685547, + "learning_rate": 5.8e-05, + "loss": 3.2764, + "step": 295 + }, + { + "epoch": 0.046336881653099564, + "grad_norm": 2.920510768890381, + "learning_rate": 5.82e-05, + "loss": 3.1765, + "step": 296 + }, + { + "epoch": 0.04649342517219787, + "grad_norm": 3.3770008087158203, + "learning_rate": 5.8399999999999997e-05, + "loss": 3.0798, + "step": 297 + }, + { + "epoch": 0.04664996869129618, + "grad_norm": 3.6365253925323486, + "learning_rate": 5.86e-05, + "loss": 3.0383, + "step": 298 + }, + { + "epoch": 0.04680651221039449, + "grad_norm": 3.6288559436798096, + "learning_rate": 5.88e-05, + "loss": 2.9985, + "step": 299 + }, + { + "epoch": 0.046963055729492796, + "grad_norm": 2.0963525772094727, + "learning_rate": 5.9e-05, + "loss": 2.9894, + "step": 300 + }, + { + "epoch": 0.04711959924859111, + "grad_norm": 47.83268737792969, + "learning_rate": 5.92e-05, + "loss": 6.5786, + "step": 301 + }, + { + "epoch": 0.04727614276768942, + "grad_norm": 4.936108112335205, + "learning_rate": 5.94e-05, + "loss": 3.0658, + "step": 302 + }, + { + "epoch": 0.04743268628678773, + "grad_norm": 25.654502868652344, + "learning_rate": 5.96e-05, + "loss": 4.8207, + "step": 303 + }, + { + "epoch": 0.047589229805886035, + "grad_norm": 2.4520301818847656, + "learning_rate": 5.9800000000000003e-05, + "loss": 3.0223, + "step": 304 + }, + { + "epoch": 0.04774577332498434, + "grad_norm": 1.5844178199768066, + "learning_rate": 6e-05, + "loss": 3.0164, + "step": 305 + }, + { + "epoch": 0.04790231684408266, + "grad_norm": 
1.9270457029342651, + "learning_rate": 6.02e-05, + "loss": 3.0176, + "step": 306 + }, + { + "epoch": 0.048058860363180966, + "grad_norm": 1.2164374589920044, + "learning_rate": 6.04e-05, + "loss": 3.0149, + "step": 307 + }, + { + "epoch": 0.048215403882279274, + "grad_norm": 1.379023790359497, + "learning_rate": 6.06e-05, + "loss": 3.0479, + "step": 308 + }, + { + "epoch": 0.04837194740137758, + "grad_norm": 1.631529688835144, + "learning_rate": 6.08e-05, + "loss": 2.9724, + "step": 309 + }, + { + "epoch": 0.04852849092047589, + "grad_norm": 4.565341472625732, + "learning_rate": 6.1e-05, + "loss": 3.0777, + "step": 310 + }, + { + "epoch": 0.0486850344395742, + "grad_norm": 5.438143730163574, + "learning_rate": 6.12e-05, + "loss": 3.1023, + "step": 311 + }, + { + "epoch": 0.048841577958672514, + "grad_norm": 0.9749715924263, + "learning_rate": 6.14e-05, + "loss": 2.9575, + "step": 312 + }, + { + "epoch": 0.04899812147777082, + "grad_norm": 5.366253852844238, + "learning_rate": 6.16e-05, + "loss": 3.2184, + "step": 313 + }, + { + "epoch": 0.04915466499686913, + "grad_norm": 2.6491782665252686, + "learning_rate": 6.18e-05, + "loss": 3.07, + "step": 314 + }, + { + "epoch": 0.04931120851596744, + "grad_norm": 3.571376085281372, + "learning_rate": 6.2e-05, + "loss": 3.1648, + "step": 315 + }, + { + "epoch": 0.049467752035065746, + "grad_norm": 1.6718486547470093, + "learning_rate": 6.220000000000001e-05, + "loss": 3.1377, + "step": 316 + }, + { + "epoch": 0.04962429555416406, + "grad_norm": 8.639182090759277, + "learning_rate": 6.24e-05, + "loss": 3.4292, + "step": 317 + }, + { + "epoch": 0.04978083907326237, + "grad_norm": 3.1077969074249268, + "learning_rate": 6.26e-05, + "loss": 3.0532, + "step": 318 + }, + { + "epoch": 0.04993738259236068, + "grad_norm": 1.745723009109497, + "learning_rate": 6.280000000000001e-05, + "loss": 3.0162, + "step": 319 + }, + { + "epoch": 0.050093926111458985, + "grad_norm": 5.069284915924072, + "learning_rate": 6.3e-05, + "loss": 3.1562, + 
"step": 320 + }, + { + "epoch": 0.05025046963055729, + "grad_norm": 3.8137240409851074, + "learning_rate": 6.32e-05, + "loss": 3.0532, + "step": 321 + }, + { + "epoch": 0.0504070131496556, + "grad_norm": 6.193761348724365, + "learning_rate": 6.340000000000001e-05, + "loss": 3.2521, + "step": 322 + }, + { + "epoch": 0.050563556668753916, + "grad_norm": 4.709444999694824, + "learning_rate": 6.36e-05, + "loss": 3.2324, + "step": 323 + }, + { + "epoch": 0.050720100187852224, + "grad_norm": 1.3377902507781982, + "learning_rate": 6.38e-05, + "loss": 3.083, + "step": 324 + }, + { + "epoch": 0.05087664370695053, + "grad_norm": 0.9965777397155762, + "learning_rate": 6.400000000000001e-05, + "loss": 2.9347, + "step": 325 + }, + { + "epoch": 0.05103318722604884, + "grad_norm": 5.538130283355713, + "learning_rate": 6.42e-05, + "loss": 3.155, + "step": 326 + }, + { + "epoch": 0.05118973074514715, + "grad_norm": 7.558338165283203, + "learning_rate": 6.440000000000001e-05, + "loss": 3.2423, + "step": 327 + }, + { + "epoch": 0.05134627426424546, + "grad_norm": 8.109700202941895, + "learning_rate": 6.460000000000001e-05, + "loss": 3.3273, + "step": 328 + }, + { + "epoch": 0.05150281778334377, + "grad_norm": 8.236330032348633, + "learning_rate": 6.48e-05, + "loss": 3.2651, + "step": 329 + }, + { + "epoch": 0.05165936130244208, + "grad_norm": 5.092519283294678, + "learning_rate": 6.500000000000001e-05, + "loss": 3.1825, + "step": 330 + }, + { + "epoch": 0.05181590482154039, + "grad_norm": 5.976173400878906, + "learning_rate": 6.52e-05, + "loss": 3.1891, + "step": 331 + }, + { + "epoch": 0.051972448340638695, + "grad_norm": 1.4518507719039917, + "learning_rate": 6.54e-05, + "loss": 3.1047, + "step": 332 + }, + { + "epoch": 0.05212899185973701, + "grad_norm": 4.320392608642578, + "learning_rate": 6.560000000000001e-05, + "loss": 3.2545, + "step": 333 + }, + { + "epoch": 0.05228553537883532, + "grad_norm": 3.0381386280059814, + "learning_rate": 6.58e-05, + "loss": 3.1361, + "step": 334 
+ }, + { + "epoch": 0.052442078897933626, + "grad_norm": 5.30869197845459, + "learning_rate": 6.6e-05, + "loss": 3.3179, + "step": 335 + }, + { + "epoch": 0.052598622417031934, + "grad_norm": 2.355376958847046, + "learning_rate": 6.620000000000001e-05, + "loss": 3.0825, + "step": 336 + }, + { + "epoch": 0.05275516593613024, + "grad_norm": 4.259149074554443, + "learning_rate": 6.64e-05, + "loss": 3.227, + "step": 337 + }, + { + "epoch": 0.05291170945522855, + "grad_norm": 1.9587697982788086, + "learning_rate": 6.66e-05, + "loss": 3.1395, + "step": 338 + }, + { + "epoch": 0.053068252974326866, + "grad_norm": 2.6956167221069336, + "learning_rate": 6.680000000000001e-05, + "loss": 3.2229, + "step": 339 + }, + { + "epoch": 0.053224796493425174, + "grad_norm": 1.6882343292236328, + "learning_rate": 6.7e-05, + "loss": 3.0976, + "step": 340 + }, + { + "epoch": 0.05338134001252348, + "grad_norm": 1.5278631448745728, + "learning_rate": 6.720000000000001e-05, + "loss": 3.1665, + "step": 341 + }, + { + "epoch": 0.05353788353162179, + "grad_norm": 4.2343525886535645, + "learning_rate": 6.740000000000001e-05, + "loss": 3.2444, + "step": 342 + }, + { + "epoch": 0.0536944270507201, + "grad_norm": 2.8302063941955566, + "learning_rate": 6.76e-05, + "loss": 3.1507, + "step": 343 + }, + { + "epoch": 0.05385097056981841, + "grad_norm": 2.6727993488311768, + "learning_rate": 6.780000000000001e-05, + "loss": 3.1349, + "step": 344 + }, + { + "epoch": 0.05400751408891672, + "grad_norm": 2.1870620250701904, + "learning_rate": 6.800000000000001e-05, + "loss": 3.1589, + "step": 345 + }, + { + "epoch": 0.05416405760801503, + "grad_norm": 2.1801397800445557, + "learning_rate": 6.82e-05, + "loss": 3.0711, + "step": 346 + }, + { + "epoch": 0.05432060112711334, + "grad_norm": 1.859655737876892, + "learning_rate": 6.840000000000001e-05, + "loss": 3.0078, + "step": 347 + }, + { + "epoch": 0.054477144646211645, + "grad_norm": 2.038893938064575, + "learning_rate": 6.860000000000001e-05, + "loss": 
2.9257, + "step": 348 + }, + { + "epoch": 0.05463368816530995, + "grad_norm": 2.1303551197052, + "learning_rate": 6.879999999999999e-05, + "loss": 3.0635, + "step": 349 + }, + { + "epoch": 0.05479023168440827, + "grad_norm": 2.5755460262298584, + "learning_rate": 6.9e-05, + "loss": 2.9403, + "step": 350 + }, + { + "epoch": 0.054946775203506576, + "grad_norm": 4.361547470092773, + "learning_rate": 6.92e-05, + "loss": 2.9691, + "step": 351 + }, + { + "epoch": 0.055103318722604884, + "grad_norm": 7.480945587158203, + "learning_rate": 6.939999999999999e-05, + "loss": 3.3453, + "step": 352 + }, + { + "epoch": 0.05525986224170319, + "grad_norm": 11.388420104980469, + "learning_rate": 6.96e-05, + "loss": 3.6855, + "step": 353 + }, + { + "epoch": 0.0554164057608015, + "grad_norm": 1.398400902748108, + "learning_rate": 6.98e-05, + "loss": 2.9332, + "step": 354 + }, + { + "epoch": 0.055572949279899815, + "grad_norm": 1.444447636604309, + "learning_rate": 7e-05, + "loss": 2.9696, + "step": 355 + }, + { + "epoch": 0.05572949279899812, + "grad_norm": 1.109475016593933, + "learning_rate": 7.02e-05, + "loss": 2.9349, + "step": 356 + }, + { + "epoch": 0.05588603631809643, + "grad_norm": 0.9742197394371033, + "learning_rate": 7.04e-05, + "loss": 2.8856, + "step": 357 + }, + { + "epoch": 0.05604257983719474, + "grad_norm": 1.620835542678833, + "learning_rate": 7.06e-05, + "loss": 2.9019, + "step": 358 + }, + { + "epoch": 0.05619912335629305, + "grad_norm": 13.107057571411133, + "learning_rate": 7.08e-05, + "loss": 3.7182, + "step": 359 + }, + { + "epoch": 0.056355666875391355, + "grad_norm": 0.7456068396568298, + "learning_rate": 7.1e-05, + "loss": 2.9409, + "step": 360 + }, + { + "epoch": 0.05651221039448967, + "grad_norm": 13.15251350402832, + "learning_rate": 7.12e-05, + "loss": 3.671, + "step": 361 + }, + { + "epoch": 0.05666875391358798, + "grad_norm": 2.371724843978882, + "learning_rate": 7.14e-05, + "loss": 3.1249, + "step": 362 + }, + { + "epoch": 0.056825297432686286, + 
"grad_norm": 1.2083814144134521, + "learning_rate": 7.16e-05, + "loss": 2.932, + "step": 363 + }, + { + "epoch": 0.056981840951784594, + "grad_norm": 7.292394638061523, + "learning_rate": 7.18e-05, + "loss": 3.3882, + "step": 364 + }, + { + "epoch": 0.0571383844708829, + "grad_norm": 1.991248607635498, + "learning_rate": 7.2e-05, + "loss": 2.9134, + "step": 365 + }, + { + "epoch": 0.05729492798998122, + "grad_norm": 3.265146493911743, + "learning_rate": 7.22e-05, + "loss": 3.0689, + "step": 366 + }, + { + "epoch": 0.057451471509079526, + "grad_norm": 3.838470458984375, + "learning_rate": 7.24e-05, + "loss": 3.0629, + "step": 367 + }, + { + "epoch": 0.057608015028177834, + "grad_norm": 0.9954794645309448, + "learning_rate": 7.26e-05, + "loss": 2.9561, + "step": 368 + }, + { + "epoch": 0.05776455854727614, + "grad_norm": 1.3720093965530396, + "learning_rate": 7.280000000000001e-05, + "loss": 2.9339, + "step": 369 + }, + { + "epoch": 0.05792110206637445, + "grad_norm": 1.7053205966949463, + "learning_rate": 7.3e-05, + "loss": 2.9356, + "step": 370 + }, + { + "epoch": 0.058077645585472765, + "grad_norm": 0.8591640591621399, + "learning_rate": 7.32e-05, + "loss": 3.004, + "step": 371 + }, + { + "epoch": 0.05823418910457107, + "grad_norm": 1.1055607795715332, + "learning_rate": 7.340000000000001e-05, + "loss": 2.98, + "step": 372 + }, + { + "epoch": 0.05839073262366938, + "grad_norm": 4.056722640991211, + "learning_rate": 7.36e-05, + "loss": 3.1241, + "step": 373 + }, + { + "epoch": 0.05854727614276769, + "grad_norm": 14.80190372467041, + "learning_rate": 7.38e-05, + "loss": 3.8728, + "step": 374 + }, + { + "epoch": 0.058703819661866, + "grad_norm": 3.558429479598999, + "learning_rate": 7.4e-05, + "loss": 3.0338, + "step": 375 + }, + { + "epoch": 0.058860363180964305, + "grad_norm": 5.17110013961792, + "learning_rate": 7.42e-05, + "loss": 3.1475, + "step": 376 + }, + { + "epoch": 0.05901690670006262, + "grad_norm": 3.077529191970825, + "learning_rate": 7.44e-05, + 
"loss": 3.1154, + "step": 377 + }, + { + "epoch": 0.05917345021916093, + "grad_norm": 2.238739490509033, + "learning_rate": 7.46e-05, + "loss": 3.0501, + "step": 378 + }, + { + "epoch": 0.059329993738259236, + "grad_norm": 1.5538197755813599, + "learning_rate": 7.48e-05, + "loss": 3.0432, + "step": 379 + }, + { + "epoch": 0.059486537257357544, + "grad_norm": 4.06220006942749, + "learning_rate": 7.500000000000001e-05, + "loss": 3.166, + "step": 380 + }, + { + "epoch": 0.05964308077645585, + "grad_norm": 6.772061347961426, + "learning_rate": 7.52e-05, + "loss": 3.1971, + "step": 381 + }, + { + "epoch": 0.05979962429555417, + "grad_norm": 5.081016540527344, + "learning_rate": 7.54e-05, + "loss": 3.1373, + "step": 382 + }, + { + "epoch": 0.059956167814652475, + "grad_norm": 2.5491018295288086, + "learning_rate": 7.560000000000001e-05, + "loss": 2.9297, + "step": 383 + }, + { + "epoch": 0.06011271133375078, + "grad_norm": 0.9181234240531921, + "learning_rate": 7.58e-05, + "loss": 3.0536, + "step": 384 + }, + { + "epoch": 0.06026925485284909, + "grad_norm": 1.1351193189620972, + "learning_rate": 7.6e-05, + "loss": 3.0756, + "step": 385 + }, + { + "epoch": 0.0604257983719474, + "grad_norm": 4.452815055847168, + "learning_rate": 7.620000000000001e-05, + "loss": 3.1471, + "step": 386 + }, + { + "epoch": 0.06058234189104571, + "grad_norm": 1.5569177865982056, + "learning_rate": 7.64e-05, + "loss": 2.9956, + "step": 387 + }, + { + "epoch": 0.06073888541014402, + "grad_norm": 2.090049982070923, + "learning_rate": 7.66e-05, + "loss": 3.057, + "step": 388 + }, + { + "epoch": 0.06089542892924233, + "grad_norm": 2.666649580001831, + "learning_rate": 7.680000000000001e-05, + "loss": 3.0938, + "step": 389 + }, + { + "epoch": 0.06105197244834064, + "grad_norm": 4.774649620056152, + "learning_rate": 7.7e-05, + "loss": 3.0927, + "step": 390 + }, + { + "epoch": 0.061208515967438946, + "grad_norm": 4.450115203857422, + "learning_rate": 7.72e-05, + "loss": 3.2131, + "step": 391 + }, + { + 
"epoch": 0.061365059486537255, + "grad_norm": 4.115021228790283, + "learning_rate": 7.740000000000001e-05, + "loss": 3.1859, + "step": 392 + }, + { + "epoch": 0.06152160300563557, + "grad_norm": 5.35309362411499, + "learning_rate": 7.76e-05, + "loss": 3.2383, + "step": 393 + }, + { + "epoch": 0.06167814652473388, + "grad_norm": 3.8355298042297363, + "learning_rate": 7.780000000000001e-05, + "loss": 2.9918, + "step": 394 + }, + { + "epoch": 0.061834690043832186, + "grad_norm": 2.3736090660095215, + "learning_rate": 7.800000000000001e-05, + "loss": 3.047, + "step": 395 + }, + { + "epoch": 0.061991233562930494, + "grad_norm": 3.8014211654663086, + "learning_rate": 7.82e-05, + "loss": 3.0161, + "step": 396 + }, + { + "epoch": 0.0621477770820288, + "grad_norm": 2.919722080230713, + "learning_rate": 7.840000000000001e-05, + "loss": 2.8762, + "step": 397 + }, + { + "epoch": 0.06230432060112712, + "grad_norm": 3.160149097442627, + "learning_rate": 7.860000000000001e-05, + "loss": 3.0907, + "step": 398 + }, + { + "epoch": 0.062460864120225425, + "grad_norm": 1.9981915950775146, + "learning_rate": 7.88e-05, + "loss": 2.9243, + "step": 399 + }, + { + "epoch": 0.06261740763932373, + "grad_norm": 3.9645607471466064, + "learning_rate": 7.900000000000001e-05, + "loss": 2.7801, + "step": 400 + }, + { + "epoch": 0.06277395115842205, + "grad_norm": 23.070011138916016, + "learning_rate": 7.920000000000001e-05, + "loss": 4.6197, + "step": 401 + }, + { + "epoch": 0.06293049467752035, + "grad_norm": 4.538593292236328, + "learning_rate": 7.94e-05, + "loss": 2.9935, + "step": 402 + }, + { + "epoch": 0.06308703819661866, + "grad_norm": 5.76887321472168, + "learning_rate": 7.960000000000001e-05, + "loss": 3.1494, + "step": 403 + }, + { + "epoch": 0.06324358171571696, + "grad_norm": 1.5008190870285034, + "learning_rate": 7.98e-05, + "loss": 2.9604, + "step": 404 + }, + { + "epoch": 0.06340012523481528, + "grad_norm": 1.2106801271438599, + "learning_rate": 8e-05, + "loss": 2.9207, + "step": 
405 + }, + { + "epoch": 0.06355666875391358, + "grad_norm": 1.0942622423171997, + "learning_rate": 8.020000000000001e-05, + "loss": 2.9469, + "step": 406 + }, + { + "epoch": 0.0637132122730119, + "grad_norm": 1.501484990119934, + "learning_rate": 8.04e-05, + "loss": 2.8845, + "step": 407 + }, + { + "epoch": 0.06386975579211021, + "grad_norm": 3.8120570182800293, + "learning_rate": 8.060000000000001e-05, + "loss": 2.9764, + "step": 408 + }, + { + "epoch": 0.06402629931120851, + "grad_norm": 2.950920820236206, + "learning_rate": 8.080000000000001e-05, + "loss": 2.8994, + "step": 409 + }, + { + "epoch": 0.06418284283030683, + "grad_norm": 1.6383147239685059, + "learning_rate": 8.1e-05, + "loss": 2.8723, + "step": 410 + }, + { + "epoch": 0.06433938634940513, + "grad_norm": 0.8971958756446838, + "learning_rate": 8.120000000000001e-05, + "loss": 2.9714, + "step": 411 + }, + { + "epoch": 0.06449592986850344, + "grad_norm": 6.733994007110596, + "learning_rate": 8.14e-05, + "loss": 3.354, + "step": 412 + }, + { + "epoch": 0.06465247338760176, + "grad_norm": 1.206620454788208, + "learning_rate": 8.16e-05, + "loss": 2.9594, + "step": 413 + }, + { + "epoch": 0.06480901690670006, + "grad_norm": 0.4296848773956299, + "learning_rate": 8.18e-05, + "loss": 2.8918, + "step": 414 + }, + { + "epoch": 0.06496556042579837, + "grad_norm": 0.5171148777008057, + "learning_rate": 8.2e-05, + "loss": 2.9055, + "step": 415 + }, + { + "epoch": 0.06512210394489668, + "grad_norm": 2.334181308746338, + "learning_rate": 8.22e-05, + "loss": 2.8987, + "step": 416 + }, + { + "epoch": 0.06527864746399499, + "grad_norm": 1.5807446241378784, + "learning_rate": 8.24e-05, + "loss": 2.9125, + "step": 417 + }, + { + "epoch": 0.0654351909830933, + "grad_norm": 0.6922141909599304, + "learning_rate": 8.26e-05, + "loss": 2.8867, + "step": 418 + }, + { + "epoch": 0.0655917345021916, + "grad_norm": 1.6518949270248413, + "learning_rate": 8.28e-05, + "loss": 2.9798, + "step": 419 + }, + { + "epoch": 
0.06574827802128992, + "grad_norm": 1.8079628944396973, + "learning_rate": 8.3e-05, + "loss": 2.9021, + "step": 420 + }, + { + "epoch": 0.06590482154038822, + "grad_norm": 0.9470207691192627, + "learning_rate": 8.32e-05, + "loss": 2.9432, + "step": 421 + }, + { + "epoch": 0.06606136505948654, + "grad_norm": 1.8708336353302002, + "learning_rate": 8.34e-05, + "loss": 2.9965, + "step": 422 + }, + { + "epoch": 0.06621790857858485, + "grad_norm": 0.8324593305587769, + "learning_rate": 8.36e-05, + "loss": 2.8895, + "step": 423 + }, + { + "epoch": 0.06637445209768315, + "grad_norm": 2.2902581691741943, + "learning_rate": 8.38e-05, + "loss": 2.9819, + "step": 424 + }, + { + "epoch": 0.06653099561678147, + "grad_norm": 5.253636360168457, + "learning_rate": 8.4e-05, + "loss": 3.1121, + "step": 425 + }, + { + "epoch": 0.06668753913587977, + "grad_norm": 2.256481647491455, + "learning_rate": 8.42e-05, + "loss": 2.9855, + "step": 426 + }, + { + "epoch": 0.06684408265497808, + "grad_norm": 0.6181924343109131, + "learning_rate": 8.44e-05, + "loss": 2.908, + "step": 427 + }, + { + "epoch": 0.0670006261740764, + "grad_norm": 1.7374522686004639, + "learning_rate": 8.46e-05, + "loss": 3.0154, + "step": 428 + }, + { + "epoch": 0.0671571696931747, + "grad_norm": 1.786274790763855, + "learning_rate": 8.48e-05, + "loss": 2.962, + "step": 429 + }, + { + "epoch": 0.06731371321227302, + "grad_norm": 2.908925771713257, + "learning_rate": 8.5e-05, + "loss": 3.0107, + "step": 430 + }, + { + "epoch": 0.06747025673137132, + "grad_norm": 0.7813953757286072, + "learning_rate": 8.52e-05, + "loss": 2.916, + "step": 431 + }, + { + "epoch": 0.06762680025046963, + "grad_norm": 3.5205273628234863, + "learning_rate": 8.54e-05, + "loss": 3.0515, + "step": 432 + }, + { + "epoch": 0.06778334376956793, + "grad_norm": 3.3851892948150635, + "learning_rate": 8.560000000000001e-05, + "loss": 3.0648, + "step": 433 + }, + { + "epoch": 0.06793988728866625, + "grad_norm": 1.0928279161453247, + "learning_rate": 
8.58e-05, + "loss": 2.938, + "step": 434 + }, + { + "epoch": 0.06809643080776456, + "grad_norm": 1.0478990077972412, + "learning_rate": 8.6e-05, + "loss": 3.0415, + "step": 435 + }, + { + "epoch": 0.06825297432686286, + "grad_norm": 2.884532928466797, + "learning_rate": 8.620000000000001e-05, + "loss": 3.0026, + "step": 436 + }, + { + "epoch": 0.06840951784596118, + "grad_norm": 1.3973681926727295, + "learning_rate": 8.64e-05, + "loss": 3.1129, + "step": 437 + }, + { + "epoch": 0.06856606136505948, + "grad_norm": 3.250415563583374, + "learning_rate": 8.66e-05, + "loss": 3.0229, + "step": 438 + }, + { + "epoch": 0.0687226048841578, + "grad_norm": 2.605431318283081, + "learning_rate": 8.680000000000001e-05, + "loss": 2.9444, + "step": 439 + }, + { + "epoch": 0.06887914840325611, + "grad_norm": NaN, + "learning_rate": 8.680000000000001e-05, + "loss": 0.0, + "step": 440 + }, + { + "epoch": 0.06903569192235441, + "grad_norm": 2.8638381958007812, + "learning_rate": 8.7e-05, + "loss": 2.9764, + "step": 441 + }, + { + "epoch": 0.06919223544145273, + "grad_norm": 1.3199659585952759, + "learning_rate": 8.72e-05, + "loss": 3.0154, + "step": 442 + }, + { + "epoch": 0.06934877896055103, + "grad_norm": 3.88041353225708, + "learning_rate": 8.740000000000001e-05, + "loss": 3.1617, + "step": 443 + }, + { + "epoch": 0.06950532247964934, + "grad_norm": 4.255744934082031, + "learning_rate": 8.76e-05, + "loss": 2.8631, + "step": 444 + }, + { + "epoch": 0.06966186599874766, + "grad_norm": 5.390447616577148, + "learning_rate": 8.78e-05, + "loss": 3.0866, + "step": 445 + }, + { + "epoch": 0.06981840951784596, + "grad_norm": 2.7069644927978516, + "learning_rate": 8.800000000000001e-05, + "loss": 2.7761, + "step": 446 + }, + { + "epoch": 0.06997495303694427, + "grad_norm": 1.6831870079040527, + "learning_rate": 8.82e-05, + "loss": 2.6755, + "step": 447 + }, + { + "epoch": 0.07013149655604257, + "grad_norm": 2.929691791534424, + "learning_rate": 8.840000000000001e-05, + "loss": 2.6986, + 
"step": 448 + }, + { + "epoch": 0.07028804007514089, + "grad_norm": 1.4641650915145874, + "learning_rate": 8.86e-05, + "loss": 2.6563, + "step": 449 + }, + { + "epoch": 0.0704445835942392, + "grad_norm": 2.8799333572387695, + "learning_rate": 8.88e-05, + "loss": 2.6042, + "step": 450 + }, + { + "epoch": 0.0706011271133375, + "grad_norm": 1.3799349069595337, + "learning_rate": 8.900000000000001e-05, + "loss": 2.8924, + "step": 451 + }, + { + "epoch": 0.07075767063243582, + "grad_norm": 7.687436103820801, + "learning_rate": 8.92e-05, + "loss": 3.3758, + "step": 452 + }, + { + "epoch": 0.07091421415153412, + "grad_norm": 0.787386417388916, + "learning_rate": 8.94e-05, + "loss": 2.9233, + "step": 453 + }, + { + "epoch": 0.07107075767063244, + "grad_norm": 1.0670377016067505, + "learning_rate": 8.960000000000001e-05, + "loss": 2.9148, + "step": 454 + }, + { + "epoch": 0.07122730118973075, + "grad_norm": 0.6946200728416443, + "learning_rate": 8.98e-05, + "loss": 2.8947, + "step": 455 + }, + { + "epoch": 0.07138384470882905, + "grad_norm": 1.7103030681610107, + "learning_rate": 9e-05, + "loss": 2.852, + "step": 456 + }, + { + "epoch": 0.07154038822792737, + "grad_norm": 1.9751858711242676, + "learning_rate": 9.020000000000001e-05, + "loss": 2.8435, + "step": 457 + }, + { + "epoch": 0.07169693174702567, + "grad_norm": 1.1818535327911377, + "learning_rate": 9.04e-05, + "loss": 2.8556, + "step": 458 + }, + { + "epoch": 0.07185347526612398, + "grad_norm": 0.7181888818740845, + "learning_rate": 9.06e-05, + "loss": 2.9155, + "step": 459 + }, + { + "epoch": 0.07201001878522229, + "grad_norm": 1.3990308046340942, + "learning_rate": 9.080000000000001e-05, + "loss": 2.8455, + "step": 460 + }, + { + "epoch": 0.0721665623043206, + "grad_norm": 2.0379927158355713, + "learning_rate": 9.1e-05, + "loss": 2.8767, + "step": 461 + }, + { + "epoch": 0.07232310582341892, + "grad_norm": 1.3799035549163818, + "learning_rate": 9.120000000000001e-05, + "loss": 2.9, + "step": 462 + }, + { + 
"epoch": 0.07247964934251722, + "grad_norm": 4.234987258911133, + "learning_rate": 9.140000000000001e-05, + "loss": 2.9943, + "step": 463 + }, + { + "epoch": 0.07263619286161553, + "grad_norm": 3.2387020587921143, + "learning_rate": 9.16e-05, + "loss": 2.9544, + "step": 464 + }, + { + "epoch": 0.07279273638071383, + "grad_norm": 2.4125988483428955, + "learning_rate": 9.180000000000001e-05, + "loss": 2.8608, + "step": 465 + }, + { + "epoch": 0.07294927989981215, + "grad_norm": 2.1635520458221436, + "learning_rate": 9.200000000000001e-05, + "loss": 2.9061, + "step": 466 + }, + { + "epoch": 0.07310582341891046, + "grad_norm": 1.2092010974884033, + "learning_rate": 9.22e-05, + "loss": 2.8707, + "step": 467 + }, + { + "epoch": 0.07326236693800876, + "grad_norm": 2.8443586826324463, + "learning_rate": 9.240000000000001e-05, + "loss": 3.0235, + "step": 468 + }, + { + "epoch": 0.07341891045710708, + "grad_norm": 2.6111397743225098, + "learning_rate": 9.260000000000001e-05, + "loss": 3.0029, + "step": 469 + }, + { + "epoch": 0.07357545397620538, + "grad_norm": 0.5747168660163879, + "learning_rate": 9.28e-05, + "loss": 2.9219, + "step": 470 + }, + { + "epoch": 0.0737319974953037, + "grad_norm": 5.140597343444824, + "learning_rate": 9.300000000000001e-05, + "loss": 3.1017, + "step": 471 + }, + { + "epoch": 0.07388854101440201, + "grad_norm": 1.6809124946594238, + "learning_rate": 9.320000000000002e-05, + "loss": 2.9594, + "step": 472 + }, + { + "epoch": 0.07404508453350031, + "grad_norm": 2.4442269802093506, + "learning_rate": 9.340000000000001e-05, + "loss": 3.0333, + "step": 473 + }, + { + "epoch": 0.07420162805259863, + "grad_norm": 3.128978967666626, + "learning_rate": 9.360000000000001e-05, + "loss": 2.9079, + "step": 474 + }, + { + "epoch": 0.07435817157169693, + "grad_norm": 3.0268845558166504, + "learning_rate": 9.38e-05, + "loss": 2.9297, + "step": 475 + }, + { + "epoch": 0.07451471509079524, + "grad_norm": 4.046169757843018, + "learning_rate": 9.4e-05, + "loss": 
3.0158, + "step": 476 + }, + { + "epoch": 0.07467125860989356, + "grad_norm": 1.329559326171875, + "learning_rate": 9.42e-05, + "loss": 2.8907, + "step": 477 + }, + { + "epoch": 0.07482780212899186, + "grad_norm": 1.0470929145812988, + "learning_rate": 9.44e-05, + "loss": 2.9147, + "step": 478 + }, + { + "epoch": 0.07498434564809017, + "grad_norm": 2.834249973297119, + "learning_rate": 9.46e-05, + "loss": 2.8957, + "step": 479 + }, + { + "epoch": 0.07514088916718847, + "grad_norm": 2.2937402725219727, + "learning_rate": 9.48e-05, + "loss": 2.9037, + "step": 480 + }, + { + "epoch": 0.07529743268628679, + "grad_norm": 11.690685272216797, + "learning_rate": 9.5e-05, + "loss": 3.4954, + "step": 481 + }, + { + "epoch": 0.0754539762053851, + "grad_norm": 1.7081512212753296, + "learning_rate": 9.52e-05, + "loss": 2.8782, + "step": 482 + }, + { + "epoch": 0.0756105197244834, + "grad_norm": 1.4048514366149902, + "learning_rate": 9.54e-05, + "loss": 2.9336, + "step": 483 + }, + { + "epoch": 0.07576706324358172, + "grad_norm": 1.2520028352737427, + "learning_rate": 9.56e-05, + "loss": 2.9178, + "step": 484 + }, + { + "epoch": 0.07592360676268002, + "grad_norm": 1.4318352937698364, + "learning_rate": 9.58e-05, + "loss": 2.921, + "step": 485 + }, + { + "epoch": 0.07608015028177834, + "grad_norm": 1.8462992906570435, + "learning_rate": 9.6e-05, + "loss": 2.8308, + "step": 486 + }, + { + "epoch": 0.07623669380087664, + "grad_norm": 3.258965253829956, + "learning_rate": 9.620000000000001e-05, + "loss": 3.0323, + "step": 487 + }, + { + "epoch": 0.07639323731997495, + "grad_norm": 2.1273250579833984, + "learning_rate": 9.64e-05, + "loss": 2.8649, + "step": 488 + }, + { + "epoch": 0.07654978083907327, + "grad_norm": 3.1350739002227783, + "learning_rate": 9.66e-05, + "loss": 2.9396, + "step": 489 + }, + { + "epoch": 0.07670632435817157, + "grad_norm": 2.9007985591888428, + "learning_rate": 9.680000000000001e-05, + "loss": 2.7319, + "step": 490 + }, + { + "epoch": 0.07686286787726988, 
+ "grad_norm": 2.0893924236297607, + "learning_rate": 9.7e-05, + "loss": 2.8276, + "step": 491 + }, + { + "epoch": 0.07701941139636818, + "grad_norm": 4.586045265197754, + "learning_rate": 9.72e-05, + "loss": 2.7048, + "step": 492 + }, + { + "epoch": 0.0771759549154665, + "grad_norm": 1.9299709796905518, + "learning_rate": 9.74e-05, + "loss": 2.8418, + "step": 493 + }, + { + "epoch": 0.07733249843456481, + "grad_norm": 1.7819651365280151, + "learning_rate": 9.76e-05, + "loss": 2.7241, + "step": 494 + }, + { + "epoch": 0.07748904195366312, + "grad_norm": 1.3945015668869019, + "learning_rate": 9.78e-05, + "loss": 2.571, + "step": 495 + }, + { + "epoch": 0.07764558547276143, + "grad_norm": 1.4951081275939941, + "learning_rate": 9.8e-05, + "loss": 2.5154, + "step": 496 + }, + { + "epoch": 0.07780212899185973, + "grad_norm": 2.0147128105163574, + "learning_rate": 9.82e-05, + "loss": 2.5413, + "step": 497 + }, + { + "epoch": 0.07795867251095805, + "grad_norm": 1.8145289421081543, + "learning_rate": 9.84e-05, + "loss": 2.4091, + "step": 498 + }, + { + "epoch": 0.07811521603005636, + "grad_norm": 1.7071666717529297, + "learning_rate": 9.86e-05, + "loss": 2.32, + "step": 499 + }, + { + "epoch": 0.07827175954915466, + "grad_norm": 3.2004926204681396, + "learning_rate": 9.88e-05, + "loss": 2.3265, + "step": 500 + }, + { + "epoch": 0.07842830306825298, + "grad_norm": 2.753878355026245, + "learning_rate": 9.900000000000001e-05, + "loss": 2.9608, + "step": 501 + }, + { + "epoch": 0.07858484658735128, + "grad_norm": 1.7243417501449585, + "learning_rate": 9.92e-05, + "loss": 2.9093, + "step": 502 + }, + { + "epoch": 0.0787413901064496, + "grad_norm": 1.4103032350540161, + "learning_rate": 9.94e-05, + "loss": 2.8596, + "step": 503 + }, + { + "epoch": 0.07889793362554791, + "grad_norm": 1.6483265161514282, + "learning_rate": 9.960000000000001e-05, + "loss": 2.9604, + "step": 504 + }, + { + "epoch": 0.07905447714464621, + "grad_norm": 1.3204388618469238, + "learning_rate": 9.98e-05, 
+ "loss": 2.8613, + "step": 505 + }, + { + "epoch": 0.07921102066374452, + "grad_norm": 1.0860481262207031, + "learning_rate": 0.0001, + "loss": 2.8665, + "step": 506 + }, + { + "epoch": 0.07936756418284283, + "grad_norm": 1.3780769109725952, + "learning_rate": 9.99918540241121e-05, + "loss": 2.8288, + "step": 507 + }, + { + "epoch": 0.07952410770194114, + "grad_norm": 0.9804167747497559, + "learning_rate": 9.998370804822417e-05, + "loss": 2.8334, + "step": 508 + }, + { + "epoch": 0.07968065122103946, + "grad_norm": 2.0898029804229736, + "learning_rate": 9.997556207233627e-05, + "loss": 2.8609, + "step": 509 + }, + { + "epoch": 0.07983719474013776, + "grad_norm": 4.400528907775879, + "learning_rate": 9.996741609644837e-05, + "loss": 2.941, + "step": 510 + }, + { + "epoch": 0.07999373825923607, + "grad_norm": 0.7346179485321045, + "learning_rate": 9.995927012056045e-05, + "loss": 2.843, + "step": 511 + }, + { + "epoch": 0.08015028177833437, + "grad_norm": 0.8281723260879517, + "learning_rate": 9.995112414467253e-05, + "loss": 2.8722, + "step": 512 + }, + { + "epoch": 0.08030682529743269, + "grad_norm": 0.6123642325401306, + "learning_rate": 9.994297816878463e-05, + "loss": 2.8533, + "step": 513 + }, + { + "epoch": 0.08046336881653099, + "grad_norm": 1.5796165466308594, + "learning_rate": 9.993483219289672e-05, + "loss": 2.8654, + "step": 514 + }, + { + "epoch": 0.0806199123356293, + "grad_norm": 0.698348879814148, + "learning_rate": 9.99266862170088e-05, + "loss": 2.833, + "step": 515 + }, + { + "epoch": 0.08077645585472762, + "grad_norm": 1.345316767692566, + "learning_rate": 9.99185402411209e-05, + "loss": 2.9522, + "step": 516 + }, + { + "epoch": 0.08093299937382592, + "grad_norm": 1.6996620893478394, + "learning_rate": 9.991039426523298e-05, + "loss": 2.8663, + "step": 517 + }, + { + "epoch": 0.08108954289292424, + "grad_norm": 2.2054755687713623, + "learning_rate": 9.990224828934506e-05, + "loss": 2.9609, + "step": 518 + }, + { + "epoch": 0.08124608641202254, + 
"grad_norm": 1.0510523319244385, + "learning_rate": 9.989410231345716e-05, + "loss": 2.8631, + "step": 519 + }, + { + "epoch": 0.08140262993112085, + "grad_norm": 0.9775800704956055, + "learning_rate": 9.988595633756925e-05, + "loss": 2.8314, + "step": 520 + }, + { + "epoch": 0.08155917345021917, + "grad_norm": 0.601208508014679, + "learning_rate": 9.987781036168133e-05, + "loss": 2.9251, + "step": 521 + }, + { + "epoch": 0.08171571696931747, + "grad_norm": 0.6896950602531433, + "learning_rate": 9.986966438579343e-05, + "loss": 2.858, + "step": 522 + }, + { + "epoch": 0.08187226048841578, + "grad_norm": 1.524906873703003, + "learning_rate": 9.986151840990551e-05, + "loss": 2.8574, + "step": 523 + }, + { + "epoch": 0.08202880400751408, + "grad_norm": 1.004073143005371, + "learning_rate": 9.98533724340176e-05, + "loss": 2.889, + "step": 524 + }, + { + "epoch": 0.0821853475266124, + "grad_norm": 2.8993184566497803, + "learning_rate": 9.984522645812969e-05, + "loss": 2.8799, + "step": 525 + }, + { + "epoch": 0.08234189104571071, + "grad_norm": 1.9987448453903198, + "learning_rate": 9.983708048224177e-05, + "loss": 2.7768, + "step": 526 + }, + { + "epoch": 0.08249843456480901, + "grad_norm": 7.786111831665039, + "learning_rate": 9.982893450635387e-05, + "loss": 3.2986, + "step": 527 + }, + { + "epoch": 0.08265497808390733, + "grad_norm": 0.7437018752098083, + "learning_rate": 9.982078853046596e-05, + "loss": 2.8405, + "step": 528 + }, + { + "epoch": 0.08281152160300563, + "grad_norm": 1.3410731554031372, + "learning_rate": 9.981264255457804e-05, + "loss": 2.827, + "step": 529 + }, + { + "epoch": 0.08296806512210395, + "grad_norm": 2.9547839164733887, + "learning_rate": 9.980449657869014e-05, + "loss": 2.9319, + "step": 530 + }, + { + "epoch": 0.08312460864120226, + "grad_norm": 2.733804941177368, + "learning_rate": 9.979635060280222e-05, + "loss": 2.8708, + "step": 531 + }, + { + "epoch": 0.08328115216030056, + "grad_norm": 1.2816842794418335, + "learning_rate": 
9.97882046269143e-05, + "loss": 2.7212, + "step": 532 + }, + { + "epoch": 0.08343769567939888, + "grad_norm": 1.2535456418991089, + "learning_rate": 9.97800586510264e-05, + "loss": 2.9317, + "step": 533 + }, + { + "epoch": 0.08359423919849718, + "grad_norm": 1.3349040746688843, + "learning_rate": 9.977191267513849e-05, + "loss": 2.693, + "step": 534 + }, + { + "epoch": 0.08375078271759549, + "grad_norm": 1.6219457387924194, + "learning_rate": 9.976376669925057e-05, + "loss": 2.7893, + "step": 535 + }, + { + "epoch": 0.08390732623669381, + "grad_norm": 1.4069089889526367, + "learning_rate": 9.975562072336267e-05, + "loss": 2.6888, + "step": 536 + }, + { + "epoch": 0.08406386975579211, + "grad_norm": 2.4547863006591797, + "learning_rate": 9.974747474747475e-05, + "loss": 2.6845, + "step": 537 + }, + { + "epoch": 0.08422041327489042, + "grad_norm": 2.0938034057617188, + "learning_rate": 9.973932877158683e-05, + "loss": 2.8088, + "step": 538 + }, + { + "epoch": 0.08437695679398872, + "grad_norm": 1.9214909076690674, + "learning_rate": 9.973118279569893e-05, + "loss": 2.6797, + "step": 539 + }, + { + "epoch": 0.08453350031308704, + "grad_norm": 1.4723831415176392, + "learning_rate": 9.972303681981103e-05, + "loss": 2.7537, + "step": 540 + }, + { + "epoch": 0.08469004383218534, + "grad_norm": 3.078045129776001, + "learning_rate": 9.97148908439231e-05, + "loss": 2.7532, + "step": 541 + }, + { + "epoch": 0.08484658735128366, + "grad_norm": 2.2700068950653076, + "learning_rate": 9.97067448680352e-05, + "loss": 2.705, + "step": 542 + }, + { + "epoch": 0.08500313087038197, + "grad_norm": 1.4482707977294922, + "learning_rate": 9.96985988921473e-05, + "loss": 2.5877, + "step": 543 + }, + { + "epoch": 0.08515967438948027, + "grad_norm": 1.293514370918274, + "learning_rate": 9.969045291625936e-05, + "loss": 2.5548, + "step": 544 + }, + { + "epoch": 0.08531621790857859, + "grad_norm": 1.434036135673523, + "learning_rate": 9.968230694037146e-05, + "loss": 2.5587, + "step": 545 + }, 
+ { + "epoch": 0.08547276142767689, + "grad_norm": 2.4284398555755615, + "learning_rate": 9.967416096448356e-05, + "loss": 2.2661, + "step": 546 + }, + { + "epoch": 0.0856293049467752, + "grad_norm": 2.210675001144409, + "learning_rate": 9.966601498859564e-05, + "loss": 2.4346, + "step": 547 + }, + { + "epoch": 0.08578584846587352, + "grad_norm": 2.8863906860351562, + "learning_rate": 9.965786901270773e-05, + "loss": 2.4851, + "step": 548 + }, + { + "epoch": 0.08594239198497182, + "grad_norm": 1.6791318655014038, + "learning_rate": 9.964972303681982e-05, + "loss": 2.2914, + "step": 549 + }, + { + "epoch": 0.08609893550407013, + "grad_norm": 1.6521124839782715, + "learning_rate": 9.96415770609319e-05, + "loss": 2.3938, + "step": 550 + }, + { + "epoch": 0.08625547902316844, + "grad_norm": 4.407156944274902, + "learning_rate": 9.963343108504399e-05, + "loss": 2.9793, + "step": 551 + }, + { + "epoch": 0.08641202254226675, + "grad_norm": 3.2600412368774414, + "learning_rate": 9.962528510915609e-05, + "loss": 2.9622, + "step": 552 + }, + { + "epoch": 0.08656856606136507, + "grad_norm": 1.5560517311096191, + "learning_rate": 9.961713913326817e-05, + "loss": 2.9193, + "step": 553 + }, + { + "epoch": 0.08672510958046337, + "grad_norm": 1.01088547706604, + "learning_rate": 9.960899315738026e-05, + "loss": 2.902, + "step": 554 + }, + { + "epoch": 0.08688165309956168, + "grad_norm": 0.6974119544029236, + "learning_rate": 9.960084718149235e-05, + "loss": 2.8476, + "step": 555 + }, + { + "epoch": 0.08703819661865998, + "grad_norm": 0.671410322189331, + "learning_rate": 9.959270120560444e-05, + "loss": 2.855, + "step": 556 + }, + { + "epoch": 0.0871947401377583, + "grad_norm": 1.5326281785964966, + "learning_rate": 9.958455522971652e-05, + "loss": 2.8233, + "step": 557 + }, + { + "epoch": 0.08735128365685661, + "grad_norm": 8.423707008361816, + "learning_rate": 9.957640925382862e-05, + "loss": 3.2744, + "step": 558 + }, + { + "epoch": 0.08750782717595491, + "grad_norm": 
1.3894635438919067, + "learning_rate": 9.95682632779407e-05, + "loss": 2.8786, + "step": 559 + }, + { + "epoch": 0.08766437069505323, + "grad_norm": 0.8202598690986633, + "learning_rate": 9.956011730205278e-05, + "loss": 2.8706, + "step": 560 + }, + { + "epoch": 0.08782091421415153, + "grad_norm": 0.7782045006752014, + "learning_rate": 9.955197132616488e-05, + "loss": 2.8537, + "step": 561 + }, + { + "epoch": 0.08797745773324984, + "grad_norm": 0.9023594856262207, + "learning_rate": 9.954382535027697e-05, + "loss": 2.8263, + "step": 562 + }, + { + "epoch": 0.08813400125234815, + "grad_norm": 0.6753450632095337, + "learning_rate": 9.953567937438906e-05, + "loss": 2.8357, + "step": 563 + }, + { + "epoch": 0.08829054477144646, + "grad_norm": 1.2191085815429688, + "learning_rate": 9.952753339850115e-05, + "loss": 2.9041, + "step": 564 + }, + { + "epoch": 0.08844708829054478, + "grad_norm": 1.6364171504974365, + "learning_rate": 9.951938742261323e-05, + "loss": 2.8138, + "step": 565 + }, + { + "epoch": 0.08860363180964308, + "grad_norm": 0.642795741558075, + "learning_rate": 9.951124144672533e-05, + "loss": 2.8561, + "step": 566 + }, + { + "epoch": 0.08876017532874139, + "grad_norm": 0.7554330229759216, + "learning_rate": 9.950309547083741e-05, + "loss": 2.8742, + "step": 567 + }, + { + "epoch": 0.08891671884783969, + "grad_norm": 1.6295794248580933, + "learning_rate": 9.94949494949495e-05, + "loss": 2.9231, + "step": 568 + }, + { + "epoch": 0.08907326236693801, + "grad_norm": 1.1250358819961548, + "learning_rate": 9.948680351906159e-05, + "loss": 2.8369, + "step": 569 + }, + { + "epoch": 0.08922980588603632, + "grad_norm": 1.4467201232910156, + "learning_rate": 9.947865754317368e-05, + "loss": 2.854, + "step": 570 + }, + { + "epoch": 0.08938634940513462, + "grad_norm": 1.215562343597412, + "learning_rate": 9.947051156728576e-05, + "loss": 2.8763, + "step": 571 + }, + { + "epoch": 0.08954289292423294, + "grad_norm": 1.5266473293304443, + "learning_rate": 
9.946236559139786e-05, + "loss": 2.8331, + "step": 572 + }, + { + "epoch": 0.08969943644333124, + "grad_norm": 2.6568052768707275, + "learning_rate": 9.945421961550994e-05, + "loss": 2.8956, + "step": 573 + }, + { + "epoch": 0.08985597996242956, + "grad_norm": 1.211342692375183, + "learning_rate": 9.944607363962203e-05, + "loss": 2.7712, + "step": 574 + }, + { + "epoch": 0.09001252348152787, + "grad_norm": 1.8281949758529663, + "learning_rate": 9.943792766373412e-05, + "loss": 2.8415, + "step": 575 + }, + { + "epoch": 0.09016906700062617, + "grad_norm": 3.124251127243042, + "learning_rate": 9.942978168784622e-05, + "loss": 2.8172, + "step": 576 + }, + { + "epoch": 0.09032561051972449, + "grad_norm": 3.2173542976379395, + "learning_rate": 9.942163571195829e-05, + "loss": 2.894, + "step": 577 + }, + { + "epoch": 0.09048215403882279, + "grad_norm": 0.773543119430542, + "learning_rate": 9.941348973607039e-05, + "loss": 2.894, + "step": 578 + }, + { + "epoch": 0.0906386975579211, + "grad_norm": 1.2210344076156616, + "learning_rate": 9.940534376018248e-05, + "loss": 2.7707, + "step": 579 + }, + { + "epoch": 0.09079524107701942, + "grad_norm": 5.238592147827148, + "learning_rate": 9.939719778429455e-05, + "loss": 2.959, + "step": 580 + }, + { + "epoch": 0.09095178459611772, + "grad_norm": 1.1681221723556519, + "learning_rate": 9.938905180840665e-05, + "loss": 2.6702, + "step": 581 + }, + { + "epoch": 0.09110832811521603, + "grad_norm": 0.7689877152442932, + "learning_rate": 9.938090583251875e-05, + "loss": 2.7409, + "step": 582 + }, + { + "epoch": 0.09126487163431433, + "grad_norm": 1.8897547721862793, + "learning_rate": 9.937275985663082e-05, + "loss": 2.8709, + "step": 583 + }, + { + "epoch": 0.09142141515341265, + "grad_norm": 2.1146109104156494, + "learning_rate": 9.936461388074292e-05, + "loss": 2.6485, + "step": 584 + }, + { + "epoch": 0.09157795867251096, + "grad_norm": 1.8371776342391968, + "learning_rate": 9.935646790485501e-05, + "loss": 2.8329, + "step": 585 + 
}, + { + "epoch": 0.09173450219160927, + "grad_norm": 3.417442560195923, + "learning_rate": 9.93483219289671e-05, + "loss": 2.8566, + "step": 586 + }, + { + "epoch": 0.09189104571070758, + "grad_norm": 1.4840067625045776, + "learning_rate": 9.934017595307918e-05, + "loss": 2.6774, + "step": 587 + }, + { + "epoch": 0.09204758922980588, + "grad_norm": 1.7971484661102295, + "learning_rate": 9.933202997719128e-05, + "loss": 2.4615, + "step": 588 + }, + { + "epoch": 0.0922041327489042, + "grad_norm": 3.3690404891967773, + "learning_rate": 9.932388400130336e-05, + "loss": 2.5711, + "step": 589 + }, + { + "epoch": 0.0923606762680025, + "grad_norm": 1.2822954654693604, + "learning_rate": 9.931573802541545e-05, + "loss": 2.5678, + "step": 590 + }, + { + "epoch": 0.09251721978710081, + "grad_norm": 1.6855024099349976, + "learning_rate": 9.930759204952754e-05, + "loss": 2.469, + "step": 591 + }, + { + "epoch": 0.09267376330619913, + "grad_norm": 1.3822269439697266, + "learning_rate": 9.929944607363963e-05, + "loss": 2.5912, + "step": 592 + }, + { + "epoch": 0.09283030682529743, + "grad_norm": 2.458326816558838, + "learning_rate": 9.929130009775171e-05, + "loss": 2.3643, + "step": 593 + }, + { + "epoch": 0.09298685034439574, + "grad_norm": 1.5007473230361938, + "learning_rate": 9.928315412186381e-05, + "loss": 2.5019, + "step": 594 + }, + { + "epoch": 0.09314339386349405, + "grad_norm": 2.336089611053467, + "learning_rate": 9.927500814597589e-05, + "loss": 2.4641, + "step": 595 + }, + { + "epoch": 0.09329993738259236, + "grad_norm": 2.506908655166626, + "learning_rate": 9.926686217008798e-05, + "loss": 2.1144, + "step": 596 + }, + { + "epoch": 0.09345648090169068, + "grad_norm": 1.9266726970672607, + "learning_rate": 9.925871619420007e-05, + "loss": 2.3174, + "step": 597 + }, + { + "epoch": 0.09361302442078898, + "grad_norm": 2.1315064430236816, + "learning_rate": 9.925057021831216e-05, + "loss": 2.4065, + "step": 598 + }, + { + "epoch": 0.09376956793988729, + "grad_norm": 
2.014691114425659, + "learning_rate": 9.924242424242425e-05, + "loss": 2.141, + "step": 599 + }, + { + "epoch": 0.09392611145898559, + "grad_norm": 2.5499374866485596, + "learning_rate": 9.923427826653634e-05, + "loss": 2.0739, + "step": 600 + }, + { + "epoch": 0.09408265497808391, + "grad_norm": 7.4930572509765625, + "learning_rate": 9.922613229064842e-05, + "loss": 3.1505, + "step": 601 + }, + { + "epoch": 0.09423919849718222, + "grad_norm": 5.589356422424316, + "learning_rate": 9.921798631476052e-05, + "loss": 2.9966, + "step": 602 + }, + { + "epoch": 0.09439574201628052, + "grad_norm": 3.298902988433838, + "learning_rate": 9.92098403388726e-05, + "loss": 2.8898, + "step": 603 + }, + { + "epoch": 0.09455228553537884, + "grad_norm": 1.0123564004898071, + "learning_rate": 9.920169436298469e-05, + "loss": 2.8392, + "step": 604 + }, + { + "epoch": 0.09470882905447714, + "grad_norm": 1.3472812175750732, + "learning_rate": 9.919354838709678e-05, + "loss": 2.8208, + "step": 605 + }, + { + "epoch": 0.09486537257357545, + "grad_norm": 1.9637954235076904, + "learning_rate": 9.918540241120887e-05, + "loss": 2.8553, + "step": 606 + }, + { + "epoch": 0.09502191609267377, + "grad_norm": 2.111715316772461, + "learning_rate": 9.917725643532095e-05, + "loss": 2.8438, + "step": 607 + }, + { + "epoch": 0.09517845961177207, + "grad_norm": 0.9426678419113159, + "learning_rate": 9.916911045943305e-05, + "loss": 2.8284, + "step": 608 + }, + { + "epoch": 0.09533500313087039, + "grad_norm": 1.2772736549377441, + "learning_rate": 9.916096448354513e-05, + "loss": 2.8173, + "step": 609 + }, + { + "epoch": 0.09549154664996869, + "grad_norm": 1.7921041250228882, + "learning_rate": 9.915281850765722e-05, + "loss": 2.8259, + "step": 610 + }, + { + "epoch": 0.095648090169067, + "grad_norm": 1.7995425462722778, + "learning_rate": 9.914467253176931e-05, + "loss": 2.8171, + "step": 611 + }, + { + "epoch": 0.09580463368816532, + "grad_norm": 0.3737086355686188, + "learning_rate": 
9.91365265558814e-05, + "loss": 2.8019, + "step": 612 + }, + { + "epoch": 0.09596117720726362, + "grad_norm": 1.4430029392242432, + "learning_rate": 9.912838057999348e-05, + "loss": 2.8708, + "step": 613 + }, + { + "epoch": 0.09611772072636193, + "grad_norm": 0.6397246718406677, + "learning_rate": 9.912023460410558e-05, + "loss": 2.8561, + "step": 614 + }, + { + "epoch": 0.09627426424546023, + "grad_norm": 2.2208468914031982, + "learning_rate": 9.911208862821768e-05, + "loss": 2.8215, + "step": 615 + }, + { + "epoch": 0.09643080776455855, + "grad_norm": 0.5539305806159973, + "learning_rate": 9.910394265232975e-05, + "loss": 2.8165, + "step": 616 + }, + { + "epoch": 0.09658735128365685, + "grad_norm": 0.6151803135871887, + "learning_rate": 9.909579667644184e-05, + "loss": 2.8071, + "step": 617 + }, + { + "epoch": 0.09674389480275516, + "grad_norm": 1.689487099647522, + "learning_rate": 9.908765070055394e-05, + "loss": 2.7988, + "step": 618 + }, + { + "epoch": 0.09690043832185348, + "grad_norm": 1.2290040254592896, + "learning_rate": 9.907950472466601e-05, + "loss": 2.8069, + "step": 619 + }, + { + "epoch": 0.09705698184095178, + "grad_norm": 0.6253260374069214, + "learning_rate": 9.907135874877811e-05, + "loss": 2.7924, + "step": 620 + }, + { + "epoch": 0.0972135253600501, + "grad_norm": 1.0453952550888062, + "learning_rate": 9.90632127728902e-05, + "loss": 2.7365, + "step": 621 + }, + { + "epoch": 0.0973700688791484, + "grad_norm": 0.8569234609603882, + "learning_rate": 9.905506679700229e-05, + "loss": 2.8503, + "step": 622 + }, + { + "epoch": 0.09752661239824671, + "grad_norm": 0.8582557439804077, + "learning_rate": 9.904692082111437e-05, + "loss": 2.7664, + "step": 623 + }, + { + "epoch": 0.09768315591734503, + "grad_norm": 1.3812282085418701, + "learning_rate": 9.903877484522647e-05, + "loss": 2.7871, + "step": 624 + }, + { + "epoch": 0.09783969943644333, + "grad_norm": 0.9953295588493347, + "learning_rate": 9.903062886933855e-05, + "loss": 2.7557, + "step": 625 
+ }, + { + "epoch": 0.09799624295554164, + "grad_norm": 0.9126530289649963, + "learning_rate": 9.902248289345064e-05, + "loss": 2.7761, + "step": 626 + }, + { + "epoch": 0.09815278647463994, + "grad_norm": 0.7534878253936768, + "learning_rate": 9.901433691756273e-05, + "loss": 2.8153, + "step": 627 + }, + { + "epoch": 0.09830932999373826, + "grad_norm": 0.870887815952301, + "learning_rate": 9.900619094167482e-05, + "loss": 2.739, + "step": 628 + }, + { + "epoch": 0.09846587351283657, + "grad_norm": 1.2719275951385498, + "learning_rate": 9.89980449657869e-05, + "loss": 2.6743, + "step": 629 + }, + { + "epoch": 0.09862241703193488, + "grad_norm": 2.399301290512085, + "learning_rate": 9.8989898989899e-05, + "loss": 2.6396, + "step": 630 + }, + { + "epoch": 0.09877896055103319, + "grad_norm": 0.818570613861084, + "learning_rate": 9.898175301401108e-05, + "loss": 2.6682, + "step": 631 + }, + { + "epoch": 0.09893550407013149, + "grad_norm": 0.9305054545402527, + "learning_rate": 9.897360703812317e-05, + "loss": 2.69, + "step": 632 + }, + { + "epoch": 0.0990920475892298, + "grad_norm": 1.377770185470581, + "learning_rate": 9.896546106223526e-05, + "loss": 2.6868, + "step": 633 + }, + { + "epoch": 0.09924859110832812, + "grad_norm": 1.0729553699493408, + "learning_rate": 9.895731508634735e-05, + "loss": 2.5621, + "step": 634 + }, + { + "epoch": 0.09940513462742642, + "grad_norm": 1.517203450202942, + "learning_rate": 9.894916911045945e-05, + "loss": 2.7653, + "step": 635 + }, + { + "epoch": 0.09956167814652474, + "grad_norm": 2.129059314727783, + "learning_rate": 9.894102313457153e-05, + "loss": 2.6188, + "step": 636 + }, + { + "epoch": 0.09971822166562304, + "grad_norm": 1.3608496189117432, + "learning_rate": 9.893287715868361e-05, + "loss": 2.3855, + "step": 637 + }, + { + "epoch": 0.09987476518472135, + "grad_norm": 5.484130859375, + "learning_rate": 9.892473118279571e-05, + "loss": 2.6781, + "step": 638 + }, + { + "epoch": 0.10003130870381967, + "grad_norm": 
1.7033277750015259, + "learning_rate": 9.89165852069078e-05, + "loss": 2.5815, + "step": 639 + }, + { + "epoch": 0.10018785222291797, + "grad_norm": 2.1340200901031494, + "learning_rate": 9.890843923101988e-05, + "loss": 2.6716, + "step": 640 + }, + { + "epoch": 0.10034439574201628, + "grad_norm": 2.5156519412994385, + "learning_rate": 9.890029325513198e-05, + "loss": 2.5635, + "step": 641 + }, + { + "epoch": 0.10050093926111459, + "grad_norm": 2.666149854660034, + "learning_rate": 9.889214727924406e-05, + "loss": 2.4854, + "step": 642 + }, + { + "epoch": 0.1006574827802129, + "grad_norm": 1.4829959869384766, + "learning_rate": 9.888400130335614e-05, + "loss": 2.3857, + "step": 643 + }, + { + "epoch": 0.1008140262993112, + "grad_norm": 1.8378692865371704, + "learning_rate": 9.887585532746824e-05, + "loss": 2.5456, + "step": 644 + }, + { + "epoch": 0.10097056981840952, + "grad_norm": 3.3553576469421387, + "learning_rate": 9.886770935158032e-05, + "loss": 2.428, + "step": 645 + }, + { + "epoch": 0.10112711333750783, + "grad_norm": 3.028073787689209, + "learning_rate": 9.885956337569241e-05, + "loss": 2.3568, + "step": 646 + }, + { + "epoch": 0.10128365685660613, + "grad_norm": 1.9638338088989258, + "learning_rate": 9.88514173998045e-05, + "loss": 2.3458, + "step": 647 + }, + { + "epoch": 0.10144020037570445, + "grad_norm": 2.9545176029205322, + "learning_rate": 9.884327142391659e-05, + "loss": 2.0199, + "step": 648 + }, + { + "epoch": 0.10159674389480275, + "grad_norm": 1.472367525100708, + "learning_rate": 9.883512544802867e-05, + "loss": 2.1555, + "step": 649 + }, + { + "epoch": 0.10175328741390106, + "grad_norm": 2.6447904109954834, + "learning_rate": 9.882697947214077e-05, + "loss": 1.9666, + "step": 650 + }, + { + "epoch": 0.10190983093299938, + "grad_norm": 3.7425897121429443, + "learning_rate": 9.881883349625287e-05, + "loss": 2.8856, + "step": 651 + }, + { + "epoch": 0.10206637445209768, + "grad_norm": 2.5512330532073975, + "learning_rate": 
9.881068752036494e-05, + "loss": 2.8129, + "step": 652 + }, + { + "epoch": 0.102222917971196, + "grad_norm": 0.9332075119018555, + "learning_rate": 9.880254154447703e-05, + "loss": 2.8265, + "step": 653 + }, + { + "epoch": 0.1023794614902943, + "grad_norm": 1.0653643608093262, + "learning_rate": 9.879439556858913e-05, + "loss": 2.7925, + "step": 654 + }, + { + "epoch": 0.10253600500939261, + "grad_norm": 0.6242510676383972, + "learning_rate": 9.87862495927012e-05, + "loss": 2.7984, + "step": 655 + }, + { + "epoch": 0.10269254852849093, + "grad_norm": 4.300251483917236, + "learning_rate": 9.87781036168133e-05, + "loss": 2.9462, + "step": 656 + }, + { + "epoch": 0.10284909204758923, + "grad_norm": 2.9649693965911865, + "learning_rate": 9.87699576409254e-05, + "loss": 2.805, + "step": 657 + }, + { + "epoch": 0.10300563556668754, + "grad_norm": 3.7881345748901367, + "learning_rate": 9.876181166503748e-05, + "loss": 2.8824, + "step": 658 + }, + { + "epoch": 0.10316217908578584, + "grad_norm": 2.1568150520324707, + "learning_rate": 9.875366568914956e-05, + "loss": 2.7384, + "step": 659 + }, + { + "epoch": 0.10331872260488416, + "grad_norm": 0.6318357586860657, + "learning_rate": 9.874551971326166e-05, + "loss": 2.7484, + "step": 660 + }, + { + "epoch": 0.10347526612398247, + "grad_norm": 2.8990180492401123, + "learning_rate": 9.873737373737374e-05, + "loss": 2.8566, + "step": 661 + }, + { + "epoch": 0.10363180964308077, + "grad_norm": 0.9093145132064819, + "learning_rate": 9.872922776148583e-05, + "loss": 2.7695, + "step": 662 + }, + { + "epoch": 0.10378835316217909, + "grad_norm": 0.8827502131462097, + "learning_rate": 9.872108178559793e-05, + "loss": 2.7644, + "step": 663 + }, + { + "epoch": 0.10394489668127739, + "grad_norm": 1.601769208908081, + "learning_rate": 9.871293580971001e-05, + "loss": 2.7777, + "step": 664 + }, + { + "epoch": 0.1041014402003757, + "grad_norm": 1.3840761184692383, + "learning_rate": 9.87047898338221e-05, + "loss": 2.7662, + "step": 665 + }, 
+ { + "epoch": 0.10425798371947402, + "grad_norm": 1.469943642616272, + "learning_rate": 9.869664385793419e-05, + "loss": 2.7149, + "step": 666 + }, + { + "epoch": 0.10441452723857232, + "grad_norm": 4.227337837219238, + "learning_rate": 9.868849788204627e-05, + "loss": 3.0076, + "step": 667 + }, + { + "epoch": 0.10457107075767064, + "grad_norm": 1.1082732677459717, + "learning_rate": 9.868035190615836e-05, + "loss": 2.7248, + "step": 668 + }, + { + "epoch": 0.10472761427676894, + "grad_norm": 1.4060354232788086, + "learning_rate": 9.867220593027046e-05, + "loss": 2.686, + "step": 669 + }, + { + "epoch": 0.10488415779586725, + "grad_norm": 1.2748308181762695, + "learning_rate": 9.866405995438254e-05, + "loss": 2.6346, + "step": 670 + }, + { + "epoch": 0.10504070131496555, + "grad_norm": 0.8268353343009949, + "learning_rate": 9.865591397849462e-05, + "loss": 2.6386, + "step": 671 + }, + { + "epoch": 0.10519724483406387, + "grad_norm": 1.1318106651306152, + "learning_rate": 9.864776800260672e-05, + "loss": 2.6078, + "step": 672 + }, + { + "epoch": 0.10535378835316218, + "grad_norm": 0.9703989028930664, + "learning_rate": 9.86396220267188e-05, + "loss": 2.6714, + "step": 673 + }, + { + "epoch": 0.10551033187226048, + "grad_norm": 5.083658695220947, + "learning_rate": 9.86314760508309e-05, + "loss": 2.7395, + "step": 674 + }, + { + "epoch": 0.1056668753913588, + "grad_norm": 2.0768179893493652, + "learning_rate": 9.862333007494299e-05, + "loss": 2.4646, + "step": 675 + }, + { + "epoch": 0.1058234189104571, + "grad_norm": 1.299415111541748, + "learning_rate": 9.861518409905507e-05, + "loss": 2.6451, + "step": 676 + }, + { + "epoch": 0.10597996242955542, + "grad_norm": 0.7822179794311523, + "learning_rate": 9.860703812316717e-05, + "loss": 2.5565, + "step": 677 + }, + { + "epoch": 0.10613650594865373, + "grad_norm": 1.5704103708267212, + "learning_rate": 9.859889214727925e-05, + "loss": 2.4721, + "step": 678 + }, + { + "epoch": 0.10629304946775203, + "grad_norm": 
1.2918601036071777, + "learning_rate": 9.859074617139133e-05, + "loss": 2.5473, + "step": 679 + }, + { + "epoch": 0.10644959298685035, + "grad_norm": 1.4173246622085571, + "learning_rate": 9.858260019550343e-05, + "loss": 2.6939, + "step": 680 + }, + { + "epoch": 0.10660613650594865, + "grad_norm": 1.6399637460708618, + "learning_rate": 9.857445421961551e-05, + "loss": 2.81, + "step": 681 + }, + { + "epoch": 0.10676268002504696, + "grad_norm": 1.380361557006836, + "learning_rate": 9.85663082437276e-05, + "loss": 2.5377, + "step": 682 + }, + { + "epoch": 0.10691922354414528, + "grad_norm": 1.0578728914260864, + "learning_rate": 9.85581622678397e-05, + "loss": 2.44, + "step": 683 + }, + { + "epoch": 0.10707576706324358, + "grad_norm": 1.5509865283966064, + "learning_rate": 9.855001629195178e-05, + "loss": 2.4388, + "step": 684 + }, + { + "epoch": 0.1072323105823419, + "grad_norm": 8.801727294921875, + "learning_rate": 9.854187031606386e-05, + "loss": 2.866, + "step": 685 + }, + { + "epoch": 0.1073888541014402, + "grad_norm": 1.904396653175354, + "learning_rate": 9.853372434017596e-05, + "loss": 2.4297, + "step": 686 + }, + { + "epoch": 0.10754539762053851, + "grad_norm": 1.9379316568374634, + "learning_rate": 9.852557836428806e-05, + "loss": 2.4725, + "step": 687 + }, + { + "epoch": 0.10770194113963683, + "grad_norm": 1.441853404045105, + "learning_rate": 9.851743238840013e-05, + "loss": 2.6125, + "step": 688 + }, + { + "epoch": 0.10785848465873513, + "grad_norm": 2.102762460708618, + "learning_rate": 9.850928641251223e-05, + "loss": 2.6002, + "step": 689 + }, + { + "epoch": 0.10801502817783344, + "grad_norm": 3.122065305709839, + "learning_rate": 9.850114043662432e-05, + "loss": 2.5832, + "step": 690 + }, + { + "epoch": 0.10817157169693174, + "grad_norm": 2.9246673583984375, + "learning_rate": 9.849299446073639e-05, + "loss": 2.5198, + "step": 691 + }, + { + "epoch": 0.10832811521603006, + "grad_norm": 2.952025890350342, + "learning_rate": 9.848484848484849e-05, + 
"loss": 2.7023, + "step": 692 + }, + { + "epoch": 0.10848465873512837, + "grad_norm": 2.9609484672546387, + "learning_rate": 9.847670250896059e-05, + "loss": 2.4949, + "step": 693 + }, + { + "epoch": 0.10864120225422667, + "grad_norm": 1.5384304523468018, + "learning_rate": 9.846855653307267e-05, + "loss": 2.2039, + "step": 694 + }, + { + "epoch": 0.10879774577332499, + "grad_norm": 1.660617709159851, + "learning_rate": 9.846041055718475e-05, + "loss": 2.4463, + "step": 695 + }, + { + "epoch": 0.10895428929242329, + "grad_norm": 1.8039430379867554, + "learning_rate": 9.845226458129685e-05, + "loss": 1.9995, + "step": 696 + }, + { + "epoch": 0.1091108328115216, + "grad_norm": 1.94100022315979, + "learning_rate": 9.844411860540894e-05, + "loss": 2.2166, + "step": 697 + }, + { + "epoch": 0.1092673763306199, + "grad_norm": 1.4337977170944214, + "learning_rate": 9.843597262952102e-05, + "loss": 2.0883, + "step": 698 + }, + { + "epoch": 0.10942391984971822, + "grad_norm": 1.4229611158370972, + "learning_rate": 9.842782665363312e-05, + "loss": 1.967, + "step": 699 + }, + { + "epoch": 0.10958046336881654, + "grad_norm": 2.167865514755249, + "learning_rate": 9.84196806777452e-05, + "loss": 1.9773, + "step": 700 + }, + { + "epoch": 0.10973700688791484, + "grad_norm": 1.914004921913147, + "learning_rate": 9.841153470185728e-05, + "loss": 2.7147, + "step": 701 + }, + { + "epoch": 0.10989355040701315, + "grad_norm": 1.218900442123413, + "learning_rate": 9.840338872596938e-05, + "loss": 2.6607, + "step": 702 + }, + { + "epoch": 0.11005009392611145, + "grad_norm": 0.9041356444358826, + "learning_rate": 9.839524275008147e-05, + "loss": 2.5797, + "step": 703 + }, + { + "epoch": 0.11020663744520977, + "grad_norm": 0.8730372190475464, + "learning_rate": 9.838709677419355e-05, + "loss": 2.5905, + "step": 704 + }, + { + "epoch": 0.11036318096430808, + "grad_norm": 1.5772796869277954, + "learning_rate": 9.837895079830565e-05, + "loss": 2.5849, + "step": 705 + }, + { + "epoch": 
0.11051972448340638, + "grad_norm": 0.8971035480499268, + "learning_rate": 9.837080482241773e-05, + "loss": 2.5384, + "step": 706 + }, + { + "epoch": 0.1106762680025047, + "grad_norm": 0.720401406288147, + "learning_rate": 9.836265884652981e-05, + "loss": 2.4604, + "step": 707 + }, + { + "epoch": 0.110832811521603, + "grad_norm": 0.8826258778572083, + "learning_rate": 9.835451287064191e-05, + "loss": 2.4648, + "step": 708 + }, + { + "epoch": 0.11098935504070132, + "grad_norm": 0.9547269344329834, + "learning_rate": 9.8346366894754e-05, + "loss": 2.4657, + "step": 709 + }, + { + "epoch": 0.11114589855979963, + "grad_norm": 0.8945643901824951, + "learning_rate": 9.833822091886609e-05, + "loss": 2.4271, + "step": 710 + }, + { + "epoch": 0.11130244207889793, + "grad_norm": 2.3480100631713867, + "learning_rate": 9.833007494297818e-05, + "loss": 2.5882, + "step": 711 + }, + { + "epoch": 0.11145898559799625, + "grad_norm": 0.7994589805603027, + "learning_rate": 9.832192896709026e-05, + "loss": 2.408, + "step": 712 + }, + { + "epoch": 0.11161552911709455, + "grad_norm": 1.8750884532928467, + "learning_rate": 9.831378299120236e-05, + "loss": 2.5265, + "step": 713 + }, + { + "epoch": 0.11177207263619286, + "grad_norm": 0.8022245168685913, + "learning_rate": 9.830563701531444e-05, + "loss": 2.4954, + "step": 714 + }, + { + "epoch": 0.11192861615529118, + "grad_norm": 2.8064677715301514, + "learning_rate": 9.829749103942652e-05, + "loss": 2.3927, + "step": 715 + }, + { + "epoch": 0.11208515967438948, + "grad_norm": 1.0804543495178223, + "learning_rate": 9.828934506353862e-05, + "loss": 2.4538, + "step": 716 + }, + { + "epoch": 0.1122417031934878, + "grad_norm": 0.9353273510932922, + "learning_rate": 9.82811990876507e-05, + "loss": 2.4229, + "step": 717 + }, + { + "epoch": 0.1123982467125861, + "grad_norm": 1.5260183811187744, + "learning_rate": 9.827305311176279e-05, + "loss": 2.4419, + "step": 718 + }, + { + "epoch": 0.11255479023168441, + "grad_norm": 1.8410511016845703, + 
"learning_rate": 9.826490713587489e-05, + "loss": 2.4495, + "step": 719 + }, + { + "epoch": 0.11271133375078271, + "grad_norm": 1.2425888776779175, + "learning_rate": 9.825676115998697e-05, + "loss": 2.3215, + "step": 720 + }, + { + "epoch": 0.11286787726988103, + "grad_norm": 0.9190022349357605, + "learning_rate": 9.824861518409905e-05, + "loss": 2.3174, + "step": 721 + }, + { + "epoch": 0.11302442078897934, + "grad_norm": 1.095627784729004, + "learning_rate": 9.824046920821115e-05, + "loss": 2.3817, + "step": 722 + }, + { + "epoch": 0.11318096430807764, + "grad_norm": 0.9646661281585693, + "learning_rate": 9.823232323232325e-05, + "loss": 2.3846, + "step": 723 + }, + { + "epoch": 0.11333750782717596, + "grad_norm": 1.1609097719192505, + "learning_rate": 9.822417725643532e-05, + "loss": 2.268, + "step": 724 + }, + { + "epoch": 0.11349405134627426, + "grad_norm": 1.4689620733261108, + "learning_rate": 9.821603128054742e-05, + "loss": 2.3416, + "step": 725 + }, + { + "epoch": 0.11365059486537257, + "grad_norm": 1.2581948041915894, + "learning_rate": 9.820788530465951e-05, + "loss": 2.3486, + "step": 726 + }, + { + "epoch": 0.11380713838447089, + "grad_norm": 2.65815806388855, + "learning_rate": 9.819973932877158e-05, + "loss": 2.5941, + "step": 727 + }, + { + "epoch": 0.11396368190356919, + "grad_norm": 1.5818512439727783, + "learning_rate": 9.819159335288368e-05, + "loss": 2.2632, + "step": 728 + }, + { + "epoch": 0.1141202254226675, + "grad_norm": 0.9965378046035767, + "learning_rate": 9.818344737699578e-05, + "loss": 2.3148, + "step": 729 + }, + { + "epoch": 0.1142767689417658, + "grad_norm": 3.91255521774292, + "learning_rate": 9.817530140110785e-05, + "loss": 2.3793, + "step": 730 + }, + { + "epoch": 0.11443331246086412, + "grad_norm": 1.3783539533615112, + "learning_rate": 9.816715542521995e-05, + "loss": 2.2352, + "step": 731 + }, + { + "epoch": 0.11458985597996243, + "grad_norm": 1.755076289176941, + "learning_rate": 9.815900944933204e-05, + "loss": 2.3616, 
+ "step": 732 + }, + { + "epoch": 0.11474639949906074, + "grad_norm": 1.83636474609375, + "learning_rate": 9.815086347344413e-05, + "loss": 2.2542, + "step": 733 + }, + { + "epoch": 0.11490294301815905, + "grad_norm": 2.4447803497314453, + "learning_rate": 9.814271749755621e-05, + "loss": 2.1063, + "step": 734 + }, + { + "epoch": 0.11505948653725735, + "grad_norm": 1.6063470840454102, + "learning_rate": 9.813457152166831e-05, + "loss": 2.1863, + "step": 735 + }, + { + "epoch": 0.11521603005635567, + "grad_norm": 4.463958263397217, + "learning_rate": 9.812642554578039e-05, + "loss": 2.517, + "step": 736 + }, + { + "epoch": 0.11537257357545398, + "grad_norm": 2.6691408157348633, + "learning_rate": 9.811827956989248e-05, + "loss": 2.3913, + "step": 737 + }, + { + "epoch": 0.11552911709455228, + "grad_norm": 1.8220747709274292, + "learning_rate": 9.811013359400457e-05, + "loss": 2.4272, + "step": 738 + }, + { + "epoch": 0.1156856606136506, + "grad_norm": 2.357574462890625, + "learning_rate": 9.810198761811666e-05, + "loss": 2.3729, + "step": 739 + }, + { + "epoch": 0.1158422041327489, + "grad_norm": 1.777024507522583, + "learning_rate": 9.809384164222874e-05, + "loss": 2.2397, + "step": 740 + }, + { + "epoch": 0.11599874765184721, + "grad_norm": 2.1567578315734863, + "learning_rate": 9.808569566634084e-05, + "loss": 2.2517, + "step": 741 + }, + { + "epoch": 0.11615529117094553, + "grad_norm": 3.3562700748443604, + "learning_rate": 9.807754969045292e-05, + "loss": 2.2101, + "step": 742 + }, + { + "epoch": 0.11631183469004383, + "grad_norm": 1.9700833559036255, + "learning_rate": 9.8069403714565e-05, + "loss": 2.0386, + "step": 743 + }, + { + "epoch": 0.11646837820914215, + "grad_norm": 1.7324656248092651, + "learning_rate": 9.80612577386771e-05, + "loss": 2.015, + "step": 744 + }, + { + "epoch": 0.11662492172824045, + "grad_norm": 2.527266263961792, + "learning_rate": 9.805311176278919e-05, + "loss": 1.7999, + "step": 745 + }, + { + "epoch": 0.11678146524733876, + 
"grad_norm": 1.4505702257156372, + "learning_rate": 9.804496578690128e-05, + "loss": 2.1929, + "step": 746 + }, + { + "epoch": 0.11693800876643706, + "grad_norm": 2.9072721004486084, + "learning_rate": 9.803681981101337e-05, + "loss": 1.9325, + "step": 747 + }, + { + "epoch": 0.11709455228553538, + "grad_norm": 2.41085147857666, + "learning_rate": 9.802867383512545e-05, + "loss": 2.2686, + "step": 748 + }, + { + "epoch": 0.11725109580463369, + "grad_norm": 1.6397075653076172, + "learning_rate": 9.802052785923755e-05, + "loss": 1.8325, + "step": 749 + }, + { + "epoch": 0.117407639323732, + "grad_norm": 1.691079020500183, + "learning_rate": 9.801238188334963e-05, + "loss": 2.0385, + "step": 750 + }, + { + "epoch": 0.11756418284283031, + "grad_norm": 1.9352020025253296, + "learning_rate": 9.800423590746172e-05, + "loss": 2.3773, + "step": 751 + }, + { + "epoch": 0.11772072636192861, + "grad_norm": 1.0383591651916504, + "learning_rate": 9.799608993157381e-05, + "loss": 2.1942, + "step": 752 + }, + { + "epoch": 0.11787726988102692, + "grad_norm": 1.628342866897583, + "learning_rate": 9.79879439556859e-05, + "loss": 2.1788, + "step": 753 + }, + { + "epoch": 0.11803381340012524, + "grad_norm": 2.1213371753692627, + "learning_rate": 9.797979797979798e-05, + "loss": 2.1331, + "step": 754 + }, + { + "epoch": 0.11819035691922354, + "grad_norm": 1.4745807647705078, + "learning_rate": 9.797165200391008e-05, + "loss": 2.0511, + "step": 755 + }, + { + "epoch": 0.11834690043832186, + "grad_norm": 1.64266836643219, + "learning_rate": 9.796350602802216e-05, + "loss": 2.0944, + "step": 756 + }, + { + "epoch": 0.11850344395742016, + "grad_norm": 0.8786399960517883, + "learning_rate": 9.795536005213425e-05, + "loss": 2.0615, + "step": 757 + }, + { + "epoch": 0.11865998747651847, + "grad_norm": 0.9886221289634705, + "learning_rate": 9.794721407624634e-05, + "loss": 2.0041, + "step": 758 + }, + { + "epoch": 0.11881653099561679, + "grad_norm": 1.1628881692886353, + "learning_rate": 
9.793906810035843e-05, + "loss": 2.0647, + "step": 759 + }, + { + "epoch": 0.11897307451471509, + "grad_norm": 1.3403279781341553, + "learning_rate": 9.793092212447051e-05, + "loss": 2.0357, + "step": 760 + }, + { + "epoch": 0.1191296180338134, + "grad_norm": 1.4677627086639404, + "learning_rate": 9.792277614858261e-05, + "loss": 2.0715, + "step": 761 + }, + { + "epoch": 0.1192861615529117, + "grad_norm": 1.6665476560592651, + "learning_rate": 9.791463017269469e-05, + "loss": 2.0226, + "step": 762 + }, + { + "epoch": 0.11944270507201002, + "grad_norm": 1.3574062585830688, + "learning_rate": 9.790648419680678e-05, + "loss": 2.007, + "step": 763 + }, + { + "epoch": 0.11959924859110833, + "grad_norm": 0.9220423698425293, + "learning_rate": 9.789833822091887e-05, + "loss": 1.9159, + "step": 764 + }, + { + "epoch": 0.11975579211020664, + "grad_norm": 1.241769552230835, + "learning_rate": 9.789019224503096e-05, + "loss": 1.9044, + "step": 765 + }, + { + "epoch": 0.11991233562930495, + "grad_norm": 1.067286729812622, + "learning_rate": 9.788204626914304e-05, + "loss": 2.0154, + "step": 766 + }, + { + "epoch": 0.12006887914840325, + "grad_norm": 2.1560277938842773, + "learning_rate": 9.787390029325514e-05, + "loss": 2.0485, + "step": 767 + }, + { + "epoch": 0.12022542266750157, + "grad_norm": 1.0813205242156982, + "learning_rate": 9.786575431736722e-05, + "loss": 2.0176, + "step": 768 + }, + { + "epoch": 0.12038196618659988, + "grad_norm": 1.2919522523880005, + "learning_rate": 9.785760834147932e-05, + "loss": 2.0344, + "step": 769 + }, + { + "epoch": 0.12053850970569818, + "grad_norm": 1.2721227407455444, + "learning_rate": 9.78494623655914e-05, + "loss": 1.8828, + "step": 770 + }, + { + "epoch": 0.1206950532247965, + "grad_norm": 1.2588865756988525, + "learning_rate": 9.784131638970349e-05, + "loss": 1.9586, + "step": 771 + }, + { + "epoch": 0.1208515967438948, + "grad_norm": 1.8528761863708496, + "learning_rate": 9.783317041381558e-05, + "loss": 1.9322, + "step": 772 + 
}, + { + "epoch": 0.12100814026299311, + "grad_norm": 1.4660450220108032, + "learning_rate": 9.782502443792767e-05, + "loss": 1.9653, + "step": 773 + }, + { + "epoch": 0.12116468378209141, + "grad_norm": 1.1041374206542969, + "learning_rate": 9.781687846203975e-05, + "loss": 2.0107, + "step": 774 + }, + { + "epoch": 0.12132122730118973, + "grad_norm": 2.1899356842041016, + "learning_rate": 9.780873248615185e-05, + "loss": 2.1568, + "step": 775 + }, + { + "epoch": 0.12147777082028804, + "grad_norm": 1.7058265209197998, + "learning_rate": 9.780058651026393e-05, + "loss": 1.9599, + "step": 776 + }, + { + "epoch": 0.12163431433938635, + "grad_norm": 1.6339792013168335, + "learning_rate": 9.779244053437602e-05, + "loss": 2.1102, + "step": 777 + }, + { + "epoch": 0.12179085785848466, + "grad_norm": 1.4348654747009277, + "learning_rate": 9.778429455848811e-05, + "loss": 1.8609, + "step": 778 + }, + { + "epoch": 0.12194740137758296, + "grad_norm": 3.0359885692596436, + "learning_rate": 9.77761485826002e-05, + "loss": 2.1203, + "step": 779 + }, + { + "epoch": 0.12210394489668128, + "grad_norm": 3.2349183559417725, + "learning_rate": 9.776800260671228e-05, + "loss": 2.231, + "step": 780 + }, + { + "epoch": 0.12226048841577959, + "grad_norm": 2.184856414794922, + "learning_rate": 9.775985663082438e-05, + "loss": 2.0209, + "step": 781 + }, + { + "epoch": 0.12241703193487789, + "grad_norm": 2.3892173767089844, + "learning_rate": 9.775171065493646e-05, + "loss": 2.2725, + "step": 782 + }, + { + "epoch": 0.12257357545397621, + "grad_norm": 2.1004064083099365, + "learning_rate": 9.774356467904855e-05, + "loss": 1.9846, + "step": 783 + }, + { + "epoch": 0.12273011897307451, + "grad_norm": 2.271934747695923, + "learning_rate": 9.773541870316064e-05, + "loss": 2.2885, + "step": 784 + }, + { + "epoch": 0.12288666249217282, + "grad_norm": 3.3486995697021484, + "learning_rate": 9.772727272727274e-05, + "loss": 2.2583, + "step": 785 + }, + { + "epoch": 0.12304320601127114, + "grad_norm": 
2.358915328979492, + "learning_rate": 9.771912675138481e-05, + "loss": 2.2176, + "step": 786 + }, + { + "epoch": 0.12319974953036944, + "grad_norm": 2.1071901321411133, + "learning_rate": 9.771098077549691e-05, + "loss": 2.4141, + "step": 787 + }, + { + "epoch": 0.12335629304946776, + "grad_norm": 6.548426628112793, + "learning_rate": 9.7702834799609e-05, + "loss": 2.2001, + "step": 788 + }, + { + "epoch": 0.12351283656856606, + "grad_norm": 3.975735902786255, + "learning_rate": 9.769468882372107e-05, + "loss": 1.961, + "step": 789 + }, + { + "epoch": 0.12366938008766437, + "grad_norm": 5.876523971557617, + "learning_rate": 9.768654284783317e-05, + "loss": 2.1469, + "step": 790 + }, + { + "epoch": 0.12382592360676269, + "grad_norm": 2.4555540084838867, + "learning_rate": 9.767839687194527e-05, + "loss": 2.2203, + "step": 791 + }, + { + "epoch": 0.12398246712586099, + "grad_norm": 1.6745222806930542, + "learning_rate": 9.767025089605735e-05, + "loss": 1.801, + "step": 792 + }, + { + "epoch": 0.1241390106449593, + "grad_norm": 3.715613842010498, + "learning_rate": 9.766210492016944e-05, + "loss": 2.1912, + "step": 793 + }, + { + "epoch": 0.1242955541640576, + "grad_norm": 3.8292274475097656, + "learning_rate": 9.765395894428153e-05, + "loss": 1.8449, + "step": 794 + }, + { + "epoch": 0.12445209768315592, + "grad_norm": 3.09757137298584, + "learning_rate": 9.764581296839362e-05, + "loss": 2.2123, + "step": 795 + }, + { + "epoch": 0.12460864120225423, + "grad_norm": 3.661050319671631, + "learning_rate": 9.76376669925057e-05, + "loss": 1.7017, + "step": 796 + }, + { + "epoch": 0.12476518472135253, + "grad_norm": 2.855267286300659, + "learning_rate": 9.76295210166178e-05, + "loss": 1.5107, + "step": 797 + }, + { + "epoch": 0.12492172824045085, + "grad_norm": 2.329166889190674, + "learning_rate": 9.762137504072988e-05, + "loss": 1.4341, + "step": 798 + }, + { + "epoch": 0.12507827175954916, + "grad_norm": 4.806323051452637, + "learning_rate": 9.761322906484197e-05, + 
"loss": 1.8384, + "step": 799 + }, + { + "epoch": 0.12523481527864747, + "grad_norm": 2.7230560779571533, + "learning_rate": 9.760508308895406e-05, + "loss": 1.6785, + "step": 800 + }, + { + "epoch": 0.12539135879774577, + "grad_norm": 1.1485917568206787, + "learning_rate": 9.759693711306615e-05, + "loss": 1.8052, + "step": 801 + }, + { + "epoch": 0.1255479023168441, + "grad_norm": 1.1472573280334473, + "learning_rate": 9.758879113717823e-05, + "loss": 1.7016, + "step": 802 + }, + { + "epoch": 0.1257044458359424, + "grad_norm": 0.8442760705947876, + "learning_rate": 9.758064516129033e-05, + "loss": 1.6318, + "step": 803 + }, + { + "epoch": 0.1258609893550407, + "grad_norm": 0.8991712927818298, + "learning_rate": 9.757249918540241e-05, + "loss": 1.6497, + "step": 804 + }, + { + "epoch": 0.126017532874139, + "grad_norm": 0.9394484162330627, + "learning_rate": 9.756435320951451e-05, + "loss": 1.5788, + "step": 805 + }, + { + "epoch": 0.12617407639323733, + "grad_norm": 1.0831384658813477, + "learning_rate": 9.75562072336266e-05, + "loss": 1.6556, + "step": 806 + }, + { + "epoch": 0.12633061991233563, + "grad_norm": 1.013847827911377, + "learning_rate": 9.754806125773868e-05, + "loss": 1.5078, + "step": 807 + }, + { + "epoch": 0.12648716343143393, + "grad_norm": 1.8270509243011475, + "learning_rate": 9.753991528185077e-05, + "loss": 1.6284, + "step": 808 + }, + { + "epoch": 0.12664370695053226, + "grad_norm": 1.065319538116455, + "learning_rate": 9.753176930596286e-05, + "loss": 1.6251, + "step": 809 + }, + { + "epoch": 0.12680025046963056, + "grad_norm": 1.2120510339736938, + "learning_rate": 9.752362333007494e-05, + "loss": 1.556, + "step": 810 + }, + { + "epoch": 0.12695679398872886, + "grad_norm": 1.3012330532073975, + "learning_rate": 9.751547735418704e-05, + "loss": 1.5984, + "step": 811 + }, + { + "epoch": 0.12711333750782716, + "grad_norm": 0.9613692760467529, + "learning_rate": 9.750733137829912e-05, + "loss": 1.5889, + "step": 812 + }, + { + "epoch": 
0.1272698810269255, + "grad_norm": 1.4072967767715454, + "learning_rate": 9.74991854024112e-05, + "loss": 1.5751, + "step": 813 + }, + { + "epoch": 0.1274264245460238, + "grad_norm": 1.4158782958984375, + "learning_rate": 9.74910394265233e-05, + "loss": 1.5194, + "step": 814 + }, + { + "epoch": 0.1275829680651221, + "grad_norm": 2.283658266067505, + "learning_rate": 9.748289345063539e-05, + "loss": 1.7314, + "step": 815 + }, + { + "epoch": 0.12773951158422042, + "grad_norm": 1.21562659740448, + "learning_rate": 9.747474747474747e-05, + "loss": 1.4895, + "step": 816 + }, + { + "epoch": 0.12789605510331872, + "grad_norm": 1.6406673192977905, + "learning_rate": 9.746660149885957e-05, + "loss": 1.4649, + "step": 817 + }, + { + "epoch": 0.12805259862241702, + "grad_norm": 2.8717029094696045, + "learning_rate": 9.745845552297165e-05, + "loss": 1.5665, + "step": 818 + }, + { + "epoch": 0.12820914214151535, + "grad_norm": 1.9165825843811035, + "learning_rate": 9.745030954708374e-05, + "loss": 1.6461, + "step": 819 + }, + { + "epoch": 0.12836568566061365, + "grad_norm": 1.9414699077606201, + "learning_rate": 9.744216357119583e-05, + "loss": 1.7124, + "step": 820 + }, + { + "epoch": 0.12852222917971196, + "grad_norm": 1.8303266763687134, + "learning_rate": 9.743401759530793e-05, + "loss": 1.6519, + "step": 821 + }, + { + "epoch": 0.12867877269881026, + "grad_norm": 2.0649194717407227, + "learning_rate": 9.742587161942e-05, + "loss": 1.521, + "step": 822 + }, + { + "epoch": 0.12883531621790859, + "grad_norm": 2.496152877807617, + "learning_rate": 9.74177256435321e-05, + "loss": 1.8625, + "step": 823 + }, + { + "epoch": 0.1289918597370069, + "grad_norm": 3.4803409576416016, + "learning_rate": 9.74095796676442e-05, + "loss": 2.1247, + "step": 824 + }, + { + "epoch": 0.1291484032561052, + "grad_norm": 1.256300687789917, + "learning_rate": 9.740143369175627e-05, + "loss": 1.7271, + "step": 825 + }, + { + "epoch": 0.12930494677520352, + "grad_norm": 2.079848289489746, + 
"learning_rate": 9.739328771586836e-05, + "loss": 1.8038, + "step": 826 + }, + { + "epoch": 0.12946149029430182, + "grad_norm": 2.611163377761841, + "learning_rate": 9.738514173998046e-05, + "loss": 1.9586, + "step": 827 + }, + { + "epoch": 0.12961803381340012, + "grad_norm": 5.013198375701904, + "learning_rate": 9.737699576409254e-05, + "loss": 1.4994, + "step": 828 + }, + { + "epoch": 0.12977457733249845, + "grad_norm": 2.1290438175201416, + "learning_rate": 9.736884978820463e-05, + "loss": 1.7098, + "step": 829 + }, + { + "epoch": 0.12993112085159675, + "grad_norm": 2.9733328819274902, + "learning_rate": 9.736070381231673e-05, + "loss": 2.0742, + "step": 830 + }, + { + "epoch": 0.13008766437069505, + "grad_norm": 2.7970967292785645, + "learning_rate": 9.735255783642881e-05, + "loss": 1.5923, + "step": 831 + }, + { + "epoch": 0.13024420788979335, + "grad_norm": 2.336737871170044, + "learning_rate": 9.734441186054089e-05, + "loss": 1.5841, + "step": 832 + }, + { + "epoch": 0.13040075140889168, + "grad_norm": 4.911342620849609, + "learning_rate": 9.733626588465299e-05, + "loss": 1.8033, + "step": 833 + }, + { + "epoch": 0.13055729492798998, + "grad_norm": 2.673963785171509, + "learning_rate": 9.732811990876507e-05, + "loss": 2.0372, + "step": 834 + }, + { + "epoch": 0.13071383844708828, + "grad_norm": 5.9152092933654785, + "learning_rate": 9.731997393287716e-05, + "loss": 1.9883, + "step": 835 + }, + { + "epoch": 0.1308703819661866, + "grad_norm": 2.2653706073760986, + "learning_rate": 9.731182795698925e-05, + "loss": 1.7966, + "step": 836 + }, + { + "epoch": 0.1310269254852849, + "grad_norm": 2.3123934268951416, + "learning_rate": 9.730368198110134e-05, + "loss": 1.9512, + "step": 837 + }, + { + "epoch": 0.1311834690043832, + "grad_norm": 2.482802629470825, + "learning_rate": 9.729553600521342e-05, + "loss": 1.895, + "step": 838 + }, + { + "epoch": 0.13134001252348151, + "grad_norm": 3.410831928253174, + "learning_rate": 9.728739002932552e-05, + "loss": 2.3563, + 
"step": 839 + }, + { + "epoch": 0.13149655604257984, + "grad_norm": 4.016936779022217, + "learning_rate": 9.72792440534376e-05, + "loss": 2.1741, + "step": 840 + }, + { + "epoch": 0.13165309956167814, + "grad_norm": 4.186375617980957, + "learning_rate": 9.727109807754969e-05, + "loss": 1.9035, + "step": 841 + }, + { + "epoch": 0.13180964308077645, + "grad_norm": 3.2103140354156494, + "learning_rate": 9.726295210166178e-05, + "loss": 1.8483, + "step": 842 + }, + { + "epoch": 0.13196618659987477, + "grad_norm": 4.028421878814697, + "learning_rate": 9.725480612577387e-05, + "loss": 2.1755, + "step": 843 + }, + { + "epoch": 0.13212273011897308, + "grad_norm": 11.0090970993042, + "learning_rate": 9.724666014988597e-05, + "loss": 1.9555, + "step": 844 + }, + { + "epoch": 0.13227927363807138, + "grad_norm": 3.227513551712036, + "learning_rate": 9.723851417399805e-05, + "loss": 2.6147, + "step": 845 + }, + { + "epoch": 0.1324358171571697, + "grad_norm": 3.5128073692321777, + "learning_rate": 9.723036819811013e-05, + "loss": 1.7813, + "step": 846 + }, + { + "epoch": 0.132592360676268, + "grad_norm": 2.56373929977417, + "learning_rate": 9.722222222222223e-05, + "loss": 1.1899, + "step": 847 + }, + { + "epoch": 0.1327489041953663, + "grad_norm": 2.915318489074707, + "learning_rate": 9.721407624633431e-05, + "loss": 1.8814, + "step": 848 + }, + { + "epoch": 0.1329054477144646, + "grad_norm": 2.4615111351013184, + "learning_rate": 9.72059302704464e-05, + "loss": 1.5551, + "step": 849 + }, + { + "epoch": 0.13306199123356294, + "grad_norm": 2.4703595638275146, + "learning_rate": 9.71977842945585e-05, + "loss": 1.4312, + "step": 850 + }, + { + "epoch": 0.13321853475266124, + "grad_norm": 1.2108956575393677, + "learning_rate": 9.718963831867058e-05, + "loss": 1.6229, + "step": 851 + }, + { + "epoch": 0.13337507827175954, + "grad_norm": 1.0948721170425415, + "learning_rate": 9.718149234278266e-05, + "loss": 1.4131, + "step": 852 + }, + { + "epoch": 0.13353162179085787, + 
"grad_norm": 1.2100574970245361, + "learning_rate": 9.717334636689476e-05, + "loss": 1.4388, + "step": 853 + }, + { + "epoch": 0.13368816530995617, + "grad_norm": 1.2138878107070923, + "learning_rate": 9.716520039100684e-05, + "loss": 1.3708, + "step": 854 + }, + { + "epoch": 0.13384470882905447, + "grad_norm": 1.1821008920669556, + "learning_rate": 9.715705441511893e-05, + "loss": 1.2604, + "step": 855 + }, + { + "epoch": 0.1340012523481528, + "grad_norm": 0.8906177282333374, + "learning_rate": 9.714890843923102e-05, + "loss": 1.1743, + "step": 856 + }, + { + "epoch": 0.1341577958672511, + "grad_norm": 1.0792192220687866, + "learning_rate": 9.714076246334312e-05, + "loss": 1.2507, + "step": 857 + }, + { + "epoch": 0.1343143393863494, + "grad_norm": 1.1176416873931885, + "learning_rate": 9.713261648745519e-05, + "loss": 1.3231, + "step": 858 + }, + { + "epoch": 0.1344708829054477, + "grad_norm": 1.3883886337280273, + "learning_rate": 9.712447051156729e-05, + "loss": 1.1961, + "step": 859 + }, + { + "epoch": 0.13462742642454603, + "grad_norm": 1.2354166507720947, + "learning_rate": 9.711632453567939e-05, + "loss": 1.1809, + "step": 860 + }, + { + "epoch": 0.13478396994364433, + "grad_norm": 1.2885087728500366, + "learning_rate": 9.710817855979146e-05, + "loss": 1.2042, + "step": 861 + }, + { + "epoch": 0.13494051346274263, + "grad_norm": 1.0842562913894653, + "learning_rate": 9.710003258390355e-05, + "loss": 1.2687, + "step": 862 + }, + { + "epoch": 0.13509705698184096, + "grad_norm": 1.6284326314926147, + "learning_rate": 9.709188660801565e-05, + "loss": 1.3784, + "step": 863 + }, + { + "epoch": 0.13525360050093926, + "grad_norm": 2.246206760406494, + "learning_rate": 9.708374063212774e-05, + "loss": 1.4734, + "step": 864 + }, + { + "epoch": 0.13541014402003757, + "grad_norm": 1.6838433742523193, + "learning_rate": 9.707559465623982e-05, + "loss": 1.2997, + "step": 865 + }, + { + "epoch": 0.13556668753913587, + "grad_norm": 1.2978309392929077, + "learning_rate": 
9.706744868035192e-05, + "loss": 1.2675, + "step": 866 + }, + { + "epoch": 0.1357232310582342, + "grad_norm": 2.912743091583252, + "learning_rate": 9.7059302704464e-05, + "loss": 1.7471, + "step": 867 + }, + { + "epoch": 0.1358797745773325, + "grad_norm": 2.304304361343384, + "learning_rate": 9.705115672857608e-05, + "loss": 1.3287, + "step": 868 + }, + { + "epoch": 0.1360363180964308, + "grad_norm": 2.441387176513672, + "learning_rate": 9.704301075268818e-05, + "loss": 1.2437, + "step": 869 + }, + { + "epoch": 0.13619286161552913, + "grad_norm": 2.8442437648773193, + "learning_rate": 9.703486477680026e-05, + "loss": 1.3996, + "step": 870 + }, + { + "epoch": 0.13634940513462743, + "grad_norm": 2.409034490585327, + "learning_rate": 9.702671880091235e-05, + "loss": 1.4709, + "step": 871 + }, + { + "epoch": 0.13650594865372573, + "grad_norm": 1.7133383750915527, + "learning_rate": 9.701857282502445e-05, + "loss": 1.3477, + "step": 872 + }, + { + "epoch": 0.13666249217282406, + "grad_norm": 1.5932588577270508, + "learning_rate": 9.701042684913653e-05, + "loss": 1.3625, + "step": 873 + }, + { + "epoch": 0.13681903569192236, + "grad_norm": 2.2663605213165283, + "learning_rate": 9.700228087324861e-05, + "loss": 1.478, + "step": 874 + }, + { + "epoch": 0.13697557921102066, + "grad_norm": 3.7696282863616943, + "learning_rate": 9.699413489736071e-05, + "loss": 1.6356, + "step": 875 + }, + { + "epoch": 0.13713212273011896, + "grad_norm": 1.5953259468078613, + "learning_rate": 9.69859889214728e-05, + "loss": 1.6521, + "step": 876 + }, + { + "epoch": 0.1372886662492173, + "grad_norm": 2.330009937286377, + "learning_rate": 9.697784294558488e-05, + "loss": 1.809, + "step": 877 + }, + { + "epoch": 0.1374452097683156, + "grad_norm": 3.0731348991394043, + "learning_rate": 9.696969696969698e-05, + "loss": 1.8876, + "step": 878 + }, + { + "epoch": 0.1376017532874139, + "grad_norm": 3.2127604484558105, + "learning_rate": 9.696155099380906e-05, + "loss": 1.7138, + "step": 879 + }, + { + 
"epoch": 0.13775829680651222, + "grad_norm": 2.264784336090088, + "learning_rate": 9.695340501792116e-05, + "loss": 1.4171, + "step": 880 + }, + { + "epoch": 0.13791484032561052, + "grad_norm": 1.8154480457305908, + "learning_rate": 9.694525904203324e-05, + "loss": 1.4869, + "step": 881 + }, + { + "epoch": 0.13807138384470882, + "grad_norm": 1.9811089038848877, + "learning_rate": 9.693711306614532e-05, + "loss": 1.5198, + "step": 882 + }, + { + "epoch": 0.13822792736380715, + "grad_norm": 2.017547845840454, + "learning_rate": 9.692896709025742e-05, + "loss": 2.0287, + "step": 883 + }, + { + "epoch": 0.13838447088290545, + "grad_norm": 2.5256223678588867, + "learning_rate": 9.69208211143695e-05, + "loss": 1.4215, + "step": 884 + }, + { + "epoch": 0.13854101440200375, + "grad_norm": 4.478567600250244, + "learning_rate": 9.691267513848159e-05, + "loss": 1.6992, + "step": 885 + }, + { + "epoch": 0.13869755792110205, + "grad_norm": 2.7614383697509766, + "learning_rate": 9.690452916259369e-05, + "loss": 1.9592, + "step": 886 + }, + { + "epoch": 0.13885410144020038, + "grad_norm": 3.7140135765075684, + "learning_rate": 9.689638318670577e-05, + "loss": 1.7883, + "step": 887 + }, + { + "epoch": 0.13901064495929868, + "grad_norm": 3.1692569255828857, + "learning_rate": 9.688823721081785e-05, + "loss": 1.9728, + "step": 888 + }, + { + "epoch": 0.13916718847839699, + "grad_norm": 4.123902797698975, + "learning_rate": 9.688009123492995e-05, + "loss": 2.0512, + "step": 889 + }, + { + "epoch": 0.13932373199749531, + "grad_norm": 2.730212450027466, + "learning_rate": 9.687194525904203e-05, + "loss": 1.9099, + "step": 890 + }, + { + "epoch": 0.13948027551659362, + "grad_norm": 3.6577870845794678, + "learning_rate": 9.686379928315412e-05, + "loss": 2.1041, + "step": 891 + }, + { + "epoch": 0.13963681903569192, + "grad_norm": 3.9528298377990723, + "learning_rate": 9.685565330726622e-05, + "loss": 2.1264, + "step": 892 + }, + { + "epoch": 0.13979336255479022, + "grad_norm": 
4.5699381828308105, + "learning_rate": 9.684750733137831e-05, + "loss": 2.0923, + "step": 893 + }, + { + "epoch": 0.13994990607388855, + "grad_norm": 2.5513224601745605, + "learning_rate": 9.683936135549038e-05, + "loss": 1.9173, + "step": 894 + }, + { + "epoch": 0.14010644959298685, + "grad_norm": 2.802976369857788, + "learning_rate": 9.683121537960248e-05, + "loss": 1.9513, + "step": 895 + }, + { + "epoch": 0.14026299311208515, + "grad_norm": 3.266899824142456, + "learning_rate": 9.682306940371458e-05, + "loss": 1.7242, + "step": 896 + }, + { + "epoch": 0.14041953663118348, + "grad_norm": 3.5130019187927246, + "learning_rate": 9.681492342782665e-05, + "loss": 1.7153, + "step": 897 + }, + { + "epoch": 0.14057608015028178, + "grad_norm": 1.8894069194793701, + "learning_rate": 9.680677745193875e-05, + "loss": 1.1901, + "step": 898 + }, + { + "epoch": 0.14073262366938008, + "grad_norm": 3.328007698059082, + "learning_rate": 9.679863147605084e-05, + "loss": 1.2766, + "step": 899 + }, + { + "epoch": 0.1408891671884784, + "grad_norm": 4.045617580413818, + "learning_rate": 9.679048550016291e-05, + "loss": 1.6675, + "step": 900 + }, + { + "epoch": 0.1410457107075767, + "grad_norm": 1.5419467687606812, + "learning_rate": 9.678233952427501e-05, + "loss": 1.1873, + "step": 901 + }, + { + "epoch": 0.141202254226675, + "grad_norm": 0.9029827117919922, + "learning_rate": 9.677419354838711e-05, + "loss": 1.0275, + "step": 902 + }, + { + "epoch": 0.1413587977457733, + "grad_norm": 1.2270982265472412, + "learning_rate": 9.676604757249919e-05, + "loss": 1.0327, + "step": 903 + }, + { + "epoch": 0.14151534126487164, + "grad_norm": 1.253551959991455, + "learning_rate": 9.675790159661128e-05, + "loss": 0.9844, + "step": 904 + }, + { + "epoch": 0.14167188478396994, + "grad_norm": 1.4439759254455566, + "learning_rate": 9.674975562072337e-05, + "loss": 1.1854, + "step": 905 + }, + { + "epoch": 0.14182842830306824, + "grad_norm": 1.1444814205169678, + "learning_rate": 
9.674160964483546e-05, + "loss": 1.0963, + "step": 906 + }, + { + "epoch": 0.14198497182216657, + "grad_norm": 1.324904203414917, + "learning_rate": 9.673346366894754e-05, + "loss": 0.9935, + "step": 907 + }, + { + "epoch": 0.14214151534126487, + "grad_norm": 0.9667438864707947, + "learning_rate": 9.672531769305964e-05, + "loss": 1.0394, + "step": 908 + }, + { + "epoch": 0.14229805886036317, + "grad_norm": 0.9309449791908264, + "learning_rate": 9.671717171717172e-05, + "loss": 1.149, + "step": 909 + }, + { + "epoch": 0.1424546023794615, + "grad_norm": 2.1525955200195312, + "learning_rate": 9.67090257412838e-05, + "loss": 1.3175, + "step": 910 + }, + { + "epoch": 0.1426111458985598, + "grad_norm": 1.2317824363708496, + "learning_rate": 9.67008797653959e-05, + "loss": 0.9958, + "step": 911 + }, + { + "epoch": 0.1427676894176581, + "grad_norm": 1.196698784828186, + "learning_rate": 9.669273378950799e-05, + "loss": 0.9798, + "step": 912 + }, + { + "epoch": 0.1429242329367564, + "grad_norm": 2.7884347438812256, + "learning_rate": 9.668458781362007e-05, + "loss": 1.09, + "step": 913 + }, + { + "epoch": 0.14308077645585474, + "grad_norm": 1.498465895652771, + "learning_rate": 9.667644183773217e-05, + "loss": 0.9925, + "step": 914 + }, + { + "epoch": 0.14323731997495304, + "grad_norm": 1.1894043684005737, + "learning_rate": 9.666829586184425e-05, + "loss": 1.0914, + "step": 915 + }, + { + "epoch": 0.14339386349405134, + "grad_norm": 2.079848527908325, + "learning_rate": 9.666014988595635e-05, + "loss": 1.143, + "step": 916 + }, + { + "epoch": 0.14355040701314967, + "grad_norm": 9.592923164367676, + "learning_rate": 9.665200391006843e-05, + "loss": 1.8854, + "step": 917 + }, + { + "epoch": 0.14370695053224797, + "grad_norm": 1.4452065229415894, + "learning_rate": 9.664385793418052e-05, + "loss": 0.9981, + "step": 918 + }, + { + "epoch": 0.14386349405134627, + "grad_norm": 1.5402915477752686, + "learning_rate": 9.663571195829261e-05, + "loss": 1.0601, + "step": 919 + }, + { 
+ "epoch": 0.14402003757044457, + "grad_norm": 2.67574143409729, + "learning_rate": 9.66275659824047e-05, + "loss": 1.2592, + "step": 920 + }, + { + "epoch": 0.1441765810895429, + "grad_norm": 4.331721305847168, + "learning_rate": 9.661942000651678e-05, + "loss": 1.0477, + "step": 921 + }, + { + "epoch": 0.1443331246086412, + "grad_norm": 3.067800760269165, + "learning_rate": 9.661127403062888e-05, + "loss": 1.1932, + "step": 922 + }, + { + "epoch": 0.1444896681277395, + "grad_norm": 2.098222494125366, + "learning_rate": 9.660312805474096e-05, + "loss": 1.3302, + "step": 923 + }, + { + "epoch": 0.14464621164683783, + "grad_norm": 3.9632644653320312, + "learning_rate": 9.659498207885304e-05, + "loss": 1.5346, + "step": 924 + }, + { + "epoch": 0.14480275516593613, + "grad_norm": 1.9439150094985962, + "learning_rate": 9.658683610296514e-05, + "loss": 1.1084, + "step": 925 + }, + { + "epoch": 0.14495929868503443, + "grad_norm": 3.3947010040283203, + "learning_rate": 9.657869012707723e-05, + "loss": 1.7395, + "step": 926 + }, + { + "epoch": 0.14511584220413276, + "grad_norm": 3.146083116531372, + "learning_rate": 9.657054415118931e-05, + "loss": 1.455, + "step": 927 + }, + { + "epoch": 0.14527238572323106, + "grad_norm": 2.5860531330108643, + "learning_rate": 9.656239817530141e-05, + "loss": 1.9448, + "step": 928 + }, + { + "epoch": 0.14542892924232936, + "grad_norm": 3.560807466506958, + "learning_rate": 9.655425219941349e-05, + "loss": 1.6274, + "step": 929 + }, + { + "epoch": 0.14558547276142766, + "grad_norm": 2.5606064796447754, + "learning_rate": 9.654610622352557e-05, + "loss": 1.4887, + "step": 930 + }, + { + "epoch": 0.145742016280526, + "grad_norm": 3.4679317474365234, + "learning_rate": 9.653796024763767e-05, + "loss": 1.3891, + "step": 931 + }, + { + "epoch": 0.1458985597996243, + "grad_norm": 3.548588991165161, + "learning_rate": 9.652981427174977e-05, + "loss": 1.8366, + "step": 932 + }, + { + "epoch": 0.1460551033187226, + "grad_norm": 2.095719337463379, 
+ "learning_rate": 9.652166829586184e-05, + "loss": 1.0361, + "step": 933 + }, + { + "epoch": 0.14621164683782092, + "grad_norm": 3.9341657161712646, + "learning_rate": 9.651352231997394e-05, + "loss": 1.8457, + "step": 934 + }, + { + "epoch": 0.14636819035691923, + "grad_norm": 3.3655426502227783, + "learning_rate": 9.650537634408603e-05, + "loss": 2.1477, + "step": 935 + }, + { + "epoch": 0.14652473387601753, + "grad_norm": 5.000679969787598, + "learning_rate": 9.64972303681981e-05, + "loss": 1.71, + "step": 936 + }, + { + "epoch": 0.14668127739511586, + "grad_norm": 2.997213363647461, + "learning_rate": 9.64890843923102e-05, + "loss": 1.4258, + "step": 937 + }, + { + "epoch": 0.14683782091421416, + "grad_norm": 2.5168704986572266, + "learning_rate": 9.64809384164223e-05, + "loss": 2.0093, + "step": 938 + }, + { + "epoch": 0.14699436443331246, + "grad_norm": 6.429799556732178, + "learning_rate": 9.647279244053438e-05, + "loss": 1.8864, + "step": 939 + }, + { + "epoch": 0.14715090795241076, + "grad_norm": 4.181130409240723, + "learning_rate": 9.646464646464647e-05, + "loss": 1.335, + "step": 940 + }, + { + "epoch": 0.1473074514715091, + "grad_norm": 3.4889976978302, + "learning_rate": 9.645650048875856e-05, + "loss": 1.5606, + "step": 941 + }, + { + "epoch": 0.1474639949906074, + "grad_norm": 2.818127155303955, + "learning_rate": 9.644835451287065e-05, + "loss": 1.8012, + "step": 942 + }, + { + "epoch": 0.1476205385097057, + "grad_norm": 2.248926877975464, + "learning_rate": 9.644020853698273e-05, + "loss": 1.5818, + "step": 943 + }, + { + "epoch": 0.14777708202880402, + "grad_norm": 1.9063016176223755, + "learning_rate": 9.643206256109483e-05, + "loss": 1.446, + "step": 944 + }, + { + "epoch": 0.14793362554790232, + "grad_norm": 6.114660263061523, + "learning_rate": 9.642391658520691e-05, + "loss": 1.5579, + "step": 945 + }, + { + "epoch": 0.14809016906700062, + "grad_norm": 4.836997032165527, + "learning_rate": 9.6415770609319e-05, + "loss": 1.6265, + "step": 
946 + }, + { + "epoch": 0.14824671258609892, + "grad_norm": 2.4638617038726807, + "learning_rate": 9.640762463343109e-05, + "loss": 1.2437, + "step": 947 + }, + { + "epoch": 0.14840325610519725, + "grad_norm": 2.7769691944122314, + "learning_rate": 9.639947865754318e-05, + "loss": 1.6382, + "step": 948 + }, + { + "epoch": 0.14855979962429555, + "grad_norm": 5.634922027587891, + "learning_rate": 9.639133268165526e-05, + "loss": 1.721, + "step": 949 + }, + { + "epoch": 0.14871634314339385, + "grad_norm": 2.53835391998291, + "learning_rate": 9.638318670576736e-05, + "loss": 1.9418, + "step": 950 + }, + { + "epoch": 0.14887288666249218, + "grad_norm": 1.7419637441635132, + "learning_rate": 9.637504072987944e-05, + "loss": 1.157, + "step": 951 + }, + { + "epoch": 0.14902943018159048, + "grad_norm": 1.716998815536499, + "learning_rate": 9.636689475399154e-05, + "loss": 1.0593, + "step": 952 + }, + { + "epoch": 0.14918597370068878, + "grad_norm": 0.9781287312507629, + "learning_rate": 9.635874877810362e-05, + "loss": 1.2596, + "step": 953 + }, + { + "epoch": 0.1493425172197871, + "grad_norm": 1.4375218152999878, + "learning_rate": 9.63506028022157e-05, + "loss": 0.8402, + "step": 954 + }, + { + "epoch": 0.14949906073888541, + "grad_norm": 1.502808690071106, + "learning_rate": 9.63424568263278e-05, + "loss": 1.0193, + "step": 955 + }, + { + "epoch": 0.14965560425798372, + "grad_norm": 1.3457111120224, + "learning_rate": 9.633431085043989e-05, + "loss": 1.0432, + "step": 956 + }, + { + "epoch": 0.14981214777708202, + "grad_norm": 1.2280945777893066, + "learning_rate": 9.632616487455197e-05, + "loss": 1.0198, + "step": 957 + }, + { + "epoch": 0.14996869129618035, + "grad_norm": 1.7529019117355347, + "learning_rate": 9.631801889866407e-05, + "loss": 0.9628, + "step": 958 + }, + { + "epoch": 0.15012523481527865, + "grad_norm": 0.9682857990264893, + "learning_rate": 9.630987292277615e-05, + "loss": 0.8454, + "step": 959 + }, + { + "epoch": 0.15028177833437695, + "grad_norm": 
1.5114736557006836, + "learning_rate": 9.630172694688824e-05, + "loss": 0.9278, + "step": 960 + }, + { + "epoch": 0.15043832185347528, + "grad_norm": 1.836646556854248, + "learning_rate": 9.629358097100033e-05, + "loss": 1.0577, + "step": 961 + }, + { + "epoch": 0.15059486537257358, + "grad_norm": 3.6072099208831787, + "learning_rate": 9.628543499511242e-05, + "loss": 0.9274, + "step": 962 + }, + { + "epoch": 0.15075140889167188, + "grad_norm": 1.7352055311203003, + "learning_rate": 9.62772890192245e-05, + "loss": 1.3099, + "step": 963 + }, + { + "epoch": 0.1509079524107702, + "grad_norm": 1.4671733379364014, + "learning_rate": 9.62691430433366e-05, + "loss": 1.0105, + "step": 964 + }, + { + "epoch": 0.1510644959298685, + "grad_norm": 1.163083553314209, + "learning_rate": 9.626099706744868e-05, + "loss": 0.7628, + "step": 965 + }, + { + "epoch": 0.1512210394489668, + "grad_norm": 1.4567064046859741, + "learning_rate": 9.625285109156077e-05, + "loss": 0.9613, + "step": 966 + }, + { + "epoch": 0.1513775829680651, + "grad_norm": 2.1245105266571045, + "learning_rate": 9.624470511567286e-05, + "loss": 1.1335, + "step": 967 + }, + { + "epoch": 0.15153412648716344, + "grad_norm": 2.2995636463165283, + "learning_rate": 9.623655913978496e-05, + "loss": 1.1421, + "step": 968 + }, + { + "epoch": 0.15169067000626174, + "grad_norm": 2.1278412342071533, + "learning_rate": 9.622841316389703e-05, + "loss": 1.1533, + "step": 969 + }, + { + "epoch": 0.15184721352536004, + "grad_norm": 1.6886886358261108, + "learning_rate": 9.622026718800913e-05, + "loss": 0.9149, + "step": 970 + }, + { + "epoch": 0.15200375704445837, + "grad_norm": 2.3391544818878174, + "learning_rate": 9.621212121212123e-05, + "loss": 1.3781, + "step": 971 + }, + { + "epoch": 0.15216030056355667, + "grad_norm": 2.479750633239746, + "learning_rate": 9.62039752362333e-05, + "loss": 1.1042, + "step": 972 + }, + { + "epoch": 0.15231684408265497, + "grad_norm": 1.7604422569274902, + "learning_rate": 
9.619582926034539e-05, + "loss": 1.0587, + "step": 973 + }, + { + "epoch": 0.15247338760175327, + "grad_norm": 3.235112190246582, + "learning_rate": 9.618768328445749e-05, + "loss": 1.7288, + "step": 974 + }, + { + "epoch": 0.1526299311208516, + "grad_norm": 6.202860355377197, + "learning_rate": 9.617953730856957e-05, + "loss": 2.0684, + "step": 975 + }, + { + "epoch": 0.1527864746399499, + "grad_norm": 1.7347639799118042, + "learning_rate": 9.617139133268166e-05, + "loss": 1.2137, + "step": 976 + }, + { + "epoch": 0.1529430181590482, + "grad_norm": 2.2677195072174072, + "learning_rate": 9.616324535679375e-05, + "loss": 1.3833, + "step": 977 + }, + { + "epoch": 0.15309956167814653, + "grad_norm": 3.256716251373291, + "learning_rate": 9.615509938090584e-05, + "loss": 1.2535, + "step": 978 + }, + { + "epoch": 0.15325610519724484, + "grad_norm": 2.4920737743377686, + "learning_rate": 9.614695340501792e-05, + "loss": 1.8053, + "step": 979 + }, + { + "epoch": 0.15341264871634314, + "grad_norm": 2.5001542568206787, + "learning_rate": 9.613880742913002e-05, + "loss": 1.6277, + "step": 980 + }, + { + "epoch": 0.15356919223544147, + "grad_norm": 1.7961387634277344, + "learning_rate": 9.61306614532421e-05, + "loss": 1.3905, + "step": 981 + }, + { + "epoch": 0.15372573575453977, + "grad_norm": 1.8796049356460571, + "learning_rate": 9.612251547735419e-05, + "loss": 1.5488, + "step": 982 + }, + { + "epoch": 0.15388227927363807, + "grad_norm": 2.191243886947632, + "learning_rate": 9.611436950146628e-05, + "loss": 1.4801, + "step": 983 + }, + { + "epoch": 0.15403882279273637, + "grad_norm": 2.861710786819458, + "learning_rate": 9.610622352557837e-05, + "loss": 1.6707, + "step": 984 + }, + { + "epoch": 0.1541953663118347, + "grad_norm": 4.244983196258545, + "learning_rate": 9.609807754969045e-05, + "loss": 1.6002, + "step": 985 + }, + { + "epoch": 0.154351909830933, + "grad_norm": 2.349391222000122, + "learning_rate": 9.608993157380255e-05, + "loss": 1.45, + "step": 986 + }, + { + 
"epoch": 0.1545084533500313, + "grad_norm": 2.8642773628234863, + "learning_rate": 9.608178559791463e-05, + "loss": 1.9901, + "step": 987 + }, + { + "epoch": 0.15466499686912963, + "grad_norm": 5.328644752502441, + "learning_rate": 9.607363962202672e-05, + "loss": 1.8629, + "step": 988 + }, + { + "epoch": 0.15482154038822793, + "grad_norm": 3.0458178520202637, + "learning_rate": 9.606549364613881e-05, + "loss": 1.6897, + "step": 989 + }, + { + "epoch": 0.15497808390732623, + "grad_norm": 3.3398406505584717, + "learning_rate": 9.60573476702509e-05, + "loss": 2.2003, + "step": 990 + }, + { + "epoch": 0.15513462742642456, + "grad_norm": 3.425037384033203, + "learning_rate": 9.6049201694363e-05, + "loss": 1.4385, + "step": 991 + }, + { + "epoch": 0.15529117094552286, + "grad_norm": 2.5883686542510986, + "learning_rate": 9.604105571847508e-05, + "loss": 1.4078, + "step": 992 + }, + { + "epoch": 0.15544771446462116, + "grad_norm": 6.162259101867676, + "learning_rate": 9.603290974258716e-05, + "loss": 1.9789, + "step": 993 + }, + { + "epoch": 0.15560425798371946, + "grad_norm": 4.884918689727783, + "learning_rate": 9.602476376669926e-05, + "loss": 1.6092, + "step": 994 + }, + { + "epoch": 0.1557608015028178, + "grad_norm": 2.848358392715454, + "learning_rate": 9.601661779081134e-05, + "loss": 1.7265, + "step": 995 + }, + { + "epoch": 0.1559173450219161, + "grad_norm": 2.8840444087982178, + "learning_rate": 9.600847181492343e-05, + "loss": 0.9626, + "step": 996 + }, + { + "epoch": 0.1560738885410144, + "grad_norm": 3.4887866973876953, + "learning_rate": 9.600032583903552e-05, + "loss": 1.3895, + "step": 997 + }, + { + "epoch": 0.15623043206011272, + "grad_norm": 4.872214317321777, + "learning_rate": 9.599217986314761e-05, + "loss": 1.2521, + "step": 998 + }, + { + "epoch": 0.15638697557921102, + "grad_norm": 3.438655138015747, + "learning_rate": 9.598403388725969e-05, + "loss": 1.4059, + "step": 999 + }, + { + "epoch": 0.15654351909830932, + "grad_norm": 
2.7114155292510986, + "learning_rate": 9.597588791137179e-05, + "loss": 1.0732, + "step": 1000 + }, + { + "epoch": 0.15654351909830932, + "eval_loss": 1.1350525617599487, + "eval_runtime": 203.9115, + "eval_samples_per_second": 60.727, + "eval_steps_per_second": 3.796, + "eval_wer": 0.6738423613915039, + "step": 1000 + }, + { + "epoch": 0.15670006261740763, + "grad_norm": 0.9531450867652893, + "learning_rate": 9.596774193548387e-05, + "loss": 0.8152, + "step": 1001 + }, + { + "epoch": 0.15685660613650595, + "grad_norm": 0.7769560217857361, + "learning_rate": 9.595959595959596e-05, + "loss": 0.7985, + "step": 1002 + }, + { + "epoch": 0.15701314965560426, + "grad_norm": 0.7457296848297119, + "learning_rate": 9.595144998370805e-05, + "loss": 0.6932, + "step": 1003 + }, + { + "epoch": 0.15716969317470256, + "grad_norm": 0.9561570286750793, + "learning_rate": 9.594330400782015e-05, + "loss": 0.7467, + "step": 1004 + }, + { + "epoch": 0.15732623669380089, + "grad_norm": 1.0754642486572266, + "learning_rate": 9.593515803193222e-05, + "loss": 0.8135, + "step": 1005 + }, + { + "epoch": 0.1574827802128992, + "grad_norm": 0.851302444934845, + "learning_rate": 9.592701205604432e-05, + "loss": 0.7929, + "step": 1006 + }, + { + "epoch": 0.1576393237319975, + "grad_norm": 0.9841870069503784, + "learning_rate": 9.591886608015642e-05, + "loss": 0.7995, + "step": 1007 + }, + { + "epoch": 0.15779586725109582, + "grad_norm": 1.9775196313858032, + "learning_rate": 9.591072010426849e-05, + "loss": 1.1823, + "step": 1008 + }, + { + "epoch": 0.15795241077019412, + "grad_norm": 0.9140826463699341, + "learning_rate": 9.590257412838058e-05, + "loss": 0.7889, + "step": 1009 + }, + { + "epoch": 0.15810895428929242, + "grad_norm": 1.2752490043640137, + "learning_rate": 9.589442815249268e-05, + "loss": 0.773, + "step": 1010 + }, + { + "epoch": 0.15826549780839072, + "grad_norm": 1.0028424263000488, + "learning_rate": 9.588628217660476e-05, + "loss": 0.8773, + "step": 1011 + }, + { + "epoch": 
0.15842204132748905, + "grad_norm": 1.181913137435913, + "learning_rate": 9.587813620071685e-05, + "loss": 1.0807, + "step": 1012 + }, + { + "epoch": 0.15857858484658735, + "grad_norm": 2.0587635040283203, + "learning_rate": 9.586999022482895e-05, + "loss": 1.2404, + "step": 1013 + }, + { + "epoch": 0.15873512836568565, + "grad_norm": 1.2923892736434937, + "learning_rate": 9.586184424894103e-05, + "loss": 0.8642, + "step": 1014 + }, + { + "epoch": 0.15889167188478398, + "grad_norm": 1.1257104873657227, + "learning_rate": 9.585369827305311e-05, + "loss": 0.9369, + "step": 1015 + }, + { + "epoch": 0.15904821540388228, + "grad_norm": 2.424154043197632, + "learning_rate": 9.584555229716521e-05, + "loss": 0.9101, + "step": 1016 + }, + { + "epoch": 0.15920475892298058, + "grad_norm": 1.894492506980896, + "learning_rate": 9.58374063212773e-05, + "loss": 0.9079, + "step": 1017 + }, + { + "epoch": 0.1593613024420789, + "grad_norm": 2.948173761367798, + "learning_rate": 9.582926034538938e-05, + "loss": 0.9259, + "step": 1018 + }, + { + "epoch": 0.1595178459611772, + "grad_norm": 1.9940412044525146, + "learning_rate": 9.582111436950148e-05, + "loss": 1.2066, + "step": 1019 + }, + { + "epoch": 0.1596743894802755, + "grad_norm": 2.5504751205444336, + "learning_rate": 9.581296839361356e-05, + "loss": 1.2941, + "step": 1020 + }, + { + "epoch": 0.15983093299937381, + "grad_norm": 1.862389326095581, + "learning_rate": 9.580482241772564e-05, + "loss": 0.8632, + "step": 1021 + }, + { + "epoch": 0.15998747651847214, + "grad_norm": 2.0161893367767334, + "learning_rate": 9.579667644183774e-05, + "loss": 0.9776, + "step": 1022 + }, + { + "epoch": 0.16014402003757044, + "grad_norm": 2.298166513442993, + "learning_rate": 9.578853046594982e-05, + "loss": 1.237, + "step": 1023 + }, + { + "epoch": 0.16030056355666875, + "grad_norm": 2.2490007877349854, + "learning_rate": 9.578038449006191e-05, + "loss": 0.8558, + "step": 1024 + }, + { + "epoch": 0.16045710707576707, + "grad_norm": 
3.4412598609924316, + "learning_rate": 9.5772238514174e-05, + "loss": 1.2123, + "step": 1025 + }, + { + "epoch": 0.16061365059486538, + "grad_norm": 2.985023021697998, + "learning_rate": 9.576409253828609e-05, + "loss": 1.6248, + "step": 1026 + }, + { + "epoch": 0.16077019411396368, + "grad_norm": 3.460242509841919, + "learning_rate": 9.575594656239819e-05, + "loss": 1.2789, + "step": 1027 + }, + { + "epoch": 0.16092673763306198, + "grad_norm": 2.397108554840088, + "learning_rate": 9.574780058651027e-05, + "loss": 1.4555, + "step": 1028 + }, + { + "epoch": 0.1610832811521603, + "grad_norm": 4.131384372711182, + "learning_rate": 9.573965461062235e-05, + "loss": 1.5975, + "step": 1029 + }, + { + "epoch": 0.1612398246712586, + "grad_norm": 1.7228927612304688, + "learning_rate": 9.573150863473445e-05, + "loss": 0.6524, + "step": 1030 + }, + { + "epoch": 0.1613963681903569, + "grad_norm": 4.286611080169678, + "learning_rate": 9.572336265884653e-05, + "loss": 1.5826, + "step": 1031 + }, + { + "epoch": 0.16155291170945524, + "grad_norm": 5.048844337463379, + "learning_rate": 9.571521668295862e-05, + "loss": 2.1253, + "step": 1032 + }, + { + "epoch": 0.16170945522855354, + "grad_norm": 1.9035298824310303, + "learning_rate": 9.570707070707072e-05, + "loss": 0.9895, + "step": 1033 + }, + { + "epoch": 0.16186599874765184, + "grad_norm": 2.3778278827667236, + "learning_rate": 9.56989247311828e-05, + "loss": 1.5051, + "step": 1034 + }, + { + "epoch": 0.16202254226675017, + "grad_norm": 4.093505382537842, + "learning_rate": 9.569077875529488e-05, + "loss": 1.2292, + "step": 1035 + }, + { + "epoch": 0.16217908578584847, + "grad_norm": 3.6302754878997803, + "learning_rate": 9.568263277940698e-05, + "loss": 1.6192, + "step": 1036 + }, + { + "epoch": 0.16233562930494677, + "grad_norm": 6.307554244995117, + "learning_rate": 9.567448680351906e-05, + "loss": 1.4843, + "step": 1037 + }, + { + "epoch": 0.16249217282404507, + "grad_norm": 6.163361072540283, + "learning_rate": 
9.566634082763115e-05, + "loss": 2.2958, + "step": 1038 + }, + { + "epoch": 0.1626487163431434, + "grad_norm": 3.393357753753662, + "learning_rate": 9.565819485174325e-05, + "loss": 0.8356, + "step": 1039 + }, + { + "epoch": 0.1628052598622417, + "grad_norm": 3.039391279220581, + "learning_rate": 9.565004887585534e-05, + "loss": 1.565, + "step": 1040 + }, + { + "epoch": 0.16296180338134, + "grad_norm": 4.3762617111206055, + "learning_rate": 9.564190289996741e-05, + "loss": 1.6783, + "step": 1041 + }, + { + "epoch": 0.16311834690043833, + "grad_norm": 5.242745399475098, + "learning_rate": 9.563375692407951e-05, + "loss": 1.2235, + "step": 1042 + }, + { + "epoch": 0.16327489041953663, + "grad_norm": 6.761044979095459, + "learning_rate": 9.562561094819161e-05, + "loss": 1.7883, + "step": 1043 + }, + { + "epoch": 0.16343143393863493, + "grad_norm": 10.31752872467041, + "learning_rate": 9.561746497230368e-05, + "loss": 2.1496, + "step": 1044 + }, + { + "epoch": 0.16358797745773326, + "grad_norm": 4.462409019470215, + "learning_rate": 9.560931899641577e-05, + "loss": 2.0931, + "step": 1045 + }, + { + "epoch": 0.16374452097683156, + "grad_norm": 8.65395450592041, + "learning_rate": 9.560117302052787e-05, + "loss": 1.4773, + "step": 1046 + }, + { + "epoch": 0.16390106449592987, + "grad_norm": 4.261375904083252, + "learning_rate": 9.559302704463994e-05, + "loss": 1.2668, + "step": 1047 + }, + { + "epoch": 0.16405760801502817, + "grad_norm": 3.7332468032836914, + "learning_rate": 9.558488106875204e-05, + "loss": 0.7341, + "step": 1048 + }, + { + "epoch": 0.1642141515341265, + "grad_norm": 3.115171432495117, + "learning_rate": 9.557673509286414e-05, + "loss": 0.8578, + "step": 1049 + }, + { + "epoch": 0.1643706950532248, + "grad_norm": 5.172712326049805, + "learning_rate": 9.556858911697622e-05, + "loss": 0.9173, + "step": 1050 + }, + { + "epoch": 0.1645272385723231, + "grad_norm": 0.9873601198196411, + "learning_rate": 9.55604431410883e-05, + "loss": 0.7649, + "step": 1051 + 
}, + { + "epoch": 0.16468378209142143, + "grad_norm": 0.7696318626403809, + "learning_rate": 9.55522971652004e-05, + "loss": 0.7348, + "step": 1052 + }, + { + "epoch": 0.16484032561051973, + "grad_norm": 0.8334832787513733, + "learning_rate": 9.554415118931249e-05, + "loss": 0.7011, + "step": 1053 + }, + { + "epoch": 0.16499686912961803, + "grad_norm": 1.0102719068527222, + "learning_rate": 9.553600521342457e-05, + "loss": 0.8028, + "step": 1054 + }, + { + "epoch": 0.16515341264871633, + "grad_norm": 1.0260852575302124, + "learning_rate": 9.552785923753667e-05, + "loss": 0.7267, + "step": 1055 + }, + { + "epoch": 0.16530995616781466, + "grad_norm": 1.4459941387176514, + "learning_rate": 9.551971326164875e-05, + "loss": 0.8086, + "step": 1056 + }, + { + "epoch": 0.16546649968691296, + "grad_norm": 2.7410800457000732, + "learning_rate": 9.551156728576083e-05, + "loss": 0.6564, + "step": 1057 + }, + { + "epoch": 0.16562304320601126, + "grad_norm": 1.3714137077331543, + "learning_rate": 9.550342130987293e-05, + "loss": 0.7952, + "step": 1058 + }, + { + "epoch": 0.1657795867251096, + "grad_norm": 1.0472644567489624, + "learning_rate": 9.549527533398502e-05, + "loss": 0.6113, + "step": 1059 + }, + { + "epoch": 0.1659361302442079, + "grad_norm": 0.9431442022323608, + "learning_rate": 9.54871293580971e-05, + "loss": 0.6519, + "step": 1060 + }, + { + "epoch": 0.1660926737633062, + "grad_norm": 0.8547930121421814, + "learning_rate": 9.54789833822092e-05, + "loss": 0.6962, + "step": 1061 + }, + { + "epoch": 0.16624921728240452, + "grad_norm": 1.1977424621582031, + "learning_rate": 9.547083740632128e-05, + "loss": 0.8072, + "step": 1062 + }, + { + "epoch": 0.16640576080150282, + "grad_norm": 2.788224458694458, + "learning_rate": 9.546269143043338e-05, + "loss": 0.8047, + "step": 1063 + }, + { + "epoch": 0.16656230432060112, + "grad_norm": 1.5802689790725708, + "learning_rate": 9.545454545454546e-05, + "loss": 0.7138, + "step": 1064 + }, + { + "epoch": 0.16671884783969942, + 
"grad_norm": 1.5701861381530762, + "learning_rate": 9.544639947865754e-05, + "loss": 1.033, + "step": 1065 + }, + { + "epoch": 0.16687539135879775, + "grad_norm": 2.039886713027954, + "learning_rate": 9.543825350276964e-05, + "loss": 1.0685, + "step": 1066 + }, + { + "epoch": 0.16703193487789605, + "grad_norm": 1.9437602758407593, + "learning_rate": 9.543010752688173e-05, + "loss": 0.9144, + "step": 1067 + }, + { + "epoch": 0.16718847839699436, + "grad_norm": 1.4733003377914429, + "learning_rate": 9.542196155099381e-05, + "loss": 0.9155, + "step": 1068 + }, + { + "epoch": 0.16734502191609268, + "grad_norm": 1.8339866399765015, + "learning_rate": 9.541381557510591e-05, + "loss": 1.1199, + "step": 1069 + }, + { + "epoch": 0.16750156543519099, + "grad_norm": 1.528225064277649, + "learning_rate": 9.540566959921799e-05, + "loss": 0.8835, + "step": 1070 + }, + { + "epoch": 0.1676581089542893, + "grad_norm": 2.2835094928741455, + "learning_rate": 9.539752362333007e-05, + "loss": 0.9335, + "step": 1071 + }, + { + "epoch": 0.16781465247338762, + "grad_norm": 3.221619129180908, + "learning_rate": 9.538937764744217e-05, + "loss": 1.2275, + "step": 1072 + }, + { + "epoch": 0.16797119599248592, + "grad_norm": 1.4798215627670288, + "learning_rate": 9.538123167155426e-05, + "loss": 0.9484, + "step": 1073 + }, + { + "epoch": 0.16812773951158422, + "grad_norm": 2.3079402446746826, + "learning_rate": 9.537308569566634e-05, + "loss": 0.9527, + "step": 1074 + }, + { + "epoch": 0.16828428303068252, + "grad_norm": 1.4711717367172241, + "learning_rate": 9.536493971977844e-05, + "loss": 1.0739, + "step": 1075 + }, + { + "epoch": 0.16844082654978085, + "grad_norm": 3.0114810466766357, + "learning_rate": 9.535679374389052e-05, + "loss": 1.0311, + "step": 1076 + }, + { + "epoch": 0.16859737006887915, + "grad_norm": 2.13387393951416, + "learning_rate": 9.53486477680026e-05, + "loss": 1.4825, + "step": 1077 + }, + { + "epoch": 0.16875391358797745, + "grad_norm": 3.288351058959961, + 
"learning_rate": 9.53405017921147e-05, + "loss": 1.1395, + "step": 1078 + }, + { + "epoch": 0.16891045710707578, + "grad_norm": 3.699436902999878, + "learning_rate": 9.53323558162268e-05, + "loss": 0.9779, + "step": 1079 + }, + { + "epoch": 0.16906700062617408, + "grad_norm": 3.306152105331421, + "learning_rate": 9.532420984033887e-05, + "loss": 1.0538, + "step": 1080 + }, + { + "epoch": 0.16922354414527238, + "grad_norm": 5.133028030395508, + "learning_rate": 9.531606386445097e-05, + "loss": 1.7582, + "step": 1081 + }, + { + "epoch": 0.16938008766437068, + "grad_norm": 4.835809707641602, + "learning_rate": 9.530791788856306e-05, + "loss": 1.6484, + "step": 1082 + }, + { + "epoch": 0.169536631183469, + "grad_norm": 3.0064570903778076, + "learning_rate": 9.529977191267513e-05, + "loss": 1.3993, + "step": 1083 + }, + { + "epoch": 0.1696931747025673, + "grad_norm": 5.005606174468994, + "learning_rate": 9.529162593678723e-05, + "loss": 1.7463, + "step": 1084 + }, + { + "epoch": 0.1698497182216656, + "grad_norm": 3.083587408065796, + "learning_rate": 9.528347996089933e-05, + "loss": 1.3685, + "step": 1085 + }, + { + "epoch": 0.17000626174076394, + "grad_norm": 3.3160738945007324, + "learning_rate": 9.527533398501141e-05, + "loss": 1.6868, + "step": 1086 + }, + { + "epoch": 0.17016280525986224, + "grad_norm": 3.981855869293213, + "learning_rate": 9.52671880091235e-05, + "loss": 1.409, + "step": 1087 + }, + { + "epoch": 0.17031934877896054, + "grad_norm": 2.8466546535491943, + "learning_rate": 9.525904203323559e-05, + "loss": 1.4107, + "step": 1088 + }, + { + "epoch": 0.17047589229805887, + "grad_norm": 4.239485263824463, + "learning_rate": 9.525089605734768e-05, + "loss": 1.4547, + "step": 1089 + }, + { + "epoch": 0.17063243581715717, + "grad_norm": 2.1982407569885254, + "learning_rate": 9.524275008145976e-05, + "loss": 1.3726, + "step": 1090 + }, + { + "epoch": 0.17078897933625548, + "grad_norm": 1.9008493423461914, + "learning_rate": 9.523460410557186e-05, + "loss": 
1.1435, + "step": 1091 + }, + { + "epoch": 0.17094552285535378, + "grad_norm": 3.173610210418701, + "learning_rate": 9.522645812968394e-05, + "loss": 2.0373, + "step": 1092 + }, + { + "epoch": 0.1711020663744521, + "grad_norm": 3.5338315963745117, + "learning_rate": 9.521831215379603e-05, + "loss": 2.1819, + "step": 1093 + }, + { + "epoch": 0.1712586098935504, + "grad_norm": 4.940959930419922, + "learning_rate": 9.521016617790812e-05, + "loss": 1.9818, + "step": 1094 + }, + { + "epoch": 0.1714151534126487, + "grad_norm": 3.308788537979126, + "learning_rate": 9.52020202020202e-05, + "loss": 1.4178, + "step": 1095 + }, + { + "epoch": 0.17157169693174704, + "grad_norm": 7.22727108001709, + "learning_rate": 9.519387422613229e-05, + "loss": 1.6679, + "step": 1096 + }, + { + "epoch": 0.17172824045084534, + "grad_norm": 3.178661823272705, + "learning_rate": 9.518572825024439e-05, + "loss": 0.8437, + "step": 1097 + }, + { + "epoch": 0.17188478396994364, + "grad_norm": 4.312224388122559, + "learning_rate": 9.517758227435647e-05, + "loss": 1.0463, + "step": 1098 + }, + { + "epoch": 0.17204132748904197, + "grad_norm": 2.982117176055908, + "learning_rate": 9.516943629846857e-05, + "loss": 0.732, + "step": 1099 + }, + { + "epoch": 0.17219787100814027, + "grad_norm": 2.6110002994537354, + "learning_rate": 9.516129032258065e-05, + "loss": 1.2017, + "step": 1100 + }, + { + "epoch": 0.17235441452723857, + "grad_norm": 0.9847604036331177, + "learning_rate": 9.515314434669274e-05, + "loss": 0.7104, + "step": 1101 + }, + { + "epoch": 0.17251095804633687, + "grad_norm": 0.9324378967285156, + "learning_rate": 9.514499837080483e-05, + "loss": 0.7428, + "step": 1102 + }, + { + "epoch": 0.1726675015654352, + "grad_norm": 0.9221226572990417, + "learning_rate": 9.513685239491692e-05, + "loss": 0.5929, + "step": 1103 + }, + { + "epoch": 0.1728240450845335, + "grad_norm": 0.7576819062232971, + "learning_rate": 9.5128706419029e-05, + "loss": 0.6831, + "step": 1104 + }, + { + "epoch": 
0.1729805886036318, + "grad_norm": 0.8950808048248291, + "learning_rate": 9.51205604431411e-05, + "loss": 0.6684, + "step": 1105 + }, + { + "epoch": 0.17313713212273013, + "grad_norm": 0.7278364896774292, + "learning_rate": 9.511241446725318e-05, + "loss": 0.6413, + "step": 1106 + }, + { + "epoch": 0.17329367564182843, + "grad_norm": 1.2092113494873047, + "learning_rate": 9.510426849136527e-05, + "loss": 0.766, + "step": 1107 + }, + { + "epoch": 0.17345021916092673, + "grad_norm": 1.8978537321090698, + "learning_rate": 9.509612251547736e-05, + "loss": 0.7266, + "step": 1108 + }, + { + "epoch": 0.17360676268002503, + "grad_norm": 1.7443413734436035, + "learning_rate": 9.508797653958945e-05, + "loss": 0.762, + "step": 1109 + }, + { + "epoch": 0.17376330619912336, + "grad_norm": 1.6560287475585938, + "learning_rate": 9.507983056370153e-05, + "loss": 0.6109, + "step": 1110 + }, + { + "epoch": 0.17391984971822166, + "grad_norm": 1.552686333656311, + "learning_rate": 9.507168458781363e-05, + "loss": 0.8509, + "step": 1111 + }, + { + "epoch": 0.17407639323731997, + "grad_norm": 2.7378814220428467, + "learning_rate": 9.506353861192571e-05, + "loss": 0.7704, + "step": 1112 + }, + { + "epoch": 0.1742329367564183, + "grad_norm": 2.1378254890441895, + "learning_rate": 9.50553926360378e-05, + "loss": 0.9347, + "step": 1113 + }, + { + "epoch": 0.1743894802755166, + "grad_norm": 1.6242401599884033, + "learning_rate": 9.504724666014989e-05, + "loss": 0.7694, + "step": 1114 + }, + { + "epoch": 0.1745460237946149, + "grad_norm": 2.905447483062744, + "learning_rate": 9.503910068426199e-05, + "loss": 1.0583, + "step": 1115 + }, + { + "epoch": 0.17470256731371323, + "grad_norm": 3.930457353591919, + "learning_rate": 9.503095470837406e-05, + "loss": 1.3108, + "step": 1116 + }, + { + "epoch": 0.17485911083281153, + "grad_norm": 1.1807172298431396, + "learning_rate": 9.502280873248616e-05, + "loss": 0.914, + "step": 1117 + }, + { + "epoch": 0.17501565435190983, + "grad_norm": 
1.655260443687439, + "learning_rate": 9.501466275659825e-05, + "loss": 0.6275, + "step": 1118 + }, + { + "epoch": 0.17517219787100813, + "grad_norm": 2.1202175617218018, + "learning_rate": 9.500651678071032e-05, + "loss": 1.0737, + "step": 1119 + }, + { + "epoch": 0.17532874139010646, + "grad_norm": 4.123349666595459, + "learning_rate": 9.499837080482242e-05, + "loss": 1.0678, + "step": 1120 + }, + { + "epoch": 0.17548528490920476, + "grad_norm": 8.729193687438965, + "learning_rate": 9.499022482893452e-05, + "loss": 1.2164, + "step": 1121 + }, + { + "epoch": 0.17564182842830306, + "grad_norm": 1.9060620069503784, + "learning_rate": 9.49820788530466e-05, + "loss": 0.8777, + "step": 1122 + }, + { + "epoch": 0.1757983719474014, + "grad_norm": 4.939797401428223, + "learning_rate": 9.497393287715869e-05, + "loss": 0.9758, + "step": 1123 + }, + { + "epoch": 0.1759549154664997, + "grad_norm": 3.8546531200408936, + "learning_rate": 9.496578690127078e-05, + "loss": 1.3298, + "step": 1124 + }, + { + "epoch": 0.176111458985598, + "grad_norm": 2.9741199016571045, + "learning_rate": 9.495764092538287e-05, + "loss": 1.4133, + "step": 1125 + }, + { + "epoch": 0.1762680025046963, + "grad_norm": 1.7873649597167969, + "learning_rate": 9.494949494949495e-05, + "loss": 1.0942, + "step": 1126 + }, + { + "epoch": 0.17642454602379462, + "grad_norm": 2.3002583980560303, + "learning_rate": 9.494134897360705e-05, + "loss": 1.1082, + "step": 1127 + }, + { + "epoch": 0.17658108954289292, + "grad_norm": 1.7904318571090698, + "learning_rate": 9.493320299771913e-05, + "loss": 0.9294, + "step": 1128 + }, + { + "epoch": 0.17673763306199122, + "grad_norm": 2.6225645542144775, + "learning_rate": 9.492505702183122e-05, + "loss": 1.4153, + "step": 1129 + }, + { + "epoch": 0.17689417658108955, + "grad_norm": 3.000235080718994, + "learning_rate": 9.491691104594331e-05, + "loss": 0.9942, + "step": 1130 + }, + { + "epoch": 0.17705072010018785, + "grad_norm": 2.583517074584961, + "learning_rate": 
9.49087650700554e-05, + "loss": 1.1972, + "step": 1131 + }, + { + "epoch": 0.17720726361928615, + "grad_norm": 3.3811450004577637, + "learning_rate": 9.490061909416748e-05, + "loss": 1.2139, + "step": 1132 + }, + { + "epoch": 0.17736380713838448, + "grad_norm": 4.761298656463623, + "learning_rate": 9.489247311827958e-05, + "loss": 1.0591, + "step": 1133 + }, + { + "epoch": 0.17752035065748278, + "grad_norm": 5.012853622436523, + "learning_rate": 9.488432714239166e-05, + "loss": 1.7178, + "step": 1134 + }, + { + "epoch": 0.17767689417658108, + "grad_norm": 3.476820945739746, + "learning_rate": 9.487618116650375e-05, + "loss": 1.3925, + "step": 1135 + }, + { + "epoch": 0.17783343769567939, + "grad_norm": 7.287431716918945, + "learning_rate": 9.486803519061584e-05, + "loss": 1.3303, + "step": 1136 + }, + { + "epoch": 0.17798998121477771, + "grad_norm": 3.3680341243743896, + "learning_rate": 9.485988921472793e-05, + "loss": 1.6347, + "step": 1137 + }, + { + "epoch": 0.17814652473387602, + "grad_norm": 2.5378506183624268, + "learning_rate": 9.485174323884002e-05, + "loss": 1.4957, + "step": 1138 + }, + { + "epoch": 0.17830306825297432, + "grad_norm": 2.944915294647217, + "learning_rate": 9.484359726295211e-05, + "loss": 1.2187, + "step": 1139 + }, + { + "epoch": 0.17845961177207265, + "grad_norm": 3.5701425075531006, + "learning_rate": 9.483545128706419e-05, + "loss": 1.5289, + "step": 1140 + }, + { + "epoch": 0.17861615529117095, + "grad_norm": 5.002267837524414, + "learning_rate": 9.482730531117629e-05, + "loss": 2.1122, + "step": 1141 + }, + { + "epoch": 0.17877269881026925, + "grad_norm": 3.0300426483154297, + "learning_rate": 9.481915933528837e-05, + "loss": 1.4081, + "step": 1142 + }, + { + "epoch": 0.17892924232936758, + "grad_norm": 2.2293832302093506, + "learning_rate": 9.481101335940046e-05, + "loss": 1.5246, + "step": 1143 + }, + { + "epoch": 0.17908578584846588, + "grad_norm": 4.065706729888916, + "learning_rate": 9.480286738351255e-05, + "loss": 1.9908, + 
"step": 1144 + }, + { + "epoch": 0.17924232936756418, + "grad_norm": 3.873182773590088, + "learning_rate": 9.479472140762464e-05, + "loss": 1.337, + "step": 1145 + }, + { + "epoch": 0.17939887288666248, + "grad_norm": 2.3088061809539795, + "learning_rate": 9.478657543173672e-05, + "loss": 0.9853, + "step": 1146 + }, + { + "epoch": 0.1795554164057608, + "grad_norm": 5.1622748374938965, + "learning_rate": 9.477842945584882e-05, + "loss": 1.1065, + "step": 1147 + }, + { + "epoch": 0.1797119599248591, + "grad_norm": 1.9889600276947021, + "learning_rate": 9.47702834799609e-05, + "loss": 0.9886, + "step": 1148 + }, + { + "epoch": 0.1798685034439574, + "grad_norm": 3.3510069847106934, + "learning_rate": 9.476213750407299e-05, + "loss": 1.7016, + "step": 1149 + }, + { + "epoch": 0.18002504696305574, + "grad_norm": 1.5941027402877808, + "learning_rate": 9.475399152818508e-05, + "loss": 1.5244, + "step": 1150 + }, + { + "epoch": 0.18018159048215404, + "grad_norm": 1.4589072465896606, + "learning_rate": 9.474584555229718e-05, + "loss": 0.7268, + "step": 1151 + }, + { + "epoch": 0.18033813400125234, + "grad_norm": 1.0607377290725708, + "learning_rate": 9.473769957640925e-05, + "loss": 0.6319, + "step": 1152 + }, + { + "epoch": 0.18049467752035064, + "grad_norm": 0.7452928423881531, + "learning_rate": 9.472955360052135e-05, + "loss": 0.5693, + "step": 1153 + }, + { + "epoch": 0.18065122103944897, + "grad_norm": 1.099410891532898, + "learning_rate": 9.472140762463345e-05, + "loss": 0.5741, + "step": 1154 + }, + { + "epoch": 0.18080776455854727, + "grad_norm": 1.2153986692428589, + "learning_rate": 9.471326164874552e-05, + "loss": 0.858, + "step": 1155 + }, + { + "epoch": 0.18096430807764557, + "grad_norm": 1.1005873680114746, + "learning_rate": 9.470511567285761e-05, + "loss": 0.6238, + "step": 1156 + }, + { + "epoch": 0.1811208515967439, + "grad_norm": 0.9822128415107727, + "learning_rate": 9.469696969696971e-05, + "loss": 0.6234, + "step": 1157 + }, + { + "epoch": 
0.1812773951158422, + "grad_norm": 1.122702956199646, + "learning_rate": 9.46888237210818e-05, + "loss": 0.644, + "step": 1158 + }, + { + "epoch": 0.1814339386349405, + "grad_norm": 1.3628664016723633, + "learning_rate": 9.468067774519388e-05, + "loss": 0.7375, + "step": 1159 + }, + { + "epoch": 0.18159048215403883, + "grad_norm": 1.7432889938354492, + "learning_rate": 9.467253176930598e-05, + "loss": 0.7631, + "step": 1160 + }, + { + "epoch": 0.18174702567313714, + "grad_norm": 1.5762158632278442, + "learning_rate": 9.466438579341806e-05, + "loss": 0.6272, + "step": 1161 + }, + { + "epoch": 0.18190356919223544, + "grad_norm": 1.7399516105651855, + "learning_rate": 9.465623981753014e-05, + "loss": 0.8289, + "step": 1162 + }, + { + "epoch": 0.18206011271133374, + "grad_norm": 1.8797175884246826, + "learning_rate": 9.464809384164224e-05, + "loss": 0.751, + "step": 1163 + }, + { + "epoch": 0.18221665623043207, + "grad_norm": 1.1856813430786133, + "learning_rate": 9.463994786575432e-05, + "loss": 0.7461, + "step": 1164 + }, + { + "epoch": 0.18237319974953037, + "grad_norm": 2.323901653289795, + "learning_rate": 9.463180188986641e-05, + "loss": 1.1684, + "step": 1165 + }, + { + "epoch": 0.18252974326862867, + "grad_norm": 1.6843814849853516, + "learning_rate": 9.46236559139785e-05, + "loss": 0.6844, + "step": 1166 + }, + { + "epoch": 0.182686286787727, + "grad_norm": 1.5257223844528198, + "learning_rate": 9.461550993809059e-05, + "loss": 0.8988, + "step": 1167 + }, + { + "epoch": 0.1828428303068253, + "grad_norm": 1.4310457706451416, + "learning_rate": 9.460736396220267e-05, + "loss": 0.9618, + "step": 1168 + }, + { + "epoch": 0.1829993738259236, + "grad_norm": 3.0463268756866455, + "learning_rate": 9.459921798631477e-05, + "loss": 1.4946, + "step": 1169 + }, + { + "epoch": 0.18315591734502193, + "grad_norm": 1.3979969024658203, + "learning_rate": 9.459107201042685e-05, + "loss": 1.0757, + "step": 1170 + }, + { + "epoch": 0.18331246086412023, + "grad_norm": 
1.396156907081604, + "learning_rate": 9.458292603453894e-05, + "loss": 0.9352, + "step": 1171 + }, + { + "epoch": 0.18346900438321853, + "grad_norm": 1.8601503372192383, + "learning_rate": 9.457478005865103e-05, + "loss": 1.2166, + "step": 1172 + }, + { + "epoch": 0.18362554790231683, + "grad_norm": 1.7677547931671143, + "learning_rate": 9.456663408276312e-05, + "loss": 0.9782, + "step": 1173 + }, + { + "epoch": 0.18378209142141516, + "grad_norm": 2.256854295730591, + "learning_rate": 9.455848810687522e-05, + "loss": 0.8846, + "step": 1174 + }, + { + "epoch": 0.18393863494051346, + "grad_norm": 1.7589635848999023, + "learning_rate": 9.45503421309873e-05, + "loss": 1.068, + "step": 1175 + }, + { + "epoch": 0.18409517845961176, + "grad_norm": 4.210563659667969, + "learning_rate": 9.454219615509938e-05, + "loss": 0.8592, + "step": 1176 + }, + { + "epoch": 0.1842517219787101, + "grad_norm": 2.5477404594421387, + "learning_rate": 9.453405017921148e-05, + "loss": 1.1716, + "step": 1177 + }, + { + "epoch": 0.1844082654978084, + "grad_norm": 4.0200276374816895, + "learning_rate": 9.452590420332356e-05, + "loss": 1.1848, + "step": 1178 + }, + { + "epoch": 0.1845648090169067, + "grad_norm": 2.890082836151123, + "learning_rate": 9.451775822743565e-05, + "loss": 1.1072, + "step": 1179 + }, + { + "epoch": 0.184721352536005, + "grad_norm": 2.02858567237854, + "learning_rate": 9.450961225154775e-05, + "loss": 1.171, + "step": 1180 + }, + { + "epoch": 0.18487789605510332, + "grad_norm": 3.0327439308166504, + "learning_rate": 9.450146627565983e-05, + "loss": 1.2175, + "step": 1181 + }, + { + "epoch": 0.18503443957420163, + "grad_norm": 3.5908946990966797, + "learning_rate": 9.449332029977191e-05, + "loss": 1.5344, + "step": 1182 + }, + { + "epoch": 0.18519098309329993, + "grad_norm": 4.689337730407715, + "learning_rate": 9.448517432388401e-05, + "loss": 1.5296, + "step": 1183 + }, + { + "epoch": 0.18534752661239826, + "grad_norm": 3.519727945327759, + "learning_rate": 
9.44770283479961e-05, + "loss": 1.2727, + "step": 1184 + }, + { + "epoch": 0.18550407013149656, + "grad_norm": 5.951746940612793, + "learning_rate": 9.446888237210818e-05, + "loss": 1.7531, + "step": 1185 + }, + { + "epoch": 0.18566061365059486, + "grad_norm": 4.25757360458374, + "learning_rate": 9.446073639622027e-05, + "loss": 1.6622, + "step": 1186 + }, + { + "epoch": 0.1858171571696932, + "grad_norm": 2.9970099925994873, + "learning_rate": 9.445259042033237e-05, + "loss": 1.353, + "step": 1187 + }, + { + "epoch": 0.1859737006887915, + "grad_norm": 3.1121950149536133, + "learning_rate": 9.444444444444444e-05, + "loss": 1.6591, + "step": 1188 + }, + { + "epoch": 0.1861302442078898, + "grad_norm": 4.308781147003174, + "learning_rate": 9.443629846855654e-05, + "loss": 1.7488, + "step": 1189 + }, + { + "epoch": 0.1862867877269881, + "grad_norm": 5.041873455047607, + "learning_rate": 9.442815249266864e-05, + "loss": 1.7264, + "step": 1190 + }, + { + "epoch": 0.18644333124608642, + "grad_norm": 2.421640396118164, + "learning_rate": 9.442000651678071e-05, + "loss": 1.2846, + "step": 1191 + }, + { + "epoch": 0.18659987476518472, + "grad_norm": 2.6136326789855957, + "learning_rate": 9.44118605408928e-05, + "loss": 2.0526, + "step": 1192 + }, + { + "epoch": 0.18675641828428302, + "grad_norm": 4.037798881530762, + "learning_rate": 9.44037145650049e-05, + "loss": 1.4805, + "step": 1193 + }, + { + "epoch": 0.18691296180338135, + "grad_norm": 3.452742099761963, + "learning_rate": 9.439556858911697e-05, + "loss": 1.3365, + "step": 1194 + }, + { + "epoch": 0.18706950532247965, + "grad_norm": 2.6259775161743164, + "learning_rate": 9.438742261322907e-05, + "loss": 1.2613, + "step": 1195 + }, + { + "epoch": 0.18722604884157795, + "grad_norm": 1.3920942544937134, + "learning_rate": 9.437927663734117e-05, + "loss": 1.0826, + "step": 1196 + }, + { + "epoch": 0.18738259236067628, + "grad_norm": 7.60518217086792, + "learning_rate": 9.437113066145325e-05, + "loss": 1.0323, + "step": 
1197 + }, + { + "epoch": 0.18753913587977458, + "grad_norm": 2.491382598876953, + "learning_rate": 9.436298468556533e-05, + "loss": 1.0908, + "step": 1198 + }, + { + "epoch": 0.18769567939887288, + "grad_norm": 2.2622053623199463, + "learning_rate": 9.435483870967743e-05, + "loss": 1.237, + "step": 1199 + }, + { + "epoch": 0.18785222291797118, + "grad_norm": 1.9169540405273438, + "learning_rate": 9.434669273378951e-05, + "loss": 1.301, + "step": 1200 + }, + { + "epoch": 0.1880087664370695, + "grad_norm": 0.8661024570465088, + "learning_rate": 9.43385467579016e-05, + "loss": 0.6322, + "step": 1201 + }, + { + "epoch": 0.18816530995616781, + "grad_norm": 0.7090408802032471, + "learning_rate": 9.43304007820137e-05, + "loss": 0.4858, + "step": 1202 + }, + { + "epoch": 0.18832185347526612, + "grad_norm": 0.9260220527648926, + "learning_rate": 9.432225480612578e-05, + "loss": 0.7937, + "step": 1203 + }, + { + "epoch": 0.18847839699436444, + "grad_norm": 1.0918890237808228, + "learning_rate": 9.431410883023786e-05, + "loss": 0.6664, + "step": 1204 + }, + { + "epoch": 0.18863494051346275, + "grad_norm": 0.8582043647766113, + "learning_rate": 9.430596285434996e-05, + "loss": 0.6598, + "step": 1205 + }, + { + "epoch": 0.18879148403256105, + "grad_norm": 1.0242986679077148, + "learning_rate": 9.429781687846204e-05, + "loss": 0.6102, + "step": 1206 + }, + { + "epoch": 0.18894802755165935, + "grad_norm": 1.5157390832901, + "learning_rate": 9.428967090257413e-05, + "loss": 0.7144, + "step": 1207 + }, + { + "epoch": 0.18910457107075768, + "grad_norm": 1.0453393459320068, + "learning_rate": 9.428152492668623e-05, + "loss": 0.5453, + "step": 1208 + }, + { + "epoch": 0.18926111458985598, + "grad_norm": 1.32743501663208, + "learning_rate": 9.427337895079831e-05, + "loss": 0.6691, + "step": 1209 + }, + { + "epoch": 0.18941765810895428, + "grad_norm": 1.2345647811889648, + "learning_rate": 9.42652329749104e-05, + "loss": 0.5811, + "step": 1210 + }, + { + "epoch": 0.1895742016280526, + 
"grad_norm": 1.1013319492340088, + "learning_rate": 9.425708699902249e-05, + "loss": 0.5373, + "step": 1211 + }, + { + "epoch": 0.1897307451471509, + "grad_norm": 1.820619821548462, + "learning_rate": 9.424894102313457e-05, + "loss": 1.2584, + "step": 1212 + }, + { + "epoch": 0.1898872886662492, + "grad_norm": 2.223928451538086, + "learning_rate": 9.424079504724667e-05, + "loss": 0.7796, + "step": 1213 + }, + { + "epoch": 0.19004383218534754, + "grad_norm": 1.1811271905899048, + "learning_rate": 9.423264907135876e-05, + "loss": 0.6762, + "step": 1214 + }, + { + "epoch": 0.19020037570444584, + "grad_norm": 2.331373691558838, + "learning_rate": 9.422450309547084e-05, + "loss": 0.6996, + "step": 1215 + }, + { + "epoch": 0.19035691922354414, + "grad_norm": 0.9533920288085938, + "learning_rate": 9.421635711958294e-05, + "loss": 0.6634, + "step": 1216 + }, + { + "epoch": 0.19051346274264244, + "grad_norm": 1.376050353050232, + "learning_rate": 9.420821114369502e-05, + "loss": 0.7473, + "step": 1217 + }, + { + "epoch": 0.19067000626174077, + "grad_norm": 2.3395798206329346, + "learning_rate": 9.42000651678071e-05, + "loss": 1.1864, + "step": 1218 + }, + { + "epoch": 0.19082654978083907, + "grad_norm": 1.7855169773101807, + "learning_rate": 9.41919191919192e-05, + "loss": 1.0084, + "step": 1219 + }, + { + "epoch": 0.19098309329993737, + "grad_norm": 2.2739052772521973, + "learning_rate": 9.418377321603128e-05, + "loss": 0.6761, + "step": 1220 + }, + { + "epoch": 0.1911396368190357, + "grad_norm": 1.9794983863830566, + "learning_rate": 9.417562724014337e-05, + "loss": 0.7392, + "step": 1221 + }, + { + "epoch": 0.191296180338134, + "grad_norm": 2.34328556060791, + "learning_rate": 9.416748126425547e-05, + "loss": 1.135, + "step": 1222 + }, + { + "epoch": 0.1914527238572323, + "grad_norm": 1.3545399904251099, + "learning_rate": 9.415933528836755e-05, + "loss": 0.7857, + "step": 1223 + }, + { + "epoch": 0.19160926737633063, + "grad_norm": 1.8019368648529053, + "learning_rate": 
9.415118931247963e-05, + "loss": 0.7308, + "step": 1224 + }, + { + "epoch": 0.19176581089542893, + "grad_norm": 4.050002574920654, + "learning_rate": 9.414304333659173e-05, + "loss": 1.0107, + "step": 1225 + }, + { + "epoch": 0.19192235441452724, + "grad_norm": 2.3719847202301025, + "learning_rate": 9.413489736070383e-05, + "loss": 1.5422, + "step": 1226 + }, + { + "epoch": 0.19207889793362554, + "grad_norm": 2.4973092079162598, + "learning_rate": 9.41267513848159e-05, + "loss": 0.9898, + "step": 1227 + }, + { + "epoch": 0.19223544145272387, + "grad_norm": 2.6661057472229004, + "learning_rate": 9.4118605408928e-05, + "loss": 1.0172, + "step": 1228 + }, + { + "epoch": 0.19239198497182217, + "grad_norm": 1.8262989521026611, + "learning_rate": 9.411045943304009e-05, + "loss": 1.0259, + "step": 1229 + }, + { + "epoch": 0.19254852849092047, + "grad_norm": 2.442551851272583, + "learning_rate": 9.410231345715216e-05, + "loss": 0.8116, + "step": 1230 + }, + { + "epoch": 0.1927050720100188, + "grad_norm": 2.48152494430542, + "learning_rate": 9.409416748126426e-05, + "loss": 1.1904, + "step": 1231 + }, + { + "epoch": 0.1928616155291171, + "grad_norm": 2.646953582763672, + "learning_rate": 9.408602150537636e-05, + "loss": 1.1935, + "step": 1232 + }, + { + "epoch": 0.1930181590482154, + "grad_norm": 4.678431510925293, + "learning_rate": 9.407787552948844e-05, + "loss": 1.2386, + "step": 1233 + }, + { + "epoch": 0.1931747025673137, + "grad_norm": 2.2991528511047363, + "learning_rate": 9.406972955360053e-05, + "loss": 1.5982, + "step": 1234 + }, + { + "epoch": 0.19333124608641203, + "grad_norm": 2.5070464611053467, + "learning_rate": 9.406158357771262e-05, + "loss": 1.4338, + "step": 1235 + }, + { + "epoch": 0.19348778960551033, + "grad_norm": 4.98615026473999, + "learning_rate": 9.40534376018247e-05, + "loss": 1.4173, + "step": 1236 + }, + { + "epoch": 0.19364433312460863, + "grad_norm": 3.010223150253296, + "learning_rate": 9.404529162593679e-05, + "loss": 1.516, + "step": 
1237 + }, + { + "epoch": 0.19380087664370696, + "grad_norm": 3.4859137535095215, + "learning_rate": 9.403714565004889e-05, + "loss": 1.3162, + "step": 1238 + }, + { + "epoch": 0.19395742016280526, + "grad_norm": 3.1408956050872803, + "learning_rate": 9.402899967416097e-05, + "loss": 1.0587, + "step": 1239 + }, + { + "epoch": 0.19411396368190356, + "grad_norm": 7.333593368530273, + "learning_rate": 9.402085369827305e-05, + "loss": 1.4503, + "step": 1240 + }, + { + "epoch": 0.1942705072010019, + "grad_norm": 6.244460582733154, + "learning_rate": 9.401270772238515e-05, + "loss": 2.1686, + "step": 1241 + }, + { + "epoch": 0.1944270507201002, + "grad_norm": 4.436493396759033, + "learning_rate": 9.400456174649724e-05, + "loss": 1.2225, + "step": 1242 + }, + { + "epoch": 0.1945835942391985, + "grad_norm": 3.8962366580963135, + "learning_rate": 9.399641577060932e-05, + "loss": 1.1513, + "step": 1243 + }, + { + "epoch": 0.1947401377582968, + "grad_norm": 8.012316703796387, + "learning_rate": 9.398826979472142e-05, + "loss": 1.2379, + "step": 1244 + }, + { + "epoch": 0.19489668127739512, + "grad_norm": 3.156188726425171, + "learning_rate": 9.39801238188335e-05, + "loss": 0.391, + "step": 1245 + }, + { + "epoch": 0.19505322479649342, + "grad_norm": 8.6339111328125, + "learning_rate": 9.39719778429456e-05, + "loss": 0.5806, + "step": 1246 + }, + { + "epoch": 0.19520976831559173, + "grad_norm": 5.6307854652404785, + "learning_rate": 9.396383186705768e-05, + "loss": 1.1113, + "step": 1247 + }, + { + "epoch": 0.19536631183469005, + "grad_norm": 6.830837249755859, + "learning_rate": 9.395568589116977e-05, + "loss": 1.9252, + "step": 1248 + }, + { + "epoch": 0.19552285535378836, + "grad_norm": 5.6442365646362305, + "learning_rate": 9.394753991528186e-05, + "loss": 1.7109, + "step": 1249 + }, + { + "epoch": 0.19567939887288666, + "grad_norm": 5.421234130859375, + "learning_rate": 9.393939393939395e-05, + "loss": 1.7375, + "step": 1250 + }, + { + "epoch": 0.19583594239198499, + 
"grad_norm": 0.9163038730621338, + "learning_rate": 9.393124796350603e-05, + "loss": 0.5861, + "step": 1251 + }, + { + "epoch": 0.1959924859110833, + "grad_norm": 1.3764950037002563, + "learning_rate": 9.392310198761813e-05, + "loss": 0.8077, + "step": 1252 + }, + { + "epoch": 0.1961490294301816, + "grad_norm": 1.0001063346862793, + "learning_rate": 9.391495601173021e-05, + "loss": 0.7313, + "step": 1253 + }, + { + "epoch": 0.1963055729492799, + "grad_norm": 0.8562139868736267, + "learning_rate": 9.39068100358423e-05, + "loss": 0.6418, + "step": 1254 + }, + { + "epoch": 0.19646211646837822, + "grad_norm": 1.0316778421401978, + "learning_rate": 9.389866405995439e-05, + "loss": 0.5201, + "step": 1255 + }, + { + "epoch": 0.19661865998747652, + "grad_norm": 1.113510012626648, + "learning_rate": 9.389051808406648e-05, + "loss": 0.4609, + "step": 1256 + }, + { + "epoch": 0.19677520350657482, + "grad_norm": 1.545259952545166, + "learning_rate": 9.388237210817856e-05, + "loss": 0.5838, + "step": 1257 + }, + { + "epoch": 0.19693174702567315, + "grad_norm": 1.073952078819275, + "learning_rate": 9.387422613229066e-05, + "loss": 0.4825, + "step": 1258 + }, + { + "epoch": 0.19708829054477145, + "grad_norm": 1.9349051713943481, + "learning_rate": 9.386608015640274e-05, + "loss": 0.7106, + "step": 1259 + }, + { + "epoch": 0.19724483406386975, + "grad_norm": 0.8273319005966187, + "learning_rate": 9.385793418051482e-05, + "loss": 0.4427, + "step": 1260 + }, + { + "epoch": 0.19740137758296805, + "grad_norm": 0.7567356824874878, + "learning_rate": 9.384978820462692e-05, + "loss": 0.4958, + "step": 1261 + }, + { + "epoch": 0.19755792110206638, + "grad_norm": 1.1207046508789062, + "learning_rate": 9.384164222873902e-05, + "loss": 0.6386, + "step": 1262 + }, + { + "epoch": 0.19771446462116468, + "grad_norm": 0.9304898381233215, + "learning_rate": 9.383349625285109e-05, + "loss": 0.6811, + "step": 1263 + }, + { + "epoch": 0.19787100814026298, + "grad_norm": 1.0706266164779663, + 
"learning_rate": 9.382535027696319e-05, + "loss": 0.6782, + "step": 1264 + }, + { + "epoch": 0.1980275516593613, + "grad_norm": 0.9343996644020081, + "learning_rate": 9.381720430107528e-05, + "loss": 0.5053, + "step": 1265 + }, + { + "epoch": 0.1981840951784596, + "grad_norm": 1.761564016342163, + "learning_rate": 9.380905832518735e-05, + "loss": 0.6244, + "step": 1266 + }, + { + "epoch": 0.1983406386975579, + "grad_norm": 1.354180097579956, + "learning_rate": 9.380091234929945e-05, + "loss": 0.6874, + "step": 1267 + }, + { + "epoch": 0.19849718221665624, + "grad_norm": 4.931132793426514, + "learning_rate": 9.379276637341155e-05, + "loss": 0.7482, + "step": 1268 + }, + { + "epoch": 0.19865372573575454, + "grad_norm": 2.4626882076263428, + "learning_rate": 9.378462039752363e-05, + "loss": 0.9531, + "step": 1269 + }, + { + "epoch": 0.19881026925485284, + "grad_norm": 2.597654342651367, + "learning_rate": 9.377647442163572e-05, + "loss": 0.8217, + "step": 1270 + }, + { + "epoch": 0.19896681277395115, + "grad_norm": 1.7531356811523438, + "learning_rate": 9.376832844574781e-05, + "loss": 0.6511, + "step": 1271 + }, + { + "epoch": 0.19912335629304947, + "grad_norm": 9.555440902709961, + "learning_rate": 9.37601824698599e-05, + "loss": 1.4784, + "step": 1272 + }, + { + "epoch": 0.19927989981214778, + "grad_norm": 1.507347822189331, + "learning_rate": 9.375203649397198e-05, + "loss": 0.9405, + "step": 1273 + }, + { + "epoch": 0.19943644333124608, + "grad_norm": 3.1928799152374268, + "learning_rate": 9.374389051808406e-05, + "loss": 1.0975, + "step": 1274 + }, + { + "epoch": 0.1995929868503444, + "grad_norm": 1.813779354095459, + "learning_rate": 9.373574454219616e-05, + "loss": 1.0423, + "step": 1275 + }, + { + "epoch": 0.1997495303694427, + "grad_norm": 2.060410499572754, + "learning_rate": 9.372759856630825e-05, + "loss": 1.1083, + "step": 1276 + }, + { + "epoch": 0.199906073888541, + "grad_norm": 2.229388952255249, + "learning_rate": 9.371945259042033e-05, + "loss": 
1.1035, + "step": 1277 + }, + { + "epoch": 0.20006261740763934, + "grad_norm": 1.8868850469589233, + "learning_rate": 9.371130661453243e-05, + "loss": 1.1548, + "step": 1278 + }, + { + "epoch": 0.20021916092673764, + "grad_norm": 2.243337392807007, + "learning_rate": 9.370316063864451e-05, + "loss": 1.1123, + "step": 1279 + }, + { + "epoch": 0.20037570444583594, + "grad_norm": 3.1789510250091553, + "learning_rate": 9.36950146627566e-05, + "loss": 1.2563, + "step": 1280 + }, + { + "epoch": 0.20053224796493424, + "grad_norm": 2.0098800659179688, + "learning_rate": 9.368686868686869e-05, + "loss": 0.8743, + "step": 1281 + }, + { + "epoch": 0.20068879148403257, + "grad_norm": 2.4257969856262207, + "learning_rate": 9.367872271098078e-05, + "loss": 1.2374, + "step": 1282 + }, + { + "epoch": 0.20084533500313087, + "grad_norm": 1.7483782768249512, + "learning_rate": 9.367057673509286e-05, + "loss": 1.1698, + "step": 1283 + }, + { + "epoch": 0.20100187852222917, + "grad_norm": 4.912705421447754, + "learning_rate": 9.366243075920496e-05, + "loss": 1.1464, + "step": 1284 + }, + { + "epoch": 0.2011584220413275, + "grad_norm": 2.687391757965088, + "learning_rate": 9.365428478331705e-05, + "loss": 1.3903, + "step": 1285 + }, + { + "epoch": 0.2013149655604258, + "grad_norm": 1.9235968589782715, + "learning_rate": 9.364613880742912e-05, + "loss": 1.3238, + "step": 1286 + }, + { + "epoch": 0.2014715090795241, + "grad_norm": 1.7629886865615845, + "learning_rate": 9.363799283154122e-05, + "loss": 1.1748, + "step": 1287 + }, + { + "epoch": 0.2016280525986224, + "grad_norm": 3.3347394466400146, + "learning_rate": 9.362984685565332e-05, + "loss": 1.0821, + "step": 1288 + }, + { + "epoch": 0.20178459611772073, + "grad_norm": 3.7036519050598145, + "learning_rate": 9.362170087976539e-05, + "loss": 1.7971, + "step": 1289 + }, + { + "epoch": 0.20194113963681903, + "grad_norm": 2.8648264408111572, + "learning_rate": 9.361355490387749e-05, + "loss": 1.2899, + "step": 1290 + }, + { + "epoch": 
0.20209768315591733, + "grad_norm": 3.632009983062744, + "learning_rate": 9.360540892798958e-05, + "loss": 1.8949, + "step": 1291 + }, + { + "epoch": 0.20225422667501566, + "grad_norm": 6.7197442054748535, + "learning_rate": 9.359726295210167e-05, + "loss": 1.7806, + "step": 1292 + }, + { + "epoch": 0.20241077019411396, + "grad_norm": 1.979057788848877, + "learning_rate": 9.358911697621375e-05, + "loss": 1.2174, + "step": 1293 + }, + { + "epoch": 0.20256731371321227, + "grad_norm": 1.9837782382965088, + "learning_rate": 9.358097100032585e-05, + "loss": 1.225, + "step": 1294 + }, + { + "epoch": 0.2027238572323106, + "grad_norm": 3.6657867431640625, + "learning_rate": 9.357282502443793e-05, + "loss": 1.5789, + "step": 1295 + }, + { + "epoch": 0.2028804007514089, + "grad_norm": 1.779935598373413, + "learning_rate": 9.356467904855002e-05, + "loss": 0.8684, + "step": 1296 + }, + { + "epoch": 0.2030369442705072, + "grad_norm": 2.5677852630615234, + "learning_rate": 9.355653307266211e-05, + "loss": 0.845, + "step": 1297 + }, + { + "epoch": 0.2031934877896055, + "grad_norm": 3.765028953552246, + "learning_rate": 9.35483870967742e-05, + "loss": 0.9966, + "step": 1298 + }, + { + "epoch": 0.20335003130870383, + "grad_norm": 2.8814315795898438, + "learning_rate": 9.354024112088628e-05, + "loss": 1.2393, + "step": 1299 + }, + { + "epoch": 0.20350657482780213, + "grad_norm": 3.1904704570770264, + "learning_rate": 9.353209514499838e-05, + "loss": 1.2735, + "step": 1300 + }, + { + "epoch": 0.20366311834690043, + "grad_norm": 1.065990924835205, + "learning_rate": 9.352394916911046e-05, + "loss": 0.6504, + "step": 1301 + }, + { + "epoch": 0.20381966186599876, + "grad_norm": 1.0243330001831055, + "learning_rate": 9.351580319322255e-05, + "loss": 0.5616, + "step": 1302 + }, + { + "epoch": 0.20397620538509706, + "grad_norm": 1.2944272756576538, + "learning_rate": 9.350765721733464e-05, + "loss": 0.7233, + "step": 1303 + }, + { + "epoch": 0.20413274890419536, + "grad_norm": 
1.1203150749206543, + "learning_rate": 9.349951124144673e-05, + "loss": 0.5799, + "step": 1304 + }, + { + "epoch": 0.2042892924232937, + "grad_norm": 1.1982793807983398, + "learning_rate": 9.349136526555881e-05, + "loss": 0.6881, + "step": 1305 + }, + { + "epoch": 0.204445835942392, + "grad_norm": 2.1102166175842285, + "learning_rate": 9.348321928967091e-05, + "loss": 0.6604, + "step": 1306 + }, + { + "epoch": 0.2046023794614903, + "grad_norm": 1.2727168798446655, + "learning_rate": 9.347507331378299e-05, + "loss": 0.6403, + "step": 1307 + }, + { + "epoch": 0.2047589229805886, + "grad_norm": 2.502011299133301, + "learning_rate": 9.346692733789509e-05, + "loss": 0.6814, + "step": 1308 + }, + { + "epoch": 0.20491546649968692, + "grad_norm": 1.487305998802185, + "learning_rate": 9.345878136200717e-05, + "loss": 0.5588, + "step": 1309 + }, + { + "epoch": 0.20507201001878522, + "grad_norm": 0.9032906293869019, + "learning_rate": 9.345063538611926e-05, + "loss": 0.562, + "step": 1310 + }, + { + "epoch": 0.20522855353788352, + "grad_norm": 9.182206153869629, + "learning_rate": 9.344248941023135e-05, + "loss": 1.7004, + "step": 1311 + }, + { + "epoch": 0.20538509705698185, + "grad_norm": 1.2059659957885742, + "learning_rate": 9.343434343434344e-05, + "loss": 0.537, + "step": 1312 + }, + { + "epoch": 0.20554164057608015, + "grad_norm": 1.5699447393417358, + "learning_rate": 9.342619745845552e-05, + "loss": 0.8089, + "step": 1313 + }, + { + "epoch": 0.20569818409517845, + "grad_norm": 1.4109132289886475, + "learning_rate": 9.341805148256762e-05, + "loss": 0.7127, + "step": 1314 + }, + { + "epoch": 0.20585472761427676, + "grad_norm": 2.4728777408599854, + "learning_rate": 9.34099055066797e-05, + "loss": 0.9266, + "step": 1315 + }, + { + "epoch": 0.20601127113337508, + "grad_norm": 1.4319103956222534, + "learning_rate": 9.340175953079179e-05, + "loss": 0.7595, + "step": 1316 + }, + { + "epoch": 0.20616781465247339, + "grad_norm": 1.5819519758224487, + "learning_rate": 
9.339361355490388e-05, + "loss": 0.525, + "step": 1317 + }, + { + "epoch": 0.2063243581715717, + "grad_norm": 2.322563409805298, + "learning_rate": 9.338546757901597e-05, + "loss": 0.9118, + "step": 1318 + }, + { + "epoch": 0.20648090169067002, + "grad_norm": 3.988162040710449, + "learning_rate": 9.337732160312805e-05, + "loss": 0.9608, + "step": 1319 + }, + { + "epoch": 0.20663744520976832, + "grad_norm": 1.6593596935272217, + "learning_rate": 9.336917562724015e-05, + "loss": 0.8975, + "step": 1320 + }, + { + "epoch": 0.20679398872886662, + "grad_norm": 2.8438005447387695, + "learning_rate": 9.336102965135224e-05, + "loss": 0.7564, + "step": 1321 + }, + { + "epoch": 0.20695053224796495, + "grad_norm": 1.7777326107025146, + "learning_rate": 9.335288367546432e-05, + "loss": 0.6987, + "step": 1322 + }, + { + "epoch": 0.20710707576706325, + "grad_norm": 2.5962610244750977, + "learning_rate": 9.334473769957641e-05, + "loss": 1.0713, + "step": 1323 + }, + { + "epoch": 0.20726361928616155, + "grad_norm": 1.596942663192749, + "learning_rate": 9.333659172368851e-05, + "loss": 0.7015, + "step": 1324 + }, + { + "epoch": 0.20742016280525985, + "grad_norm": 2.4506750106811523, + "learning_rate": 9.332844574780058e-05, + "loss": 0.8415, + "step": 1325 + }, + { + "epoch": 0.20757670632435818, + "grad_norm": 2.6221978664398193, + "learning_rate": 9.332029977191268e-05, + "loss": 0.89, + "step": 1326 + }, + { + "epoch": 0.20773324984345648, + "grad_norm": 4.247326850891113, + "learning_rate": 9.331215379602477e-05, + "loss": 1.4443, + "step": 1327 + }, + { + "epoch": 0.20788979336255478, + "grad_norm": 2.8862500190734863, + "learning_rate": 9.330400782013686e-05, + "loss": 0.896, + "step": 1328 + }, + { + "epoch": 0.2080463368816531, + "grad_norm": 3.7354438304901123, + "learning_rate": 9.329586184424894e-05, + "loss": 0.7501, + "step": 1329 + }, + { + "epoch": 0.2082028804007514, + "grad_norm": 1.9672240018844604, + "learning_rate": 9.328771586836104e-05, + "loss": 1.0458, + 
"step": 1330 + }, + { + "epoch": 0.2083594239198497, + "grad_norm": 3.0387041568756104, + "learning_rate": 9.327956989247312e-05, + "loss": 0.9896, + "step": 1331 + }, + { + "epoch": 0.20851596743894804, + "grad_norm": 1.8042062520980835, + "learning_rate": 9.327142391658521e-05, + "loss": 0.9002, + "step": 1332 + }, + { + "epoch": 0.20867251095804634, + "grad_norm": 2.8317506313323975, + "learning_rate": 9.32632779406973e-05, + "loss": 1.2379, + "step": 1333 + }, + { + "epoch": 0.20882905447714464, + "grad_norm": 2.0722391605377197, + "learning_rate": 9.325513196480939e-05, + "loss": 1.0703, + "step": 1334 + }, + { + "epoch": 0.20898559799624294, + "grad_norm": 4.213514804840088, + "learning_rate": 9.324698598892147e-05, + "loss": 1.5835, + "step": 1335 + }, + { + "epoch": 0.20914214151534127, + "grad_norm": 2.878437042236328, + "learning_rate": 9.323884001303357e-05, + "loss": 1.2967, + "step": 1336 + }, + { + "epoch": 0.20929868503443957, + "grad_norm": 4.726464748382568, + "learning_rate": 9.323069403714565e-05, + "loss": 1.479, + "step": 1337 + }, + { + "epoch": 0.20945522855353788, + "grad_norm": 4.130889415740967, + "learning_rate": 9.322254806125774e-05, + "loss": 1.6146, + "step": 1338 + }, + { + "epoch": 0.2096117720726362, + "grad_norm": 4.002620220184326, + "learning_rate": 9.321440208536983e-05, + "loss": 1.4481, + "step": 1339 + }, + { + "epoch": 0.2097683155917345, + "grad_norm": 3.8769068717956543, + "learning_rate": 9.320625610948192e-05, + "loss": 0.9713, + "step": 1340 + }, + { + "epoch": 0.2099248591108328, + "grad_norm": 4.346937656402588, + "learning_rate": 9.3198110133594e-05, + "loss": 1.4391, + "step": 1341 + }, + { + "epoch": 0.2100814026299311, + "grad_norm": 3.4901018142700195, + "learning_rate": 9.31899641577061e-05, + "loss": 1.353, + "step": 1342 + }, + { + "epoch": 0.21023794614902944, + "grad_norm": 5.876597881317139, + "learning_rate": 9.318181818181818e-05, + "loss": 1.1164, + "step": 1343 + }, + { + "epoch": 0.21039448966812774, 
+ "grad_norm": 3.9651377201080322, + "learning_rate": 9.317367220593028e-05, + "loss": 1.4202, + "step": 1344 + }, + { + "epoch": 0.21055103318722604, + "grad_norm": 3.5283522605895996, + "learning_rate": 9.316552623004236e-05, + "loss": 1.3319, + "step": 1345 + }, + { + "epoch": 0.21070757670632437, + "grad_norm": 3.0536913871765137, + "learning_rate": 9.315738025415445e-05, + "loss": 0.9184, + "step": 1346 + }, + { + "epoch": 0.21086412022542267, + "grad_norm": 3.0349607467651367, + "learning_rate": 9.314923427826654e-05, + "loss": 0.9091, + "step": 1347 + }, + { + "epoch": 0.21102066374452097, + "grad_norm": 5.009030342102051, + "learning_rate": 9.314108830237863e-05, + "loss": 1.2181, + "step": 1348 + }, + { + "epoch": 0.2111772072636193, + "grad_norm": 3.432375907897949, + "learning_rate": 9.313294232649071e-05, + "loss": 0.7257, + "step": 1349 + }, + { + "epoch": 0.2113337507827176, + "grad_norm": 2.318659782409668, + "learning_rate": 9.312479635060281e-05, + "loss": 1.1679, + "step": 1350 + }, + { + "epoch": 0.2114902943018159, + "grad_norm": 0.5922752618789673, + "learning_rate": 9.311665037471489e-05, + "loss": 0.4955, + "step": 1351 + }, + { + "epoch": 0.2116468378209142, + "grad_norm": 1.0195848941802979, + "learning_rate": 9.310850439882698e-05, + "loss": 0.6851, + "step": 1352 + }, + { + "epoch": 0.21180338134001253, + "grad_norm": 0.8145252466201782, + "learning_rate": 9.310035842293907e-05, + "loss": 0.5109, + "step": 1353 + }, + { + "epoch": 0.21195992485911083, + "grad_norm": 0.9016205668449402, + "learning_rate": 9.309221244705116e-05, + "loss": 0.5495, + "step": 1354 + }, + { + "epoch": 0.21211646837820913, + "grad_norm": 0.6659998297691345, + "learning_rate": 9.308406647116324e-05, + "loss": 0.4263, + "step": 1355 + }, + { + "epoch": 0.21227301189730746, + "grad_norm": 1.074156403541565, + "learning_rate": 9.307592049527534e-05, + "loss": 0.6891, + "step": 1356 + }, + { + "epoch": 0.21242955541640576, + "grad_norm": 1.7256790399551392, + 
"learning_rate": 9.306777451938744e-05, + "loss": 0.7369, + "step": 1357 + }, + { + "epoch": 0.21258609893550406, + "grad_norm": 1.440400242805481, + "learning_rate": 9.30596285434995e-05, + "loss": 0.7195, + "step": 1358 + }, + { + "epoch": 0.2127426424546024, + "grad_norm": 1.2697504758834839, + "learning_rate": 9.30514825676116e-05, + "loss": 0.5994, + "step": 1359 + }, + { + "epoch": 0.2128991859737007, + "grad_norm": 0.8751150965690613, + "learning_rate": 9.30433365917237e-05, + "loss": 0.4889, + "step": 1360 + }, + { + "epoch": 0.213055729492799, + "grad_norm": 1.0634349584579468, + "learning_rate": 9.303519061583577e-05, + "loss": 0.7204, + "step": 1361 + }, + { + "epoch": 0.2132122730118973, + "grad_norm": 1.3616931438446045, + "learning_rate": 9.302704463994787e-05, + "loss": 0.7535, + "step": 1362 + }, + { + "epoch": 0.21336881653099563, + "grad_norm": 1.3447051048278809, + "learning_rate": 9.301889866405997e-05, + "loss": 0.7837, + "step": 1363 + }, + { + "epoch": 0.21352536005009393, + "grad_norm": 1.4100804328918457, + "learning_rate": 9.301075268817204e-05, + "loss": 0.5246, + "step": 1364 + }, + { + "epoch": 0.21368190356919223, + "grad_norm": 1.442376971244812, + "learning_rate": 9.300260671228413e-05, + "loss": 0.5298, + "step": 1365 + }, + { + "epoch": 0.21383844708829056, + "grad_norm": 1.7990407943725586, + "learning_rate": 9.299446073639623e-05, + "loss": 0.8569, + "step": 1366 + }, + { + "epoch": 0.21399499060738886, + "grad_norm": 2.030313730239868, + "learning_rate": 9.298631476050831e-05, + "loss": 1.1292, + "step": 1367 + }, + { + "epoch": 0.21415153412648716, + "grad_norm": 1.2522867918014526, + "learning_rate": 9.29781687846204e-05, + "loss": 0.5369, + "step": 1368 + }, + { + "epoch": 0.21430807764558546, + "grad_norm": 3.218400716781616, + "learning_rate": 9.29700228087325e-05, + "loss": 1.2701, + "step": 1369 + }, + { + "epoch": 0.2144646211646838, + "grad_norm": 2.592564344406128, + "learning_rate": 9.296187683284458e-05, + "loss": 
1.1301, + "step": 1370 + }, + { + "epoch": 0.2146211646837821, + "grad_norm": 1.8669476509094238, + "learning_rate": 9.295373085695666e-05, + "loss": 0.6319, + "step": 1371 + }, + { + "epoch": 0.2147777082028804, + "grad_norm": 2.526078462600708, + "learning_rate": 9.294558488106876e-05, + "loss": 0.6903, + "step": 1372 + }, + { + "epoch": 0.21493425172197872, + "grad_norm": 4.61427116394043, + "learning_rate": 9.293743890518084e-05, + "loss": 1.0084, + "step": 1373 + }, + { + "epoch": 0.21509079524107702, + "grad_norm": 2.789186716079712, + "learning_rate": 9.292929292929293e-05, + "loss": 0.9949, + "step": 1374 + }, + { + "epoch": 0.21524733876017532, + "grad_norm": 2.208681583404541, + "learning_rate": 9.292114695340502e-05, + "loss": 0.958, + "step": 1375 + }, + { + "epoch": 0.21540388227927365, + "grad_norm": 3.633007526397705, + "learning_rate": 9.291300097751711e-05, + "loss": 0.9502, + "step": 1376 + }, + { + "epoch": 0.21556042579837195, + "grad_norm": 2.8654401302337646, + "learning_rate": 9.290485500162919e-05, + "loss": 0.9536, + "step": 1377 + }, + { + "epoch": 0.21571696931747025, + "grad_norm": 3.3582777976989746, + "learning_rate": 9.289670902574129e-05, + "loss": 0.6716, + "step": 1378 + }, + { + "epoch": 0.21587351283656855, + "grad_norm": 1.4983841180801392, + "learning_rate": 9.288856304985337e-05, + "loss": 0.8229, + "step": 1379 + }, + { + "epoch": 0.21603005635566688, + "grad_norm": 3.9039127826690674, + "learning_rate": 9.288041707396547e-05, + "loss": 1.1788, + "step": 1380 + }, + { + "epoch": 0.21618659987476518, + "grad_norm": 3.0855376720428467, + "learning_rate": 9.287227109807755e-05, + "loss": 0.9716, + "step": 1381 + }, + { + "epoch": 0.21634314339386349, + "grad_norm": 2.4269497394561768, + "learning_rate": 9.286412512218964e-05, + "loss": 1.0321, + "step": 1382 + }, + { + "epoch": 0.21649968691296181, + "grad_norm": 2.8434529304504395, + "learning_rate": 9.285597914630174e-05, + "loss": 1.0714, + "step": 1383 + }, + { + "epoch": 
0.21665623043206012, + "grad_norm": 2.6682474613189697, + "learning_rate": 9.284783317041382e-05, + "loss": 1.4751, + "step": 1384 + }, + { + "epoch": 0.21681277395115842, + "grad_norm": 3.3393876552581787, + "learning_rate": 9.28396871945259e-05, + "loss": 1.4657, + "step": 1385 + }, + { + "epoch": 0.21696931747025675, + "grad_norm": 5.05244779586792, + "learning_rate": 9.2831541218638e-05, + "loss": 1.3253, + "step": 1386 + }, + { + "epoch": 0.21712586098935505, + "grad_norm": 3.540489912033081, + "learning_rate": 9.282339524275008e-05, + "loss": 1.3896, + "step": 1387 + }, + { + "epoch": 0.21728240450845335, + "grad_norm": 4.3837056159973145, + "learning_rate": 9.281524926686217e-05, + "loss": 1.3156, + "step": 1388 + }, + { + "epoch": 0.21743894802755165, + "grad_norm": 4.260959625244141, + "learning_rate": 9.280710329097427e-05, + "loss": 1.6383, + "step": 1389 + }, + { + "epoch": 0.21759549154664998, + "grad_norm": 4.492507457733154, + "learning_rate": 9.279895731508635e-05, + "loss": 1.2715, + "step": 1390 + }, + { + "epoch": 0.21775203506574828, + "grad_norm": 4.508911609649658, + "learning_rate": 9.279081133919843e-05, + "loss": 1.6974, + "step": 1391 + }, + { + "epoch": 0.21790857858484658, + "grad_norm": 4.42802619934082, + "learning_rate": 9.278266536331053e-05, + "loss": 1.7917, + "step": 1392 + }, + { + "epoch": 0.2180651221039449, + "grad_norm": 8.301690101623535, + "learning_rate": 9.277451938742261e-05, + "loss": 1.4999, + "step": 1393 + }, + { + "epoch": 0.2182216656230432, + "grad_norm": 3.355602502822876, + "learning_rate": 9.27663734115347e-05, + "loss": 1.1966, + "step": 1394 + }, + { + "epoch": 0.2183782091421415, + "grad_norm": 3.5317845344543457, + "learning_rate": 9.27582274356468e-05, + "loss": 2.2579, + "step": 1395 + }, + { + "epoch": 0.2185347526612398, + "grad_norm": 3.53718638420105, + "learning_rate": 9.275008145975889e-05, + "loss": 1.0853, + "step": 1396 + }, + { + "epoch": 0.21869129618033814, + "grad_norm": 3.869640350341797, + 
"learning_rate": 9.274193548387096e-05, + "loss": 1.0166, + "step": 1397 + }, + { + "epoch": 0.21884783969943644, + "grad_norm": 4.703973293304443, + "learning_rate": 9.273378950798306e-05, + "loss": 1.4152, + "step": 1398 + }, + { + "epoch": 0.21900438321853474, + "grad_norm": 5.324431419372559, + "learning_rate": 9.272564353209516e-05, + "loss": 1.8427, + "step": 1399 + }, + { + "epoch": 0.21916092673763307, + "grad_norm": 4.419987678527832, + "learning_rate": 9.271749755620723e-05, + "loss": 1.8388, + "step": 1400 + }, + { + "epoch": 0.21931747025673137, + "grad_norm": 1.7905488014221191, + "learning_rate": 9.270935158031932e-05, + "loss": 0.5226, + "step": 1401 + }, + { + "epoch": 0.21947401377582967, + "grad_norm": 0.8043189644813538, + "learning_rate": 9.270120560443142e-05, + "loss": 0.5089, + "step": 1402 + }, + { + "epoch": 0.219630557294928, + "grad_norm": 0.8196176886558533, + "learning_rate": 9.26930596285435e-05, + "loss": 0.497, + "step": 1403 + }, + { + "epoch": 0.2197871008140263, + "grad_norm": 0.9577687382698059, + "learning_rate": 9.268491365265559e-05, + "loss": 0.6124, + "step": 1404 + }, + { + "epoch": 0.2199436443331246, + "grad_norm": 1.3607045412063599, + "learning_rate": 9.267676767676769e-05, + "loss": 0.5306, + "step": 1405 + }, + { + "epoch": 0.2201001878522229, + "grad_norm": 1.048966407775879, + "learning_rate": 9.266862170087977e-05, + "loss": 0.5552, + "step": 1406 + }, + { + "epoch": 0.22025673137132123, + "grad_norm": 0.7352248430252075, + "learning_rate": 9.266047572499185e-05, + "loss": 0.5005, + "step": 1407 + }, + { + "epoch": 0.22041327489041954, + "grad_norm": 0.8931574821472168, + "learning_rate": 9.265232974910395e-05, + "loss": 0.4698, + "step": 1408 + }, + { + "epoch": 0.22056981840951784, + "grad_norm": 0.7106713652610779, + "learning_rate": 9.264418377321603e-05, + "loss": 0.3995, + "step": 1409 + }, + { + "epoch": 0.22072636192861617, + "grad_norm": 1.1096149682998657, + "learning_rate": 9.263603779732812e-05, + 
"loss": 0.5018, + "step": 1410 + }, + { + "epoch": 0.22088290544771447, + "grad_norm": 1.0810846090316772, + "learning_rate": 9.262789182144022e-05, + "loss": 0.6657, + "step": 1411 + }, + { + "epoch": 0.22103944896681277, + "grad_norm": 0.9990880489349365, + "learning_rate": 9.26197458455523e-05, + "loss": 0.6419, + "step": 1412 + }, + { + "epoch": 0.2211959924859111, + "grad_norm": 1.1680829524993896, + "learning_rate": 9.261159986966438e-05, + "loss": 0.5961, + "step": 1413 + }, + { + "epoch": 0.2213525360050094, + "grad_norm": 1.1811153888702393, + "learning_rate": 9.260345389377648e-05, + "loss": 0.4339, + "step": 1414 + }, + { + "epoch": 0.2215090795241077, + "grad_norm": 1.1097323894500732, + "learning_rate": 9.259530791788856e-05, + "loss": 0.5938, + "step": 1415 + }, + { + "epoch": 0.221665623043206, + "grad_norm": 1.2891976833343506, + "learning_rate": 9.258716194200066e-05, + "loss": 0.8579, + "step": 1416 + }, + { + "epoch": 0.22182216656230433, + "grad_norm": 3.2133665084838867, + "learning_rate": 9.257901596611275e-05, + "loss": 0.8831, + "step": 1417 + }, + { + "epoch": 0.22197871008140263, + "grad_norm": 2.1581976413726807, + "learning_rate": 9.257086999022483e-05, + "loss": 0.8311, + "step": 1418 + }, + { + "epoch": 0.22213525360050093, + "grad_norm": 1.5241210460662842, + "learning_rate": 9.256272401433693e-05, + "loss": 0.7847, + "step": 1419 + }, + { + "epoch": 0.22229179711959926, + "grad_norm": 2.106407642364502, + "learning_rate": 9.255457803844901e-05, + "loss": 0.4729, + "step": 1420 + }, + { + "epoch": 0.22244834063869756, + "grad_norm": 1.4887981414794922, + "learning_rate": 9.25464320625611e-05, + "loss": 0.6722, + "step": 1421 + }, + { + "epoch": 0.22260488415779586, + "grad_norm": 2.2446999549865723, + "learning_rate": 9.253828608667319e-05, + "loss": 0.6978, + "step": 1422 + }, + { + "epoch": 0.22276142767689416, + "grad_norm": 2.6446547508239746, + "learning_rate": 9.253014011078528e-05, + "loss": 0.6133, + "step": 1423 + }, + { + 
"epoch": 0.2229179711959925, + "grad_norm": 4.168311595916748, + "learning_rate": 9.252199413489736e-05, + "loss": 0.9358, + "step": 1424 + }, + { + "epoch": 0.2230745147150908, + "grad_norm": 3.646155834197998, + "learning_rate": 9.251384815900946e-05, + "loss": 1.1217, + "step": 1425 + }, + { + "epoch": 0.2232310582341891, + "grad_norm": 1.5539034605026245, + "learning_rate": 9.250570218312154e-05, + "loss": 0.6517, + "step": 1426 + }, + { + "epoch": 0.22338760175328742, + "grad_norm": 2.0876173973083496, + "learning_rate": 9.249755620723362e-05, + "loss": 0.906, + "step": 1427 + }, + { + "epoch": 0.22354414527238572, + "grad_norm": 2.6132850646972656, + "learning_rate": 9.248941023134572e-05, + "loss": 0.8086, + "step": 1428 + }, + { + "epoch": 0.22370068879148403, + "grad_norm": 3.6342012882232666, + "learning_rate": 9.24812642554578e-05, + "loss": 0.8827, + "step": 1429 + }, + { + "epoch": 0.22385723231058235, + "grad_norm": 3.5953073501586914, + "learning_rate": 9.247311827956989e-05, + "loss": 1.1811, + "step": 1430 + }, + { + "epoch": 0.22401377582968066, + "grad_norm": 4.3246331214904785, + "learning_rate": 9.246497230368199e-05, + "loss": 1.2023, + "step": 1431 + }, + { + "epoch": 0.22417031934877896, + "grad_norm": 3.5542216300964355, + "learning_rate": 9.245682632779408e-05, + "loss": 1.4694, + "step": 1432 + }, + { + "epoch": 0.22432686286787726, + "grad_norm": 2.5139083862304688, + "learning_rate": 9.244868035190615e-05, + "loss": 1.2494, + "step": 1433 + }, + { + "epoch": 0.2244834063869756, + "grad_norm": 2.9384849071502686, + "learning_rate": 9.244053437601825e-05, + "loss": 1.5667, + "step": 1434 + }, + { + "epoch": 0.2246399499060739, + "grad_norm": 2.258528232574463, + "learning_rate": 9.243238840013035e-05, + "loss": 1.3355, + "step": 1435 + }, + { + "epoch": 0.2247964934251722, + "grad_norm": 2.8593075275421143, + "learning_rate": 9.242424242424242e-05, + "loss": 1.2488, + "step": 1436 + }, + { + "epoch": 0.22495303694427052, + "grad_norm": 
4.671872138977051, + "learning_rate": 9.241609644835452e-05, + "loss": 1.721, + "step": 1437 + }, + { + "epoch": 0.22510958046336882, + "grad_norm": 2.8909621238708496, + "learning_rate": 9.240795047246661e-05, + "loss": 1.1471, + "step": 1438 + }, + { + "epoch": 0.22526612398246712, + "grad_norm": 2.837310791015625, + "learning_rate": 9.23998044965787e-05, + "loss": 1.6426, + "step": 1439 + }, + { + "epoch": 0.22542266750156542, + "grad_norm": 1.9264709949493408, + "learning_rate": 9.239165852069078e-05, + "loss": 1.1154, + "step": 1440 + }, + { + "epoch": 0.22557921102066375, + "grad_norm": 1.863747000694275, + "learning_rate": 9.238351254480288e-05, + "loss": 0.9397, + "step": 1441 + }, + { + "epoch": 0.22573575453976205, + "grad_norm": 3.0765929222106934, + "learning_rate": 9.237536656891496e-05, + "loss": 1.4077, + "step": 1442 + }, + { + "epoch": 0.22589229805886035, + "grad_norm": 3.449446201324463, + "learning_rate": 9.236722059302705e-05, + "loss": 1.368, + "step": 1443 + }, + { + "epoch": 0.22604884157795868, + "grad_norm": 3.229566812515259, + "learning_rate": 9.235907461713914e-05, + "loss": 1.8187, + "step": 1444 + }, + { + "epoch": 0.22620538509705698, + "grad_norm": 3.724670171737671, + "learning_rate": 9.235092864125123e-05, + "loss": 1.4662, + "step": 1445 + }, + { + "epoch": 0.22636192861615528, + "grad_norm": 3.382648468017578, + "learning_rate": 9.234278266536331e-05, + "loss": 0.7785, + "step": 1446 + }, + { + "epoch": 0.2265184721352536, + "grad_norm": 2.431091547012329, + "learning_rate": 9.233463668947541e-05, + "loss": 0.9966, + "step": 1447 + }, + { + "epoch": 0.2266750156543519, + "grad_norm": 3.9033782482147217, + "learning_rate": 9.232649071358749e-05, + "loss": 1.2105, + "step": 1448 + }, + { + "epoch": 0.22683155917345021, + "grad_norm": 2.74283504486084, + "learning_rate": 9.231834473769957e-05, + "loss": 0.8585, + "step": 1449 + }, + { + "epoch": 0.22698810269254852, + "grad_norm": 2.7151055335998535, + "learning_rate": 
9.231019876181167e-05, + "loss": 1.3607, + "step": 1450 + }, + { + "epoch": 0.22714464621164684, + "grad_norm": 0.7671235799789429, + "learning_rate": 9.230205278592376e-05, + "loss": 0.4718, + "step": 1451 + }, + { + "epoch": 0.22730118973074515, + "grad_norm": 0.8017595410346985, + "learning_rate": 9.229390681003584e-05, + "loss": 0.4847, + "step": 1452 + }, + { + "epoch": 0.22745773324984345, + "grad_norm": 0.9094266891479492, + "learning_rate": 9.228576083414794e-05, + "loss": 0.5797, + "step": 1453 + }, + { + "epoch": 0.22761427676894178, + "grad_norm": 0.7241607904434204, + "learning_rate": 9.227761485826002e-05, + "loss": 0.5769, + "step": 1454 + }, + { + "epoch": 0.22777082028804008, + "grad_norm": 0.8387868404388428, + "learning_rate": 9.226946888237212e-05, + "loss": 0.4337, + "step": 1455 + }, + { + "epoch": 0.22792736380713838, + "grad_norm": 1.2089283466339111, + "learning_rate": 9.22613229064842e-05, + "loss": 0.6208, + "step": 1456 + }, + { + "epoch": 0.2280839073262367, + "grad_norm": 1.0257102251052856, + "learning_rate": 9.225317693059629e-05, + "loss": 0.7304, + "step": 1457 + }, + { + "epoch": 0.228240450845335, + "grad_norm": 3.415764570236206, + "learning_rate": 9.224503095470838e-05, + "loss": 0.5567, + "step": 1458 + }, + { + "epoch": 0.2283969943644333, + "grad_norm": 1.812423825263977, + "learning_rate": 9.223688497882047e-05, + "loss": 0.5035, + "step": 1459 + }, + { + "epoch": 0.2285535378835316, + "grad_norm": 1.1289501190185547, + "learning_rate": 9.222873900293255e-05, + "loss": 0.4879, + "step": 1460 + }, + { + "epoch": 0.22871008140262994, + "grad_norm": 3.8060145378112793, + "learning_rate": 9.222059302704465e-05, + "loss": 0.5466, + "step": 1461 + }, + { + "epoch": 0.22886662492172824, + "grad_norm": 1.3389984369277954, + "learning_rate": 9.221244705115673e-05, + "loss": 0.6154, + "step": 1462 + }, + { + "epoch": 0.22902316844082654, + "grad_norm": 1.453336477279663, + "learning_rate": 9.220430107526881e-05, + "loss": 0.8184, + 
"step": 1463 + }, + { + "epoch": 0.22917971195992487, + "grad_norm": 0.8985715508460999, + "learning_rate": 9.219615509938091e-05, + "loss": 0.5591, + "step": 1464 + }, + { + "epoch": 0.22933625547902317, + "grad_norm": 2.257568359375, + "learning_rate": 9.2188009123493e-05, + "loss": 0.8164, + "step": 1465 + }, + { + "epoch": 0.22949279899812147, + "grad_norm": 1.5148940086364746, + "learning_rate": 9.217986314760508e-05, + "loss": 0.6575, + "step": 1466 + }, + { + "epoch": 0.22964934251721977, + "grad_norm": 1.702968955039978, + "learning_rate": 9.217171717171718e-05, + "loss": 0.8484, + "step": 1467 + }, + { + "epoch": 0.2298058860363181, + "grad_norm": 1.673514485359192, + "learning_rate": 9.216357119582927e-05, + "loss": 0.7186, + "step": 1468 + }, + { + "epoch": 0.2299624295554164, + "grad_norm": 1.6834641695022583, + "learning_rate": 9.215542521994134e-05, + "loss": 0.7331, + "step": 1469 + }, + { + "epoch": 0.2301189730745147, + "grad_norm": 2.341947078704834, + "learning_rate": 9.214727924405344e-05, + "loss": 0.6827, + "step": 1470 + }, + { + "epoch": 0.23027551659361303, + "grad_norm": 1.4842687845230103, + "learning_rate": 9.213913326816554e-05, + "loss": 0.5435, + "step": 1471 + }, + { + "epoch": 0.23043206011271133, + "grad_norm": 1.1877425909042358, + "learning_rate": 9.213098729227761e-05, + "loss": 0.6922, + "step": 1472 + }, + { + "epoch": 0.23058860363180964, + "grad_norm": 1.8043116331100464, + "learning_rate": 9.21228413163897e-05, + "loss": 0.8033, + "step": 1473 + }, + { + "epoch": 0.23074514715090796, + "grad_norm": 1.838260293006897, + "learning_rate": 9.21146953405018e-05, + "loss": 0.5845, + "step": 1474 + }, + { + "epoch": 0.23090169067000627, + "grad_norm": 3.708636999130249, + "learning_rate": 9.210654936461389e-05, + "loss": 0.9537, + "step": 1475 + }, + { + "epoch": 0.23105823418910457, + "grad_norm": 2.4867138862609863, + "learning_rate": 9.209840338872597e-05, + "loss": 0.5874, + "step": 1476 + }, + { + "epoch": 
0.23121477770820287, + "grad_norm": 1.3953874111175537, + "learning_rate": 9.209025741283807e-05, + "loss": 0.488, + "step": 1477 + }, + { + "epoch": 0.2313713212273012, + "grad_norm": 2.4623658657073975, + "learning_rate": 9.208211143695015e-05, + "loss": 0.8541, + "step": 1478 + }, + { + "epoch": 0.2315278647463995, + "grad_norm": 4.799334526062012, + "learning_rate": 9.207396546106224e-05, + "loss": 1.1104, + "step": 1479 + }, + { + "epoch": 0.2316844082654978, + "grad_norm": 2.5269739627838135, + "learning_rate": 9.206581948517433e-05, + "loss": 0.9435, + "step": 1480 + }, + { + "epoch": 0.23184095178459613, + "grad_norm": 2.966486692428589, + "learning_rate": 9.205767350928642e-05, + "loss": 1.2271, + "step": 1481 + }, + { + "epoch": 0.23199749530369443, + "grad_norm": 3.668581485748291, + "learning_rate": 9.20495275333985e-05, + "loss": 1.0853, + "step": 1482 + }, + { + "epoch": 0.23215403882279273, + "grad_norm": 2.8218774795532227, + "learning_rate": 9.20413815575106e-05, + "loss": 1.2902, + "step": 1483 + }, + { + "epoch": 0.23231058234189106, + "grad_norm": 3.456632375717163, + "learning_rate": 9.203323558162268e-05, + "loss": 0.752, + "step": 1484 + }, + { + "epoch": 0.23246712586098936, + "grad_norm": 3.2039906978607178, + "learning_rate": 9.202508960573477e-05, + "loss": 1.0997, + "step": 1485 + }, + { + "epoch": 0.23262366938008766, + "grad_norm": 3.2955286502838135, + "learning_rate": 9.201694362984686e-05, + "loss": 1.5695, + "step": 1486 + }, + { + "epoch": 0.23278021289918596, + "grad_norm": 2.9204723834991455, + "learning_rate": 9.200879765395895e-05, + "loss": 0.9921, + "step": 1487 + }, + { + "epoch": 0.2329367564182843, + "grad_norm": 3.5132505893707275, + "learning_rate": 9.200065167807103e-05, + "loss": 0.9145, + "step": 1488 + }, + { + "epoch": 0.2330932999373826, + "grad_norm": 5.012818336486816, + "learning_rate": 9.199250570218313e-05, + "loss": 1.7124, + "step": 1489 + }, + { + "epoch": 0.2332498434564809, + "grad_norm": 
3.997135877609253, + "learning_rate": 9.198435972629521e-05, + "loss": 1.5294, + "step": 1490 + }, + { + "epoch": 0.23340638697557922, + "grad_norm": 3.155620813369751, + "learning_rate": 9.197621375040731e-05, + "loss": 1.7235, + "step": 1491 + }, + { + "epoch": 0.23356293049467752, + "grad_norm": 3.3822197914123535, + "learning_rate": 9.196806777451939e-05, + "loss": 0.8573, + "step": 1492 + }, + { + "epoch": 0.23371947401377582, + "grad_norm": 3.117121934890747, + "learning_rate": 9.195992179863148e-05, + "loss": 1.029, + "step": 1493 + }, + { + "epoch": 0.23387601753287413, + "grad_norm": 2.9554665088653564, + "learning_rate": 9.195177582274357e-05, + "loss": 1.562, + "step": 1494 + }, + { + "epoch": 0.23403256105197245, + "grad_norm": 3.076686143875122, + "learning_rate": 9.194362984685566e-05, + "loss": 1.8939, + "step": 1495 + }, + { + "epoch": 0.23418910457107076, + "grad_norm": 2.505218982696533, + "learning_rate": 9.193548387096774e-05, + "loss": 1.2188, + "step": 1496 + }, + { + "epoch": 0.23434564809016906, + "grad_norm": 3.7564492225646973, + "learning_rate": 9.192733789507984e-05, + "loss": 0.9549, + "step": 1497 + }, + { + "epoch": 0.23450219160926739, + "grad_norm": 3.2301712036132812, + "learning_rate": 9.191919191919192e-05, + "loss": 0.8657, + "step": 1498 + }, + { + "epoch": 0.2346587351283657, + "grad_norm": 1.9877567291259766, + "learning_rate": 9.1911045943304e-05, + "loss": 0.6474, + "step": 1499 + }, + { + "epoch": 0.234815278647464, + "grad_norm": 1.6756012439727783, + "learning_rate": 9.19028999674161e-05, + "loss": 1.0935, + "step": 1500 + }, + { + "epoch": 0.23497182216656232, + "grad_norm": 0.654725968837738, + "learning_rate": 9.189475399152819e-05, + "loss": 0.4698, + "step": 1501 + }, + { + "epoch": 0.23512836568566062, + "grad_norm": 1.1975436210632324, + "learning_rate": 9.188660801564027e-05, + "loss": 0.5779, + "step": 1502 + }, + { + "epoch": 0.23528490920475892, + "grad_norm": 0.770732581615448, + "learning_rate": 
9.187846203975237e-05, + "loss": 0.3715, + "step": 1503 + }, + { + "epoch": 0.23544145272385722, + "grad_norm": 1.1249791383743286, + "learning_rate": 9.187031606386447e-05, + "loss": 0.5003, + "step": 1504 + }, + { + "epoch": 0.23559799624295555, + "grad_norm": 0.8006545305252075, + "learning_rate": 9.186217008797654e-05, + "loss": 0.3832, + "step": 1505 + }, + { + "epoch": 0.23575453976205385, + "grad_norm": 0.8922159075737, + "learning_rate": 9.185402411208863e-05, + "loss": 0.4273, + "step": 1506 + }, + { + "epoch": 0.23591108328115215, + "grad_norm": 1.0311733484268188, + "learning_rate": 9.184587813620073e-05, + "loss": 0.5471, + "step": 1507 + }, + { + "epoch": 0.23606762680025048, + "grad_norm": 0.7608013153076172, + "learning_rate": 9.18377321603128e-05, + "loss": 0.523, + "step": 1508 + }, + { + "epoch": 0.23622417031934878, + "grad_norm": 14.660826683044434, + "learning_rate": 9.18295861844249e-05, + "loss": 1.5411, + "step": 1509 + }, + { + "epoch": 0.23638071383844708, + "grad_norm": 1.940919280052185, + "learning_rate": 9.1821440208537e-05, + "loss": 0.4767, + "step": 1510 + }, + { + "epoch": 0.2365372573575454, + "grad_norm": 1.5322067737579346, + "learning_rate": 9.181329423264907e-05, + "loss": 0.7043, + "step": 1511 + }, + { + "epoch": 0.2366938008766437, + "grad_norm": 1.7056244611740112, + "learning_rate": 9.180514825676116e-05, + "loss": 0.5431, + "step": 1512 + }, + { + "epoch": 0.236850344395742, + "grad_norm": 4.825175762176514, + "learning_rate": 9.179700228087326e-05, + "loss": 0.7863, + "step": 1513 + }, + { + "epoch": 0.23700688791484031, + "grad_norm": 1.311272144317627, + "learning_rate": 9.178885630498534e-05, + "loss": 0.6016, + "step": 1514 + }, + { + "epoch": 0.23716343143393864, + "grad_norm": 1.361728549003601, + "learning_rate": 9.178071032909743e-05, + "loss": 0.5456, + "step": 1515 + }, + { + "epoch": 0.23731997495303694, + "grad_norm": 1.169175148010254, + "learning_rate": 9.177256435320952e-05, + "loss": 0.5331, + "step": 
1516 + }, + { + "epoch": 0.23747651847213525, + "grad_norm": 2.060609817504883, + "learning_rate": 9.176441837732161e-05, + "loss": 0.7881, + "step": 1517 + }, + { + "epoch": 0.23763306199123357, + "grad_norm": 1.7569166421890259, + "learning_rate": 9.175627240143369e-05, + "loss": 0.6925, + "step": 1518 + }, + { + "epoch": 0.23778960551033188, + "grad_norm": 1.114730715751648, + "learning_rate": 9.174812642554579e-05, + "loss": 0.5776, + "step": 1519 + }, + { + "epoch": 0.23794614902943018, + "grad_norm": 1.9240381717681885, + "learning_rate": 9.173998044965787e-05, + "loss": 0.927, + "step": 1520 + }, + { + "epoch": 0.23810269254852848, + "grad_norm": 8.241540908813477, + "learning_rate": 9.173183447376996e-05, + "loss": 0.7489, + "step": 1521 + }, + { + "epoch": 0.2382592360676268, + "grad_norm": 2.7536470890045166, + "learning_rate": 9.172368849788205e-05, + "loss": 1.0492, + "step": 1522 + }, + { + "epoch": 0.2384157795867251, + "grad_norm": 1.803187608718872, + "learning_rate": 9.171554252199414e-05, + "loss": 0.7124, + "step": 1523 + }, + { + "epoch": 0.2385723231058234, + "grad_norm": 1.9374476671218872, + "learning_rate": 9.170739654610622e-05, + "loss": 0.8103, + "step": 1524 + }, + { + "epoch": 0.23872886662492174, + "grad_norm": 2.3620128631591797, + "learning_rate": 9.169925057021832e-05, + "loss": 1.0359, + "step": 1525 + }, + { + "epoch": 0.23888541014402004, + "grad_norm": 2.344478130340576, + "learning_rate": 9.16911045943304e-05, + "loss": 1.003, + "step": 1526 + }, + { + "epoch": 0.23904195366311834, + "grad_norm": 6.848521709442139, + "learning_rate": 9.16829586184425e-05, + "loss": 1.2193, + "step": 1527 + }, + { + "epoch": 0.23919849718221667, + "grad_norm": 2.4467098712921143, + "learning_rate": 9.167481264255458e-05, + "loss": 0.5637, + "step": 1528 + }, + { + "epoch": 0.23935504070131497, + "grad_norm": 2.7949576377868652, + "learning_rate": 9.166666666666667e-05, + "loss": 0.991, + "step": 1529 + }, + { + "epoch": 0.23951158422041327, + 
"grad_norm": 3.718968629837036, + "learning_rate": 9.165852069077876e-05, + "loss": 1.164, + "step": 1530 + }, + { + "epoch": 0.23966812773951157, + "grad_norm": 2.595773458480835, + "learning_rate": 9.165037471489085e-05, + "loss": 1.2782, + "step": 1531 + }, + { + "epoch": 0.2398246712586099, + "grad_norm": 3.2337300777435303, + "learning_rate": 9.164222873900293e-05, + "loss": 1.1076, + "step": 1532 + }, + { + "epoch": 0.2399812147777082, + "grad_norm": 3.5293526649475098, + "learning_rate": 9.163408276311503e-05, + "loss": 1.5232, + "step": 1533 + }, + { + "epoch": 0.2401377582968065, + "grad_norm": 3.425532579421997, + "learning_rate": 9.162593678722711e-05, + "loss": 0.8465, + "step": 1534 + }, + { + "epoch": 0.24029430181590483, + "grad_norm": 1.4524097442626953, + "learning_rate": 9.16177908113392e-05, + "loss": 0.9788, + "step": 1535 + }, + { + "epoch": 0.24045084533500313, + "grad_norm": 3.146641731262207, + "learning_rate": 9.16096448354513e-05, + "loss": 0.8796, + "step": 1536 + }, + { + "epoch": 0.24060738885410143, + "grad_norm": 4.070197105407715, + "learning_rate": 9.160149885956338e-05, + "loss": 1.3747, + "step": 1537 + }, + { + "epoch": 0.24076393237319976, + "grad_norm": 3.252741813659668, + "learning_rate": 9.159335288367546e-05, + "loss": 0.7483, + "step": 1538 + }, + { + "epoch": 0.24092047589229806, + "grad_norm": 7.216932773590088, + "learning_rate": 9.158520690778756e-05, + "loss": 1.4487, + "step": 1539 + }, + { + "epoch": 0.24107701941139636, + "grad_norm": 3.951306104660034, + "learning_rate": 9.157706093189964e-05, + "loss": 1.0486, + "step": 1540 + }, + { + "epoch": 0.24123356293049467, + "grad_norm": 6.267361640930176, + "learning_rate": 9.156891495601173e-05, + "loss": 1.6989, + "step": 1541 + }, + { + "epoch": 0.241390106449593, + "grad_norm": 4.549798011779785, + "learning_rate": 9.156076898012382e-05, + "loss": 1.5736, + "step": 1542 + }, + { + "epoch": 0.2415466499686913, + "grad_norm": 5.262148857116699, + "learning_rate": 
9.155262300423592e-05, + "loss": 1.2265, + "step": 1543 + }, + { + "epoch": 0.2417031934877896, + "grad_norm": 5.391066074371338, + "learning_rate": 9.154447702834799e-05, + "loss": 1.9016, + "step": 1544 + }, + { + "epoch": 0.24185973700688793, + "grad_norm": 7.153601169586182, + "learning_rate": 9.153633105246009e-05, + "loss": 1.9532, + "step": 1545 + }, + { + "epoch": 0.24201628052598623, + "grad_norm": 4.1610798835754395, + "learning_rate": 9.152818507657219e-05, + "loss": 1.2368, + "step": 1546 + }, + { + "epoch": 0.24217282404508453, + "grad_norm": 3.6862473487854004, + "learning_rate": 9.152003910068426e-05, + "loss": 1.1912, + "step": 1547 + }, + { + "epoch": 0.24232936756418283, + "grad_norm": 2.0746371746063232, + "learning_rate": 9.151189312479635e-05, + "loss": 0.769, + "step": 1548 + }, + { + "epoch": 0.24248591108328116, + "grad_norm": 4.274667263031006, + "learning_rate": 9.150374714890845e-05, + "loss": 1.7531, + "step": 1549 + }, + { + "epoch": 0.24264245460237946, + "grad_norm": 1.6213219165802002, + "learning_rate": 9.149560117302053e-05, + "loss": 0.7792, + "step": 1550 + }, + { + "epoch": 0.24279899812147776, + "grad_norm": 1.4033104181289673, + "learning_rate": 9.148745519713262e-05, + "loss": 0.4279, + "step": 1551 + }, + { + "epoch": 0.2429555416405761, + "grad_norm": 0.7244923710823059, + "learning_rate": 9.147930922124472e-05, + "loss": 0.3983, + "step": 1552 + }, + { + "epoch": 0.2431120851596744, + "grad_norm": 0.699213981628418, + "learning_rate": 9.14711632453568e-05, + "loss": 0.5011, + "step": 1553 + }, + { + "epoch": 0.2432686286787727, + "grad_norm": 1.314353585243225, + "learning_rate": 9.146301726946888e-05, + "loss": 0.4416, + "step": 1554 + }, + { + "epoch": 0.24342517219787102, + "grad_norm": 0.6825692653656006, + "learning_rate": 9.145487129358098e-05, + "loss": 0.4339, + "step": 1555 + }, + { + "epoch": 0.24358171571696932, + "grad_norm": 1.4417661428451538, + "learning_rate": 9.144672531769306e-05, + "loss": 0.6524, + 
"step": 1556 + }, + { + "epoch": 0.24373825923606762, + "grad_norm": 1.055100440979004, + "learning_rate": 9.143857934180515e-05, + "loss": 0.5206, + "step": 1557 + }, + { + "epoch": 0.24389480275516592, + "grad_norm": 0.8214629888534546, + "learning_rate": 9.143043336591725e-05, + "loss": 0.6201, + "step": 1558 + }, + { + "epoch": 0.24405134627426425, + "grad_norm": 1.063862681388855, + "learning_rate": 9.142228739002933e-05, + "loss": 0.3785, + "step": 1559 + }, + { + "epoch": 0.24420788979336255, + "grad_norm": 1.0760605335235596, + "learning_rate": 9.141414141414141e-05, + "loss": 0.493, + "step": 1560 + }, + { + "epoch": 0.24436443331246085, + "grad_norm": 1.4380874633789062, + "learning_rate": 9.140599543825351e-05, + "loss": 0.5508, + "step": 1561 + }, + { + "epoch": 0.24452097683155918, + "grad_norm": 1.3920549154281616, + "learning_rate": 9.13978494623656e-05, + "loss": 0.5899, + "step": 1562 + }, + { + "epoch": 0.24467752035065748, + "grad_norm": 1.1402065753936768, + "learning_rate": 9.138970348647769e-05, + "loss": 0.5601, + "step": 1563 + }, + { + "epoch": 0.24483406386975579, + "grad_norm": 1.9645951986312866, + "learning_rate": 9.138155751058978e-05, + "loss": 0.5544, + "step": 1564 + }, + { + "epoch": 0.24499060738885411, + "grad_norm": 1.5780631303787231, + "learning_rate": 9.137341153470186e-05, + "loss": 0.5295, + "step": 1565 + }, + { + "epoch": 0.24514715090795242, + "grad_norm": 1.0612848997116089, + "learning_rate": 9.136526555881396e-05, + "loss": 0.6036, + "step": 1566 + }, + { + "epoch": 0.24530369442705072, + "grad_norm": 1.6388953924179077, + "learning_rate": 9.135711958292604e-05, + "loss": 0.9994, + "step": 1567 + }, + { + "epoch": 0.24546023794614902, + "grad_norm": 1.1568728685379028, + "learning_rate": 9.134897360703812e-05, + "loss": 0.7464, + "step": 1568 + }, + { + "epoch": 0.24561678146524735, + "grad_norm": 2.8503894805908203, + "learning_rate": 9.134082763115022e-05, + "loss": 0.9695, + "step": 1569 + }, + { + "epoch": 
0.24577332498434565, + "grad_norm": 2.297910213470459, + "learning_rate": 9.13326816552623e-05, + "loss": 0.8798, + "step": 1570 + }, + { + "epoch": 0.24592986850344395, + "grad_norm": 2.4507484436035156, + "learning_rate": 9.132453567937439e-05, + "loss": 0.6167, + "step": 1571 + }, + { + "epoch": 0.24608641202254228, + "grad_norm": 4.300799369812012, + "learning_rate": 9.131638970348649e-05, + "loss": 1.0724, + "step": 1572 + }, + { + "epoch": 0.24624295554164058, + "grad_norm": 2.708364248275757, + "learning_rate": 9.130824372759857e-05, + "loss": 1.0105, + "step": 1573 + }, + { + "epoch": 0.24639949906073888, + "grad_norm": 1.6466705799102783, + "learning_rate": 9.130009775171065e-05, + "loss": 0.7737, + "step": 1574 + }, + { + "epoch": 0.24655604257983718, + "grad_norm": 4.2303690910339355, + "learning_rate": 9.129195177582275e-05, + "loss": 1.055, + "step": 1575 + }, + { + "epoch": 0.2467125860989355, + "grad_norm": 2.390787363052368, + "learning_rate": 9.128380579993483e-05, + "loss": 0.9541, + "step": 1576 + }, + { + "epoch": 0.2468691296180338, + "grad_norm": 3.4098098278045654, + "learning_rate": 9.127565982404692e-05, + "loss": 1.1671, + "step": 1577 + }, + { + "epoch": 0.2470256731371321, + "grad_norm": 2.0526301860809326, + "learning_rate": 9.126751384815902e-05, + "loss": 0.802, + "step": 1578 + }, + { + "epoch": 0.24718221665623044, + "grad_norm": 1.6776294708251953, + "learning_rate": 9.125936787227111e-05, + "loss": 1.0003, + "step": 1579 + }, + { + "epoch": 0.24733876017532874, + "grad_norm": 1.8371450901031494, + "learning_rate": 9.125122189638318e-05, + "loss": 0.913, + "step": 1580 + }, + { + "epoch": 0.24749530369442704, + "grad_norm": 1.7404975891113281, + "learning_rate": 9.124307592049528e-05, + "loss": 0.4819, + "step": 1581 + }, + { + "epoch": 0.24765184721352537, + "grad_norm": 2.12988543510437, + "learning_rate": 9.123492994460738e-05, + "loss": 1.4782, + "step": 1582 + }, + { + "epoch": 0.24780839073262367, + "grad_norm": 
2.755577325820923, + "learning_rate": 9.122678396871945e-05, + "loss": 1.2958, + "step": 1583 + }, + { + "epoch": 0.24796493425172197, + "grad_norm": 3.2767527103424072, + "learning_rate": 9.121863799283154e-05, + "loss": 1.3459, + "step": 1584 + }, + { + "epoch": 0.24812147777082028, + "grad_norm": 2.0096747875213623, + "learning_rate": 9.121049201694364e-05, + "loss": 1.1335, + "step": 1585 + }, + { + "epoch": 0.2482780212899186, + "grad_norm": 2.582836627960205, + "learning_rate": 9.120234604105573e-05, + "loss": 1.683, + "step": 1586 + }, + { + "epoch": 0.2484345648090169, + "grad_norm": 4.891449928283691, + "learning_rate": 9.119420006516781e-05, + "loss": 1.0616, + "step": 1587 + }, + { + "epoch": 0.2485911083281152, + "grad_norm": 4.819225311279297, + "learning_rate": 9.118605408927991e-05, + "loss": 2.0753, + "step": 1588 + }, + { + "epoch": 0.24874765184721354, + "grad_norm": 3.243363380432129, + "learning_rate": 9.117790811339199e-05, + "loss": 1.5655, + "step": 1589 + }, + { + "epoch": 0.24890419536631184, + "grad_norm": 4.306556701660156, + "learning_rate": 9.116976213750407e-05, + "loss": 1.1804, + "step": 1590 + }, + { + "epoch": 0.24906073888541014, + "grad_norm": 4.479059219360352, + "learning_rate": 9.116161616161617e-05, + "loss": 1.8629, + "step": 1591 + }, + { + "epoch": 0.24921728240450847, + "grad_norm": 8.211525917053223, + "learning_rate": 9.115347018572826e-05, + "loss": 2.32, + "step": 1592 + }, + { + "epoch": 0.24937382592360677, + "grad_norm": 4.586742877960205, + "learning_rate": 9.114532420984034e-05, + "loss": 1.8132, + "step": 1593 + }, + { + "epoch": 0.24953036944270507, + "grad_norm": 3.1156604290008545, + "learning_rate": 9.113717823395244e-05, + "loss": 1.8647, + "step": 1594 + }, + { + "epoch": 0.24968691296180337, + "grad_norm": 2.543215036392212, + "learning_rate": 9.112903225806452e-05, + "loss": 1.3775, + "step": 1595 + }, + { + "epoch": 0.2498434564809017, + "grad_norm": 2.3778741359710693, + "learning_rate": 
9.11208862821766e-05, + "loss": 1.6514, + "step": 1596 + }, + { + "epoch": 0.25, + "grad_norm": 2.2693607807159424, + "learning_rate": 9.11127403062887e-05, + "loss": 0.7986, + "step": 1597 + }, + { + "epoch": 0.25015654351909833, + "grad_norm": 2.3479838371276855, + "learning_rate": 9.110459433040079e-05, + "loss": 1.1838, + "step": 1598 + }, + { + "epoch": 0.2503130870381966, + "grad_norm": 2.312018394470215, + "learning_rate": 9.109644835451287e-05, + "loss": 0.5678, + "step": 1599 + }, + { + "epoch": 0.25046963055729493, + "grad_norm": 2.5846107006073, + "learning_rate": 9.108830237862497e-05, + "loss": 1.019, + "step": 1600 + }, + { + "epoch": 0.25062617407639326, + "grad_norm": 0.7615079283714294, + "learning_rate": 9.108015640273705e-05, + "loss": 0.4037, + "step": 1601 + }, + { + "epoch": 0.25078271759549153, + "grad_norm": 0.7009828090667725, + "learning_rate": 9.107201042684915e-05, + "loss": 0.3433, + "step": 1602 + }, + { + "epoch": 0.25093926111458986, + "grad_norm": 1.0094730854034424, + "learning_rate": 9.106386445096123e-05, + "loss": 0.4553, + "step": 1603 + }, + { + "epoch": 0.2510958046336882, + "grad_norm": 0.8302375078201294, + "learning_rate": 9.105571847507331e-05, + "loss": 0.461, + "step": 1604 + }, + { + "epoch": 0.25125234815278646, + "grad_norm": 1.6403794288635254, + "learning_rate": 9.104757249918541e-05, + "loss": 1.1161, + "step": 1605 + }, + { + "epoch": 0.2514088916718848, + "grad_norm": 0.8357635140419006, + "learning_rate": 9.10394265232975e-05, + "loss": 0.5878, + "step": 1606 + }, + { + "epoch": 0.25156543519098307, + "grad_norm": 4.080342769622803, + "learning_rate": 9.103128054740958e-05, + "loss": 0.6583, + "step": 1607 + }, + { + "epoch": 0.2517219787100814, + "grad_norm": 0.5744903087615967, + "learning_rate": 9.102313457152168e-05, + "loss": 0.3548, + "step": 1608 + }, + { + "epoch": 0.2518785222291797, + "grad_norm": 1.2131882905960083, + "learning_rate": 9.101498859563376e-05, + "loss": 0.5934, + "step": 1609 + }, + { + 
"epoch": 0.252035065748278, + "grad_norm": 1.003551959991455, + "learning_rate": 9.100684261974584e-05, + "loss": 0.4789, + "step": 1610 + }, + { + "epoch": 0.2521916092673763, + "grad_norm": 0.9214762449264526, + "learning_rate": 9.099869664385794e-05, + "loss": 0.4537, + "step": 1611 + }, + { + "epoch": 0.25234815278647466, + "grad_norm": 1.3175299167633057, + "learning_rate": 9.099055066797003e-05, + "loss": 0.6017, + "step": 1612 + }, + { + "epoch": 0.25250469630557293, + "grad_norm": 1.165985107421875, + "learning_rate": 9.098240469208211e-05, + "loss": 0.4966, + "step": 1613 + }, + { + "epoch": 0.25266123982467126, + "grad_norm": 1.5195963382720947, + "learning_rate": 9.09742587161942e-05, + "loss": 0.536, + "step": 1614 + }, + { + "epoch": 0.2528177833437696, + "grad_norm": 1.5125010013580322, + "learning_rate": 9.09661127403063e-05, + "loss": 0.5981, + "step": 1615 + }, + { + "epoch": 0.25297432686286786, + "grad_norm": 1.2026898860931396, + "learning_rate": 9.095796676441837e-05, + "loss": 0.4326, + "step": 1616 + }, + { + "epoch": 0.2531308703819662, + "grad_norm": 2.036567449569702, + "learning_rate": 9.094982078853047e-05, + "loss": 0.8181, + "step": 1617 + }, + { + "epoch": 0.2532874139010645, + "grad_norm": 1.3778742551803589, + "learning_rate": 9.094167481264257e-05, + "loss": 0.5379, + "step": 1618 + }, + { + "epoch": 0.2534439574201628, + "grad_norm": 1.4616771936416626, + "learning_rate": 9.093352883675464e-05, + "loss": 0.3897, + "step": 1619 + }, + { + "epoch": 0.2536005009392611, + "grad_norm": 1.2987499237060547, + "learning_rate": 9.092538286086674e-05, + "loss": 0.8474, + "step": 1620 + }, + { + "epoch": 0.25375704445835945, + "grad_norm": 2.347700834274292, + "learning_rate": 9.091723688497883e-05, + "loss": 0.6714, + "step": 1621 + }, + { + "epoch": 0.2539135879774577, + "grad_norm": 2.857924461364746, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1436, + "step": 1622 + }, + { + "epoch": 0.25407013149655605, + "grad_norm": 
2.7028846740722656, + "learning_rate": 9.0900944933203e-05, + "loss": 0.8462, + "step": 1623 + }, + { + "epoch": 0.2542266750156543, + "grad_norm": 2.4914047718048096, + "learning_rate": 9.08927989573151e-05, + "loss": 0.94, + "step": 1624 + }, + { + "epoch": 0.25438321853475265, + "grad_norm": 3.032986879348755, + "learning_rate": 9.088465298142718e-05, + "loss": 0.8406, + "step": 1625 + }, + { + "epoch": 0.254539762053851, + "grad_norm": 2.157381772994995, + "learning_rate": 9.087650700553927e-05, + "loss": 0.8622, + "step": 1626 + }, + { + "epoch": 0.25469630557294926, + "grad_norm": 3.1868395805358887, + "learning_rate": 9.086836102965136e-05, + "loss": 0.9, + "step": 1627 + }, + { + "epoch": 0.2548528490920476, + "grad_norm": 2.6547060012817383, + "learning_rate": 9.086021505376345e-05, + "loss": 0.9523, + "step": 1628 + }, + { + "epoch": 0.2550093926111459, + "grad_norm": 3.358323812484741, + "learning_rate": 9.085206907787553e-05, + "loss": 1.0113, + "step": 1629 + }, + { + "epoch": 0.2551659361302442, + "grad_norm": 2.82916259765625, + "learning_rate": 9.084392310198763e-05, + "loss": 0.9327, + "step": 1630 + }, + { + "epoch": 0.2553224796493425, + "grad_norm": 2.1918342113494873, + "learning_rate": 9.083577712609971e-05, + "loss": 0.8334, + "step": 1631 + }, + { + "epoch": 0.25547902316844084, + "grad_norm": 5.886850357055664, + "learning_rate": 9.08276311502118e-05, + "loss": 1.1663, + "step": 1632 + }, + { + "epoch": 0.2556355666875391, + "grad_norm": 3.6913528442382812, + "learning_rate": 9.081948517432389e-05, + "loss": 0.8418, + "step": 1633 + }, + { + "epoch": 0.25579211020663745, + "grad_norm": 2.8966310024261475, + "learning_rate": 9.081133919843598e-05, + "loss": 1.3818, + "step": 1634 + }, + { + "epoch": 0.2559486537257358, + "grad_norm": 3.769637107849121, + "learning_rate": 9.080319322254806e-05, + "loss": 1.1715, + "step": 1635 + }, + { + "epoch": 0.25610519724483405, + "grad_norm": 3.657241106033325, + "learning_rate": 9.079504724666016e-05, 
+ "loss": 1.2392, + "step": 1636 + }, + { + "epoch": 0.2562617407639324, + "grad_norm": 4.470808982849121, + "learning_rate": 9.078690127077224e-05, + "loss": 1.6409, + "step": 1637 + }, + { + "epoch": 0.2564182842830307, + "grad_norm": 3.5172672271728516, + "learning_rate": 9.077875529488434e-05, + "loss": 1.2644, + "step": 1638 + }, + { + "epoch": 0.256574827802129, + "grad_norm": 6.295013904571533, + "learning_rate": 9.077060931899642e-05, + "loss": 1.5176, + "step": 1639 + }, + { + "epoch": 0.2567313713212273, + "grad_norm": 3.5026683807373047, + "learning_rate": 9.07624633431085e-05, + "loss": 1.1836, + "step": 1640 + }, + { + "epoch": 0.25688791484032564, + "grad_norm": 5.222485542297363, + "learning_rate": 9.07543173672206e-05, + "loss": 1.3121, + "step": 1641 + }, + { + "epoch": 0.2570444583594239, + "grad_norm": 3.3413779735565186, + "learning_rate": 9.074617139133269e-05, + "loss": 1.2276, + "step": 1642 + }, + { + "epoch": 0.25720100187852224, + "grad_norm": 2.993256092071533, + "learning_rate": 9.073802541544477e-05, + "loss": 0.9463, + "step": 1643 + }, + { + "epoch": 0.2573575453976205, + "grad_norm": 3.361138343811035, + "learning_rate": 9.072987943955687e-05, + "loss": 0.6409, + "step": 1644 + }, + { + "epoch": 0.25751408891671884, + "grad_norm": 3.553858757019043, + "learning_rate": 9.072173346366895e-05, + "loss": 1.8548, + "step": 1645 + }, + { + "epoch": 0.25767063243581717, + "grad_norm": 3.8532843589782715, + "learning_rate": 9.071358748778104e-05, + "loss": 1.2143, + "step": 1646 + }, + { + "epoch": 0.25782717595491544, + "grad_norm": 3.0282227993011475, + "learning_rate": 9.070544151189313e-05, + "loss": 0.7123, + "step": 1647 + }, + { + "epoch": 0.2579837194740138, + "grad_norm": 2.499903678894043, + "learning_rate": 9.069729553600522e-05, + "loss": 0.8567, + "step": 1648 + }, + { + "epoch": 0.2581402629931121, + "grad_norm": 1.5922579765319824, + "learning_rate": 9.06891495601173e-05, + "loss": 0.6039, + "step": 1649 + }, + { + "epoch": 
0.2582968065122104, + "grad_norm": 2.7286875247955322, + "learning_rate": 9.06810035842294e-05, + "loss": 1.3921, + "step": 1650 + }, + { + "epoch": 0.2584533500313087, + "grad_norm": 0.6657518148422241, + "learning_rate": 9.06728576083415e-05, + "loss": 0.4539, + "step": 1651 + }, + { + "epoch": 0.25860989355040703, + "grad_norm": 0.787168562412262, + "learning_rate": 9.066471163245357e-05, + "loss": 0.4257, + "step": 1652 + }, + { + "epoch": 0.2587664370695053, + "grad_norm": 1.6137168407440186, + "learning_rate": 9.065656565656566e-05, + "loss": 0.6278, + "step": 1653 + }, + { + "epoch": 0.25892298058860364, + "grad_norm": 0.594913125038147, + "learning_rate": 9.064841968067776e-05, + "loss": 0.3825, + "step": 1654 + }, + { + "epoch": 0.25907952410770196, + "grad_norm": 0.908819854259491, + "learning_rate": 9.064027370478983e-05, + "loss": 0.4578, + "step": 1655 + }, + { + "epoch": 0.25923606762680024, + "grad_norm": 0.6562874913215637, + "learning_rate": 9.063212772890193e-05, + "loss": 0.3857, + "step": 1656 + }, + { + "epoch": 0.25939261114589857, + "grad_norm": 0.9133744835853577, + "learning_rate": 9.062398175301402e-05, + "loss": 0.4537, + "step": 1657 + }, + { + "epoch": 0.2595491546649969, + "grad_norm": 1.0636394023895264, + "learning_rate": 9.06158357771261e-05, + "loss": 0.6773, + "step": 1658 + }, + { + "epoch": 0.25970569818409517, + "grad_norm": 0.9667505025863647, + "learning_rate": 9.060768980123819e-05, + "loss": 0.61, + "step": 1659 + }, + { + "epoch": 0.2598622417031935, + "grad_norm": 0.8574533462524414, + "learning_rate": 9.059954382535029e-05, + "loss": 0.3644, + "step": 1660 + }, + { + "epoch": 0.26001878522229177, + "grad_norm": 1.6245876550674438, + "learning_rate": 9.059139784946237e-05, + "loss": 0.5598, + "step": 1661 + }, + { + "epoch": 0.2601753287413901, + "grad_norm": 1.2327978610992432, + "learning_rate": 9.058325187357446e-05, + "loss": 0.4995, + "step": 1662 + }, + { + "epoch": 0.26033187226048843, + "grad_norm": 
1.1649653911590576, + "learning_rate": 9.057510589768655e-05, + "loss": 0.4021, + "step": 1663 + }, + { + "epoch": 0.2604884157795867, + "grad_norm": 1.353814959526062, + "learning_rate": 9.056695992179864e-05, + "loss": 0.5515, + "step": 1664 + }, + { + "epoch": 0.26064495929868503, + "grad_norm": 2.012350082397461, + "learning_rate": 9.055881394591072e-05, + "loss": 0.7084, + "step": 1665 + }, + { + "epoch": 0.26080150281778336, + "grad_norm": 1.4614052772521973, + "learning_rate": 9.055066797002282e-05, + "loss": 0.622, + "step": 1666 + }, + { + "epoch": 0.26095804633688163, + "grad_norm": 2.7074451446533203, + "learning_rate": 9.05425219941349e-05, + "loss": 0.8758, + "step": 1667 + }, + { + "epoch": 0.26111458985597996, + "grad_norm": 1.5104464292526245, + "learning_rate": 9.053437601824699e-05, + "loss": 0.5702, + "step": 1668 + }, + { + "epoch": 0.2612711333750783, + "grad_norm": 2.0121097564697266, + "learning_rate": 9.052623004235908e-05, + "loss": 0.5332, + "step": 1669 + }, + { + "epoch": 0.26142767689417656, + "grad_norm": 1.6523654460906982, + "learning_rate": 9.051808406647117e-05, + "loss": 0.8796, + "step": 1670 + }, + { + "epoch": 0.2615842204132749, + "grad_norm": 2.0530481338500977, + "learning_rate": 9.050993809058325e-05, + "loss": 0.6581, + "step": 1671 + }, + { + "epoch": 0.2617407639323732, + "grad_norm": 2.4215760231018066, + "learning_rate": 9.050179211469535e-05, + "loss": 0.6282, + "step": 1672 + }, + { + "epoch": 0.2618973074514715, + "grad_norm": 2.286248207092285, + "learning_rate": 9.049364613880743e-05, + "loss": 0.8398, + "step": 1673 + }, + { + "epoch": 0.2620538509705698, + "grad_norm": 3.3042654991149902, + "learning_rate": 9.048550016291953e-05, + "loss": 0.9843, + "step": 1674 + }, + { + "epoch": 0.26221039448966815, + "grad_norm": 2.328876256942749, + "learning_rate": 9.047735418703161e-05, + "loss": 0.994, + "step": 1675 + }, + { + "epoch": 0.2623669380087664, + "grad_norm": 1.803959846496582, + "learning_rate": 
9.04692082111437e-05, + "loss": 0.965, + "step": 1676 + }, + { + "epoch": 0.26252348152786475, + "grad_norm": 2.496783494949341, + "learning_rate": 9.04610622352558e-05, + "loss": 0.886, + "step": 1677 + }, + { + "epoch": 0.26268002504696303, + "grad_norm": 2.7487871646881104, + "learning_rate": 9.045291625936788e-05, + "loss": 0.8454, + "step": 1678 + }, + { + "epoch": 0.26283656856606136, + "grad_norm": 2.602621078491211, + "learning_rate": 9.044477028347996e-05, + "loss": 0.9837, + "step": 1679 + }, + { + "epoch": 0.2629931120851597, + "grad_norm": 2.456606149673462, + "learning_rate": 9.043662430759206e-05, + "loss": 1.0488, + "step": 1680 + }, + { + "epoch": 0.26314965560425796, + "grad_norm": 2.144101142883301, + "learning_rate": 9.042847833170414e-05, + "loss": 1.1661, + "step": 1681 + }, + { + "epoch": 0.2633061991233563, + "grad_norm": 5.027491569519043, + "learning_rate": 9.042033235581623e-05, + "loss": 1.4259, + "step": 1682 + }, + { + "epoch": 0.2634627426424546, + "grad_norm": 4.946202754974365, + "learning_rate": 9.041218637992832e-05, + "loss": 1.1726, + "step": 1683 + }, + { + "epoch": 0.2636192861615529, + "grad_norm": 2.7110588550567627, + "learning_rate": 9.040404040404041e-05, + "loss": 0.9856, + "step": 1684 + }, + { + "epoch": 0.2637758296806512, + "grad_norm": 4.476512908935547, + "learning_rate": 9.039589442815249e-05, + "loss": 1.3978, + "step": 1685 + }, + { + "epoch": 0.26393237319974955, + "grad_norm": 2.999312162399292, + "learning_rate": 9.038774845226459e-05, + "loss": 0.9537, + "step": 1686 + }, + { + "epoch": 0.2640889167188478, + "grad_norm": 2.533498525619507, + "learning_rate": 9.037960247637667e-05, + "loss": 0.97, + "step": 1687 + }, + { + "epoch": 0.26424546023794615, + "grad_norm": 2.3402750492095947, + "learning_rate": 9.037145650048876e-05, + "loss": 1.224, + "step": 1688 + }, + { + "epoch": 0.2644020037570445, + "grad_norm": 2.765549898147583, + "learning_rate": 9.036331052460085e-05, + "loss": 1.0625, + "step": 1689 + }, 
+ { + "epoch": 0.26455854727614275, + "grad_norm": 4.330612659454346, + "learning_rate": 9.035516454871295e-05, + "loss": 1.2913, + "step": 1690 + }, + { + "epoch": 0.2647150907952411, + "grad_norm": 2.518183469772339, + "learning_rate": 9.034701857282502e-05, + "loss": 1.2905, + "step": 1691 + }, + { + "epoch": 0.2648716343143394, + "grad_norm": 4.496226787567139, + "learning_rate": 9.033887259693712e-05, + "loss": 1.9425, + "step": 1692 + }, + { + "epoch": 0.2650281778334377, + "grad_norm": 3.2142744064331055, + "learning_rate": 9.033072662104922e-05, + "loss": 1.6144, + "step": 1693 + }, + { + "epoch": 0.265184721352536, + "grad_norm": 2.629340410232544, + "learning_rate": 9.032258064516129e-05, + "loss": 1.3193, + "step": 1694 + }, + { + "epoch": 0.26534126487163434, + "grad_norm": 4.831298828125, + "learning_rate": 9.031443466927338e-05, + "loss": 1.4297, + "step": 1695 + }, + { + "epoch": 0.2654978083907326, + "grad_norm": 3.33380389213562, + "learning_rate": 9.030628869338548e-05, + "loss": 1.1568, + "step": 1696 + }, + { + "epoch": 0.26565435190983094, + "grad_norm": 5.370350360870361, + "learning_rate": 9.029814271749756e-05, + "loss": 1.2004, + "step": 1697 + }, + { + "epoch": 0.2658108954289292, + "grad_norm": 3.3081161975860596, + "learning_rate": 9.028999674160965e-05, + "loss": 1.1573, + "step": 1698 + }, + { + "epoch": 0.26596743894802755, + "grad_norm": 3.527355909347534, + "learning_rate": 9.028185076572175e-05, + "loss": 1.2441, + "step": 1699 + }, + { + "epoch": 0.2661239824671259, + "grad_norm": 3.603729248046875, + "learning_rate": 9.027370478983383e-05, + "loss": 1.7841, + "step": 1700 + }, + { + "epoch": 0.26628052598622415, + "grad_norm": 0.6103606224060059, + "learning_rate": 9.026555881394591e-05, + "loss": 0.4058, + "step": 1701 + }, + { + "epoch": 0.2664370695053225, + "grad_norm": 0.7400172352790833, + "learning_rate": 9.025741283805801e-05, + "loss": 0.5411, + "step": 1702 + }, + { + "epoch": 0.2665936130244208, + "grad_norm": 
1.6036503314971924, + "learning_rate": 9.02492668621701e-05, + "loss": 0.5738, + "step": 1703 + }, + { + "epoch": 0.2667501565435191, + "grad_norm": 1.5236568450927734, + "learning_rate": 9.024112088628218e-05, + "loss": 0.4247, + "step": 1704 + }, + { + "epoch": 0.2669067000626174, + "grad_norm": 1.0431320667266846, + "learning_rate": 9.023297491039427e-05, + "loss": 0.5388, + "step": 1705 + }, + { + "epoch": 0.26706324358171574, + "grad_norm": 0.8525140881538391, + "learning_rate": 9.022482893450636e-05, + "loss": 0.9074, + "step": 1706 + }, + { + "epoch": 0.267219787100814, + "grad_norm": 3.179527521133423, + "learning_rate": 9.021668295861844e-05, + "loss": 0.7242, + "step": 1707 + }, + { + "epoch": 0.26737633061991234, + "grad_norm": 0.9574748277664185, + "learning_rate": 9.020853698273054e-05, + "loss": 0.3998, + "step": 1708 + }, + { + "epoch": 0.26753287413901067, + "grad_norm": 1.1568480730056763, + "learning_rate": 9.020039100684262e-05, + "loss": 0.4476, + "step": 1709 + }, + { + "epoch": 0.26768941765810894, + "grad_norm": 0.6595796942710876, + "learning_rate": 9.019224503095472e-05, + "loss": 0.3742, + "step": 1710 + }, + { + "epoch": 0.26784596117720727, + "grad_norm": 1.7888679504394531, + "learning_rate": 9.01840990550668e-05, + "loss": 0.7501, + "step": 1711 + }, + { + "epoch": 0.2680025046963056, + "grad_norm": 1.164945125579834, + "learning_rate": 9.017595307917889e-05, + "loss": 0.5068, + "step": 1712 + }, + { + "epoch": 0.2681590482154039, + "grad_norm": 3.1099612712860107, + "learning_rate": 9.016780710329099e-05, + "loss": 0.8778, + "step": 1713 + }, + { + "epoch": 0.2683155917345022, + "grad_norm": 1.224118947982788, + "learning_rate": 9.015966112740307e-05, + "loss": 0.627, + "step": 1714 + }, + { + "epoch": 0.2684721352536005, + "grad_norm": 1.3357973098754883, + "learning_rate": 9.015151515151515e-05, + "loss": 0.6413, + "step": 1715 + }, + { + "epoch": 0.2686286787726988, + "grad_norm": 1.5514767169952393, + "learning_rate": 
9.014336917562725e-05, + "loss": 0.7528, + "step": 1716 + }, + { + "epoch": 0.26878522229179713, + "grad_norm": 2.183199882507324, + "learning_rate": 9.013522319973933e-05, + "loss": 0.8443, + "step": 1717 + }, + { + "epoch": 0.2689417658108954, + "grad_norm": 1.321953535079956, + "learning_rate": 9.012707722385142e-05, + "loss": 0.4563, + "step": 1718 + }, + { + "epoch": 0.26909830932999373, + "grad_norm": 1.4958527088165283, + "learning_rate": 9.011893124796352e-05, + "loss": 0.5381, + "step": 1719 + }, + { + "epoch": 0.26925485284909206, + "grad_norm": 1.0742700099945068, + "learning_rate": 9.01107852720756e-05, + "loss": 0.4514, + "step": 1720 + }, + { + "epoch": 0.26941139636819034, + "grad_norm": 3.2764134407043457, + "learning_rate": 9.010263929618768e-05, + "loss": 0.6676, + "step": 1721 + }, + { + "epoch": 0.26956793988728867, + "grad_norm": 2.5475590229034424, + "learning_rate": 9.009449332029978e-05, + "loss": 0.6416, + "step": 1722 + }, + { + "epoch": 0.269724483406387, + "grad_norm": 3.839150905609131, + "learning_rate": 9.008634734441186e-05, + "loss": 0.6927, + "step": 1723 + }, + { + "epoch": 0.26988102692548527, + "grad_norm": 2.346546173095703, + "learning_rate": 9.007820136852395e-05, + "loss": 0.7533, + "step": 1724 + }, + { + "epoch": 0.2700375704445836, + "grad_norm": 1.9705690145492554, + "learning_rate": 9.007005539263604e-05, + "loss": 0.8238, + "step": 1725 + }, + { + "epoch": 0.2701941139636819, + "grad_norm": 3.73823881149292, + "learning_rate": 9.006190941674814e-05, + "loss": 1.014, + "step": 1726 + }, + { + "epoch": 0.2703506574827802, + "grad_norm": 1.8648747205734253, + "learning_rate": 9.005376344086021e-05, + "loss": 0.7913, + "step": 1727 + }, + { + "epoch": 0.27050720100187853, + "grad_norm": 3.4976582527160645, + "learning_rate": 9.004561746497231e-05, + "loss": 0.93, + "step": 1728 + }, + { + "epoch": 0.27066374452097686, + "grad_norm": 2.8193020820617676, + "learning_rate": 9.003747148908441e-05, + "loss": 0.9783, + "step": 
1729 + }, + { + "epoch": 0.27082028804007513, + "grad_norm": 4.979451656341553, + "learning_rate": 9.002932551319648e-05, + "loss": 0.9171, + "step": 1730 + }, + { + "epoch": 0.27097683155917346, + "grad_norm": 2.7158496379852295, + "learning_rate": 9.002117953730857e-05, + "loss": 1.1379, + "step": 1731 + }, + { + "epoch": 0.27113337507827173, + "grad_norm": 2.321378707885742, + "learning_rate": 9.001303356142067e-05, + "loss": 0.9931, + "step": 1732 + }, + { + "epoch": 0.27128991859737006, + "grad_norm": 3.621854782104492, + "learning_rate": 9.000488758553276e-05, + "loss": 0.9273, + "step": 1733 + }, + { + "epoch": 0.2714464621164684, + "grad_norm": 3.179936647415161, + "learning_rate": 8.999674160964484e-05, + "loss": 1.2843, + "step": 1734 + }, + { + "epoch": 0.27160300563556666, + "grad_norm": 2.4412384033203125, + "learning_rate": 8.998859563375694e-05, + "loss": 1.0553, + "step": 1735 + }, + { + "epoch": 0.271759549154665, + "grad_norm": 2.9159348011016846, + "learning_rate": 8.998044965786902e-05, + "loss": 1.0547, + "step": 1736 + }, + { + "epoch": 0.2719160926737633, + "grad_norm": 7.587355613708496, + "learning_rate": 8.99723036819811e-05, + "loss": 1.8562, + "step": 1737 + }, + { + "epoch": 0.2720726361928616, + "grad_norm": 3.8985774517059326, + "learning_rate": 8.99641577060932e-05, + "loss": 1.103, + "step": 1738 + }, + { + "epoch": 0.2722291797119599, + "grad_norm": 2.7449703216552734, + "learning_rate": 8.995601173020529e-05, + "loss": 1.5154, + "step": 1739 + }, + { + "epoch": 0.27238572323105825, + "grad_norm": 4.474976539611816, + "learning_rate": 8.994786575431737e-05, + "loss": 2.1392, + "step": 1740 + }, + { + "epoch": 0.2725422667501565, + "grad_norm": 5.5950469970703125, + "learning_rate": 8.993971977842947e-05, + "loss": 1.3676, + "step": 1741 + }, + { + "epoch": 0.27269881026925485, + "grad_norm": 3.719576120376587, + "learning_rate": 8.993157380254155e-05, + "loss": 1.2099, + "step": 1742 + }, + { + "epoch": 0.2728553537883532, + 
"grad_norm": 2.438223123550415, + "learning_rate": 8.992342782665363e-05, + "loss": 1.2515, + "step": 1743 + }, + { + "epoch": 0.27301189730745146, + "grad_norm": 3.9577677249908447, + "learning_rate": 8.991528185076573e-05, + "loss": 1.7369, + "step": 1744 + }, + { + "epoch": 0.2731684408265498, + "grad_norm": 2.649766683578491, + "learning_rate": 8.990713587487781e-05, + "loss": 1.6032, + "step": 1745 + }, + { + "epoch": 0.2733249843456481, + "grad_norm": 2.7576379776000977, + "learning_rate": 8.98989898989899e-05, + "loss": 1.2934, + "step": 1746 + }, + { + "epoch": 0.2734815278647464, + "grad_norm": 2.2006499767303467, + "learning_rate": 8.9890843923102e-05, + "loss": 1.0568, + "step": 1747 + }, + { + "epoch": 0.2736380713838447, + "grad_norm": 1.9137835502624512, + "learning_rate": 8.988269794721408e-05, + "loss": 0.7872, + "step": 1748 + }, + { + "epoch": 0.27379461490294305, + "grad_norm": 2.90268874168396, + "learning_rate": 8.987455197132618e-05, + "loss": 1.4402, + "step": 1749 + }, + { + "epoch": 0.2739511584220413, + "grad_norm": 2.7601609230041504, + "learning_rate": 8.986640599543826e-05, + "loss": 1.3172, + "step": 1750 + }, + { + "epoch": 0.27410770194113965, + "grad_norm": 0.8512994647026062, + "learning_rate": 8.985826001955034e-05, + "loss": 0.4097, + "step": 1751 + }, + { + "epoch": 0.2742642454602379, + "grad_norm": 0.7609471082687378, + "learning_rate": 8.985011404366244e-05, + "loss": 0.3386, + "step": 1752 + }, + { + "epoch": 0.27442078897933625, + "grad_norm": 0.7038943767547607, + "learning_rate": 8.984196806777453e-05, + "loss": 0.4182, + "step": 1753 + }, + { + "epoch": 0.2745773324984346, + "grad_norm": 0.8505272269248962, + "learning_rate": 8.983382209188661e-05, + "loss": 0.3802, + "step": 1754 + }, + { + "epoch": 0.27473387601753285, + "grad_norm": 0.7352429628372192, + "learning_rate": 8.98256761159987e-05, + "loss": 0.3112, + "step": 1755 + }, + { + "epoch": 0.2748904195366312, + "grad_norm": 1.3457746505737305, + "learning_rate": 
8.981753014011079e-05, + "loss": 0.5168, + "step": 1756 + }, + { + "epoch": 0.2750469630557295, + "grad_norm": 0.9564936757087708, + "learning_rate": 8.980938416422287e-05, + "loss": 0.4884, + "step": 1757 + }, + { + "epoch": 0.2752035065748278, + "grad_norm": 1.1036410331726074, + "learning_rate": 8.980123818833497e-05, + "loss": 0.5044, + "step": 1758 + }, + { + "epoch": 0.2753600500939261, + "grad_norm": 0.8353188037872314, + "learning_rate": 8.979309221244705e-05, + "loss": 0.4715, + "step": 1759 + }, + { + "epoch": 0.27551659361302444, + "grad_norm": 1.0620090961456299, + "learning_rate": 8.978494623655914e-05, + "loss": 0.4231, + "step": 1760 + }, + { + "epoch": 0.2756731371321227, + "grad_norm": 1.3064796924591064, + "learning_rate": 8.977680026067124e-05, + "loss": 0.437, + "step": 1761 + }, + { + "epoch": 0.27582968065122104, + "grad_norm": 0.9244285225868225, + "learning_rate": 8.976865428478333e-05, + "loss": 0.4039, + "step": 1762 + }, + { + "epoch": 0.27598622417031937, + "grad_norm": 1.173123836517334, + "learning_rate": 8.97605083088954e-05, + "loss": 0.4774, + "step": 1763 + }, + { + "epoch": 0.27614276768941765, + "grad_norm": 2.190730333328247, + "learning_rate": 8.97523623330075e-05, + "loss": 0.637, + "step": 1764 + }, + { + "epoch": 0.276299311208516, + "grad_norm": 1.4236063957214355, + "learning_rate": 8.97442163571196e-05, + "loss": 0.5274, + "step": 1765 + }, + { + "epoch": 0.2764558547276143, + "grad_norm": 2.0668857097625732, + "learning_rate": 8.973607038123167e-05, + "loss": 0.8776, + "step": 1766 + }, + { + "epoch": 0.2766123982467126, + "grad_norm": 1.3029193878173828, + "learning_rate": 8.972792440534377e-05, + "loss": 0.5981, + "step": 1767 + }, + { + "epoch": 0.2767689417658109, + "grad_norm": 1.766518473625183, + "learning_rate": 8.971977842945586e-05, + "loss": 0.7476, + "step": 1768 + }, + { + "epoch": 0.2769254852849092, + "grad_norm": 1.7212575674057007, + "learning_rate": 8.971163245356793e-05, + "loss": 0.5748, + "step": 
1769 + }, + { + "epoch": 0.2770820288040075, + "grad_norm": 1.23622727394104, + "learning_rate": 8.970348647768003e-05, + "loss": 0.5386, + "step": 1770 + }, + { + "epoch": 0.27723857232310584, + "grad_norm": 1.6381832361221313, + "learning_rate": 8.969534050179213e-05, + "loss": 0.6918, + "step": 1771 + }, + { + "epoch": 0.2773951158422041, + "grad_norm": 7.553649425506592, + "learning_rate": 8.968719452590421e-05, + "loss": 0.8757, + "step": 1772 + }, + { + "epoch": 0.27755165936130244, + "grad_norm": 1.9549211263656616, + "learning_rate": 8.96790485500163e-05, + "loss": 0.581, + "step": 1773 + }, + { + "epoch": 0.27770820288040077, + "grad_norm": 1.5482878684997559, + "learning_rate": 8.967090257412839e-05, + "loss": 0.6678, + "step": 1774 + }, + { + "epoch": 0.27786474639949904, + "grad_norm": 3.391881227493286, + "learning_rate": 8.966275659824048e-05, + "loss": 1.1538, + "step": 1775 + }, + { + "epoch": 0.27802128991859737, + "grad_norm": 2.505235433578491, + "learning_rate": 8.965461062235256e-05, + "loss": 1.1102, + "step": 1776 + }, + { + "epoch": 0.2781778334376957, + "grad_norm": 2.5876293182373047, + "learning_rate": 8.964646464646466e-05, + "loss": 0.7176, + "step": 1777 + }, + { + "epoch": 0.27833437695679397, + "grad_norm": 3.0793161392211914, + "learning_rate": 8.963831867057674e-05, + "loss": 1.0563, + "step": 1778 + }, + { + "epoch": 0.2784909204758923, + "grad_norm": 3.170732259750366, + "learning_rate": 8.963017269468882e-05, + "loss": 0.7883, + "step": 1779 + }, + { + "epoch": 0.27864746399499063, + "grad_norm": 4.032870292663574, + "learning_rate": 8.962202671880092e-05, + "loss": 0.969, + "step": 1780 + }, + { + "epoch": 0.2788040075140889, + "grad_norm": 2.2664260864257812, + "learning_rate": 8.9613880742913e-05, + "loss": 0.8181, + "step": 1781 + }, + { + "epoch": 0.27896055103318723, + "grad_norm": 2.042433977127075, + "learning_rate": 8.960573476702509e-05, + "loss": 0.6259, + "step": 1782 + }, + { + "epoch": 0.27911709455228556, + 
"grad_norm": 2.56362247467041, + "learning_rate": 8.959758879113719e-05, + "loss": 1.5091, + "step": 1783 + }, + { + "epoch": 0.27927363807138383, + "grad_norm": 2.4623000621795654, + "learning_rate": 8.958944281524927e-05, + "loss": 1.3454, + "step": 1784 + }, + { + "epoch": 0.27943018159048216, + "grad_norm": 4.326159954071045, + "learning_rate": 8.958129683936137e-05, + "loss": 1.1744, + "step": 1785 + }, + { + "epoch": 0.27958672510958044, + "grad_norm": 3.1618175506591797, + "learning_rate": 8.957315086347344e-05, + "loss": 1.3202, + "step": 1786 + }, + { + "epoch": 0.27974326862867877, + "grad_norm": 4.600327968597412, + "learning_rate": 8.956500488758554e-05, + "loss": 1.5306, + "step": 1787 + }, + { + "epoch": 0.2798998121477771, + "grad_norm": 3.388803482055664, + "learning_rate": 8.955685891169763e-05, + "loss": 1.2957, + "step": 1788 + }, + { + "epoch": 0.28005635566687537, + "grad_norm": 5.55760383605957, + "learning_rate": 8.95487129358097e-05, + "loss": 1.3931, + "step": 1789 + }, + { + "epoch": 0.2802128991859737, + "grad_norm": 3.614032030105591, + "learning_rate": 8.95405669599218e-05, + "loss": 1.3556, + "step": 1790 + }, + { + "epoch": 0.280369442705072, + "grad_norm": 4.4853105545043945, + "learning_rate": 8.95324209840339e-05, + "loss": 1.2581, + "step": 1791 + }, + { + "epoch": 0.2805259862241703, + "grad_norm": 3.590324878692627, + "learning_rate": 8.952427500814598e-05, + "loss": 1.3532, + "step": 1792 + }, + { + "epoch": 0.2806825297432686, + "grad_norm": 5.803063869476318, + "learning_rate": 8.951612903225806e-05, + "loss": 1.083, + "step": 1793 + }, + { + "epoch": 0.28083907326236696, + "grad_norm": 1.6977397203445435, + "learning_rate": 8.950798305637016e-05, + "loss": 0.8995, + "step": 1794 + }, + { + "epoch": 0.28099561678146523, + "grad_norm": 2.5930075645446777, + "learning_rate": 8.949983708048225e-05, + "loss": 1.7919, + "step": 1795 + }, + { + "epoch": 0.28115216030056356, + "grad_norm": 4.576322555541992, + "learning_rate": 
8.949169110459433e-05, + "loss": 1.1844, + "step": 1796 + }, + { + "epoch": 0.2813087038196619, + "grad_norm": 3.4652140140533447, + "learning_rate": 8.948354512870643e-05, + "loss": 0.7451, + "step": 1797 + }, + { + "epoch": 0.28146524733876016, + "grad_norm": 2.1688907146453857, + "learning_rate": 8.947539915281851e-05, + "loss": 0.705, + "step": 1798 + }, + { + "epoch": 0.2816217908578585, + "grad_norm": 4.255823612213135, + "learning_rate": 8.94672531769306e-05, + "loss": 1.7138, + "step": 1799 + }, + { + "epoch": 0.2817783343769568, + "grad_norm": 2.3684945106506348, + "learning_rate": 8.945910720104269e-05, + "loss": 1.1939, + "step": 1800 + }, + { + "epoch": 0.2819348778960551, + "grad_norm": 0.6221145987510681, + "learning_rate": 8.945096122515478e-05, + "loss": 0.325, + "step": 1801 + }, + { + "epoch": 0.2820914214151534, + "grad_norm": 0.7639713883399963, + "learning_rate": 8.944281524926686e-05, + "loss": 0.3852, + "step": 1802 + }, + { + "epoch": 0.28224796493425175, + "grad_norm": 0.8481013178825378, + "learning_rate": 8.943466927337896e-05, + "loss": 0.4643, + "step": 1803 + }, + { + "epoch": 0.28240450845335, + "grad_norm": 0.6845239400863647, + "learning_rate": 8.942652329749104e-05, + "loss": 0.3558, + "step": 1804 + }, + { + "epoch": 0.28256105197244835, + "grad_norm": 0.9087247848510742, + "learning_rate": 8.941837732160312e-05, + "loss": 0.4233, + "step": 1805 + }, + { + "epoch": 0.2827175954915466, + "grad_norm": 0.9834880828857422, + "learning_rate": 8.941023134571522e-05, + "loss": 0.4903, + "step": 1806 + }, + { + "epoch": 0.28287413901064495, + "grad_norm": 1.1650323867797852, + "learning_rate": 8.94020853698273e-05, + "loss": 0.4452, + "step": 1807 + }, + { + "epoch": 0.2830306825297433, + "grad_norm": 0.8333632349967957, + "learning_rate": 8.93939393939394e-05, + "loss": 0.5153, + "step": 1808 + }, + { + "epoch": 0.28318722604884156, + "grad_norm": 1.5383979082107544, + "learning_rate": 8.938579341805149e-05, + "loss": 0.5096, + "step": 
1809 + }, + { + "epoch": 0.2833437695679399, + "grad_norm": 1.6592988967895508, + "learning_rate": 8.937764744216357e-05, + "loss": 0.5723, + "step": 1810 + }, + { + "epoch": 0.2835003130870382, + "grad_norm": 1.9601925611495972, + "learning_rate": 8.936950146627567e-05, + "loss": 0.5626, + "step": 1811 + }, + { + "epoch": 0.2836568566061365, + "grad_norm": 1.2728314399719238, + "learning_rate": 8.936135549038775e-05, + "loss": 0.4945, + "step": 1812 + }, + { + "epoch": 0.2838134001252348, + "grad_norm": 1.2619824409484863, + "learning_rate": 8.935320951449983e-05, + "loss": 0.5074, + "step": 1813 + }, + { + "epoch": 0.28396994364433314, + "grad_norm": 2.5774314403533936, + "learning_rate": 8.934506353861193e-05, + "loss": 0.7879, + "step": 1814 + }, + { + "epoch": 0.2841264871634314, + "grad_norm": 2.527723789215088, + "learning_rate": 8.933691756272402e-05, + "loss": 0.505, + "step": 1815 + }, + { + "epoch": 0.28428303068252975, + "grad_norm": 1.1131607294082642, + "learning_rate": 8.93287715868361e-05, + "loss": 0.4697, + "step": 1816 + }, + { + "epoch": 0.2844395742016281, + "grad_norm": 1.5273760557174683, + "learning_rate": 8.93206256109482e-05, + "loss": 0.6491, + "step": 1817 + }, + { + "epoch": 0.28459611772072635, + "grad_norm": 1.2223633527755737, + "learning_rate": 8.931247963506028e-05, + "loss": 0.5732, + "step": 1818 + }, + { + "epoch": 0.2847526612398247, + "grad_norm": 2.0504844188690186, + "learning_rate": 8.930433365917236e-05, + "loss": 0.7298, + "step": 1819 + }, + { + "epoch": 0.284909204758923, + "grad_norm": 1.3588988780975342, + "learning_rate": 8.929618768328446e-05, + "loss": 0.466, + "step": 1820 + }, + { + "epoch": 0.2850657482780213, + "grad_norm": 1.2064194679260254, + "learning_rate": 8.928804170739656e-05, + "loss": 0.5392, + "step": 1821 + }, + { + "epoch": 0.2852222917971196, + "grad_norm": 1.8374475240707397, + "learning_rate": 8.927989573150863e-05, + "loss": 0.6484, + "step": 1822 + }, + { + "epoch": 0.2853788353162179, + 
"grad_norm": 9.254456520080566, + "learning_rate": 8.927174975562073e-05, + "loss": 0.6382, + "step": 1823 + }, + { + "epoch": 0.2855353788353162, + "grad_norm": 3.163681983947754, + "learning_rate": 8.926360377973282e-05, + "loss": 0.9593, + "step": 1824 + }, + { + "epoch": 0.28569192235441454, + "grad_norm": 3.5363292694091797, + "learning_rate": 8.92554578038449e-05, + "loss": 0.9047, + "step": 1825 + }, + { + "epoch": 0.2858484658735128, + "grad_norm": 5.192300319671631, + "learning_rate": 8.924731182795699e-05, + "loss": 0.7968, + "step": 1826 + }, + { + "epoch": 0.28600500939261114, + "grad_norm": 3.389878511428833, + "learning_rate": 8.923916585206909e-05, + "loss": 0.9269, + "step": 1827 + }, + { + "epoch": 0.28616155291170947, + "grad_norm": 1.9425809383392334, + "learning_rate": 8.923101987618116e-05, + "loss": 0.7722, + "step": 1828 + }, + { + "epoch": 0.28631809643080774, + "grad_norm": 3.7216553688049316, + "learning_rate": 8.922287390029326e-05, + "loss": 1.0462, + "step": 1829 + }, + { + "epoch": 0.2864746399499061, + "grad_norm": 3.4610157012939453, + "learning_rate": 8.921472792440535e-05, + "loss": 0.8781, + "step": 1830 + }, + { + "epoch": 0.2866311834690044, + "grad_norm": 1.7822812795639038, + "learning_rate": 8.920658194851744e-05, + "loss": 0.6044, + "step": 1831 + }, + { + "epoch": 0.2867877269881027, + "grad_norm": 7.701934814453125, + "learning_rate": 8.919843597262952e-05, + "loss": 1.2144, + "step": 1832 + }, + { + "epoch": 0.286944270507201, + "grad_norm": 3.4626471996307373, + "learning_rate": 8.919028999674162e-05, + "loss": 0.9313, + "step": 1833 + }, + { + "epoch": 0.28710081402629933, + "grad_norm": 2.785343647003174, + "learning_rate": 8.91821440208537e-05, + "loss": 1.0507, + "step": 1834 + }, + { + "epoch": 0.2872573575453976, + "grad_norm": 3.1687753200531006, + "learning_rate": 8.917399804496579e-05, + "loss": 1.1584, + "step": 1835 + }, + { + "epoch": 0.28741390106449594, + "grad_norm": 1.6342514753341675, + "learning_rate": 
8.916585206907788e-05, + "loss": 0.8061, + "step": 1836 + }, + { + "epoch": 0.28757044458359426, + "grad_norm": 3.7539985179901123, + "learning_rate": 8.915770609318997e-05, + "loss": 1.3897, + "step": 1837 + }, + { + "epoch": 0.28772698810269254, + "grad_norm": 3.0565879344940186, + "learning_rate": 8.914956011730205e-05, + "loss": 0.9185, + "step": 1838 + }, + { + "epoch": 0.28788353162179087, + "grad_norm": 4.856945037841797, + "learning_rate": 8.914141414141415e-05, + "loss": 1.1651, + "step": 1839 + }, + { + "epoch": 0.28804007514088914, + "grad_norm": 2.464050769805908, + "learning_rate": 8.913326816552623e-05, + "loss": 1.3773, + "step": 1840 + }, + { + "epoch": 0.28819661865998747, + "grad_norm": 2.135143756866455, + "learning_rate": 8.912512218963832e-05, + "loss": 1.27, + "step": 1841 + }, + { + "epoch": 0.2883531621790858, + "grad_norm": 2.7232370376586914, + "learning_rate": 8.911697621375041e-05, + "loss": 1.2149, + "step": 1842 + }, + { + "epoch": 0.28850970569818407, + "grad_norm": 2.330357551574707, + "learning_rate": 8.91088302378625e-05, + "loss": 1.6471, + "step": 1843 + }, + { + "epoch": 0.2886662492172824, + "grad_norm": 2.979038715362549, + "learning_rate": 8.91006842619746e-05, + "loss": 1.8597, + "step": 1844 + }, + { + "epoch": 0.28882279273638073, + "grad_norm": 3.7491841316223145, + "learning_rate": 8.909253828608668e-05, + "loss": 2.0154, + "step": 1845 + }, + { + "epoch": 0.288979336255479, + "grad_norm": 2.207730770111084, + "learning_rate": 8.908439231019876e-05, + "loss": 1.0706, + "step": 1846 + }, + { + "epoch": 0.28913587977457733, + "grad_norm": 2.9485859870910645, + "learning_rate": 8.907624633431086e-05, + "loss": 1.6923, + "step": 1847 + }, + { + "epoch": 0.28929242329367566, + "grad_norm": 2.8472659587860107, + "learning_rate": 8.906810035842294e-05, + "loss": 1.2866, + "step": 1848 + }, + { + "epoch": 0.28944896681277393, + "grad_norm": 7.264497756958008, + "learning_rate": 8.905995438253503e-05, + "loss": 1.2332, + "step": 
1849 + }, + { + "epoch": 0.28960551033187226, + "grad_norm": 2.0563066005706787, + "learning_rate": 8.905180840664712e-05, + "loss": 1.5016, + "step": 1850 + }, + { + "epoch": 0.2897620538509706, + "grad_norm": 0.8994681239128113, + "learning_rate": 8.904366243075921e-05, + "loss": 0.5265, + "step": 1851 + }, + { + "epoch": 0.28991859737006886, + "grad_norm": 0.6556351780891418, + "learning_rate": 8.903551645487129e-05, + "loss": 0.2767, + "step": 1852 + }, + { + "epoch": 0.2900751408891672, + "grad_norm": 1.2671905755996704, + "learning_rate": 8.902737047898339e-05, + "loss": 0.4405, + "step": 1853 + }, + { + "epoch": 0.2902316844082655, + "grad_norm": 0.9340997338294983, + "learning_rate": 8.901922450309547e-05, + "loss": 0.4529, + "step": 1854 + }, + { + "epoch": 0.2903882279273638, + "grad_norm": 0.8327419757843018, + "learning_rate": 8.901107852720756e-05, + "loss": 0.4605, + "step": 1855 + }, + { + "epoch": 0.2905447714464621, + "grad_norm": 3.003971815109253, + "learning_rate": 8.900293255131965e-05, + "loss": 0.938, + "step": 1856 + }, + { + "epoch": 0.29070131496556045, + "grad_norm": 0.9908445477485657, + "learning_rate": 8.899478657543174e-05, + "loss": 0.5667, + "step": 1857 + }, + { + "epoch": 0.2908578584846587, + "grad_norm": 1.3735657930374146, + "learning_rate": 8.898664059954382e-05, + "loss": 0.2547, + "step": 1858 + }, + { + "epoch": 0.29101440200375706, + "grad_norm": 1.1039601564407349, + "learning_rate": 8.897849462365592e-05, + "loss": 0.4255, + "step": 1859 + }, + { + "epoch": 0.29117094552285533, + "grad_norm": 1.7538162469863892, + "learning_rate": 8.897034864776801e-05, + "loss": 0.3395, + "step": 1860 + }, + { + "epoch": 0.29132748904195366, + "grad_norm": 1.5583144426345825, + "learning_rate": 8.896220267188009e-05, + "loss": 0.5326, + "step": 1861 + }, + { + "epoch": 0.291484032561052, + "grad_norm": 1.3862868547439575, + "learning_rate": 8.895405669599218e-05, + "loss": 0.3931, + "step": 1862 + }, + { + "epoch": 0.29164057608015026, 
+ "grad_norm": 1.939098834991455, + "learning_rate": 8.894591072010428e-05, + "loss": 0.5491, + "step": 1863 + }, + { + "epoch": 0.2917971195992486, + "grad_norm": 1.1352040767669678, + "learning_rate": 8.893776474421635e-05, + "loss": 0.447, + "step": 1864 + }, + { + "epoch": 0.2919536631183469, + "grad_norm": 1.6663120985031128, + "learning_rate": 8.892961876832845e-05, + "loss": 0.5787, + "step": 1865 + }, + { + "epoch": 0.2921102066374452, + "grad_norm": 2.7614102363586426, + "learning_rate": 8.892147279244054e-05, + "loss": 0.6636, + "step": 1866 + }, + { + "epoch": 0.2922667501565435, + "grad_norm": 1.2335480451583862, + "learning_rate": 8.891332681655263e-05, + "loss": 0.6903, + "step": 1867 + }, + { + "epoch": 0.29242329367564185, + "grad_norm": 1.9517966508865356, + "learning_rate": 8.890518084066471e-05, + "loss": 0.6686, + "step": 1868 + }, + { + "epoch": 0.2925798371947401, + "grad_norm": 2.9337942600250244, + "learning_rate": 8.889703486477681e-05, + "loss": 0.7743, + "step": 1869 + }, + { + "epoch": 0.29273638071383845, + "grad_norm": 1.9798587560653687, + "learning_rate": 8.888888888888889e-05, + "loss": 0.713, + "step": 1870 + }, + { + "epoch": 0.2928929242329368, + "grad_norm": 2.7326884269714355, + "learning_rate": 8.888074291300098e-05, + "loss": 0.8673, + "step": 1871 + }, + { + "epoch": 0.29304946775203505, + "grad_norm": 2.378056526184082, + "learning_rate": 8.887259693711307e-05, + "loss": 0.5673, + "step": 1872 + }, + { + "epoch": 0.2932060112711334, + "grad_norm": 2.507891893386841, + "learning_rate": 8.886445096122516e-05, + "loss": 0.8136, + "step": 1873 + }, + { + "epoch": 0.2933625547902317, + "grad_norm": 2.242013454437256, + "learning_rate": 8.885630498533724e-05, + "loss": 0.6717, + "step": 1874 + }, + { + "epoch": 0.29351909830933, + "grad_norm": 3.72845721244812, + "learning_rate": 8.884815900944934e-05, + "loss": 0.7939, + "step": 1875 + }, + { + "epoch": 0.2936756418284283, + "grad_norm": 3.259105682373047, + "learning_rate": 
8.884001303356142e-05, + "loss": 0.7554, + "step": 1876 + }, + { + "epoch": 0.2938321853475266, + "grad_norm": 4.963883876800537, + "learning_rate": 8.88318670576735e-05, + "loss": 0.8804, + "step": 1877 + }, + { + "epoch": 0.2939887288666249, + "grad_norm": 4.920091152191162, + "learning_rate": 8.88237210817856e-05, + "loss": 0.6536, + "step": 1878 + }, + { + "epoch": 0.29414527238572324, + "grad_norm": 3.481536626815796, + "learning_rate": 8.881557510589769e-05, + "loss": 1.2798, + "step": 1879 + }, + { + "epoch": 0.2943018159048215, + "grad_norm": 4.095098495483398, + "learning_rate": 8.880742913000978e-05, + "loss": 1.0706, + "step": 1880 + }, + { + "epoch": 0.29445835942391985, + "grad_norm": 3.1488897800445557, + "learning_rate": 8.879928315412187e-05, + "loss": 1.1732, + "step": 1881 + }, + { + "epoch": 0.2946149029430182, + "grad_norm": 2.824317693710327, + "learning_rate": 8.879113717823395e-05, + "loss": 1.0249, + "step": 1882 + }, + { + "epoch": 0.29477144646211645, + "grad_norm": 2.9047725200653076, + "learning_rate": 8.878299120234605e-05, + "loss": 0.9823, + "step": 1883 + }, + { + "epoch": 0.2949279899812148, + "grad_norm": 2.729094982147217, + "learning_rate": 8.877484522645813e-05, + "loss": 1.1188, + "step": 1884 + }, + { + "epoch": 0.2950845335003131, + "grad_norm": 3.078465700149536, + "learning_rate": 8.876669925057022e-05, + "loss": 1.2203, + "step": 1885 + }, + { + "epoch": 0.2952410770194114, + "grad_norm": 3.572333335876465, + "learning_rate": 8.875855327468231e-05, + "loss": 1.5925, + "step": 1886 + }, + { + "epoch": 0.2953976205385097, + "grad_norm": 4.519033432006836, + "learning_rate": 8.87504072987944e-05, + "loss": 0.9121, + "step": 1887 + }, + { + "epoch": 0.29555416405760804, + "grad_norm": 4.113150119781494, + "learning_rate": 8.874226132290648e-05, + "loss": 1.4363, + "step": 1888 + }, + { + "epoch": 0.2957107075767063, + "grad_norm": 3.098543643951416, + "learning_rate": 8.873411534701858e-05, + "loss": 1.4466, + "step": 1889 + 
}, + { + "epoch": 0.29586725109580464, + "grad_norm": 3.0643410682678223, + "learning_rate": 8.872596937113066e-05, + "loss": 0.857, + "step": 1890 + }, + { + "epoch": 0.29602379461490297, + "grad_norm": 4.259333610534668, + "learning_rate": 8.871782339524275e-05, + "loss": 1.6116, + "step": 1891 + }, + { + "epoch": 0.29618033813400124, + "grad_norm": 2.9427287578582764, + "learning_rate": 8.870967741935484e-05, + "loss": 1.6114, + "step": 1892 + }, + { + "epoch": 0.29633688165309957, + "grad_norm": 3.6431381702423096, + "learning_rate": 8.870153144346693e-05, + "loss": 1.4819, + "step": 1893 + }, + { + "epoch": 0.29649342517219784, + "grad_norm": 5.553595542907715, + "learning_rate": 8.869338546757901e-05, + "loss": 2.1768, + "step": 1894 + }, + { + "epoch": 0.2966499686912962, + "grad_norm": 4.403077125549316, + "learning_rate": 8.868523949169111e-05, + "loss": 1.4366, + "step": 1895 + }, + { + "epoch": 0.2968065122103945, + "grad_norm": NaN, + "learning_rate": 8.868523949169111e-05, + "loss": 0.0, + "step": 1896 + }, + { + "epoch": 0.2969630557294928, + "grad_norm": 2.612093687057495, + "learning_rate": 8.86770935158032e-05, + "loss": 1.0124, + "step": 1897 + }, + { + "epoch": 0.2971195992485911, + "grad_norm": 5.299230098724365, + "learning_rate": 8.866894753991528e-05, + "loss": 0.5645, + "step": 1898 + }, + { + "epoch": 0.29727614276768943, + "grad_norm": 3.1130571365356445, + "learning_rate": 8.866080156402737e-05, + "loss": 0.9777, + "step": 1899 + }, + { + "epoch": 0.2974326862867877, + "grad_norm": 3.1861791610717773, + "learning_rate": 8.865265558813947e-05, + "loss": 1.2846, + "step": 1900 + }, + { + "epoch": 0.29758922980588604, + "grad_norm": 0.7234946489334106, + "learning_rate": 8.864450961225154e-05, + "loss": 0.3447, + "step": 1901 + }, + { + "epoch": 0.29774577332498436, + "grad_norm": 0.6404730677604675, + "learning_rate": 8.863636363636364e-05, + "loss": 0.4208, + "step": 1902 + }, + { + "epoch": 0.29790231684408264, + "grad_norm": 
0.8849920630455017, + "learning_rate": 8.862821766047574e-05, + "loss": 0.3418, + "step": 1903 + }, + { + "epoch": 0.29805886036318097, + "grad_norm": 0.7377390265464783, + "learning_rate": 8.862007168458782e-05, + "loss": 0.2606, + "step": 1904 + }, + { + "epoch": 0.2982154038822793, + "grad_norm": 0.6726051568984985, + "learning_rate": 8.86119257086999e-05, + "loss": 0.3439, + "step": 1905 + }, + { + "epoch": 0.29837194740137757, + "grad_norm": 0.8146868944168091, + "learning_rate": 8.8603779732812e-05, + "loss": 0.3712, + "step": 1906 + }, + { + "epoch": 0.2985284909204759, + "grad_norm": 0.8280919194221497, + "learning_rate": 8.859563375692408e-05, + "loss": 0.4395, + "step": 1907 + }, + { + "epoch": 0.2986850344395742, + "grad_norm": 1.3734103441238403, + "learning_rate": 8.858748778103617e-05, + "loss": 0.4464, + "step": 1908 + }, + { + "epoch": 0.2988415779586725, + "grad_norm": 1.081286072731018, + "learning_rate": 8.857934180514827e-05, + "loss": 0.3731, + "step": 1909 + }, + { + "epoch": 0.29899812147777083, + "grad_norm": 1.2388032674789429, + "learning_rate": 8.857119582926035e-05, + "loss": 0.8275, + "step": 1910 + }, + { + "epoch": 0.29915466499686916, + "grad_norm": 1.1351431608200073, + "learning_rate": 8.856304985337243e-05, + "loss": 0.4832, + "step": 1911 + }, + { + "epoch": 0.29931120851596743, + "grad_norm": 2.039752721786499, + "learning_rate": 8.855490387748453e-05, + "loss": 0.5968, + "step": 1912 + }, + { + "epoch": 0.29946775203506576, + "grad_norm": 1.1946221590042114, + "learning_rate": 8.854675790159661e-05, + "loss": 0.5558, + "step": 1913 + }, + { + "epoch": 0.29962429555416403, + "grad_norm": 1.142924427986145, + "learning_rate": 8.85386119257087e-05, + "loss": 0.6164, + "step": 1914 + }, + { + "epoch": 0.29978083907326236, + "grad_norm": 2.5692436695098877, + "learning_rate": 8.85304659498208e-05, + "loss": 0.8327, + "step": 1915 + }, + { + "epoch": 0.2999373825923607, + "grad_norm": 1.1967829465866089, + "learning_rate": 
8.852231997393288e-05, + "loss": 0.5404, + "step": 1916 + }, + { + "epoch": 0.30009392611145896, + "grad_norm": 2.4320409297943115, + "learning_rate": 8.851417399804496e-05, + "loss": 0.519, + "step": 1917 + }, + { + "epoch": 0.3002504696305573, + "grad_norm": 1.568655014038086, + "learning_rate": 8.850602802215706e-05, + "loss": 0.5936, + "step": 1918 + }, + { + "epoch": 0.3004070131496556, + "grad_norm": 1.3401926755905151, + "learning_rate": 8.849788204626914e-05, + "loss": 0.6481, + "step": 1919 + }, + { + "epoch": 0.3005635566687539, + "grad_norm": 2.952014446258545, + "learning_rate": 8.848973607038124e-05, + "loss": 1.1766, + "step": 1920 + }, + { + "epoch": 0.3007201001878522, + "grad_norm": 1.416664481163025, + "learning_rate": 8.848159009449332e-05, + "loss": 0.8234, + "step": 1921 + }, + { + "epoch": 0.30087664370695055, + "grad_norm": 3.244565725326538, + "learning_rate": 8.847344411860541e-05, + "loss": 0.8304, + "step": 1922 + }, + { + "epoch": 0.3010331872260488, + "grad_norm": 1.3387657403945923, + "learning_rate": 8.84652981427175e-05, + "loss": 0.5963, + "step": 1923 + }, + { + "epoch": 0.30118973074514716, + "grad_norm": 1.4198715686798096, + "learning_rate": 8.845715216682959e-05, + "loss": 0.4284, + "step": 1924 + }, + { + "epoch": 0.3013462742642455, + "grad_norm": 6.774830341339111, + "learning_rate": 8.844900619094167e-05, + "loss": 1.6707, + "step": 1925 + }, + { + "epoch": 0.30150281778334376, + "grad_norm": 2.7941489219665527, + "learning_rate": 8.844086021505377e-05, + "loss": 0.9994, + "step": 1926 + }, + { + "epoch": 0.3016593613024421, + "grad_norm": 1.6777962446212769, + "learning_rate": 8.843271423916585e-05, + "loss": 0.7703, + "step": 1927 + }, + { + "epoch": 0.3018159048215404, + "grad_norm": 5.908127307891846, + "learning_rate": 8.842456826327794e-05, + "loss": 1.4108, + "step": 1928 + }, + { + "epoch": 0.3019724483406387, + "grad_norm": 3.2248497009277344, + "learning_rate": 8.841642228739004e-05, + "loss": 0.8247, + "step": 
1929 + }, + { + "epoch": 0.302128991859737, + "grad_norm": 2.157628297805786, + "learning_rate": 8.840827631150212e-05, + "loss": 0.8339, + "step": 1930 + }, + { + "epoch": 0.3022855353788353, + "grad_norm": 3.3587310314178467, + "learning_rate": 8.84001303356142e-05, + "loss": 1.004, + "step": 1931 + }, + { + "epoch": 0.3024420788979336, + "grad_norm": 2.559804916381836, + "learning_rate": 8.83919843597263e-05, + "loss": 0.8169, + "step": 1932 + }, + { + "epoch": 0.30259862241703195, + "grad_norm": 2.853111982345581, + "learning_rate": 8.83838383838384e-05, + "loss": 1.0843, + "step": 1933 + }, + { + "epoch": 0.3027551659361302, + "grad_norm": 2.171250581741333, + "learning_rate": 8.837569240795047e-05, + "loss": 0.8999, + "step": 1934 + }, + { + "epoch": 0.30291170945522855, + "grad_norm": 12.491081237792969, + "learning_rate": 8.836754643206256e-05, + "loss": 1.4252, + "step": 1935 + }, + { + "epoch": 0.3030682529743269, + "grad_norm": 2.198378086090088, + "learning_rate": 8.835940045617466e-05, + "loss": 1.4267, + "step": 1936 + }, + { + "epoch": 0.30322479649342515, + "grad_norm": 2.7470266819000244, + "learning_rate": 8.835125448028673e-05, + "loss": 1.644, + "step": 1937 + }, + { + "epoch": 0.3033813400125235, + "grad_norm": 8.717738151550293, + "learning_rate": 8.834310850439883e-05, + "loss": 1.1839, + "step": 1938 + }, + { + "epoch": 0.3035378835316218, + "grad_norm": 3.174227237701416, + "learning_rate": 8.833496252851093e-05, + "loss": 1.488, + "step": 1939 + }, + { + "epoch": 0.3036944270507201, + "grad_norm": 3.8922793865203857, + "learning_rate": 8.832681655262301e-05, + "loss": 1.022, + "step": 1940 + }, + { + "epoch": 0.3038509705698184, + "grad_norm": 3.2935373783111572, + "learning_rate": 8.83186705767351e-05, + "loss": 1.8253, + "step": 1941 + }, + { + "epoch": 0.30400751408891674, + "grad_norm": 3.151597738265991, + "learning_rate": 8.831052460084719e-05, + "loss": 1.3769, + "step": 1942 + }, + { + "epoch": 0.304164057608015, + "grad_norm": 
3.010619640350342, + "learning_rate": 8.830237862495928e-05, + "loss": 1.4323, + "step": 1943 + }, + { + "epoch": 0.30432060112711334, + "grad_norm": 2.4194791316986084, + "learning_rate": 8.829423264907136e-05, + "loss": 1.5404, + "step": 1944 + }, + { + "epoch": 0.3044771446462117, + "grad_norm": 2.958662271499634, + "learning_rate": 8.828608667318346e-05, + "loss": 1.2457, + "step": 1945 + }, + { + "epoch": 0.30463368816530995, + "grad_norm": 2.7292213439941406, + "learning_rate": 8.827794069729554e-05, + "loss": 1.2762, + "step": 1946 + }, + { + "epoch": 0.3047902316844083, + "grad_norm": 6.456068515777588, + "learning_rate": 8.826979472140762e-05, + "loss": 1.2043, + "step": 1947 + }, + { + "epoch": 0.30494677520350655, + "grad_norm": 2.9928371906280518, + "learning_rate": 8.826164874551972e-05, + "loss": 0.897, + "step": 1948 + }, + { + "epoch": 0.3051033187226049, + "grad_norm": 2.398953914642334, + "learning_rate": 8.82535027696318e-05, + "loss": 1.2585, + "step": 1949 + }, + { + "epoch": 0.3052598622417032, + "grad_norm": 2.459451675415039, + "learning_rate": 8.824535679374389e-05, + "loss": 1.4696, + "step": 1950 + }, + { + "epoch": 0.3054164057608015, + "grad_norm": 0.7492189407348633, + "learning_rate": 8.823721081785599e-05, + "loss": 0.3557, + "step": 1951 + }, + { + "epoch": 0.3055729492798998, + "grad_norm": 0.8600263595581055, + "learning_rate": 8.822906484196807e-05, + "loss": 0.4602, + "step": 1952 + }, + { + "epoch": 0.30572949279899814, + "grad_norm": 0.7178438901901245, + "learning_rate": 8.822091886608015e-05, + "loss": 0.388, + "step": 1953 + }, + { + "epoch": 0.3058860363180964, + "grad_norm": 0.6664970517158508, + "learning_rate": 8.821277289019225e-05, + "loss": 0.4097, + "step": 1954 + }, + { + "epoch": 0.30604257983719474, + "grad_norm": 0.9541361331939697, + "learning_rate": 8.820462691430433e-05, + "loss": 0.4427, + "step": 1955 + }, + { + "epoch": 0.30619912335629307, + "grad_norm": 0.9960927367210388, + "learning_rate": 
8.819648093841643e-05, + "loss": 0.4143, + "step": 1956 + }, + { + "epoch": 0.30635566687539134, + "grad_norm": 0.5378838777542114, + "learning_rate": 8.818833496252852e-05, + "loss": 0.3112, + "step": 1957 + }, + { + "epoch": 0.30651221039448967, + "grad_norm": 0.8386194705963135, + "learning_rate": 8.81801889866406e-05, + "loss": 0.3642, + "step": 1958 + }, + { + "epoch": 0.306668753913588, + "grad_norm": 1.215386152267456, + "learning_rate": 8.81720430107527e-05, + "loss": 0.615, + "step": 1959 + }, + { + "epoch": 0.3068252974326863, + "grad_norm": 1.3096256256103516, + "learning_rate": 8.816389703486478e-05, + "loss": 0.3412, + "step": 1960 + }, + { + "epoch": 0.3069818409517846, + "grad_norm": 2.169621229171753, + "learning_rate": 8.815575105897686e-05, + "loss": 0.4968, + "step": 1961 + }, + { + "epoch": 0.30713838447088293, + "grad_norm": 1.014849066734314, + "learning_rate": 8.814760508308896e-05, + "loss": 0.3561, + "step": 1962 + }, + { + "epoch": 0.3072949279899812, + "grad_norm": 1.0465404987335205, + "learning_rate": 8.813945910720105e-05, + "loss": 0.3666, + "step": 1963 + }, + { + "epoch": 0.30745147150907953, + "grad_norm": 1.8653658628463745, + "learning_rate": 8.813131313131313e-05, + "loss": 0.6684, + "step": 1964 + }, + { + "epoch": 0.3076080150281778, + "grad_norm": 1.3417658805847168, + "learning_rate": 8.812316715542523e-05, + "loss": 0.6149, + "step": 1965 + }, + { + "epoch": 0.30776455854727613, + "grad_norm": 2.666325092315674, + "learning_rate": 8.811502117953731e-05, + "loss": 0.8373, + "step": 1966 + }, + { + "epoch": 0.30792110206637446, + "grad_norm": 1.744035005569458, + "learning_rate": 8.81068752036494e-05, + "loss": 0.5316, + "step": 1967 + }, + { + "epoch": 0.30807764558547274, + "grad_norm": 1.491253137588501, + "learning_rate": 8.809872922776149e-05, + "loss": 0.6164, + "step": 1968 + }, + { + "epoch": 0.30823418910457107, + "grad_norm": 1.467315673828125, + "learning_rate": 8.809058325187359e-05, + "loss": 0.6146, + "step": 
1969 + }, + { + "epoch": 0.3083907326236694, + "grad_norm": 1.818896770477295, + "learning_rate": 8.808243727598566e-05, + "loss": 0.7588, + "step": 1970 + }, + { + "epoch": 0.30854727614276767, + "grad_norm": 2.273716449737549, + "learning_rate": 8.807429130009776e-05, + "loss": 0.5891, + "step": 1971 + }, + { + "epoch": 0.308703819661866, + "grad_norm": 1.7820109128952026, + "learning_rate": 8.806614532420985e-05, + "loss": 0.8615, + "step": 1972 + }, + { + "epoch": 0.3088603631809643, + "grad_norm": 9.813129425048828, + "learning_rate": 8.805799934832192e-05, + "loss": 0.988, + "step": 1973 + }, + { + "epoch": 0.3090169067000626, + "grad_norm": 1.585134506225586, + "learning_rate": 8.804985337243402e-05, + "loss": 0.6547, + "step": 1974 + }, + { + "epoch": 0.30917345021916093, + "grad_norm": 2.622584819793701, + "learning_rate": 8.804170739654612e-05, + "loss": 0.7031, + "step": 1975 + }, + { + "epoch": 0.30932999373825926, + "grad_norm": 2.317131519317627, + "learning_rate": 8.803356142065819e-05, + "loss": 0.8381, + "step": 1976 + }, + { + "epoch": 0.30948653725735753, + "grad_norm": 11.435327529907227, + "learning_rate": 8.802541544477029e-05, + "loss": 1.0216, + "step": 1977 + }, + { + "epoch": 0.30964308077645586, + "grad_norm": 1.918535828590393, + "learning_rate": 8.801726946888238e-05, + "loss": 0.8846, + "step": 1978 + }, + { + "epoch": 0.3097996242955542, + "grad_norm": 3.1633853912353516, + "learning_rate": 8.800912349299447e-05, + "loss": 0.9591, + "step": 1979 + }, + { + "epoch": 0.30995616781465246, + "grad_norm": 1.8861132860183716, + "learning_rate": 8.800097751710655e-05, + "loss": 0.7827, + "step": 1980 + }, + { + "epoch": 0.3101127113337508, + "grad_norm": 1.6260969638824463, + "learning_rate": 8.799283154121865e-05, + "loss": 0.831, + "step": 1981 + }, + { + "epoch": 0.3102692548528491, + "grad_norm": 3.2569732666015625, + "learning_rate": 8.798468556533073e-05, + "loss": 1.4121, + "step": 1982 + }, + { + "epoch": 0.3104257983719474, + 
"grad_norm": 2.381507635116577, + "learning_rate": 8.797653958944282e-05, + "loss": 0.4587, + "step": 1983 + }, + { + "epoch": 0.3105823418910457, + "grad_norm": 4.035518646240234, + "learning_rate": 8.796839361355491e-05, + "loss": 1.0613, + "step": 1984 + }, + { + "epoch": 0.310738885410144, + "grad_norm": 2.4049019813537598, + "learning_rate": 8.7960247637667e-05, + "loss": 0.8544, + "step": 1985 + }, + { + "epoch": 0.3108954289292423, + "grad_norm": 3.119093656539917, + "learning_rate": 8.795210166177908e-05, + "loss": 1.1552, + "step": 1986 + }, + { + "epoch": 0.31105197244834065, + "grad_norm": 3.884453058242798, + "learning_rate": 8.794395568589118e-05, + "loss": 1.1737, + "step": 1987 + }, + { + "epoch": 0.3112085159674389, + "grad_norm": 6.004324913024902, + "learning_rate": 8.793580971000326e-05, + "loss": 1.0646, + "step": 1988 + }, + { + "epoch": 0.31136505948653725, + "grad_norm": 7.916918754577637, + "learning_rate": 8.792766373411534e-05, + "loss": 1.6778, + "step": 1989 + }, + { + "epoch": 0.3115216030056356, + "grad_norm": 3.3031816482543945, + "learning_rate": 8.791951775822744e-05, + "loss": 1.2307, + "step": 1990 + }, + { + "epoch": 0.31167814652473386, + "grad_norm": 2.160187244415283, + "learning_rate": 8.791137178233953e-05, + "loss": 1.5051, + "step": 1991 + }, + { + "epoch": 0.3118346900438322, + "grad_norm": 2.4796862602233887, + "learning_rate": 8.790322580645162e-05, + "loss": 1.3517, + "step": 1992 + }, + { + "epoch": 0.3119912335629305, + "grad_norm": 3.175830841064453, + "learning_rate": 8.789507983056371e-05, + "loss": 1.494, + "step": 1993 + }, + { + "epoch": 0.3121477770820288, + "grad_norm": 2.8282599449157715, + "learning_rate": 8.788693385467579e-05, + "loss": 1.1333, + "step": 1994 + }, + { + "epoch": 0.3123043206011271, + "grad_norm": 6.8760809898376465, + "learning_rate": 8.787878787878789e-05, + "loss": 1.3573, + "step": 1995 + }, + { + "epoch": 0.31246086412022545, + "grad_norm": 12.71776294708252, + "learning_rate": 
8.787064190289997e-05, + "loss": 1.0802, + "step": 1996 + }, + { + "epoch": 0.3126174076393237, + "grad_norm": 9.506250381469727, + "learning_rate": 8.786249592701206e-05, + "loss": 1.0868, + "step": 1997 + }, + { + "epoch": 0.31277395115842205, + "grad_norm": 7.733770370483398, + "learning_rate": 8.785434995112415e-05, + "loss": 1.766, + "step": 1998 + }, + { + "epoch": 0.3129304946775204, + "grad_norm": 2.842456817626953, + "learning_rate": 8.784620397523624e-05, + "loss": 0.994, + "step": 1999 + }, + { + "epoch": 0.31308703819661865, + "grad_norm": 3.3281333446502686, + "learning_rate": 8.783805799934832e-05, + "loss": 1.4052, + "step": 2000 + }, + { + "epoch": 0.31308703819661865, + "eval_loss": 0.7311397194862366, + "eval_runtime": 203.5702, + "eval_samples_per_second": 60.829, + "eval_steps_per_second": 3.802, + "eval_wer": 0.40826019508255945, + "step": 2000 + }, + { + "epoch": 0.313243581715717, + "grad_norm": 2.0226709842681885, + "learning_rate": 8.782991202346042e-05, + "loss": 0.5349, + "step": 2001 + }, + { + "epoch": 0.31340012523481525, + "grad_norm": 0.7856681942939758, + "learning_rate": 8.78217660475725e-05, + "loss": 0.3936, + "step": 2002 + }, + { + "epoch": 0.3135566687539136, + "grad_norm": 0.9928082227706909, + "learning_rate": 8.781362007168459e-05, + "loss": 0.4639, + "step": 2003 + }, + { + "epoch": 0.3137132122730119, + "grad_norm": 0.8502695560455322, + "learning_rate": 8.780547409579668e-05, + "loss": 0.3326, + "step": 2004 + }, + { + "epoch": 0.3138697557921102, + "grad_norm": 1.1199010610580444, + "learning_rate": 8.779732811990877e-05, + "loss": 0.4597, + "step": 2005 + }, + { + "epoch": 0.3140262993112085, + "grad_norm": 1.2372764348983765, + "learning_rate": 8.778918214402085e-05, + "loss": 0.4414, + "step": 2006 + }, + { + "epoch": 0.31418284283030684, + "grad_norm": 0.7484068870544434, + "learning_rate": 8.778103616813295e-05, + "loss": 0.3225, + "step": 2007 + }, + { + "epoch": 0.3143393863494051, + "grad_norm": 
0.9331351518630981, + "learning_rate": 8.777289019224504e-05, + "loss": 0.5828, + "step": 2008 + }, + { + "epoch": 0.31449592986850344, + "grad_norm": 1.1956268548965454, + "learning_rate": 8.776474421635711e-05, + "loss": 0.4123, + "step": 2009 + }, + { + "epoch": 0.31465247338760177, + "grad_norm": 1.6731507778167725, + "learning_rate": 8.775659824046921e-05, + "loss": 0.4705, + "step": 2010 + }, + { + "epoch": 0.31480901690670005, + "grad_norm": 1.2394781112670898, + "learning_rate": 8.774845226458131e-05, + "loss": 0.5712, + "step": 2011 + }, + { + "epoch": 0.3149655604257984, + "grad_norm": 1.5027315616607666, + "learning_rate": 8.774030628869338e-05, + "loss": 0.5186, + "step": 2012 + }, + { + "epoch": 0.3151221039448967, + "grad_norm": 1.9851617813110352, + "learning_rate": 8.773216031280548e-05, + "loss": 0.4999, + "step": 2013 + }, + { + "epoch": 0.315278647463995, + "grad_norm": 1.3970203399658203, + "learning_rate": 8.772401433691757e-05, + "loss": 0.5039, + "step": 2014 + }, + { + "epoch": 0.3154351909830933, + "grad_norm": 1.1210103034973145, + "learning_rate": 8.771586836102966e-05, + "loss": 0.4997, + "step": 2015 + }, + { + "epoch": 0.31559173450219163, + "grad_norm": 2.1712141036987305, + "learning_rate": 8.770772238514174e-05, + "loss": 0.8385, + "step": 2016 + }, + { + "epoch": 0.3157482780212899, + "grad_norm": 2.3497190475463867, + "learning_rate": 8.769957640925384e-05, + "loss": 0.5997, + "step": 2017 + }, + { + "epoch": 0.31590482154038824, + "grad_norm": 3.0846762657165527, + "learning_rate": 8.769143043336592e-05, + "loss": 0.9014, + "step": 2018 + }, + { + "epoch": 0.3160613650594865, + "grad_norm": 3.7818188667297363, + "learning_rate": 8.7683284457478e-05, + "loss": 1.0346, + "step": 2019 + }, + { + "epoch": 0.31621790857858484, + "grad_norm": 1.8717782497406006, + "learning_rate": 8.76751384815901e-05, + "loss": 0.8142, + "step": 2020 + }, + { + "epoch": 0.31637445209768317, + "grad_norm": 2.2463347911834717, + "learning_rate": 
8.766699250570219e-05, + "loss": 0.7582, + "step": 2021 + }, + { + "epoch": 0.31653099561678144, + "grad_norm": 2.015465497970581, + "learning_rate": 8.765884652981427e-05, + "loss": 1.2059, + "step": 2022 + }, + { + "epoch": 0.31668753913587977, + "grad_norm": 3.009493827819824, + "learning_rate": 8.765070055392637e-05, + "loss": 1.0705, + "step": 2023 + }, + { + "epoch": 0.3168440826549781, + "grad_norm": 1.1529815196990967, + "learning_rate": 8.764255457803845e-05, + "loss": 0.5959, + "step": 2024 + }, + { + "epoch": 0.31700062617407637, + "grad_norm": 3.4592642784118652, + "learning_rate": 8.763440860215054e-05, + "loss": 0.93, + "step": 2025 + }, + { + "epoch": 0.3171571696931747, + "grad_norm": 1.8674730062484741, + "learning_rate": 8.762626262626263e-05, + "loss": 0.8463, + "step": 2026 + }, + { + "epoch": 0.31731371321227303, + "grad_norm": 1.3837932348251343, + "learning_rate": 8.761811665037472e-05, + "loss": 0.5498, + "step": 2027 + }, + { + "epoch": 0.3174702567313713, + "grad_norm": 2.007528781890869, + "learning_rate": 8.760997067448681e-05, + "loss": 0.6735, + "step": 2028 + }, + { + "epoch": 0.31762680025046963, + "grad_norm": 2.0828633308410645, + "learning_rate": 8.76018246985989e-05, + "loss": 1.2003, + "step": 2029 + }, + { + "epoch": 0.31778334376956796, + "grad_norm": 2.5516321659088135, + "learning_rate": 8.759367872271098e-05, + "loss": 0.9383, + "step": 2030 + }, + { + "epoch": 0.31793988728866623, + "grad_norm": 2.059753179550171, + "learning_rate": 8.758553274682308e-05, + "loss": 0.7786, + "step": 2031 + }, + { + "epoch": 0.31809643080776456, + "grad_norm": 3.8051798343658447, + "learning_rate": 8.757738677093516e-05, + "loss": 1.0235, + "step": 2032 + }, + { + "epoch": 0.3182529743268629, + "grad_norm": 2.7131199836730957, + "learning_rate": 8.756924079504725e-05, + "loss": 1.149, + "step": 2033 + }, + { + "epoch": 0.31840951784596117, + "grad_norm": 3.2730836868286133, + "learning_rate": 8.756109481915934e-05, + "loss": 1.2082, + 
"step": 2034 + }, + { + "epoch": 0.3185660613650595, + "grad_norm": 3.7365362644195557, + "learning_rate": 8.755294884327143e-05, + "loss": 1.0404, + "step": 2035 + }, + { + "epoch": 0.3187226048841578, + "grad_norm": 1.9146473407745361, + "learning_rate": 8.754480286738351e-05, + "loss": 0.8964, + "step": 2036 + }, + { + "epoch": 0.3188791484032561, + "grad_norm": 8.080965042114258, + "learning_rate": 8.753665689149561e-05, + "loss": 1.478, + "step": 2037 + }, + { + "epoch": 0.3190356919223544, + "grad_norm": 2.4073383808135986, + "learning_rate": 8.752851091560769e-05, + "loss": 1.3149, + "step": 2038 + }, + { + "epoch": 0.3191922354414527, + "grad_norm": 1.7344671487808228, + "learning_rate": 8.752036493971978e-05, + "loss": 1.0844, + "step": 2039 + }, + { + "epoch": 0.319348778960551, + "grad_norm": 3.0369362831115723, + "learning_rate": 8.751221896383187e-05, + "loss": 1.6408, + "step": 2040 + }, + { + "epoch": 0.31950532247964936, + "grad_norm": 2.2310447692871094, + "learning_rate": 8.750407298794396e-05, + "loss": 1.6602, + "step": 2041 + }, + { + "epoch": 0.31966186599874763, + "grad_norm": 2.576904296875, + "learning_rate": 8.749592701205604e-05, + "loss": 1.4512, + "step": 2042 + }, + { + "epoch": 0.31981840951784596, + "grad_norm": 4.155710697174072, + "learning_rate": 8.748778103616814e-05, + "loss": 1.4358, + "step": 2043 + }, + { + "epoch": 0.3199749530369443, + "grad_norm": 3.336122989654541, + "learning_rate": 8.747963506028024e-05, + "loss": 1.4522, + "step": 2044 + }, + { + "epoch": 0.32013149655604256, + "grad_norm": 1.8603070974349976, + "learning_rate": 8.74714890843923e-05, + "loss": 0.8768, + "step": 2045 + }, + { + "epoch": 0.3202880400751409, + "grad_norm": 2.538658857345581, + "learning_rate": 8.74633431085044e-05, + "loss": 1.0337, + "step": 2046 + }, + { + "epoch": 0.3204445835942392, + "grad_norm": 3.2652199268341064, + "learning_rate": 8.74551971326165e-05, + "loss": 1.2476, + "step": 2047 + }, + { + "epoch": 0.3206011271133375, + 
"grad_norm": 3.060098171234131, + "learning_rate": 8.744705115672857e-05, + "loss": 1.4386, + "step": 2048 + }, + { + "epoch": 0.3207576706324358, + "grad_norm": 1.2018884420394897, + "learning_rate": 8.743890518084067e-05, + "loss": 0.8566, + "step": 2049 + }, + { + "epoch": 0.32091421415153415, + "grad_norm": 4.215924263000488, + "learning_rate": 8.743075920495277e-05, + "loss": 1.5058, + "step": 2050 + }, + { + "epoch": 0.3210707576706324, + "grad_norm": 0.7327137589454651, + "learning_rate": 8.742261322906485e-05, + "loss": 0.2832, + "step": 2051 + }, + { + "epoch": 0.32122730118973075, + "grad_norm": 0.47043535113334656, + "learning_rate": 8.741446725317693e-05, + "loss": 0.3165, + "step": 2052 + }, + { + "epoch": 0.3213838447088291, + "grad_norm": 1.0921357870101929, + "learning_rate": 8.740632127728903e-05, + "loss": 0.4851, + "step": 2053 + }, + { + "epoch": 0.32154038822792735, + "grad_norm": 4.582512855529785, + "learning_rate": 8.739817530140111e-05, + "loss": 0.4513, + "step": 2054 + }, + { + "epoch": 0.3216969317470257, + "grad_norm": 0.799677312374115, + "learning_rate": 8.73900293255132e-05, + "loss": 0.3912, + "step": 2055 + }, + { + "epoch": 0.32185347526612396, + "grad_norm": 0.7201223373413086, + "learning_rate": 8.73818833496253e-05, + "loss": 0.3759, + "step": 2056 + }, + { + "epoch": 0.3220100187852223, + "grad_norm": 1.3923983573913574, + "learning_rate": 8.737373737373738e-05, + "loss": 0.4499, + "step": 2057 + }, + { + "epoch": 0.3221665623043206, + "grad_norm": 1.33921217918396, + "learning_rate": 8.736559139784946e-05, + "loss": 0.4573, + "step": 2058 + }, + { + "epoch": 0.3223231058234189, + "grad_norm": 1.0516282320022583, + "learning_rate": 8.735744542196156e-05, + "loss": 0.5536, + "step": 2059 + }, + { + "epoch": 0.3224796493425172, + "grad_norm": 1.1176118850708008, + "learning_rate": 8.734929944607364e-05, + "loss": 0.5259, + "step": 2060 + }, + { + "epoch": 0.32263619286161555, + "grad_norm": 2.8193209171295166, + "learning_rate": 
8.734115347018573e-05, + "loss": 0.4979, + "step": 2061 + }, + { + "epoch": 0.3227927363807138, + "grad_norm": 1.0632636547088623, + "learning_rate": 8.733300749429782e-05, + "loss": 0.6072, + "step": 2062 + }, + { + "epoch": 0.32294927989981215, + "grad_norm": 1.8330038785934448, + "learning_rate": 8.732486151840991e-05, + "loss": 0.5786, + "step": 2063 + }, + { + "epoch": 0.3231058234189105, + "grad_norm": 1.102245569229126, + "learning_rate": 8.731671554252199e-05, + "loss": 0.5739, + "step": 2064 + }, + { + "epoch": 0.32326236693800875, + "grad_norm": 2.978999614715576, + "learning_rate": 8.730856956663409e-05, + "loss": 0.9743, + "step": 2065 + }, + { + "epoch": 0.3234189104571071, + "grad_norm": 1.517822027206421, + "learning_rate": 8.730042359074617e-05, + "loss": 0.3959, + "step": 2066 + }, + { + "epoch": 0.3235754539762054, + "grad_norm": 1.8682866096496582, + "learning_rate": 8.729227761485827e-05, + "loss": 0.527, + "step": 2067 + }, + { + "epoch": 0.3237319974953037, + "grad_norm": 2.374990940093994, + "learning_rate": 8.728413163897035e-05, + "loss": 0.7357, + "step": 2068 + }, + { + "epoch": 0.323888541014402, + "grad_norm": 1.1828864812850952, + "learning_rate": 8.727598566308244e-05, + "loss": 0.5035, + "step": 2069 + }, + { + "epoch": 0.32404508453350034, + "grad_norm": 1.851452112197876, + "learning_rate": 8.726783968719454e-05, + "loss": 0.6842, + "step": 2070 + }, + { + "epoch": 0.3242016280525986, + "grad_norm": 1.491336464881897, + "learning_rate": 8.725969371130662e-05, + "loss": 0.7285, + "step": 2071 + }, + { + "epoch": 0.32435817157169694, + "grad_norm": 2.051851511001587, + "learning_rate": 8.72515477354187e-05, + "loss": 0.7942, + "step": 2072 + }, + { + "epoch": 0.3245147150907952, + "grad_norm": 1.871109962463379, + "learning_rate": 8.72434017595308e-05, + "loss": 0.6523, + "step": 2073 + }, + { + "epoch": 0.32467125860989354, + "grad_norm": 2.001929521560669, + "learning_rate": 8.723525578364288e-05, + "loss": 0.7822, + "step": 2074 + 
}, + { + "epoch": 0.32482780212899187, + "grad_norm": 4.223846435546875, + "learning_rate": 8.722710980775497e-05, + "loss": 1.2995, + "step": 2075 + }, + { + "epoch": 0.32498434564809014, + "grad_norm": 1.357825756072998, + "learning_rate": 8.721896383186706e-05, + "loss": 0.6124, + "step": 2076 + }, + { + "epoch": 0.3251408891671885, + "grad_norm": 2.1204488277435303, + "learning_rate": 8.721081785597915e-05, + "loss": 0.6979, + "step": 2077 + }, + { + "epoch": 0.3252974326862868, + "grad_norm": 5.052711486816406, + "learning_rate": 8.720267188009123e-05, + "loss": 0.7346, + "step": 2078 + }, + { + "epoch": 0.3254539762053851, + "grad_norm": 3.639622688293457, + "learning_rate": 8.719452590420333e-05, + "loss": 0.7986, + "step": 2079 + }, + { + "epoch": 0.3256105197244834, + "grad_norm": 4.432771682739258, + "learning_rate": 8.718637992831543e-05, + "loss": 1.2309, + "step": 2080 + }, + { + "epoch": 0.32576706324358173, + "grad_norm": 1.5321617126464844, + "learning_rate": 8.71782339524275e-05, + "loss": 0.5419, + "step": 2081 + }, + { + "epoch": 0.32592360676268, + "grad_norm": 3.3234121799468994, + "learning_rate": 8.71700879765396e-05, + "loss": 1.4669, + "step": 2082 + }, + { + "epoch": 0.32608015028177834, + "grad_norm": 2.3526275157928467, + "learning_rate": 8.716194200065169e-05, + "loss": 1.211, + "step": 2083 + }, + { + "epoch": 0.32623669380087666, + "grad_norm": 3.1062750816345215, + "learning_rate": 8.715379602476376e-05, + "loss": 1.3878, + "step": 2084 + }, + { + "epoch": 0.32639323731997494, + "grad_norm": 2.094278573989868, + "learning_rate": 8.714565004887586e-05, + "loss": 0.6204, + "step": 2085 + }, + { + "epoch": 0.32654978083907327, + "grad_norm": 4.858953475952148, + "learning_rate": 8.713750407298796e-05, + "loss": 1.7536, + "step": 2086 + }, + { + "epoch": 0.3267063243581716, + "grad_norm": 2.720527410507202, + "learning_rate": 8.712935809710004e-05, + "loss": 1.0058, + "step": 2087 + }, + { + "epoch": 0.32686286787726987, + "grad_norm": 
4.125729084014893, + "learning_rate": 8.712121212121212e-05, + "loss": 1.5973, + "step": 2088 + }, + { + "epoch": 0.3270194113963682, + "grad_norm": 1.5061450004577637, + "learning_rate": 8.711306614532422e-05, + "loss": 0.685, + "step": 2089 + }, + { + "epoch": 0.3271759549154665, + "grad_norm": 4.343020915985107, + "learning_rate": 8.71049201694363e-05, + "loss": 1.3312, + "step": 2090 + }, + { + "epoch": 0.3273324984345648, + "grad_norm": 2.355247974395752, + "learning_rate": 8.709677419354839e-05, + "loss": 1.1916, + "step": 2091 + }, + { + "epoch": 0.32748904195366313, + "grad_norm": 8.003994941711426, + "learning_rate": 8.708862821766049e-05, + "loss": 1.0071, + "step": 2092 + }, + { + "epoch": 0.3276455854727614, + "grad_norm": 2.8104543685913086, + "learning_rate": 8.708048224177257e-05, + "loss": 1.823, + "step": 2093 + }, + { + "epoch": 0.32780212899185973, + "grad_norm": 7.04930305480957, + "learning_rate": 8.707233626588465e-05, + "loss": 1.3568, + "step": 2094 + }, + { + "epoch": 0.32795867251095806, + "grad_norm": 6.124314308166504, + "learning_rate": 8.706419028999675e-05, + "loss": 1.1884, + "step": 2095 + }, + { + "epoch": 0.32811521603005633, + "grad_norm": 6.5831403732299805, + "learning_rate": 8.705604431410883e-05, + "loss": 0.8941, + "step": 2096 + }, + { + "epoch": 0.32827175954915466, + "grad_norm": 2.401827096939087, + "learning_rate": 8.704789833822092e-05, + "loss": 0.5647, + "step": 2097 + }, + { + "epoch": 0.328428303068253, + "grad_norm": 4.983480453491211, + "learning_rate": 8.703975236233302e-05, + "loss": 1.1781, + "step": 2098 + }, + { + "epoch": 0.32858484658735126, + "grad_norm": 3.553452730178833, + "learning_rate": 8.70316063864451e-05, + "loss": 1.0959, + "step": 2099 + }, + { + "epoch": 0.3287413901064496, + "grad_norm": 3.40087890625, + "learning_rate": 8.702346041055718e-05, + "loss": 1.2252, + "step": 2100 + }, + { + "epoch": 0.3288979336255479, + "grad_norm": 0.8306208848953247, + "learning_rate": 8.701531443466928e-05, + 
"loss": 0.356, + "step": 2101 + }, + { + "epoch": 0.3290544771446462, + "grad_norm": 0.5926739573478699, + "learning_rate": 8.700716845878136e-05, + "loss": 0.3237, + "step": 2102 + }, + { + "epoch": 0.3292110206637445, + "grad_norm": 0.9827213287353516, + "learning_rate": 8.699902248289346e-05, + "loss": 0.4429, + "step": 2103 + }, + { + "epoch": 0.32936756418284285, + "grad_norm": 0.971408486366272, + "learning_rate": 8.699087650700555e-05, + "loss": 0.43, + "step": 2104 + }, + { + "epoch": 0.3295241077019411, + "grad_norm": 1.3380309343338013, + "learning_rate": 8.698273053111763e-05, + "loss": 0.4214, + "step": 2105 + }, + { + "epoch": 0.32968065122103946, + "grad_norm": 0.9952098727226257, + "learning_rate": 8.697458455522973e-05, + "loss": 0.5421, + "step": 2106 + }, + { + "epoch": 0.3298371947401378, + "grad_norm": 1.0120177268981934, + "learning_rate": 8.696643857934181e-05, + "loss": 0.4249, + "step": 2107 + }, + { + "epoch": 0.32999373825923606, + "grad_norm": 0.663582444190979, + "learning_rate": 8.69582926034539e-05, + "loss": 0.3802, + "step": 2108 + }, + { + "epoch": 0.3301502817783344, + "grad_norm": 1.0191502571105957, + "learning_rate": 8.695014662756599e-05, + "loss": 0.4324, + "step": 2109 + }, + { + "epoch": 0.33030682529743266, + "grad_norm": 0.9592987895011902, + "learning_rate": 8.694200065167807e-05, + "loss": 0.4589, + "step": 2110 + }, + { + "epoch": 0.330463368816531, + "grad_norm": 1.1550506353378296, + "learning_rate": 8.693385467579016e-05, + "loss": 0.5041, + "step": 2111 + }, + { + "epoch": 0.3306199123356293, + "grad_norm": 1.3807802200317383, + "learning_rate": 8.692570869990226e-05, + "loss": 0.4259, + "step": 2112 + }, + { + "epoch": 0.3307764558547276, + "grad_norm": 1.779161810874939, + "learning_rate": 8.691756272401434e-05, + "loss": 0.7218, + "step": 2113 + }, + { + "epoch": 0.3309329993738259, + "grad_norm": 2.4593405723571777, + "learning_rate": 8.690941674812642e-05, + "loss": 0.7198, + "step": 2114 + }, + { + "epoch": 
0.33108954289292425, + "grad_norm": 2.0291593074798584, + "learning_rate": 8.690127077223852e-05, + "loss": 0.8373, + "step": 2115 + }, + { + "epoch": 0.3312460864120225, + "grad_norm": 1.284353494644165, + "learning_rate": 8.689312479635062e-05, + "loss": 0.4805, + "step": 2116 + }, + { + "epoch": 0.33140262993112085, + "grad_norm": 1.4886727333068848, + "learning_rate": 8.688497882046269e-05, + "loss": 0.5389, + "step": 2117 + }, + { + "epoch": 0.3315591734502192, + "grad_norm": 2.4895145893096924, + "learning_rate": 8.687683284457479e-05, + "loss": 0.626, + "step": 2118 + }, + { + "epoch": 0.33171571696931745, + "grad_norm": 1.5577806234359741, + "learning_rate": 8.686868686868688e-05, + "loss": 0.5936, + "step": 2119 + }, + { + "epoch": 0.3318722604884158, + "grad_norm": 2.7910399436950684, + "learning_rate": 8.686054089279895e-05, + "loss": 0.6403, + "step": 2120 + }, + { + "epoch": 0.3320288040075141, + "grad_norm": 1.5100765228271484, + "learning_rate": 8.685239491691105e-05, + "loss": 0.4635, + "step": 2121 + }, + { + "epoch": 0.3321853475266124, + "grad_norm": 3.10421085357666, + "learning_rate": 8.684424894102315e-05, + "loss": 0.7375, + "step": 2122 + }, + { + "epoch": 0.3323418910457107, + "grad_norm": 2.310443639755249, + "learning_rate": 8.683610296513522e-05, + "loss": 0.5149, + "step": 2123 + }, + { + "epoch": 0.33249843456480904, + "grad_norm": 3.762582302093506, + "learning_rate": 8.682795698924732e-05, + "loss": 0.6385, + "step": 2124 + }, + { + "epoch": 0.3326549780839073, + "grad_norm": 4.222784996032715, + "learning_rate": 8.681981101335941e-05, + "loss": 1.0885, + "step": 2125 + }, + { + "epoch": 0.33281152160300564, + "grad_norm": 3.7152764797210693, + "learning_rate": 8.68116650374715e-05, + "loss": 1.1734, + "step": 2126 + }, + { + "epoch": 0.3329680651221039, + "grad_norm": 1.7080432176589966, + "learning_rate": 8.680351906158358e-05, + "loss": 0.5992, + "step": 2127 + }, + { + "epoch": 0.33312460864120225, + "grad_norm": 
1.5596426725387573, + "learning_rate": 8.679537308569568e-05, + "loss": 0.9728, + "step": 2128 + }, + { + "epoch": 0.3332811521603006, + "grad_norm": 3.2601191997528076, + "learning_rate": 8.678722710980776e-05, + "loss": 1.0052, + "step": 2129 + }, + { + "epoch": 0.33343769567939885, + "grad_norm": 2.1633460521698, + "learning_rate": 8.677908113391984e-05, + "loss": 0.8977, + "step": 2130 + }, + { + "epoch": 0.3335942391984972, + "grad_norm": 3.4698731899261475, + "learning_rate": 8.677093515803194e-05, + "loss": 0.7171, + "step": 2131 + }, + { + "epoch": 0.3337507827175955, + "grad_norm": 2.582594156265259, + "learning_rate": 8.676278918214403e-05, + "loss": 0.7755, + "step": 2132 + }, + { + "epoch": 0.3339073262366938, + "grad_norm": 1.6939363479614258, + "learning_rate": 8.675464320625611e-05, + "loss": 0.8226, + "step": 2133 + }, + { + "epoch": 0.3340638697557921, + "grad_norm": 5.04578161239624, + "learning_rate": 8.67464972303682e-05, + "loss": 0.6973, + "step": 2134 + }, + { + "epoch": 0.33422041327489044, + "grad_norm": 3.1464505195617676, + "learning_rate": 8.673835125448029e-05, + "loss": 0.9322, + "step": 2135 + }, + { + "epoch": 0.3343769567939887, + "grad_norm": 4.240480422973633, + "learning_rate": 8.673020527859237e-05, + "loss": 1.1724, + "step": 2136 + }, + { + "epoch": 0.33453350031308704, + "grad_norm": 3.0530552864074707, + "learning_rate": 8.672205930270447e-05, + "loss": 1.351, + "step": 2137 + }, + { + "epoch": 0.33469004383218537, + "grad_norm": 4.9708709716796875, + "learning_rate": 8.671391332681656e-05, + "loss": 1.3193, + "step": 2138 + }, + { + "epoch": 0.33484658735128364, + "grad_norm": 4.049728870391846, + "learning_rate": 8.670576735092865e-05, + "loss": 0.9826, + "step": 2139 + }, + { + "epoch": 0.33500313087038197, + "grad_norm": 1.7288857698440552, + "learning_rate": 8.669762137504074e-05, + "loss": 0.7935, + "step": 2140 + }, + { + "epoch": 0.3351596743894803, + "grad_norm": 4.30653190612793, + "learning_rate": 
8.668947539915282e-05, + "loss": 2.1096, + "step": 2141 + }, + { + "epoch": 0.3353162179085786, + "grad_norm": 4.318230152130127, + "learning_rate": 8.668132942326492e-05, + "loss": 1.2479, + "step": 2142 + }, + { + "epoch": 0.3354727614276769, + "grad_norm": 3.238116502761841, + "learning_rate": 8.6673183447377e-05, + "loss": 1.3598, + "step": 2143 + }, + { + "epoch": 0.33562930494677523, + "grad_norm": 3.3352150917053223, + "learning_rate": 8.666503747148908e-05, + "loss": 1.1509, + "step": 2144 + }, + { + "epoch": 0.3357858484658735, + "grad_norm": 5.889420509338379, + "learning_rate": 8.665689149560118e-05, + "loss": 1.401, + "step": 2145 + }, + { + "epoch": 0.33594239198497183, + "grad_norm": 5.421206474304199, + "learning_rate": 8.664874551971327e-05, + "loss": 0.7993, + "step": 2146 + }, + { + "epoch": 0.3360989355040701, + "grad_norm": 3.3103268146514893, + "learning_rate": 8.664059954382535e-05, + "loss": 0.9852, + "step": 2147 + }, + { + "epoch": 0.33625547902316844, + "grad_norm": 2.299802541732788, + "learning_rate": 8.663245356793745e-05, + "loss": 0.9351, + "step": 2148 + }, + { + "epoch": 0.33641202254226676, + "grad_norm": 4.058669567108154, + "learning_rate": 8.662430759204953e-05, + "loss": 0.921, + "step": 2149 + }, + { + "epoch": 0.33656856606136504, + "grad_norm": 3.966507911682129, + "learning_rate": 8.661616161616161e-05, + "loss": 1.0623, + "step": 2150 + }, + { + "epoch": 0.33672510958046337, + "grad_norm": 0.7651963233947754, + "learning_rate": 8.660801564027371e-05, + "loss": 0.3501, + "step": 2151 + }, + { + "epoch": 0.3368816530995617, + "grad_norm": 0.7096244692802429, + "learning_rate": 8.65998696643858e-05, + "loss": 0.3934, + "step": 2152 + }, + { + "epoch": 0.33703819661865997, + "grad_norm": 1.0906423330307007, + "learning_rate": 8.659172368849788e-05, + "loss": 0.3084, + "step": 2153 + }, + { + "epoch": 0.3371947401377583, + "grad_norm": 1.0769031047821045, + "learning_rate": 8.658357771260998e-05, + "loss": 0.3266, + "step": 
2154 + }, + { + "epoch": 0.3373512836568566, + "grad_norm": 0.892730712890625, + "learning_rate": 8.657543173672207e-05, + "loss": 0.4484, + "step": 2155 + }, + { + "epoch": 0.3375078271759549, + "grad_norm": 1.5157899856567383, + "learning_rate": 8.656728576083414e-05, + "loss": 0.4173, + "step": 2156 + }, + { + "epoch": 0.33766437069505323, + "grad_norm": 1.8251396417617798, + "learning_rate": 8.655913978494624e-05, + "loss": 0.5582, + "step": 2157 + }, + { + "epoch": 0.33782091421415156, + "grad_norm": 1.1860986948013306, + "learning_rate": 8.655099380905834e-05, + "loss": 0.4243, + "step": 2158 + }, + { + "epoch": 0.33797745773324983, + "grad_norm": 1.065689206123352, + "learning_rate": 8.654284783317041e-05, + "loss": 0.4714, + "step": 2159 + }, + { + "epoch": 0.33813400125234816, + "grad_norm": 1.4568936824798584, + "learning_rate": 8.65347018572825e-05, + "loss": 0.5739, + "step": 2160 + }, + { + "epoch": 0.3382905447714465, + "grad_norm": 0.7497048377990723, + "learning_rate": 8.65265558813946e-05, + "loss": 0.3496, + "step": 2161 + }, + { + "epoch": 0.33844708829054476, + "grad_norm": 1.5627093315124512, + "learning_rate": 8.651840990550669e-05, + "loss": 0.7836, + "step": 2162 + }, + { + "epoch": 0.3386036318096431, + "grad_norm": 2.4880125522613525, + "learning_rate": 8.651026392961877e-05, + "loss": 0.5186, + "step": 2163 + }, + { + "epoch": 0.33876017532874136, + "grad_norm": 2.2047536373138428, + "learning_rate": 8.650211795373087e-05, + "loss": 0.3736, + "step": 2164 + }, + { + "epoch": 0.3389167188478397, + "grad_norm": 1.079923391342163, + "learning_rate": 8.649397197784295e-05, + "loss": 0.4548, + "step": 2165 + }, + { + "epoch": 0.339073262366938, + "grad_norm": 1.9169459342956543, + "learning_rate": 8.648582600195504e-05, + "loss": 0.8702, + "step": 2166 + }, + { + "epoch": 0.3392298058860363, + "grad_norm": 1.5100040435791016, + "learning_rate": 8.647768002606713e-05, + "loss": 0.5956, + "step": 2167 + }, + { + "epoch": 0.3393863494051346, + 
"grad_norm": 1.5141657590866089, + "learning_rate": 8.646953405017922e-05, + "loss": 0.6242, + "step": 2168 + }, + { + "epoch": 0.33954289292423295, + "grad_norm": 1.39118230342865, + "learning_rate": 8.64613880742913e-05, + "loss": 0.7181, + "step": 2169 + }, + { + "epoch": 0.3396994364433312, + "grad_norm": 2.2996184825897217, + "learning_rate": 8.64532420984034e-05, + "loss": 0.8067, + "step": 2170 + }, + { + "epoch": 0.33985597996242956, + "grad_norm": 1.628635048866272, + "learning_rate": 8.644509612251548e-05, + "loss": 0.6251, + "step": 2171 + }, + { + "epoch": 0.3400125234815279, + "grad_norm": 1.8793636560440063, + "learning_rate": 8.643695014662757e-05, + "loss": 0.9134, + "step": 2172 + }, + { + "epoch": 0.34016906700062616, + "grad_norm": 1.794350028038025, + "learning_rate": 8.642880417073966e-05, + "loss": 0.6315, + "step": 2173 + }, + { + "epoch": 0.3403256105197245, + "grad_norm": 2.103614091873169, + "learning_rate": 8.642065819485175e-05, + "loss": 0.8792, + "step": 2174 + }, + { + "epoch": 0.3404821540388228, + "grad_norm": 2.3434531688690186, + "learning_rate": 8.641251221896384e-05, + "loss": 0.6553, + "step": 2175 + }, + { + "epoch": 0.3406386975579211, + "grad_norm": 2.448655605316162, + "learning_rate": 8.640436624307593e-05, + "loss": 1.1438, + "step": 2176 + }, + { + "epoch": 0.3407952410770194, + "grad_norm": 2.731719732284546, + "learning_rate": 8.639622026718801e-05, + "loss": 1.0782, + "step": 2177 + }, + { + "epoch": 0.34095178459611775, + "grad_norm": 2.7872471809387207, + "learning_rate": 8.638807429130011e-05, + "loss": 0.7843, + "step": 2178 + }, + { + "epoch": 0.341108328115216, + "grad_norm": 3.17773699760437, + "learning_rate": 8.637992831541219e-05, + "loss": 0.9005, + "step": 2179 + }, + { + "epoch": 0.34126487163431435, + "grad_norm": 5.604252338409424, + "learning_rate": 8.637178233952428e-05, + "loss": 1.0222, + "step": 2180 + }, + { + "epoch": 0.3414214151534126, + "grad_norm": 2.157769203186035, + "learning_rate": 
8.636363636363637e-05, + "loss": 0.8939, + "step": 2181 + }, + { + "epoch": 0.34157795867251095, + "grad_norm": 3.1177141666412354, + "learning_rate": 8.635549038774846e-05, + "loss": 1.0707, + "step": 2182 + }, + { + "epoch": 0.3417345021916093, + "grad_norm": 3.5270304679870605, + "learning_rate": 8.634734441186054e-05, + "loss": 1.1132, + "step": 2183 + }, + { + "epoch": 0.34189104571070755, + "grad_norm": 4.487339019775391, + "learning_rate": 8.633919843597264e-05, + "loss": 1.3019, + "step": 2184 + }, + { + "epoch": 0.3420475892298059, + "grad_norm": 3.1634695529937744, + "learning_rate": 8.633105246008472e-05, + "loss": 1.1047, + "step": 2185 + }, + { + "epoch": 0.3422041327489042, + "grad_norm": 3.9237680435180664, + "learning_rate": 8.63229064841968e-05, + "loss": 1.4243, + "step": 2186 + }, + { + "epoch": 0.3423606762680025, + "grad_norm": 3.2632253170013428, + "learning_rate": 8.63147605083089e-05, + "loss": 1.272, + "step": 2187 + }, + { + "epoch": 0.3425172197871008, + "grad_norm": 3.2364025115966797, + "learning_rate": 8.630661453242099e-05, + "loss": 1.5675, + "step": 2188 + }, + { + "epoch": 0.34267376330619914, + "grad_norm": 3.413972854614258, + "learning_rate": 8.629846855653307e-05, + "loss": 1.2725, + "step": 2189 + }, + { + "epoch": 0.3428303068252974, + "grad_norm": 5.479183197021484, + "learning_rate": 8.629032258064517e-05, + "loss": 1.4622, + "step": 2190 + }, + { + "epoch": 0.34298685034439574, + "grad_norm": 5.390576362609863, + "learning_rate": 8.628217660475727e-05, + "loss": 0.9201, + "step": 2191 + }, + { + "epoch": 0.3431433938634941, + "grad_norm": 2.891740560531616, + "learning_rate": 8.627403062886934e-05, + "loss": 1.1457, + "step": 2192 + }, + { + "epoch": 0.34329993738259235, + "grad_norm": 3.2900171279907227, + "learning_rate": 8.626588465298143e-05, + "loss": 1.5848, + "step": 2193 + }, + { + "epoch": 0.3434564809016907, + "grad_norm": 2.5981924533843994, + "learning_rate": 8.625773867709353e-05, + "loss": 1.0989, + "step": 
2194 + }, + { + "epoch": 0.343613024420789, + "grad_norm": 2.9454598426818848, + "learning_rate": 8.62495927012056e-05, + "loss": 0.952, + "step": 2195 + }, + { + "epoch": 0.3437695679398873, + "grad_norm": 2.85093092918396, + "learning_rate": 8.62414467253177e-05, + "loss": 1.6057, + "step": 2196 + }, + { + "epoch": 0.3439261114589856, + "grad_norm": 3.140848159790039, + "learning_rate": 8.62333007494298e-05, + "loss": 1.0378, + "step": 2197 + }, + { + "epoch": 0.34408265497808393, + "grad_norm": 2.146031379699707, + "learning_rate": 8.622515477354188e-05, + "loss": 0.8894, + "step": 2198 + }, + { + "epoch": 0.3442391984971822, + "grad_norm": 2.8449649810791016, + "learning_rate": 8.621700879765396e-05, + "loss": 0.8193, + "step": 2199 + }, + { + "epoch": 0.34439574201628054, + "grad_norm": 2.170389175415039, + "learning_rate": 8.620886282176606e-05, + "loss": 1.3095, + "step": 2200 + }, + { + "epoch": 0.3445522855353788, + "grad_norm": 0.5027259588241577, + "learning_rate": 8.620071684587814e-05, + "loss": 0.3665, + "step": 2201 + }, + { + "epoch": 0.34470882905447714, + "grad_norm": 0.8152298331260681, + "learning_rate": 8.619257086999023e-05, + "loss": 0.5214, + "step": 2202 + }, + { + "epoch": 0.34486537257357547, + "grad_norm": 0.9443122744560242, + "learning_rate": 8.618442489410232e-05, + "loss": 0.5234, + "step": 2203 + }, + { + "epoch": 0.34502191609267374, + "grad_norm": 0.750850260257721, + "learning_rate": 8.617627891821441e-05, + "loss": 0.3608, + "step": 2204 + }, + { + "epoch": 0.34517845961177207, + "grad_norm": 0.8489469289779663, + "learning_rate": 8.616813294232649e-05, + "loss": 0.4301, + "step": 2205 + }, + { + "epoch": 0.3453350031308704, + "grad_norm": 2.4425745010375977, + "learning_rate": 8.615998696643859e-05, + "loss": 0.3484, + "step": 2206 + }, + { + "epoch": 0.3454915466499687, + "grad_norm": 0.9429333209991455, + "learning_rate": 8.615184099055067e-05, + "loss": 0.4294, + "step": 2207 + }, + { + "epoch": 0.345648090169067, + 
"grad_norm": 1.3120408058166504, + "learning_rate": 8.614369501466276e-05, + "loss": 0.4755, + "step": 2208 + }, + { + "epoch": 0.34580463368816533, + "grad_norm": 1.4720733165740967, + "learning_rate": 8.613554903877485e-05, + "loss": 0.4656, + "step": 2209 + }, + { + "epoch": 0.3459611772072636, + "grad_norm": 1.280274510383606, + "learning_rate": 8.612740306288694e-05, + "loss": 0.5474, + "step": 2210 + }, + { + "epoch": 0.34611772072636193, + "grad_norm": 0.8267464637756348, + "learning_rate": 8.611925708699902e-05, + "loss": 0.3693, + "step": 2211 + }, + { + "epoch": 0.34627426424546026, + "grad_norm": 0.981605052947998, + "learning_rate": 8.611111111111112e-05, + "loss": 0.3602, + "step": 2212 + }, + { + "epoch": 0.34643080776455853, + "grad_norm": 0.9669626951217651, + "learning_rate": 8.61029651352232e-05, + "loss": 0.2754, + "step": 2213 + }, + { + "epoch": 0.34658735128365686, + "grad_norm": 1.3211981058120728, + "learning_rate": 8.60948191593353e-05, + "loss": 0.5425, + "step": 2214 + }, + { + "epoch": 0.3467438948027552, + "grad_norm": 3.0621390342712402, + "learning_rate": 8.608667318344738e-05, + "loss": 0.8349, + "step": 2215 + }, + { + "epoch": 0.34690043832185347, + "grad_norm": 1.1061192750930786, + "learning_rate": 8.607852720755947e-05, + "loss": 0.402, + "step": 2216 + }, + { + "epoch": 0.3470569818409518, + "grad_norm": 1.8544092178344727, + "learning_rate": 8.607038123167156e-05, + "loss": 0.9101, + "step": 2217 + }, + { + "epoch": 0.34721352536005007, + "grad_norm": 2.6814422607421875, + "learning_rate": 8.606223525578365e-05, + "loss": 0.5716, + "step": 2218 + }, + { + "epoch": 0.3473700688791484, + "grad_norm": 0.8424213528633118, + "learning_rate": 8.605408927989573e-05, + "loss": 0.4215, + "step": 2219 + }, + { + "epoch": 0.3475266123982467, + "grad_norm": 2.2635767459869385, + "learning_rate": 8.604594330400783e-05, + "loss": 0.5913, + "step": 2220 + }, + { + "epoch": 0.347683155917345, + "grad_norm": 1.6337071657180786, + 
"learning_rate": 8.603779732811991e-05, + "loss": 0.7259, + "step": 2221 + }, + { + "epoch": 0.34783969943644333, + "grad_norm": 1.4793334007263184, + "learning_rate": 8.6029651352232e-05, + "loss": 0.538, + "step": 2222 + }, + { + "epoch": 0.34799624295554166, + "grad_norm": 2.04260516166687, + "learning_rate": 8.60215053763441e-05, + "loss": 0.5442, + "step": 2223 + }, + { + "epoch": 0.34815278647463993, + "grad_norm": 1.8766299486160278, + "learning_rate": 8.601335940045618e-05, + "loss": 0.8102, + "step": 2224 + }, + { + "epoch": 0.34830932999373826, + "grad_norm": 2.885748863220215, + "learning_rate": 8.600521342456826e-05, + "loss": 0.718, + "step": 2225 + }, + { + "epoch": 0.3484658735128366, + "grad_norm": 1.3632066249847412, + "learning_rate": 8.599706744868036e-05, + "loss": 0.6193, + "step": 2226 + }, + { + "epoch": 0.34862241703193486, + "grad_norm": 2.860996723175049, + "learning_rate": 8.598892147279246e-05, + "loss": 0.7556, + "step": 2227 + }, + { + "epoch": 0.3487789605510332, + "grad_norm": 1.7054604291915894, + "learning_rate": 8.598077549690453e-05, + "loss": 0.5608, + "step": 2228 + }, + { + "epoch": 0.3489355040701315, + "grad_norm": 2.5315184593200684, + "learning_rate": 8.597262952101662e-05, + "loss": 0.9074, + "step": 2229 + }, + { + "epoch": 0.3490920475892298, + "grad_norm": 3.610147714614868, + "learning_rate": 8.596448354512872e-05, + "loss": 1.1111, + "step": 2230 + }, + { + "epoch": 0.3492485911083281, + "grad_norm": 3.0808944702148438, + "learning_rate": 8.595633756924079e-05, + "loss": 1.0008, + "step": 2231 + }, + { + "epoch": 0.34940513462742645, + "grad_norm": 2.4797070026397705, + "learning_rate": 8.594819159335289e-05, + "loss": 0.7429, + "step": 2232 + }, + { + "epoch": 0.3495616781465247, + "grad_norm": 3.2579941749572754, + "learning_rate": 8.594004561746499e-05, + "loss": 0.9853, + "step": 2233 + }, + { + "epoch": 0.34971822166562305, + "grad_norm": 2.065136671066284, + "learning_rate": 8.593189964157706e-05, + "loss": 
0.669, + "step": 2234 + }, + { + "epoch": 0.3498747651847213, + "grad_norm": 3.208768844604492, + "learning_rate": 8.592375366568915e-05, + "loss": 1.0245, + "step": 2235 + }, + { + "epoch": 0.35003130870381965, + "grad_norm": 5.428733825683594, + "learning_rate": 8.591560768980125e-05, + "loss": 1.3734, + "step": 2236 + }, + { + "epoch": 0.350187852222918, + "grad_norm": 4.584397315979004, + "learning_rate": 8.590746171391333e-05, + "loss": 0.9169, + "step": 2237 + }, + { + "epoch": 0.35034439574201626, + "grad_norm": 2.1845760345458984, + "learning_rate": 8.589931573802542e-05, + "loss": 1.1323, + "step": 2238 + }, + { + "epoch": 0.3505009392611146, + "grad_norm": 1.8042018413543701, + "learning_rate": 8.589116976213752e-05, + "loss": 1.1029, + "step": 2239 + }, + { + "epoch": 0.3506574827802129, + "grad_norm": 4.261810779571533, + "learning_rate": 8.58830237862496e-05, + "loss": 1.248, + "step": 2240 + }, + { + "epoch": 0.3508140262993112, + "grad_norm": 4.058051586151123, + "learning_rate": 8.587487781036168e-05, + "loss": 1.3374, + "step": 2241 + }, + { + "epoch": 0.3509705698184095, + "grad_norm": 2.998431444168091, + "learning_rate": 8.586673183447378e-05, + "loss": 1.5307, + "step": 2242 + }, + { + "epoch": 0.35112711333750785, + "grad_norm": 2.950613021850586, + "learning_rate": 8.585858585858586e-05, + "loss": 0.8733, + "step": 2243 + }, + { + "epoch": 0.3512836568566061, + "grad_norm": 3.2630057334899902, + "learning_rate": 8.585043988269795e-05, + "loss": 1.7096, + "step": 2244 + }, + { + "epoch": 0.35144020037570445, + "grad_norm": 4.64705228805542, + "learning_rate": 8.584229390681004e-05, + "loss": 1.0574, + "step": 2245 + }, + { + "epoch": 0.3515967438948028, + "grad_norm": 2.504232168197632, + "learning_rate": 8.583414793092213e-05, + "loss": 1.6011, + "step": 2246 + }, + { + "epoch": 0.35175328741390105, + "grad_norm": 2.7300121784210205, + "learning_rate": 8.582600195503421e-05, + "loss": 0.9964, + "step": 2247 + }, + { + "epoch": 
0.3519098309329994, + "grad_norm": 3.4807255268096924, + "learning_rate": 8.581785597914631e-05, + "loss": 0.8756, + "step": 2248 + }, + { + "epoch": 0.3520663744520977, + "grad_norm": 3.262986660003662, + "learning_rate": 8.58097100032584e-05, + "loss": 0.846, + "step": 2249 + }, + { + "epoch": 0.352222917971196, + "grad_norm": 3.0556986331939697, + "learning_rate": 8.580156402737049e-05, + "loss": 1.0668, + "step": 2250 + }, + { + "epoch": 0.3523794614902943, + "grad_norm": 0.7203706502914429, + "learning_rate": 8.579341805148257e-05, + "loss": 0.3345, + "step": 2251 + }, + { + "epoch": 0.3525360050093926, + "grad_norm": 0.655221700668335, + "learning_rate": 8.578527207559466e-05, + "loss": 0.281, + "step": 2252 + }, + { + "epoch": 0.3526925485284909, + "grad_norm": 0.8771984577178955, + "learning_rate": 8.577712609970676e-05, + "loss": 0.4176, + "step": 2253 + }, + { + "epoch": 0.35284909204758924, + "grad_norm": 1.1717931032180786, + "learning_rate": 8.576898012381884e-05, + "loss": 0.2697, + "step": 2254 + }, + { + "epoch": 0.3530056355666875, + "grad_norm": 0.8640686869621277, + "learning_rate": 8.576083414793092e-05, + "loss": 0.3767, + "step": 2255 + }, + { + "epoch": 0.35316217908578584, + "grad_norm": 0.6525649428367615, + "learning_rate": 8.575268817204302e-05, + "loss": 0.3112, + "step": 2256 + }, + { + "epoch": 0.3533187226048842, + "grad_norm": 0.7585033178329468, + "learning_rate": 8.57445421961551e-05, + "loss": 0.2916, + "step": 2257 + }, + { + "epoch": 0.35347526612398245, + "grad_norm": 1.6398924589157104, + "learning_rate": 8.573639622026719e-05, + "loss": 0.6892, + "step": 2258 + }, + { + "epoch": 0.3536318096430808, + "grad_norm": 1.4268845319747925, + "learning_rate": 8.572825024437929e-05, + "loss": 0.3306, + "step": 2259 + }, + { + "epoch": 0.3537883531621791, + "grad_norm": 1.082749605178833, + "learning_rate": 8.572010426849137e-05, + "loss": 0.4666, + "step": 2260 + }, + { + "epoch": 0.3539448966812774, + "grad_norm": 0.8330399394035339, 
+ "learning_rate": 8.571195829260345e-05, + "loss": 0.3443, + "step": 2261 + }, + { + "epoch": 0.3541014402003757, + "grad_norm": 1.4259440898895264, + "learning_rate": 8.570381231671555e-05, + "loss": 0.4228, + "step": 2262 + }, + { + "epoch": 0.35425798371947403, + "grad_norm": 0.997711181640625, + "learning_rate": 8.569566634082763e-05, + "loss": 0.3775, + "step": 2263 + }, + { + "epoch": 0.3544145272385723, + "grad_norm": 1.3412407636642456, + "learning_rate": 8.568752036493972e-05, + "loss": 0.5019, + "step": 2264 + }, + { + "epoch": 0.35457107075767064, + "grad_norm": 2.1021480560302734, + "learning_rate": 8.567937438905181e-05, + "loss": 0.4964, + "step": 2265 + }, + { + "epoch": 0.35472761427676897, + "grad_norm": 1.6874146461486816, + "learning_rate": 8.567122841316391e-05, + "loss": 0.5822, + "step": 2266 + }, + { + "epoch": 0.35488415779586724, + "grad_norm": 1.6612932682037354, + "learning_rate": 8.566308243727598e-05, + "loss": 0.9324, + "step": 2267 + }, + { + "epoch": 0.35504070131496557, + "grad_norm": 1.6773579120635986, + "learning_rate": 8.565493646138808e-05, + "loss": 0.5476, + "step": 2268 + }, + { + "epoch": 0.3551972448340639, + "grad_norm": 1.4900628328323364, + "learning_rate": 8.564679048550018e-05, + "loss": 0.7372, + "step": 2269 + }, + { + "epoch": 0.35535378835316217, + "grad_norm": 1.6895347833633423, + "learning_rate": 8.563864450961225e-05, + "loss": 0.6001, + "step": 2270 + }, + { + "epoch": 0.3555103318722605, + "grad_norm": 1.72140371799469, + "learning_rate": 8.563049853372434e-05, + "loss": 0.7538, + "step": 2271 + }, + { + "epoch": 0.35566687539135877, + "grad_norm": 3.05175518989563, + "learning_rate": 8.562235255783644e-05, + "loss": 0.6469, + "step": 2272 + }, + { + "epoch": 0.3558234189104571, + "grad_norm": 2.8240885734558105, + "learning_rate": 8.561420658194853e-05, + "loss": 0.5495, + "step": 2273 + }, + { + "epoch": 0.35597996242955543, + "grad_norm": 2.3170013427734375, + "learning_rate": 8.560606060606061e-05, + 
"loss": 1.1199, + "step": 2274 + }, + { + "epoch": 0.3561365059486537, + "grad_norm": 2.7356414794921875, + "learning_rate": 8.55979146301727e-05, + "loss": 0.8631, + "step": 2275 + }, + { + "epoch": 0.35629304946775203, + "grad_norm": 2.4377636909484863, + "learning_rate": 8.558976865428479e-05, + "loss": 1.0844, + "step": 2276 + }, + { + "epoch": 0.35644959298685036, + "grad_norm": 3.3142921924591064, + "learning_rate": 8.558162267839687e-05, + "loss": 1.2296, + "step": 2277 + }, + { + "epoch": 0.35660613650594863, + "grad_norm": 1.8071049451828003, + "learning_rate": 8.557347670250897e-05, + "loss": 0.5808, + "step": 2278 + }, + { + "epoch": 0.35676268002504696, + "grad_norm": 3.348052978515625, + "learning_rate": 8.556533072662106e-05, + "loss": 1.191, + "step": 2279 + }, + { + "epoch": 0.3569192235441453, + "grad_norm": 1.930125117301941, + "learning_rate": 8.555718475073314e-05, + "loss": 0.9748, + "step": 2280 + }, + { + "epoch": 0.35707576706324357, + "grad_norm": 4.609613418579102, + "learning_rate": 8.554903877484524e-05, + "loss": 1.3742, + "step": 2281 + }, + { + "epoch": 0.3572323105823419, + "grad_norm": 3.434202194213867, + "learning_rate": 8.554089279895732e-05, + "loss": 0.7279, + "step": 2282 + }, + { + "epoch": 0.3573888541014402, + "grad_norm": 2.384532928466797, + "learning_rate": 8.55327468230694e-05, + "loss": 1.0765, + "step": 2283 + }, + { + "epoch": 0.3575453976205385, + "grad_norm": 2.805187463760376, + "learning_rate": 8.55246008471815e-05, + "loss": 1.1427, + "step": 2284 + }, + { + "epoch": 0.3577019411396368, + "grad_norm": 2.8195481300354004, + "learning_rate": 8.551645487129358e-05, + "loss": 1.221, + "step": 2285 + }, + { + "epoch": 0.35785848465873515, + "grad_norm": 2.1555662155151367, + "learning_rate": 8.550830889540568e-05, + "loss": 1.5139, + "step": 2286 + }, + { + "epoch": 0.3580150281778334, + "grad_norm": 3.9756252765655518, + "learning_rate": 8.550016291951777e-05, + "loss": 1.5262, + "step": 2287 + }, + { + "epoch": 
0.35817157169693176, + "grad_norm": 3.108473539352417, + "learning_rate": 8.549201694362985e-05, + "loss": 1.1529, + "step": 2288 + }, + { + "epoch": 0.35832811521603003, + "grad_norm": 2.796553134918213, + "learning_rate": 8.548387096774195e-05, + "loss": 1.3819, + "step": 2289 + }, + { + "epoch": 0.35848465873512836, + "grad_norm": 2.954596519470215, + "learning_rate": 8.547572499185403e-05, + "loss": 1.0609, + "step": 2290 + }, + { + "epoch": 0.3586412022542267, + "grad_norm": 3.352926015853882, + "learning_rate": 8.546757901596611e-05, + "loss": 1.9551, + "step": 2291 + }, + { + "epoch": 0.35879774577332496, + "grad_norm": 6.4388580322265625, + "learning_rate": 8.545943304007821e-05, + "loss": 1.1108, + "step": 2292 + }, + { + "epoch": 0.3589542892924233, + "grad_norm": 2.0778164863586426, + "learning_rate": 8.54512870641903e-05, + "loss": 1.5618, + "step": 2293 + }, + { + "epoch": 0.3591108328115216, + "grad_norm": 2.799607992172241, + "learning_rate": 8.544314108830238e-05, + "loss": 1.4214, + "step": 2294 + }, + { + "epoch": 0.3592673763306199, + "grad_norm": 1.5568077564239502, + "learning_rate": 8.543499511241448e-05, + "loss": 1.3773, + "step": 2295 + }, + { + "epoch": 0.3594239198497182, + "grad_norm": 1.851014256477356, + "learning_rate": 8.542684913652656e-05, + "loss": 0.8319, + "step": 2296 + }, + { + "epoch": 0.35958046336881655, + "grad_norm": 3.9719626903533936, + "learning_rate": 8.541870316063864e-05, + "loss": 1.0484, + "step": 2297 + }, + { + "epoch": 0.3597370068879148, + "grad_norm": 1.767194151878357, + "learning_rate": 8.541055718475074e-05, + "loss": 0.8619, + "step": 2298 + }, + { + "epoch": 0.35989355040701315, + "grad_norm": 2.412625789642334, + "learning_rate": 8.540241120886282e-05, + "loss": 1.2365, + "step": 2299 + }, + { + "epoch": 0.3600500939261115, + "grad_norm": 2.4082891941070557, + "learning_rate": 8.539426523297491e-05, + "loss": 1.3915, + "step": 2300 + }, + { + "epoch": 0.36020663744520975, + "grad_norm": 
0.5844564437866211, + "learning_rate": 8.5386119257087e-05, + "loss": 0.3044, + "step": 2301 + }, + { + "epoch": 0.3603631809643081, + "grad_norm": 0.6608380675315857, + "learning_rate": 8.537797328119909e-05, + "loss": 0.463, + "step": 2302 + }, + { + "epoch": 0.3605197244834064, + "grad_norm": 0.8052379488945007, + "learning_rate": 8.536982730531117e-05, + "loss": 0.3961, + "step": 2303 + }, + { + "epoch": 0.3606762680025047, + "grad_norm": 1.7635958194732666, + "learning_rate": 8.536168132942327e-05, + "loss": 0.3753, + "step": 2304 + }, + { + "epoch": 0.360832811521603, + "grad_norm": 0.7758100628852844, + "learning_rate": 8.535353535353535e-05, + "loss": 0.3635, + "step": 2305 + }, + { + "epoch": 0.3609893550407013, + "grad_norm": 1.1044784784317017, + "learning_rate": 8.534538937764744e-05, + "loss": 0.3058, + "step": 2306 + }, + { + "epoch": 0.3611458985597996, + "grad_norm": 1.1746175289154053, + "learning_rate": 8.533724340175954e-05, + "loss": 0.3008, + "step": 2307 + }, + { + "epoch": 0.36130244207889795, + "grad_norm": 0.9880885481834412, + "learning_rate": 8.532909742587162e-05, + "loss": 0.4136, + "step": 2308 + }, + { + "epoch": 0.3614589855979962, + "grad_norm": 0.7127903699874878, + "learning_rate": 8.532095144998372e-05, + "loss": 0.395, + "step": 2309 + }, + { + "epoch": 0.36161552911709455, + "grad_norm": 11.1856107711792, + "learning_rate": 8.53128054740958e-05, + "loss": 1.5714, + "step": 2310 + }, + { + "epoch": 0.3617720726361929, + "grad_norm": 0.9463358521461487, + "learning_rate": 8.530465949820788e-05, + "loss": 0.3594, + "step": 2311 + }, + { + "epoch": 0.36192861615529115, + "grad_norm": 0.9890004992485046, + "learning_rate": 8.529651352231998e-05, + "loss": 0.4453, + "step": 2312 + }, + { + "epoch": 0.3620851596743895, + "grad_norm": 1.157268762588501, + "learning_rate": 8.528836754643207e-05, + "loss": 0.6087, + "step": 2313 + }, + { + "epoch": 0.3622417031934878, + "grad_norm": 1.7385170459747314, + "learning_rate": 
8.528022157054415e-05, + "loss": 0.6098, + "step": 2314 + }, + { + "epoch": 0.3623982467125861, + "grad_norm": 2.25040864944458, + "learning_rate": 8.527207559465625e-05, + "loss": 0.908, + "step": 2315 + }, + { + "epoch": 0.3625547902316844, + "grad_norm": 1.0793660879135132, + "learning_rate": 8.526392961876833e-05, + "loss": 0.6767, + "step": 2316 + }, + { + "epoch": 0.36271133375078274, + "grad_norm": 1.2808445692062378, + "learning_rate": 8.525578364288041e-05, + "loss": 0.4489, + "step": 2317 + }, + { + "epoch": 0.362867877269881, + "grad_norm": 1.494083285331726, + "learning_rate": 8.524763766699251e-05, + "loss": 0.609, + "step": 2318 + }, + { + "epoch": 0.36302442078897934, + "grad_norm": 2.3720285892486572, + "learning_rate": 8.52394916911046e-05, + "loss": 0.6672, + "step": 2319 + }, + { + "epoch": 0.36318096430807767, + "grad_norm": 1.4294685125350952, + "learning_rate": 8.523134571521668e-05, + "loss": 0.6509, + "step": 2320 + }, + { + "epoch": 0.36333750782717594, + "grad_norm": 1.3711622953414917, + "learning_rate": 8.522319973932878e-05, + "loss": 0.4843, + "step": 2321 + }, + { + "epoch": 0.36349405134627427, + "grad_norm": 1.8161498308181763, + "learning_rate": 8.521505376344086e-05, + "loss": 0.6854, + "step": 2322 + }, + { + "epoch": 0.3636505948653726, + "grad_norm": 1.6050763130187988, + "learning_rate": 8.520690778755294e-05, + "loss": 0.5186, + "step": 2323 + }, + { + "epoch": 0.3638071383844709, + "grad_norm": 1.984257698059082, + "learning_rate": 8.519876181166504e-05, + "loss": 0.623, + "step": 2324 + }, + { + "epoch": 0.3639636819035692, + "grad_norm": 3.877303123474121, + "learning_rate": 8.519061583577714e-05, + "loss": 0.8148, + "step": 2325 + }, + { + "epoch": 0.3641202254226675, + "grad_norm": 4.115382194519043, + "learning_rate": 8.518246985988921e-05, + "loss": 1.0586, + "step": 2326 + }, + { + "epoch": 0.3642767689417658, + "grad_norm": 1.9576081037521362, + "learning_rate": 8.51743238840013e-05, + "loss": 0.6433, + "step": 2327 
+ }, + { + "epoch": 0.36443331246086413, + "grad_norm": 1.6005738973617554, + "learning_rate": 8.51661779081134e-05, + "loss": 0.6937, + "step": 2328 + }, + { + "epoch": 0.3645898559799624, + "grad_norm": 3.7088332176208496, + "learning_rate": 8.515803193222547e-05, + "loss": 0.9454, + "step": 2329 + }, + { + "epoch": 0.36474639949906074, + "grad_norm": 1.6640735864639282, + "learning_rate": 8.514988595633757e-05, + "loss": 0.824, + "step": 2330 + }, + { + "epoch": 0.36490294301815906, + "grad_norm": 1.4374011754989624, + "learning_rate": 8.514173998044967e-05, + "loss": 0.421, + "step": 2331 + }, + { + "epoch": 0.36505948653725734, + "grad_norm": 2.4659008979797363, + "learning_rate": 8.513359400456175e-05, + "loss": 1.2343, + "step": 2332 + }, + { + "epoch": 0.36521603005635567, + "grad_norm": 5.4693756103515625, + "learning_rate": 8.512544802867384e-05, + "loss": 1.1576, + "step": 2333 + }, + { + "epoch": 0.365372573575454, + "grad_norm": 3.710590362548828, + "learning_rate": 8.511730205278593e-05, + "loss": 1.1715, + "step": 2334 + }, + { + "epoch": 0.36552911709455227, + "grad_norm": 7.145907402038574, + "learning_rate": 8.510915607689802e-05, + "loss": 0.8952, + "step": 2335 + }, + { + "epoch": 0.3656856606136506, + "grad_norm": 3.2192699909210205, + "learning_rate": 8.51010101010101e-05, + "loss": 1.2311, + "step": 2336 + }, + { + "epoch": 0.3658422041327489, + "grad_norm": 4.198760509490967, + "learning_rate": 8.50928641251222e-05, + "loss": 1.1391, + "step": 2337 + }, + { + "epoch": 0.3659987476518472, + "grad_norm": 8.997225761413574, + "learning_rate": 8.508471814923428e-05, + "loss": 1.3391, + "step": 2338 + }, + { + "epoch": 0.36615529117094553, + "grad_norm": 2.4636154174804688, + "learning_rate": 8.507657217334636e-05, + "loss": 0.9341, + "step": 2339 + }, + { + "epoch": 0.36631183469004386, + "grad_norm": 1.7005985975265503, + "learning_rate": 8.506842619745846e-05, + "loss": 1.0242, + "step": 2340 + }, + { + "epoch": 0.36646837820914213, + 
"grad_norm": 6.185354232788086, + "learning_rate": 8.506028022157055e-05, + "loss": 1.7764, + "step": 2341 + }, + { + "epoch": 0.36662492172824046, + "grad_norm": 1.9015063047409058, + "learning_rate": 8.505213424568263e-05, + "loss": 1.074, + "step": 2342 + }, + { + "epoch": 0.36678146524733873, + "grad_norm": 4.260881423950195, + "learning_rate": 8.504398826979473e-05, + "loss": 1.5953, + "step": 2343 + }, + { + "epoch": 0.36693800876643706, + "grad_norm": 3.085517406463623, + "learning_rate": 8.503584229390681e-05, + "loss": 1.2077, + "step": 2344 + }, + { + "epoch": 0.3670945522855354, + "grad_norm": 4.4408040046691895, + "learning_rate": 8.502769631801891e-05, + "loss": 1.7539, + "step": 2345 + }, + { + "epoch": 0.36725109580463366, + "grad_norm": 3.342402696609497, + "learning_rate": 8.501955034213099e-05, + "loss": 1.3047, + "step": 2346 + }, + { + "epoch": 0.367407639323732, + "grad_norm": 1.4930315017700195, + "learning_rate": 8.501140436624308e-05, + "loss": 0.6562, + "step": 2347 + }, + { + "epoch": 0.3675641828428303, + "grad_norm": 1.2134774923324585, + "learning_rate": 8.500325839035517e-05, + "loss": 0.7433, + "step": 2348 + }, + { + "epoch": 0.3677207263619286, + "grad_norm": 4.32563591003418, + "learning_rate": 8.499511241446726e-05, + "loss": 1.6138, + "step": 2349 + }, + { + "epoch": 0.3678772698810269, + "grad_norm": 3.1536569595336914, + "learning_rate": 8.498696643857934e-05, + "loss": 0.845, + "step": 2350 + }, + { + "epoch": 0.36803381340012525, + "grad_norm": 1.4156365394592285, + "learning_rate": 8.497882046269144e-05, + "loss": 0.3549, + "step": 2351 + }, + { + "epoch": 0.3681903569192235, + "grad_norm": 0.7551148533821106, + "learning_rate": 8.497067448680352e-05, + "loss": 0.3154, + "step": 2352 + }, + { + "epoch": 0.36834690043832186, + "grad_norm": 0.8448994159698486, + "learning_rate": 8.49625285109156e-05, + "loss": 0.2968, + "step": 2353 + }, + { + "epoch": 0.3685034439574202, + "grad_norm": 0.5138726830482483, + "learning_rate": 
8.49543825350277e-05, + "loss": 0.2312, + "step": 2354 + }, + { + "epoch": 0.36865998747651846, + "grad_norm": 0.780196487903595, + "learning_rate": 8.494623655913979e-05, + "loss": 0.3787, + "step": 2355 + }, + { + "epoch": 0.3688165309956168, + "grad_norm": 1.5466417074203491, + "learning_rate": 8.493809058325187e-05, + "loss": 0.4123, + "step": 2356 + }, + { + "epoch": 0.3689730745147151, + "grad_norm": 1.3150808811187744, + "learning_rate": 8.492994460736397e-05, + "loss": 0.4988, + "step": 2357 + }, + { + "epoch": 0.3691296180338134, + "grad_norm": 0.9680297374725342, + "learning_rate": 8.492179863147605e-05, + "loss": 0.3358, + "step": 2358 + }, + { + "epoch": 0.3692861615529117, + "grad_norm": 1.531290888786316, + "learning_rate": 8.491365265558813e-05, + "loss": 0.612, + "step": 2359 + }, + { + "epoch": 0.36944270507201, + "grad_norm": 1.7034754753112793, + "learning_rate": 8.490550667970023e-05, + "loss": 0.4671, + "step": 2360 + }, + { + "epoch": 0.3695992485911083, + "grad_norm": 2.298607110977173, + "learning_rate": 8.489736070381233e-05, + "loss": 0.6354, + "step": 2361 + }, + { + "epoch": 0.36975579211020665, + "grad_norm": 1.3246272802352905, + "learning_rate": 8.48892147279244e-05, + "loss": 0.3877, + "step": 2362 + }, + { + "epoch": 0.3699123356293049, + "grad_norm": 1.0996886491775513, + "learning_rate": 8.48810687520365e-05, + "loss": 0.5832, + "step": 2363 + }, + { + "epoch": 0.37006887914840325, + "grad_norm": 1.3181322813034058, + "learning_rate": 8.48729227761486e-05, + "loss": 0.4033, + "step": 2364 + }, + { + "epoch": 0.3702254226675016, + "grad_norm": 1.8365288972854614, + "learning_rate": 8.486477680026066e-05, + "loss": 0.7704, + "step": 2365 + }, + { + "epoch": 0.37038196618659985, + "grad_norm": 1.1043983697891235, + "learning_rate": 8.485663082437276e-05, + "loss": 0.4033, + "step": 2366 + }, + { + "epoch": 0.3705385097056982, + "grad_norm": 1.7805801630020142, + "learning_rate": 8.484848484848486e-05, + "loss": 0.7143, + "step": 2367 
+ }, + { + "epoch": 0.3706950532247965, + "grad_norm": 1.395138144493103, + "learning_rate": 8.484033887259694e-05, + "loss": 0.5489, + "step": 2368 + }, + { + "epoch": 0.3708515967438948, + "grad_norm": 2.380077838897705, + "learning_rate": 8.483219289670903e-05, + "loss": 0.7368, + "step": 2369 + }, + { + "epoch": 0.3710081402629931, + "grad_norm": 3.499084711074829, + "learning_rate": 8.482404692082112e-05, + "loss": 1.098, + "step": 2370 + }, + { + "epoch": 0.37116468378209144, + "grad_norm": 3.5124149322509766, + "learning_rate": 8.481590094493321e-05, + "loss": 0.618, + "step": 2371 + }, + { + "epoch": 0.3713212273011897, + "grad_norm": 1.9400047063827515, + "learning_rate": 8.480775496904529e-05, + "loss": 0.9708, + "step": 2372 + }, + { + "epoch": 0.37147777082028804, + "grad_norm": 4.111591339111328, + "learning_rate": 8.479960899315739e-05, + "loss": 0.7265, + "step": 2373 + }, + { + "epoch": 0.3716343143393864, + "grad_norm": 6.212151527404785, + "learning_rate": 8.479146301726947e-05, + "loss": 0.7051, + "step": 2374 + }, + { + "epoch": 0.37179085785848465, + "grad_norm": 1.6426112651824951, + "learning_rate": 8.478331704138156e-05, + "loss": 0.7386, + "step": 2375 + }, + { + "epoch": 0.371947401377583, + "grad_norm": 2.2969727516174316, + "learning_rate": 8.477517106549365e-05, + "loss": 0.5899, + "step": 2376 + }, + { + "epoch": 0.3721039448966813, + "grad_norm": 1.9495407342910767, + "learning_rate": 8.476702508960574e-05, + "loss": 0.6756, + "step": 2377 + }, + { + "epoch": 0.3722604884157796, + "grad_norm": 4.209596157073975, + "learning_rate": 8.475887911371782e-05, + "loss": 0.867, + "step": 2378 + }, + { + "epoch": 0.3724170319348779, + "grad_norm": 4.069092750549316, + "learning_rate": 8.475073313782992e-05, + "loss": 0.6998, + "step": 2379 + }, + { + "epoch": 0.3725735754539762, + "grad_norm": 2.7914535999298096, + "learning_rate": 8.4742587161942e-05, + "loss": 0.7096, + "step": 2380 + }, + { + "epoch": 0.3727301189730745, + "grad_norm": 
2.3799538612365723, + "learning_rate": 8.473444118605409e-05, + "loss": 1.0392, + "step": 2381 + }, + { + "epoch": 0.37288666249217284, + "grad_norm": 5.156087398529053, + "learning_rate": 8.472629521016618e-05, + "loss": 0.8696, + "step": 2382 + }, + { + "epoch": 0.3730432060112711, + "grad_norm": 2.1128573417663574, + "learning_rate": 8.471814923427827e-05, + "loss": 0.6825, + "step": 2383 + }, + { + "epoch": 0.37319974953036944, + "grad_norm": 3.820441246032715, + "learning_rate": 8.471000325839036e-05, + "loss": 0.9742, + "step": 2384 + }, + { + "epoch": 0.37335629304946777, + "grad_norm": 7.054225921630859, + "learning_rate": 8.470185728250245e-05, + "loss": 1.6708, + "step": 2385 + }, + { + "epoch": 0.37351283656856604, + "grad_norm": 3.6152474880218506, + "learning_rate": 8.469371130661453e-05, + "loss": 0.8605, + "step": 2386 + }, + { + "epoch": 0.37366938008766437, + "grad_norm": 5.8458051681518555, + "learning_rate": 8.468556533072663e-05, + "loss": 1.1232, + "step": 2387 + }, + { + "epoch": 0.3738259236067627, + "grad_norm": 3.293506145477295, + "learning_rate": 8.467741935483871e-05, + "loss": 1.3209, + "step": 2388 + }, + { + "epoch": 0.373982467125861, + "grad_norm": 3.3236336708068848, + "learning_rate": 8.46692733789508e-05, + "loss": 1.5399, + "step": 2389 + }, + { + "epoch": 0.3741390106449593, + "grad_norm": 3.3099591732025146, + "learning_rate": 8.46611274030629e-05, + "loss": 0.8291, + "step": 2390 + }, + { + "epoch": 0.37429555416405763, + "grad_norm": 2.7506790161132812, + "learning_rate": 8.465298142717498e-05, + "loss": 1.2425, + "step": 2391 + }, + { + "epoch": 0.3744520976831559, + "grad_norm": 4.721302032470703, + "learning_rate": 8.464483545128706e-05, + "loss": 1.5479, + "step": 2392 + }, + { + "epoch": 0.37460864120225423, + "grad_norm": 2.190546989440918, + "learning_rate": 8.463668947539916e-05, + "loss": 1.2934, + "step": 2393 + }, + { + "epoch": 0.37476518472135256, + "grad_norm": 2.910036325454712, + "learning_rate": 
8.462854349951124e-05, + "loss": 1.4126, + "step": 2394 + }, + { + "epoch": 0.37492172824045084, + "grad_norm": 2.9838707447052, + "learning_rate": 8.462039752362333e-05, + "loss": 1.1514, + "step": 2395 + }, + { + "epoch": 0.37507827175954916, + "grad_norm": 2.903693675994873, + "learning_rate": 8.461225154773542e-05, + "loss": 1.2217, + "step": 2396 + }, + { + "epoch": 0.37523481527864744, + "grad_norm": 1.8571443557739258, + "learning_rate": 8.460410557184752e-05, + "loss": 0.6441, + "step": 2397 + }, + { + "epoch": 0.37539135879774577, + "grad_norm": 4.751452922821045, + "learning_rate": 8.459595959595959e-05, + "loss": 0.66, + "step": 2398 + }, + { + "epoch": 0.3755479023168441, + "grad_norm": 3.9615116119384766, + "learning_rate": 8.458781362007169e-05, + "loss": 1.3128, + "step": 2399 + }, + { + "epoch": 0.37570444583594237, + "grad_norm": 2.348771572113037, + "learning_rate": 8.457966764418379e-05, + "loss": 1.0358, + "step": 2400 + }, + { + "epoch": 0.3758609893550407, + "grad_norm": 0.7909599542617798, + "learning_rate": 8.457152166829586e-05, + "loss": 0.3544, + "step": 2401 + }, + { + "epoch": 0.376017532874139, + "grad_norm": 0.8848404884338379, + "learning_rate": 8.456337569240795e-05, + "loss": 0.3649, + "step": 2402 + }, + { + "epoch": 0.3761740763932373, + "grad_norm": 1.0034829378128052, + "learning_rate": 8.455522971652005e-05, + "loss": 0.4679, + "step": 2403 + }, + { + "epoch": 0.37633061991233563, + "grad_norm": 1.2943195104599, + "learning_rate": 8.454708374063213e-05, + "loss": 0.6452, + "step": 2404 + }, + { + "epoch": 0.37648716343143396, + "grad_norm": 0.8271375894546509, + "learning_rate": 8.453893776474422e-05, + "loss": 0.4221, + "step": 2405 + }, + { + "epoch": 0.37664370695053223, + "grad_norm": 0.8222814202308655, + "learning_rate": 8.453079178885631e-05, + "loss": 0.4539, + "step": 2406 + }, + { + "epoch": 0.37680025046963056, + "grad_norm": 1.0666009187698364, + "learning_rate": 8.45226458129684e-05, + "loss": 0.369, + "step": 
2407 + }, + { + "epoch": 0.3769567939887289, + "grad_norm": 1.276857614517212, + "learning_rate": 8.451449983708048e-05, + "loss": 0.5746, + "step": 2408 + }, + { + "epoch": 0.37711333750782716, + "grad_norm": 1.6762863397598267, + "learning_rate": 8.450635386119258e-05, + "loss": 0.407, + "step": 2409 + }, + { + "epoch": 0.3772698810269255, + "grad_norm": 1.189470648765564, + "learning_rate": 8.449820788530466e-05, + "loss": 0.4727, + "step": 2410 + }, + { + "epoch": 0.3774264245460238, + "grad_norm": 1.7212421894073486, + "learning_rate": 8.449006190941675e-05, + "loss": 0.6244, + "step": 2411 + }, + { + "epoch": 0.3775829680651221, + "grad_norm": 1.6164180040359497, + "learning_rate": 8.448191593352884e-05, + "loss": 0.6327, + "step": 2412 + }, + { + "epoch": 0.3777395115842204, + "grad_norm": 1.1293511390686035, + "learning_rate": 8.447376995764093e-05, + "loss": 0.4434, + "step": 2413 + }, + { + "epoch": 0.3778960551033187, + "grad_norm": 1.4631608724594116, + "learning_rate": 8.446562398175301e-05, + "loss": 0.4996, + "step": 2414 + }, + { + "epoch": 0.378052598622417, + "grad_norm": 1.4019782543182373, + "learning_rate": 8.445747800586511e-05, + "loss": 0.7313, + "step": 2415 + }, + { + "epoch": 0.37820914214151535, + "grad_norm": 1.48589026927948, + "learning_rate": 8.444933202997719e-05, + "loss": 0.592, + "step": 2416 + }, + { + "epoch": 0.3783656856606136, + "grad_norm": 1.9068306684494019, + "learning_rate": 8.444118605408928e-05, + "loss": 0.5528, + "step": 2417 + }, + { + "epoch": 0.37852222917971196, + "grad_norm": 2.619640827178955, + "learning_rate": 8.443304007820137e-05, + "loss": 0.6782, + "step": 2418 + }, + { + "epoch": 0.3786787726988103, + "grad_norm": 1.3305810689926147, + "learning_rate": 8.442489410231346e-05, + "loss": 0.5092, + "step": 2419 + }, + { + "epoch": 0.37883531621790856, + "grad_norm": 3.934696674346924, + "learning_rate": 8.441674812642555e-05, + "loss": 0.7523, + "step": 2420 + }, + { + "epoch": 0.3789918597370069, + 
"grad_norm": 4.578168869018555, + "learning_rate": 8.440860215053764e-05, + "loss": 0.7708, + "step": 2421 + }, + { + "epoch": 0.3791484032561052, + "grad_norm": 2.5810563564300537, + "learning_rate": 8.440045617464972e-05, + "loss": 0.7345, + "step": 2422 + }, + { + "epoch": 0.3793049467752035, + "grad_norm": 1.8746509552001953, + "learning_rate": 8.439231019876182e-05, + "loss": 0.7604, + "step": 2423 + }, + { + "epoch": 0.3794614902943018, + "grad_norm": 5.869009494781494, + "learning_rate": 8.43841642228739e-05, + "loss": 1.3347, + "step": 2424 + }, + { + "epoch": 0.37961803381340015, + "grad_norm": 2.014249563217163, + "learning_rate": 8.437601824698599e-05, + "loss": 0.7247, + "step": 2425 + }, + { + "epoch": 0.3797745773324984, + "grad_norm": 3.485337257385254, + "learning_rate": 8.436787227109808e-05, + "loss": 0.9453, + "step": 2426 + }, + { + "epoch": 0.37993112085159675, + "grad_norm": 2.424673318862915, + "learning_rate": 8.435972629521017e-05, + "loss": 0.8665, + "step": 2427 + }, + { + "epoch": 0.3800876643706951, + "grad_norm": 1.8328760862350464, + "learning_rate": 8.435158031932225e-05, + "loss": 0.429, + "step": 2428 + }, + { + "epoch": 0.38024420788979335, + "grad_norm": 2.3923392295837402, + "learning_rate": 8.434343434343435e-05, + "loss": 0.9974, + "step": 2429 + }, + { + "epoch": 0.3804007514088917, + "grad_norm": 4.444287300109863, + "learning_rate": 8.433528836754643e-05, + "loss": 1.0113, + "step": 2430 + }, + { + "epoch": 0.38055729492799, + "grad_norm": 2.2732949256896973, + "learning_rate": 8.432714239165852e-05, + "loss": 1.0011, + "step": 2431 + }, + { + "epoch": 0.3807138384470883, + "grad_norm": 5.4124250411987305, + "learning_rate": 8.431899641577061e-05, + "loss": 0.9508, + "step": 2432 + }, + { + "epoch": 0.3808703819661866, + "grad_norm": 1.8403764963150024, + "learning_rate": 8.431085043988271e-05, + "loss": 1.1074, + "step": 2433 + }, + { + "epoch": 0.3810269254852849, + "grad_norm": 2.050846576690674, + "learning_rate": 
8.430270446399478e-05, + "loss": 0.6564, + "step": 2434 + }, + { + "epoch": 0.3811834690043832, + "grad_norm": 4.402859210968018, + "learning_rate": 8.429455848810688e-05, + "loss": 1.1449, + "step": 2435 + }, + { + "epoch": 0.38134001252348154, + "grad_norm": 2.7183239459991455, + "learning_rate": 8.428641251221898e-05, + "loss": 0.8285, + "step": 2436 + }, + { + "epoch": 0.3814965560425798, + "grad_norm": 2.7535314559936523, + "learning_rate": 8.427826653633105e-05, + "loss": 0.938, + "step": 2437 + }, + { + "epoch": 0.38165309956167814, + "grad_norm": 4.436733245849609, + "learning_rate": 8.427012056044314e-05, + "loss": 1.5729, + "step": 2438 + }, + { + "epoch": 0.3818096430807765, + "grad_norm": 2.694502353668213, + "learning_rate": 8.426197458455524e-05, + "loss": 1.4034, + "step": 2439 + }, + { + "epoch": 0.38196618659987475, + "grad_norm": 1.7004159688949585, + "learning_rate": 8.425382860866731e-05, + "loss": 1.2815, + "step": 2440 + }, + { + "epoch": 0.3821227301189731, + "grad_norm": 3.749476909637451, + "learning_rate": 8.424568263277941e-05, + "loss": 1.1638, + "step": 2441 + }, + { + "epoch": 0.3822792736380714, + "grad_norm": 2.0231194496154785, + "learning_rate": 8.42375366568915e-05, + "loss": 0.9678, + "step": 2442 + }, + { + "epoch": 0.3824358171571697, + "grad_norm": 3.7617945671081543, + "learning_rate": 8.422939068100359e-05, + "loss": 1.3572, + "step": 2443 + }, + { + "epoch": 0.382592360676268, + "grad_norm": 2.4181857109069824, + "learning_rate": 8.422124470511567e-05, + "loss": 1.5169, + "step": 2444 + }, + { + "epoch": 0.38274890419536634, + "grad_norm": 6.7129106521606445, + "learning_rate": 8.421309872922777e-05, + "loss": 1.3423, + "step": 2445 + }, + { + "epoch": 0.3829054477144646, + "grad_norm": 3.1849403381347656, + "learning_rate": 8.420495275333985e-05, + "loss": 0.9978, + "step": 2446 + }, + { + "epoch": 0.38306199123356294, + "grad_norm": 2.526445150375366, + "learning_rate": 8.419680677745194e-05, + "loss": 1.1577, + "step": 
2447 + }, + { + "epoch": 0.38321853475266127, + "grad_norm": 2.341017484664917, + "learning_rate": 8.418866080156404e-05, + "loss": 0.6096, + "step": 2448 + }, + { + "epoch": 0.38337507827175954, + "grad_norm": 1.6737663745880127, + "learning_rate": 8.418051482567612e-05, + "loss": 0.9645, + "step": 2449 + }, + { + "epoch": 0.38353162179085787, + "grad_norm": 2.1398401260375977, + "learning_rate": 8.41723688497882e-05, + "loss": 1.506, + "step": 2450 + }, + { + "epoch": 0.38368816530995614, + "grad_norm": 0.5184167623519897, + "learning_rate": 8.41642228739003e-05, + "loss": 0.3473, + "step": 2451 + }, + { + "epoch": 0.38384470882905447, + "grad_norm": 0.5970902442932129, + "learning_rate": 8.415607689801238e-05, + "loss": 0.4027, + "step": 2452 + }, + { + "epoch": 0.3840012523481528, + "grad_norm": 0.6473549604415894, + "learning_rate": 8.414793092212447e-05, + "loss": 0.3151, + "step": 2453 + }, + { + "epoch": 0.3841577958672511, + "grad_norm": 0.9271225929260254, + "learning_rate": 8.413978494623657e-05, + "loss": 0.4761, + "step": 2454 + }, + { + "epoch": 0.3843143393863494, + "grad_norm": 0.6827710866928101, + "learning_rate": 8.413163897034865e-05, + "loss": 0.3928, + "step": 2455 + }, + { + "epoch": 0.38447088290544773, + "grad_norm": 1.0095326900482178, + "learning_rate": 8.412349299446075e-05, + "loss": 0.3833, + "step": 2456 + }, + { + "epoch": 0.384627426424546, + "grad_norm": 1.676073431968689, + "learning_rate": 8.411534701857283e-05, + "loss": 0.5571, + "step": 2457 + }, + { + "epoch": 0.38478396994364433, + "grad_norm": 1.26688551902771, + "learning_rate": 8.410720104268491e-05, + "loss": 0.4707, + "step": 2458 + }, + { + "epoch": 0.38494051346274266, + "grad_norm": 1.0373142957687378, + "learning_rate": 8.409905506679701e-05, + "loss": 0.3362, + "step": 2459 + }, + { + "epoch": 0.38509705698184094, + "grad_norm": 1.8283287286758423, + "learning_rate": 8.40909090909091e-05, + "loss": 0.4388, + "step": 2460 + }, + { + "epoch": 0.38525360050093926, + 
"grad_norm": 1.3188328742980957, + "learning_rate": 8.408276311502118e-05, + "loss": 0.4349, + "step": 2461 + }, + { + "epoch": 0.3854101440200376, + "grad_norm": 0.795573890209198, + "learning_rate": 8.407461713913328e-05, + "loss": 0.3109, + "step": 2462 + }, + { + "epoch": 0.38556668753913587, + "grad_norm": 1.3712382316589355, + "learning_rate": 8.406647116324536e-05, + "loss": 0.5034, + "step": 2463 + }, + { + "epoch": 0.3857232310582342, + "grad_norm": 1.3913567066192627, + "learning_rate": 8.405832518735744e-05, + "loss": 0.5555, + "step": 2464 + }, + { + "epoch": 0.3858797745773325, + "grad_norm": 2.661825180053711, + "learning_rate": 8.405017921146954e-05, + "loss": 0.7569, + "step": 2465 + }, + { + "epoch": 0.3860363180964308, + "grad_norm": 1.8753658533096313, + "learning_rate": 8.404203323558162e-05, + "loss": 0.589, + "step": 2466 + }, + { + "epoch": 0.3861928616155291, + "grad_norm": 1.7687991857528687, + "learning_rate": 8.403388725969371e-05, + "loss": 0.3821, + "step": 2467 + }, + { + "epoch": 0.3863494051346274, + "grad_norm": 1.0938034057617188, + "learning_rate": 8.40257412838058e-05, + "loss": 0.4295, + "step": 2468 + }, + { + "epoch": 0.38650594865372573, + "grad_norm": 2.499660015106201, + "learning_rate": 8.401759530791789e-05, + "loss": 0.7218, + "step": 2469 + }, + { + "epoch": 0.38666249217282406, + "grad_norm": 1.723483920097351, + "learning_rate": 8.400944933202997e-05, + "loss": 0.4463, + "step": 2470 + }, + { + "epoch": 0.38681903569192233, + "grad_norm": 1.5120686292648315, + "learning_rate": 8.400130335614207e-05, + "loss": 0.6959, + "step": 2471 + }, + { + "epoch": 0.38697557921102066, + "grad_norm": 7.68450403213501, + "learning_rate": 8.399315738025417e-05, + "loss": 0.8405, + "step": 2472 + }, + { + "epoch": 0.387132122730119, + "grad_norm": 1.9140945672988892, + "learning_rate": 8.398501140436624e-05, + "loss": 0.4575, + "step": 2473 + }, + { + "epoch": 0.38728866624921726, + "grad_norm": 2.2224535942077637, + "learning_rate": 
8.397686542847833e-05, + "loss": 0.7977, + "step": 2474 + }, + { + "epoch": 0.3874452097683156, + "grad_norm": 2.614286184310913, + "learning_rate": 8.396871945259043e-05, + "loss": 0.8828, + "step": 2475 + }, + { + "epoch": 0.3876017532874139, + "grad_norm": 2.1503045558929443, + "learning_rate": 8.39605734767025e-05, + "loss": 0.9119, + "step": 2476 + }, + { + "epoch": 0.3877582968065122, + "grad_norm": 3.17335844039917, + "learning_rate": 8.39524275008146e-05, + "loss": 0.5665, + "step": 2477 + }, + { + "epoch": 0.3879148403256105, + "grad_norm": 1.6091634035110474, + "learning_rate": 8.39442815249267e-05, + "loss": 0.7489, + "step": 2478 + }, + { + "epoch": 0.38807138384470885, + "grad_norm": 2.1044256687164307, + "learning_rate": 8.393613554903878e-05, + "loss": 0.8265, + "step": 2479 + }, + { + "epoch": 0.3882279273638071, + "grad_norm": 2.818763256072998, + "learning_rate": 8.392798957315086e-05, + "loss": 0.6356, + "step": 2480 + }, + { + "epoch": 0.38838447088290545, + "grad_norm": 2.2581369876861572, + "learning_rate": 8.391984359726296e-05, + "loss": 0.8867, + "step": 2481 + }, + { + "epoch": 0.3885410144020038, + "grad_norm": 2.8148763179779053, + "learning_rate": 8.391169762137505e-05, + "loss": 0.6895, + "step": 2482 + }, + { + "epoch": 0.38869755792110205, + "grad_norm": 2.3352138996124268, + "learning_rate": 8.390355164548713e-05, + "loss": 1.0511, + "step": 2483 + }, + { + "epoch": 0.3888541014402004, + "grad_norm": 7.182852268218994, + "learning_rate": 8.389540566959923e-05, + "loss": 1.3314, + "step": 2484 + }, + { + "epoch": 0.3890106449592987, + "grad_norm": 2.5725276470184326, + "learning_rate": 8.388725969371131e-05, + "loss": 0.5932, + "step": 2485 + }, + { + "epoch": 0.389167188478397, + "grad_norm": 7.586211204528809, + "learning_rate": 8.38791137178234e-05, + "loss": 1.0879, + "step": 2486 + }, + { + "epoch": 0.3893237319974953, + "grad_norm": 3.0450384616851807, + "learning_rate": 8.387096774193549e-05, + "loss": 1.0056, + "step": 2487 + 
}, + { + "epoch": 0.3894802755165936, + "grad_norm": 3.3957767486572266, + "learning_rate": 8.386282176604758e-05, + "loss": 1.2052, + "step": 2488 + }, + { + "epoch": 0.3896368190356919, + "grad_norm": 3.126351833343506, + "learning_rate": 8.385467579015966e-05, + "loss": 1.6494, + "step": 2489 + }, + { + "epoch": 0.38979336255479025, + "grad_norm": 3.648054361343384, + "learning_rate": 8.384652981427176e-05, + "loss": 1.0824, + "step": 2490 + }, + { + "epoch": 0.3899499060738885, + "grad_norm": 2.9438016414642334, + "learning_rate": 8.383838383838384e-05, + "loss": 1.1101, + "step": 2491 + }, + { + "epoch": 0.39010644959298685, + "grad_norm": 4.259184837341309, + "learning_rate": 8.383023786249594e-05, + "loss": 1.5169, + "step": 2492 + }, + { + "epoch": 0.3902629931120852, + "grad_norm": 2.931788921356201, + "learning_rate": 8.382209188660802e-05, + "loss": 1.2675, + "step": 2493 + }, + { + "epoch": 0.39041953663118345, + "grad_norm": 2.0848467350006104, + "learning_rate": 8.38139459107201e-05, + "loss": 1.146, + "step": 2494 + }, + { + "epoch": 0.3905760801502818, + "grad_norm": 3.3274104595184326, + "learning_rate": 8.38057999348322e-05, + "loss": 2.0684, + "step": 2495 + }, + { + "epoch": 0.3907326236693801, + "grad_norm": 1.1278772354125977, + "learning_rate": 8.379765395894429e-05, + "loss": 0.6656, + "step": 2496 + }, + { + "epoch": 0.3908891671884784, + "grad_norm": 3.616924524307251, + "learning_rate": 8.378950798305637e-05, + "loss": 0.9297, + "step": 2497 + }, + { + "epoch": 0.3910457107075767, + "grad_norm": 2.342200517654419, + "learning_rate": 8.378136200716847e-05, + "loss": 0.967, + "step": 2498 + }, + { + "epoch": 0.39120225422667504, + "grad_norm": 3.8918070793151855, + "learning_rate": 8.377321603128055e-05, + "loss": 1.414, + "step": 2499 + }, + { + "epoch": 0.3913587977457733, + "grad_norm": 2.401630163192749, + "learning_rate": 8.376507005539263e-05, + "loss": 1.2912, + "step": 2500 + }, + { + "epoch": 0.39151534126487164, + "grad_norm": 
0.5632853507995605, + "learning_rate": 8.375692407950473e-05, + "loss": 0.3676, + "step": 2501 + }, + { + "epoch": 0.39167188478396997, + "grad_norm": 0.4977540075778961, + "learning_rate": 8.374877810361682e-05, + "loss": 0.2274, + "step": 2502 + }, + { + "epoch": 0.39182842830306824, + "grad_norm": 0.6810317039489746, + "learning_rate": 8.37406321277289e-05, + "loss": 0.3371, + "step": 2503 + }, + { + "epoch": 0.3919849718221666, + "grad_norm": 0.7754554748535156, + "learning_rate": 8.3732486151841e-05, + "loss": 0.2899, + "step": 2504 + }, + { + "epoch": 0.39214151534126485, + "grad_norm": 0.7560040354728699, + "learning_rate": 8.372434017595308e-05, + "loss": 0.2551, + "step": 2505 + }, + { + "epoch": 0.3922980588603632, + "grad_norm": 0.5908394455909729, + "learning_rate": 8.371619420006516e-05, + "loss": 0.2424, + "step": 2506 + }, + { + "epoch": 0.3924546023794615, + "grad_norm": 1.068758249282837, + "learning_rate": 8.370804822417726e-05, + "loss": 0.2744, + "step": 2507 + }, + { + "epoch": 0.3926111458985598, + "grad_norm": 1.3649100065231323, + "learning_rate": 8.369990224828936e-05, + "loss": 0.3273, + "step": 2508 + }, + { + "epoch": 0.3927676894176581, + "grad_norm": 3.3541805744171143, + "learning_rate": 8.369175627240143e-05, + "loss": 1.1214, + "step": 2509 + }, + { + "epoch": 0.39292423293675643, + "grad_norm": 3.178973913192749, + "learning_rate": 8.368361029651353e-05, + "loss": 0.4588, + "step": 2510 + }, + { + "epoch": 0.3930807764558547, + "grad_norm": 0.8240559697151184, + "learning_rate": 8.367546432062562e-05, + "loss": 0.2481, + "step": 2511 + }, + { + "epoch": 0.39323731997495304, + "grad_norm": 1.575239658355713, + "learning_rate": 8.36673183447377e-05, + "loss": 0.5052, + "step": 2512 + }, + { + "epoch": 0.39339386349405137, + "grad_norm": 1.561784267425537, + "learning_rate": 8.365917236884979e-05, + "loss": 0.6486, + "step": 2513 + }, + { + "epoch": 0.39355040701314964, + "grad_norm": 1.9187620878219604, + "learning_rate": 
8.365102639296189e-05, + "loss": 0.5824, + "step": 2514 + }, + { + "epoch": 0.39370695053224797, + "grad_norm": 1.4028180837631226, + "learning_rate": 8.364288041707397e-05, + "loss": 0.3877, + "step": 2515 + }, + { + "epoch": 0.3938634940513463, + "grad_norm": 1.2903163433074951, + "learning_rate": 8.363473444118606e-05, + "loss": 0.3894, + "step": 2516 + }, + { + "epoch": 0.39402003757044457, + "grad_norm": 2.2321248054504395, + "learning_rate": 8.362658846529815e-05, + "loss": 0.6035, + "step": 2517 + }, + { + "epoch": 0.3941765810895429, + "grad_norm": 1.32001793384552, + "learning_rate": 8.361844248941024e-05, + "loss": 0.4043, + "step": 2518 + }, + { + "epoch": 0.39433312460864123, + "grad_norm": 1.906328797340393, + "learning_rate": 8.361029651352232e-05, + "loss": 0.5894, + "step": 2519 + }, + { + "epoch": 0.3944896681277395, + "grad_norm": 3.34924054145813, + "learning_rate": 8.360215053763442e-05, + "loss": 0.8233, + "step": 2520 + }, + { + "epoch": 0.39464621164683783, + "grad_norm": 1.2378047704696655, + "learning_rate": 8.35940045617465e-05, + "loss": 0.6186, + "step": 2521 + }, + { + "epoch": 0.3948027551659361, + "grad_norm": 2.168297290802002, + "learning_rate": 8.358585858585859e-05, + "loss": 0.6127, + "step": 2522 + }, + { + "epoch": 0.39495929868503443, + "grad_norm": 9.660816192626953, + "learning_rate": 8.357771260997068e-05, + "loss": 0.8925, + "step": 2523 + }, + { + "epoch": 0.39511584220413276, + "grad_norm": 2.2078657150268555, + "learning_rate": 8.356956663408277e-05, + "loss": 0.8028, + "step": 2524 + }, + { + "epoch": 0.39527238572323103, + "grad_norm": 2.8287160396575928, + "learning_rate": 8.356142065819485e-05, + "loss": 1.173, + "step": 2525 + }, + { + "epoch": 0.39542892924232936, + "grad_norm": 5.449653625488281, + "learning_rate": 8.355327468230695e-05, + "loss": 0.8279, + "step": 2526 + }, + { + "epoch": 0.3955854727614277, + "grad_norm": 4.624921798706055, + "learning_rate": 8.354512870641903e-05, + "loss": 1.1557, + "step": 
2527 + }, + { + "epoch": 0.39574201628052597, + "grad_norm": 1.4528766870498657, + "learning_rate": 8.353698273053111e-05, + "loss": 0.6564, + "step": 2528 + }, + { + "epoch": 0.3958985597996243, + "grad_norm": 2.961862325668335, + "learning_rate": 8.352883675464321e-05, + "loss": 0.5394, + "step": 2529 + }, + { + "epoch": 0.3960551033187226, + "grad_norm": 8.631179809570312, + "learning_rate": 8.35206907787553e-05, + "loss": 1.0111, + "step": 2530 + }, + { + "epoch": 0.3962116468378209, + "grad_norm": 2.170156478881836, + "learning_rate": 8.351254480286739e-05, + "loss": 0.7811, + "step": 2531 + }, + { + "epoch": 0.3963681903569192, + "grad_norm": 2.450563907623291, + "learning_rate": 8.350439882697948e-05, + "loss": 0.9979, + "step": 2532 + }, + { + "epoch": 0.39652473387601755, + "grad_norm": 4.840505123138428, + "learning_rate": 8.349625285109156e-05, + "loss": 1.1845, + "step": 2533 + }, + { + "epoch": 0.3966812773951158, + "grad_norm": 5.4894537925720215, + "learning_rate": 8.348810687520366e-05, + "loss": 1.5145, + "step": 2534 + }, + { + "epoch": 0.39683782091421416, + "grad_norm": 3.5611605644226074, + "learning_rate": 8.347996089931574e-05, + "loss": 0.9566, + "step": 2535 + }, + { + "epoch": 0.3969943644333125, + "grad_norm": 2.6020116806030273, + "learning_rate": 8.347181492342783e-05, + "loss": 1.2707, + "step": 2536 + }, + { + "epoch": 0.39715090795241076, + "grad_norm": 2.4330384731292725, + "learning_rate": 8.346366894753992e-05, + "loss": 0.7705, + "step": 2537 + }, + { + "epoch": 0.3973074514715091, + "grad_norm": 2.287334680557251, + "learning_rate": 8.3455522971652e-05, + "loss": 0.8399, + "step": 2538 + }, + { + "epoch": 0.3974639949906074, + "grad_norm": 4.232561111450195, + "learning_rate": 8.344737699576409e-05, + "loss": 1.0573, + "step": 2539 + }, + { + "epoch": 0.3976205385097057, + "grad_norm": 2.8765480518341064, + "learning_rate": 8.343923101987619e-05, + "loss": 1.1934, + "step": 2540 + }, + { + "epoch": 0.397777082028804, + 
"grad_norm": 3.547661542892456, + "learning_rate": 8.343108504398827e-05, + "loss": 1.3453, + "step": 2541 + }, + { + "epoch": 0.3979336255479023, + "grad_norm": 3.22652268409729, + "learning_rate": 8.342293906810036e-05, + "loss": 1.4607, + "step": 2542 + }, + { + "epoch": 0.3980901690670006, + "grad_norm": 1.6326192617416382, + "learning_rate": 8.341479309221245e-05, + "loss": 0.9274, + "step": 2543 + }, + { + "epoch": 0.39824671258609895, + "grad_norm": 2.9979257583618164, + "learning_rate": 8.340664711632455e-05, + "loss": 1.8519, + "step": 2544 + }, + { + "epoch": 0.3984032561051972, + "grad_norm": 3.133972406387329, + "learning_rate": 8.339850114043662e-05, + "loss": 1.3792, + "step": 2545 + }, + { + "epoch": 0.39855979962429555, + "grad_norm": 1.2136322259902954, + "learning_rate": 8.339035516454872e-05, + "loss": 0.6657, + "step": 2546 + }, + { + "epoch": 0.3987163431433939, + "grad_norm": 3.2275726795196533, + "learning_rate": 8.338220918866081e-05, + "loss": 1.6191, + "step": 2547 + }, + { + "epoch": 0.39887288666249215, + "grad_norm": 3.215532064437866, + "learning_rate": 8.337406321277288e-05, + "loss": 1.1745, + "step": 2548 + }, + { + "epoch": 0.3990294301815905, + "grad_norm": 2.09382963180542, + "learning_rate": 8.336591723688498e-05, + "loss": 0.5307, + "step": 2549 + }, + { + "epoch": 0.3991859737006888, + "grad_norm": 2.0953879356384277, + "learning_rate": 8.335777126099708e-05, + "loss": 1.3024, + "step": 2550 + }, + { + "epoch": 0.3993425172197871, + "grad_norm": 0.6173037886619568, + "learning_rate": 8.334962528510916e-05, + "loss": 0.3343, + "step": 2551 + }, + { + "epoch": 0.3994990607388854, + "grad_norm": 0.6297442317008972, + "learning_rate": 8.334147930922125e-05, + "loss": 0.3823, + "step": 2552 + }, + { + "epoch": 0.39965560425798374, + "grad_norm": 0.6954314708709717, + "learning_rate": 8.333333333333334e-05, + "loss": 0.3523, + "step": 2553 + }, + { + "epoch": 0.399812147777082, + "grad_norm": 0.4944305419921875, + "learning_rate": 
8.332518735744543e-05, + "loss": 0.2988, + "step": 2554 + }, + { + "epoch": 0.39996869129618035, + "grad_norm": 0.8481062054634094, + "learning_rate": 8.331704138155751e-05, + "loss": 0.2859, + "step": 2555 + }, + { + "epoch": 0.4001252348152787, + "grad_norm": 0.729377269744873, + "learning_rate": 8.330889540566961e-05, + "loss": 0.3061, + "step": 2556 + }, + { + "epoch": 0.40028177833437695, + "grad_norm": 0.9411975145339966, + "learning_rate": 8.330074942978169e-05, + "loss": 0.3625, + "step": 2557 + }, + { + "epoch": 0.4004383218534753, + "grad_norm": 1.9836124181747437, + "learning_rate": 8.329260345389378e-05, + "loss": 0.5328, + "step": 2558 + }, + { + "epoch": 0.40059486537257355, + "grad_norm": 1.3325960636138916, + "learning_rate": 8.328445747800587e-05, + "loss": 0.36, + "step": 2559 + }, + { + "epoch": 0.4007514088916719, + "grad_norm": 1.4235843420028687, + "learning_rate": 8.327631150211796e-05, + "loss": 0.5297, + "step": 2560 + }, + { + "epoch": 0.4009079524107702, + "grad_norm": 0.9627918004989624, + "learning_rate": 8.326816552623004e-05, + "loss": 0.3803, + "step": 2561 + }, + { + "epoch": 0.4010644959298685, + "grad_norm": 1.034505844116211, + "learning_rate": 8.326001955034214e-05, + "loss": 0.3231, + "step": 2562 + }, + { + "epoch": 0.4012210394489668, + "grad_norm": 1.0713493824005127, + "learning_rate": 8.325187357445422e-05, + "loss": 0.5094, + "step": 2563 + }, + { + "epoch": 0.40137758296806514, + "grad_norm": 1.4581630229949951, + "learning_rate": 8.32437275985663e-05, + "loss": 0.5069, + "step": 2564 + }, + { + "epoch": 0.4015341264871634, + "grad_norm": 3.0184566974639893, + "learning_rate": 8.32355816226784e-05, + "loss": 0.4878, + "step": 2565 + }, + { + "epoch": 0.40169067000626174, + "grad_norm": 1.3254204988479614, + "learning_rate": 8.322743564679049e-05, + "loss": 0.595, + "step": 2566 + }, + { + "epoch": 0.40184721352536007, + "grad_norm": 1.2053519487380981, + "learning_rate": 8.321928967090258e-05, + "loss": 0.4251, + "step": 
2567 + }, + { + "epoch": 0.40200375704445834, + "grad_norm": 1.8113480806350708, + "learning_rate": 8.321114369501467e-05, + "loss": 0.4884, + "step": 2568 + }, + { + "epoch": 0.40216030056355667, + "grad_norm": 1.170629858970642, + "learning_rate": 8.320299771912675e-05, + "loss": 0.623, + "step": 2569 + }, + { + "epoch": 0.402316844082655, + "grad_norm": 1.3244833946228027, + "learning_rate": 8.319485174323885e-05, + "loss": 0.4591, + "step": 2570 + }, + { + "epoch": 0.4024733876017533, + "grad_norm": 1.8103623390197754, + "learning_rate": 8.318670576735093e-05, + "loss": 0.7168, + "step": 2571 + }, + { + "epoch": 0.4026299311208516, + "grad_norm": 1.5530775785446167, + "learning_rate": 8.317855979146302e-05, + "loss": 0.4552, + "step": 2572 + }, + { + "epoch": 0.40278647463994993, + "grad_norm": 2.244208812713623, + "learning_rate": 8.317041381557511e-05, + "loss": 0.6083, + "step": 2573 + }, + { + "epoch": 0.4029430181590482, + "grad_norm": 1.5958282947540283, + "learning_rate": 8.31622678396872e-05, + "loss": 0.5705, + "step": 2574 + }, + { + "epoch": 0.40309956167814653, + "grad_norm": 2.7635080814361572, + "learning_rate": 8.315412186379928e-05, + "loss": 0.5506, + "step": 2575 + }, + { + "epoch": 0.4032561051972448, + "grad_norm": 3.411409378051758, + "learning_rate": 8.314597588791138e-05, + "loss": 0.9674, + "step": 2576 + }, + { + "epoch": 0.40341264871634314, + "grad_norm": 2.121091604232788, + "learning_rate": 8.313782991202346e-05, + "loss": 0.5876, + "step": 2577 + }, + { + "epoch": 0.40356919223544147, + "grad_norm": 1.3540626764297485, + "learning_rate": 8.312968393613555e-05, + "loss": 0.8504, + "step": 2578 + }, + { + "epoch": 0.40372573575453974, + "grad_norm": 2.519728422164917, + "learning_rate": 8.312153796024764e-05, + "loss": 1.2061, + "step": 2579 + }, + { + "epoch": 0.40388227927363807, + "grad_norm": 4.829939842224121, + "learning_rate": 8.311339198435974e-05, + "loss": 1.1101, + "step": 2580 + }, + { + "epoch": 0.4040388227927364, + 
"grad_norm": 2.532017469406128, + "learning_rate": 8.310524600847181e-05, + "loss": 0.5779, + "step": 2581 + }, + { + "epoch": 0.40419536631183467, + "grad_norm": 7.605701446533203, + "learning_rate": 8.309710003258391e-05, + "loss": 1.3447, + "step": 2582 + }, + { + "epoch": 0.404351909830933, + "grad_norm": 1.8953660726547241, + "learning_rate": 8.3088954056696e-05, + "loss": 0.7057, + "step": 2583 + }, + { + "epoch": 0.4045084533500313, + "grad_norm": 3.4706013202667236, + "learning_rate": 8.308080808080808e-05, + "loss": 1.4579, + "step": 2584 + }, + { + "epoch": 0.4046649968691296, + "grad_norm": 3.0348057746887207, + "learning_rate": 8.307266210492017e-05, + "loss": 1.0221, + "step": 2585 + }, + { + "epoch": 0.40482154038822793, + "grad_norm": 3.1837635040283203, + "learning_rate": 8.306451612903227e-05, + "loss": 1.3814, + "step": 2586 + }, + { + "epoch": 0.40497808390732626, + "grad_norm": 2.9540014266967773, + "learning_rate": 8.305637015314434e-05, + "loss": 0.6731, + "step": 2587 + }, + { + "epoch": 0.40513462742642453, + "grad_norm": 2.39153790473938, + "learning_rate": 8.304822417725644e-05, + "loss": 1.1247, + "step": 2588 + }, + { + "epoch": 0.40529117094552286, + "grad_norm": 5.778711795806885, + "learning_rate": 8.304007820136854e-05, + "loss": 1.0709, + "step": 2589 + }, + { + "epoch": 0.4054477144646212, + "grad_norm": 2.4092698097229004, + "learning_rate": 8.303193222548062e-05, + "loss": 0.7723, + "step": 2590 + }, + { + "epoch": 0.40560425798371946, + "grad_norm": 2.731600046157837, + "learning_rate": 8.30237862495927e-05, + "loss": 1.7399, + "step": 2591 + }, + { + "epoch": 0.4057608015028178, + "grad_norm": 2.4067537784576416, + "learning_rate": 8.30156402737048e-05, + "loss": 1.0268, + "step": 2592 + }, + { + "epoch": 0.40591734502191607, + "grad_norm": 4.092016696929932, + "learning_rate": 8.300749429781688e-05, + "loss": 1.6219, + "step": 2593 + }, + { + "epoch": 0.4060738885410144, + "grad_norm": 2.9959371089935303, + "learning_rate": 
8.299934832192897e-05, + "loss": 1.3936, + "step": 2594 + }, + { + "epoch": 0.4062304320601127, + "grad_norm": 4.56360387802124, + "learning_rate": 8.299120234604106e-05, + "loss": 1.3498, + "step": 2595 + }, + { + "epoch": 0.406386975579211, + "grad_norm": 6.666701316833496, + "learning_rate": 8.298305637015315e-05, + "loss": 0.7397, + "step": 2596 + }, + { + "epoch": 0.4065435190983093, + "grad_norm": 3.47222638130188, + "learning_rate": 8.297491039426523e-05, + "loss": 1.0965, + "step": 2597 + }, + { + "epoch": 0.40670006261740765, + "grad_norm": 2.9479899406433105, + "learning_rate": 8.296676441837733e-05, + "loss": 0.9849, + "step": 2598 + }, + { + "epoch": 0.4068566061365059, + "grad_norm": 3.5128297805786133, + "learning_rate": 8.295861844248941e-05, + "loss": 0.7953, + "step": 2599 + }, + { + "epoch": 0.40701314965560426, + "grad_norm": 1.89620041847229, + "learning_rate": 8.29504724666015e-05, + "loss": 0.9771, + "step": 2600 + }, + { + "epoch": 0.4071696931747026, + "grad_norm": 0.8961688876152039, + "learning_rate": 8.29423264907136e-05, + "loss": 0.3464, + "step": 2601 + }, + { + "epoch": 0.40732623669380086, + "grad_norm": 0.6232130527496338, + "learning_rate": 8.293418051482568e-05, + "loss": 0.3318, + "step": 2602 + }, + { + "epoch": 0.4074827802128992, + "grad_norm": 0.7670448422431946, + "learning_rate": 8.292603453893778e-05, + "loss": 0.3471, + "step": 2603 + }, + { + "epoch": 0.4076393237319975, + "grad_norm": 0.7137314677238464, + "learning_rate": 8.291788856304986e-05, + "loss": 0.3375, + "step": 2604 + }, + { + "epoch": 0.4077958672510958, + "grad_norm": 0.7204191088676453, + "learning_rate": 8.290974258716194e-05, + "loss": 0.4558, + "step": 2605 + }, + { + "epoch": 0.4079524107701941, + "grad_norm": 0.5628373026847839, + "learning_rate": 8.290159661127404e-05, + "loss": 0.265, + "step": 2606 + }, + { + "epoch": 0.40810895428929245, + "grad_norm": 0.9666054844856262, + "learning_rate": 8.289345063538612e-05, + "loss": 0.4287, + "step": 2607 
+ }, + { + "epoch": 0.4082654978083907, + "grad_norm": 1.0504450798034668, + "learning_rate": 8.288530465949821e-05, + "loss": 0.3306, + "step": 2608 + }, + { + "epoch": 0.40842204132748905, + "grad_norm": 0.9504714012145996, + "learning_rate": 8.28771586836103e-05, + "loss": 0.3738, + "step": 2609 + }, + { + "epoch": 0.4085785848465874, + "grad_norm": 2.1281979084014893, + "learning_rate": 8.286901270772239e-05, + "loss": 0.7904, + "step": 2610 + }, + { + "epoch": 0.40873512836568565, + "grad_norm": 0.873519241809845, + "learning_rate": 8.286086673183447e-05, + "loss": 0.3738, + "step": 2611 + }, + { + "epoch": 0.408891671884784, + "grad_norm": 1.8012670278549194, + "learning_rate": 8.285272075594657e-05, + "loss": 0.6571, + "step": 2612 + }, + { + "epoch": 0.40904821540388225, + "grad_norm": 1.8505358695983887, + "learning_rate": 8.284457478005865e-05, + "loss": 0.4083, + "step": 2613 + }, + { + "epoch": 0.4092047589229806, + "grad_norm": 3.3936376571655273, + "learning_rate": 8.283642880417074e-05, + "loss": 0.9782, + "step": 2614 + }, + { + "epoch": 0.4093613024420789, + "grad_norm": 1.1330876350402832, + "learning_rate": 8.282828282828283e-05, + "loss": 0.4446, + "step": 2615 + }, + { + "epoch": 0.4095178459611772, + "grad_norm": 1.4483535289764404, + "learning_rate": 8.282013685239492e-05, + "loss": 0.6094, + "step": 2616 + }, + { + "epoch": 0.4096743894802755, + "grad_norm": 1.3595499992370605, + "learning_rate": 8.2811990876507e-05, + "loss": 0.5293, + "step": 2617 + }, + { + "epoch": 0.40983093299937384, + "grad_norm": 1.5329173803329468, + "learning_rate": 8.28038449006191e-05, + "loss": 0.6222, + "step": 2618 + }, + { + "epoch": 0.4099874765184721, + "grad_norm": 1.8512136936187744, + "learning_rate": 8.27956989247312e-05, + "loss": 0.6124, + "step": 2619 + }, + { + "epoch": 0.41014402003757044, + "grad_norm": 1.5361077785491943, + "learning_rate": 8.278755294884327e-05, + "loss": 0.6487, + "step": 2620 + }, + { + "epoch": 0.4103005635566688, + 
"grad_norm": 1.6810826063156128, + "learning_rate": 8.277940697295536e-05, + "loss": 0.6689, + "step": 2621 + }, + { + "epoch": 0.41045710707576705, + "grad_norm": 5.088306903839111, + "learning_rate": 8.277126099706746e-05, + "loss": 0.9911, + "step": 2622 + }, + { + "epoch": 0.4106136505948654, + "grad_norm": 2.2201666831970215, + "learning_rate": 8.276311502117953e-05, + "loss": 0.8052, + "step": 2623 + }, + { + "epoch": 0.4107701941139637, + "grad_norm": 2.472966432571411, + "learning_rate": 8.275496904529163e-05, + "loss": 0.9076, + "step": 2624 + }, + { + "epoch": 0.410926737633062, + "grad_norm": 3.589059591293335, + "learning_rate": 8.274682306940373e-05, + "loss": 1.1636, + "step": 2625 + }, + { + "epoch": 0.4110832811521603, + "grad_norm": 2.4436044692993164, + "learning_rate": 8.273867709351581e-05, + "loss": 0.9859, + "step": 2626 + }, + { + "epoch": 0.41123982467125864, + "grad_norm": 2.4492316246032715, + "learning_rate": 8.27305311176279e-05, + "loss": 0.8172, + "step": 2627 + }, + { + "epoch": 0.4113963681903569, + "grad_norm": 2.4531474113464355, + "learning_rate": 8.272238514173999e-05, + "loss": 0.7863, + "step": 2628 + }, + { + "epoch": 0.41155291170945524, + "grad_norm": 1.7753920555114746, + "learning_rate": 8.271423916585207e-05, + "loss": 0.7199, + "step": 2629 + }, + { + "epoch": 0.4117094552285535, + "grad_norm": 2.539292573928833, + "learning_rate": 8.270609318996416e-05, + "loss": 0.9934, + "step": 2630 + }, + { + "epoch": 0.41186599874765184, + "grad_norm": 1.5767935514450073, + "learning_rate": 8.269794721407626e-05, + "loss": 0.5284, + "step": 2631 + }, + { + "epoch": 0.41202254226675017, + "grad_norm": 2.3537609577178955, + "learning_rate": 8.268980123818834e-05, + "loss": 0.9076, + "step": 2632 + }, + { + "epoch": 0.41217908578584844, + "grad_norm": 2.90519380569458, + "learning_rate": 8.268165526230042e-05, + "loss": 0.5544, + "step": 2633 + }, + { + "epoch": 0.41233562930494677, + "grad_norm": 3.1489179134368896, + 
"learning_rate": 8.267350928641252e-05, + "loss": 0.7599, + "step": 2634 + }, + { + "epoch": 0.4124921728240451, + "grad_norm": 2.9018657207489014, + "learning_rate": 8.26653633105246e-05, + "loss": 1.5089, + "step": 2635 + }, + { + "epoch": 0.4126487163431434, + "grad_norm": 3.8413703441619873, + "learning_rate": 8.265721733463669e-05, + "loss": 1.1196, + "step": 2636 + }, + { + "epoch": 0.4128052598622417, + "grad_norm": 5.8021626472473145, + "learning_rate": 8.264907135874879e-05, + "loss": 1.4741, + "step": 2637 + }, + { + "epoch": 0.41296180338134003, + "grad_norm": 3.8157663345336914, + "learning_rate": 8.264092538286087e-05, + "loss": 1.4478, + "step": 2638 + }, + { + "epoch": 0.4131183469004383, + "grad_norm": 2.721600294113159, + "learning_rate": 8.263277940697297e-05, + "loss": 1.7699, + "step": 2639 + }, + { + "epoch": 0.41327489041953663, + "grad_norm": 13.873735427856445, + "learning_rate": 8.262463343108505e-05, + "loss": 1.2436, + "step": 2640 + }, + { + "epoch": 0.41343143393863496, + "grad_norm": 3.2105748653411865, + "learning_rate": 8.261648745519713e-05, + "loss": 1.1449, + "step": 2641 + }, + { + "epoch": 0.41358797745773324, + "grad_norm": 2.654463052749634, + "learning_rate": 8.260834147930923e-05, + "loss": 1.3124, + "step": 2642 + }, + { + "epoch": 0.41374452097683156, + "grad_norm": 3.3114144802093506, + "learning_rate": 8.260019550342132e-05, + "loss": 1.5039, + "step": 2643 + }, + { + "epoch": 0.4139010644959299, + "grad_norm": 3.861344814300537, + "learning_rate": 8.25920495275334e-05, + "loss": 1.42, + "step": 2644 + }, + { + "epoch": 0.41405760801502817, + "grad_norm": 2.924420118331909, + "learning_rate": 8.25839035516455e-05, + "loss": 1.1011, + "step": 2645 + }, + { + "epoch": 0.4142141515341265, + "grad_norm": 3.082174301147461, + "learning_rate": 8.257575757575758e-05, + "loss": 1.475, + "step": 2646 + }, + { + "epoch": 0.41437069505322477, + "grad_norm": 2.274315118789673, + "learning_rate": 8.256761159986966e-05, + "loss": 
1.3936, + "step": 2647 + }, + { + "epoch": 0.4145272385723231, + "grad_norm": 3.5735626220703125, + "learning_rate": 8.255946562398176e-05, + "loss": 0.8753, + "step": 2648 + }, + { + "epoch": 0.4146837820914214, + "grad_norm": 3.430712938308716, + "learning_rate": 8.255131964809384e-05, + "loss": 1.2743, + "step": 2649 + }, + { + "epoch": 0.4148403256105197, + "grad_norm": 2.9506773948669434, + "learning_rate": 8.254317367220593e-05, + "loss": 2.0645, + "step": 2650 + }, + { + "epoch": 0.41499686912961803, + "grad_norm": 0.9172519445419312, + "learning_rate": 8.253502769631803e-05, + "loss": 0.3791, + "step": 2651 + }, + { + "epoch": 0.41515341264871636, + "grad_norm": 0.5932250022888184, + "learning_rate": 8.252688172043011e-05, + "loss": 0.3382, + "step": 2652 + }, + { + "epoch": 0.41530995616781463, + "grad_norm": 0.8589972257614136, + "learning_rate": 8.25187357445422e-05, + "loss": 0.3223, + "step": 2653 + }, + { + "epoch": 0.41546649968691296, + "grad_norm": 1.2267651557922363, + "learning_rate": 8.251058976865429e-05, + "loss": 0.4421, + "step": 2654 + }, + { + "epoch": 0.4156230432060113, + "grad_norm": 0.9969246983528137, + "learning_rate": 8.250244379276639e-05, + "loss": 0.3525, + "step": 2655 + }, + { + "epoch": 0.41577958672510956, + "grad_norm": 0.8269015550613403, + "learning_rate": 8.249429781687846e-05, + "loss": 0.3131, + "step": 2656 + }, + { + "epoch": 0.4159361302442079, + "grad_norm": 0.9900327920913696, + "learning_rate": 8.248615184099056e-05, + "loss": 0.3777, + "step": 2657 + }, + { + "epoch": 0.4160926737633062, + "grad_norm": 0.8818584680557251, + "learning_rate": 8.247800586510265e-05, + "loss": 0.2995, + "step": 2658 + }, + { + "epoch": 0.4162492172824045, + "grad_norm": 0.6494611501693726, + "learning_rate": 8.246985988921472e-05, + "loss": 0.356, + "step": 2659 + }, + { + "epoch": 0.4164057608015028, + "grad_norm": 1.4358936548233032, + "learning_rate": 8.246171391332682e-05, + "loss": 0.5716, + "step": 2660 + }, + { + "epoch": 
0.41656230432060115, + "grad_norm": 1.0386440753936768, + "learning_rate": 8.245356793743892e-05, + "loss": 0.4798, + "step": 2661 + }, + { + "epoch": 0.4167188478396994, + "grad_norm": 1.2764841318130493, + "learning_rate": 8.2445421961551e-05, + "loss": 0.3559, + "step": 2662 + }, + { + "epoch": 0.41687539135879775, + "grad_norm": 3.3652615547180176, + "learning_rate": 8.243727598566309e-05, + "loss": 0.7966, + "step": 2663 + }, + { + "epoch": 0.4170319348778961, + "grad_norm": 1.5606852769851685, + "learning_rate": 8.242913000977518e-05, + "loss": 0.5429, + "step": 2664 + }, + { + "epoch": 0.41718847839699436, + "grad_norm": 1.4782997369766235, + "learning_rate": 8.242098403388727e-05, + "loss": 0.4872, + "step": 2665 + }, + { + "epoch": 0.4173450219160927, + "grad_norm": 1.7492541074752808, + "learning_rate": 8.241283805799935e-05, + "loss": 0.5033, + "step": 2666 + }, + { + "epoch": 0.41750156543519096, + "grad_norm": 1.3200974464416504, + "learning_rate": 8.240469208211145e-05, + "loss": 0.5758, + "step": 2667 + }, + { + "epoch": 0.4176581089542893, + "grad_norm": 1.5226317644119263, + "learning_rate": 8.239654610622353e-05, + "loss": 0.7438, + "step": 2668 + }, + { + "epoch": 0.4178146524733876, + "grad_norm": 1.244423270225525, + "learning_rate": 8.238840013033561e-05, + "loss": 0.5394, + "step": 2669 + }, + { + "epoch": 0.4179711959924859, + "grad_norm": 2.1114330291748047, + "learning_rate": 8.238025415444771e-05, + "loss": 0.6751, + "step": 2670 + }, + { + "epoch": 0.4181277395115842, + "grad_norm": 1.5974907875061035, + "learning_rate": 8.23721081785598e-05, + "loss": 0.7335, + "step": 2671 + }, + { + "epoch": 0.41828428303068255, + "grad_norm": 1.8007210493087769, + "learning_rate": 8.236396220267188e-05, + "loss": 0.4183, + "step": 2672 + }, + { + "epoch": 0.4184408265497808, + "grad_norm": 2.0072269439697266, + "learning_rate": 8.235581622678398e-05, + "loss": 0.858, + "step": 2673 + }, + { + "epoch": 0.41859737006887915, + "grad_norm": 
3.3257341384887695, + "learning_rate": 8.234767025089606e-05, + "loss": 0.9359, + "step": 2674 + }, + { + "epoch": 0.4187539135879775, + "grad_norm": 2.957892417907715, + "learning_rate": 8.233952427500814e-05, + "loss": 0.6415, + "step": 2675 + }, + { + "epoch": 0.41891045710707575, + "grad_norm": 2.60086989402771, + "learning_rate": 8.233137829912024e-05, + "loss": 0.8872, + "step": 2676 + }, + { + "epoch": 0.4190670006261741, + "grad_norm": 2.8794424533843994, + "learning_rate": 8.232323232323233e-05, + "loss": 0.7674, + "step": 2677 + }, + { + "epoch": 0.4192235441452724, + "grad_norm": 1.6250749826431274, + "learning_rate": 8.231508634734442e-05, + "loss": 0.6332, + "step": 2678 + }, + { + "epoch": 0.4193800876643707, + "grad_norm": 2.6352524757385254, + "learning_rate": 8.23069403714565e-05, + "loss": 1.0266, + "step": 2679 + }, + { + "epoch": 0.419536631183469, + "grad_norm": 2.6701343059539795, + "learning_rate": 8.229879439556859e-05, + "loss": 0.8184, + "step": 2680 + }, + { + "epoch": 0.41969317470256734, + "grad_norm": 2.5076053142547607, + "learning_rate": 8.229064841968069e-05, + "loss": 0.7462, + "step": 2681 + }, + { + "epoch": 0.4198497182216656, + "grad_norm": 3.1536593437194824, + "learning_rate": 8.228250244379277e-05, + "loss": 1.0027, + "step": 2682 + }, + { + "epoch": 0.42000626174076394, + "grad_norm": 2.5935890674591064, + "learning_rate": 8.227435646790485e-05, + "loss": 1.1002, + "step": 2683 + }, + { + "epoch": 0.4201628052598622, + "grad_norm": 3.5133261680603027, + "learning_rate": 8.226621049201695e-05, + "loss": 0.9388, + "step": 2684 + }, + { + "epoch": 0.42031934877896054, + "grad_norm": 3.262798309326172, + "learning_rate": 8.225806451612904e-05, + "loss": 1.1756, + "step": 2685 + }, + { + "epoch": 0.4204758922980589, + "grad_norm": 2.393240451812744, + "learning_rate": 8.224991854024112e-05, + "loss": 1.2994, + "step": 2686 + }, + { + "epoch": 0.42063243581715715, + "grad_norm": 4.547251224517822, + "learning_rate": 
8.224177256435322e-05, + "loss": 1.048, + "step": 2687 + }, + { + "epoch": 0.4207889793362555, + "grad_norm": 2.305793523788452, + "learning_rate": 8.22336265884653e-05, + "loss": 0.7238, + "step": 2688 + }, + { + "epoch": 0.4209455228553538, + "grad_norm": 1.2858657836914062, + "learning_rate": 8.222548061257738e-05, + "loss": 0.7355, + "step": 2689 + }, + { + "epoch": 0.4211020663744521, + "grad_norm": 2.7634668350219727, + "learning_rate": 8.221733463668948e-05, + "loss": 1.5936, + "step": 2690 + }, + { + "epoch": 0.4212586098935504, + "grad_norm": 2.630012035369873, + "learning_rate": 8.220918866080158e-05, + "loss": 1.6641, + "step": 2691 + }, + { + "epoch": 0.42141515341264874, + "grad_norm": 4.65290641784668, + "learning_rate": 8.220104268491365e-05, + "loss": 2.1556, + "step": 2692 + }, + { + "epoch": 0.421571696931747, + "grad_norm": 2.3899648189544678, + "learning_rate": 8.219289670902575e-05, + "loss": 1.3444, + "step": 2693 + }, + { + "epoch": 0.42172824045084534, + "grad_norm": 2.444601058959961, + "learning_rate": 8.218475073313784e-05, + "loss": 1.7573, + "step": 2694 + }, + { + "epoch": 0.42188478396994367, + "grad_norm": 2.4991633892059326, + "learning_rate": 8.217660475724991e-05, + "loss": 1.292, + "step": 2695 + }, + { + "epoch": 0.42204132748904194, + "grad_norm": 1.3956232070922852, + "learning_rate": 8.216845878136201e-05, + "loss": 0.856, + "step": 2696 + }, + { + "epoch": 0.42219787100814027, + "grad_norm": 1.5244959592819214, + "learning_rate": 8.216031280547411e-05, + "loss": 0.7709, + "step": 2697 + }, + { + "epoch": 0.4223544145272386, + "grad_norm": 1.7594654560089111, + "learning_rate": 8.215216682958619e-05, + "loss": 0.6056, + "step": 2698 + }, + { + "epoch": 0.42251095804633687, + "grad_norm": 2.3898210525512695, + "learning_rate": 8.214402085369828e-05, + "loss": 1.0481, + "step": 2699 + }, + { + "epoch": 0.4226675015654352, + "grad_norm": 3.229755401611328, + "learning_rate": 8.213587487781037e-05, + "loss": 1.4184, + "step": 
2700 + }, + { + "epoch": 0.4228240450845335, + "grad_norm": 0.957429051399231, + "learning_rate": 8.212772890192246e-05, + "loss": 0.3458, + "step": 2701 + }, + { + "epoch": 0.4229805886036318, + "grad_norm": 1.2141510248184204, + "learning_rate": 8.211958292603454e-05, + "loss": 0.4739, + "step": 2702 + }, + { + "epoch": 0.42313713212273013, + "grad_norm": 0.46773117780685425, + "learning_rate": 8.211143695014664e-05, + "loss": 0.2213, + "step": 2703 + }, + { + "epoch": 0.4232936756418284, + "grad_norm": 0.87724769115448, + "learning_rate": 8.210329097425872e-05, + "loss": 0.2945, + "step": 2704 + }, + { + "epoch": 0.42345021916092673, + "grad_norm": 0.6975058913230896, + "learning_rate": 8.20951449983708e-05, + "loss": 0.329, + "step": 2705 + }, + { + "epoch": 0.42360676268002506, + "grad_norm": 0.7229014039039612, + "learning_rate": 8.20869990224829e-05, + "loss": 0.3792, + "step": 2706 + }, + { + "epoch": 0.42376330619912334, + "grad_norm": 0.8196581602096558, + "learning_rate": 8.207885304659499e-05, + "loss": 0.3541, + "step": 2707 + }, + { + "epoch": 0.42391984971822166, + "grad_norm": 1.8391809463500977, + "learning_rate": 8.207070707070707e-05, + "loss": 0.434, + "step": 2708 + }, + { + "epoch": 0.42407639323732, + "grad_norm": 1.0249658823013306, + "learning_rate": 8.206256109481917e-05, + "loss": 0.3396, + "step": 2709 + }, + { + "epoch": 0.42423293675641827, + "grad_norm": 0.9471437931060791, + "learning_rate": 8.205441511893125e-05, + "loss": 0.3818, + "step": 2710 + }, + { + "epoch": 0.4243894802755166, + "grad_norm": 0.8738710284233093, + "learning_rate": 8.204626914304334e-05, + "loss": 0.3047, + "step": 2711 + }, + { + "epoch": 0.4245460237946149, + "grad_norm": 1.900139570236206, + "learning_rate": 8.203812316715543e-05, + "loss": 0.5079, + "step": 2712 + }, + { + "epoch": 0.4247025673137132, + "grad_norm": 1.4264801740646362, + "learning_rate": 8.202997719126752e-05, + "loss": 0.4234, + "step": 2713 + }, + { + "epoch": 0.4248591108328115, + 
"grad_norm": 1.0411368608474731, + "learning_rate": 8.202183121537961e-05, + "loss": 0.4034, + "step": 2714 + }, + { + "epoch": 0.42501565435190986, + "grad_norm": 1.3233569860458374, + "learning_rate": 8.20136852394917e-05, + "loss": 0.551, + "step": 2715 + }, + { + "epoch": 0.42517219787100813, + "grad_norm": 1.4242123365402222, + "learning_rate": 8.200553926360378e-05, + "loss": 0.4945, + "step": 2716 + }, + { + "epoch": 0.42532874139010646, + "grad_norm": 1.4528719186782837, + "learning_rate": 8.199739328771588e-05, + "loss": 0.4873, + "step": 2717 + }, + { + "epoch": 0.4254852849092048, + "grad_norm": 1.4333778619766235, + "learning_rate": 8.198924731182796e-05, + "loss": 0.5428, + "step": 2718 + }, + { + "epoch": 0.42564182842830306, + "grad_norm": 2.0993947982788086, + "learning_rate": 8.198110133594005e-05, + "loss": 0.499, + "step": 2719 + }, + { + "epoch": 0.4257983719474014, + "grad_norm": 1.5313053131103516, + "learning_rate": 8.197295536005214e-05, + "loss": 0.482, + "step": 2720 + }, + { + "epoch": 0.42595491546649966, + "grad_norm": 1.2606666088104248, + "learning_rate": 8.196480938416423e-05, + "loss": 0.5504, + "step": 2721 + }, + { + "epoch": 0.426111458985598, + "grad_norm": 1.1688698530197144, + "learning_rate": 8.195666340827631e-05, + "loss": 0.5612, + "step": 2722 + }, + { + "epoch": 0.4262680025046963, + "grad_norm": 1.824518084526062, + "learning_rate": 8.194851743238841e-05, + "loss": 0.6394, + "step": 2723 + }, + { + "epoch": 0.4264245460237946, + "grad_norm": 1.5038676261901855, + "learning_rate": 8.194037145650049e-05, + "loss": 0.5742, + "step": 2724 + }, + { + "epoch": 0.4265810895428929, + "grad_norm": 2.3827297687530518, + "learning_rate": 8.193222548061258e-05, + "loss": 1.0383, + "step": 2725 + }, + { + "epoch": 0.42673763306199125, + "grad_norm": 1.9653202295303345, + "learning_rate": 8.192407950472467e-05, + "loss": 0.6978, + "step": 2726 + }, + { + "epoch": 0.4268941765810895, + "grad_norm": 2.0670368671417236, + 
"learning_rate": 8.191593352883677e-05, + "loss": 0.4117, + "step": 2727 + }, + { + "epoch": 0.42705072010018785, + "grad_norm": 2.6230709552764893, + "learning_rate": 8.190778755294884e-05, + "loss": 0.5647, + "step": 2728 + }, + { + "epoch": 0.4272072636192862, + "grad_norm": 2.7050156593322754, + "learning_rate": 8.189964157706094e-05, + "loss": 0.8639, + "step": 2729 + }, + { + "epoch": 0.42736380713838445, + "grad_norm": 2.9081945419311523, + "learning_rate": 8.189149560117304e-05, + "loss": 1.3334, + "step": 2730 + }, + { + "epoch": 0.4275203506574828, + "grad_norm": 2.221109628677368, + "learning_rate": 8.18833496252851e-05, + "loss": 1.2089, + "step": 2731 + }, + { + "epoch": 0.4276768941765811, + "grad_norm": 2.3325672149658203, + "learning_rate": 8.18752036493972e-05, + "loss": 0.8298, + "step": 2732 + }, + { + "epoch": 0.4278334376956794, + "grad_norm": 2.372469186782837, + "learning_rate": 8.18670576735093e-05, + "loss": 1.4846, + "step": 2733 + }, + { + "epoch": 0.4279899812147777, + "grad_norm": 8.569228172302246, + "learning_rate": 8.185891169762137e-05, + "loss": 1.4438, + "step": 2734 + }, + { + "epoch": 0.42814652473387604, + "grad_norm": 3.566615343093872, + "learning_rate": 8.185076572173347e-05, + "loss": 0.9517, + "step": 2735 + }, + { + "epoch": 0.4283030682529743, + "grad_norm": 2.1885530948638916, + "learning_rate": 8.184261974584556e-05, + "loss": 0.9474, + "step": 2736 + }, + { + "epoch": 0.42845961177207265, + "grad_norm": 2.86911940574646, + "learning_rate": 8.183447376995765e-05, + "loss": 1.0429, + "step": 2737 + }, + { + "epoch": 0.4286161552911709, + "grad_norm": 4.709109783172607, + "learning_rate": 8.182632779406973e-05, + "loss": 1.3305, + "step": 2738 + }, + { + "epoch": 0.42877269881026925, + "grad_norm": 2.908367156982422, + "learning_rate": 8.181818181818183e-05, + "loss": 1.3679, + "step": 2739 + }, + { + "epoch": 0.4289292423293676, + "grad_norm": 3.214965581893921, + "learning_rate": 8.181003584229391e-05, + "loss": 
1.1527, + "step": 2740 + }, + { + "epoch": 0.42908578584846585, + "grad_norm": 2.3039419651031494, + "learning_rate": 8.1801889866406e-05, + "loss": 1.3719, + "step": 2741 + }, + { + "epoch": 0.4292423293675642, + "grad_norm": 2.2516367435455322, + "learning_rate": 8.17937438905181e-05, + "loss": 0.881, + "step": 2742 + }, + { + "epoch": 0.4293988728866625, + "grad_norm": 5.137165546417236, + "learning_rate": 8.178559791463018e-05, + "loss": 1.4406, + "step": 2743 + }, + { + "epoch": 0.4295554164057608, + "grad_norm": 2.807131767272949, + "learning_rate": 8.177745193874226e-05, + "loss": 0.8478, + "step": 2744 + }, + { + "epoch": 0.4297119599248591, + "grad_norm": 4.143092155456543, + "learning_rate": 8.176930596285436e-05, + "loss": 1.6032, + "step": 2745 + }, + { + "epoch": 0.42986850344395744, + "grad_norm": 7.7117743492126465, + "learning_rate": 8.176115998696644e-05, + "loss": 1.466, + "step": 2746 + }, + { + "epoch": 0.4300250469630557, + "grad_norm": 2.175323963165283, + "learning_rate": 8.175301401107853e-05, + "loss": 0.6014, + "step": 2747 + }, + { + "epoch": 0.43018159048215404, + "grad_norm": 3.053974151611328, + "learning_rate": 8.174486803519062e-05, + "loss": 0.6178, + "step": 2748 + }, + { + "epoch": 0.43033813400125237, + "grad_norm": 4.045997142791748, + "learning_rate": 8.173672205930271e-05, + "loss": 0.6687, + "step": 2749 + }, + { + "epoch": 0.43049467752035064, + "grad_norm": 6.878659248352051, + "learning_rate": 8.17285760834148e-05, + "loss": 1.1488, + "step": 2750 + }, + { + "epoch": 0.430651221039449, + "grad_norm": 0.8808267712593079, + "learning_rate": 8.172043010752689e-05, + "loss": 0.3215, + "step": 2751 + }, + { + "epoch": 0.4308077645585473, + "grad_norm": 0.7655764818191528, + "learning_rate": 8.171228413163897e-05, + "loss": 0.4208, + "step": 2752 + }, + { + "epoch": 0.4309643080776456, + "grad_norm": 0.708686888217926, + "learning_rate": 8.170413815575107e-05, + "loss": 0.3186, + "step": 2753 + }, + { + "epoch": 
0.4311208515967439, + "grad_norm": 0.5763437747955322, + "learning_rate": 8.169599217986315e-05, + "loss": 0.247, + "step": 2754 + }, + { + "epoch": 0.4312773951158422, + "grad_norm": 0.9321039319038391, + "learning_rate": 8.168784620397524e-05, + "loss": 0.3851, + "step": 2755 + }, + { + "epoch": 0.4314339386349405, + "grad_norm": 0.6352066993713379, + "learning_rate": 8.167970022808733e-05, + "loss": 0.285, + "step": 2756 + }, + { + "epoch": 0.43159048215403883, + "grad_norm": 1.2994331121444702, + "learning_rate": 8.167155425219942e-05, + "loss": 0.5495, + "step": 2757 + }, + { + "epoch": 0.4317470256731371, + "grad_norm": 1.0662486553192139, + "learning_rate": 8.16634082763115e-05, + "loss": 0.4637, + "step": 2758 + }, + { + "epoch": 0.43190356919223544, + "grad_norm": 0.8653490543365479, + "learning_rate": 8.16552623004236e-05, + "loss": 0.2882, + "step": 2759 + }, + { + "epoch": 0.43206011271133377, + "grad_norm": 1.2584497928619385, + "learning_rate": 8.164711632453568e-05, + "loss": 0.5538, + "step": 2760 + }, + { + "epoch": 0.43221665623043204, + "grad_norm": 1.1589221954345703, + "learning_rate": 8.163897034864777e-05, + "loss": 0.4161, + "step": 2761 + }, + { + "epoch": 0.43237319974953037, + "grad_norm": 1.094875693321228, + "learning_rate": 8.163082437275986e-05, + "loss": 0.392, + "step": 2762 + }, + { + "epoch": 0.4325297432686287, + "grad_norm": 1.1767079830169678, + "learning_rate": 8.162267839687195e-05, + "loss": 0.3056, + "step": 2763 + }, + { + "epoch": 0.43268628678772697, + "grad_norm": 1.4692115783691406, + "learning_rate": 8.161453242098403e-05, + "loss": 0.6779, + "step": 2764 + }, + { + "epoch": 0.4328428303068253, + "grad_norm": 1.4015921354293823, + "learning_rate": 8.160638644509613e-05, + "loss": 0.3869, + "step": 2765 + }, + { + "epoch": 0.43299937382592363, + "grad_norm": 3.1475203037261963, + "learning_rate": 8.159824046920823e-05, + "loss": 0.7218, + "step": 2766 + }, + { + "epoch": 0.4331559173450219, + "grad_norm": 
1.400355577468872, + "learning_rate": 8.15900944933203e-05, + "loss": 0.6133, + "step": 2767 + }, + { + "epoch": 0.43331246086412023, + "grad_norm": 1.239412546157837, + "learning_rate": 8.15819485174324e-05, + "loss": 0.4952, + "step": 2768 + }, + { + "epoch": 0.43346900438321856, + "grad_norm": 3.6992623805999756, + "learning_rate": 8.157380254154449e-05, + "loss": 1.053, + "step": 2769 + }, + { + "epoch": 0.43362554790231683, + "grad_norm": 2.1929450035095215, + "learning_rate": 8.156565656565656e-05, + "loss": 0.5503, + "step": 2770 + }, + { + "epoch": 0.43378209142141516, + "grad_norm": 2.5561177730560303, + "learning_rate": 8.155751058976866e-05, + "loss": 1.0566, + "step": 2771 + }, + { + "epoch": 0.4339386349405135, + "grad_norm": 1.5914829969406128, + "learning_rate": 8.154936461388076e-05, + "loss": 0.4435, + "step": 2772 + }, + { + "epoch": 0.43409517845961176, + "grad_norm": 1.6459897756576538, + "learning_rate": 8.154121863799284e-05, + "loss": 0.638, + "step": 2773 + }, + { + "epoch": 0.4342517219787101, + "grad_norm": 2.08575177192688, + "learning_rate": 8.153307266210492e-05, + "loss": 0.6343, + "step": 2774 + }, + { + "epoch": 0.43440826549780837, + "grad_norm": 2.0982089042663574, + "learning_rate": 8.152492668621702e-05, + "loss": 0.5403, + "step": 2775 + }, + { + "epoch": 0.4345648090169067, + "grad_norm": 1.641592025756836, + "learning_rate": 8.15167807103291e-05, + "loss": 0.5415, + "step": 2776 + }, + { + "epoch": 0.434721352536005, + "grad_norm": 1.9232577085494995, + "learning_rate": 8.150863473444119e-05, + "loss": 0.5638, + "step": 2777 + }, + { + "epoch": 0.4348778960551033, + "grad_norm": 3.3794970512390137, + "learning_rate": 8.150048875855329e-05, + "loss": 1.0807, + "step": 2778 + }, + { + "epoch": 0.4350344395742016, + "grad_norm": 1.810838222503662, + "learning_rate": 8.149234278266537e-05, + "loss": 0.5455, + "step": 2779 + }, + { + "epoch": 0.43519098309329995, + "grad_norm": 3.3545122146606445, + "learning_rate": 
8.148419680677745e-05, + "loss": 0.6105, + "step": 2780 + }, + { + "epoch": 0.43534752661239823, + "grad_norm": 2.4081473350524902, + "learning_rate": 8.147605083088955e-05, + "loss": 0.986, + "step": 2781 + }, + { + "epoch": 0.43550407013149656, + "grad_norm": 5.119198799133301, + "learning_rate": 8.146790485500163e-05, + "loss": 0.8484, + "step": 2782 + }, + { + "epoch": 0.4356606136505949, + "grad_norm": 2.0117993354797363, + "learning_rate": 8.145975887911372e-05, + "loss": 0.9127, + "step": 2783 + }, + { + "epoch": 0.43581715716969316, + "grad_norm": 4.1639323234558105, + "learning_rate": 8.145161290322582e-05, + "loss": 1.2778, + "step": 2784 + }, + { + "epoch": 0.4359737006887915, + "grad_norm": 1.7116960287094116, + "learning_rate": 8.14434669273379e-05, + "loss": 0.7166, + "step": 2785 + }, + { + "epoch": 0.4361302442078898, + "grad_norm": 1.6969119310379028, + "learning_rate": 8.143532095144998e-05, + "loss": 0.7605, + "step": 2786 + }, + { + "epoch": 0.4362867877269881, + "grad_norm": 4.299317836761475, + "learning_rate": 8.142717497556208e-05, + "loss": 1.0385, + "step": 2787 + }, + { + "epoch": 0.4364433312460864, + "grad_norm": 3.8228604793548584, + "learning_rate": 8.141902899967416e-05, + "loss": 1.3055, + "step": 2788 + }, + { + "epoch": 0.43659987476518475, + "grad_norm": 2.465538740158081, + "learning_rate": 8.141088302378626e-05, + "loss": 1.4366, + "step": 2789 + }, + { + "epoch": 0.436756418284283, + "grad_norm": 3.733886957168579, + "learning_rate": 8.140273704789834e-05, + "loss": 1.3028, + "step": 2790 + }, + { + "epoch": 0.43691296180338135, + "grad_norm": 2.5048134326934814, + "learning_rate": 8.139459107201043e-05, + "loss": 1.0638, + "step": 2791 + }, + { + "epoch": 0.4370695053224796, + "grad_norm": 2.0961997509002686, + "learning_rate": 8.138644509612253e-05, + "loss": 1.0617, + "step": 2792 + }, + { + "epoch": 0.43722604884157795, + "grad_norm": 5.24486780166626, + "learning_rate": 8.137829912023461e-05, + "loss": 1.5426, + "step": 
2793 + }, + { + "epoch": 0.4373825923606763, + "grad_norm": 3.1121582984924316, + "learning_rate": 8.137015314434669e-05, + "loss": 1.1879, + "step": 2794 + }, + { + "epoch": 0.43753913587977455, + "grad_norm": 1.8506050109863281, + "learning_rate": 8.136200716845879e-05, + "loss": 1.1274, + "step": 2795 + }, + { + "epoch": 0.4376956793988729, + "grad_norm": 3.177858829498291, + "learning_rate": 8.135386119257087e-05, + "loss": 0.5567, + "step": 2796 + }, + { + "epoch": 0.4378522229179712, + "grad_norm": 1.8846962451934814, + "learning_rate": 8.134571521668296e-05, + "loss": 0.6096, + "step": 2797 + }, + { + "epoch": 0.4380087664370695, + "grad_norm": 1.6052924394607544, + "learning_rate": 8.133756924079506e-05, + "loss": 0.6183, + "step": 2798 + }, + { + "epoch": 0.4381653099561678, + "grad_norm": 3.840191125869751, + "learning_rate": 8.132942326490714e-05, + "loss": 1.6486, + "step": 2799 + }, + { + "epoch": 0.43832185347526614, + "grad_norm": 4.938685894012451, + "learning_rate": 8.132127728901922e-05, + "loss": 1.1185, + "step": 2800 + }, + { + "epoch": 0.4384783969943644, + "grad_norm": 1.1183987855911255, + "learning_rate": 8.131313131313132e-05, + "loss": 0.4552, + "step": 2801 + }, + { + "epoch": 0.43863494051346275, + "grad_norm": 0.6097233295440674, + "learning_rate": 8.130498533724342e-05, + "loss": 0.2175, + "step": 2802 + }, + { + "epoch": 0.4387914840325611, + "grad_norm": 0.849781334400177, + "learning_rate": 8.129683936135549e-05, + "loss": 0.3373, + "step": 2803 + }, + { + "epoch": 0.43894802755165935, + "grad_norm": 2.527769088745117, + "learning_rate": 8.128869338546758e-05, + "loss": 0.8349, + "step": 2804 + }, + { + "epoch": 0.4391045710707577, + "grad_norm": 1.8236910104751587, + "learning_rate": 8.128054740957968e-05, + "loss": 0.3665, + "step": 2805 + }, + { + "epoch": 0.439261114589856, + "grad_norm": 0.7921452522277832, + "learning_rate": 8.127240143369175e-05, + "loss": 0.4148, + "step": 2806 + }, + { + "epoch": 0.4394176581089543, + 
"grad_norm": 1.3364264965057373, + "learning_rate": 8.126425545780385e-05, + "loss": 0.3419, + "step": 2807 + }, + { + "epoch": 0.4395742016280526, + "grad_norm": 0.925592839717865, + "learning_rate": 8.125610948191595e-05, + "loss": 0.4457, + "step": 2808 + }, + { + "epoch": 0.4397307451471509, + "grad_norm": 0.7285001873970032, + "learning_rate": 8.124796350602803e-05, + "loss": 0.3958, + "step": 2809 + }, + { + "epoch": 0.4398872886662492, + "grad_norm": 0.8011772632598877, + "learning_rate": 8.123981753014011e-05, + "loss": 0.385, + "step": 2810 + }, + { + "epoch": 0.44004383218534754, + "grad_norm": 0.76143479347229, + "learning_rate": 8.12316715542522e-05, + "loss": 0.3175, + "step": 2811 + }, + { + "epoch": 0.4402003757044458, + "grad_norm": 1.9794632196426392, + "learning_rate": 8.12235255783643e-05, + "loss": 0.5064, + "step": 2812 + }, + { + "epoch": 0.44035691922354414, + "grad_norm": 1.5266938209533691, + "learning_rate": 8.121537960247638e-05, + "loss": 0.5092, + "step": 2813 + }, + { + "epoch": 0.44051346274264247, + "grad_norm": 1.9251618385314941, + "learning_rate": 8.120723362658846e-05, + "loss": 0.6696, + "step": 2814 + }, + { + "epoch": 0.44067000626174074, + "grad_norm": 2.121750593185425, + "learning_rate": 8.119908765070056e-05, + "loss": 0.4921, + "step": 2815 + }, + { + "epoch": 0.44082654978083907, + "grad_norm": 1.9971908330917358, + "learning_rate": 8.119094167481264e-05, + "loss": 0.5273, + "step": 2816 + }, + { + "epoch": 0.4409830932999374, + "grad_norm": 2.8368053436279297, + "learning_rate": 8.118279569892473e-05, + "loss": 0.7584, + "step": 2817 + }, + { + "epoch": 0.4411396368190357, + "grad_norm": 3.1812362670898438, + "learning_rate": 8.117464972303683e-05, + "loss": 0.6765, + "step": 2818 + }, + { + "epoch": 0.441296180338134, + "grad_norm": 1.5998988151550293, + "learning_rate": 8.116650374714891e-05, + "loss": 0.4153, + "step": 2819 + }, + { + "epoch": 0.44145272385723233, + "grad_norm": 3.547515630722046, + "learning_rate": 
8.115835777126099e-05, + "loss": 0.9198, + "step": 2820 + }, + { + "epoch": 0.4416092673763306, + "grad_norm": 1.8574042320251465, + "learning_rate": 8.115021179537309e-05, + "loss": 0.3401, + "step": 2821 + }, + { + "epoch": 0.44176581089542893, + "grad_norm": 2.21220064163208, + "learning_rate": 8.114206581948517e-05, + "loss": 0.5619, + "step": 2822 + }, + { + "epoch": 0.44192235441452726, + "grad_norm": 2.2149124145507812, + "learning_rate": 8.113391984359726e-05, + "loss": 0.6101, + "step": 2823 + }, + { + "epoch": 0.44207889793362554, + "grad_norm": 1.7646706104278564, + "learning_rate": 8.112577386770935e-05, + "loss": 0.8221, + "step": 2824 + }, + { + "epoch": 0.44223544145272387, + "grad_norm": 2.203505039215088, + "learning_rate": 8.111762789182145e-05, + "loss": 0.7751, + "step": 2825 + }, + { + "epoch": 0.4423919849718222, + "grad_norm": 2.007540225982666, + "learning_rate": 8.110948191593352e-05, + "loss": 0.7258, + "step": 2826 + }, + { + "epoch": 0.44254852849092047, + "grad_norm": 1.4505175352096558, + "learning_rate": 8.110133594004562e-05, + "loss": 0.4594, + "step": 2827 + }, + { + "epoch": 0.4427050720100188, + "grad_norm": 2.781369924545288, + "learning_rate": 8.109318996415772e-05, + "loss": 0.5581, + "step": 2828 + }, + { + "epoch": 0.44286161552911707, + "grad_norm": 1.7649500370025635, + "learning_rate": 8.108504398826979e-05, + "loss": 0.6404, + "step": 2829 + }, + { + "epoch": 0.4430181590482154, + "grad_norm": 2.238065481185913, + "learning_rate": 8.107689801238188e-05, + "loss": 0.8351, + "step": 2830 + }, + { + "epoch": 0.4431747025673137, + "grad_norm": 6.362175464630127, + "learning_rate": 8.106875203649398e-05, + "loss": 1.2868, + "step": 2831 + }, + { + "epoch": 0.443331246086412, + "grad_norm": 3.6732707023620605, + "learning_rate": 8.106060606060607e-05, + "loss": 1.2233, + "step": 2832 + }, + { + "epoch": 0.44348778960551033, + "grad_norm": 4.408255577087402, + "learning_rate": 8.105246008471815e-05, + "loss": 0.5924, + "step": 
2833 + }, + { + "epoch": 0.44364433312460866, + "grad_norm": 4.937877178192139, + "learning_rate": 8.104431410883025e-05, + "loss": 1.6611, + "step": 2834 + }, + { + "epoch": 0.44380087664370693, + "grad_norm": 4.452988624572754, + "learning_rate": 8.103616813294233e-05, + "loss": 1.0125, + "step": 2835 + }, + { + "epoch": 0.44395742016280526, + "grad_norm": 2.610893964767456, + "learning_rate": 8.102802215705441e-05, + "loss": 1.0911, + "step": 2836 + }, + { + "epoch": 0.4441139636819036, + "grad_norm": 2.6124930381774902, + "learning_rate": 8.101987618116651e-05, + "loss": 1.3688, + "step": 2837 + }, + { + "epoch": 0.44427050720100186, + "grad_norm": 1.7294862270355225, + "learning_rate": 8.10117302052786e-05, + "loss": 0.8647, + "step": 2838 + }, + { + "epoch": 0.4444270507201002, + "grad_norm": 9.32268238067627, + "learning_rate": 8.100358422939068e-05, + "loss": 1.1029, + "step": 2839 + }, + { + "epoch": 0.4445835942391985, + "grad_norm": 3.255897045135498, + "learning_rate": 8.099543825350278e-05, + "loss": 1.7215, + "step": 2840 + }, + { + "epoch": 0.4447401377582968, + "grad_norm": 7.8426737785339355, + "learning_rate": 8.098729227761486e-05, + "loss": 1.3397, + "step": 2841 + }, + { + "epoch": 0.4448966812773951, + "grad_norm": 4.426883220672607, + "learning_rate": 8.097914630172694e-05, + "loss": 1.136, + "step": 2842 + }, + { + "epoch": 0.44505322479649345, + "grad_norm": 5.609326362609863, + "learning_rate": 8.097100032583904e-05, + "loss": 1.1354, + "step": 2843 + }, + { + "epoch": 0.4452097683155917, + "grad_norm": 3.4451870918273926, + "learning_rate": 8.096285434995112e-05, + "loss": 1.0447, + "step": 2844 + }, + { + "epoch": 0.44536631183469005, + "grad_norm": 3.29667592048645, + "learning_rate": 8.095470837406321e-05, + "loss": 1.1328, + "step": 2845 + }, + { + "epoch": 0.4455228553537883, + "grad_norm": 4.043338298797607, + "learning_rate": 8.09465623981753e-05, + "loss": 0.6741, + "step": 2846 + }, + { + "epoch": 0.44567939887288666, + 
"grad_norm": 1.4747103452682495, + "learning_rate": 8.093841642228739e-05, + "loss": 0.4326, + "step": 2847 + }, + { + "epoch": 0.445835942391985, + "grad_norm": 3.0496132373809814, + "learning_rate": 8.093027044639949e-05, + "loss": 0.9729, + "step": 2848 + }, + { + "epoch": 0.44599248591108326, + "grad_norm": 2.254161834716797, + "learning_rate": 8.092212447051157e-05, + "loss": 1.0143, + "step": 2849 + }, + { + "epoch": 0.4461490294301816, + "grad_norm": 2.4551901817321777, + "learning_rate": 8.091397849462365e-05, + "loss": 0.7519, + "step": 2850 + }, + { + "epoch": 0.4463055729492799, + "grad_norm": 0.6084877252578735, + "learning_rate": 8.090583251873575e-05, + "loss": 0.3479, + "step": 2851 + }, + { + "epoch": 0.4464621164683782, + "grad_norm": 0.621954619884491, + "learning_rate": 8.089768654284784e-05, + "loss": 0.2487, + "step": 2852 + }, + { + "epoch": 0.4466186599874765, + "grad_norm": 0.6140806674957275, + "learning_rate": 8.088954056695992e-05, + "loss": 0.2703, + "step": 2853 + }, + { + "epoch": 0.44677520350657485, + "grad_norm": 0.6374353170394897, + "learning_rate": 8.088139459107202e-05, + "loss": 0.2897, + "step": 2854 + }, + { + "epoch": 0.4469317470256731, + "grad_norm": 1.1576578617095947, + "learning_rate": 8.08732486151841e-05, + "loss": 0.358, + "step": 2855 + }, + { + "epoch": 0.44708829054477145, + "grad_norm": 0.6152721047401428, + "learning_rate": 8.086510263929618e-05, + "loss": 0.2975, + "step": 2856 + }, + { + "epoch": 0.4472448340638698, + "grad_norm": 1.2797480821609497, + "learning_rate": 8.085695666340828e-05, + "loss": 0.3901, + "step": 2857 + }, + { + "epoch": 0.44740137758296805, + "grad_norm": 1.2719287872314453, + "learning_rate": 8.084881068752036e-05, + "loss": 0.345, + "step": 2858 + }, + { + "epoch": 0.4475579211020664, + "grad_norm": 1.331619381904602, + "learning_rate": 8.084066471163245e-05, + "loss": 0.4928, + "step": 2859 + }, + { + "epoch": 0.4477144646211647, + "grad_norm": 1.2656965255737305, + "learning_rate": 
8.083251873574455e-05, + "loss": 0.4831, + "step": 2860 + }, + { + "epoch": 0.447871008140263, + "grad_norm": 1.061784029006958, + "learning_rate": 8.082437275985664e-05, + "loss": 0.4977, + "step": 2861 + }, + { + "epoch": 0.4480275516593613, + "grad_norm": 1.010353922843933, + "learning_rate": 8.081622678396871e-05, + "loss": 0.3572, + "step": 2862 + }, + { + "epoch": 0.4481840951784596, + "grad_norm": 3.2566840648651123, + "learning_rate": 8.080808080808081e-05, + "loss": 0.6112, + "step": 2863 + }, + { + "epoch": 0.4483406386975579, + "grad_norm": 1.2826138734817505, + "learning_rate": 8.079993483219291e-05, + "loss": 0.6499, + "step": 2864 + }, + { + "epoch": 0.44849718221665624, + "grad_norm": 1.0976066589355469, + "learning_rate": 8.079178885630498e-05, + "loss": 0.4306, + "step": 2865 + }, + { + "epoch": 0.4486537257357545, + "grad_norm": 1.7329628467559814, + "learning_rate": 8.078364288041708e-05, + "loss": 0.6414, + "step": 2866 + }, + { + "epoch": 0.44881026925485284, + "grad_norm": 0.8268832564353943, + "learning_rate": 8.077549690452917e-05, + "loss": 0.5144, + "step": 2867 + }, + { + "epoch": 0.4489668127739512, + "grad_norm": 1.1173930168151855, + "learning_rate": 8.076735092864126e-05, + "loss": 0.4481, + "step": 2868 + }, + { + "epoch": 0.44912335629304945, + "grad_norm": 0.8843953013420105, + "learning_rate": 8.075920495275334e-05, + "loss": 0.4932, + "step": 2869 + }, + { + "epoch": 0.4492798998121478, + "grad_norm": 1.206457257270813, + "learning_rate": 8.075105897686544e-05, + "loss": 0.5172, + "step": 2870 + }, + { + "epoch": 0.4494364433312461, + "grad_norm": 4.525677680969238, + "learning_rate": 8.074291300097752e-05, + "loss": 0.937, + "step": 2871 + }, + { + "epoch": 0.4495929868503444, + "grad_norm": 2.7271652221679688, + "learning_rate": 8.07347670250896e-05, + "loss": 0.6098, + "step": 2872 + }, + { + "epoch": 0.4497495303694427, + "grad_norm": 1.1077992916107178, + "learning_rate": 8.07266210492017e-05, + "loss": 0.4499, + "step": 
2873 + }, + { + "epoch": 0.44990607388854104, + "grad_norm": 2.480269193649292, + "learning_rate": 8.071847507331379e-05, + "loss": 0.592, + "step": 2874 + }, + { + "epoch": 0.4500626174076393, + "grad_norm": 1.6211040019989014, + "learning_rate": 8.071032909742587e-05, + "loss": 0.7779, + "step": 2875 + }, + { + "epoch": 0.45021916092673764, + "grad_norm": 2.1506128311157227, + "learning_rate": 8.070218312153797e-05, + "loss": 0.8519, + "step": 2876 + }, + { + "epoch": 0.45037570444583597, + "grad_norm": 1.469681739807129, + "learning_rate": 8.069403714565005e-05, + "loss": 0.4737, + "step": 2877 + }, + { + "epoch": 0.45053224796493424, + "grad_norm": 1.9083775281906128, + "learning_rate": 8.068589116976213e-05, + "loss": 0.6458, + "step": 2878 + }, + { + "epoch": 0.45068879148403257, + "grad_norm": 1.637611985206604, + "learning_rate": 8.067774519387423e-05, + "loss": 0.4407, + "step": 2879 + }, + { + "epoch": 0.45084533500313084, + "grad_norm": 4.772591590881348, + "learning_rate": 8.066959921798632e-05, + "loss": 0.7186, + "step": 2880 + }, + { + "epoch": 0.45100187852222917, + "grad_norm": 1.9295397996902466, + "learning_rate": 8.06614532420984e-05, + "loss": 0.4756, + "step": 2881 + }, + { + "epoch": 0.4511584220413275, + "grad_norm": 2.3159728050231934, + "learning_rate": 8.06533072662105e-05, + "loss": 0.6991, + "step": 2882 + }, + { + "epoch": 0.4513149655604258, + "grad_norm": 2.965216875076294, + "learning_rate": 8.064516129032258e-05, + "loss": 1.2951, + "step": 2883 + }, + { + "epoch": 0.4514715090795241, + "grad_norm": 2.8041718006134033, + "learning_rate": 8.063701531443468e-05, + "loss": 1.0032, + "step": 2884 + }, + { + "epoch": 0.45162805259862243, + "grad_norm": 1.1280226707458496, + "learning_rate": 8.062886933854676e-05, + "loss": 0.6525, + "step": 2885 + }, + { + "epoch": 0.4517845961177207, + "grad_norm": 2.1965789794921875, + "learning_rate": 8.062072336265885e-05, + "loss": 1.1351, + "step": 2886 + }, + { + "epoch": 0.45194113963681903, + 
"grad_norm": 2.9778056144714355, + "learning_rate": 8.061257738677094e-05, + "loss": 0.8442, + "step": 2887 + }, + { + "epoch": 0.45209768315591736, + "grad_norm": 3.709789991378784, + "learning_rate": 8.060443141088303e-05, + "loss": 0.9229, + "step": 2888 + }, + { + "epoch": 0.45225422667501564, + "grad_norm": 2.7435851097106934, + "learning_rate": 8.059628543499511e-05, + "loss": 1.2257, + "step": 2889 + }, + { + "epoch": 0.45241077019411396, + "grad_norm": 3.169971466064453, + "learning_rate": 8.058813945910721e-05, + "loss": 1.0581, + "step": 2890 + }, + { + "epoch": 0.4525673137132123, + "grad_norm": 2.478529691696167, + "learning_rate": 8.057999348321929e-05, + "loss": 1.5872, + "step": 2891 + }, + { + "epoch": 0.45272385723231057, + "grad_norm": 2.0431578159332275, + "learning_rate": 8.057184750733137e-05, + "loss": 1.3797, + "step": 2892 + }, + { + "epoch": 0.4528804007514089, + "grad_norm": 2.083552360534668, + "learning_rate": 8.056370153144347e-05, + "loss": 0.9417, + "step": 2893 + }, + { + "epoch": 0.4530369442705072, + "grad_norm": 2.4454681873321533, + "learning_rate": 8.055555555555556e-05, + "loss": 1.5042, + "step": 2894 + }, + { + "epoch": 0.4531934877896055, + "grad_norm": 3.4426209926605225, + "learning_rate": 8.054740957966764e-05, + "loss": 1.267, + "step": 2895 + }, + { + "epoch": 0.4533500313087038, + "grad_norm": 2.918935537338257, + "learning_rate": 8.053926360377974e-05, + "loss": 1.1165, + "step": 2896 + }, + { + "epoch": 0.45350657482780216, + "grad_norm": 2.2548668384552, + "learning_rate": 8.053111762789183e-05, + "loss": 0.5738, + "step": 2897 + }, + { + "epoch": 0.45366311834690043, + "grad_norm": 3.0210070610046387, + "learning_rate": 8.05229716520039e-05, + "loss": 0.8073, + "step": 2898 + }, + { + "epoch": 0.45381966186599876, + "grad_norm": 3.067561149597168, + "learning_rate": 8.0514825676116e-05, + "loss": 1.4626, + "step": 2899 + }, + { + "epoch": 0.45397620538509703, + "grad_norm": 4.764838695526123, + "learning_rate": 
8.05066797002281e-05, + "loss": 0.9637, + "step": 2900 + }, + { + "epoch": 0.45413274890419536, + "grad_norm": 0.880632758140564, + "learning_rate": 8.049853372434017e-05, + "loss": 0.4435, + "step": 2901 + }, + { + "epoch": 0.4542892924232937, + "grad_norm": 0.6907128095626831, + "learning_rate": 8.049038774845227e-05, + "loss": 0.3356, + "step": 2902 + }, + { + "epoch": 0.45444583594239196, + "grad_norm": 1.4330724477767944, + "learning_rate": 8.048224177256436e-05, + "loss": 0.426, + "step": 2903 + }, + { + "epoch": 0.4546023794614903, + "grad_norm": 0.7611360549926758, + "learning_rate": 8.047409579667643e-05, + "loss": 0.3542, + "step": 2904 + }, + { + "epoch": 0.4547589229805886, + "grad_norm": 0.8586251735687256, + "learning_rate": 8.046594982078853e-05, + "loss": 0.2994, + "step": 2905 + }, + { + "epoch": 0.4549154664996869, + "grad_norm": 1.0436460971832275, + "learning_rate": 8.045780384490063e-05, + "loss": 0.2529, + "step": 2906 + }, + { + "epoch": 0.4550720100187852, + "grad_norm": 0.833238959312439, + "learning_rate": 8.044965786901271e-05, + "loss": 0.442, + "step": 2907 + }, + { + "epoch": 0.45522855353788355, + "grad_norm": 1.357629418373108, + "learning_rate": 8.04415118931248e-05, + "loss": 0.4791, + "step": 2908 + }, + { + "epoch": 0.4553850970569818, + "grad_norm": 1.1602712869644165, + "learning_rate": 8.04333659172369e-05, + "loss": 0.3879, + "step": 2909 + }, + { + "epoch": 0.45554164057608015, + "grad_norm": 1.6583753824234009, + "learning_rate": 8.042521994134898e-05, + "loss": 0.3549, + "step": 2910 + }, + { + "epoch": 0.4556981840951785, + "grad_norm": 0.8885524272918701, + "learning_rate": 8.041707396546106e-05, + "loss": 0.4162, + "step": 2911 + }, + { + "epoch": 0.45585472761427676, + "grad_norm": 0.9540667533874512, + "learning_rate": 8.040892798957316e-05, + "loss": 0.383, + "step": 2912 + }, + { + "epoch": 0.4560112711333751, + "grad_norm": 1.80681312084198, + "learning_rate": 8.040078201368524e-05, + "loss": 0.4041, + "step": 2913 
+ }, + { + "epoch": 0.4561678146524734, + "grad_norm": 0.8169831037521362, + "learning_rate": 8.039263603779733e-05, + "loss": 0.3124, + "step": 2914 + }, + { + "epoch": 0.4563243581715717, + "grad_norm": 1.9845348596572876, + "learning_rate": 8.038449006190942e-05, + "loss": 0.3836, + "step": 2915 + }, + { + "epoch": 0.45648090169067, + "grad_norm": 1.127164602279663, + "learning_rate": 8.037634408602151e-05, + "loss": 0.3401, + "step": 2916 + }, + { + "epoch": 0.4566374452097683, + "grad_norm": 2.195247173309326, + "learning_rate": 8.036819811013359e-05, + "loss": 0.5011, + "step": 2917 + }, + { + "epoch": 0.4567939887288666, + "grad_norm": 1.6334741115570068, + "learning_rate": 8.036005213424569e-05, + "loss": 0.5538, + "step": 2918 + }, + { + "epoch": 0.45695053224796495, + "grad_norm": 1.1482656002044678, + "learning_rate": 8.035190615835777e-05, + "loss": 0.5252, + "step": 2919 + }, + { + "epoch": 0.4571070757670632, + "grad_norm": 2.2566323280334473, + "learning_rate": 8.034376018246987e-05, + "loss": 0.3978, + "step": 2920 + }, + { + "epoch": 0.45726361928616155, + "grad_norm": 1.7191792726516724, + "learning_rate": 8.033561420658195e-05, + "loss": 0.4205, + "step": 2921 + }, + { + "epoch": 0.4574201628052599, + "grad_norm": 3.8777246475219727, + "learning_rate": 8.032746823069404e-05, + "loss": 0.5352, + "step": 2922 + }, + { + "epoch": 0.45757670632435815, + "grad_norm": 4.0529704093933105, + "learning_rate": 8.031932225480613e-05, + "loss": 0.6543, + "step": 2923 + }, + { + "epoch": 0.4577332498434565, + "grad_norm": 1.5614620447158813, + "learning_rate": 8.031117627891822e-05, + "loss": 0.8087, + "step": 2924 + }, + { + "epoch": 0.4578897933625548, + "grad_norm": 3.8586857318878174, + "learning_rate": 8.03030303030303e-05, + "loss": 1.1218, + "step": 2925 + }, + { + "epoch": 0.4580463368816531, + "grad_norm": 1.5215699672698975, + "learning_rate": 8.02948843271424e-05, + "loss": 0.4452, + "step": 2926 + }, + { + "epoch": 0.4582028804007514, + 
"grad_norm": 2.3636622428894043, + "learning_rate": 8.028673835125448e-05, + "loss": 0.6079, + "step": 2927 + }, + { + "epoch": 0.45835942391984974, + "grad_norm": 2.3164937496185303, + "learning_rate": 8.027859237536657e-05, + "loss": 0.5823, + "step": 2928 + }, + { + "epoch": 0.458515967438948, + "grad_norm": 2.0444822311401367, + "learning_rate": 8.027044639947866e-05, + "loss": 0.6522, + "step": 2929 + }, + { + "epoch": 0.45867251095804634, + "grad_norm": 2.5238804817199707, + "learning_rate": 8.026230042359075e-05, + "loss": 0.7097, + "step": 2930 + }, + { + "epoch": 0.45882905447714467, + "grad_norm": 1.7820358276367188, + "learning_rate": 8.025415444770283e-05, + "loss": 0.5827, + "step": 2931 + }, + { + "epoch": 0.45898559799624294, + "grad_norm": 1.7143723964691162, + "learning_rate": 8.024600847181493e-05, + "loss": 0.782, + "step": 2932 + }, + { + "epoch": 0.4591421415153413, + "grad_norm": 2.3868014812469482, + "learning_rate": 8.023786249592701e-05, + "loss": 0.86, + "step": 2933 + }, + { + "epoch": 0.45929868503443955, + "grad_norm": 2.925853729248047, + "learning_rate": 8.02297165200391e-05, + "loss": 0.7841, + "step": 2934 + }, + { + "epoch": 0.4594552285535379, + "grad_norm": 2.320922374725342, + "learning_rate": 8.022157054415119e-05, + "loss": 0.9145, + "step": 2935 + }, + { + "epoch": 0.4596117720726362, + "grad_norm": 3.774169445037842, + "learning_rate": 8.021342456826329e-05, + "loss": 1.1686, + "step": 2936 + }, + { + "epoch": 0.4597683155917345, + "grad_norm": 2.380906105041504, + "learning_rate": 8.020527859237536e-05, + "loss": 1.6684, + "step": 2937 + }, + { + "epoch": 0.4599248591108328, + "grad_norm": 3.4618568420410156, + "learning_rate": 8.019713261648746e-05, + "loss": 0.7996, + "step": 2938 + }, + { + "epoch": 0.46008140262993114, + "grad_norm": 3.303272247314453, + "learning_rate": 8.018898664059956e-05, + "loss": 1.1457, + "step": 2939 + }, + { + "epoch": 0.4602379461490294, + "grad_norm": 4.753242015838623, + "learning_rate": 
8.018084066471163e-05, + "loss": 1.4082, + "step": 2940 + }, + { + "epoch": 0.46039448966812774, + "grad_norm": 2.786158800125122, + "learning_rate": 8.017269468882372e-05, + "loss": 0.8528, + "step": 2941 + }, + { + "epoch": 0.46055103318722607, + "grad_norm": 3.4040772914886475, + "learning_rate": 8.016454871293582e-05, + "loss": 0.9022, + "step": 2942 + }, + { + "epoch": 0.46070757670632434, + "grad_norm": 2.9747164249420166, + "learning_rate": 8.01564027370479e-05, + "loss": 1.8677, + "step": 2943 + }, + { + "epoch": 0.46086412022542267, + "grad_norm": 2.2364447116851807, + "learning_rate": 8.014825676115999e-05, + "loss": 1.1399, + "step": 2944 + }, + { + "epoch": 0.461020663744521, + "grad_norm": 4.275374412536621, + "learning_rate": 8.014011078527208e-05, + "loss": 1.6311, + "step": 2945 + }, + { + "epoch": 0.46117720726361927, + "grad_norm": 2.815577268600464, + "learning_rate": 8.013196480938417e-05, + "loss": 1.3185, + "step": 2946 + }, + { + "epoch": 0.4613337507827176, + "grad_norm": 3.5154199600219727, + "learning_rate": 8.012381883349625e-05, + "loss": 0.9046, + "step": 2947 + }, + { + "epoch": 0.46149029430181593, + "grad_norm": 3.560213088989258, + "learning_rate": 8.011567285760835e-05, + "loss": 0.9701, + "step": 2948 + }, + { + "epoch": 0.4616468378209142, + "grad_norm": 4.585635185241699, + "learning_rate": 8.010752688172043e-05, + "loss": 1.3578, + "step": 2949 + }, + { + "epoch": 0.46180338134001253, + "grad_norm": 2.460320472717285, + "learning_rate": 8.009938090583252e-05, + "loss": 1.365, + "step": 2950 + }, + { + "epoch": 0.46195992485911086, + "grad_norm": 0.6023199558258057, + "learning_rate": 8.009123492994461e-05, + "loss": 0.279, + "step": 2951 + }, + { + "epoch": 0.46211646837820913, + "grad_norm": 0.8311507105827332, + "learning_rate": 8.00830889540567e-05, + "loss": 0.3695, + "step": 2952 + }, + { + "epoch": 0.46227301189730746, + "grad_norm": 0.754533052444458, + "learning_rate": 8.007494297816878e-05, + "loss": 0.3916, + "step": 
2953 + }, + { + "epoch": 0.46242955541640574, + "grad_norm": 0.8069680333137512, + "learning_rate": 8.006679700228088e-05, + "loss": 0.4487, + "step": 2954 + }, + { + "epoch": 0.46258609893550406, + "grad_norm": 0.8646244406700134, + "learning_rate": 8.005865102639296e-05, + "loss": 0.4007, + "step": 2955 + }, + { + "epoch": 0.4627426424546024, + "grad_norm": 0.5849583148956299, + "learning_rate": 8.005050505050506e-05, + "loss": 0.3721, + "step": 2956 + }, + { + "epoch": 0.46289918597370067, + "grad_norm": 0.7061209678649902, + "learning_rate": 8.004235907461714e-05, + "loss": 0.3212, + "step": 2957 + }, + { + "epoch": 0.463055729492799, + "grad_norm": 1.0417126417160034, + "learning_rate": 8.003421309872923e-05, + "loss": 0.4674, + "step": 2958 + }, + { + "epoch": 0.4632122730118973, + "grad_norm": 0.7792390584945679, + "learning_rate": 8.002606712284132e-05, + "loss": 0.3737, + "step": 2959 + }, + { + "epoch": 0.4633688165309956, + "grad_norm": 0.8680147528648376, + "learning_rate": 8.001792114695341e-05, + "loss": 0.3173, + "step": 2960 + }, + { + "epoch": 0.4635253600500939, + "grad_norm": 0.928480863571167, + "learning_rate": 8.000977517106549e-05, + "loss": 0.2971, + "step": 2961 + }, + { + "epoch": 0.46368190356919226, + "grad_norm": 1.0592268705368042, + "learning_rate": 8.000162919517759e-05, + "loss": 0.4757, + "step": 2962 + }, + { + "epoch": 0.46383844708829053, + "grad_norm": 1.7692285776138306, + "learning_rate": 7.999348321928967e-05, + "loss": 0.5246, + "step": 2963 + }, + { + "epoch": 0.46399499060738886, + "grad_norm": 1.0678504705429077, + "learning_rate": 7.998533724340176e-05, + "loss": 0.3945, + "step": 2964 + }, + { + "epoch": 0.4641515341264872, + "grad_norm": 1.1673113107681274, + "learning_rate": 7.997719126751385e-05, + "loss": 0.4767, + "step": 2965 + }, + { + "epoch": 0.46430807764558546, + "grad_norm": 2.0550880432128906, + "learning_rate": 7.996904529162594e-05, + "loss": 0.5958, + "step": 2966 + }, + { + "epoch": 0.4644646211646838, 
+ "grad_norm": 1.6085500717163086, + "learning_rate": 7.996089931573802e-05, + "loss": 0.4966, + "step": 2967 + }, + { + "epoch": 0.4646211646837821, + "grad_norm": 2.1714086532592773, + "learning_rate": 7.995275333985012e-05, + "loss": 0.7369, + "step": 2968 + }, + { + "epoch": 0.4647777082028804, + "grad_norm": 1.75434148311615, + "learning_rate": 7.99446073639622e-05, + "loss": 0.6379, + "step": 2969 + }, + { + "epoch": 0.4649342517219787, + "grad_norm": 7.1084442138671875, + "learning_rate": 7.993646138807429e-05, + "loss": 1.2138, + "step": 2970 + }, + { + "epoch": 0.465090795241077, + "grad_norm": 1.1190106868743896, + "learning_rate": 7.992831541218638e-05, + "loss": 0.4749, + "step": 2971 + }, + { + "epoch": 0.4652473387601753, + "grad_norm": 2.6239376068115234, + "learning_rate": 7.992016943629848e-05, + "loss": 0.7319, + "step": 2972 + }, + { + "epoch": 0.46540388227927365, + "grad_norm": 2.06978702545166, + "learning_rate": 7.991202346041055e-05, + "loss": 0.6013, + "step": 2973 + }, + { + "epoch": 0.4655604257983719, + "grad_norm": 1.6858874559402466, + "learning_rate": 7.990387748452265e-05, + "loss": 0.6621, + "step": 2974 + }, + { + "epoch": 0.46571696931747025, + "grad_norm": 2.2509331703186035, + "learning_rate": 7.989573150863475e-05, + "loss": 0.4703, + "step": 2975 + }, + { + "epoch": 0.4658735128365686, + "grad_norm": 4.607326984405518, + "learning_rate": 7.988758553274682e-05, + "loss": 1.0271, + "step": 2976 + }, + { + "epoch": 0.46603005635566686, + "grad_norm": 2.030940055847168, + "learning_rate": 7.987943955685891e-05, + "loss": 0.49, + "step": 2977 + }, + { + "epoch": 0.4661865998747652, + "grad_norm": 3.6505351066589355, + "learning_rate": 7.987129358097101e-05, + "loss": 1.068, + "step": 2978 + }, + { + "epoch": 0.4663431433938635, + "grad_norm": 3.5613701343536377, + "learning_rate": 7.98631476050831e-05, + "loss": 1.0791, + "step": 2979 + }, + { + "epoch": 0.4664996869129618, + "grad_norm": 2.87314510345459, + "learning_rate": 
7.985500162919518e-05, + "loss": 1.0246, + "step": 2980 + }, + { + "epoch": 0.4666562304320601, + "grad_norm": 2.6351752281188965, + "learning_rate": 7.984685565330728e-05, + "loss": 0.6712, + "step": 2981 + }, + { + "epoch": 0.46681277395115844, + "grad_norm": 2.268951416015625, + "learning_rate": 7.983870967741936e-05, + "loss": 0.6215, + "step": 2982 + }, + { + "epoch": 0.4669693174702567, + "grad_norm": 2.5145928859710693, + "learning_rate": 7.983056370153144e-05, + "loss": 0.8691, + "step": 2983 + }, + { + "epoch": 0.46712586098935505, + "grad_norm": 1.7155604362487793, + "learning_rate": 7.982241772564354e-05, + "loss": 0.8922, + "step": 2984 + }, + { + "epoch": 0.4672824045084534, + "grad_norm": 2.6324117183685303, + "learning_rate": 7.981427174975562e-05, + "loss": 1.0487, + "step": 2985 + }, + { + "epoch": 0.46743894802755165, + "grad_norm": 3.3093745708465576, + "learning_rate": 7.980612577386771e-05, + "loss": 0.7334, + "step": 2986 + }, + { + "epoch": 0.46759549154665, + "grad_norm": 3.431419610977173, + "learning_rate": 7.97979797979798e-05, + "loss": 0.9877, + "step": 2987 + }, + { + "epoch": 0.46775203506574825, + "grad_norm": 3.7362847328186035, + "learning_rate": 7.978983382209189e-05, + "loss": 1.6799, + "step": 2988 + }, + { + "epoch": 0.4679085785848466, + "grad_norm": 3.8402554988861084, + "learning_rate": 7.978168784620397e-05, + "loss": 1.2268, + "step": 2989 + }, + { + "epoch": 0.4680651221039449, + "grad_norm": 6.237053871154785, + "learning_rate": 7.977354187031607e-05, + "loss": 1.3428, + "step": 2990 + }, + { + "epoch": 0.4682216656230432, + "grad_norm": 4.320383548736572, + "learning_rate": 7.976539589442815e-05, + "loss": 1.0942, + "step": 2991 + }, + { + "epoch": 0.4683782091421415, + "grad_norm": 6.43870210647583, + "learning_rate": 7.975724991854024e-05, + "loss": 1.8635, + "step": 2992 + }, + { + "epoch": 0.46853475266123984, + "grad_norm": 2.855269432067871, + "learning_rate": 7.974910394265234e-05, + "loss": 1.4198, + "step": 
2993 + }, + { + "epoch": 0.4686912961803381, + "grad_norm": 4.734680652618408, + "learning_rate": 7.974095796676442e-05, + "loss": 1.5557, + "step": 2994 + }, + { + "epoch": 0.46884783969943644, + "grad_norm": 2.421121597290039, + "learning_rate": 7.973281199087652e-05, + "loss": 1.6742, + "step": 2995 + }, + { + "epoch": 0.46900438321853477, + "grad_norm": 3.5806422233581543, + "learning_rate": 7.97246660149886e-05, + "loss": 1.0197, + "step": 2996 + }, + { + "epoch": 0.46916092673763304, + "grad_norm": 2.148378849029541, + "learning_rate": 7.971652003910068e-05, + "loss": 0.9686, + "step": 2997 + }, + { + "epoch": 0.4693174702567314, + "grad_norm": 3.144960641860962, + "learning_rate": 7.970837406321278e-05, + "loss": 1.3659, + "step": 2998 + }, + { + "epoch": 0.4694740137758297, + "grad_norm": 2.2099525928497314, + "learning_rate": 7.970022808732486e-05, + "loss": 0.8823, + "step": 2999 + }, + { + "epoch": 0.469630557294928, + "grad_norm": 1.8632392883300781, + "learning_rate": 7.969208211143695e-05, + "loss": 0.8798, + "step": 3000 + }, + { + "epoch": 0.469630557294928, + "eval_loss": 0.5889362096786499, + "eval_runtime": 203.432, + "eval_samples_per_second": 60.87, + "eval_steps_per_second": 3.805, + "eval_wer": 0.3603714643942453, + "step": 3000 + }, + { + "epoch": 0.4697871008140263, + "grad_norm": 0.5217509865760803, + "learning_rate": 7.968393613554905e-05, + "loss": 0.2581, + "step": 3001 + }, + { + "epoch": 0.46994364433312463, + "grad_norm": 0.7189522981643677, + "learning_rate": 7.967579015966113e-05, + "loss": 0.304, + "step": 3002 + }, + { + "epoch": 0.4701001878522229, + "grad_norm": 0.5404806137084961, + "learning_rate": 7.966764418377321e-05, + "loss": 0.3036, + "step": 3003 + }, + { + "epoch": 0.47025673137132123, + "grad_norm": 1.112508773803711, + "learning_rate": 7.965949820788531e-05, + "loss": 0.3198, + "step": 3004 + }, + { + "epoch": 0.47041327489041956, + "grad_norm": 0.9956707954406738, + "learning_rate": 7.96513522319974e-05, + "loss": 
0.347, + "step": 3005 + }, + { + "epoch": 0.47056981840951784, + "grad_norm": 0.6068829894065857, + "learning_rate": 7.964320625610948e-05, + "loss": 0.2531, + "step": 3006 + }, + { + "epoch": 0.47072636192861617, + "grad_norm": 0.6633115410804749, + "learning_rate": 7.963506028022158e-05, + "loss": 0.3233, + "step": 3007 + }, + { + "epoch": 0.47088290544771444, + "grad_norm": 2.8153953552246094, + "learning_rate": 7.962691430433367e-05, + "loss": 0.5469, + "step": 3008 + }, + { + "epoch": 0.47103944896681277, + "grad_norm": 0.9949942827224731, + "learning_rate": 7.961876832844574e-05, + "loss": 0.3885, + "step": 3009 + }, + { + "epoch": 0.4711959924859111, + "grad_norm": 1.9298192262649536, + "learning_rate": 7.961062235255784e-05, + "loss": 0.5067, + "step": 3010 + }, + { + "epoch": 0.47135253600500937, + "grad_norm": 4.623177528381348, + "learning_rate": 7.960247637666994e-05, + "loss": 0.6875, + "step": 3011 + }, + { + "epoch": 0.4715090795241077, + "grad_norm": 0.9764482975006104, + "learning_rate": 7.959433040078201e-05, + "loss": 0.2715, + "step": 3012 + }, + { + "epoch": 0.47166562304320603, + "grad_norm": 1.0894864797592163, + "learning_rate": 7.95861844248941e-05, + "loss": 0.452, + "step": 3013 + }, + { + "epoch": 0.4718221665623043, + "grad_norm": 0.8216126561164856, + "learning_rate": 7.95780384490062e-05, + "loss": 0.3838, + "step": 3014 + }, + { + "epoch": 0.47197871008140263, + "grad_norm": 1.4501134157180786, + "learning_rate": 7.956989247311829e-05, + "loss": 0.5088, + "step": 3015 + }, + { + "epoch": 0.47213525360050096, + "grad_norm": 1.7092173099517822, + "learning_rate": 7.956174649723037e-05, + "loss": 0.4638, + "step": 3016 + }, + { + "epoch": 0.47229179711959923, + "grad_norm": 2.019563674926758, + "learning_rate": 7.955360052134247e-05, + "loss": 0.5562, + "step": 3017 + }, + { + "epoch": 0.47244834063869756, + "grad_norm": 1.5207878351211548, + "learning_rate": 7.954545454545455e-05, + "loss": 0.6743, + "step": 3018 + }, + { + "epoch": 
0.4726048841577959, + "grad_norm": 1.4517743587493896, + "learning_rate": 7.953730856956663e-05, + "loss": 0.5109, + "step": 3019 + }, + { + "epoch": 0.47276142767689416, + "grad_norm": 1.5448389053344727, + "learning_rate": 7.952916259367873e-05, + "loss": 0.5216, + "step": 3020 + }, + { + "epoch": 0.4729179711959925, + "grad_norm": 1.7846829891204834, + "learning_rate": 7.952101661779082e-05, + "loss": 0.7073, + "step": 3021 + }, + { + "epoch": 0.4730745147150908, + "grad_norm": 3.1886003017425537, + "learning_rate": 7.95128706419029e-05, + "loss": 0.9668, + "step": 3022 + }, + { + "epoch": 0.4732310582341891, + "grad_norm": 1.9225293397903442, + "learning_rate": 7.9504724666015e-05, + "loss": 0.7439, + "step": 3023 + }, + { + "epoch": 0.4733876017532874, + "grad_norm": 2.4093339443206787, + "learning_rate": 7.949657869012708e-05, + "loss": 1.2156, + "step": 3024 + }, + { + "epoch": 0.4735441452723857, + "grad_norm": 2.250678062438965, + "learning_rate": 7.948843271423916e-05, + "loss": 0.6483, + "step": 3025 + }, + { + "epoch": 0.473700688791484, + "grad_norm": 2.8233745098114014, + "learning_rate": 7.948028673835126e-05, + "loss": 0.8435, + "step": 3026 + }, + { + "epoch": 0.47385723231058235, + "grad_norm": 2.352861166000366, + "learning_rate": 7.947214076246335e-05, + "loss": 0.716, + "step": 3027 + }, + { + "epoch": 0.47401377582968063, + "grad_norm": 3.2991275787353516, + "learning_rate": 7.946399478657543e-05, + "loss": 0.7475, + "step": 3028 + }, + { + "epoch": 0.47417031934877896, + "grad_norm": 3.1566150188446045, + "learning_rate": 7.945584881068753e-05, + "loss": 0.8378, + "step": 3029 + }, + { + "epoch": 0.4743268628678773, + "grad_norm": 1.012900471687317, + "learning_rate": 7.944770283479961e-05, + "loss": 0.4003, + "step": 3030 + }, + { + "epoch": 0.47448340638697556, + "grad_norm": 2.270075798034668, + "learning_rate": 7.943955685891171e-05, + "loss": 0.6438, + "step": 3031 + }, + { + "epoch": 0.4746399499060739, + "grad_norm": 
1.9089967012405396, + "learning_rate": 7.943141088302379e-05, + "loss": 0.611, + "step": 3032 + }, + { + "epoch": 0.4747964934251722, + "grad_norm": 4.075137138366699, + "learning_rate": 7.942326490713587e-05, + "loss": 0.9919, + "step": 3033 + }, + { + "epoch": 0.4749530369442705, + "grad_norm": 5.7179412841796875, + "learning_rate": 7.941511893124797e-05, + "loss": 1.0492, + "step": 3034 + }, + { + "epoch": 0.4751095804633688, + "grad_norm": 3.172011613845825, + "learning_rate": 7.940697295536006e-05, + "loss": 1.0006, + "step": 3035 + }, + { + "epoch": 0.47526612398246715, + "grad_norm": 6.904861927032471, + "learning_rate": 7.939882697947214e-05, + "loss": 1.0265, + "step": 3036 + }, + { + "epoch": 0.4754226675015654, + "grad_norm": 2.7028348445892334, + "learning_rate": 7.939068100358424e-05, + "loss": 0.9987, + "step": 3037 + }, + { + "epoch": 0.47557921102066375, + "grad_norm": 2.827145576477051, + "learning_rate": 7.938253502769632e-05, + "loss": 0.8828, + "step": 3038 + }, + { + "epoch": 0.4757357545397621, + "grad_norm": 4.046535015106201, + "learning_rate": 7.93743890518084e-05, + "loss": 1.3051, + "step": 3039 + }, + { + "epoch": 0.47589229805886035, + "grad_norm": 3.882459878921509, + "learning_rate": 7.93662430759205e-05, + "loss": 1.3233, + "step": 3040 + }, + { + "epoch": 0.4760488415779587, + "grad_norm": 2.747591972351074, + "learning_rate": 7.935809710003259e-05, + "loss": 1.2402, + "step": 3041 + }, + { + "epoch": 0.47620538509705695, + "grad_norm": 3.4917314052581787, + "learning_rate": 7.934995112414467e-05, + "loss": 1.8703, + "step": 3042 + }, + { + "epoch": 0.4763619286161553, + "grad_norm": 2.843217134475708, + "learning_rate": 7.934180514825677e-05, + "loss": 0.9471, + "step": 3043 + }, + { + "epoch": 0.4765184721352536, + "grad_norm": 3.2956042289733887, + "learning_rate": 7.933365917236886e-05, + "loss": 1.8477, + "step": 3044 + }, + { + "epoch": 0.4766750156543519, + "grad_norm": 3.0140669345855713, + "learning_rate": 
7.932551319648093e-05, + "loss": 1.5775, + "step": 3045 + }, + { + "epoch": 0.4768315591734502, + "grad_norm": 3.7947394847869873, + "learning_rate": 7.931736722059303e-05, + "loss": 1.1154, + "step": 3046 + }, + { + "epoch": 0.47698810269254854, + "grad_norm": 1.8269909620285034, + "learning_rate": 7.930922124470513e-05, + "loss": 0.5502, + "step": 3047 + }, + { + "epoch": 0.4771446462116468, + "grad_norm": 3.3092992305755615, + "learning_rate": 7.93010752688172e-05, + "loss": 0.797, + "step": 3048 + }, + { + "epoch": 0.47730118973074515, + "grad_norm": 2.270504951477051, + "learning_rate": 7.92929292929293e-05, + "loss": 0.9774, + "step": 3049 + }, + { + "epoch": 0.4774577332498435, + "grad_norm": 2.437229871749878, + "learning_rate": 7.92847833170414e-05, + "loss": 1.2081, + "step": 3050 + }, + { + "epoch": 0.47761427676894175, + "grad_norm": 0.7356486320495605, + "learning_rate": 7.927663734115346e-05, + "loss": 0.3829, + "step": 3051 + }, + { + "epoch": 0.4777708202880401, + "grad_norm": 0.5904699563980103, + "learning_rate": 7.926849136526556e-05, + "loss": 0.285, + "step": 3052 + }, + { + "epoch": 0.4779273638071384, + "grad_norm": 0.8380950689315796, + "learning_rate": 7.926034538937766e-05, + "loss": 0.3156, + "step": 3053 + }, + { + "epoch": 0.4780839073262367, + "grad_norm": 0.9625746011734009, + "learning_rate": 7.925219941348974e-05, + "loss": 0.4074, + "step": 3054 + }, + { + "epoch": 0.478240450845335, + "grad_norm": 0.8348032832145691, + "learning_rate": 7.924405343760183e-05, + "loss": 0.3368, + "step": 3055 + }, + { + "epoch": 0.47839699436443334, + "grad_norm": 0.8779590129852295, + "learning_rate": 7.923590746171392e-05, + "loss": 0.2637, + "step": 3056 + }, + { + "epoch": 0.4785535378835316, + "grad_norm": 0.5540851950645447, + "learning_rate": 7.9227761485826e-05, + "loss": 0.2192, + "step": 3057 + }, + { + "epoch": 0.47871008140262994, + "grad_norm": 1.0041377544403076, + "learning_rate": 7.921961550993809e-05, + "loss": 0.4748, + "step": 
3058 + }, + { + "epoch": 0.47886662492172827, + "grad_norm": 0.9181697964668274, + "learning_rate": 7.921146953405019e-05, + "loss": 0.2655, + "step": 3059 + }, + { + "epoch": 0.47902316844082654, + "grad_norm": 0.8865082263946533, + "learning_rate": 7.920332355816227e-05, + "loss": 0.299, + "step": 3060 + }, + { + "epoch": 0.47917971195992487, + "grad_norm": 3.36721134185791, + "learning_rate": 7.919517758227436e-05, + "loss": 0.5853, + "step": 3061 + }, + { + "epoch": 0.47933625547902314, + "grad_norm": 1.4762372970581055, + "learning_rate": 7.918703160638645e-05, + "loss": 0.3667, + "step": 3062 + }, + { + "epoch": 0.47949279899812147, + "grad_norm": 3.4266295433044434, + "learning_rate": 7.917888563049854e-05, + "loss": 0.4955, + "step": 3063 + }, + { + "epoch": 0.4796493425172198, + "grad_norm": 1.4218260049819946, + "learning_rate": 7.917073965461062e-05, + "loss": 0.3738, + "step": 3064 + }, + { + "epoch": 0.4798058860363181, + "grad_norm": 1.084571361541748, + "learning_rate": 7.916259367872272e-05, + "loss": 0.5605, + "step": 3065 + }, + { + "epoch": 0.4799624295554164, + "grad_norm": 1.1367793083190918, + "learning_rate": 7.91544477028348e-05, + "loss": 0.4727, + "step": 3066 + }, + { + "epoch": 0.48011897307451473, + "grad_norm": 2.013141632080078, + "learning_rate": 7.91463017269469e-05, + "loss": 0.797, + "step": 3067 + }, + { + "epoch": 0.480275516593613, + "grad_norm": 3.081876516342163, + "learning_rate": 7.913815575105898e-05, + "loss": 0.6277, + "step": 3068 + }, + { + "epoch": 0.48043206011271133, + "grad_norm": 2.753858804702759, + "learning_rate": 7.913000977517107e-05, + "loss": 0.6314, + "step": 3069 + }, + { + "epoch": 0.48058860363180966, + "grad_norm": 3.3533570766448975, + "learning_rate": 7.912186379928316e-05, + "loss": 0.9293, + "step": 3070 + }, + { + "epoch": 0.48074514715090794, + "grad_norm": 1.5091831684112549, + "learning_rate": 7.911371782339525e-05, + "loss": 0.4815, + "step": 3071 + }, + { + "epoch": 0.48090169067000627, + 
"grad_norm": 1.7479335069656372, + "learning_rate": 7.910557184750733e-05, + "loss": 0.413, + "step": 3072 + }, + { + "epoch": 0.4810582341891046, + "grad_norm": 2.7933573722839355, + "learning_rate": 7.909742587161943e-05, + "loss": 0.7358, + "step": 3073 + }, + { + "epoch": 0.48121477770820287, + "grad_norm": 3.9775798320770264, + "learning_rate": 7.908927989573151e-05, + "loss": 0.8142, + "step": 3074 + }, + { + "epoch": 0.4813713212273012, + "grad_norm": 2.2680323123931885, + "learning_rate": 7.90811339198436e-05, + "loss": 0.8055, + "step": 3075 + }, + { + "epoch": 0.4815278647463995, + "grad_norm": 3.3212757110595703, + "learning_rate": 7.907298794395569e-05, + "loss": 0.8041, + "step": 3076 + }, + { + "epoch": 0.4816844082654978, + "grad_norm": 1.6903512477874756, + "learning_rate": 7.906484196806778e-05, + "loss": 0.525, + "step": 3077 + }, + { + "epoch": 0.4818409517845961, + "grad_norm": 3.0360748767852783, + "learning_rate": 7.905669599217986e-05, + "loss": 1.0153, + "step": 3078 + }, + { + "epoch": 0.4819974953036944, + "grad_norm": 2.9922235012054443, + "learning_rate": 7.904855001629196e-05, + "loss": 0.7801, + "step": 3079 + }, + { + "epoch": 0.48215403882279273, + "grad_norm": 1.9523526430130005, + "learning_rate": 7.904040404040404e-05, + "loss": 0.8595, + "step": 3080 + }, + { + "epoch": 0.48231058234189106, + "grad_norm": 2.3444838523864746, + "learning_rate": 7.903225806451613e-05, + "loss": 1.1364, + "step": 3081 + }, + { + "epoch": 0.48246712586098933, + "grad_norm": 2.0541446208953857, + "learning_rate": 7.902411208862822e-05, + "loss": 0.7373, + "step": 3082 + }, + { + "epoch": 0.48262366938008766, + "grad_norm": 2.5315990447998047, + "learning_rate": 7.901596611274032e-05, + "loss": 1.1187, + "step": 3083 + }, + { + "epoch": 0.482780212899186, + "grad_norm": 1.6423473358154297, + "learning_rate": 7.900782013685239e-05, + "loss": 0.7021, + "step": 3084 + }, + { + "epoch": 0.48293675641828426, + "grad_norm": 2.1148359775543213, + 
"learning_rate": 7.899967416096449e-05, + "loss": 0.7811, + "step": 3085 + }, + { + "epoch": 0.4830932999373826, + "grad_norm": 3.086764335632324, + "learning_rate": 7.899152818507658e-05, + "loss": 0.8899, + "step": 3086 + }, + { + "epoch": 0.4832498434564809, + "grad_norm": 2.5199499130249023, + "learning_rate": 7.898338220918865e-05, + "loss": 0.7529, + "step": 3087 + }, + { + "epoch": 0.4834063869755792, + "grad_norm": 2.6774282455444336, + "learning_rate": 7.897523623330075e-05, + "loss": 1.5901, + "step": 3088 + }, + { + "epoch": 0.4835629304946775, + "grad_norm": 3.748769760131836, + "learning_rate": 7.896709025741285e-05, + "loss": 0.8677, + "step": 3089 + }, + { + "epoch": 0.48371947401377585, + "grad_norm": 2.4003376960754395, + "learning_rate": 7.895894428152493e-05, + "loss": 1.1484, + "step": 3090 + }, + { + "epoch": 0.4838760175328741, + "grad_norm": 5.308606147766113, + "learning_rate": 7.895079830563702e-05, + "loss": 2.371, + "step": 3091 + }, + { + "epoch": 0.48403256105197245, + "grad_norm": 2.7284247875213623, + "learning_rate": 7.894265232974911e-05, + "loss": 1.0331, + "step": 3092 + }, + { + "epoch": 0.4841891045710708, + "grad_norm": 2.6465392112731934, + "learning_rate": 7.89345063538612e-05, + "loss": 1.0576, + "step": 3093 + }, + { + "epoch": 0.48434564809016906, + "grad_norm": null, + "learning_rate": 7.89345063538612e-05, + "loss": 0.0, + "step": 3094 + }, + { + "epoch": 0.4845021916092674, + "grad_norm": 3.170393228530884, + "learning_rate": 7.892636037797328e-05, + "loss": 1.399, + "step": 3095 + }, + { + "epoch": 0.48465873512836566, + "grad_norm": 1.5402638912200928, + "learning_rate": 7.891821440208538e-05, + "loss": 0.7845, + "step": 3096 + }, + { + "epoch": 0.484815278647464, + "grad_norm": 1.7230585813522339, + "learning_rate": 7.891006842619746e-05, + "loss": 0.872, + "step": 3097 + }, + { + "epoch": 0.4849718221665623, + "grad_norm": 4.944639682769775, + "learning_rate": 7.890192245030955e-05, + "loss": 1.4745, + "step": 3098 + 
}, + { + "epoch": 0.4851283656856606, + "grad_norm": 2.943614959716797, + "learning_rate": 7.889377647442164e-05, + "loss": 0.7866, + "step": 3099 + }, + { + "epoch": 0.4852849092047589, + "grad_norm": 3.025404930114746, + "learning_rate": 7.888563049853373e-05, + "loss": 1.5277, + "step": 3100 + }, + { + "epoch": 0.48544145272385725, + "grad_norm": 0.7043726444244385, + "learning_rate": 7.887748452264581e-05, + "loss": 0.319, + "step": 3101 + }, + { + "epoch": 0.4855979962429555, + "grad_norm": 0.8480181694030762, + "learning_rate": 7.886933854675791e-05, + "loss": 0.4359, + "step": 3102 + }, + { + "epoch": 0.48575453976205385, + "grad_norm": 1.3556556701660156, + "learning_rate": 7.886119257086999e-05, + "loss": 0.3302, + "step": 3103 + }, + { + "epoch": 0.4859110832811522, + "grad_norm": 0.6663347482681274, + "learning_rate": 7.885304659498209e-05, + "loss": 0.2732, + "step": 3104 + }, + { + "epoch": 0.48606762680025045, + "grad_norm": 0.8554444909095764, + "learning_rate": 7.884490061909417e-05, + "loss": 0.3491, + "step": 3105 + }, + { + "epoch": 0.4862241703193488, + "grad_norm": 0.686044454574585, + "learning_rate": 7.883675464320626e-05, + "loss": 0.3647, + "step": 3106 + }, + { + "epoch": 0.4863807138384471, + "grad_norm": 0.941618800163269, + "learning_rate": 7.882860866731835e-05, + "loss": 0.3575, + "step": 3107 + }, + { + "epoch": 0.4865372573575454, + "grad_norm": 1.0102143287658691, + "learning_rate": 7.882046269143044e-05, + "loss": 0.3726, + "step": 3108 + }, + { + "epoch": 0.4866938008766437, + "grad_norm": 0.8366485834121704, + "learning_rate": 7.881231671554252e-05, + "loss": 0.3818, + "step": 3109 + }, + { + "epoch": 0.48685034439574204, + "grad_norm": 1.1696062088012695, + "learning_rate": 7.880417073965462e-05, + "loss": 0.2809, + "step": 3110 + }, + { + "epoch": 0.4870068879148403, + "grad_norm": 1.7475327253341675, + "learning_rate": 7.87960247637667e-05, + "loss": 0.4114, + "step": 3111 + }, + { + "epoch": 0.48716343143393864, + 
"grad_norm": 1.8384411334991455, + "learning_rate": 7.878787878787879e-05, + "loss": 0.464, + "step": 3112 + }, + { + "epoch": 0.48731997495303697, + "grad_norm": 1.3004595041275024, + "learning_rate": 7.877973281199088e-05, + "loss": 0.4525, + "step": 3113 + }, + { + "epoch": 0.48747651847213525, + "grad_norm": 1.2025420665740967, + "learning_rate": 7.877158683610297e-05, + "loss": 0.441, + "step": 3114 + }, + { + "epoch": 0.4876330619912336, + "grad_norm": 2.899016857147217, + "learning_rate": 7.876344086021505e-05, + "loss": 0.6756, + "step": 3115 + }, + { + "epoch": 0.48778960551033185, + "grad_norm": 2.195469856262207, + "learning_rate": 7.875529488432715e-05, + "loss": 0.8265, + "step": 3116 + }, + { + "epoch": 0.4879461490294302, + "grad_norm": 2.1849050521850586, + "learning_rate": 7.874714890843923e-05, + "loss": 0.4547, + "step": 3117 + }, + { + "epoch": 0.4881026925485285, + "grad_norm": 3.410414457321167, + "learning_rate": 7.873900293255132e-05, + "loss": 0.8511, + "step": 3118 + }, + { + "epoch": 0.4882592360676268, + "grad_norm": 2.53226375579834, + "learning_rate": 7.873085695666341e-05, + "loss": 0.6198, + "step": 3119 + }, + { + "epoch": 0.4884157795867251, + "grad_norm": 2.048811197280884, + "learning_rate": 7.872271098077551e-05, + "loss": 0.5608, + "step": 3120 + }, + { + "epoch": 0.48857232310582344, + "grad_norm": 1.9281061887741089, + "learning_rate": 7.871456500488758e-05, + "loss": 0.8197, + "step": 3121 + }, + { + "epoch": 0.4887288666249217, + "grad_norm": 2.127187728881836, + "learning_rate": 7.870641902899968e-05, + "loss": 0.7313, + "step": 3122 + }, + { + "epoch": 0.48888541014402004, + "grad_norm": 1.9426624774932861, + "learning_rate": 7.869827305311178e-05, + "loss": 0.9043, + "step": 3123 + }, + { + "epoch": 0.48904195366311837, + "grad_norm": 2.0001492500305176, + "learning_rate": 7.869012707722385e-05, + "loss": 0.8624, + "step": 3124 + }, + { + "epoch": 0.48919849718221664, + "grad_norm": 1.703913688659668, + "learning_rate": 
7.868198110133594e-05, + "loss": 0.6969, + "step": 3125 + }, + { + "epoch": 0.48935504070131497, + "grad_norm": 2.1304240226745605, + "learning_rate": 7.867383512544804e-05, + "loss": 0.849, + "step": 3126 + }, + { + "epoch": 0.4895115842204133, + "grad_norm": 6.221055507659912, + "learning_rate": 7.866568914956012e-05, + "loss": 1.1629, + "step": 3127 + }, + { + "epoch": 0.48966812773951157, + "grad_norm": 2.6300413608551025, + "learning_rate": 7.865754317367221e-05, + "loss": 0.9925, + "step": 3128 + }, + { + "epoch": 0.4898246712586099, + "grad_norm": 1.7858920097351074, + "learning_rate": 7.86493971977843e-05, + "loss": 0.5624, + "step": 3129 + }, + { + "epoch": 0.48998121477770823, + "grad_norm": 3.2084078788757324, + "learning_rate": 7.864125122189639e-05, + "loss": 0.8511, + "step": 3130 + }, + { + "epoch": 0.4901377582968065, + "grad_norm": 3.097597360610962, + "learning_rate": 7.863310524600847e-05, + "loss": 1.1688, + "step": 3131 + }, + { + "epoch": 0.49029430181590483, + "grad_norm": 2.498162269592285, + "learning_rate": 7.862495927012057e-05, + "loss": 0.6473, + "step": 3132 + }, + { + "epoch": 0.4904508453350031, + "grad_norm": 3.498538017272949, + "learning_rate": 7.861681329423265e-05, + "loss": 0.9628, + "step": 3133 + }, + { + "epoch": 0.49060738885410143, + "grad_norm": 2.5930304527282715, + "learning_rate": 7.860866731834474e-05, + "loss": 0.995, + "step": 3134 + }, + { + "epoch": 0.49076393237319976, + "grad_norm": 2.220874786376953, + "learning_rate": 7.860052134245683e-05, + "loss": 0.8332, + "step": 3135 + }, + { + "epoch": 0.49092047589229804, + "grad_norm": 1.9951725006103516, + "learning_rate": 7.859237536656892e-05, + "loss": 0.5877, + "step": 3136 + }, + { + "epoch": 0.49107701941139636, + "grad_norm": 2.037377119064331, + "learning_rate": 7.8584229390681e-05, + "loss": 0.8211, + "step": 3137 + }, + { + "epoch": 0.4912335629304947, + "grad_norm": 2.950134038925171, + "learning_rate": 7.85760834147931e-05, + "loss": 1.3249, + "step": 
3138 + }, + { + "epoch": 0.49139010644959297, + "grad_norm": 3.628413677215576, + "learning_rate": 7.856793743890518e-05, + "loss": 1.0574, + "step": 3139 + }, + { + "epoch": 0.4915466499686913, + "grad_norm": 4.215682506561279, + "learning_rate": 7.855979146301727e-05, + "loss": 1.4236, + "step": 3140 + }, + { + "epoch": 0.4917031934877896, + "grad_norm": 3.3409643173217773, + "learning_rate": 7.855164548712936e-05, + "loss": 1.0338, + "step": 3141 + }, + { + "epoch": 0.4918597370068879, + "grad_norm": 3.118246555328369, + "learning_rate": 7.854349951124145e-05, + "loss": 1.5288, + "step": 3142 + }, + { + "epoch": 0.4920162805259862, + "grad_norm": 3.0001184940338135, + "learning_rate": 7.853535353535355e-05, + "loss": 0.946, + "step": 3143 + }, + { + "epoch": 0.49217282404508456, + "grad_norm": 2.703991413116455, + "learning_rate": 7.852720755946563e-05, + "loss": 1.1596, + "step": 3144 + }, + { + "epoch": 0.49232936756418283, + "grad_norm": 2.9061970710754395, + "learning_rate": 7.851906158357771e-05, + "loss": 1.4128, + "step": 3145 + }, + { + "epoch": 0.49248591108328116, + "grad_norm": 2.6967885494232178, + "learning_rate": 7.851091560768981e-05, + "loss": 1.0495, + "step": 3146 + }, + { + "epoch": 0.4926424546023795, + "grad_norm": 2.264474391937256, + "learning_rate": 7.85027696318019e-05, + "loss": 1.3792, + "step": 3147 + }, + { + "epoch": 0.49279899812147776, + "grad_norm": 3.546614170074463, + "learning_rate": 7.849462365591398e-05, + "loss": 0.7513, + "step": 3148 + }, + { + "epoch": 0.4929555416405761, + "grad_norm": 1.7640292644500732, + "learning_rate": 7.848647768002608e-05, + "loss": 0.7344, + "step": 3149 + }, + { + "epoch": 0.49311208515967436, + "grad_norm": 2.491363286972046, + "learning_rate": 7.847833170413816e-05, + "loss": 0.8725, + "step": 3150 + }, + { + "epoch": 0.4932686286787727, + "grad_norm": 0.742791473865509, + "learning_rate": 7.847018572825024e-05, + "loss": 0.3277, + "step": 3151 + }, + { + "epoch": 0.493425172197871, + 
"grad_norm": 0.9618322849273682, + "learning_rate": 7.846203975236234e-05, + "loss": 0.3592, + "step": 3152 + }, + { + "epoch": 0.4935817157169693, + "grad_norm": 0.6468261480331421, + "learning_rate": 7.845389377647442e-05, + "loss": 0.262, + "step": 3153 + }, + { + "epoch": 0.4937382592360676, + "grad_norm": 1.087634801864624, + "learning_rate": 7.844574780058651e-05, + "loss": 0.2933, + "step": 3154 + }, + { + "epoch": 0.49389480275516595, + "grad_norm": 0.5939799547195435, + "learning_rate": 7.84376018246986e-05, + "loss": 0.2741, + "step": 3155 + }, + { + "epoch": 0.4940513462742642, + "grad_norm": 0.7180101275444031, + "learning_rate": 7.84294558488107e-05, + "loss": 0.2221, + "step": 3156 + }, + { + "epoch": 0.49420788979336255, + "grad_norm": 1.235817313194275, + "learning_rate": 7.842130987292277e-05, + "loss": 0.4169, + "step": 3157 + }, + { + "epoch": 0.4943644333124609, + "grad_norm": 1.1026556491851807, + "learning_rate": 7.841316389703487e-05, + "loss": 0.3264, + "step": 3158 + }, + { + "epoch": 0.49452097683155916, + "grad_norm": 2.1601994037628174, + "learning_rate": 7.840501792114697e-05, + "loss": 0.3655, + "step": 3159 + }, + { + "epoch": 0.4946775203506575, + "grad_norm": 1.5850191116333008, + "learning_rate": 7.839687194525904e-05, + "loss": 0.5264, + "step": 3160 + }, + { + "epoch": 0.4948340638697558, + "grad_norm": 2.2184109687805176, + "learning_rate": 7.838872596937113e-05, + "loss": 0.4344, + "step": 3161 + }, + { + "epoch": 0.4949906073888541, + "grad_norm": 1.470656156539917, + "learning_rate": 7.838057999348323e-05, + "loss": 0.325, + "step": 3162 + }, + { + "epoch": 0.4951471509079524, + "grad_norm": 2.030287027359009, + "learning_rate": 7.837243401759532e-05, + "loss": 0.6411, + "step": 3163 + }, + { + "epoch": 0.49530369442705074, + "grad_norm": 2.317023515701294, + "learning_rate": 7.83642880417074e-05, + "loss": 0.7172, + "step": 3164 + }, + { + "epoch": 0.495460237946149, + "grad_norm": 1.6202462911605835, + "learning_rate": 
7.83561420658195e-05, + "loss": 0.5294, + "step": 3165 + }, + { + "epoch": 0.49561678146524735, + "grad_norm": 1.397782802581787, + "learning_rate": 7.834799608993158e-05, + "loss": 0.537, + "step": 3166 + }, + { + "epoch": 0.4957733249843457, + "grad_norm": 2.5559329986572266, + "learning_rate": 7.833985011404366e-05, + "loss": 0.5966, + "step": 3167 + }, + { + "epoch": 0.49592986850344395, + "grad_norm": 1.9651652574539185, + "learning_rate": 7.833170413815576e-05, + "loss": 0.5754, + "step": 3168 + }, + { + "epoch": 0.4960864120225423, + "grad_norm": 1.2038687467575073, + "learning_rate": 7.832355816226785e-05, + "loss": 0.3535, + "step": 3169 + }, + { + "epoch": 0.49624295554164055, + "grad_norm": 2.396516799926758, + "learning_rate": 7.831541218637993e-05, + "loss": 0.4933, + "step": 3170 + }, + { + "epoch": 0.4963994990607389, + "grad_norm": 2.191012382507324, + "learning_rate": 7.830726621049203e-05, + "loss": 0.8504, + "step": 3171 + }, + { + "epoch": 0.4965560425798372, + "grad_norm": 1.3020657300949097, + "learning_rate": 7.829912023460411e-05, + "loss": 0.4463, + "step": 3172 + }, + { + "epoch": 0.4967125860989355, + "grad_norm": 1.2407599687576294, + "learning_rate": 7.82909742587162e-05, + "loss": 0.6501, + "step": 3173 + }, + { + "epoch": 0.4968691296180338, + "grad_norm": 2.6002941131591797, + "learning_rate": 7.828282828282829e-05, + "loss": 0.6034, + "step": 3174 + }, + { + "epoch": 0.49702567313713214, + "grad_norm": 1.8700629472732544, + "learning_rate": 7.827468230694037e-05, + "loss": 0.7817, + "step": 3175 + }, + { + "epoch": 0.4971822166562304, + "grad_norm": 1.730399489402771, + "learning_rate": 7.826653633105246e-05, + "loss": 0.639, + "step": 3176 + }, + { + "epoch": 0.49733876017532874, + "grad_norm": 1.786551594734192, + "learning_rate": 7.825839035516456e-05, + "loss": 0.7521, + "step": 3177 + }, + { + "epoch": 0.49749530369442707, + "grad_norm": 2.4207944869995117, + "learning_rate": 7.825024437927664e-05, + "loss": 0.9632, + "step": 
3178 + }, + { + "epoch": 0.49765184721352534, + "grad_norm": 2.917489767074585, + "learning_rate": 7.824209840338874e-05, + "loss": 1.0769, + "step": 3179 + }, + { + "epoch": 0.4978083907326237, + "grad_norm": 2.7723867893218994, + "learning_rate": 7.823395242750082e-05, + "loss": 0.8083, + "step": 3180 + }, + { + "epoch": 0.497964934251722, + "grad_norm": 1.6960384845733643, + "learning_rate": 7.82258064516129e-05, + "loss": 0.6509, + "step": 3181 + }, + { + "epoch": 0.4981214777708203, + "grad_norm": 3.8660478591918945, + "learning_rate": 7.8217660475725e-05, + "loss": 0.9646, + "step": 3182 + }, + { + "epoch": 0.4982780212899186, + "grad_norm": 2.918212890625, + "learning_rate": 7.820951449983709e-05, + "loss": 0.9504, + "step": 3183 + }, + { + "epoch": 0.49843456480901693, + "grad_norm": 4.229357719421387, + "learning_rate": 7.820136852394917e-05, + "loss": 1.0282, + "step": 3184 + }, + { + "epoch": 0.4985911083281152, + "grad_norm": 3.2927231788635254, + "learning_rate": 7.819322254806127e-05, + "loss": 0.7735, + "step": 3185 + }, + { + "epoch": 0.49874765184721354, + "grad_norm": 3.0207149982452393, + "learning_rate": 7.818507657217335e-05, + "loss": 0.7775, + "step": 3186 + }, + { + "epoch": 0.4989041953663118, + "grad_norm": 3.828415870666504, + "learning_rate": 7.817693059628543e-05, + "loss": 1.0646, + "step": 3187 + }, + { + "epoch": 0.49906073888541014, + "grad_norm": 2.5942914485931396, + "learning_rate": 7.816878462039753e-05, + "loss": 0.8893, + "step": 3188 + }, + { + "epoch": 0.49921728240450847, + "grad_norm": 1.8777374029159546, + "learning_rate": 7.816063864450961e-05, + "loss": 1.1874, + "step": 3189 + }, + { + "epoch": 0.49937382592360674, + "grad_norm": 3.813481569290161, + "learning_rate": 7.81524926686217e-05, + "loss": 0.9553, + "step": 3190 + }, + { + "epoch": 0.49953036944270507, + "grad_norm": 4.398700714111328, + "learning_rate": 7.81443466927338e-05, + "loss": 1.107, + "step": 3191 + }, + { + "epoch": 0.4996869129618034, + 
"grad_norm": 4.9536452293396, + "learning_rate": 7.81362007168459e-05, + "loss": 1.907, + "step": 3192 + }, + { + "epoch": 0.49984345648090167, + "grad_norm": 4.147815704345703, + "learning_rate": 7.812805474095796e-05, + "loss": 1.621, + "step": 3193 + }, + { + "epoch": 0.5, + "grad_norm": 2.774986982345581, + "learning_rate": 7.811990876507006e-05, + "loss": 1.3794, + "step": 3194 + }, + { + "epoch": 0.5001565435190983, + "grad_norm": 4.507782936096191, + "learning_rate": 7.811176278918216e-05, + "loss": 1.2094, + "step": 3195 + }, + { + "epoch": 0.5003130870381967, + "grad_norm": 6.150731563568115, + "learning_rate": 7.810361681329423e-05, + "loss": 0.7043, + "step": 3196 + }, + { + "epoch": 0.5004696305572949, + "grad_norm": 5.264526844024658, + "learning_rate": 7.809547083740633e-05, + "loss": 1.332, + "step": 3197 + }, + { + "epoch": 0.5006261740763932, + "grad_norm": 3.0981757640838623, + "learning_rate": 7.808732486151842e-05, + "loss": 0.6637, + "step": 3198 + }, + { + "epoch": 0.5007827175954915, + "grad_norm": 3.569844961166382, + "learning_rate": 7.807917888563049e-05, + "loss": 0.8198, + "step": 3199 + }, + { + "epoch": 0.5009392611145899, + "grad_norm": 2.5524871349334717, + "learning_rate": 7.807103290974259e-05, + "loss": 0.7868, + "step": 3200 + }, + { + "epoch": 0.5010958046336882, + "grad_norm": 0.5585722923278809, + "learning_rate": 7.806288693385469e-05, + "loss": 0.2768, + "step": 3201 + }, + { + "epoch": 0.5012523481527865, + "grad_norm": 0.7520403265953064, + "learning_rate": 7.805474095796677e-05, + "loss": 0.3565, + "step": 3202 + }, + { + "epoch": 0.5014088916718847, + "grad_norm": 0.817961573600769, + "learning_rate": 7.804659498207886e-05, + "loss": 0.3497, + "step": 3203 + }, + { + "epoch": 0.5015654351909831, + "grad_norm": 0.6671187877655029, + "learning_rate": 7.803844900619095e-05, + "loss": 0.2871, + "step": 3204 + }, + { + "epoch": 0.5017219787100814, + "grad_norm": 0.5974734425544739, + "learning_rate": 7.803030303030304e-05, + 
"loss": 0.2349, + "step": 3205 + }, + { + "epoch": 0.5018785222291797, + "grad_norm": 0.898135781288147, + "learning_rate": 7.802215705441512e-05, + "loss": 0.304, + "step": 3206 + }, + { + "epoch": 0.502035065748278, + "grad_norm": 0.848680317401886, + "learning_rate": 7.801401107852722e-05, + "loss": 0.3118, + "step": 3207 + }, + { + "epoch": 0.5021916092673764, + "grad_norm": 0.8634347319602966, + "learning_rate": 7.80058651026393e-05, + "loss": 0.3197, + "step": 3208 + }, + { + "epoch": 0.5023481527864746, + "grad_norm": 1.0207642316818237, + "learning_rate": 7.799771912675138e-05, + "loss": 0.4779, + "step": 3209 + }, + { + "epoch": 0.5025046963055729, + "grad_norm": 1.3171188831329346, + "learning_rate": 7.798957315086348e-05, + "loss": 0.8941, + "step": 3210 + }, + { + "epoch": 0.5026612398246713, + "grad_norm": 0.7265933156013489, + "learning_rate": 7.798142717497557e-05, + "loss": 0.2732, + "step": 3211 + }, + { + "epoch": 0.5028177833437696, + "grad_norm": 1.3218026161193848, + "learning_rate": 7.797328119908765e-05, + "loss": 0.5583, + "step": 3212 + }, + { + "epoch": 0.5029743268628679, + "grad_norm": 1.5225598812103271, + "learning_rate": 7.796513522319975e-05, + "loss": 0.5223, + "step": 3213 + }, + { + "epoch": 0.5031308703819661, + "grad_norm": 1.9011386632919312, + "learning_rate": 7.795698924731183e-05, + "loss": 0.5436, + "step": 3214 + }, + { + "epoch": 0.5032874139010645, + "grad_norm": 1.4779123067855835, + "learning_rate": 7.794884327142393e-05, + "loss": 0.6568, + "step": 3215 + }, + { + "epoch": 0.5034439574201628, + "grad_norm": 1.2073876857757568, + "learning_rate": 7.794069729553601e-05, + "loss": 0.4835, + "step": 3216 + }, + { + "epoch": 0.5036005009392611, + "grad_norm": 1.3478662967681885, + "learning_rate": 7.79325513196481e-05, + "loss": 0.4067, + "step": 3217 + }, + { + "epoch": 0.5037570444583594, + "grad_norm": 1.3998686075210571, + "learning_rate": 7.792440534376019e-05, + "loss": 0.5638, + "step": 3218 + }, + { + "epoch": 
0.5039135879774578, + "grad_norm": 1.3903826475143433, + "learning_rate": 7.791625936787228e-05, + "loss": 0.4309, + "step": 3219 + }, + { + "epoch": 0.504070131496556, + "grad_norm": 2.8172433376312256, + "learning_rate": 7.790811339198436e-05, + "loss": 0.6144, + "step": 3220 + }, + { + "epoch": 0.5042266750156543, + "grad_norm": 1.9537913799285889, + "learning_rate": 7.789996741609646e-05, + "loss": 0.4851, + "step": 3221 + }, + { + "epoch": 0.5043832185347527, + "grad_norm": 1.5136007070541382, + "learning_rate": 7.789182144020854e-05, + "loss": 0.4947, + "step": 3222 + }, + { + "epoch": 0.504539762053851, + "grad_norm": 2.1008713245391846, + "learning_rate": 7.788367546432063e-05, + "loss": 0.6876, + "step": 3223 + }, + { + "epoch": 0.5046963055729493, + "grad_norm": 2.087409496307373, + "learning_rate": 7.787552948843272e-05, + "loss": 1.0519, + "step": 3224 + }, + { + "epoch": 0.5048528490920476, + "grad_norm": 4.220847129821777, + "learning_rate": 7.78673835125448e-05, + "loss": 1.0619, + "step": 3225 + }, + { + "epoch": 0.5050093926111459, + "grad_norm": 2.4796533584594727, + "learning_rate": 7.785923753665689e-05, + "loss": 1.2494, + "step": 3226 + }, + { + "epoch": 0.5051659361302442, + "grad_norm": 1.6394214630126953, + "learning_rate": 7.785109156076899e-05, + "loss": 0.3336, + "step": 3227 + }, + { + "epoch": 0.5053224796493425, + "grad_norm": 1.9433842897415161, + "learning_rate": 7.784294558488107e-05, + "loss": 0.8587, + "step": 3228 + }, + { + "epoch": 0.5054790231684408, + "grad_norm": 2.7539944648742676, + "learning_rate": 7.783479960899315e-05, + "loss": 0.616, + "step": 3229 + }, + { + "epoch": 0.5056355666875392, + "grad_norm": 2.8338983058929443, + "learning_rate": 7.782665363310525e-05, + "loss": 0.9106, + "step": 3230 + }, + { + "epoch": 0.5057921102066374, + "grad_norm": 1.566757321357727, + "learning_rate": 7.781850765721735e-05, + "loss": 0.7795, + "step": 3231 + }, + { + "epoch": 0.5059486537257357, + "grad_norm": 3.7829463481903076, + 
"learning_rate": 7.781036168132942e-05, + "loss": 1.0873, + "step": 3232 + }, + { + "epoch": 0.506105197244834, + "grad_norm": 2.2621119022369385, + "learning_rate": 7.780221570544152e-05, + "loss": 0.8084, + "step": 3233 + }, + { + "epoch": 0.5062617407639324, + "grad_norm": 2.0414953231811523, + "learning_rate": 7.779406972955361e-05, + "loss": 0.7643, + "step": 3234 + }, + { + "epoch": 0.5064182842830307, + "grad_norm": 5.379373073577881, + "learning_rate": 7.778592375366568e-05, + "loss": 1.2965, + "step": 3235 + }, + { + "epoch": 0.506574827802129, + "grad_norm": 2.102553367614746, + "learning_rate": 7.777777777777778e-05, + "loss": 1.1275, + "step": 3236 + }, + { + "epoch": 0.5067313713212273, + "grad_norm": 5.550718784332275, + "learning_rate": 7.776963180188988e-05, + "loss": 1.4693, + "step": 3237 + }, + { + "epoch": 0.5068879148403256, + "grad_norm": 2.9647417068481445, + "learning_rate": 7.776148582600196e-05, + "loss": 1.1656, + "step": 3238 + }, + { + "epoch": 0.5070444583594239, + "grad_norm": 2.778164863586426, + "learning_rate": 7.775333985011405e-05, + "loss": 1.4388, + "step": 3239 + }, + { + "epoch": 0.5072010018785222, + "grad_norm": 4.0990214347839355, + "learning_rate": 7.774519387422614e-05, + "loss": 0.9256, + "step": 3240 + }, + { + "epoch": 0.5073575453976206, + "grad_norm": 1.8401204347610474, + "learning_rate": 7.773704789833823e-05, + "loss": 1.0936, + "step": 3241 + }, + { + "epoch": 0.5075140889167189, + "grad_norm": 5.800806522369385, + "learning_rate": 7.772890192245031e-05, + "loss": 1.6867, + "step": 3242 + }, + { + "epoch": 0.5076706324358171, + "grad_norm": 4.448583126068115, + "learning_rate": 7.772075594656241e-05, + "loss": 1.528, + "step": 3243 + }, + { + "epoch": 0.5078271759549154, + "grad_norm": 6.2727742195129395, + "learning_rate": 7.771260997067449e-05, + "loss": 0.8861, + "step": 3244 + }, + { + "epoch": 0.5079837194740138, + "grad_norm": 4.201441764831543, + "learning_rate": 7.770446399478658e-05, + "loss": 1.3258, + 
"step": 3245 + }, + { + "epoch": 0.5081402629931121, + "grad_norm": 1.3035730123519897, + "learning_rate": 7.769631801889867e-05, + "loss": 0.5229, + "step": 3246 + }, + { + "epoch": 0.5082968065122104, + "grad_norm": 2.1720705032348633, + "learning_rate": 7.768817204301076e-05, + "loss": 0.5358, + "step": 3247 + }, + { + "epoch": 0.5084533500313086, + "grad_norm": 3.0107548236846924, + "learning_rate": 7.768002606712284e-05, + "loss": 1.0349, + "step": 3248 + }, + { + "epoch": 0.508609893550407, + "grad_norm": 3.7248921394348145, + "learning_rate": 7.767188009123494e-05, + "loss": 0.7709, + "step": 3249 + }, + { + "epoch": 0.5087664370695053, + "grad_norm": 3.6065359115600586, + "learning_rate": 7.766373411534702e-05, + "loss": 1.4173, + "step": 3250 + }, + { + "epoch": 0.5089229805886036, + "grad_norm": 0.5860066413879395, + "learning_rate": 7.76555881394591e-05, + "loss": 0.3619, + "step": 3251 + }, + { + "epoch": 0.509079524107702, + "grad_norm": 0.45574429631233215, + "learning_rate": 7.76474421635712e-05, + "loss": 0.3042, + "step": 3252 + }, + { + "epoch": 0.5092360676268003, + "grad_norm": 0.643669843673706, + "learning_rate": 7.763929618768329e-05, + "loss": 0.2935, + "step": 3253 + }, + { + "epoch": 0.5093926111458985, + "grad_norm": 0.6257880330085754, + "learning_rate": 7.763115021179538e-05, + "loss": 0.3028, + "step": 3254 + }, + { + "epoch": 0.5095491546649968, + "grad_norm": 0.5098122358322144, + "learning_rate": 7.762300423590747e-05, + "loss": 0.2669, + "step": 3255 + }, + { + "epoch": 0.5097056981840952, + "grad_norm": 0.7502939701080322, + "learning_rate": 7.761485826001955e-05, + "loss": 0.4192, + "step": 3256 + }, + { + "epoch": 0.5098622417031935, + "grad_norm": 1.458650827407837, + "learning_rate": 7.760671228413165e-05, + "loss": 0.4257, + "step": 3257 + }, + { + "epoch": 0.5100187852222918, + "grad_norm": 1.0082576274871826, + "learning_rate": 7.759856630824373e-05, + "loss": 0.4442, + "step": 3258 + }, + { + "epoch": 0.5101753287413902, + 
"grad_norm": 0.9692723155021667, + "learning_rate": 7.759042033235582e-05, + "loss": 0.3024, + "step": 3259 + }, + { + "epoch": 0.5103318722604884, + "grad_norm": 1.5180801153182983, + "learning_rate": 7.758227435646791e-05, + "loss": 0.3765, + "step": 3260 + }, + { + "epoch": 0.5104884157795867, + "grad_norm": 2.1201748847961426, + "learning_rate": 7.757412838058e-05, + "loss": 0.5648, + "step": 3261 + }, + { + "epoch": 0.510644959298685, + "grad_norm": 1.1880407333374023, + "learning_rate": 7.756598240469208e-05, + "loss": 0.3553, + "step": 3262 + }, + { + "epoch": 0.5108015028177834, + "grad_norm": 0.9531689286231995, + "learning_rate": 7.755783642880418e-05, + "loss": 0.4336, + "step": 3263 + }, + { + "epoch": 0.5109580463368817, + "grad_norm": 1.600476861000061, + "learning_rate": 7.754969045291626e-05, + "loss": 0.5043, + "step": 3264 + }, + { + "epoch": 0.51111458985598, + "grad_norm": 1.3012099266052246, + "learning_rate": 7.754154447702835e-05, + "loss": 0.3752, + "step": 3265 + }, + { + "epoch": 0.5112711333750782, + "grad_norm": 0.870364248752594, + "learning_rate": 7.753339850114044e-05, + "loss": 0.3248, + "step": 3266 + }, + { + "epoch": 0.5114276768941766, + "grad_norm": 1.5303516387939453, + "learning_rate": 7.752525252525254e-05, + "loss": 0.4615, + "step": 3267 + }, + { + "epoch": 0.5115842204132749, + "grad_norm": 2.062457323074341, + "learning_rate": 7.751710654936461e-05, + "loss": 0.3299, + "step": 3268 + }, + { + "epoch": 0.5117407639323732, + "grad_norm": 1.4244747161865234, + "learning_rate": 7.750896057347671e-05, + "loss": 0.511, + "step": 3269 + }, + { + "epoch": 0.5118973074514716, + "grad_norm": 1.948799967765808, + "learning_rate": 7.75008145975888e-05, + "loss": 0.6519, + "step": 3270 + }, + { + "epoch": 0.5120538509705698, + "grad_norm": 2.3253939151763916, + "learning_rate": 7.749266862170088e-05, + "loss": 0.6654, + "step": 3271 + }, + { + "epoch": 0.5122103944896681, + "grad_norm": 2.025757312774658, + "learning_rate": 
7.748452264581297e-05, + "loss": 0.8528, + "step": 3272 + }, + { + "epoch": 0.5123669380087664, + "grad_norm": 2.4974255561828613, + "learning_rate": 7.747637666992507e-05, + "loss": 0.5909, + "step": 3273 + }, + { + "epoch": 0.5125234815278648, + "grad_norm": 2.3762736320495605, + "learning_rate": 7.746823069403715e-05, + "loss": 1.056, + "step": 3274 + }, + { + "epoch": 0.5126800250469631, + "grad_norm": 2.057006359100342, + "learning_rate": 7.746008471814924e-05, + "loss": 0.5193, + "step": 3275 + }, + { + "epoch": 0.5128365685660614, + "grad_norm": 2.0148165225982666, + "learning_rate": 7.745193874226133e-05, + "loss": 0.6809, + "step": 3276 + }, + { + "epoch": 0.5129931120851596, + "grad_norm": 2.755544900894165, + "learning_rate": 7.744379276637342e-05, + "loss": 0.7777, + "step": 3277 + }, + { + "epoch": 0.513149655604258, + "grad_norm": 5.070794582366943, + "learning_rate": 7.74356467904855e-05, + "loss": 0.8627, + "step": 3278 + }, + { + "epoch": 0.5133061991233563, + "grad_norm": 2.280787944793701, + "learning_rate": 7.74275008145976e-05, + "loss": 0.4383, + "step": 3279 + }, + { + "epoch": 0.5134627426424546, + "grad_norm": 1.211061954498291, + "learning_rate": 7.741935483870968e-05, + "loss": 0.4874, + "step": 3280 + }, + { + "epoch": 0.513619286161553, + "grad_norm": 6.695600509643555, + "learning_rate": 7.741120886282177e-05, + "loss": 0.7883, + "step": 3281 + }, + { + "epoch": 0.5137758296806513, + "grad_norm": 2.204355239868164, + "learning_rate": 7.740306288693386e-05, + "loss": 0.6369, + "step": 3282 + }, + { + "epoch": 0.5139323731997495, + "grad_norm": 6.98603630065918, + "learning_rate": 7.739491691104595e-05, + "loss": 0.9432, + "step": 3283 + }, + { + "epoch": 0.5140889167188478, + "grad_norm": 2.1165072917938232, + "learning_rate": 7.738677093515803e-05, + "loss": 1.025, + "step": 3284 + }, + { + "epoch": 0.5142454602379462, + "grad_norm": NaN, + "learning_rate": 7.738677093515803e-05, + "loss": 0.0, + "step": 3285 + }, + { + "epoch": 
0.5144020037570445, + "grad_norm": 3.7664220333099365, + "learning_rate": 7.737862495927013e-05, + "loss": 0.8271, + "step": 3286 + }, + { + "epoch": 0.5145585472761428, + "grad_norm": 3.139533758163452, + "learning_rate": 7.737047898338221e-05, + "loss": 0.7982, + "step": 3287 + }, + { + "epoch": 0.514715090795241, + "grad_norm": 3.0189554691314697, + "learning_rate": 7.73623330074943e-05, + "loss": 1.4185, + "step": 3288 + }, + { + "epoch": 0.5148716343143394, + "grad_norm": 3.457087516784668, + "learning_rate": 7.73541870316064e-05, + "loss": 1.1278, + "step": 3289 + }, + { + "epoch": 0.5150281778334377, + "grad_norm": 4.4606852531433105, + "learning_rate": 7.734604105571848e-05, + "loss": 1.3243, + "step": 3290 + }, + { + "epoch": 0.515184721352536, + "grad_norm": 6.005200386047363, + "learning_rate": 7.733789507983058e-05, + "loss": 1.4794, + "step": 3291 + }, + { + "epoch": 0.5153412648716343, + "grad_norm": 3.069075584411621, + "learning_rate": 7.732974910394266e-05, + "loss": 1.2734, + "step": 3292 + }, + { + "epoch": 0.5154978083907327, + "grad_norm": 1.7494914531707764, + "learning_rate": 7.732160312805474e-05, + "loss": 1.2937, + "step": 3293 + }, + { + "epoch": 0.5156543519098309, + "grad_norm": 2.536943197250366, + "learning_rate": 7.731345715216684e-05, + "loss": 1.6806, + "step": 3294 + }, + { + "epoch": 0.5158108954289292, + "grad_norm": 3.745630979537964, + "learning_rate": 7.730531117627892e-05, + "loss": 1.4967, + "step": 3295 + }, + { + "epoch": 0.5159674389480275, + "grad_norm": 3.710773468017578, + "learning_rate": 7.729716520039101e-05, + "loss": 0.9807, + "step": 3296 + }, + { + "epoch": 0.5161239824671259, + "grad_norm": 2.739320755004883, + "learning_rate": 7.72890192245031e-05, + "loss": 1.0992, + "step": 3297 + }, + { + "epoch": 0.5162805259862242, + "grad_norm": 4.089766979217529, + "learning_rate": 7.728087324861519e-05, + "loss": 0.4131, + "step": 3298 + }, + { + "epoch": 0.5164370695053225, + "grad_norm": 2.2386627197265625, + 
"learning_rate": 7.727272727272727e-05, + "loss": 0.9886, + "step": 3299 + }, + { + "epoch": 0.5165936130244208, + "grad_norm": 2.1536247730255127, + "learning_rate": 7.726458129683937e-05, + "loss": 1.1401, + "step": 3300 + }, + { + "epoch": 0.5167501565435191, + "grad_norm": 0.6447820663452148, + "learning_rate": 7.725643532095145e-05, + "loss": 0.2738, + "step": 3301 + }, + { + "epoch": 0.5169067000626174, + "grad_norm": 0.895799994468689, + "learning_rate": 7.724828934506354e-05, + "loss": 0.2756, + "step": 3302 + }, + { + "epoch": 0.5170632435817157, + "grad_norm": 0.6046528220176697, + "learning_rate": 7.724014336917563e-05, + "loss": 0.2883, + "step": 3303 + }, + { + "epoch": 0.5172197871008141, + "grad_norm": 3.3685050010681152, + "learning_rate": 7.723199739328773e-05, + "loss": 0.6136, + "step": 3304 + }, + { + "epoch": 0.5173763306199123, + "grad_norm": 0.8558064699172974, + "learning_rate": 7.72238514173998e-05, + "loss": 0.3725, + "step": 3305 + }, + { + "epoch": 0.5175328741390106, + "grad_norm": 0.9131036996841431, + "learning_rate": 7.72157054415119e-05, + "loss": 0.4187, + "step": 3306 + }, + { + "epoch": 0.5176894176581089, + "grad_norm": 1.1137261390686035, + "learning_rate": 7.7207559465624e-05, + "loss": 0.5478, + "step": 3307 + }, + { + "epoch": 0.5178459611772073, + "grad_norm": 1.1340792179107666, + "learning_rate": 7.719941348973607e-05, + "loss": 0.4161, + "step": 3308 + }, + { + "epoch": 0.5180025046963056, + "grad_norm": 2.3940184116363525, + "learning_rate": 7.719126751384816e-05, + "loss": 0.3978, + "step": 3309 + }, + { + "epoch": 0.5181590482154039, + "grad_norm": 6.755865097045898, + "learning_rate": 7.718312153796026e-05, + "loss": 0.5773, + "step": 3310 + }, + { + "epoch": 0.5183155917345021, + "grad_norm": 0.979030430316925, + "learning_rate": 7.717497556207233e-05, + "loss": 0.3812, + "step": 3311 + }, + { + "epoch": 0.5184721352536005, + "grad_norm": 1.0931179523468018, + "learning_rate": 7.716682958618443e-05, + "loss": 0.371, 
+ "step": 3312 + }, + { + "epoch": 0.5186286787726988, + "grad_norm": 1.0465348958969116, + "learning_rate": 7.715868361029653e-05, + "loss": 0.4009, + "step": 3313 + }, + { + "epoch": 0.5187852222917971, + "grad_norm": 1.5454899072647095, + "learning_rate": 7.715053763440861e-05, + "loss": 0.3671, + "step": 3314 + }, + { + "epoch": 0.5189417658108955, + "grad_norm": 1.8327946662902832, + "learning_rate": 7.71423916585207e-05, + "loss": 0.5858, + "step": 3315 + }, + { + "epoch": 0.5190983093299938, + "grad_norm": 1.2420032024383545, + "learning_rate": 7.713424568263279e-05, + "loss": 0.5922, + "step": 3316 + }, + { + "epoch": 0.519254852849092, + "grad_norm": 1.3695108890533447, + "learning_rate": 7.712609970674487e-05, + "loss": 0.762, + "step": 3317 + }, + { + "epoch": 0.5194113963681903, + "grad_norm": 0.89469313621521, + "learning_rate": 7.711795373085696e-05, + "loss": 0.3573, + "step": 3318 + }, + { + "epoch": 0.5195679398872887, + "grad_norm": 1.7926404476165771, + "learning_rate": 7.710980775496906e-05, + "loss": 0.4819, + "step": 3319 + }, + { + "epoch": 0.519724483406387, + "grad_norm": 1.9602640867233276, + "learning_rate": 7.710166177908114e-05, + "loss": 0.4491, + "step": 3320 + }, + { + "epoch": 0.5198810269254853, + "grad_norm": 1.5297489166259766, + "learning_rate": 7.709351580319322e-05, + "loss": 0.4848, + "step": 3321 + }, + { + "epoch": 0.5200375704445835, + "grad_norm": 1.6152745485305786, + "learning_rate": 7.708536982730532e-05, + "loss": 0.6943, + "step": 3322 + }, + { + "epoch": 0.5201941139636819, + "grad_norm": 2.6172497272491455, + "learning_rate": 7.70772238514174e-05, + "loss": 0.62, + "step": 3323 + }, + { + "epoch": 0.5203506574827802, + "grad_norm": 1.9292031526565552, + "learning_rate": 7.706907787552949e-05, + "loss": 0.6048, + "step": 3324 + }, + { + "epoch": 0.5205072010018785, + "grad_norm": 3.083651304244995, + "learning_rate": 7.706093189964157e-05, + "loss": 0.6623, + "step": 3325 + }, + { + "epoch": 0.5206637445209769, + 
"grad_norm": 1.9697442054748535, + "learning_rate": 7.705278592375367e-05, + "loss": 0.62, + "step": 3326 + }, + { + "epoch": 0.5208202880400752, + "grad_norm": 1.9933054447174072, + "learning_rate": 7.704463994786577e-05, + "loss": 0.5459, + "step": 3327 + }, + { + "epoch": 0.5209768315591734, + "grad_norm": 1.8028401136398315, + "learning_rate": 7.703649397197784e-05, + "loss": 0.4827, + "step": 3328 + }, + { + "epoch": 0.5211333750782717, + "grad_norm": 2.9447693824768066, + "learning_rate": 7.702834799608993e-05, + "loss": 0.9621, + "step": 3329 + }, + { + "epoch": 0.5212899185973701, + "grad_norm": 3.2362301349639893, + "learning_rate": 7.702020202020203e-05, + "loss": 0.6285, + "step": 3330 + }, + { + "epoch": 0.5214464621164684, + "grad_norm": 1.1952751874923706, + "learning_rate": 7.70120560443141e-05, + "loss": 0.4623, + "step": 3331 + }, + { + "epoch": 0.5216030056355667, + "grad_norm": 2.2861287593841553, + "learning_rate": 7.70039100684262e-05, + "loss": 0.636, + "step": 3332 + }, + { + "epoch": 0.521759549154665, + "grad_norm": 5.21936559677124, + "learning_rate": 7.69957640925383e-05, + "loss": 0.7853, + "step": 3333 + }, + { + "epoch": 0.5219160926737633, + "grad_norm": 2.9163124561309814, + "learning_rate": 7.698761811665038e-05, + "loss": 0.9136, + "step": 3334 + }, + { + "epoch": 0.5220726361928616, + "grad_norm": 6.033055305480957, + "learning_rate": 7.697947214076246e-05, + "loss": 1.2614, + "step": 3335 + }, + { + "epoch": 0.5222291797119599, + "grad_norm": 1.5139204263687134, + "learning_rate": 7.697132616487456e-05, + "loss": 0.8291, + "step": 3336 + }, + { + "epoch": 0.5223857232310583, + "grad_norm": 3.4025204181671143, + "learning_rate": 7.696318018898664e-05, + "loss": 0.7433, + "step": 3337 + }, + { + "epoch": 0.5225422667501566, + "grad_norm": 3.779388666152954, + "learning_rate": 7.695503421309873e-05, + "loss": 1.3421, + "step": 3338 + }, + { + "epoch": 0.5226988102692548, + "grad_norm": 3.1912856101989746, + "learning_rate": 
7.694688823721083e-05, + "loss": 1.1188, + "step": 3339 + }, + { + "epoch": 0.5228553537883531, + "grad_norm": 4.086740970611572, + "learning_rate": 7.693874226132291e-05, + "loss": 1.287, + "step": 3340 + }, + { + "epoch": 0.5230118973074515, + "grad_norm": 3.2559094429016113, + "learning_rate": 7.693059628543499e-05, + "loss": 1.6255, + "step": 3341 + }, + { + "epoch": 0.5231684408265498, + "grad_norm": 2.2990639209747314, + "learning_rate": 7.692245030954709e-05, + "loss": 1.0378, + "step": 3342 + }, + { + "epoch": 0.5233249843456481, + "grad_norm": 4.3040385246276855, + "learning_rate": 7.691430433365917e-05, + "loss": 1.1084, + "step": 3343 + }, + { + "epoch": 0.5234815278647464, + "grad_norm": 3.238898754119873, + "learning_rate": 7.690615835777126e-05, + "loss": 0.9608, + "step": 3344 + }, + { + "epoch": 0.5236380713838447, + "grad_norm": 3.160226821899414, + "learning_rate": 7.689801238188335e-05, + "loss": 1.3111, + "step": 3345 + }, + { + "epoch": 0.523794614902943, + "grad_norm": 3.8179826736450195, + "learning_rate": 7.688986640599544e-05, + "loss": 1.116, + "step": 3346 + }, + { + "epoch": 0.5239511584220413, + "grad_norm": 1.6879281997680664, + "learning_rate": 7.688172043010752e-05, + "loss": 0.7689, + "step": 3347 + }, + { + "epoch": 0.5241077019411396, + "grad_norm": 2.3110029697418213, + "learning_rate": 7.687357445421962e-05, + "loss": 1.0097, + "step": 3348 + }, + { + "epoch": 0.524264245460238, + "grad_norm": 2.9897308349609375, + "learning_rate": 7.68654284783317e-05, + "loss": 0.6236, + "step": 3349 + }, + { + "epoch": 0.5244207889793363, + "grad_norm": 2.0346357822418213, + "learning_rate": 7.68572825024438e-05, + "loss": 0.7231, + "step": 3350 + }, + { + "epoch": 0.5245773324984345, + "grad_norm": 0.6673487424850464, + "learning_rate": 7.684913652655588e-05, + "loss": 0.3681, + "step": 3351 + }, + { + "epoch": 0.5247338760175329, + "grad_norm": 0.8218129873275757, + "learning_rate": 7.684099055066797e-05, + "loss": 0.2842, + "step": 3352 + 
}, + { + "epoch": 0.5248904195366312, + "grad_norm": 0.7349612712860107, + "learning_rate": 7.683284457478007e-05, + "loss": 0.385, + "step": 3353 + }, + { + "epoch": 0.5250469630557295, + "grad_norm": 0.7181155681610107, + "learning_rate": 7.682469859889215e-05, + "loss": 0.2891, + "step": 3354 + }, + { + "epoch": 0.5252035065748278, + "grad_norm": 0.7574586868286133, + "learning_rate": 7.681655262300423e-05, + "loss": 0.3502, + "step": 3355 + }, + { + "epoch": 0.5253600500939261, + "grad_norm": 0.8311702013015747, + "learning_rate": 7.680840664711633e-05, + "loss": 0.3001, + "step": 3356 + }, + { + "epoch": 0.5255165936130244, + "grad_norm": 1.4756038188934326, + "learning_rate": 7.680026067122841e-05, + "loss": 0.6121, + "step": 3357 + }, + { + "epoch": 0.5256731371321227, + "grad_norm": 0.9203120470046997, + "learning_rate": 7.67921146953405e-05, + "loss": 0.3926, + "step": 3358 + }, + { + "epoch": 0.525829680651221, + "grad_norm": 1.2271647453308105, + "learning_rate": 7.67839687194526e-05, + "loss": 0.4479, + "step": 3359 + }, + { + "epoch": 0.5259862241703194, + "grad_norm": 1.258286714553833, + "learning_rate": 7.677582274356468e-05, + "loss": 0.3974, + "step": 3360 + }, + { + "epoch": 0.5261427676894177, + "grad_norm": 1.1268293857574463, + "learning_rate": 7.676767676767676e-05, + "loss": 0.4361, + "step": 3361 + }, + { + "epoch": 0.5262993112085159, + "grad_norm": 15.66916275024414, + "learning_rate": 7.675953079178886e-05, + "loss": 3.6819, + "step": 3362 + }, + { + "epoch": 0.5264558547276142, + "grad_norm": 0.9705180525779724, + "learning_rate": 7.675138481590096e-05, + "loss": 0.2973, + "step": 3363 + }, + { + "epoch": 0.5266123982467126, + "grad_norm": 1.2707167863845825, + "learning_rate": 7.674323884001303e-05, + "loss": 0.4204, + "step": 3364 + }, + { + "epoch": 0.5267689417658109, + "grad_norm": 2.697657346725464, + "learning_rate": 7.673509286412512e-05, + "loss": 0.4842, + "step": 3365 + }, + { + "epoch": 0.5269254852849092, + "grad_norm": 
1.2424544095993042, + "learning_rate": 7.672694688823722e-05, + "loss": 0.5012, + "step": 3366 + }, + { + "epoch": 0.5270820288040076, + "grad_norm": 2.579538583755493, + "learning_rate": 7.671880091234929e-05, + "loss": 0.6988, + "step": 3367 + }, + { + "epoch": 0.5272385723231058, + "grad_norm": 1.8209444284439087, + "learning_rate": 7.671065493646139e-05, + "loss": 0.5544, + "step": 3368 + }, + { + "epoch": 0.5273951158422041, + "grad_norm": 1.5120891332626343, + "learning_rate": 7.670250896057349e-05, + "loss": 0.4479, + "step": 3369 + }, + { + "epoch": 0.5275516593613024, + "grad_norm": 1.7992609739303589, + "learning_rate": 7.669436298468556e-05, + "loss": 0.6007, + "step": 3370 + }, + { + "epoch": 0.5277082028804008, + "grad_norm": 1.1531298160552979, + "learning_rate": 7.668621700879765e-05, + "loss": 0.344, + "step": 3371 + }, + { + "epoch": 0.5278647463994991, + "grad_norm": 2.7541208267211914, + "learning_rate": 7.667807103290975e-05, + "loss": 0.8582, + "step": 3372 + }, + { + "epoch": 0.5280212899185974, + "grad_norm": 1.758898138999939, + "learning_rate": 7.666992505702184e-05, + "loss": 0.856, + "step": 3373 + }, + { + "epoch": 0.5281778334376956, + "grad_norm": 2.0890884399414062, + "learning_rate": 7.666177908113392e-05, + "loss": 0.5461, + "step": 3374 + }, + { + "epoch": 0.528334376956794, + "grad_norm": 2.1545403003692627, + "learning_rate": 7.665363310524602e-05, + "loss": 0.649, + "step": 3375 + }, + { + "epoch": 0.5284909204758923, + "grad_norm": 2.2777936458587646, + "learning_rate": 7.66454871293581e-05, + "loss": 0.8281, + "step": 3376 + }, + { + "epoch": 0.5286474639949906, + "grad_norm": 2.361903429031372, + "learning_rate": 7.663734115347018e-05, + "loss": 0.7163, + "step": 3377 + }, + { + "epoch": 0.528804007514089, + "grad_norm": 1.6033458709716797, + "learning_rate": 7.662919517758228e-05, + "loss": 0.64, + "step": 3378 + }, + { + "epoch": 0.5289605510331872, + "grad_norm": 6.922934055328369, + "learning_rate": 7.662104920169437e-05, 
+ "loss": 1.2458, + "step": 3379 + }, + { + "epoch": 0.5291170945522855, + "grad_norm": 2.380204439163208, + "learning_rate": 7.661290322580645e-05, + "loss": 0.7196, + "step": 3380 + }, + { + "epoch": 0.5292736380713838, + "grad_norm": 4.197885036468506, + "learning_rate": 7.660475724991855e-05, + "loss": 0.8151, + "step": 3381 + }, + { + "epoch": 0.5294301815904822, + "grad_norm": 3.3267126083374023, + "learning_rate": 7.659661127403063e-05, + "loss": 0.9017, + "step": 3382 + }, + { + "epoch": 0.5295867251095805, + "grad_norm": 1.2234145402908325, + "learning_rate": 7.658846529814271e-05, + "loss": 0.3188, + "step": 3383 + }, + { + "epoch": 0.5297432686286788, + "grad_norm": 3.372546672821045, + "learning_rate": 7.658031932225481e-05, + "loss": 0.8772, + "step": 3384 + }, + { + "epoch": 0.529899812147777, + "grad_norm": 2.4819042682647705, + "learning_rate": 7.65721733463669e-05, + "loss": 0.621, + "step": 3385 + }, + { + "epoch": 0.5300563556668754, + "grad_norm": 2.694667100906372, + "learning_rate": 7.656402737047899e-05, + "loss": 1.4438, + "step": 3386 + }, + { + "epoch": 0.5302128991859737, + "grad_norm": 3.382354259490967, + "learning_rate": 7.655588139459108e-05, + "loss": 1.153, + "step": 3387 + }, + { + "epoch": 0.530369442705072, + "grad_norm": 4.17157506942749, + "learning_rate": 7.654773541870316e-05, + "loss": 1.2823, + "step": 3388 + }, + { + "epoch": 0.5305259862241704, + "grad_norm": 2.6846985816955566, + "learning_rate": 7.653958944281526e-05, + "loss": 0.851, + "step": 3389 + }, + { + "epoch": 0.5306825297432687, + "grad_norm": 3.0389981269836426, + "learning_rate": 7.653144346692734e-05, + "loss": 0.7871, + "step": 3390 + }, + { + "epoch": 0.5308390732623669, + "grad_norm": 3.8984694480895996, + "learning_rate": 7.652329749103942e-05, + "loss": 1.6489, + "step": 3391 + }, + { + "epoch": 0.5309956167814652, + "grad_norm": 2.7128522396087646, + "learning_rate": 7.651515151515152e-05, + "loss": 1.344, + "step": 3392 + }, + { + "epoch": 
0.5311521603005636, + "grad_norm": 6.02797269821167, + "learning_rate": 7.65070055392636e-05, + "loss": 1.6661, + "step": 3393 + }, + { + "epoch": 0.5313087038196619, + "grad_norm": 2.3682825565338135, + "learning_rate": 7.649885956337569e-05, + "loss": 0.7125, + "step": 3394 + }, + { + "epoch": 0.5314652473387602, + "grad_norm": 3.491345167160034, + "learning_rate": 7.649071358748779e-05, + "loss": 1.0154, + "step": 3395 + }, + { + "epoch": 0.5316217908578584, + "grad_norm": 3.954564332962036, + "learning_rate": 7.648256761159987e-05, + "loss": 1.3852, + "step": 3396 + }, + { + "epoch": 0.5317783343769568, + "grad_norm": 3.386807918548584, + "learning_rate": 7.647442163571195e-05, + "loss": 1.0027, + "step": 3397 + }, + { + "epoch": 0.5319348778960551, + "grad_norm": 1.840760350227356, + "learning_rate": 7.646627565982405e-05, + "loss": 0.7306, + "step": 3398 + }, + { + "epoch": 0.5320914214151534, + "grad_norm": 6.10250186920166, + "learning_rate": 7.645812968393613e-05, + "loss": 1.7297, + "step": 3399 + }, + { + "epoch": 0.5322479649342517, + "grad_norm": 3.750319719314575, + "learning_rate": 7.644998370804822e-05, + "loss": 1.3335, + "step": 3400 + }, + { + "epoch": 0.5324045084533501, + "grad_norm": 0.9741947054862976, + "learning_rate": 7.644183773216032e-05, + "loss": 0.3871, + "step": 3401 + }, + { + "epoch": 0.5325610519724483, + "grad_norm": 0.5060848593711853, + "learning_rate": 7.643369175627241e-05, + "loss": 0.253, + "step": 3402 + }, + { + "epoch": 0.5327175954915466, + "grad_norm": 0.5400909781455994, + "learning_rate": 7.642554578038448e-05, + "loss": 0.2979, + "step": 3403 + }, + { + "epoch": 0.532874139010645, + "grad_norm": 0.8474114537239075, + "learning_rate": 7.641739980449658e-05, + "loss": 0.4297, + "step": 3404 + }, + { + "epoch": 0.5330306825297433, + "grad_norm": 0.4938964545726776, + "learning_rate": 7.640925382860868e-05, + "loss": 0.2122, + "step": 3405 + }, + { + "epoch": 0.5331872260488416, + "grad_norm": 0.7060590982437134, + 
"learning_rate": 7.640110785272075e-05, + "loss": 0.3298, + "step": 3406 + }, + { + "epoch": 0.5333437695679399, + "grad_norm": 0.7204407453536987, + "learning_rate": 7.639296187683285e-05, + "loss": 0.3351, + "step": 3407 + }, + { + "epoch": 0.5335003130870382, + "grad_norm": 1.0669149160385132, + "learning_rate": 7.638481590094494e-05, + "loss": 0.4724, + "step": 3408 + }, + { + "epoch": 0.5336568566061365, + "grad_norm": 0.6671699285507202, + "learning_rate": 7.637666992505703e-05, + "loss": 0.2926, + "step": 3409 + }, + { + "epoch": 0.5338134001252348, + "grad_norm": 1.005455493927002, + "learning_rate": 7.636852394916911e-05, + "loss": 0.4727, + "step": 3410 + }, + { + "epoch": 0.5339699436443331, + "grad_norm": 1.1427040100097656, + "learning_rate": 7.636037797328121e-05, + "loss": 0.4663, + "step": 3411 + }, + { + "epoch": 0.5341264871634315, + "grad_norm": 1.0493615865707397, + "learning_rate": 7.635223199739329e-05, + "loss": 0.4689, + "step": 3412 + }, + { + "epoch": 0.5342830306825297, + "grad_norm": 1.1950101852416992, + "learning_rate": 7.634408602150538e-05, + "loss": 0.4117, + "step": 3413 + }, + { + "epoch": 0.534439574201628, + "grad_norm": 3.1163125038146973, + "learning_rate": 7.633594004561747e-05, + "loss": 0.6023, + "step": 3414 + }, + { + "epoch": 0.5345961177207263, + "grad_norm": 1.8868329524993896, + "learning_rate": 7.632779406972956e-05, + "loss": 0.5537, + "step": 3415 + }, + { + "epoch": 0.5347526612398247, + "grad_norm": 0.9989250302314758, + "learning_rate": 7.631964809384164e-05, + "loss": 0.3709, + "step": 3416 + }, + { + "epoch": 0.534909204758923, + "grad_norm": 1.887946605682373, + "learning_rate": 7.631150211795374e-05, + "loss": 0.5661, + "step": 3417 + }, + { + "epoch": 0.5350657482780213, + "grad_norm": 2.973623752593994, + "learning_rate": 7.630335614206582e-05, + "loss": 0.6965, + "step": 3418 + }, + { + "epoch": 0.5352222917971196, + "grad_norm": 0.9018517732620239, + "learning_rate": 7.62952101661779e-05, + "loss": 
0.3211, + "step": 3419 + }, + { + "epoch": 0.5353788353162179, + "grad_norm": 2.3923563957214355, + "learning_rate": 7.628706419029e-05, + "loss": 0.3482, + "step": 3420 + }, + { + "epoch": 0.5355353788353162, + "grad_norm": 2.5631814002990723, + "learning_rate": 7.627891821440209e-05, + "loss": 0.553, + "step": 3421 + }, + { + "epoch": 0.5356919223544145, + "grad_norm": 2.3675355911254883, + "learning_rate": 7.627077223851418e-05, + "loss": 0.601, + "step": 3422 + }, + { + "epoch": 0.5358484658735129, + "grad_norm": 3.191016674041748, + "learning_rate": 7.626262626262627e-05, + "loss": 0.7109, + "step": 3423 + }, + { + "epoch": 0.5360050093926112, + "grad_norm": 2.2137625217437744, + "learning_rate": 7.625448028673835e-05, + "loss": 0.7239, + "step": 3424 + }, + { + "epoch": 0.5361615529117094, + "grad_norm": 2.592637300491333, + "learning_rate": 7.624633431085045e-05, + "loss": 0.5356, + "step": 3425 + }, + { + "epoch": 0.5363180964308077, + "grad_norm": 8.435508728027344, + "learning_rate": 7.623818833496253e-05, + "loss": 1.2834, + "step": 3426 + }, + { + "epoch": 0.5364746399499061, + "grad_norm": 2.012038230895996, + "learning_rate": 7.623004235907462e-05, + "loss": 0.8711, + "step": 3427 + }, + { + "epoch": 0.5366311834690044, + "grad_norm": 2.593794345855713, + "learning_rate": 7.622189638318671e-05, + "loss": 0.8191, + "step": 3428 + }, + { + "epoch": 0.5367877269881027, + "grad_norm": 2.099384069442749, + "learning_rate": 7.62137504072988e-05, + "loss": 0.9566, + "step": 3429 + }, + { + "epoch": 0.536944270507201, + "grad_norm": 2.12691593170166, + "learning_rate": 7.620560443141088e-05, + "loss": 0.5908, + "step": 3430 + }, + { + "epoch": 0.5371008140262993, + "grad_norm": 2.597759485244751, + "learning_rate": 7.619745845552298e-05, + "loss": 1.148, + "step": 3431 + }, + { + "epoch": 0.5372573575453976, + "grad_norm": 2.621180772781372, + "learning_rate": 7.618931247963506e-05, + "loss": 0.8272, + "step": 3432 + }, + { + "epoch": 0.5374139010644959, + 
"grad_norm": 2.15786075592041, + "learning_rate": 7.618116650374715e-05, + "loss": 0.7745, + "step": 3433 + }, + { + "epoch": 0.5375704445835943, + "grad_norm": 4.472789764404297, + "learning_rate": 7.617302052785924e-05, + "loss": 0.794, + "step": 3434 + }, + { + "epoch": 0.5377269881026926, + "grad_norm": 4.199079990386963, + "learning_rate": 7.616487455197133e-05, + "loss": 0.9639, + "step": 3435 + }, + { + "epoch": 0.5378835316217908, + "grad_norm": 2.6146583557128906, + "learning_rate": 7.615672857608341e-05, + "loss": 0.88, + "step": 3436 + }, + { + "epoch": 0.5380400751408891, + "grad_norm": 2.5807607173919678, + "learning_rate": 7.614858260019551e-05, + "loss": 0.9183, + "step": 3437 + }, + { + "epoch": 0.5381966186599875, + "grad_norm": 8.182585716247559, + "learning_rate": 7.61404366243076e-05, + "loss": 1.0928, + "step": 3438 + }, + { + "epoch": 0.5383531621790858, + "grad_norm": 2.7926440238952637, + "learning_rate": 7.613229064841967e-05, + "loss": 0.9926, + "step": 3439 + }, + { + "epoch": 0.5385097056981841, + "grad_norm": 2.8866796493530273, + "learning_rate": 7.612414467253177e-05, + "loss": 1.177, + "step": 3440 + }, + { + "epoch": 0.5386662492172825, + "grad_norm": 7.21640157699585, + "learning_rate": 7.611599869664387e-05, + "loss": 1.4305, + "step": 3441 + }, + { + "epoch": 0.5388227927363807, + "grad_norm": 7.487827777862549, + "learning_rate": 7.610785272075594e-05, + "loss": 1.7132, + "step": 3442 + }, + { + "epoch": 0.538979336255479, + "grad_norm": 3.3370847702026367, + "learning_rate": 7.609970674486804e-05, + "loss": 0.9751, + "step": 3443 + }, + { + "epoch": 0.5391358797745773, + "grad_norm": 2.6586215496063232, + "learning_rate": 7.609156076898013e-05, + "loss": 1.554, + "step": 3444 + }, + { + "epoch": 0.5392924232936757, + "grad_norm": 5.3226399421691895, + "learning_rate": 7.608341479309222e-05, + "loss": 1.9042, + "step": 3445 + }, + { + "epoch": 0.539448966812774, + "grad_norm": 1.7487690448760986, + "learning_rate": 
7.60752688172043e-05, + "loss": 0.5781, + "step": 3446 + }, + { + "epoch": 0.5396055103318722, + "grad_norm": 1.3267630338668823, + "learning_rate": 7.60671228413164e-05, + "loss": 0.5472, + "step": 3447 + }, + { + "epoch": 0.5397620538509705, + "grad_norm": 5.800814628601074, + "learning_rate": 7.605897686542848e-05, + "loss": 1.3644, + "step": 3448 + }, + { + "epoch": 0.5399185973700689, + "grad_norm": 3.124795436859131, + "learning_rate": 7.605083088954057e-05, + "loss": 0.7274, + "step": 3449 + }, + { + "epoch": 0.5400751408891672, + "grad_norm": 2.408482074737549, + "learning_rate": 7.604268491365266e-05, + "loss": 1.3403, + "step": 3450 + }, + { + "epoch": 0.5402316844082655, + "grad_norm": 0.7294401526451111, + "learning_rate": 7.603453893776475e-05, + "loss": 0.3031, + "step": 3451 + }, + { + "epoch": 0.5403882279273639, + "grad_norm": 0.966210126876831, + "learning_rate": 7.602639296187683e-05, + "loss": 0.4306, + "step": 3452 + }, + { + "epoch": 0.5405447714464621, + "grad_norm": 1.3746386766433716, + "learning_rate": 7.601824698598893e-05, + "loss": 0.2028, + "step": 3453 + }, + { + "epoch": 0.5407013149655604, + "grad_norm": 0.6886386275291443, + "learning_rate": 7.601010101010101e-05, + "loss": 0.3349, + "step": 3454 + }, + { + "epoch": 0.5408578584846587, + "grad_norm": 0.6538557410240173, + "learning_rate": 7.60019550342131e-05, + "loss": 0.2967, + "step": 3455 + }, + { + "epoch": 0.5410144020037571, + "grad_norm": 3.697408676147461, + "learning_rate": 7.59938090583252e-05, + "loss": 0.4882, + "step": 3456 + }, + { + "epoch": 0.5411709455228554, + "grad_norm": 0.9626391530036926, + "learning_rate": 7.598566308243728e-05, + "loss": 0.3055, + "step": 3457 + }, + { + "epoch": 0.5413274890419537, + "grad_norm": 1.0742534399032593, + "learning_rate": 7.597751710654936e-05, + "loss": 0.3749, + "step": 3458 + }, + { + "epoch": 0.5414840325610519, + "grad_norm": 1.0142515897750854, + "learning_rate": 7.596937113066146e-05, + "loss": 0.2302, + "step": 3459 + 
}, + { + "epoch": 0.5416405760801503, + "grad_norm": 1.0481387376785278, + "learning_rate": 7.596122515477354e-05, + "loss": 0.3254, + "step": 3460 + }, + { + "epoch": 0.5417971195992486, + "grad_norm": 1.2410204410552979, + "learning_rate": 7.595307917888564e-05, + "loss": 0.2815, + "step": 3461 + }, + { + "epoch": 0.5419536631183469, + "grad_norm": 1.1648932695388794, + "learning_rate": 7.594493320299772e-05, + "loss": 0.3625, + "step": 3462 + }, + { + "epoch": 0.5421102066374452, + "grad_norm": 1.1295238733291626, + "learning_rate": 7.59367872271098e-05, + "loss": 0.4357, + "step": 3463 + }, + { + "epoch": 0.5422667501565435, + "grad_norm": 2.1965973377227783, + "learning_rate": 7.59286412512219e-05, + "loss": 0.7085, + "step": 3464 + }, + { + "epoch": 0.5424232936756418, + "grad_norm": 1.790886402130127, + "learning_rate": 7.592049527533399e-05, + "loss": 0.389, + "step": 3465 + }, + { + "epoch": 0.5425798371947401, + "grad_norm": 1.2912918329238892, + "learning_rate": 7.591234929944607e-05, + "loss": 0.5523, + "step": 3466 + }, + { + "epoch": 0.5427363807138385, + "grad_norm": 3.4621424674987793, + "learning_rate": 7.590420332355817e-05, + "loss": 0.5383, + "step": 3467 + }, + { + "epoch": 0.5428929242329368, + "grad_norm": 2.350144863128662, + "learning_rate": 7.589605734767025e-05, + "loss": 0.822, + "step": 3468 + }, + { + "epoch": 0.5430494677520351, + "grad_norm": 1.8190078735351562, + "learning_rate": 7.588791137178234e-05, + "loss": 0.5704, + "step": 3469 + }, + { + "epoch": 0.5432060112711333, + "grad_norm": 2.3060402870178223, + "learning_rate": 7.587976539589443e-05, + "loss": 0.8099, + "step": 3470 + }, + { + "epoch": 0.5433625547902317, + "grad_norm": 1.8090523481369019, + "learning_rate": 7.587161942000652e-05, + "loss": 0.5649, + "step": 3471 + }, + { + "epoch": 0.54351909830933, + "grad_norm": 2.6844747066497803, + "learning_rate": 7.58634734441186e-05, + "loss": 0.8204, + "step": 3472 + }, + { + "epoch": 0.5436756418284283, + "grad_norm": 
2.314054489135742, + "learning_rate": 7.58553274682307e-05, + "loss": 0.715, + "step": 3473 + }, + { + "epoch": 0.5438321853475266, + "grad_norm": 1.7262619733810425, + "learning_rate": 7.58471814923428e-05, + "loss": 0.5162, + "step": 3474 + }, + { + "epoch": 0.543988728866625, + "grad_norm": 1.7346463203430176, + "learning_rate": 7.583903551645487e-05, + "loss": 0.5776, + "step": 3475 + }, + { + "epoch": 0.5441452723857232, + "grad_norm": 2.1360247135162354, + "learning_rate": 7.583088954056696e-05, + "loss": 0.6661, + "step": 3476 + }, + { + "epoch": 0.5443018159048215, + "grad_norm": 2.1722593307495117, + "learning_rate": 7.582274356467906e-05, + "loss": 0.964, + "step": 3477 + }, + { + "epoch": 0.5444583594239198, + "grad_norm": 3.111992835998535, + "learning_rate": 7.581459758879113e-05, + "loss": 1.1206, + "step": 3478 + }, + { + "epoch": 0.5446149029430182, + "grad_norm": 1.0526963472366333, + "learning_rate": 7.580645161290323e-05, + "loss": 0.4871, + "step": 3479 + }, + { + "epoch": 0.5447714464621165, + "grad_norm": 2.2690000534057617, + "learning_rate": 7.579830563701533e-05, + "loss": 0.6657, + "step": 3480 + }, + { + "epoch": 0.5449279899812148, + "grad_norm": 6.505357265472412, + "learning_rate": 7.579015966112741e-05, + "loss": 0.9753, + "step": 3481 + }, + { + "epoch": 0.545084533500313, + "grad_norm": 4.666423797607422, + "learning_rate": 7.578201368523949e-05, + "loss": 1.0005, + "step": 3482 + }, + { + "epoch": 0.5452410770194114, + "grad_norm": 1.7371422052383423, + "learning_rate": 7.577386770935159e-05, + "loss": 0.6873, + "step": 3483 + }, + { + "epoch": 0.5453976205385097, + "grad_norm": 3.4155361652374268, + "learning_rate": 7.576572173346367e-05, + "loss": 0.8841, + "step": 3484 + }, + { + "epoch": 0.545554164057608, + "grad_norm": 2.2857167720794678, + "learning_rate": 7.575757575757576e-05, + "loss": 0.723, + "step": 3485 + }, + { + "epoch": 0.5457107075767064, + "grad_norm": 2.466841220855713, + "learning_rate": 7.574942978168785e-05, 
+ "loss": 1.0656, + "step": 3486 + }, + { + "epoch": 0.5458672510958046, + "grad_norm": 4.408296585083008, + "learning_rate": 7.574128380579994e-05, + "loss": 1.2453, + "step": 3487 + }, + { + "epoch": 0.5460237946149029, + "grad_norm": 3.5641655921936035, + "learning_rate": 7.573313782991202e-05, + "loss": 1.099, + "step": 3488 + }, + { + "epoch": 0.5461803381340012, + "grad_norm": 2.1907687187194824, + "learning_rate": 7.572499185402412e-05, + "loss": 0.9084, + "step": 3489 + }, + { + "epoch": 0.5463368816530996, + "grad_norm": 4.63608455657959, + "learning_rate": 7.57168458781362e-05, + "loss": 0.9059, + "step": 3490 + }, + { + "epoch": 0.5464934251721979, + "grad_norm": 2.900362491607666, + "learning_rate": 7.570869990224829e-05, + "loss": 1.7142, + "step": 3491 + }, + { + "epoch": 0.5466499686912962, + "grad_norm": 2.5289697647094727, + "learning_rate": 7.570055392636038e-05, + "loss": 0.9883, + "step": 3492 + }, + { + "epoch": 0.5468065122103944, + "grad_norm": 2.3022143840789795, + "learning_rate": 7.569240795047247e-05, + "loss": 1.2772, + "step": 3493 + }, + { + "epoch": 0.5469630557294928, + "grad_norm": 3.920881509780884, + "learning_rate": 7.568426197458455e-05, + "loss": 1.666, + "step": 3494 + }, + { + "epoch": 0.5471195992485911, + "grad_norm": 4.273463726043701, + "learning_rate": 7.567611599869665e-05, + "loss": 1.6897, + "step": 3495 + }, + { + "epoch": 0.5472761427676894, + "grad_norm": NaN, + "learning_rate": 7.567611599869665e-05, + "loss": 0.0, + "step": 3496 + }, + { + "epoch": 0.5474326862867878, + "grad_norm": 7.167329788208008, + "learning_rate": 7.566797002280873e-05, + "loss": 1.0761, + "step": 3497 + }, + { + "epoch": 0.5475892298058861, + "grad_norm": 2.400947332382202, + "learning_rate": 7.565982404692083e-05, + "loss": 0.728, + "step": 3498 + }, + { + "epoch": 0.5477457733249843, + "grad_norm": 4.8002848625183105, + "learning_rate": 7.565167807103291e-05, + "loss": 0.5568, + "step": 3499 + }, + { + "epoch": 0.5479023168440826, + 
"grad_norm": 3.6249077320098877, + "learning_rate": 7.5643532095145e-05, + "loss": 1.387, + "step": 3500 + }, + { + "epoch": 0.548058860363181, + "grad_norm": 0.6711284518241882, + "learning_rate": 7.56353861192571e-05, + "loss": 0.2107, + "step": 3501 + }, + { + "epoch": 0.5482154038822793, + "grad_norm": 0.9918914437294006, + "learning_rate": 7.562724014336918e-05, + "loss": 0.4263, + "step": 3502 + }, + { + "epoch": 0.5483719474013776, + "grad_norm": 0.6129970550537109, + "learning_rate": 7.561909416748126e-05, + "loss": 0.2494, + "step": 3503 + }, + { + "epoch": 0.5485284909204758, + "grad_norm": 0.8330832123756409, + "learning_rate": 7.561094819159336e-05, + "loss": 0.2786, + "step": 3504 + }, + { + "epoch": 0.5486850344395742, + "grad_norm": 0.8496516942977905, + "learning_rate": 7.560280221570544e-05, + "loss": 0.27, + "step": 3505 + }, + { + "epoch": 0.5488415779586725, + "grad_norm": 0.8892713785171509, + "learning_rate": 7.559465623981753e-05, + "loss": 0.3714, + "step": 3506 + }, + { + "epoch": 0.5489981214777708, + "grad_norm": 0.8422902226448059, + "learning_rate": 7.558651026392962e-05, + "loss": 0.3077, + "step": 3507 + }, + { + "epoch": 0.5491546649968692, + "grad_norm": 1.2950185537338257, + "learning_rate": 7.557836428804171e-05, + "loss": 0.2696, + "step": 3508 + }, + { + "epoch": 0.5493112085159675, + "grad_norm": 1.0856096744537354, + "learning_rate": 7.557021831215379e-05, + "loss": 0.3142, + "step": 3509 + }, + { + "epoch": 0.5494677520350657, + "grad_norm": 1.2120784521102905, + "learning_rate": 7.556207233626589e-05, + "loss": 0.4336, + "step": 3510 + }, + { + "epoch": 0.549624295554164, + "grad_norm": 1.2028145790100098, + "learning_rate": 7.555392636037799e-05, + "loss": 0.4263, + "step": 3511 + }, + { + "epoch": 0.5497808390732624, + "grad_norm": 1.191662311553955, + "learning_rate": 7.554578038449006e-05, + "loss": 0.3642, + "step": 3512 + }, + { + "epoch": 0.5499373825923607, + "grad_norm": 1.0820554494857788, + "learning_rate": 
7.553763440860215e-05, + "loss": 0.2868, + "step": 3513 + }, + { + "epoch": 0.550093926111459, + "grad_norm": 1.7401914596557617, + "learning_rate": 7.552948843271425e-05, + "loss": 0.7997, + "step": 3514 + }, + { + "epoch": 0.5502504696305573, + "grad_norm": 1.5352338552474976, + "learning_rate": 7.552134245682632e-05, + "loss": 0.52, + "step": 3515 + }, + { + "epoch": 0.5504070131496556, + "grad_norm": 4.264407634735107, + "learning_rate": 7.551319648093842e-05, + "loss": 0.4163, + "step": 3516 + }, + { + "epoch": 0.5505635566687539, + "grad_norm": 1.095915675163269, + "learning_rate": 7.550505050505052e-05, + "loss": 0.5099, + "step": 3517 + }, + { + "epoch": 0.5507201001878522, + "grad_norm": 2.5423567295074463, + "learning_rate": 7.549690452916259e-05, + "loss": 0.4332, + "step": 3518 + }, + { + "epoch": 0.5508766437069506, + "grad_norm": 1.4117050170898438, + "learning_rate": 7.548875855327468e-05, + "loss": 0.3816, + "step": 3519 + }, + { + "epoch": 0.5510331872260489, + "grad_norm": 2.133608818054199, + "learning_rate": 7.548061257738678e-05, + "loss": 0.5658, + "step": 3520 + }, + { + "epoch": 0.5511897307451471, + "grad_norm": 3.501784324645996, + "learning_rate": 7.547246660149886e-05, + "loss": 0.6317, + "step": 3521 + }, + { + "epoch": 0.5513462742642454, + "grad_norm": 3.141094923019409, + "learning_rate": 7.546432062561095e-05, + "loss": 0.7126, + "step": 3522 + }, + { + "epoch": 0.5515028177833438, + "grad_norm": 3.791727066040039, + "learning_rate": 7.545617464972305e-05, + "loss": 0.9248, + "step": 3523 + }, + { + "epoch": 0.5516593613024421, + "grad_norm": 2.8067197799682617, + "learning_rate": 7.544802867383513e-05, + "loss": 0.929, + "step": 3524 + }, + { + "epoch": 0.5518159048215404, + "grad_norm": 2.630166530609131, + "learning_rate": 7.543988269794721e-05, + "loss": 0.769, + "step": 3525 + }, + { + "epoch": 0.5519724483406387, + "grad_norm": 1.768406629562378, + "learning_rate": 7.543173672205931e-05, + "loss": 0.9103, + "step": 3526 + }, + 
{ + "epoch": 0.552128991859737, + "grad_norm": 6.78751802444458, + "learning_rate": 7.54235907461714e-05, + "loss": 1.224, + "step": 3527 + }, + { + "epoch": 0.5522855353788353, + "grad_norm": 1.8711612224578857, + "learning_rate": 7.541544477028348e-05, + "loss": 0.7473, + "step": 3528 + }, + { + "epoch": 0.5524420788979336, + "grad_norm": 2.477947473526001, + "learning_rate": 7.540729879439558e-05, + "loss": 0.8452, + "step": 3529 + }, + { + "epoch": 0.552598622417032, + "grad_norm": 3.1295764446258545, + "learning_rate": 7.539915281850766e-05, + "loss": 1.2765, + "step": 3530 + }, + { + "epoch": 0.5527551659361303, + "grad_norm": 3.192579746246338, + "learning_rate": 7.539100684261974e-05, + "loss": 1.0056, + "step": 3531 + }, + { + "epoch": 0.5529117094552286, + "grad_norm": 1.989215612411499, + "learning_rate": 7.538286086673184e-05, + "loss": 0.8183, + "step": 3532 + }, + { + "epoch": 0.5530682529743268, + "grad_norm": 1.3750296831130981, + "learning_rate": 7.537471489084392e-05, + "loss": 0.7266, + "step": 3533 + }, + { + "epoch": 0.5532247964934252, + "grad_norm": 2.8049211502075195, + "learning_rate": 7.536656891495602e-05, + "loss": 0.6978, + "step": 3534 + }, + { + "epoch": 0.5533813400125235, + "grad_norm": 4.5168046951293945, + "learning_rate": 7.53584229390681e-05, + "loss": 0.8723, + "step": 3535 + }, + { + "epoch": 0.5535378835316218, + "grad_norm": 3.921037197113037, + "learning_rate": 7.535027696318019e-05, + "loss": 1.0347, + "step": 3536 + }, + { + "epoch": 0.5536944270507201, + "grad_norm": 4.131521701812744, + "learning_rate": 7.534213098729229e-05, + "loss": 1.1067, + "step": 3537 + }, + { + "epoch": 0.5538509705698184, + "grad_norm": 3.818636894226074, + "learning_rate": 7.533398501140437e-05, + "loss": 1.0325, + "step": 3538 + }, + { + "epoch": 0.5540075140889167, + "grad_norm": 4.097397804260254, + "learning_rate": 7.532583903551645e-05, + "loss": 0.9582, + "step": 3539 + }, + { + "epoch": 0.554164057608015, + "grad_norm": 
7.718026161193848, + "learning_rate": 7.531769305962855e-05, + "loss": 1.0071, + "step": 3540 + }, + { + "epoch": 0.5543206011271133, + "grad_norm": 2.8737592697143555, + "learning_rate": 7.530954708374063e-05, + "loss": 1.2302, + "step": 3541 + }, + { + "epoch": 0.5544771446462117, + "grad_norm": 2.9192748069763184, + "learning_rate": 7.530140110785272e-05, + "loss": 1.5697, + "step": 3542 + }, + { + "epoch": 0.55463368816531, + "grad_norm": 7.6530890464782715, + "learning_rate": 7.529325513196482e-05, + "loss": 1.6973, + "step": 3543 + }, + { + "epoch": 0.5547902316844082, + "grad_norm": 3.5386157035827637, + "learning_rate": 7.52851091560769e-05, + "loss": 1.1301, + "step": 3544 + }, + { + "epoch": 0.5549467752035065, + "grad_norm": 2.6236512660980225, + "learning_rate": 7.527696318018898e-05, + "loss": 0.764, + "step": 3545 + }, + { + "epoch": 0.5551033187226049, + "grad_norm": 1.9622151851654053, + "learning_rate": 7.526881720430108e-05, + "loss": 0.6344, + "step": 3546 + }, + { + "epoch": 0.5552598622417032, + "grad_norm": 3.3782920837402344, + "learning_rate": 7.526067122841316e-05, + "loss": 0.93, + "step": 3547 + }, + { + "epoch": 0.5554164057608015, + "grad_norm": 2.6663033962249756, + "learning_rate": 7.525252525252525e-05, + "loss": 1.0315, + "step": 3548 + }, + { + "epoch": 0.5555729492798999, + "grad_norm": 3.595174789428711, + "learning_rate": 7.524437927663735e-05, + "loss": 0.9604, + "step": 3549 + }, + { + "epoch": 0.5557294927989981, + "grad_norm": 3.6181490421295166, + "learning_rate": 7.523623330074944e-05, + "loss": 1.623, + "step": 3550 + }, + { + "epoch": 0.5558860363180964, + "grad_norm": 0.4372076392173767, + "learning_rate": 7.522808732486151e-05, + "loss": 0.2579, + "step": 3551 + }, + { + "epoch": 0.5560425798371947, + "grad_norm": 0.6141397356987, + "learning_rate": 7.521994134897361e-05, + "loss": 0.2794, + "step": 3552 + }, + { + "epoch": 0.5561991233562931, + "grad_norm": 0.7352651357650757, + "learning_rate": 7.521179537308571e-05, 
+ "loss": 0.3043, + "step": 3553 + }, + { + "epoch": 0.5563556668753914, + "grad_norm": 0.8296943306922913, + "learning_rate": 7.520364939719778e-05, + "loss": 0.3124, + "step": 3554 + }, + { + "epoch": 0.5565122103944896, + "grad_norm": 0.8993338942527771, + "learning_rate": 7.519550342130988e-05, + "loss": 0.3531, + "step": 3555 + }, + { + "epoch": 0.5566687539135879, + "grad_norm": 0.7367944121360779, + "learning_rate": 7.518735744542197e-05, + "loss": 0.3958, + "step": 3556 + }, + { + "epoch": 0.5568252974326863, + "grad_norm": 0.5843424201011658, + "learning_rate": 7.517921146953406e-05, + "loss": 0.1773, + "step": 3557 + }, + { + "epoch": 0.5569818409517846, + "grad_norm": 0.5512811541557312, + "learning_rate": 7.517106549364614e-05, + "loss": 0.3372, + "step": 3558 + }, + { + "epoch": 0.5571383844708829, + "grad_norm": 1.275847315788269, + "learning_rate": 7.516291951775824e-05, + "loss": 0.5139, + "step": 3559 + }, + { + "epoch": 0.5572949279899813, + "grad_norm": 1.3824962377548218, + "learning_rate": 7.515477354187032e-05, + "loss": 0.4745, + "step": 3560 + }, + { + "epoch": 0.5574514715090795, + "grad_norm": 0.8446884155273438, + "learning_rate": 7.51466275659824e-05, + "loss": 0.2335, + "step": 3561 + }, + { + "epoch": 0.5576080150281778, + "grad_norm": 0.804766833782196, + "learning_rate": 7.51384815900945e-05, + "loss": 0.3361, + "step": 3562 + }, + { + "epoch": 0.5577645585472761, + "grad_norm": 1.6075713634490967, + "learning_rate": 7.513033561420659e-05, + "loss": 0.4453, + "step": 3563 + }, + { + "epoch": 0.5579211020663745, + "grad_norm": 1.2725428342819214, + "learning_rate": 7.512218963831867e-05, + "loss": 0.4352, + "step": 3564 + }, + { + "epoch": 0.5580776455854728, + "grad_norm": NaN, + "learning_rate": 7.512218963831867e-05, + "loss": 0.0, + "step": 3565 + }, + { + "epoch": 0.5582341891045711, + "grad_norm": 1.706713080406189, + "learning_rate": 7.511404366243077e-05, + "loss": 0.5731, + "step": 3566 + }, + { + "epoch": 0.5583907326236693, 
+ "grad_norm": 1.5209662914276123, + "learning_rate": 7.510589768654285e-05, + "loss": 0.5787, + "step": 3567 + }, + { + "epoch": 0.5585472761427677, + "grad_norm": 6.409161567687988, + "learning_rate": 7.509775171065493e-05, + "loss": 0.6881, + "step": 3568 + }, + { + "epoch": 0.558703819661866, + "grad_norm": 1.8482974767684937, + "learning_rate": 7.508960573476703e-05, + "loss": 0.6116, + "step": 3569 + }, + { + "epoch": 0.5588603631809643, + "grad_norm": 1.9168952703475952, + "learning_rate": 7.508145975887912e-05, + "loss": 0.7704, + "step": 3570 + }, + { + "epoch": 0.5590169067000627, + "grad_norm": 2.0184524059295654, + "learning_rate": 7.507331378299121e-05, + "loss": 0.8298, + "step": 3571 + }, + { + "epoch": 0.5591734502191609, + "grad_norm": 1.4530268907546997, + "learning_rate": 7.50651678071033e-05, + "loss": 0.4171, + "step": 3572 + }, + { + "epoch": 0.5593299937382592, + "grad_norm": 3.013077735900879, + "learning_rate": 7.505702183121538e-05, + "loss": 1.1413, + "step": 3573 + }, + { + "epoch": 0.5594865372573575, + "grad_norm": 2.2212564945220947, + "learning_rate": 7.504887585532748e-05, + "loss": 0.6532, + "step": 3574 + }, + { + "epoch": 0.5596430807764559, + "grad_norm": 2.9026901721954346, + "learning_rate": 7.504072987943956e-05, + "loss": 0.6007, + "step": 3575 + }, + { + "epoch": 0.5597996242955542, + "grad_norm": 3.231295585632324, + "learning_rate": 7.503258390355164e-05, + "loss": 1.0199, + "step": 3576 + }, + { + "epoch": 0.5599561678146525, + "grad_norm": 1.1754764318466187, + "learning_rate": 7.502443792766374e-05, + "loss": 0.331, + "step": 3577 + }, + { + "epoch": 0.5601127113337507, + "grad_norm": 1.7027287483215332, + "learning_rate": 7.501629195177583e-05, + "loss": 0.3533, + "step": 3578 + }, + { + "epoch": 0.5602692548528491, + "grad_norm": 2.655644178390503, + "learning_rate": 7.500814597588791e-05, + "loss": 0.9945, + "step": 3579 + }, + { + "epoch": 0.5604257983719474, + "grad_norm": 2.9612488746643066, + "learning_rate": 
7.500000000000001e-05, + "loss": 1.0106, + "step": 3580 + }, + { + "epoch": 0.5605823418910457, + "grad_norm": 2.8067266941070557, + "learning_rate": 7.499185402411209e-05, + "loss": 0.6968, + "step": 3581 + }, + { + "epoch": 0.560738885410144, + "grad_norm": 2.3383238315582275, + "learning_rate": 7.498370804822417e-05, + "loss": 0.7483, + "step": 3582 + }, + { + "epoch": 0.5608954289292424, + "grad_norm": 11.540834426879883, + "learning_rate": 7.497556207233627e-05, + "loss": 0.6733, + "step": 3583 + }, + { + "epoch": 0.5610519724483406, + "grad_norm": 2.7578766345977783, + "learning_rate": 7.496741609644836e-05, + "loss": 0.7637, + "step": 3584 + }, + { + "epoch": 0.5612085159674389, + "grad_norm": 3.1179420948028564, + "learning_rate": 7.495927012056044e-05, + "loss": 1.1975, + "step": 3585 + }, + { + "epoch": 0.5613650594865373, + "grad_norm": 2.5510690212249756, + "learning_rate": 7.495112414467254e-05, + "loss": 0.6942, + "step": 3586 + }, + { + "epoch": 0.5615216030056356, + "grad_norm": 2.9032490253448486, + "learning_rate": 7.494297816878463e-05, + "loss": 0.9909, + "step": 3587 + }, + { + "epoch": 0.5616781465247339, + "grad_norm": 3.9693830013275146, + "learning_rate": 7.49348321928967e-05, + "loss": 1.2482, + "step": 3588 + }, + { + "epoch": 0.5618346900438321, + "grad_norm": 5.173501014709473, + "learning_rate": 7.49266862170088e-05, + "loss": 1.9323, + "step": 3589 + }, + { + "epoch": 0.5619912335629305, + "grad_norm": 4.659420967102051, + "learning_rate": 7.49185402411209e-05, + "loss": 0.9416, + "step": 3590 + }, + { + "epoch": 0.5621477770820288, + "grad_norm": 3.6704423427581787, + "learning_rate": 7.491039426523297e-05, + "loss": 1.0536, + "step": 3591 + }, + { + "epoch": 0.5623043206011271, + "grad_norm": 3.904744863510132, + "learning_rate": 7.490224828934507e-05, + "loss": 1.2894, + "step": 3592 + }, + { + "epoch": 0.5624608641202254, + "grad_norm": 4.052025318145752, + "learning_rate": 7.489410231345716e-05, + "loss": 1.5196, + "step": 3593 + 
}, + { + "epoch": 0.5626174076393238, + "grad_norm": 4.492613315582275, + "learning_rate": 7.488595633756925e-05, + "loss": 1.6485, + "step": 3594 + }, + { + "epoch": 0.562773951158422, + "grad_norm": 2.6382546424865723, + "learning_rate": 7.487781036168133e-05, + "loss": 1.1464, + "step": 3595 + }, + { + "epoch": 0.5629304946775203, + "grad_norm": 2.7325379848480225, + "learning_rate": 7.486966438579343e-05, + "loss": 0.6997, + "step": 3596 + }, + { + "epoch": 0.5630870381966186, + "grad_norm": 1.986447811126709, + "learning_rate": 7.486151840990551e-05, + "loss": 0.6134, + "step": 3597 + }, + { + "epoch": 0.563243581715717, + "grad_norm": 3.9178359508514404, + "learning_rate": 7.48533724340176e-05, + "loss": 1.3396, + "step": 3598 + }, + { + "epoch": 0.5634001252348153, + "grad_norm": 2.2678539752960205, + "learning_rate": 7.484522645812969e-05, + "loss": 0.962, + "step": 3599 + }, + { + "epoch": 0.5635566687539136, + "grad_norm": 3.61490797996521, + "learning_rate": 7.483708048224178e-05, + "loss": 1.3083, + "step": 3600 + }, + { + "epoch": 0.5637132122730119, + "grad_norm": 0.5830388069152832, + "learning_rate": 7.482893450635386e-05, + "loss": 0.3146, + "step": 3601 + }, + { + "epoch": 0.5638697557921102, + "grad_norm": 0.4239983558654785, + "learning_rate": 7.482078853046596e-05, + "loss": 0.1861, + "step": 3602 + }, + { + "epoch": 0.5640262993112085, + "grad_norm": 0.7456860542297363, + "learning_rate": 7.481264255457804e-05, + "loss": 0.3444, + "step": 3603 + }, + { + "epoch": 0.5641828428303068, + "grad_norm": 1.0944236516952515, + "learning_rate": 7.480449657869013e-05, + "loss": 0.4843, + "step": 3604 + }, + { + "epoch": 0.5643393863494052, + "grad_norm": 0.8403604626655579, + "learning_rate": 7.479635060280222e-05, + "loss": 0.374, + "step": 3605 + }, + { + "epoch": 0.5644959298685035, + "grad_norm": 0.7831420302391052, + "learning_rate": 7.47882046269143e-05, + "loss": 0.3121, + "step": 3606 + }, + { + "epoch": 0.5646524733876017, + "grad_norm": 
0.9445400834083557, + "learning_rate": 7.478005865102639e-05, + "loss": 0.3883, + "step": 3607 + }, + { + "epoch": 0.5648090169067, + "grad_norm": 0.8931689858436584, + "learning_rate": 7.477191267513849e-05, + "loss": 0.275, + "step": 3608 + }, + { + "epoch": 0.5649655604257984, + "grad_norm": 1.4040522575378418, + "learning_rate": 7.476376669925057e-05, + "loss": 0.4909, + "step": 3609 + }, + { + "epoch": 0.5651221039448967, + "grad_norm": 0.8937142491340637, + "learning_rate": 7.475562072336267e-05, + "loss": 0.2087, + "step": 3610 + }, + { + "epoch": 0.565278647463995, + "grad_norm": 1.2367422580718994, + "learning_rate": 7.474747474747475e-05, + "loss": 0.5416, + "step": 3611 + }, + { + "epoch": 0.5654351909830932, + "grad_norm": 2.676766872406006, + "learning_rate": 7.473932877158684e-05, + "loss": 0.8305, + "step": 3612 + }, + { + "epoch": 0.5655917345021916, + "grad_norm": 0.8098173141479492, + "learning_rate": 7.473118279569893e-05, + "loss": 0.2461, + "step": 3613 + }, + { + "epoch": 0.5657482780212899, + "grad_norm": 1.007439374923706, + "learning_rate": 7.472303681981102e-05, + "loss": 0.3555, + "step": 3614 + }, + { + "epoch": 0.5659048215403882, + "grad_norm": 0.9236829876899719, + "learning_rate": 7.47148908439231e-05, + "loss": 0.3108, + "step": 3615 + }, + { + "epoch": 0.5660613650594866, + "grad_norm": 1.918010950088501, + "learning_rate": 7.47067448680352e-05, + "loss": 0.4002, + "step": 3616 + }, + { + "epoch": 0.5662179085785849, + "grad_norm": 3.876889228820801, + "learning_rate": 7.469859889214728e-05, + "loss": 0.7662, + "step": 3617 + }, + { + "epoch": 0.5663744520976831, + "grad_norm": 1.530329942703247, + "learning_rate": 7.469045291625937e-05, + "loss": 0.4308, + "step": 3618 + }, + { + "epoch": 0.5665309956167814, + "grad_norm": 4.2821269035339355, + "learning_rate": 7.468230694037146e-05, + "loss": 0.5821, + "step": 3619 + }, + { + "epoch": 0.5666875391358798, + "grad_norm": 1.9540798664093018, + "learning_rate": 7.467416096448355e-05, 
+ "loss": 0.5608, + "step": 3620 + }, + { + "epoch": 0.5668440826549781, + "grad_norm": 3.8115177154541016, + "learning_rate": 7.466601498859563e-05, + "loss": 0.9121, + "step": 3621 + }, + { + "epoch": 0.5670006261740764, + "grad_norm": 1.6652933359146118, + "learning_rate": 7.465786901270773e-05, + "loss": 0.7054, + "step": 3622 + }, + { + "epoch": 0.5671571696931748, + "grad_norm": 2.1581315994262695, + "learning_rate": 7.464972303681983e-05, + "loss": 0.5853, + "step": 3623 + }, + { + "epoch": 0.567313713212273, + "grad_norm": 2.727357864379883, + "learning_rate": 7.46415770609319e-05, + "loss": 0.5419, + "step": 3624 + }, + { + "epoch": 0.5674702567313713, + "grad_norm": 1.8535575866699219, + "learning_rate": 7.463343108504399e-05, + "loss": 0.4701, + "step": 3625 + }, + { + "epoch": 0.5676268002504696, + "grad_norm": 4.431906700134277, + "learning_rate": 7.462528510915609e-05, + "loss": 1.3929, + "step": 3626 + }, + { + "epoch": 0.567783343769568, + "grad_norm": 2.7567501068115234, + "learning_rate": 7.461713913326816e-05, + "loss": 0.7251, + "step": 3627 + }, + { + "epoch": 0.5679398872886663, + "grad_norm": 2.7592763900756836, + "learning_rate": 7.460899315738026e-05, + "loss": 0.9954, + "step": 3628 + }, + { + "epoch": 0.5680964308077645, + "grad_norm": 2.0408637523651123, + "learning_rate": 7.460084718149235e-05, + "loss": 0.8086, + "step": 3629 + }, + { + "epoch": 0.5682529743268628, + "grad_norm": 4.35562801361084, + "learning_rate": 7.459270120560444e-05, + "loss": 0.8577, + "step": 3630 + }, + { + "epoch": 0.5684095178459612, + "grad_norm": 2.1096384525299072, + "learning_rate": 7.458455522971652e-05, + "loss": 0.8309, + "step": 3631 + }, + { + "epoch": 0.5685660613650595, + "grad_norm": 3.5621864795684814, + "learning_rate": 7.457640925382862e-05, + "loss": 1.356, + "step": 3632 + }, + { + "epoch": 0.5687226048841578, + "grad_norm": 3.6981101036071777, + "learning_rate": 7.45682632779407e-05, + "loss": 1.2487, + "step": 3633 + }, + { + "epoch": 
0.5688791484032562, + "grad_norm": 2.163529396057129, + "learning_rate": 7.456011730205279e-05, + "loss": 0.9831, + "step": 3634 + }, + { + "epoch": 0.5690356919223544, + "grad_norm": 3.397315740585327, + "learning_rate": 7.455197132616488e-05, + "loss": 1.3071, + "step": 3635 + }, + { + "epoch": 0.5691922354414527, + "grad_norm": 5.767230987548828, + "learning_rate": 7.454382535027697e-05, + "loss": 1.0446, + "step": 3636 + }, + { + "epoch": 0.569348778960551, + "grad_norm": 2.5094258785247803, + "learning_rate": 7.453567937438905e-05, + "loss": 0.6914, + "step": 3637 + }, + { + "epoch": 0.5695053224796494, + "grad_norm": 2.672346830368042, + "learning_rate": 7.452753339850115e-05, + "loss": 1.2905, + "step": 3638 + }, + { + "epoch": 0.5696618659987477, + "grad_norm": 5.578522205352783, + "learning_rate": 7.451938742261323e-05, + "loss": 1.5002, + "step": 3639 + }, + { + "epoch": 0.569818409517846, + "grad_norm": 3.1099932193756104, + "learning_rate": 7.451124144672532e-05, + "loss": 1.973, + "step": 3640 + }, + { + "epoch": 0.5699749530369442, + "grad_norm": 2.8694941997528076, + "learning_rate": 7.450309547083741e-05, + "loss": 1.2303, + "step": 3641 + }, + { + "epoch": 0.5701314965560426, + "grad_norm": 2.569467782974243, + "learning_rate": 7.44949494949495e-05, + "loss": 1.1057, + "step": 3642 + }, + { + "epoch": 0.5702880400751409, + "grad_norm": 2.990097761154175, + "learning_rate": 7.448680351906158e-05, + "loss": 1.2979, + "step": 3643 + }, + { + "epoch": 0.5704445835942392, + "grad_norm": 1.8415426015853882, + "learning_rate": 7.447865754317368e-05, + "loss": 1.1546, + "step": 3644 + }, + { + "epoch": 0.5706011271133375, + "grad_norm": 3.9420058727264404, + "learning_rate": 7.447051156728576e-05, + "loss": 0.6834, + "step": 3645 + }, + { + "epoch": 0.5707576706324358, + "grad_norm": 5.3852219581604, + "learning_rate": 7.446236559139786e-05, + "loss": 1.3999, + "step": 3646 + }, + { + "epoch": 0.5709142141515341, + "grad_norm": 1.9494576454162598, + 
"learning_rate": 7.445421961550994e-05, + "loss": 0.6867, + "step": 3647 + }, + { + "epoch": 0.5710707576706324, + "grad_norm": 1.9683914184570312, + "learning_rate": 7.444607363962203e-05, + "loss": 0.4036, + "step": 3648 + }, + { + "epoch": 0.5712273011897308, + "grad_norm": 1.725531816482544, + "learning_rate": 7.443792766373412e-05, + "loss": 0.5782, + "step": 3649 + }, + { + "epoch": 0.5713838447088291, + "grad_norm": 2.527648448944092, + "learning_rate": 7.442978168784621e-05, + "loss": 1.1749, + "step": 3650 + }, + { + "epoch": 0.5715403882279274, + "grad_norm": 0.6183387041091919, + "learning_rate": 7.442163571195829e-05, + "loss": 0.2262, + "step": 3651 + }, + { + "epoch": 0.5716969317470256, + "grad_norm": 0.5857294797897339, + "learning_rate": 7.441348973607039e-05, + "loss": 0.3722, + "step": 3652 + }, + { + "epoch": 0.571853475266124, + "grad_norm": 0.795963704586029, + "learning_rate": 7.440534376018247e-05, + "loss": 0.327, + "step": 3653 + }, + { + "epoch": 0.5720100187852223, + "grad_norm": 0.6187400221824646, + "learning_rate": 7.439719778429456e-05, + "loss": 0.3346, + "step": 3654 + }, + { + "epoch": 0.5721665623043206, + "grad_norm": 1.9602134227752686, + "learning_rate": 7.438905180840665e-05, + "loss": 0.4209, + "step": 3655 + }, + { + "epoch": 0.5723231058234189, + "grad_norm": 0.77581387758255, + "learning_rate": 7.438090583251874e-05, + "loss": 0.3402, + "step": 3656 + }, + { + "epoch": 0.5724796493425173, + "grad_norm": 0.5660386085510254, + "learning_rate": 7.437275985663082e-05, + "loss": 0.2256, + "step": 3657 + }, + { + "epoch": 0.5726361928616155, + "grad_norm": 1.2211302518844604, + "learning_rate": 7.436461388074292e-05, + "loss": 0.3503, + "step": 3658 + }, + { + "epoch": 0.5727927363807138, + "grad_norm": 1.1478919982910156, + "learning_rate": 7.435646790485502e-05, + "loss": 0.5265, + "step": 3659 + }, + { + "epoch": 0.5729492798998121, + "grad_norm": 1.8452997207641602, + "learning_rate": 7.434832192896709e-05, + "loss": 
0.5732, + "step": 3660 + }, + { + "epoch": 0.5731058234189105, + "grad_norm": 0.8858850598335266, + "learning_rate": 7.434017595307918e-05, + "loss": 0.3012, + "step": 3661 + }, + { + "epoch": 0.5732623669380088, + "grad_norm": 1.9138617515563965, + "learning_rate": 7.433202997719128e-05, + "loss": 0.3232, + "step": 3662 + }, + { + "epoch": 0.573418910457107, + "grad_norm": 1.45062255859375, + "learning_rate": 7.432388400130335e-05, + "loss": 0.5001, + "step": 3663 + }, + { + "epoch": 0.5735754539762054, + "grad_norm": 2.495861768722534, + "learning_rate": 7.431573802541545e-05, + "loss": 0.6508, + "step": 3664 + }, + { + "epoch": 0.5737319974953037, + "grad_norm": 1.3986589908599854, + "learning_rate": 7.430759204952755e-05, + "loss": 0.394, + "step": 3665 + }, + { + "epoch": 0.573888541014402, + "grad_norm": 1.0443804264068604, + "learning_rate": 7.429944607363962e-05, + "loss": 0.474, + "step": 3666 + }, + { + "epoch": 0.5740450845335003, + "grad_norm": 1.4572116136550903, + "learning_rate": 7.429130009775171e-05, + "loss": 0.5049, + "step": 3667 + }, + { + "epoch": 0.5742016280525987, + "grad_norm": 1.7479212284088135, + "learning_rate": 7.428315412186381e-05, + "loss": 0.4918, + "step": 3668 + }, + { + "epoch": 0.5743581715716969, + "grad_norm": 1.565940022468567, + "learning_rate": 7.42750081459759e-05, + "loss": 0.5001, + "step": 3669 + }, + { + "epoch": 0.5745147150907952, + "grad_norm": 1.2792044878005981, + "learning_rate": 7.426686217008798e-05, + "loss": 0.3763, + "step": 3670 + }, + { + "epoch": 0.5746712586098935, + "grad_norm": 2.461097002029419, + "learning_rate": 7.425871619420008e-05, + "loss": 0.7338, + "step": 3671 + }, + { + "epoch": 0.5748278021289919, + "grad_norm": 1.5226945877075195, + "learning_rate": 7.425057021831216e-05, + "loss": 0.5801, + "step": 3672 + }, + { + "epoch": 0.5749843456480902, + "grad_norm": 1.6842652559280396, + "learning_rate": 7.424242424242424e-05, + "loss": 0.5485, + "step": 3673 + }, + { + "epoch": 
0.5751408891671885, + "grad_norm": 1.791096568107605, + "learning_rate": 7.423427826653634e-05, + "loss": 0.523, + "step": 3674 + }, + { + "epoch": 0.5752974326862867, + "grad_norm": 2.356050729751587, + "learning_rate": 7.422613229064842e-05, + "loss": 0.5266, + "step": 3675 + }, + { + "epoch": 0.5754539762053851, + "grad_norm": 1.8826202154159546, + "learning_rate": 7.421798631476051e-05, + "loss": 0.544, + "step": 3676 + }, + { + "epoch": 0.5756105197244834, + "grad_norm": 2.0575764179229736, + "learning_rate": 7.42098403388726e-05, + "loss": 0.9056, + "step": 3677 + }, + { + "epoch": 0.5757670632435817, + "grad_norm": 2.2602932453155518, + "learning_rate": 7.420169436298469e-05, + "loss": 0.9743, + "step": 3678 + }, + { + "epoch": 0.5759236067626801, + "grad_norm": 2.930988311767578, + "learning_rate": 7.419354838709677e-05, + "loss": 0.6091, + "step": 3679 + }, + { + "epoch": 0.5760801502817783, + "grad_norm": 3.9184303283691406, + "learning_rate": 7.418540241120887e-05, + "loss": 1.3262, + "step": 3680 + }, + { + "epoch": 0.5762366938008766, + "grad_norm": 2.7006380558013916, + "learning_rate": 7.417725643532095e-05, + "loss": 0.701, + "step": 3681 + }, + { + "epoch": 0.5763932373199749, + "grad_norm": 4.862171173095703, + "learning_rate": 7.416911045943305e-05, + "loss": 0.8858, + "step": 3682 + }, + { + "epoch": 0.5765497808390733, + "grad_norm": 3.206251621246338, + "learning_rate": 7.416096448354513e-05, + "loss": 0.8709, + "step": 3683 + }, + { + "epoch": 0.5767063243581716, + "grad_norm": 2.2023215293884277, + "learning_rate": 7.415281850765722e-05, + "loss": 0.7829, + "step": 3684 + }, + { + "epoch": 0.5768628678772699, + "grad_norm": 2.330810070037842, + "learning_rate": 7.414467253176932e-05, + "loss": 0.893, + "step": 3685 + }, + { + "epoch": 0.5770194113963681, + "grad_norm": 4.201470851898193, + "learning_rate": 7.41365265558814e-05, + "loss": 1.3191, + "step": 3686 + }, + { + "epoch": 0.5771759549154665, + "grad_norm": 2.188140392303467, + 
"learning_rate": 7.412838057999348e-05, + "loss": 0.9911, + "step": 3687 + }, + { + "epoch": 0.5773324984345648, + "grad_norm": 2.31640625, + "learning_rate": 7.412023460410558e-05, + "loss": 0.9286, + "step": 3688 + }, + { + "epoch": 0.5774890419536631, + "grad_norm": 2.348742961883545, + "learning_rate": 7.411208862821766e-05, + "loss": 1.2589, + "step": 3689 + }, + { + "epoch": 0.5776455854727615, + "grad_norm": 2.911344528198242, + "learning_rate": 7.410394265232975e-05, + "loss": 0.6726, + "step": 3690 + }, + { + "epoch": 0.5778021289918598, + "grad_norm": 4.505643844604492, + "learning_rate": 7.409579667644185e-05, + "loss": 1.0201, + "step": 3691 + }, + { + "epoch": 0.577958672510958, + "grad_norm": 3.594542980194092, + "learning_rate": 7.408765070055393e-05, + "loss": 1.3065, + "step": 3692 + }, + { + "epoch": 0.5781152160300563, + "grad_norm": 3.1790924072265625, + "learning_rate": 7.407950472466601e-05, + "loss": 1.4352, + "step": 3693 + }, + { + "epoch": 0.5782717595491547, + "grad_norm": 2.773710012435913, + "learning_rate": 7.407135874877811e-05, + "loss": 1.021, + "step": 3694 + }, + { + "epoch": 0.578428303068253, + "grad_norm": 5.494543075561523, + "learning_rate": 7.40632127728902e-05, + "loss": 1.2962, + "step": 3695 + }, + { + "epoch": 0.5785848465873513, + "grad_norm": 1.5307343006134033, + "learning_rate": 7.405506679700228e-05, + "loss": 0.3201, + "step": 3696 + }, + { + "epoch": 0.5787413901064495, + "grad_norm": 2.854384660720825, + "learning_rate": 7.404692082111437e-05, + "loss": 0.6485, + "step": 3697 + }, + { + "epoch": 0.5788979336255479, + "grad_norm": 2.5519518852233887, + "learning_rate": 7.403877484522647e-05, + "loss": 0.8492, + "step": 3698 + }, + { + "epoch": 0.5790544771446462, + "grad_norm": 6.553903579711914, + "learning_rate": 7.403062886933854e-05, + "loss": 0.8667, + "step": 3699 + }, + { + "epoch": 0.5792110206637445, + "grad_norm": 3.0271267890930176, + "learning_rate": 7.402248289345064e-05, + "loss": 0.9117, + "step": 
3700 + }, + { + "epoch": 0.5793675641828429, + "grad_norm": 0.7754807472229004, + "learning_rate": 7.401433691756274e-05, + "loss": 0.3839, + "step": 3701 + }, + { + "epoch": 0.5795241077019412, + "grad_norm": 0.8328859210014343, + "learning_rate": 7.400619094167481e-05, + "loss": 0.3342, + "step": 3702 + }, + { + "epoch": 0.5796806512210394, + "grad_norm": 0.792377769947052, + "learning_rate": 7.39980449657869e-05, + "loss": 0.4497, + "step": 3703 + }, + { + "epoch": 0.5798371947401377, + "grad_norm": 1.1860653162002563, + "learning_rate": 7.3989898989899e-05, + "loss": 0.4423, + "step": 3704 + }, + { + "epoch": 0.5799937382592361, + "grad_norm": 0.7874749898910522, + "learning_rate": 7.398175301401109e-05, + "loss": 0.4176, + "step": 3705 + }, + { + "epoch": 0.5801502817783344, + "grad_norm": 1.1145291328430176, + "learning_rate": 7.397360703812317e-05, + "loss": 0.3658, + "step": 3706 + }, + { + "epoch": 0.5803068252974327, + "grad_norm": 1.3092644214630127, + "learning_rate": 7.396546106223527e-05, + "loss": 0.3438, + "step": 3707 + }, + { + "epoch": 0.580463368816531, + "grad_norm": 0.7166635394096375, + "learning_rate": 7.395731508634735e-05, + "loss": 0.3069, + "step": 3708 + }, + { + "epoch": 0.5806199123356293, + "grad_norm": 1.1097662448883057, + "learning_rate": 7.394916911045943e-05, + "loss": 0.3143, + "step": 3709 + }, + { + "epoch": 0.5807764558547276, + "grad_norm": 1.6240640878677368, + "learning_rate": 7.394102313457153e-05, + "loss": 0.3354, + "step": 3710 + }, + { + "epoch": 0.5809329993738259, + "grad_norm": 1.0043699741363525, + "learning_rate": 7.393287715868362e-05, + "loss": 0.3955, + "step": 3711 + }, + { + "epoch": 0.5810895428929242, + "grad_norm": 1.3049424886703491, + "learning_rate": 7.39247311827957e-05, + "loss": 0.5149, + "step": 3712 + }, + { + "epoch": 0.5812460864120226, + "grad_norm": 1.1289176940917969, + "learning_rate": 7.39165852069078e-05, + "loss": 0.4117, + "step": 3713 + }, + { + "epoch": 0.5814026299311209, + 
"grad_norm": 1.535634994506836, + "learning_rate": 7.390843923101988e-05, + "loss": 0.6087, + "step": 3714 + }, + { + "epoch": 0.5815591734502191, + "grad_norm": 2.2801449298858643, + "learning_rate": 7.390029325513196e-05, + "loss": 0.6113, + "step": 3715 + }, + { + "epoch": 0.5817157169693175, + "grad_norm": 1.413068175315857, + "learning_rate": 7.389214727924406e-05, + "loss": 0.4283, + "step": 3716 + }, + { + "epoch": 0.5818722604884158, + "grad_norm": 0.9460288286209106, + "learning_rate": 7.388400130335614e-05, + "loss": 0.4132, + "step": 3717 + }, + { + "epoch": 0.5820288040075141, + "grad_norm": 3.039630651473999, + "learning_rate": 7.387585532746823e-05, + "loss": 0.6252, + "step": 3718 + }, + { + "epoch": 0.5821853475266124, + "grad_norm": 2.680607557296753, + "learning_rate": 7.386770935158033e-05, + "loss": 0.7963, + "step": 3719 + }, + { + "epoch": 0.5823418910457107, + "grad_norm": 2.1921327114105225, + "learning_rate": 7.385956337569241e-05, + "loss": 0.7134, + "step": 3720 + }, + { + "epoch": 0.582498434564809, + "grad_norm": 3.8779377937316895, + "learning_rate": 7.385141739980451e-05, + "loss": 0.5953, + "step": 3721 + }, + { + "epoch": 0.5826549780839073, + "grad_norm": 1.5372300148010254, + "learning_rate": 7.384327142391659e-05, + "loss": 0.5688, + "step": 3722 + }, + { + "epoch": 0.5828115216030056, + "grad_norm": 1.4408715963363647, + "learning_rate": 7.383512544802867e-05, + "loss": 0.4321, + "step": 3723 + }, + { + "epoch": 0.582968065122104, + "grad_norm": 1.706533432006836, + "learning_rate": 7.382697947214077e-05, + "loss": 0.8616, + "step": 3724 + }, + { + "epoch": 0.5831246086412023, + "grad_norm": 3.145225763320923, + "learning_rate": 7.381883349625286e-05, + "loss": 0.5583, + "step": 3725 + }, + { + "epoch": 0.5832811521603005, + "grad_norm": 2.8366212844848633, + "learning_rate": 7.381068752036494e-05, + "loss": 1.1559, + "step": 3726 + }, + { + "epoch": 0.5834376956793988, + "grad_norm": 1.8331729173660278, + "learning_rate": 
7.380254154447704e-05, + "loss": 0.5356, + "step": 3727 + }, + { + "epoch": 0.5835942391984972, + "grad_norm": 1.8301374912261963, + "learning_rate": 7.379439556858912e-05, + "loss": 0.4405, + "step": 3728 + }, + { + "epoch": 0.5837507827175955, + "grad_norm": 1.626231074333191, + "learning_rate": 7.37862495927012e-05, + "loss": 0.757, + "step": 3729 + }, + { + "epoch": 0.5839073262366938, + "grad_norm": 2.474825143814087, + "learning_rate": 7.37781036168133e-05, + "loss": 1.1547, + "step": 3730 + }, + { + "epoch": 0.5840638697557922, + "grad_norm": 8.329789161682129, + "learning_rate": 7.376995764092538e-05, + "loss": 0.907, + "step": 3731 + }, + { + "epoch": 0.5842204132748904, + "grad_norm": 2.8442914485931396, + "learning_rate": 7.376181166503747e-05, + "loss": 0.7043, + "step": 3732 + }, + { + "epoch": 0.5843769567939887, + "grad_norm": 2.9295618534088135, + "learning_rate": 7.375366568914957e-05, + "loss": 1.2216, + "step": 3733 + }, + { + "epoch": 0.584533500313087, + "grad_norm": 1.8419382572174072, + "learning_rate": 7.374551971326166e-05, + "loss": 0.8344, + "step": 3734 + }, + { + "epoch": 0.5846900438321854, + "grad_norm": 2.2870230674743652, + "learning_rate": 7.373737373737373e-05, + "loss": 1.1371, + "step": 3735 + }, + { + "epoch": 0.5848465873512837, + "grad_norm": 1.4934600591659546, + "learning_rate": 7.372922776148583e-05, + "loss": 0.769, + "step": 3736 + }, + { + "epoch": 0.5850031308703819, + "grad_norm": 3.9023361206054688, + "learning_rate": 7.372108178559793e-05, + "loss": 1.0298, + "step": 3737 + }, + { + "epoch": 0.5851596743894802, + "grad_norm": 2.41264271736145, + "learning_rate": 7.371293580971e-05, + "loss": 1.0825, + "step": 3738 + }, + { + "epoch": 0.5853162179085786, + "grad_norm": 1.7540068626403809, + "learning_rate": 7.37047898338221e-05, + "loss": 0.7813, + "step": 3739 + }, + { + "epoch": 0.5854727614276769, + "grad_norm": 5.407825946807861, + "learning_rate": 7.369664385793419e-05, + "loss": 1.6165, + "step": 3740 + }, + { 
+ "epoch": 0.5856293049467752, + "grad_norm": 3.1771748065948486, + "learning_rate": 7.368849788204628e-05, + "loss": 1.1483, + "step": 3741 + }, + { + "epoch": 0.5857858484658736, + "grad_norm": 3.6251702308654785, + "learning_rate": 7.368035190615836e-05, + "loss": 1.0436, + "step": 3742 + }, + { + "epoch": 0.5859423919849718, + "grad_norm": 1.922057867050171, + "learning_rate": 7.367220593027046e-05, + "loss": 1.6241, + "step": 3743 + }, + { + "epoch": 0.5860989355040701, + "grad_norm": 1.8115549087524414, + "learning_rate": 7.366405995438254e-05, + "loss": 0.6663, + "step": 3744 + }, + { + "epoch": 0.5862554790231684, + "grad_norm": 2.096381425857544, + "learning_rate": 7.365591397849463e-05, + "loss": 1.138, + "step": 3745 + }, + { + "epoch": 0.5864120225422668, + "grad_norm": 2.6718926429748535, + "learning_rate": 7.364776800260672e-05, + "loss": 1.5158, + "step": 3746 + }, + { + "epoch": 0.5865685660613651, + "grad_norm": 2.886110544204712, + "learning_rate": 7.36396220267188e-05, + "loss": 0.7384, + "step": 3747 + }, + { + "epoch": 0.5867251095804634, + "grad_norm": 7.184595108032227, + "learning_rate": 7.363147605083089e-05, + "loss": 2.1075, + "step": 3748 + }, + { + "epoch": 0.5868816530995616, + "grad_norm": 5.065158843994141, + "learning_rate": 7.362333007494299e-05, + "loss": 0.7369, + "step": 3749 + }, + { + "epoch": 0.58703819661866, + "grad_norm": 4.3516526222229, + "learning_rate": 7.361518409905507e-05, + "loss": 1.328, + "step": 3750 + }, + { + "epoch": 0.5871947401377583, + "grad_norm": 0.6307777762413025, + "learning_rate": 7.360703812316715e-05, + "loss": 0.3528, + "step": 3751 + }, + { + "epoch": 0.5873512836568566, + "grad_norm": 0.6727766990661621, + "learning_rate": 7.359889214727925e-05, + "loss": 0.2914, + "step": 3752 + }, + { + "epoch": 0.587507827175955, + "grad_norm": 1.929072380065918, + "learning_rate": 7.359074617139134e-05, + "loss": 1.0971, + "step": 3753 + }, + { + "epoch": 0.5876643706950532, + "grad_norm": 
0.8314942717552185, + "learning_rate": 7.358260019550342e-05, + "loss": 0.3025, + "step": 3754 + }, + { + "epoch": 0.5878209142141515, + "grad_norm": 0.7796949744224548, + "learning_rate": 7.357445421961552e-05, + "loss": 0.3559, + "step": 3755 + }, + { + "epoch": 0.5879774577332498, + "grad_norm": 0.8939725160598755, + "learning_rate": 7.35663082437276e-05, + "loss": 0.3864, + "step": 3756 + }, + { + "epoch": 0.5881340012523482, + "grad_norm": 0.700675904750824, + "learning_rate": 7.35581622678397e-05, + "loss": 0.3458, + "step": 3757 + }, + { + "epoch": 0.5882905447714465, + "grad_norm": 0.6457377672195435, + "learning_rate": 7.355001629195178e-05, + "loss": 0.3145, + "step": 3758 + }, + { + "epoch": 0.5884470882905448, + "grad_norm": 0.8445685505867004, + "learning_rate": 7.354187031606387e-05, + "loss": 0.3897, + "step": 3759 + }, + { + "epoch": 0.588603631809643, + "grad_norm": 0.8875198364257812, + "learning_rate": 7.353372434017596e-05, + "loss": 0.4935, + "step": 3760 + }, + { + "epoch": 0.5887601753287414, + "grad_norm": 1.1752440929412842, + "learning_rate": 7.352557836428805e-05, + "loss": 0.3705, + "step": 3761 + }, + { + "epoch": 0.5889167188478397, + "grad_norm": 0.7165386080741882, + "learning_rate": 7.351743238840013e-05, + "loss": 0.2813, + "step": 3762 + }, + { + "epoch": 0.589073262366938, + "grad_norm": 1.0987448692321777, + "learning_rate": 7.350928641251223e-05, + "loss": 0.2646, + "step": 3763 + }, + { + "epoch": 0.5892298058860364, + "grad_norm": 0.9813166260719299, + "learning_rate": 7.350114043662431e-05, + "loss": 0.3499, + "step": 3764 + }, + { + "epoch": 0.5893863494051347, + "grad_norm": 0.9269145131111145, + "learning_rate": 7.34929944607364e-05, + "loss": 0.34, + "step": 3765 + }, + { + "epoch": 0.5895428929242329, + "grad_norm": 1.295009732246399, + "learning_rate": 7.348484848484849e-05, + "loss": 0.5524, + "step": 3766 + }, + { + "epoch": 0.5896994364433312, + "grad_norm": 1.5106849670410156, + "learning_rate": 
7.347670250896058e-05, + "loss": 0.3658, + "step": 3767 + }, + { + "epoch": 0.5898559799624296, + "grad_norm": 2.6317965984344482, + "learning_rate": 7.346855653307266e-05, + "loss": 0.6732, + "step": 3768 + }, + { + "epoch": 0.5900125234815279, + "grad_norm": 1.0115700960159302, + "learning_rate": 7.346041055718476e-05, + "loss": 0.2781, + "step": 3769 + }, + { + "epoch": 0.5901690670006262, + "grad_norm": 1.9729338884353638, + "learning_rate": 7.345226458129685e-05, + "loss": 0.5568, + "step": 3770 + }, + { + "epoch": 0.5903256105197244, + "grad_norm": 2.6406760215759277, + "learning_rate": 7.344411860540892e-05, + "loss": 0.5766, + "step": 3771 + }, + { + "epoch": 0.5904821540388228, + "grad_norm": 5.654886722564697, + "learning_rate": 7.343597262952102e-05, + "loss": 1.2707, + "step": 3772 + }, + { + "epoch": 0.5906386975579211, + "grad_norm": 2.0942139625549316, + "learning_rate": 7.342782665363312e-05, + "loss": 0.6073, + "step": 3773 + }, + { + "epoch": 0.5907952410770194, + "grad_norm": 4.011472225189209, + "learning_rate": 7.341968067774519e-05, + "loss": 0.788, + "step": 3774 + }, + { + "epoch": 0.5909517845961177, + "grad_norm": 1.5860822200775146, + "learning_rate": 7.341153470185729e-05, + "loss": 0.811, + "step": 3775 + }, + { + "epoch": 0.5911083281152161, + "grad_norm": 1.82094144821167, + "learning_rate": 7.340338872596938e-05, + "loss": 0.5411, + "step": 3776 + }, + { + "epoch": 0.5912648716343143, + "grad_norm": 1.7557754516601562, + "learning_rate": 7.339524275008145e-05, + "loss": 0.4254, + "step": 3777 + }, + { + "epoch": 0.5914214151534126, + "grad_norm": 3.759474277496338, + "learning_rate": 7.338709677419355e-05, + "loss": 0.9639, + "step": 3778 + }, + { + "epoch": 0.591577958672511, + "grad_norm": 2.2192845344543457, + "learning_rate": 7.337895079830565e-05, + "loss": 0.6307, + "step": 3779 + }, + { + "epoch": 0.5917345021916093, + "grad_norm": 2.9457969665527344, + "learning_rate": 7.337080482241773e-05, + "loss": 0.8704, + "step": 3780 + 
}, + { + "epoch": 0.5918910457107076, + "grad_norm": 2.3247287273406982, + "learning_rate": 7.336265884652982e-05, + "loss": 0.6367, + "step": 3781 + }, + { + "epoch": 0.5920475892298059, + "grad_norm": 4.398600101470947, + "learning_rate": 7.335451287064191e-05, + "loss": 0.6821, + "step": 3782 + }, + { + "epoch": 0.5922041327489042, + "grad_norm": 3.948310375213623, + "learning_rate": 7.3346366894754e-05, + "loss": 0.8518, + "step": 3783 + }, + { + "epoch": 0.5923606762680025, + "grad_norm": 2.9993910789489746, + "learning_rate": 7.333822091886608e-05, + "loss": 1.271, + "step": 3784 + }, + { + "epoch": 0.5925172197871008, + "grad_norm": 3.846550464630127, + "learning_rate": 7.333007494297818e-05, + "loss": 1.0764, + "step": 3785 + }, + { + "epoch": 0.5926737633061991, + "grad_norm": 2.488731622695923, + "learning_rate": 7.332192896709026e-05, + "loss": 0.999, + "step": 3786 + }, + { + "epoch": 0.5928303068252975, + "grad_norm": 5.66962194442749, + "learning_rate": 7.331378299120235e-05, + "loss": 0.6266, + "step": 3787 + }, + { + "epoch": 0.5929868503443957, + "grad_norm": 2.6650495529174805, + "learning_rate": 7.330563701531444e-05, + "loss": 1.2392, + "step": 3788 + }, + { + "epoch": 0.593143393863494, + "grad_norm": 2.693988800048828, + "learning_rate": 7.329749103942653e-05, + "loss": 0.9925, + "step": 3789 + }, + { + "epoch": 0.5932999373825923, + "grad_norm": 2.678619146347046, + "learning_rate": 7.328934506353861e-05, + "loss": 1.6207, + "step": 3790 + }, + { + "epoch": 0.5934564809016907, + "grad_norm": 3.424544095993042, + "learning_rate": 7.328119908765071e-05, + "loss": 1.678, + "step": 3791 + }, + { + "epoch": 0.593613024420789, + "grad_norm": 2.930544376373291, + "learning_rate": 7.327305311176279e-05, + "loss": 1.27, + "step": 3792 + }, + { + "epoch": 0.5937695679398873, + "grad_norm": 2.460292339324951, + "learning_rate": 7.326490713587489e-05, + "loss": 1.1153, + "step": 3793 + }, + { + "epoch": 0.5939261114589856, + "grad_norm": 
4.187644004821777, + "learning_rate": 7.325676115998697e-05, + "loss": 1.7627, + "step": 3794 + }, + { + "epoch": 0.5940826549780839, + "grad_norm": 2.826730728149414, + "learning_rate": 7.324861518409906e-05, + "loss": 1.1942, + "step": 3795 + }, + { + "epoch": 0.5942391984971822, + "grad_norm": 3.993208408355713, + "learning_rate": 7.324046920821115e-05, + "loss": 0.7551, + "step": 3796 + }, + { + "epoch": 0.5943957420162805, + "grad_norm": 2.694694757461548, + "learning_rate": 7.323232323232324e-05, + "loss": 0.838, + "step": 3797 + }, + { + "epoch": 0.5945522855353789, + "grad_norm": 1.968223214149475, + "learning_rate": 7.322417725643532e-05, + "loss": 0.6867, + "step": 3798 + }, + { + "epoch": 0.5947088290544772, + "grad_norm": 3.0320818424224854, + "learning_rate": 7.321603128054742e-05, + "loss": 0.8856, + "step": 3799 + }, + { + "epoch": 0.5948653725735754, + "grad_norm": 3.479949474334717, + "learning_rate": 7.32078853046595e-05, + "loss": 1.2881, + "step": 3800 + }, + { + "epoch": 0.5950219160926737, + "grad_norm": 0.5971167087554932, + "learning_rate": 7.319973932877159e-05, + "loss": 0.3642, + "step": 3801 + }, + { + "epoch": 0.5951784596117721, + "grad_norm": 1.2218130826950073, + "learning_rate": 7.319159335288368e-05, + "loss": 0.3852, + "step": 3802 + }, + { + "epoch": 0.5953350031308704, + "grad_norm": 0.6445058584213257, + "learning_rate": 7.318344737699577e-05, + "loss": 0.3078, + "step": 3803 + }, + { + "epoch": 0.5954915466499687, + "grad_norm": 0.9557567238807678, + "learning_rate": 7.317530140110785e-05, + "loss": 0.2943, + "step": 3804 + }, + { + "epoch": 0.595648090169067, + "grad_norm": 0.640195906162262, + "learning_rate": 7.316715542521995e-05, + "loss": 0.2764, + "step": 3805 + }, + { + "epoch": 0.5958046336881653, + "grad_norm": 0.9253541827201843, + "learning_rate": 7.315900944933203e-05, + "loss": 0.2507, + "step": 3806 + }, + { + "epoch": 0.5959611772072636, + "grad_norm": 0.6508774161338806, + "learning_rate": 
7.315086347344412e-05, + "loss": 0.2857, + "step": 3807 + }, + { + "epoch": 0.5961177207263619, + "grad_norm": 0.8666054010391235, + "learning_rate": 7.314271749755621e-05, + "loss": 0.4387, + "step": 3808 + }, + { + "epoch": 0.5962742642454603, + "grad_norm": 1.3025624752044678, + "learning_rate": 7.313457152166831e-05, + "loss": 0.3811, + "step": 3809 + }, + { + "epoch": 0.5964308077645586, + "grad_norm": 1.4658259153366089, + "learning_rate": 7.312642554578038e-05, + "loss": 0.6815, + "step": 3810 + }, + { + "epoch": 0.5965873512836568, + "grad_norm": 1.3252323865890503, + "learning_rate": 7.311827956989248e-05, + "loss": 0.4256, + "step": 3811 + }, + { + "epoch": 0.5967438948027551, + "grad_norm": 1.5327820777893066, + "learning_rate": 7.311013359400458e-05, + "loss": 0.3533, + "step": 3812 + }, + { + "epoch": 0.5969004383218535, + "grad_norm": 1.6847938299179077, + "learning_rate": 7.310198761811665e-05, + "loss": 0.5409, + "step": 3813 + }, + { + "epoch": 0.5970569818409518, + "grad_norm": 0.9728394746780396, + "learning_rate": 7.309384164222874e-05, + "loss": 0.3532, + "step": 3814 + }, + { + "epoch": 0.5972135253600501, + "grad_norm": 0.9910975694656372, + "learning_rate": 7.308569566634084e-05, + "loss": 0.2938, + "step": 3815 + }, + { + "epoch": 0.5973700688791485, + "grad_norm": 1.6390091180801392, + "learning_rate": 7.307754969045292e-05, + "loss": 0.2767, + "step": 3816 + }, + { + "epoch": 0.5975266123982467, + "grad_norm": 1.566412329673767, + "learning_rate": 7.306940371456501e-05, + "loss": 0.3619, + "step": 3817 + }, + { + "epoch": 0.597683155917345, + "grad_norm": 2.9122586250305176, + "learning_rate": 7.30612577386771e-05, + "loss": 0.965, + "step": 3818 + }, + { + "epoch": 0.5978396994364433, + "grad_norm": 2.0237343311309814, + "learning_rate": 7.305311176278919e-05, + "loss": 0.521, + "step": 3819 + }, + { + "epoch": 0.5979962429555417, + "grad_norm": 2.18483829498291, + "learning_rate": 7.304496578690127e-05, + "loss": 0.3355, + "step": 3820 
+ }, + { + "epoch": 0.59815278647464, + "grad_norm": 2.2950239181518555, + "learning_rate": 7.303681981101337e-05, + "loss": 0.6267, + "step": 3821 + }, + { + "epoch": 0.5983093299937383, + "grad_norm": 3.3221890926361084, + "learning_rate": 7.302867383512545e-05, + "loss": 0.632, + "step": 3822 + }, + { + "epoch": 0.5984658735128365, + "grad_norm": 2.3891899585723877, + "learning_rate": 7.302052785923754e-05, + "loss": 0.5944, + "step": 3823 + }, + { + "epoch": 0.5986224170319349, + "grad_norm": 3.633146047592163, + "learning_rate": 7.301238188334963e-05, + "loss": 0.9833, + "step": 3824 + }, + { + "epoch": 0.5987789605510332, + "grad_norm": 2.805389642715454, + "learning_rate": 7.300423590746172e-05, + "loss": 0.6967, + "step": 3825 + }, + { + "epoch": 0.5989355040701315, + "grad_norm": 3.0840983390808105, + "learning_rate": 7.29960899315738e-05, + "loss": 0.8624, + "step": 3826 + }, + { + "epoch": 0.5990920475892298, + "grad_norm": 1.5626389980316162, + "learning_rate": 7.29879439556859e-05, + "loss": 0.6054, + "step": 3827 + }, + { + "epoch": 0.5992485911083281, + "grad_norm": 2.547498941421509, + "learning_rate": 7.297979797979798e-05, + "loss": 0.9879, + "step": 3828 + }, + { + "epoch": 0.5994051346274264, + "grad_norm": 2.1917872428894043, + "learning_rate": 7.297165200391008e-05, + "loss": 0.5645, + "step": 3829 + }, + { + "epoch": 0.5995616781465247, + "grad_norm": 2.246025323867798, + "learning_rate": 7.296350602802216e-05, + "loss": 0.9298, + "step": 3830 + }, + { + "epoch": 0.599718221665623, + "grad_norm": 2.922922134399414, + "learning_rate": 7.295536005213425e-05, + "loss": 0.7321, + "step": 3831 + }, + { + "epoch": 0.5998747651847214, + "grad_norm": 3.4959418773651123, + "learning_rate": 7.294721407624635e-05, + "loss": 0.7162, + "step": 3832 + }, + { + "epoch": 0.6000313087038197, + "grad_norm": 2.9085447788238525, + "learning_rate": 7.293906810035843e-05, + "loss": 0.926, + "step": 3833 + }, + { + "epoch": 0.6001878522229179, + "grad_norm": 
2.012540102005005, + "learning_rate": 7.293092212447051e-05, + "loss": 0.9039, + "step": 3834 + }, + { + "epoch": 0.6003443957420163, + "grad_norm": 3.8288769721984863, + "learning_rate": 7.292277614858261e-05, + "loss": 1.1455, + "step": 3835 + }, + { + "epoch": 0.6005009392611146, + "grad_norm": 2.6628003120422363, + "learning_rate": 7.291463017269468e-05, + "loss": 1.3019, + "step": 3836 + }, + { + "epoch": 0.6006574827802129, + "grad_norm": 1.602429986000061, + "learning_rate": 7.290648419680678e-05, + "loss": 0.3528, + "step": 3837 + }, + { + "epoch": 0.6008140262993112, + "grad_norm": 5.344915866851807, + "learning_rate": 7.289833822091887e-05, + "loss": 1.355, + "step": 3838 + }, + { + "epoch": 0.6009705698184096, + "grad_norm": 2.944849729537964, + "learning_rate": 7.289019224503096e-05, + "loss": 1.1046, + "step": 3839 + }, + { + "epoch": 0.6011271133375078, + "grad_norm": 2.6216044425964355, + "learning_rate": 7.288204626914304e-05, + "loss": 1.0132, + "step": 3840 + }, + { + "epoch": 0.6012836568566061, + "grad_norm": 2.4501755237579346, + "learning_rate": 7.287390029325514e-05, + "loss": 1.0492, + "step": 3841 + }, + { + "epoch": 0.6014402003757044, + "grad_norm": 3.4421002864837646, + "learning_rate": 7.286575431736722e-05, + "loss": 1.5135, + "step": 3842 + }, + { + "epoch": 0.6015967438948028, + "grad_norm": 5.939892292022705, + "learning_rate": 7.285760834147931e-05, + "loss": 1.8278, + "step": 3843 + }, + { + "epoch": 0.6017532874139011, + "grad_norm": 3.2068049907684326, + "learning_rate": 7.28494623655914e-05, + "loss": 1.1201, + "step": 3844 + }, + { + "epoch": 0.6019098309329993, + "grad_norm": 1.9640082120895386, + "learning_rate": 7.284131638970349e-05, + "loss": 0.783, + "step": 3845 + }, + { + "epoch": 0.6020663744520977, + "grad_norm": 2.2944910526275635, + "learning_rate": 7.283317041381557e-05, + "loss": 1.5391, + "step": 3846 + }, + { + "epoch": 0.602222917971196, + "grad_norm": 2.9001803398132324, + "learning_rate": 
7.282502443792767e-05, + "loss": 0.8408, + "step": 3847 + }, + { + "epoch": 0.6023794614902943, + "grad_norm": 3.0269274711608887, + "learning_rate": 7.281687846203975e-05, + "loss": 0.8737, + "step": 3848 + }, + { + "epoch": 0.6025360050093926, + "grad_norm": 2.9694225788116455, + "learning_rate": 7.280873248615184e-05, + "loss": 0.841, + "step": 3849 + }, + { + "epoch": 0.602692548528491, + "grad_norm": 2.425008773803711, + "learning_rate": 7.280058651026393e-05, + "loss": 0.7161, + "step": 3850 + }, + { + "epoch": 0.6028490920475892, + "grad_norm": 0.7799201011657715, + "learning_rate": 7.279244053437602e-05, + "loss": 0.3521, + "step": 3851 + }, + { + "epoch": 0.6030056355666875, + "grad_norm": 0.46504324674606323, + "learning_rate": 7.278429455848811e-05, + "loss": 0.264, + "step": 3852 + }, + { + "epoch": 0.6031621790857858, + "grad_norm": 0.5859881043434143, + "learning_rate": 7.27761485826002e-05, + "loss": 0.3345, + "step": 3853 + }, + { + "epoch": 0.6033187226048842, + "grad_norm": 0.5559767484664917, + "learning_rate": 7.276800260671228e-05, + "loss": 0.2175, + "step": 3854 + }, + { + "epoch": 0.6034752661239825, + "grad_norm": 0.5144526362419128, + "learning_rate": 7.275985663082438e-05, + "loss": 0.1511, + "step": 3855 + }, + { + "epoch": 0.6036318096430808, + "grad_norm": 0.6575112342834473, + "learning_rate": 7.275171065493646e-05, + "loss": 0.3242, + "step": 3856 + }, + { + "epoch": 0.603788353162179, + "grad_norm": 0.7204045653343201, + "learning_rate": 7.274356467904855e-05, + "loss": 0.2735, + "step": 3857 + }, + { + "epoch": 0.6039448966812774, + "grad_norm": 1.6723484992980957, + "learning_rate": 7.273541870316064e-05, + "loss": 0.3105, + "step": 3858 + }, + { + "epoch": 0.6041014402003757, + "grad_norm": 1.013570785522461, + "learning_rate": 7.272727272727273e-05, + "loss": 0.2942, + "step": 3859 + }, + { + "epoch": 0.604257983719474, + "grad_norm": 1.001150131225586, + "learning_rate": 7.271912675138481e-05, + "loss": 0.3095, + "step": 3860 + 
}, + { + "epoch": 0.6044145272385724, + "grad_norm": 2.162501335144043, + "learning_rate": 7.271098077549691e-05, + "loss": 0.533, + "step": 3861 + }, + { + "epoch": 0.6045710707576706, + "grad_norm": 0.8694136738777161, + "learning_rate": 7.270283479960899e-05, + "loss": 0.3573, + "step": 3862 + }, + { + "epoch": 0.6047276142767689, + "grad_norm": 0.9457181096076965, + "learning_rate": 7.269468882372108e-05, + "loss": 0.2683, + "step": 3863 + }, + { + "epoch": 0.6048841577958672, + "grad_norm": 1.939742922782898, + "learning_rate": 7.268654284783317e-05, + "loss": 0.3599, + "step": 3864 + }, + { + "epoch": 0.6050407013149656, + "grad_norm": 1.0055243968963623, + "learning_rate": 7.267839687194526e-05, + "loss": 0.5039, + "step": 3865 + }, + { + "epoch": 0.6051972448340639, + "grad_norm": 1.8003647327423096, + "learning_rate": 7.267025089605734e-05, + "loss": 0.6682, + "step": 3866 + }, + { + "epoch": 0.6053537883531622, + "grad_norm": 2.456277370452881, + "learning_rate": 7.266210492016944e-05, + "loss": 0.5707, + "step": 3867 + }, + { + "epoch": 0.6055103318722604, + "grad_norm": 2.795834541320801, + "learning_rate": 7.265395894428154e-05, + "loss": 0.6896, + "step": 3868 + }, + { + "epoch": 0.6056668753913588, + "grad_norm": 1.7433967590332031, + "learning_rate": 7.26458129683936e-05, + "loss": 0.6254, + "step": 3869 + }, + { + "epoch": 0.6058234189104571, + "grad_norm": 1.2204716205596924, + "learning_rate": 7.26376669925057e-05, + "loss": 0.4233, + "step": 3870 + }, + { + "epoch": 0.6059799624295554, + "grad_norm": 2.0312552452087402, + "learning_rate": 7.26295210166178e-05, + "loss": 0.5248, + "step": 3871 + }, + { + "epoch": 0.6061365059486538, + "grad_norm": 1.5604084730148315, + "learning_rate": 7.262137504072987e-05, + "loss": 0.5223, + "step": 3872 + }, + { + "epoch": 0.6062930494677521, + "grad_norm": 1.5616984367370605, + "learning_rate": 7.261322906484197e-05, + "loss": 0.5621, + "step": 3873 + }, + { + "epoch": 0.6064495929868503, + "grad_norm": 
1.4964371919631958, + "learning_rate": 7.260508308895407e-05, + "loss": 0.7224, + "step": 3874 + }, + { + "epoch": 0.6066061365059486, + "grad_norm": 2.5376038551330566, + "learning_rate": 7.259693711306615e-05, + "loss": 0.7499, + "step": 3875 + }, + { + "epoch": 0.606762680025047, + "grad_norm": 2.2310400009155273, + "learning_rate": 7.258879113717823e-05, + "loss": 0.5471, + "step": 3876 + }, + { + "epoch": 0.6069192235441453, + "grad_norm": 1.5747265815734863, + "learning_rate": 7.258064516129033e-05, + "loss": 0.9133, + "step": 3877 + }, + { + "epoch": 0.6070757670632436, + "grad_norm": 3.2209110260009766, + "learning_rate": 7.257249918540241e-05, + "loss": 0.8987, + "step": 3878 + }, + { + "epoch": 0.6072323105823418, + "grad_norm": 1.972409725189209, + "learning_rate": 7.25643532095145e-05, + "loss": 0.5897, + "step": 3879 + }, + { + "epoch": 0.6073888541014402, + "grad_norm": 3.8592703342437744, + "learning_rate": 7.25562072336266e-05, + "loss": 0.9098, + "step": 3880 + }, + { + "epoch": 0.6075453976205385, + "grad_norm": 1.0783019065856934, + "learning_rate": 7.254806125773868e-05, + "loss": 0.2965, + "step": 3881 + }, + { + "epoch": 0.6077019411396368, + "grad_norm": 2.2084429264068604, + "learning_rate": 7.253991528185076e-05, + "loss": 0.2859, + "step": 3882 + }, + { + "epoch": 0.6078584846587352, + "grad_norm": 2.998116970062256, + "learning_rate": 7.253176930596286e-05, + "loss": 0.6557, + "step": 3883 + }, + { + "epoch": 0.6080150281778335, + "grad_norm": 1.934214472770691, + "learning_rate": 7.252362333007494e-05, + "loss": 1.083, + "step": 3884 + }, + { + "epoch": 0.6081715716969317, + "grad_norm": 2.1544973850250244, + "learning_rate": 7.251547735418703e-05, + "loss": 0.6685, + "step": 3885 + }, + { + "epoch": 0.60832811521603, + "grad_norm": 3.413381576538086, + "learning_rate": 7.250733137829913e-05, + "loss": 1.4158, + "step": 3886 + }, + { + "epoch": 0.6084846587351284, + "grad_norm": 2.272794246673584, + "learning_rate": 
7.249918540241121e-05, + "loss": 0.5702, + "step": 3887 + }, + { + "epoch": 0.6086412022542267, + "grad_norm": 2.88045597076416, + "learning_rate": 7.24910394265233e-05, + "loss": 1.342, + "step": 3888 + }, + { + "epoch": 0.608797745773325, + "grad_norm": 4.6706743240356445, + "learning_rate": 7.248289345063539e-05, + "loss": 1.0839, + "step": 3889 + }, + { + "epoch": 0.6089542892924233, + "grad_norm": 5.675500392913818, + "learning_rate": 7.247474747474747e-05, + "loss": 0.8073, + "step": 3890 + }, + { + "epoch": 0.6091108328115216, + "grad_norm": 3.1954867839813232, + "learning_rate": 7.246660149885957e-05, + "loss": 1.3865, + "step": 3891 + }, + { + "epoch": 0.6092673763306199, + "grad_norm": 2.5591351985931396, + "learning_rate": 7.245845552297165e-05, + "loss": 1.1134, + "step": 3892 + }, + { + "epoch": 0.6094239198497182, + "grad_norm": 3.7819011211395264, + "learning_rate": 7.245030954708374e-05, + "loss": 1.2989, + "step": 3893 + }, + { + "epoch": 0.6095804633688165, + "grad_norm": 3.5383028984069824, + "learning_rate": 7.244216357119584e-05, + "loss": 1.2646, + "step": 3894 + }, + { + "epoch": 0.6097370068879149, + "grad_norm": 2.8460702896118164, + "learning_rate": 7.243401759530792e-05, + "loss": 0.9721, + "step": 3895 + }, + { + "epoch": 0.6098935504070131, + "grad_norm": 3.8615150451660156, + "learning_rate": 7.242587161942e-05, + "loss": 0.6116, + "step": 3896 + }, + { + "epoch": 0.6100500939261114, + "grad_norm": 8.560420036315918, + "learning_rate": 7.24177256435321e-05, + "loss": 1.3273, + "step": 3897 + }, + { + "epoch": 0.6102066374452098, + "grad_norm": 1.900999665260315, + "learning_rate": 7.240957966764418e-05, + "loss": 0.6952, + "step": 3898 + }, + { + "epoch": 0.6103631809643081, + "grad_norm": 2.7844791412353516, + "learning_rate": 7.240143369175627e-05, + "loss": 0.7541, + "step": 3899 + }, + { + "epoch": 0.6105197244834064, + "grad_norm": 2.5076091289520264, + "learning_rate": 7.239328771586837e-05, + "loss": 0.9422, + "step": 3900 + }, 
+ { + "epoch": 0.6106762680025047, + "grad_norm": 0.4418751299381256, + "learning_rate": 7.238514173998045e-05, + "loss": 0.3151, + "step": 3901 + }, + { + "epoch": 0.610832811521603, + "grad_norm": 0.4853661060333252, + "learning_rate": 7.237699576409253e-05, + "loss": 0.2032, + "step": 3902 + }, + { + "epoch": 0.6109893550407013, + "grad_norm": 1.024626612663269, + "learning_rate": 7.236884978820463e-05, + "loss": 0.3152, + "step": 3903 + }, + { + "epoch": 0.6111458985597996, + "grad_norm": 1.2331557273864746, + "learning_rate": 7.236070381231673e-05, + "loss": 0.4215, + "step": 3904 + }, + { + "epoch": 0.611302442078898, + "grad_norm": 0.689155101776123, + "learning_rate": 7.23525578364288e-05, + "loss": 0.2578, + "step": 3905 + }, + { + "epoch": 0.6114589855979963, + "grad_norm": 0.828482449054718, + "learning_rate": 7.23444118605409e-05, + "loss": 0.32, + "step": 3906 + }, + { + "epoch": 0.6116155291170946, + "grad_norm": 0.9290992617607117, + "learning_rate": 7.233626588465299e-05, + "loss": 0.3619, + "step": 3907 + }, + { + "epoch": 0.6117720726361928, + "grad_norm": 1.4195841550827026, + "learning_rate": 7.232811990876506e-05, + "loss": 0.5307, + "step": 3908 + }, + { + "epoch": 0.6119286161552911, + "grad_norm": 1.0552152395248413, + "learning_rate": 7.231997393287716e-05, + "loss": 0.4645, + "step": 3909 + }, + { + "epoch": 0.6120851596743895, + "grad_norm": 0.8926309943199158, + "learning_rate": 7.231182795698926e-05, + "loss": 0.2912, + "step": 3910 + }, + { + "epoch": 0.6122417031934878, + "grad_norm": 1.2415571212768555, + "learning_rate": 7.230368198110134e-05, + "loss": 0.2446, + "step": 3911 + }, + { + "epoch": 0.6123982467125861, + "grad_norm": 1.4370903968811035, + "learning_rate": 7.229553600521342e-05, + "loss": 0.3948, + "step": 3912 + }, + { + "epoch": 0.6125547902316844, + "grad_norm": 1.073499321937561, + "learning_rate": 7.228739002932552e-05, + "loss": 0.4317, + "step": 3913 + }, + { + "epoch": 0.6127113337507827, + "grad_norm": 
1.7266350984573364, + "learning_rate": 7.22792440534376e-05, + "loss": 0.4841, + "step": 3914 + }, + { + "epoch": 0.612867877269881, + "grad_norm": 0.9151492714881897, + "learning_rate": 7.227109807754969e-05, + "loss": 0.3053, + "step": 3915 + }, + { + "epoch": 0.6130244207889793, + "grad_norm": 1.36431086063385, + "learning_rate": 7.226295210166179e-05, + "loss": 0.3624, + "step": 3916 + }, + { + "epoch": 0.6131809643080777, + "grad_norm": 1.3360979557037354, + "learning_rate": 7.225480612577387e-05, + "loss": 0.9921, + "step": 3917 + }, + { + "epoch": 0.613337507827176, + "grad_norm": 1.917824149131775, + "learning_rate": 7.224666014988595e-05, + "loss": 0.5457, + "step": 3918 + }, + { + "epoch": 0.6134940513462742, + "grad_norm": 1.466226577758789, + "learning_rate": 7.223851417399805e-05, + "loss": 0.4694, + "step": 3919 + }, + { + "epoch": 0.6136505948653725, + "grad_norm": 2.8808796405792236, + "learning_rate": 7.223036819811014e-05, + "loss": 0.8421, + "step": 3920 + }, + { + "epoch": 0.6138071383844709, + "grad_norm": 0.9904908537864685, + "learning_rate": 7.222222222222222e-05, + "loss": 0.3814, + "step": 3921 + }, + { + "epoch": 0.6139636819035692, + "grad_norm": 1.4282481670379639, + "learning_rate": 7.221407624633432e-05, + "loss": 0.4737, + "step": 3922 + }, + { + "epoch": 0.6141202254226675, + "grad_norm": 1.7681618928909302, + "learning_rate": 7.22059302704464e-05, + "loss": 0.4343, + "step": 3923 + }, + { + "epoch": 0.6142767689417659, + "grad_norm": 3.114720582962036, + "learning_rate": 7.219778429455848e-05, + "loss": 0.5146, + "step": 3924 + }, + { + "epoch": 0.6144333124608641, + "grad_norm": 9.134531021118164, + "learning_rate": 7.218963831867058e-05, + "loss": 1.1991, + "step": 3925 + }, + { + "epoch": 0.6145898559799624, + "grad_norm": 2.34670090675354, + "learning_rate": 7.218149234278266e-05, + "loss": 0.6394, + "step": 3926 + }, + { + "epoch": 0.6147463994990607, + "grad_norm": 1.7255058288574219, + "learning_rate": 7.217334636689476e-05, 
+ "loss": 0.6857, + "step": 3927 + }, + { + "epoch": 0.6149029430181591, + "grad_norm": 2.0148377418518066, + "learning_rate": 7.216520039100685e-05, + "loss": 0.8134, + "step": 3928 + }, + { + "epoch": 0.6150594865372574, + "grad_norm": 2.44610595703125, + "learning_rate": 7.215705441511893e-05, + "loss": 0.6629, + "step": 3929 + }, + { + "epoch": 0.6152160300563556, + "grad_norm": 1.5726186037063599, + "learning_rate": 7.214890843923103e-05, + "loss": 0.4529, + "step": 3930 + }, + { + "epoch": 0.6153725735754539, + "grad_norm": 3.6320037841796875, + "learning_rate": 7.214076246334311e-05, + "loss": 0.894, + "step": 3931 + }, + { + "epoch": 0.6155291170945523, + "grad_norm": 2.3404176235198975, + "learning_rate": 7.21326164874552e-05, + "loss": 0.8046, + "step": 3932 + }, + { + "epoch": 0.6156856606136506, + "grad_norm": 2.419029951095581, + "learning_rate": 7.212447051156729e-05, + "loss": 0.7625, + "step": 3933 + }, + { + "epoch": 0.6158422041327489, + "grad_norm": 4.117671489715576, + "learning_rate": 7.211632453567938e-05, + "loss": 0.8434, + "step": 3934 + }, + { + "epoch": 0.6159987476518473, + "grad_norm": 2.7817366123199463, + "learning_rate": 7.210817855979146e-05, + "loss": 0.7957, + "step": 3935 + }, + { + "epoch": 0.6161552911709455, + "grad_norm": 5.117337703704834, + "learning_rate": 7.210003258390356e-05, + "loss": 1.035, + "step": 3936 + }, + { + "epoch": 0.6163118346900438, + "grad_norm": 4.646219253540039, + "learning_rate": 7.209188660801564e-05, + "loss": 1.6939, + "step": 3937 + }, + { + "epoch": 0.6164683782091421, + "grad_norm": 3.3624448776245117, + "learning_rate": 7.208374063212772e-05, + "loss": 1.1596, + "step": 3938 + }, + { + "epoch": 0.6166249217282405, + "grad_norm": 3.29414439201355, + "learning_rate": 7.207559465623982e-05, + "loss": 1.116, + "step": 3939 + }, + { + "epoch": 0.6167814652473388, + "grad_norm": 2.919261932373047, + "learning_rate": 7.206744868035192e-05, + "loss": 1.0026, + "step": 3940 + }, + { + "epoch": 
0.6169380087664371, + "grad_norm": 4.914209842681885, + "learning_rate": 7.205930270446399e-05, + "loss": 1.7225, + "step": 3941 + }, + { + "epoch": 0.6170945522855353, + "grad_norm": 4.807373046875, + "learning_rate": 7.205115672857609e-05, + "loss": 1.2906, + "step": 3942 + }, + { + "epoch": 0.6172510958046337, + "grad_norm": 3.3090410232543945, + "learning_rate": 7.204301075268818e-05, + "loss": 1.1899, + "step": 3943 + }, + { + "epoch": 0.617407639323732, + "grad_norm": 3.664698600769043, + "learning_rate": 7.203486477680025e-05, + "loss": 2.1178, + "step": 3944 + }, + { + "epoch": 0.6175641828428303, + "grad_norm": 3.049705982208252, + "learning_rate": 7.202671880091235e-05, + "loss": 1.2333, + "step": 3945 + }, + { + "epoch": 0.6177207263619287, + "grad_norm": 2.2145516872406006, + "learning_rate": 7.201857282502445e-05, + "loss": 1.0686, + "step": 3946 + }, + { + "epoch": 0.617877269881027, + "grad_norm": 4.213322162628174, + "learning_rate": 7.201042684913653e-05, + "loss": 0.7454, + "step": 3947 + }, + { + "epoch": 0.6180338134001252, + "grad_norm": 4.395740032196045, + "learning_rate": 7.200228087324862e-05, + "loss": 0.9817, + "step": 3948 + }, + { + "epoch": 0.6181903569192235, + "grad_norm": 3.773676633834839, + "learning_rate": 7.199413489736071e-05, + "loss": 1.0082, + "step": 3949 + }, + { + "epoch": 0.6183469004383219, + "grad_norm": 2.8936471939086914, + "learning_rate": 7.19859889214728e-05, + "loss": 0.9519, + "step": 3950 + }, + { + "epoch": 0.6185034439574202, + "grad_norm": 0.622107207775116, + "learning_rate": 7.197784294558488e-05, + "loss": 0.2714, + "step": 3951 + }, + { + "epoch": 0.6186599874765185, + "grad_norm": 0.4948064982891083, + "learning_rate": 7.196969696969698e-05, + "loss": 0.2846, + "step": 3952 + }, + { + "epoch": 0.6188165309956167, + "grad_norm": 0.5977234840393066, + "learning_rate": 7.196155099380906e-05, + "loss": 0.3544, + "step": 3953 + }, + { + "epoch": 0.6189730745147151, + "grad_norm": 0.8432263135910034, + 
"learning_rate": 7.195340501792115e-05, + "loss": 0.3575, + "step": 3954 + }, + { + "epoch": 0.6191296180338134, + "grad_norm": 0.6762120127677917, + "learning_rate": 7.194525904203324e-05, + "loss": 0.3053, + "step": 3955 + }, + { + "epoch": 0.6192861615529117, + "grad_norm": 0.7471327185630798, + "learning_rate": 7.193711306614533e-05, + "loss": 0.3695, + "step": 3956 + }, + { + "epoch": 0.61944270507201, + "grad_norm": 0.7542109489440918, + "learning_rate": 7.192896709025741e-05, + "loss": 0.3155, + "step": 3957 + }, + { + "epoch": 0.6195992485911084, + "grad_norm": 0.8401228785514832, + "learning_rate": 7.192082111436951e-05, + "loss": 0.3783, + "step": 3958 + }, + { + "epoch": 0.6197557921102066, + "grad_norm": 0.9318457841873169, + "learning_rate": 7.191267513848159e-05, + "loss": 0.4224, + "step": 3959 + }, + { + "epoch": 0.6199123356293049, + "grad_norm": 0.855857789516449, + "learning_rate": 7.190452916259367e-05, + "loss": 0.3364, + "step": 3960 + }, + { + "epoch": 0.6200688791484033, + "grad_norm": 0.9321397542953491, + "learning_rate": 7.189638318670577e-05, + "loss": 0.2805, + "step": 3961 + }, + { + "epoch": 0.6202254226675016, + "grad_norm": 0.8587724566459656, + "learning_rate": 7.188823721081786e-05, + "loss": 0.2618, + "step": 3962 + }, + { + "epoch": 0.6203819661865999, + "grad_norm": 0.8477203249931335, + "learning_rate": 7.188009123492995e-05, + "loss": 0.3386, + "step": 3963 + }, + { + "epoch": 0.6205385097056982, + "grad_norm": 2.613424301147461, + "learning_rate": 7.187194525904204e-05, + "loss": 0.4731, + "step": 3964 + }, + { + "epoch": 0.6206950532247965, + "grad_norm": 0.8483730554580688, + "learning_rate": 7.186379928315412e-05, + "loss": 0.4293, + "step": 3965 + }, + { + "epoch": 0.6208515967438948, + "grad_norm": 1.2565417289733887, + "learning_rate": 7.185565330726622e-05, + "loss": 0.2402, + "step": 3966 + }, + { + "epoch": 0.6210081402629931, + "grad_norm": 2.4790115356445312, + "learning_rate": 7.18475073313783e-05, + "loss": 
0.3951, + "step": 3967 + }, + { + "epoch": 0.6211646837820914, + "grad_norm": 1.964317798614502, + "learning_rate": 7.183936135549039e-05, + "loss": 0.5538, + "step": 3968 + }, + { + "epoch": 0.6213212273011898, + "grad_norm": 1.6911293268203735, + "learning_rate": 7.183121537960248e-05, + "loss": 0.8067, + "step": 3969 + }, + { + "epoch": 0.621477770820288, + "grad_norm": 1.0256625413894653, + "learning_rate": 7.182306940371457e-05, + "loss": 0.6569, + "step": 3970 + }, + { + "epoch": 0.6216343143393863, + "grad_norm": 1.0340994596481323, + "learning_rate": 7.181492342782665e-05, + "loss": 0.273, + "step": 3971 + }, + { + "epoch": 0.6217908578584846, + "grad_norm": 2.8374288082122803, + "learning_rate": 7.180677745193875e-05, + "loss": 0.6021, + "step": 3972 + }, + { + "epoch": 0.621947401377583, + "grad_norm": 1.56617271900177, + "learning_rate": 7.179863147605083e-05, + "loss": 0.4047, + "step": 3973 + }, + { + "epoch": 0.6221039448966813, + "grad_norm": 2.797501564025879, + "learning_rate": 7.179048550016292e-05, + "loss": 0.5352, + "step": 3974 + }, + { + "epoch": 0.6222604884157796, + "grad_norm": 1.8821336030960083, + "learning_rate": 7.178233952427501e-05, + "loss": 0.7107, + "step": 3975 + }, + { + "epoch": 0.6224170319348779, + "grad_norm": 1.9744617938995361, + "learning_rate": 7.177419354838711e-05, + "loss": 0.4779, + "step": 3976 + }, + { + "epoch": 0.6225735754539762, + "grad_norm": 3.041905403137207, + "learning_rate": 7.176604757249918e-05, + "loss": 0.6616, + "step": 3977 + }, + { + "epoch": 0.6227301189730745, + "grad_norm": 1.4562413692474365, + "learning_rate": 7.175790159661128e-05, + "loss": 0.5731, + "step": 3978 + }, + { + "epoch": 0.6228866624921728, + "grad_norm": 5.017227649688721, + "learning_rate": 7.174975562072337e-05, + "loss": 1.0157, + "step": 3979 + }, + { + "epoch": 0.6230432060112712, + "grad_norm": 4.560835361480713, + "learning_rate": 7.174160964483544e-05, + "loss": 1.1047, + "step": 3980 + }, + { + "epoch": 
0.6231997495303695, + "grad_norm": 2.313770294189453, + "learning_rate": 7.173346366894754e-05, + "loss": 0.8591, + "step": 3981 + }, + { + "epoch": 0.6233562930494677, + "grad_norm": 4.254480361938477, + "learning_rate": 7.172531769305964e-05, + "loss": 0.8015, + "step": 3982 + }, + { + "epoch": 0.623512836568566, + "grad_norm": 1.8769431114196777, + "learning_rate": 7.171717171717171e-05, + "loss": 0.8352, + "step": 3983 + }, + { + "epoch": 0.6236693800876644, + "grad_norm": 1.6235374212265015, + "learning_rate": 7.170902574128381e-05, + "loss": 0.8967, + "step": 3984 + }, + { + "epoch": 0.6238259236067627, + "grad_norm": 1.672648549079895, + "learning_rate": 7.17008797653959e-05, + "loss": 0.4923, + "step": 3985 + }, + { + "epoch": 0.623982467125861, + "grad_norm": 2.8542118072509766, + "learning_rate": 7.169273378950799e-05, + "loss": 1.1915, + "step": 3986 + }, + { + "epoch": 0.6241390106449592, + "grad_norm": 1.1852918863296509, + "learning_rate": 7.168458781362007e-05, + "loss": 0.763, + "step": 3987 + }, + { + "epoch": 0.6242955541640576, + "grad_norm": 2.208726644515991, + "learning_rate": 7.167644183773217e-05, + "loss": 0.7429, + "step": 3988 + }, + { + "epoch": 0.6244520976831559, + "grad_norm": 1.6211552619934082, + "learning_rate": 7.166829586184425e-05, + "loss": 0.2756, + "step": 3989 + }, + { + "epoch": 0.6246086412022542, + "grad_norm": 4.184123516082764, + "learning_rate": 7.166014988595634e-05, + "loss": 1.4804, + "step": 3990 + }, + { + "epoch": 0.6247651847213526, + "grad_norm": 3.7560129165649414, + "learning_rate": 7.165200391006843e-05, + "loss": 1.5841, + "step": 3991 + }, + { + "epoch": 0.6249217282404509, + "grad_norm": 2.2105586528778076, + "learning_rate": 7.164385793418052e-05, + "loss": 1.1518, + "step": 3992 + }, + { + "epoch": 0.6250782717595491, + "grad_norm": 3.064943790435791, + "learning_rate": 7.16357119582926e-05, + "loss": 1.6727, + "step": 3993 + }, + { + "epoch": 0.6252348152786474, + "grad_norm": 4.357314109802246, + 
"learning_rate": 7.16275659824047e-05, + "loss": 1.1761, + "step": 3994 + }, + { + "epoch": 0.6253913587977458, + "grad_norm": 3.2794041633605957, + "learning_rate": 7.161942000651678e-05, + "loss": 0.8968, + "step": 3995 + }, + { + "epoch": 0.6255479023168441, + "grad_norm": 1.0701076984405518, + "learning_rate": 7.161127403062887e-05, + "loss": 0.3659, + "step": 3996 + }, + { + "epoch": 0.6257044458359424, + "grad_norm": 1.3866759538650513, + "learning_rate": 7.160312805474096e-05, + "loss": 0.2641, + "step": 3997 + }, + { + "epoch": 0.6258609893550408, + "grad_norm": 3.704007387161255, + "learning_rate": 7.159498207885305e-05, + "loss": 0.957, + "step": 3998 + }, + { + "epoch": 0.626017532874139, + "grad_norm": 3.317664623260498, + "learning_rate": 7.158683610296514e-05, + "loss": 0.8084, + "step": 3999 + }, + { + "epoch": 0.6261740763932373, + "grad_norm": 1.8763704299926758, + "learning_rate": 7.157869012707723e-05, + "loss": 0.4789, + "step": 4000 + }, + { + "epoch": 0.6261740763932373, + "eval_loss": 0.5680548548698425, + "eval_runtime": 202.3671, + "eval_samples_per_second": 61.191, + "eval_steps_per_second": 3.825, + "eval_wer": 0.3520836432411756, + "step": 4000 + }, + { + "epoch": 0.6263306199123356, + "grad_norm": 0.5184564590454102, + "learning_rate": 7.157054415118931e-05, + "loss": 0.2456, + "step": 4001 + }, + { + "epoch": 0.626487163431434, + "grad_norm": 0.6123303174972534, + "learning_rate": 7.156239817530141e-05, + "loss": 0.2253, + "step": 4002 + }, + { + "epoch": 0.6266437069505323, + "grad_norm": 0.38349056243896484, + "learning_rate": 7.155425219941349e-05, + "loss": 0.1479, + "step": 4003 + }, + { + "epoch": 0.6268002504696305, + "grad_norm": 0.7966318130493164, + "learning_rate": 7.154610622352558e-05, + "loss": 0.2951, + "step": 4004 + }, + { + "epoch": 0.6269567939887288, + "grad_norm": 0.6110249161720276, + "learning_rate": 7.153796024763767e-05, + "loss": 0.2406, + "step": 4005 + }, + { + "epoch": 0.6271133375078272, + "grad_norm": 
1.6781020164489746, + "learning_rate": 7.152981427174976e-05, + "loss": 0.2985, + "step": 4006 + }, + { + "epoch": 0.6272698810269255, + "grad_norm": 0.7489010095596313, + "learning_rate": 7.152166829586184e-05, + "loss": 0.2395, + "step": 4007 + }, + { + "epoch": 0.6274264245460238, + "grad_norm": 0.6254457831382751, + "learning_rate": 7.151352231997394e-05, + "loss": 0.3247, + "step": 4008 + }, + { + "epoch": 0.6275829680651221, + "grad_norm": 0.829120397567749, + "learning_rate": 7.150537634408602e-05, + "loss": 0.2446, + "step": 4009 + }, + { + "epoch": 0.6277395115842204, + "grad_norm": 1.5489546060562134, + "learning_rate": 7.14972303681981e-05, + "loss": 0.3209, + "step": 4010 + }, + { + "epoch": 0.6278960551033187, + "grad_norm": 1.4028329849243164, + "learning_rate": 7.14890843923102e-05, + "loss": 0.7168, + "step": 4011 + }, + { + "epoch": 0.628052598622417, + "grad_norm": 0.8241497874259949, + "learning_rate": 7.148093841642229e-05, + "loss": 0.2985, + "step": 4012 + }, + { + "epoch": 0.6282091421415154, + "grad_norm": 1.1835201978683472, + "learning_rate": 7.147279244053437e-05, + "loss": 0.2363, + "step": 4013 + }, + { + "epoch": 0.6283656856606137, + "grad_norm": 2.104499101638794, + "learning_rate": 7.146464646464647e-05, + "loss": 0.3609, + "step": 4014 + }, + { + "epoch": 0.628522229179712, + "grad_norm": 1.0410614013671875, + "learning_rate": 7.145650048875857e-05, + "loss": 0.363, + "step": 4015 + }, + { + "epoch": 0.6286787726988102, + "grad_norm": 1.179788589477539, + "learning_rate": 7.144835451287064e-05, + "loss": 0.3684, + "step": 4016 + }, + { + "epoch": 0.6288353162179086, + "grad_norm": 1.1474721431732178, + "learning_rate": 7.144020853698273e-05, + "loss": 0.4978, + "step": 4017 + }, + { + "epoch": 0.6289918597370069, + "grad_norm": 1.2667263746261597, + "learning_rate": 7.143206256109483e-05, + "loss": 0.4949, + "step": 4018 + }, + { + "epoch": 0.6291484032561052, + "grad_norm": 1.1795562505722046, + "learning_rate": 
7.14239165852069e-05, + "loss": 0.3509, + "step": 4019 + }, + { + "epoch": 0.6293049467752035, + "grad_norm": 1.0589113235473633, + "learning_rate": 7.1415770609319e-05, + "loss": 0.2482, + "step": 4020 + }, + { + "epoch": 0.6294614902943018, + "grad_norm": 1.4888694286346436, + "learning_rate": 7.14076246334311e-05, + "loss": 0.6945, + "step": 4021 + }, + { + "epoch": 0.6296180338134001, + "grad_norm": 5.316585063934326, + "learning_rate": 7.139947865754318e-05, + "loss": 0.8729, + "step": 4022 + }, + { + "epoch": 0.6297745773324984, + "grad_norm": 1.338080883026123, + "learning_rate": 7.139133268165526e-05, + "loss": 0.6833, + "step": 4023 + }, + { + "epoch": 0.6299311208515967, + "grad_norm": 1.6352275609970093, + "learning_rate": 7.138318670576736e-05, + "loss": 0.3879, + "step": 4024 + }, + { + "epoch": 0.6300876643706951, + "grad_norm": 1.638392448425293, + "learning_rate": 7.137504072987944e-05, + "loss": 0.6886, + "step": 4025 + }, + { + "epoch": 0.6302442078897934, + "grad_norm": 1.6636238098144531, + "learning_rate": 7.136689475399153e-05, + "loss": 0.4412, + "step": 4026 + }, + { + "epoch": 0.6304007514088916, + "grad_norm": 2.60396146774292, + "learning_rate": 7.135874877810362e-05, + "loss": 0.4947, + "step": 4027 + }, + { + "epoch": 0.63055729492799, + "grad_norm": 5.837787628173828, + "learning_rate": 7.135060280221571e-05, + "loss": 0.9766, + "step": 4028 + }, + { + "epoch": 0.6307138384470883, + "grad_norm": 3.400256633758545, + "learning_rate": 7.134245682632779e-05, + "loss": 0.7033, + "step": 4029 + }, + { + "epoch": 0.6308703819661866, + "grad_norm": 3.498169422149658, + "learning_rate": 7.133431085043989e-05, + "loss": 0.985, + "step": 4030 + }, + { + "epoch": 0.6310269254852849, + "grad_norm": 2.733015298843384, + "learning_rate": 7.132616487455197e-05, + "loss": 0.7181, + "step": 4031 + }, + { + "epoch": 0.6311834690043833, + "grad_norm": 3.041734218597412, + "learning_rate": 7.131801889866406e-05, + "loss": 0.8177, + "step": 4032 + }, + { + 
"epoch": 0.6313400125234815, + "grad_norm": 2.8727657794952393, + "learning_rate": 7.130987292277615e-05, + "loss": 0.539, + "step": 4033 + }, + { + "epoch": 0.6314965560425798, + "grad_norm": 1.9304410219192505, + "learning_rate": 7.130172694688824e-05, + "loss": 0.6488, + "step": 4034 + }, + { + "epoch": 0.6316530995616781, + "grad_norm": 4.15205717086792, + "learning_rate": 7.129358097100034e-05, + "loss": 0.8884, + "step": 4035 + }, + { + "epoch": 0.6318096430807765, + "grad_norm": 3.55513072013855, + "learning_rate": 7.128543499511242e-05, + "loss": 1.6036, + "step": 4036 + }, + { + "epoch": 0.6319661865998748, + "grad_norm": 2.2254080772399902, + "learning_rate": 7.12772890192245e-05, + "loss": 0.5239, + "step": 4037 + }, + { + "epoch": 0.632122730118973, + "grad_norm": 5.128606796264648, + "learning_rate": 7.12691430433366e-05, + "loss": 1.0426, + "step": 4038 + }, + { + "epoch": 0.6322792736380713, + "grad_norm": 2.453298568725586, + "learning_rate": 7.126099706744868e-05, + "loss": 0.9575, + "step": 4039 + }, + { + "epoch": 0.6324358171571697, + "grad_norm": 4.878091812133789, + "learning_rate": 7.125285109156077e-05, + "loss": 1.3099, + "step": 4040 + }, + { + "epoch": 0.632592360676268, + "grad_norm": 5.231321334838867, + "learning_rate": 7.124470511567287e-05, + "loss": 1.5026, + "step": 4041 + }, + { + "epoch": 0.6327489041953663, + "grad_norm": 4.322978973388672, + "learning_rate": 7.123655913978495e-05, + "loss": 1.0803, + "step": 4042 + }, + { + "epoch": 0.6329054477144647, + "grad_norm": 4.630960464477539, + "learning_rate": 7.122841316389703e-05, + "loss": 1.516, + "step": 4043 + }, + { + "epoch": 0.6330619912335629, + "grad_norm": 4.173024654388428, + "learning_rate": 7.122026718800913e-05, + "loss": 1.5266, + "step": 4044 + }, + { + "epoch": 0.6332185347526612, + "grad_norm": 2.521254301071167, + "learning_rate": 7.121212121212121e-05, + "loss": 0.7765, + "step": 4045 + }, + { + "epoch": 0.6333750782717595, + "grad_norm": 2.1717076301574707, + 
"learning_rate": 7.12039752362333e-05, + "loss": 0.9039, + "step": 4046 + }, + { + "epoch": 0.6335316217908579, + "grad_norm": 4.085787773132324, + "learning_rate": 7.11958292603454e-05, + "loss": 0.6167, + "step": 4047 + }, + { + "epoch": 0.6336881653099562, + "grad_norm": 5.298760414123535, + "learning_rate": 7.118768328445748e-05, + "loss": 1.4995, + "step": 4048 + }, + { + "epoch": 0.6338447088290545, + "grad_norm": 2.72283935546875, + "learning_rate": 7.117953730856956e-05, + "loss": 0.9365, + "step": 4049 + }, + { + "epoch": 0.6340012523481527, + "grad_norm": 5.8402323722839355, + "learning_rate": 7.117139133268166e-05, + "loss": 1.8611, + "step": 4050 + }, + { + "epoch": 0.6341577958672511, + "grad_norm": 0.4091038107872009, + "learning_rate": 7.116324535679376e-05, + "loss": 0.269, + "step": 4051 + }, + { + "epoch": 0.6343143393863494, + "grad_norm": 0.7593440413475037, + "learning_rate": 7.115509938090583e-05, + "loss": 0.2856, + "step": 4052 + }, + { + "epoch": 0.6344708829054477, + "grad_norm": 0.8198299407958984, + "learning_rate": 7.114695340501792e-05, + "loss": 0.3325, + "step": 4053 + }, + { + "epoch": 0.6346274264245461, + "grad_norm": 0.7222666144371033, + "learning_rate": 7.113880742913002e-05, + "loss": 0.1831, + "step": 4054 + }, + { + "epoch": 0.6347839699436444, + "grad_norm": 0.6263940334320068, + "learning_rate": 7.113066145324209e-05, + "loss": 0.2646, + "step": 4055 + }, + { + "epoch": 0.6349405134627426, + "grad_norm": 0.7704491019248962, + "learning_rate": 7.112251547735419e-05, + "loss": 0.3597, + "step": 4056 + }, + { + "epoch": 0.6350970569818409, + "grad_norm": 1.380808711051941, + "learning_rate": 7.111436950146629e-05, + "loss": 0.3396, + "step": 4057 + }, + { + "epoch": 0.6352536005009393, + "grad_norm": 1.0140278339385986, + "learning_rate": 7.110622352557837e-05, + "loss": 0.3422, + "step": 4058 + }, + { + "epoch": 0.6354101440200376, + "grad_norm": 1.5128672122955322, + "learning_rate": 7.109807754969045e-05, + "loss": 0.333, 
+ "step": 4059 + }, + { + "epoch": 0.6355666875391359, + "grad_norm": 1.1953186988830566, + "learning_rate": 7.108993157380255e-05, + "loss": 0.3238, + "step": 4060 + }, + { + "epoch": 0.6357232310582341, + "grad_norm": 1.5451245307922363, + "learning_rate": 7.108178559791463e-05, + "loss": 0.6668, + "step": 4061 + }, + { + "epoch": 0.6358797745773325, + "grad_norm": 1.3509916067123413, + "learning_rate": 7.107363962202672e-05, + "loss": 0.3405, + "step": 4062 + }, + { + "epoch": 0.6360363180964308, + "grad_norm": 1.486836552619934, + "learning_rate": 7.106549364613882e-05, + "loss": 0.4089, + "step": 4063 + }, + { + "epoch": 0.6361928616155291, + "grad_norm": 0.718737006187439, + "learning_rate": 7.10573476702509e-05, + "loss": 0.4116, + "step": 4064 + }, + { + "epoch": 0.6363494051346275, + "grad_norm": 1.3487416505813599, + "learning_rate": 7.104920169436298e-05, + "loss": 0.3456, + "step": 4065 + }, + { + "epoch": 0.6365059486537258, + "grad_norm": 0.9967809319496155, + "learning_rate": 7.104105571847508e-05, + "loss": 0.4296, + "step": 4066 + }, + { + "epoch": 0.636662492172824, + "grad_norm": 1.2724248170852661, + "learning_rate": 7.103290974258716e-05, + "loss": 0.5024, + "step": 4067 + }, + { + "epoch": 0.6368190356919223, + "grad_norm": 2.5215675830841064, + "learning_rate": 7.102476376669925e-05, + "loss": 0.3499, + "step": 4068 + }, + { + "epoch": 0.6369755792110207, + "grad_norm": 1.6018798351287842, + "learning_rate": 7.101661779081135e-05, + "loss": 0.4748, + "step": 4069 + }, + { + "epoch": 0.637132122730119, + "grad_norm": 1.5177613496780396, + "learning_rate": 7.100847181492343e-05, + "loss": 0.8357, + "step": 4070 + }, + { + "epoch": 0.6372886662492173, + "grad_norm": 1.5786387920379639, + "learning_rate": 7.100032583903551e-05, + "loss": 0.5266, + "step": 4071 + }, + { + "epoch": 0.6374452097683156, + "grad_norm": 2.904491662979126, + "learning_rate": 7.099217986314761e-05, + "loss": 0.8952, + "step": 4072 + }, + { + "epoch": 0.6376017532874139, 
+ "grad_norm": 4.233807563781738, + "learning_rate": 7.09840338872597e-05, + "loss": 0.7068, + "step": 4073 + }, + { + "epoch": 0.6377582968065122, + "grad_norm": 2.2421011924743652, + "learning_rate": 7.097588791137179e-05, + "loss": 0.6392, + "step": 4074 + }, + { + "epoch": 0.6379148403256105, + "grad_norm": 2.2943003177642822, + "learning_rate": 7.096774193548388e-05, + "loss": 0.7441, + "step": 4075 + }, + { + "epoch": 0.6380713838447089, + "grad_norm": 3.074669361114502, + "learning_rate": 7.095959595959596e-05, + "loss": 0.9442, + "step": 4076 + }, + { + "epoch": 0.6382279273638072, + "grad_norm": 2.1546573638916016, + "learning_rate": 7.095144998370806e-05, + "loss": 0.7166, + "step": 4077 + }, + { + "epoch": 0.6383844708829054, + "grad_norm": 3.267489194869995, + "learning_rate": 7.094330400782014e-05, + "loss": 0.513, + "step": 4078 + }, + { + "epoch": 0.6385410144020037, + "grad_norm": 3.2228360176086426, + "learning_rate": 7.093515803193222e-05, + "loss": 0.7661, + "step": 4079 + }, + { + "epoch": 0.638697557921102, + "grad_norm": 3.362151622772217, + "learning_rate": 7.092701205604432e-05, + "loss": 1.0373, + "step": 4080 + }, + { + "epoch": 0.6388541014402004, + "grad_norm": 2.363523006439209, + "learning_rate": 7.09188660801564e-05, + "loss": 0.8131, + "step": 4081 + }, + { + "epoch": 0.6390106449592987, + "grad_norm": 2.442619800567627, + "learning_rate": 7.091072010426849e-05, + "loss": 0.5424, + "step": 4082 + }, + { + "epoch": 0.639167188478397, + "grad_norm": 3.1112220287323, + "learning_rate": 7.090257412838059e-05, + "loss": 1.0884, + "step": 4083 + }, + { + "epoch": 0.6393237319974953, + "grad_norm": 1.9161471128463745, + "learning_rate": 7.089442815249267e-05, + "loss": 0.4981, + "step": 4084 + }, + { + "epoch": 0.6394802755165936, + "grad_norm": 2.2730491161346436, + "learning_rate": 7.088628217660475e-05, + "loss": 1.033, + "step": 4085 + }, + { + "epoch": 0.6396368190356919, + "grad_norm": 2.718834161758423, + "learning_rate": 
7.087813620071685e-05, + "loss": 1.2705, + "step": 4086 + }, + { + "epoch": 0.6397933625547902, + "grad_norm": 2.256880283355713, + "learning_rate": 7.086999022482895e-05, + "loss": 0.8141, + "step": 4087 + }, + { + "epoch": 0.6399499060738886, + "grad_norm": 2.558476448059082, + "learning_rate": 7.086184424894102e-05, + "loss": 1.2563, + "step": 4088 + }, + { + "epoch": 0.6401064495929869, + "grad_norm": 6.324321746826172, + "learning_rate": 7.085369827305312e-05, + "loss": 1.0598, + "step": 4089 + }, + { + "epoch": 0.6402629931120851, + "grad_norm": 4.784167289733887, + "learning_rate": 7.084555229716521e-05, + "loss": 1.5546, + "step": 4090 + }, + { + "epoch": 0.6404195366311835, + "grad_norm": 3.4937503337860107, + "learning_rate": 7.083740632127728e-05, + "loss": 0.8985, + "step": 4091 + }, + { + "epoch": 0.6405760801502818, + "grad_norm": 3.0360963344573975, + "learning_rate": 7.082926034538938e-05, + "loss": 0.8769, + "step": 4092 + }, + { + "epoch": 0.6407326236693801, + "grad_norm": 2.3702633380889893, + "learning_rate": 7.082111436950148e-05, + "loss": 0.9607, + "step": 4093 + }, + { + "epoch": 0.6408891671884784, + "grad_norm": 4.978992938995361, + "learning_rate": 7.081296839361356e-05, + "loss": 1.3954, + "step": 4094 + }, + { + "epoch": 0.6410457107075767, + "grad_norm": 6.189276218414307, + "learning_rate": 7.080482241772565e-05, + "loss": 0.5675, + "step": 4095 + }, + { + "epoch": 0.641202254226675, + "grad_norm": 2.064746379852295, + "learning_rate": 7.079667644183774e-05, + "loss": 0.6357, + "step": 4096 + }, + { + "epoch": 0.6413587977457733, + "grad_norm": 2.740323781967163, + "learning_rate": 7.078853046594983e-05, + "loss": 1.2606, + "step": 4097 + }, + { + "epoch": 0.6415153412648716, + "grad_norm": 3.894662857055664, + "learning_rate": 7.078038449006191e-05, + "loss": 1.2527, + "step": 4098 + }, + { + "epoch": 0.64167188478397, + "grad_norm": 1.8819857835769653, + "learning_rate": 7.077223851417401e-05, + "loss": 1.2173, + "step": 4099 + }, 
+ { + "epoch": 0.6418284283030683, + "grad_norm": 2.6567904949188232, + "learning_rate": 7.076409253828609e-05, + "loss": 1.1724, + "step": 4100 + }, + { + "epoch": 0.6419849718221665, + "grad_norm": 0.7583183646202087, + "learning_rate": 7.075594656239817e-05, + "loss": 0.4343, + "step": 4101 + }, + { + "epoch": 0.6421415153412648, + "grad_norm": 0.7557559013366699, + "learning_rate": 7.074780058651027e-05, + "loss": 0.2886, + "step": 4102 + }, + { + "epoch": 0.6422980588603632, + "grad_norm": 0.8399807214736938, + "learning_rate": 7.073965461062236e-05, + "loss": 0.2382, + "step": 4103 + }, + { + "epoch": 0.6424546023794615, + "grad_norm": 0.6882436275482178, + "learning_rate": 7.073150863473444e-05, + "loss": 0.3059, + "step": 4104 + }, + { + "epoch": 0.6426111458985598, + "grad_norm": 0.9894430637359619, + "learning_rate": 7.072336265884654e-05, + "loss": 0.3557, + "step": 4105 + }, + { + "epoch": 0.6427676894176582, + "grad_norm": 0.46684709191322327, + "learning_rate": 7.071521668295862e-05, + "loss": 0.2844, + "step": 4106 + }, + { + "epoch": 0.6429242329367564, + "grad_norm": 0.6804313063621521, + "learning_rate": 7.07070707070707e-05, + "loss": 0.3142, + "step": 4107 + }, + { + "epoch": 0.6430807764558547, + "grad_norm": 1.00410795211792, + "learning_rate": 7.06989247311828e-05, + "loss": 0.3573, + "step": 4108 + }, + { + "epoch": 0.643237319974953, + "grad_norm": 2.023380994796753, + "learning_rate": 7.069077875529489e-05, + "loss": 0.3894, + "step": 4109 + }, + { + "epoch": 0.6433938634940514, + "grad_norm": 0.5460801720619202, + "learning_rate": 7.068263277940698e-05, + "loss": 0.3014, + "step": 4110 + }, + { + "epoch": 0.6435504070131497, + "grad_norm": 0.9008684158325195, + "learning_rate": 7.067448680351907e-05, + "loss": 0.4341, + "step": 4111 + }, + { + "epoch": 0.6437069505322479, + "grad_norm": 2.2813901901245117, + "learning_rate": 7.066634082763115e-05, + "loss": 0.4422, + "step": 4112 + }, + { + "epoch": 0.6438634940513462, + "grad_norm": 
0.9679890275001526, + "learning_rate": 7.065819485174325e-05, + "loss": 0.3195, + "step": 4113 + }, + { + "epoch": 0.6440200375704446, + "grad_norm": 1.719936490058899, + "learning_rate": 7.065004887585533e-05, + "loss": 0.3792, + "step": 4114 + }, + { + "epoch": 0.6441765810895429, + "grad_norm": 1.4312705993652344, + "learning_rate": 7.064190289996741e-05, + "loss": 0.3485, + "step": 4115 + }, + { + "epoch": 0.6443331246086412, + "grad_norm": 2.564697027206421, + "learning_rate": 7.063375692407951e-05, + "loss": 0.8211, + "step": 4116 + }, + { + "epoch": 0.6444896681277396, + "grad_norm": 1.0948103666305542, + "learning_rate": 7.06256109481916e-05, + "loss": 0.3807, + "step": 4117 + }, + { + "epoch": 0.6446462116468378, + "grad_norm": 1.9665987491607666, + "learning_rate": 7.061746497230368e-05, + "loss": 0.643, + "step": 4118 + }, + { + "epoch": 0.6448027551659361, + "grad_norm": 1.431214451789856, + "learning_rate": 7.060931899641578e-05, + "loss": 0.378, + "step": 4119 + }, + { + "epoch": 0.6449592986850344, + "grad_norm": 2.0006844997406006, + "learning_rate": 7.060117302052786e-05, + "loss": 0.7017, + "step": 4120 + }, + { + "epoch": 0.6451158422041328, + "grad_norm": 1.6398265361785889, + "learning_rate": 7.059302704463994e-05, + "loss": 0.6125, + "step": 4121 + }, + { + "epoch": 0.6452723857232311, + "grad_norm": 1.4262232780456543, + "learning_rate": 7.058488106875204e-05, + "loss": 0.4777, + "step": 4122 + }, + { + "epoch": 0.6454289292423294, + "grad_norm": 3.1118104457855225, + "learning_rate": 7.057673509286414e-05, + "loss": 0.6016, + "step": 4123 + }, + { + "epoch": 0.6455854727614276, + "grad_norm": 1.3902126550674438, + "learning_rate": 7.056858911697621e-05, + "loss": 0.3865, + "step": 4124 + }, + { + "epoch": 0.645742016280526, + "grad_norm": 1.9578372240066528, + "learning_rate": 7.05604431410883e-05, + "loss": 0.5977, + "step": 4125 + }, + { + "epoch": 0.6458985597996243, + "grad_norm": 1.458163857460022, + "learning_rate": 
7.05522971652004e-05, + "loss": 0.7894, + "step": 4126 + }, + { + "epoch": 0.6460551033187226, + "grad_norm": 1.9610273838043213, + "learning_rate": 7.054415118931247e-05, + "loss": 0.81, + "step": 4127 + }, + { + "epoch": 0.646211646837821, + "grad_norm": 3.130499839782715, + "learning_rate": 7.053600521342457e-05, + "loss": 0.8929, + "step": 4128 + }, + { + "epoch": 0.6463681903569192, + "grad_norm": 2.6079819202423096, + "learning_rate": 7.052785923753667e-05, + "loss": 0.9198, + "step": 4129 + }, + { + "epoch": 0.6465247338760175, + "grad_norm": 2.5300586223602295, + "learning_rate": 7.051971326164874e-05, + "loss": 0.8187, + "step": 4130 + }, + { + "epoch": 0.6466812773951158, + "grad_norm": 2.5399575233459473, + "learning_rate": 7.051156728576084e-05, + "loss": 0.6531, + "step": 4131 + }, + { + "epoch": 0.6468378209142142, + "grad_norm": 2.7596874237060547, + "learning_rate": 7.050342130987293e-05, + "loss": 0.5118, + "step": 4132 + }, + { + "epoch": 0.6469943644333125, + "grad_norm": 2.168501138687134, + "learning_rate": 7.049527533398502e-05, + "loss": 1.1901, + "step": 4133 + }, + { + "epoch": 0.6471509079524108, + "grad_norm": 1.709295392036438, + "learning_rate": 7.04871293580971e-05, + "loss": 0.4286, + "step": 4134 + }, + { + "epoch": 0.647307451471509, + "grad_norm": 1.5526564121246338, + "learning_rate": 7.04789833822092e-05, + "loss": 0.8409, + "step": 4135 + }, + { + "epoch": 0.6474639949906074, + "grad_norm": 4.063093662261963, + "learning_rate": 7.047083740632128e-05, + "loss": 1.7659, + "step": 4136 + }, + { + "epoch": 0.6476205385097057, + "grad_norm": 2.062741994857788, + "learning_rate": 7.046269143043337e-05, + "loss": 0.744, + "step": 4137 + }, + { + "epoch": 0.647777082028804, + "grad_norm": 2.4378039836883545, + "learning_rate": 7.045454545454546e-05, + "loss": 0.8707, + "step": 4138 + }, + { + "epoch": 0.6479336255479023, + "grad_norm": 1.9690673351287842, + "learning_rate": 7.044639947865755e-05, + "loss": 0.8501, + "step": 4139 + }, + 
{ + "epoch": 0.6480901690670007, + "grad_norm": 2.0832738876342773, + "learning_rate": 7.043825350276963e-05, + "loss": 1.2283, + "step": 4140 + }, + { + "epoch": 0.6482467125860989, + "grad_norm": 3.087444305419922, + "learning_rate": 7.043010752688173e-05, + "loss": 1.5801, + "step": 4141 + }, + { + "epoch": 0.6484032561051972, + "grad_norm": 2.0335288047790527, + "learning_rate": 7.042196155099381e-05, + "loss": 1.4548, + "step": 4142 + }, + { + "epoch": 0.6485597996242956, + "grad_norm": 2.5438923835754395, + "learning_rate": 7.04138155751059e-05, + "loss": 1.2667, + "step": 4143 + }, + { + "epoch": 0.6487163431433939, + "grad_norm": 2.394395112991333, + "learning_rate": 7.040566959921799e-05, + "loss": 1.4884, + "step": 4144 + }, + { + "epoch": 0.6488728866624922, + "grad_norm": 3.9514219760894775, + "learning_rate": 7.039752362333008e-05, + "loss": 1.0516, + "step": 4145 + }, + { + "epoch": 0.6490294301815904, + "grad_norm": 3.4415695667266846, + "learning_rate": 7.038937764744217e-05, + "loss": 1.083, + "step": 4146 + }, + { + "epoch": 0.6491859737006888, + "grad_norm": 1.521572232246399, + "learning_rate": 7.038123167155426e-05, + "loss": 0.6453, + "step": 4147 + }, + { + "epoch": 0.6493425172197871, + "grad_norm": 2.1153311729431152, + "learning_rate": 7.037308569566634e-05, + "loss": 0.8638, + "step": 4148 + }, + { + "epoch": 0.6494990607388854, + "grad_norm": 2.721769332885742, + "learning_rate": 7.036493971977844e-05, + "loss": 1.0239, + "step": 4149 + }, + { + "epoch": 0.6496556042579837, + "grad_norm": 3.856257677078247, + "learning_rate": 7.035679374389052e-05, + "loss": 1.5095, + "step": 4150 + }, + { + "epoch": 0.6498121477770821, + "grad_norm": 0.5057368278503418, + "learning_rate": 7.03486477680026e-05, + "loss": 0.2121, + "step": 4151 + }, + { + "epoch": 0.6499686912961803, + "grad_norm": 0.46311473846435547, + "learning_rate": 7.03405017921147e-05, + "loss": 0.211, + "step": 4152 + }, + { + "epoch": 0.6501252348152786, + "grad_norm": 
0.8797329068183899, + "learning_rate": 7.033235581622679e-05, + "loss": 0.3142, + "step": 4153 + }, + { + "epoch": 0.650281778334377, + "grad_norm": 0.8221085071563721, + "learning_rate": 7.032420984033887e-05, + "loss": 0.3273, + "step": 4154 + }, + { + "epoch": 0.6504383218534753, + "grad_norm": 0.8498123288154602, + "learning_rate": 7.031606386445097e-05, + "loss": 0.2691, + "step": 4155 + }, + { + "epoch": 0.6505948653725736, + "grad_norm": 1.0004853010177612, + "learning_rate": 7.030791788856305e-05, + "loss": 0.3386, + "step": 4156 + }, + { + "epoch": 0.6507514088916719, + "grad_norm": 0.9904922246932983, + "learning_rate": 7.029977191267514e-05, + "loss": 0.2186, + "step": 4157 + }, + { + "epoch": 0.6509079524107702, + "grad_norm": 1.5108505487442017, + "learning_rate": 7.029162593678723e-05, + "loss": 0.3394, + "step": 4158 + }, + { + "epoch": 0.6510644959298685, + "grad_norm": 0.6343823671340942, + "learning_rate": 7.028347996089932e-05, + "loss": 0.3577, + "step": 4159 + }, + { + "epoch": 0.6512210394489668, + "grad_norm": 1.0920754671096802, + "learning_rate": 7.02753339850114e-05, + "loss": 0.5832, + "step": 4160 + }, + { + "epoch": 0.6513775829680651, + "grad_norm": 0.7101654410362244, + "learning_rate": 7.02671880091235e-05, + "loss": 0.2122, + "step": 4161 + }, + { + "epoch": 0.6515341264871635, + "grad_norm": 4.906535625457764, + "learning_rate": 7.02590420332356e-05, + "loss": 0.6437, + "step": 4162 + }, + { + "epoch": 0.6516906700062617, + "grad_norm": 0.7401002049446106, + "learning_rate": 7.025089605734767e-05, + "loss": 0.2993, + "step": 4163 + }, + { + "epoch": 0.65184721352536, + "grad_norm": 1.3129963874816895, + "learning_rate": 7.024275008145976e-05, + "loss": 0.3525, + "step": 4164 + }, + { + "epoch": 0.6520037570444583, + "grad_norm": 0.9443928003311157, + "learning_rate": 7.023460410557186e-05, + "loss": 0.4767, + "step": 4165 + }, + { + "epoch": 0.6521603005635567, + "grad_norm": 1.64704167842865, + "learning_rate": 
7.022645812968393e-05, + "loss": 0.4991, + "step": 4166 + }, + { + "epoch": 0.652316844082655, + "grad_norm": 2.6924502849578857, + "learning_rate": 7.021831215379603e-05, + "loss": 0.4422, + "step": 4167 + }, + { + "epoch": 0.6524733876017533, + "grad_norm": 1.5548021793365479, + "learning_rate": 7.021016617790812e-05, + "loss": 0.6068, + "step": 4168 + }, + { + "epoch": 0.6526299311208515, + "grad_norm": 1.6948214769363403, + "learning_rate": 7.020202020202021e-05, + "loss": 0.6034, + "step": 4169 + }, + { + "epoch": 0.6527864746399499, + "grad_norm": 1.4500186443328857, + "learning_rate": 7.019387422613229e-05, + "loss": 0.2455, + "step": 4170 + }, + { + "epoch": 0.6529430181590482, + "grad_norm": 1.5253585577011108, + "learning_rate": 7.018572825024439e-05, + "loss": 0.4831, + "step": 4171 + }, + { + "epoch": 0.6530995616781465, + "grad_norm": 1.2881150245666504, + "learning_rate": 7.017758227435647e-05, + "loss": 0.4052, + "step": 4172 + }, + { + "epoch": 0.6532561051972449, + "grad_norm": 3.631683349609375, + "learning_rate": 7.016943629846856e-05, + "loss": 0.5277, + "step": 4173 + }, + { + "epoch": 0.6534126487163432, + "grad_norm": 2.1267573833465576, + "learning_rate": 7.016129032258065e-05, + "loss": 0.7282, + "step": 4174 + }, + { + "epoch": 0.6535691922354414, + "grad_norm": 1.4850146770477295, + "learning_rate": 7.015314434669274e-05, + "loss": 0.5354, + "step": 4175 + }, + { + "epoch": 0.6537257357545397, + "grad_norm": 2.13199782371521, + "learning_rate": 7.014499837080482e-05, + "loss": 0.2997, + "step": 4176 + }, + { + "epoch": 0.6538822792736381, + "grad_norm": 2.6171281337738037, + "learning_rate": 7.013685239491692e-05, + "loss": 0.7327, + "step": 4177 + }, + { + "epoch": 0.6540388227927364, + "grad_norm": 2.846111536026001, + "learning_rate": 7.0128706419029e-05, + "loss": 0.6076, + "step": 4178 + }, + { + "epoch": 0.6541953663118347, + "grad_norm": 3.0018086433410645, + "learning_rate": 7.012056044314109e-05, + "loss": 0.7293, + "step": 4179 
+ }, + { + "epoch": 0.654351909830933, + "grad_norm": 2.646679401397705, + "learning_rate": 7.011241446725318e-05, + "loss": 0.7725, + "step": 4180 + }, + { + "epoch": 0.6545084533500313, + "grad_norm": 3.4814083576202393, + "learning_rate": 7.010426849136527e-05, + "loss": 1.1618, + "step": 4181 + }, + { + "epoch": 0.6546649968691296, + "grad_norm": 2.606968879699707, + "learning_rate": 7.009612251547736e-05, + "loss": 0.7895, + "step": 4182 + }, + { + "epoch": 0.6548215403882279, + "grad_norm": 2.2563066482543945, + "learning_rate": 7.008797653958945e-05, + "loss": 0.7727, + "step": 4183 + }, + { + "epoch": 0.6549780839073263, + "grad_norm": 3.0285520553588867, + "learning_rate": 7.007983056370153e-05, + "loss": 0.5796, + "step": 4184 + }, + { + "epoch": 0.6551346274264246, + "grad_norm": 3.004166841506958, + "learning_rate": 7.007168458781363e-05, + "loss": 0.8773, + "step": 4185 + }, + { + "epoch": 0.6552911709455228, + "grad_norm": 2.3888113498687744, + "learning_rate": 7.006353861192571e-05, + "loss": 0.583, + "step": 4186 + }, + { + "epoch": 0.6554477144646211, + "grad_norm": 6.347829341888428, + "learning_rate": 7.00553926360378e-05, + "loss": 1.1064, + "step": 4187 + }, + { + "epoch": 0.6556042579837195, + "grad_norm": 3.2362918853759766, + "learning_rate": 7.00472466601499e-05, + "loss": 1.0853, + "step": 4188 + }, + { + "epoch": 0.6557608015028178, + "grad_norm": 2.8374903202056885, + "learning_rate": 7.003910068426198e-05, + "loss": 1.1816, + "step": 4189 + }, + { + "epoch": 0.6559173450219161, + "grad_norm": 2.2205324172973633, + "learning_rate": 7.003095470837406e-05, + "loss": 1.1542, + "step": 4190 + }, + { + "epoch": 0.6560738885410144, + "grad_norm": 6.552746772766113, + "learning_rate": 7.002280873248616e-05, + "loss": 1.5263, + "step": 4191 + }, + { + "epoch": 0.6562304320601127, + "grad_norm": 3.0984935760498047, + "learning_rate": 7.001466275659824e-05, + "loss": 0.9162, + "step": 4192 + }, + { + "epoch": 0.656386975579211, + "grad_norm": 
4.213404655456543, + "learning_rate": 7.000651678071033e-05, + "loss": 1.0294, + "step": 4193 + }, + { + "epoch": 0.6565435190983093, + "grad_norm": 5.382426738739014, + "learning_rate": 6.999837080482242e-05, + "loss": 0.9715, + "step": 4194 + }, + { + "epoch": 0.6567000626174077, + "grad_norm": 4.665666580200195, + "learning_rate": 6.999022482893451e-05, + "loss": 1.3871, + "step": 4195 + }, + { + "epoch": 0.656856606136506, + "grad_norm": 2.8839826583862305, + "learning_rate": 6.998207885304659e-05, + "loss": 0.5342, + "step": 4196 + }, + { + "epoch": 0.6570131496556043, + "grad_norm": 3.870335340499878, + "learning_rate": 6.997393287715869e-05, + "loss": 0.6429, + "step": 4197 + }, + { + "epoch": 0.6571696931747025, + "grad_norm": 2.675859212875366, + "learning_rate": 6.996578690127079e-05, + "loss": 0.8159, + "step": 4198 + }, + { + "epoch": 0.6573262366938009, + "grad_norm": 1.6190845966339111, + "learning_rate": 6.995764092538286e-05, + "loss": 0.4591, + "step": 4199 + }, + { + "epoch": 0.6574827802128992, + "grad_norm": 5.749204635620117, + "learning_rate": 6.994949494949495e-05, + "loss": 1.3905, + "step": 4200 + }, + { + "epoch": 0.6576393237319975, + "grad_norm": 0.6186854839324951, + "learning_rate": 6.994134897360705e-05, + "loss": 0.2177, + "step": 4201 + }, + { + "epoch": 0.6577958672510958, + "grad_norm": 0.803088903427124, + "learning_rate": 6.993320299771912e-05, + "loss": 0.309, + "step": 4202 + }, + { + "epoch": 0.6579524107701941, + "grad_norm": 0.7830495238304138, + "learning_rate": 6.992505702183122e-05, + "loss": 0.2821, + "step": 4203 + }, + { + "epoch": 0.6581089542892924, + "grad_norm": 0.5155359506607056, + "learning_rate": 6.991691104594332e-05, + "loss": 0.24, + "step": 4204 + }, + { + "epoch": 0.6582654978083907, + "grad_norm": 0.7666541337966919, + "learning_rate": 6.99087650700554e-05, + "loss": 0.3088, + "step": 4205 + }, + { + "epoch": 0.658422041327489, + "grad_norm": 0.8050822615623474, + "learning_rate": 6.990061909416748e-05, 
+ "loss": 0.3652, + "step": 4206 + }, + { + "epoch": 0.6585785848465874, + "grad_norm": 0.7881529927253723, + "learning_rate": 6.989247311827958e-05, + "loss": 0.3282, + "step": 4207 + }, + { + "epoch": 0.6587351283656857, + "grad_norm": 0.9218533635139465, + "learning_rate": 6.988432714239166e-05, + "loss": 0.3247, + "step": 4208 + }, + { + "epoch": 0.6588916718847839, + "grad_norm": 0.7466624975204468, + "learning_rate": 6.987618116650375e-05, + "loss": 0.3511, + "step": 4209 + }, + { + "epoch": 0.6590482154038823, + "grad_norm": 8.123543739318848, + "learning_rate": 6.986803519061585e-05, + "loss": 0.6356, + "step": 4210 + }, + { + "epoch": 0.6592047589229806, + "grad_norm": 10.217103958129883, + "learning_rate": 6.985988921472793e-05, + "loss": 1.3898, + "step": 4211 + }, + { + "epoch": 0.6593613024420789, + "grad_norm": 1.5522810220718384, + "learning_rate": 6.985174323884001e-05, + "loss": 0.4616, + "step": 4212 + }, + { + "epoch": 0.6595178459611772, + "grad_norm": 1.725705623626709, + "learning_rate": 6.984359726295211e-05, + "loss": 0.7324, + "step": 4213 + }, + { + "epoch": 0.6596743894802756, + "grad_norm": 2.657944917678833, + "learning_rate": 6.98354512870642e-05, + "loss": 0.6474, + "step": 4214 + }, + { + "epoch": 0.6598309329993738, + "grad_norm": 1.4123575687408447, + "learning_rate": 6.982730531117628e-05, + "loss": 0.3883, + "step": 4215 + }, + { + "epoch": 0.6599874765184721, + "grad_norm": 2.1162705421447754, + "learning_rate": 6.981915933528838e-05, + "loss": 0.6398, + "step": 4216 + }, + { + "epoch": 0.6601440200375704, + "grad_norm": 1.8174790143966675, + "learning_rate": 6.981101335940046e-05, + "loss": 0.3582, + "step": 4217 + }, + { + "epoch": 0.6603005635566688, + "grad_norm": 1.3173619508743286, + "learning_rate": 6.980286738351254e-05, + "loss": 0.3945, + "step": 4218 + }, + { + "epoch": 0.6604571070757671, + "grad_norm": 1.5002254247665405, + "learning_rate": 6.979472140762464e-05, + "loss": 0.8039, + "step": 4219 + }, + { + "epoch": 
0.6606136505948653, + "grad_norm": 1.3716272115707397, + "learning_rate": 6.978657543173672e-05, + "loss": 0.4376, + "step": 4220 + }, + { + "epoch": 0.6607701941139636, + "grad_norm": 3.6601369380950928, + "learning_rate": 6.977842945584882e-05, + "loss": 0.9224, + "step": 4221 + }, + { + "epoch": 0.660926737633062, + "grad_norm": 2.487698793411255, + "learning_rate": 6.97702834799609e-05, + "loss": 0.7298, + "step": 4222 + }, + { + "epoch": 0.6610832811521603, + "grad_norm": 1.2544103860855103, + "learning_rate": 6.976213750407299e-05, + "loss": 0.5203, + "step": 4223 + }, + { + "epoch": 0.6612398246712586, + "grad_norm": 1.662057638168335, + "learning_rate": 6.975399152818509e-05, + "loss": 0.6395, + "step": 4224 + }, + { + "epoch": 0.661396368190357, + "grad_norm": 2.5482735633850098, + "learning_rate": 6.974584555229717e-05, + "loss": 0.7496, + "step": 4225 + }, + { + "epoch": 0.6615529117094552, + "grad_norm": 2.6809940338134766, + "learning_rate": 6.973769957640925e-05, + "loss": 0.7502, + "step": 4226 + }, + { + "epoch": 0.6617094552285535, + "grad_norm": 1.9213640689849854, + "learning_rate": 6.972955360052135e-05, + "loss": 0.6887, + "step": 4227 + }, + { + "epoch": 0.6618659987476518, + "grad_norm": 3.0842068195343018, + "learning_rate": 6.972140762463343e-05, + "loss": 0.6185, + "step": 4228 + }, + { + "epoch": 0.6620225422667502, + "grad_norm": 1.6545064449310303, + "learning_rate": 6.971326164874552e-05, + "loss": 0.6091, + "step": 4229 + }, + { + "epoch": 0.6621790857858485, + "grad_norm": 2.1216650009155273, + "learning_rate": 6.970511567285762e-05, + "loss": 0.7895, + "step": 4230 + }, + { + "epoch": 0.6623356293049468, + "grad_norm": 4.363924026489258, + "learning_rate": 6.96969696969697e-05, + "loss": 0.8183, + "step": 4231 + }, + { + "epoch": 0.662492172824045, + "grad_norm": 2.2277867794036865, + "learning_rate": 6.968882372108178e-05, + "loss": 0.7261, + "step": 4232 + }, + { + "epoch": 0.6626487163431434, + "grad_norm": 3.7462992668151855, + 
"learning_rate": 6.968067774519388e-05, + "loss": 0.7464, + "step": 4233 + }, + { + "epoch": 0.6628052598622417, + "grad_norm": 2.7952945232391357, + "learning_rate": 6.967253176930598e-05, + "loss": 0.5303, + "step": 4234 + }, + { + "epoch": 0.66296180338134, + "grad_norm": 2.008056879043579, + "learning_rate": 6.966438579341805e-05, + "loss": 0.9462, + "step": 4235 + }, + { + "epoch": 0.6631183469004384, + "grad_norm": 4.011176109313965, + "learning_rate": 6.965623981753014e-05, + "loss": 1.1206, + "step": 4236 + }, + { + "epoch": 0.6632748904195366, + "grad_norm": 4.135866165161133, + "learning_rate": 6.964809384164224e-05, + "loss": 1.2526, + "step": 4237 + }, + { + "epoch": 0.6634314339386349, + "grad_norm": 6.294758319854736, + "learning_rate": 6.963994786575431e-05, + "loss": 1.0882, + "step": 4238 + }, + { + "epoch": 0.6635879774577332, + "grad_norm": 2.060603618621826, + "learning_rate": 6.963180188986641e-05, + "loss": 0.9881, + "step": 4239 + }, + { + "epoch": 0.6637445209768316, + "grad_norm": 4.447430610656738, + "learning_rate": 6.962365591397851e-05, + "loss": 1.5042, + "step": 4240 + }, + { + "epoch": 0.6639010644959299, + "grad_norm": 5.796322345733643, + "learning_rate": 6.961550993809058e-05, + "loss": 1.4759, + "step": 4241 + }, + { + "epoch": 0.6640576080150282, + "grad_norm": 4.124522686004639, + "learning_rate": 6.960736396220267e-05, + "loss": 1.1296, + "step": 4242 + }, + { + "epoch": 0.6642141515341264, + "grad_norm": 1.4639137983322144, + "learning_rate": 6.959921798631477e-05, + "loss": 0.8298, + "step": 4243 + }, + { + "epoch": 0.6643706950532248, + "grad_norm": 3.9577112197875977, + "learning_rate": 6.959107201042686e-05, + "loss": 1.1781, + "step": 4244 + }, + { + "epoch": 0.6645272385723231, + "grad_norm": 6.210996627807617, + "learning_rate": 6.958292603453894e-05, + "loss": 1.4259, + "step": 4245 + }, + { + "epoch": 0.6646837820914214, + "grad_norm": 3.4087557792663574, + "learning_rate": 6.957478005865104e-05, + "loss": 0.6318, + 
"step": 4246 + }, + { + "epoch": 0.6648403256105198, + "grad_norm": 3.9765408039093018, + "learning_rate": 6.956663408276312e-05, + "loss": 0.7294, + "step": 4247 + }, + { + "epoch": 0.6649968691296181, + "grad_norm": 2.7956151962280273, + "learning_rate": 6.95584881068752e-05, + "loss": 0.8746, + "step": 4248 + }, + { + "epoch": 0.6651534126487163, + "grad_norm": 1.4452195167541504, + "learning_rate": 6.95503421309873e-05, + "loss": 0.4648, + "step": 4249 + }, + { + "epoch": 0.6653099561678146, + "grad_norm": 3.5288681983947754, + "learning_rate": 6.954219615509939e-05, + "loss": 0.971, + "step": 4250 + }, + { + "epoch": 0.665466499686913, + "grad_norm": 0.7570310831069946, + "learning_rate": 6.953405017921147e-05, + "loss": 0.2786, + "step": 4251 + }, + { + "epoch": 0.6656230432060113, + "grad_norm": 0.6884904503822327, + "learning_rate": 6.952590420332357e-05, + "loss": 0.2499, + "step": 4252 + }, + { + "epoch": 0.6657795867251096, + "grad_norm": 0.6780734062194824, + "learning_rate": 6.951775822743565e-05, + "loss": 0.2747, + "step": 4253 + }, + { + "epoch": 0.6659361302442078, + "grad_norm": 0.4654354453086853, + "learning_rate": 6.950961225154773e-05, + "loss": 0.216, + "step": 4254 + }, + { + "epoch": 0.6660926737633062, + "grad_norm": 1.2301207780838013, + "learning_rate": 6.950146627565983e-05, + "loss": 0.2754, + "step": 4255 + }, + { + "epoch": 0.6662492172824045, + "grad_norm": 1.2137277126312256, + "learning_rate": 6.949332029977191e-05, + "loss": 0.3767, + "step": 4256 + }, + { + "epoch": 0.6664057608015028, + "grad_norm": 0.9222862720489502, + "learning_rate": 6.948517432388401e-05, + "loss": 0.4986, + "step": 4257 + }, + { + "epoch": 0.6665623043206012, + "grad_norm": 1.2736104726791382, + "learning_rate": 6.94770283479961e-05, + "loss": 0.3517, + "step": 4258 + }, + { + "epoch": 0.6667188478396995, + "grad_norm": 1.0065364837646484, + "learning_rate": 6.946888237210818e-05, + "loss": 0.3003, + "step": 4259 + }, + { + "epoch": 0.6668753913587977, + 
"grad_norm": 1.100253701210022, + "learning_rate": 6.946073639622028e-05, + "loss": 0.2663, + "step": 4260 + }, + { + "epoch": 0.667031934877896, + "grad_norm": 0.7247801423072815, + "learning_rate": 6.945259042033236e-05, + "loss": 0.3416, + "step": 4261 + }, + { + "epoch": 0.6671884783969944, + "grad_norm": 0.6895297169685364, + "learning_rate": 6.944444444444444e-05, + "loss": 0.3578, + "step": 4262 + }, + { + "epoch": 0.6673450219160927, + "grad_norm": 1.4111422300338745, + "learning_rate": 6.943629846855654e-05, + "loss": 0.4387, + "step": 4263 + }, + { + "epoch": 0.667501565435191, + "grad_norm": 1.4125404357910156, + "learning_rate": 6.942815249266863e-05, + "loss": 0.3475, + "step": 4264 + }, + { + "epoch": 0.6676581089542893, + "grad_norm": 2.1171553134918213, + "learning_rate": 6.942000651678071e-05, + "loss": 0.4414, + "step": 4265 + }, + { + "epoch": 0.6678146524733876, + "grad_norm": 1.183144450187683, + "learning_rate": 6.94118605408928e-05, + "loss": 0.4492, + "step": 4266 + }, + { + "epoch": 0.6679711959924859, + "grad_norm": 1.8935967683792114, + "learning_rate": 6.940371456500489e-05, + "loss": 0.396, + "step": 4267 + }, + { + "epoch": 0.6681277395115842, + "grad_norm": 1.4636279344558716, + "learning_rate": 6.939556858911697e-05, + "loss": 0.4949, + "step": 4268 + }, + { + "epoch": 0.6682842830306825, + "grad_norm": 2.9190914630889893, + "learning_rate": 6.938742261322907e-05, + "loss": 0.4514, + "step": 4269 + }, + { + "epoch": 0.6684408265497809, + "grad_norm": 1.2088713645935059, + "learning_rate": 6.937927663734116e-05, + "loss": 0.4041, + "step": 4270 + }, + { + "epoch": 0.6685973700688791, + "grad_norm": 2.0570266246795654, + "learning_rate": 6.937113066145324e-05, + "loss": 0.5784, + "step": 4271 + }, + { + "epoch": 0.6687539135879774, + "grad_norm": 1.5934492349624634, + "learning_rate": 6.936298468556534e-05, + "loss": 0.4386, + "step": 4272 + }, + { + "epoch": 0.6689104571070758, + "grad_norm": 3.532195806503296, + "learning_rate": 
6.935483870967743e-05, + "loss": 0.7824, + "step": 4273 + }, + { + "epoch": 0.6690670006261741, + "grad_norm": 2.2653942108154297, + "learning_rate": 6.93466927337895e-05, + "loss": 0.4835, + "step": 4274 + }, + { + "epoch": 0.6692235441452724, + "grad_norm": 1.8834033012390137, + "learning_rate": 6.93385467579016e-05, + "loss": 0.6769, + "step": 4275 + }, + { + "epoch": 0.6693800876643707, + "grad_norm": 0.9919312596321106, + "learning_rate": 6.93304007820137e-05, + "loss": 0.5919, + "step": 4276 + }, + { + "epoch": 0.669536631183469, + "grad_norm": 1.9623829126358032, + "learning_rate": 6.932225480612577e-05, + "loss": 0.5857, + "step": 4277 + }, + { + "epoch": 0.6696931747025673, + "grad_norm": 2.074381113052368, + "learning_rate": 6.931410883023787e-05, + "loss": 0.8534, + "step": 4278 + }, + { + "epoch": 0.6698497182216656, + "grad_norm": 1.709227204322815, + "learning_rate": 6.930596285434996e-05, + "loss": 0.6892, + "step": 4279 + }, + { + "epoch": 0.6700062617407639, + "grad_norm": 3.339108943939209, + "learning_rate": 6.929781687846205e-05, + "loss": 0.6428, + "step": 4280 + }, + { + "epoch": 0.6701628052598623, + "grad_norm": 2.4149675369262695, + "learning_rate": 6.928967090257413e-05, + "loss": 0.8007, + "step": 4281 + }, + { + "epoch": 0.6703193487789606, + "grad_norm": 3.3626890182495117, + "learning_rate": 6.928152492668623e-05, + "loss": 0.8315, + "step": 4282 + }, + { + "epoch": 0.6704758922980588, + "grad_norm": 3.421877145767212, + "learning_rate": 6.927337895079831e-05, + "loss": 1.1989, + "step": 4283 + }, + { + "epoch": 0.6706324358171571, + "grad_norm": 2.0584561824798584, + "learning_rate": 6.92652329749104e-05, + "loss": 0.8985, + "step": 4284 + }, + { + "epoch": 0.6707889793362555, + "grad_norm": 2.0099194049835205, + "learning_rate": 6.925708699902249e-05, + "loss": 0.8543, + "step": 4285 + }, + { + "epoch": 0.6709455228553538, + "grad_norm": 6.092891216278076, + "learning_rate": 6.924894102313458e-05, + "loss": 0.6615, + "step": 4286 + 
}, + { + "epoch": 0.6711020663744521, + "grad_norm": 5.342296123504639, + "learning_rate": 6.924079504724666e-05, + "loss": 0.9918, + "step": 4287 + }, + { + "epoch": 0.6712586098935505, + "grad_norm": 3.257554769515991, + "learning_rate": 6.923264907135876e-05, + "loss": 1.3803, + "step": 4288 + }, + { + "epoch": 0.6714151534126487, + "grad_norm": 9.83000659942627, + "learning_rate": 6.922450309547084e-05, + "loss": 1.0254, + "step": 4289 + }, + { + "epoch": 0.671571696931747, + "grad_norm": 4.220418453216553, + "learning_rate": 6.921635711958292e-05, + "loss": 0.7492, + "step": 4290 + }, + { + "epoch": 0.6717282404508453, + "grad_norm": 5.1329345703125, + "learning_rate": 6.920821114369502e-05, + "loss": 1.2727, + "step": 4291 + }, + { + "epoch": 0.6718847839699437, + "grad_norm": 5.4828362464904785, + "learning_rate": 6.92000651678071e-05, + "loss": 1.2356, + "step": 4292 + }, + { + "epoch": 0.672041327489042, + "grad_norm": 9.952239990234375, + "learning_rate": 6.91919191919192e-05, + "loss": 1.6404, + "step": 4293 + }, + { + "epoch": 0.6721978710081402, + "grad_norm": 3.7232704162597656, + "learning_rate": 6.918377321603129e-05, + "loss": 1.7315, + "step": 4294 + }, + { + "epoch": 0.6723544145272385, + "grad_norm": 3.5613460540771484, + "learning_rate": 6.917562724014337e-05, + "loss": 1.3143, + "step": 4295 + }, + { + "epoch": 0.6725109580463369, + "grad_norm": 2.5666444301605225, + "learning_rate": 6.916748126425547e-05, + "loss": 0.8609, + "step": 4296 + }, + { + "epoch": 0.6726675015654352, + "grad_norm": 3.474299669265747, + "learning_rate": 6.915933528836755e-05, + "loss": 1.2806, + "step": 4297 + }, + { + "epoch": 0.6728240450845335, + "grad_norm": 3.095768451690674, + "learning_rate": 6.915118931247964e-05, + "loss": 0.5334, + "step": 4298 + }, + { + "epoch": 0.6729805886036319, + "grad_norm": 4.49686336517334, + "learning_rate": 6.914304333659173e-05, + "loss": 1.0956, + "step": 4299 + }, + { + "epoch": 0.6731371321227301, + "grad_norm": 
3.003765821456909, + "learning_rate": 6.913489736070382e-05, + "loss": 0.6559, + "step": 4300 + }, + { + "epoch": 0.6732936756418284, + "grad_norm": 0.645232617855072, + "learning_rate": 6.91267513848159e-05, + "loss": 0.3602, + "step": 4301 + }, + { + "epoch": 0.6734502191609267, + "grad_norm": 0.5092976689338684, + "learning_rate": 6.9118605408928e-05, + "loss": 0.2512, + "step": 4302 + }, + { + "epoch": 0.6736067626800251, + "grad_norm": 1.594048261642456, + "learning_rate": 6.911045943304008e-05, + "loss": 0.4109, + "step": 4303 + }, + { + "epoch": 0.6737633061991234, + "grad_norm": 0.9327597618103027, + "learning_rate": 6.910231345715217e-05, + "loss": 0.3397, + "step": 4304 + }, + { + "epoch": 0.6739198497182217, + "grad_norm": 0.6653254628181458, + "learning_rate": 6.909416748126426e-05, + "loss": 0.4492, + "step": 4305 + }, + { + "epoch": 0.6740763932373199, + "grad_norm": 0.6242194175720215, + "learning_rate": 6.908602150537635e-05, + "loss": 0.3129, + "step": 4306 + }, + { + "epoch": 0.6742329367564183, + "grad_norm": 0.7211905121803284, + "learning_rate": 6.907787552948843e-05, + "loss": 0.4965, + "step": 4307 + }, + { + "epoch": 0.6743894802755166, + "grad_norm": 1.0577666759490967, + "learning_rate": 6.906972955360053e-05, + "loss": 0.329, + "step": 4308 + }, + { + "epoch": 0.6745460237946149, + "grad_norm": 1.1194297075271606, + "learning_rate": 6.906158357771262e-05, + "loss": 0.3709, + "step": 4309 + }, + { + "epoch": 0.6747025673137133, + "grad_norm": 1.0689432621002197, + "learning_rate": 6.90534376018247e-05, + "loss": 0.4792, + "step": 4310 + }, + { + "epoch": 0.6748591108328115, + "grad_norm": 1.0708930492401123, + "learning_rate": 6.904529162593679e-05, + "loss": 0.2873, + "step": 4311 + }, + { + "epoch": 0.6750156543519098, + "grad_norm": 1.0569963455200195, + "learning_rate": 6.903714565004889e-05, + "loss": 0.3806, + "step": 4312 + }, + { + "epoch": 0.6751721978710081, + "grad_norm": 1.1316843032836914, + "learning_rate": 
6.902899967416096e-05, + "loss": 0.3218, + "step": 4313 + }, + { + "epoch": 0.6753287413901065, + "grad_norm": 2.121037006378174, + "learning_rate": 6.902085369827306e-05, + "loss": 0.4747, + "step": 4314 + }, + { + "epoch": 0.6754852849092048, + "grad_norm": 1.012650489807129, + "learning_rate": 6.901270772238515e-05, + "loss": 0.4852, + "step": 4315 + }, + { + "epoch": 0.6756418284283031, + "grad_norm": 1.6196757555007935, + "learning_rate": 6.900456174649724e-05, + "loss": 0.6095, + "step": 4316 + }, + { + "epoch": 0.6757983719474013, + "grad_norm": 1.3816664218902588, + "learning_rate": 6.899641577060932e-05, + "loss": 0.3425, + "step": 4317 + }, + { + "epoch": 0.6759549154664997, + "grad_norm": 1.2530063390731812, + "learning_rate": 6.898826979472142e-05, + "loss": 0.4393, + "step": 4318 + }, + { + "epoch": 0.676111458985598, + "grad_norm": 1.627371907234192, + "learning_rate": 6.89801238188335e-05, + "loss": 0.4605, + "step": 4319 + }, + { + "epoch": 0.6762680025046963, + "grad_norm": 1.9170197248458862, + "learning_rate": 6.897197784294559e-05, + "loss": 0.6439, + "step": 4320 + }, + { + "epoch": 0.6764245460237946, + "grad_norm": 1.3550236225128174, + "learning_rate": 6.896383186705768e-05, + "loss": 0.7575, + "step": 4321 + }, + { + "epoch": 0.676581089542893, + "grad_norm": 1.5622198581695557, + "learning_rate": 6.895568589116977e-05, + "loss": 0.5264, + "step": 4322 + }, + { + "epoch": 0.6767376330619912, + "grad_norm": 1.1554995775222778, + "learning_rate": 6.894753991528185e-05, + "loss": 0.5289, + "step": 4323 + }, + { + "epoch": 0.6768941765810895, + "grad_norm": 4.639176845550537, + "learning_rate": 6.893939393939395e-05, + "loss": 1.1424, + "step": 4324 + }, + { + "epoch": 0.6770507201001879, + "grad_norm": 3.0524396896362305, + "learning_rate": 6.893124796350603e-05, + "loss": 0.7693, + "step": 4325 + }, + { + "epoch": 0.6772072636192862, + "grad_norm": 2.1831419467926025, + "learning_rate": 6.892310198761812e-05, + "loss": 0.6102, + "step": 4326 
+ }, + { + "epoch": 0.6773638071383845, + "grad_norm": 2.5035552978515625, + "learning_rate": 6.891495601173021e-05, + "loss": 0.6621, + "step": 4327 + }, + { + "epoch": 0.6775203506574827, + "grad_norm": 2.195643424987793, + "learning_rate": 6.89068100358423e-05, + "loss": 0.4422, + "step": 4328 + }, + { + "epoch": 0.6776768941765811, + "grad_norm": 4.309935092926025, + "learning_rate": 6.889866405995438e-05, + "loss": 0.6306, + "step": 4329 + }, + { + "epoch": 0.6778334376956794, + "grad_norm": 3.546494245529175, + "learning_rate": 6.889051808406648e-05, + "loss": 0.6772, + "step": 4330 + }, + { + "epoch": 0.6779899812147777, + "grad_norm": 2.84580659866333, + "learning_rate": 6.888237210817856e-05, + "loss": 0.8346, + "step": 4331 + }, + { + "epoch": 0.678146524733876, + "grad_norm": 1.9011963605880737, + "learning_rate": 6.887422613229066e-05, + "loss": 0.8935, + "step": 4332 + }, + { + "epoch": 0.6783030682529744, + "grad_norm": 11.29277515411377, + "learning_rate": 6.886608015640274e-05, + "loss": 1.2814, + "step": 4333 + }, + { + "epoch": 0.6784596117720726, + "grad_norm": 3.0307843685150146, + "learning_rate": 6.885793418051483e-05, + "loss": 0.6881, + "step": 4334 + }, + { + "epoch": 0.6786161552911709, + "grad_norm": 2.247396230697632, + "learning_rate": 6.884978820462692e-05, + "loss": 1.015, + "step": 4335 + }, + { + "epoch": 0.6787726988102692, + "grad_norm": 2.3274593353271484, + "learning_rate": 6.884164222873901e-05, + "loss": 1.0641, + "step": 4336 + }, + { + "epoch": 0.6789292423293676, + "grad_norm": 4.236484527587891, + "learning_rate": 6.883349625285109e-05, + "loss": 1.3734, + "step": 4337 + }, + { + "epoch": 0.6790857858484659, + "grad_norm": 8.191431999206543, + "learning_rate": 6.882535027696319e-05, + "loss": 1.5951, + "step": 4338 + }, + { + "epoch": 0.6792423293675642, + "grad_norm": 1.8858083486557007, + "learning_rate": 6.881720430107527e-05, + "loss": 0.9457, + "step": 4339 + }, + { + "epoch": 0.6793988728866625, + "grad_norm": 
3.796680450439453, + "learning_rate": 6.880905832518736e-05, + "loss": 1.5379, + "step": 4340 + }, + { + "epoch": 0.6795554164057608, + "grad_norm": 2.5891528129577637, + "learning_rate": 6.880091234929945e-05, + "loss": 0.7877, + "step": 4341 + }, + { + "epoch": 0.6797119599248591, + "grad_norm": 4.140232086181641, + "learning_rate": 6.879276637341154e-05, + "loss": 1.1173, + "step": 4342 + }, + { + "epoch": 0.6798685034439574, + "grad_norm": 2.4479730129241943, + "learning_rate": 6.878462039752362e-05, + "loss": 1.04, + "step": 4343 + }, + { + "epoch": 0.6800250469630558, + "grad_norm": 3.1244750022888184, + "learning_rate": 6.877647442163572e-05, + "loss": 1.3103, + "step": 4344 + }, + { + "epoch": 0.680181590482154, + "grad_norm": 1.662410855293274, + "learning_rate": 6.876832844574782e-05, + "loss": 0.9875, + "step": 4345 + }, + { + "epoch": 0.6803381340012523, + "grad_norm": 3.203068971633911, + "learning_rate": 6.876018246985989e-05, + "loss": 0.6822, + "step": 4346 + }, + { + "epoch": 0.6804946775203506, + "grad_norm": 2.7868893146514893, + "learning_rate": 6.875203649397198e-05, + "loss": 0.7676, + "step": 4347 + }, + { + "epoch": 0.680651221039449, + "grad_norm": 6.211584091186523, + "learning_rate": 6.874389051808407e-05, + "loss": 0.8282, + "step": 4348 + }, + { + "epoch": 0.6808077645585473, + "grad_norm": 2.3865396976470947, + "learning_rate": 6.873574454219615e-05, + "loss": 0.9206, + "step": 4349 + }, + { + "epoch": 0.6809643080776456, + "grad_norm": 2.5000975131988525, + "learning_rate": 6.872759856630825e-05, + "loss": 1.1784, + "step": 4350 + }, + { + "epoch": 0.6811208515967438, + "grad_norm": 0.559771716594696, + "learning_rate": 6.871945259042033e-05, + "loss": 0.284, + "step": 4351 + }, + { + "epoch": 0.6812773951158422, + "grad_norm": 0.7753599286079407, + "learning_rate": 6.871130661453243e-05, + "loss": 0.322, + "step": 4352 + }, + { + "epoch": 0.6814339386349405, + "grad_norm": 0.5816391110420227, + "learning_rate": 6.870316063864451e-05, 
+ "loss": 0.3077, + "step": 4353 + }, + { + "epoch": 0.6815904821540388, + "grad_norm": 0.873810887336731, + "learning_rate": 6.86950146627566e-05, + "loss": 0.3599, + "step": 4354 + }, + { + "epoch": 0.6817470256731372, + "grad_norm": 0.5738383531570435, + "learning_rate": 6.86868686868687e-05, + "loss": 0.2749, + "step": 4355 + }, + { + "epoch": 0.6819035691922355, + "grad_norm": 0.9451320767402649, + "learning_rate": 6.867872271098078e-05, + "loss": 0.3553, + "step": 4356 + }, + { + "epoch": 0.6820601127113337, + "grad_norm": 1.486505150794983, + "learning_rate": 6.867057673509286e-05, + "loss": 0.5524, + "step": 4357 + }, + { + "epoch": 0.682216656230432, + "grad_norm": 1.0109913349151611, + "learning_rate": 6.866243075920496e-05, + "loss": 0.3914, + "step": 4358 + }, + { + "epoch": 0.6823731997495304, + "grad_norm": 1.04562246799469, + "learning_rate": 6.865428478331704e-05, + "loss": 0.4122, + "step": 4359 + }, + { + "epoch": 0.6825297432686287, + "grad_norm": 1.0908540487289429, + "learning_rate": 6.864613880742913e-05, + "loss": 0.5042, + "step": 4360 + }, + { + "epoch": 0.682686286787727, + "grad_norm": 0.6894891262054443, + "learning_rate": 6.863799283154122e-05, + "loss": 0.2869, + "step": 4361 + }, + { + "epoch": 0.6828428303068252, + "grad_norm": 1.0430984497070312, + "learning_rate": 6.862984685565331e-05, + "loss": 0.2779, + "step": 4362 + }, + { + "epoch": 0.6829993738259236, + "grad_norm": 1.6871583461761475, + "learning_rate": 6.862170087976539e-05, + "loss": 0.6291, + "step": 4363 + }, + { + "epoch": 0.6831559173450219, + "grad_norm": 1.77274751663208, + "learning_rate": 6.861355490387749e-05, + "loss": 0.4144, + "step": 4364 + }, + { + "epoch": 0.6833124608641202, + "grad_norm": 1.032151460647583, + "learning_rate": 6.860540892798957e-05, + "loss": 0.2492, + "step": 4365 + }, + { + "epoch": 0.6834690043832186, + "grad_norm": 0.8621804118156433, + "learning_rate": 6.859726295210166e-05, + "loss": 0.378, + "step": 4366 + }, + { + "epoch": 
0.6836255479023169, + "grad_norm": 2.3414111137390137, + "learning_rate": 6.858911697621375e-05, + "loss": 0.4479, + "step": 4367 + }, + { + "epoch": 0.6837820914214151, + "grad_norm": 1.37322998046875, + "learning_rate": 6.858097100032585e-05, + "loss": 0.336, + "step": 4368 + }, + { + "epoch": 0.6839386349405134, + "grad_norm": 2.062911033630371, + "learning_rate": 6.857282502443792e-05, + "loss": 0.5655, + "step": 4369 + }, + { + "epoch": 0.6840951784596118, + "grad_norm": 1.526799201965332, + "learning_rate": 6.856467904855002e-05, + "loss": 0.615, + "step": 4370 + }, + { + "epoch": 0.6842517219787101, + "grad_norm": 4.701175689697266, + "learning_rate": 6.855653307266212e-05, + "loss": 0.457, + "step": 4371 + }, + { + "epoch": 0.6844082654978084, + "grad_norm": 2.760439157485962, + "learning_rate": 6.854838709677419e-05, + "loss": 0.9073, + "step": 4372 + }, + { + "epoch": 0.6845648090169068, + "grad_norm": 2.1794676780700684, + "learning_rate": 6.854024112088628e-05, + "loss": 0.4957, + "step": 4373 + }, + { + "epoch": 0.684721352536005, + "grad_norm": 1.6359556913375854, + "learning_rate": 6.853209514499838e-05, + "loss": 0.5357, + "step": 4374 + }, + { + "epoch": 0.6848778960551033, + "grad_norm": 2.903383255004883, + "learning_rate": 6.852394916911046e-05, + "loss": 0.7883, + "step": 4375 + }, + { + "epoch": 0.6850344395742016, + "grad_norm": 1.5516784191131592, + "learning_rate": 6.851580319322255e-05, + "loss": 0.4946, + "step": 4376 + }, + { + "epoch": 0.6851909830933, + "grad_norm": 2.9901552200317383, + "learning_rate": 6.850765721733464e-05, + "loss": 0.7756, + "step": 4377 + }, + { + "epoch": 0.6853475266123983, + "grad_norm": 2.444187879562378, + "learning_rate": 6.849951124144673e-05, + "loss": 0.7164, + "step": 4378 + }, + { + "epoch": 0.6855040701314965, + "grad_norm": 2.192251205444336, + "learning_rate": 6.849136526555881e-05, + "loss": 0.8779, + "step": 4379 + }, + { + "epoch": 0.6856606136505948, + "grad_norm": 1.842110514640808, + 
"learning_rate": 6.848321928967091e-05, + "loss": 0.7532, + "step": 4380 + }, + { + "epoch": 0.6858171571696932, + "grad_norm": 5.022533416748047, + "learning_rate": 6.8475073313783e-05, + "loss": 0.6663, + "step": 4381 + }, + { + "epoch": 0.6859737006887915, + "grad_norm": 1.9455078840255737, + "learning_rate": 6.846692733789508e-05, + "loss": 0.4562, + "step": 4382 + }, + { + "epoch": 0.6861302442078898, + "grad_norm": 5.739245414733887, + "learning_rate": 6.845878136200717e-05, + "loss": 0.8219, + "step": 4383 + }, + { + "epoch": 0.6862867877269881, + "grad_norm": 3.0788419246673584, + "learning_rate": 6.845063538611926e-05, + "loss": 1.5096, + "step": 4384 + }, + { + "epoch": 0.6864433312460864, + "grad_norm": 1.8186209201812744, + "learning_rate": 6.844248941023134e-05, + "loss": 0.9332, + "step": 4385 + }, + { + "epoch": 0.6865998747651847, + "grad_norm": 4.17464017868042, + "learning_rate": 6.843434343434344e-05, + "loss": 1.3202, + "step": 4386 + }, + { + "epoch": 0.686756418284283, + "grad_norm": 2.7175521850585938, + "learning_rate": 6.842619745845552e-05, + "loss": 1.1391, + "step": 4387 + }, + { + "epoch": 0.6869129618033814, + "grad_norm": 3.1879570484161377, + "learning_rate": 6.84180514825676e-05, + "loss": 1.0159, + "step": 4388 + }, + { + "epoch": 0.6870695053224797, + "grad_norm": 2.479628324508667, + "learning_rate": 6.84099055066797e-05, + "loss": 0.9552, + "step": 4389 + }, + { + "epoch": 0.687226048841578, + "grad_norm": 2.24845027923584, + "learning_rate": 6.840175953079179e-05, + "loss": 0.9136, + "step": 4390 + }, + { + "epoch": 0.6873825923606762, + "grad_norm": 4.3809356689453125, + "learning_rate": 6.839361355490389e-05, + "loss": 1.299, + "step": 4391 + }, + { + "epoch": 0.6875391358797746, + "grad_norm": 2.2518513202667236, + "learning_rate": 6.838546757901597e-05, + "loss": 0.9019, + "step": 4392 + }, + { + "epoch": 0.6876956793988729, + "grad_norm": 2.641254425048828, + "learning_rate": 6.837732160312805e-05, + "loss": 1.955, + 
"step": 4393 + }, + { + "epoch": 0.6878522229179712, + "grad_norm": 4.46509313583374, + "learning_rate": 6.836917562724015e-05, + "loss": 1.5609, + "step": 4394 + }, + { + "epoch": 0.6880087664370695, + "grad_norm": 3.2887706756591797, + "learning_rate": 6.836102965135223e-05, + "loss": 1.675, + "step": 4395 + }, + { + "epoch": 0.6881653099561679, + "grad_norm": 3.5958664417266846, + "learning_rate": 6.835288367546432e-05, + "loss": 1.4032, + "step": 4396 + }, + { + "epoch": 0.6883218534752661, + "grad_norm": 2.785684108734131, + "learning_rate": 6.834473769957641e-05, + "loss": 1.1909, + "step": 4397 + }, + { + "epoch": 0.6884783969943644, + "grad_norm": 4.321646690368652, + "learning_rate": 6.83365917236885e-05, + "loss": 1.1364, + "step": 4398 + }, + { + "epoch": 0.6886349405134627, + "grad_norm": 2.955415964126587, + "learning_rate": 6.832844574780058e-05, + "loss": 1.0573, + "step": 4399 + }, + { + "epoch": 0.6887914840325611, + "grad_norm": 3.135511875152588, + "learning_rate": 6.832029977191268e-05, + "loss": 1.0678, + "step": 4400 + }, + { + "epoch": 0.6889480275516594, + "grad_norm": 1.029984712600708, + "learning_rate": 6.831215379602476e-05, + "loss": 0.8348, + "step": 4401 + }, + { + "epoch": 0.6891045710707576, + "grad_norm": 0.518986165523529, + "learning_rate": 6.830400782013685e-05, + "loss": 0.2463, + "step": 4402 + }, + { + "epoch": 0.689261114589856, + "grad_norm": 0.653876543045044, + "learning_rate": 6.829586184424894e-05, + "loss": 0.2913, + "step": 4403 + }, + { + "epoch": 0.6894176581089543, + "grad_norm": 0.6357722878456116, + "learning_rate": 6.828771586836104e-05, + "loss": 0.2796, + "step": 4404 + }, + { + "epoch": 0.6895742016280526, + "grad_norm": 0.747806191444397, + "learning_rate": 6.827956989247311e-05, + "loss": 0.3402, + "step": 4405 + }, + { + "epoch": 0.6897307451471509, + "grad_norm": 0.5982475280761719, + "learning_rate": 6.827142391658521e-05, + "loss": 0.2425, + "step": 4406 + }, + { + "epoch": 0.6898872886662493, + 
"grad_norm": 0.8068994879722595, + "learning_rate": 6.82632779406973e-05, + "loss": 0.2713, + "step": 4407 + }, + { + "epoch": 0.6900438321853475, + "grad_norm": 0.5349702835083008, + "learning_rate": 6.825513196480938e-05, + "loss": 0.3472, + "step": 4408 + }, + { + "epoch": 0.6902003757044458, + "grad_norm": 1.0866934061050415, + "learning_rate": 6.824698598892147e-05, + "loss": 0.3804, + "step": 4409 + }, + { + "epoch": 0.6903569192235441, + "grad_norm": 0.8647403120994568, + "learning_rate": 6.823884001303357e-05, + "loss": 0.3135, + "step": 4410 + }, + { + "epoch": 0.6905134627426425, + "grad_norm": 0.8280303478240967, + "learning_rate": 6.823069403714565e-05, + "loss": 0.2794, + "step": 4411 + }, + { + "epoch": 0.6906700062617408, + "grad_norm": 1.5587530136108398, + "learning_rate": 6.822254806125774e-05, + "loss": 0.4043, + "step": 4412 + }, + { + "epoch": 0.6908265497808391, + "grad_norm": 1.2850236892700195, + "learning_rate": 6.821440208536984e-05, + "loss": 0.3733, + "step": 4413 + }, + { + "epoch": 0.6909830932999373, + "grad_norm": 0.9070258736610413, + "learning_rate": 6.820625610948192e-05, + "loss": 0.489, + "step": 4414 + }, + { + "epoch": 0.6911396368190357, + "grad_norm": 1.208214521408081, + "learning_rate": 6.8198110133594e-05, + "loss": 0.3106, + "step": 4415 + }, + { + "epoch": 0.691296180338134, + "grad_norm": 1.0790586471557617, + "learning_rate": 6.81899641577061e-05, + "loss": 0.4104, + "step": 4416 + }, + { + "epoch": 0.6914527238572323, + "grad_norm": 3.0541858673095703, + "learning_rate": 6.818181818181818e-05, + "loss": 0.7237, + "step": 4417 + }, + { + "epoch": 0.6916092673763307, + "grad_norm": 1.8948487043380737, + "learning_rate": 6.817367220593027e-05, + "loss": 0.3827, + "step": 4418 + }, + { + "epoch": 0.6917658108954289, + "grad_norm": 1.1310468912124634, + "learning_rate": 6.816552623004237e-05, + "loss": 0.4306, + "step": 4419 + }, + { + "epoch": 0.6919223544145272, + "grad_norm": 7.239919662475586, + "learning_rate": 
6.815738025415445e-05, + "loss": 0.7688, + "step": 4420 + }, + { + "epoch": 0.6920788979336255, + "grad_norm": 2.5930356979370117, + "learning_rate": 6.814923427826653e-05, + "loss": 0.4867, + "step": 4421 + }, + { + "epoch": 0.6922354414527239, + "grad_norm": 1.239784836769104, + "learning_rate": 6.814108830237863e-05, + "loss": 0.4145, + "step": 4422 + }, + { + "epoch": 0.6923919849718222, + "grad_norm": 2.8590307235717773, + "learning_rate": 6.813294232649071e-05, + "loss": 0.5741, + "step": 4423 + }, + { + "epoch": 0.6925485284909205, + "grad_norm": 3.7747175693511963, + "learning_rate": 6.81247963506028e-05, + "loss": 0.7803, + "step": 4424 + }, + { + "epoch": 0.6927050720100187, + "grad_norm": 1.6210392713546753, + "learning_rate": 6.81166503747149e-05, + "loss": 0.5538, + "step": 4425 + }, + { + "epoch": 0.6928616155291171, + "grad_norm": 3.506330728530884, + "learning_rate": 6.810850439882698e-05, + "loss": 0.849, + "step": 4426 + }, + { + "epoch": 0.6930181590482154, + "grad_norm": 3.2611281871795654, + "learning_rate": 6.810035842293908e-05, + "loss": 0.5791, + "step": 4427 + }, + { + "epoch": 0.6931747025673137, + "grad_norm": 3.5340349674224854, + "learning_rate": 6.809221244705116e-05, + "loss": 0.9848, + "step": 4428 + }, + { + "epoch": 0.6933312460864121, + "grad_norm": 3.0892205238342285, + "learning_rate": 6.808406647116324e-05, + "loss": 0.7161, + "step": 4429 + }, + { + "epoch": 0.6934877896055104, + "grad_norm": 4.5512495040893555, + "learning_rate": 6.807592049527534e-05, + "loss": 1.2493, + "step": 4430 + }, + { + "epoch": 0.6936443331246086, + "grad_norm": 3.3775227069854736, + "learning_rate": 6.806777451938742e-05, + "loss": 0.9464, + "step": 4431 + }, + { + "epoch": 0.6938008766437069, + "grad_norm": 3.389875888824463, + "learning_rate": 6.805962854349951e-05, + "loss": 0.8027, + "step": 4432 + }, + { + "epoch": 0.6939574201628053, + "grad_norm": 3.510012626647949, + "learning_rate": 6.80514825676116e-05, + "loss": 0.8086, + "step": 4433 + 
}, + { + "epoch": 0.6941139636819036, + "grad_norm": 4.3926215171813965, + "learning_rate": 6.804333659172369e-05, + "loss": 1.3159, + "step": 4434 + }, + { + "epoch": 0.6942705072010019, + "grad_norm": 2.127270221710205, + "learning_rate": 6.803519061583577e-05, + "loss": 1.2118, + "step": 4435 + }, + { + "epoch": 0.6944270507201001, + "grad_norm": 2.821136713027954, + "learning_rate": 6.802704463994787e-05, + "loss": 1.2124, + "step": 4436 + }, + { + "epoch": 0.6945835942391985, + "grad_norm": 3.26589298248291, + "learning_rate": 6.801889866405995e-05, + "loss": 1.1774, + "step": 4437 + }, + { + "epoch": 0.6947401377582968, + "grad_norm": 3.477168083190918, + "learning_rate": 6.801075268817204e-05, + "loss": 0.9347, + "step": 4438 + }, + { + "epoch": 0.6948966812773951, + "grad_norm": 3.090838670730591, + "learning_rate": 6.800260671228414e-05, + "loss": 0.6648, + "step": 4439 + }, + { + "epoch": 0.6950532247964935, + "grad_norm": 2.557313919067383, + "learning_rate": 6.799446073639623e-05, + "loss": 1.0562, + "step": 4440 + }, + { + "epoch": 0.6952097683155918, + "grad_norm": 2.9074196815490723, + "learning_rate": 6.79863147605083e-05, + "loss": 1.1979, + "step": 4441 + }, + { + "epoch": 0.69536631183469, + "grad_norm": 3.0708723068237305, + "learning_rate": 6.79781687846204e-05, + "loss": 1.1697, + "step": 4442 + }, + { + "epoch": 0.6955228553537883, + "grad_norm": 2.5219247341156006, + "learning_rate": 6.79700228087325e-05, + "loss": 1.4209, + "step": 4443 + }, + { + "epoch": 0.6956793988728867, + "grad_norm": 2.6390976905822754, + "learning_rate": 6.796187683284457e-05, + "loss": 1.564, + "step": 4444 + }, + { + "epoch": 0.695835942391985, + "grad_norm": 3.2992279529571533, + "learning_rate": 6.795373085695666e-05, + "loss": 0.7857, + "step": 4445 + }, + { + "epoch": 0.6959924859110833, + "grad_norm": 3.0010104179382324, + "learning_rate": 6.794558488106876e-05, + "loss": 0.9164, + "step": 4446 + }, + { + "epoch": 0.6961490294301816, + "grad_norm": 
1.8679206371307373, + "learning_rate": 6.793743890518083e-05, + "loss": 0.6495, + "step": 4447 + }, + { + "epoch": 0.6963055729492799, + "grad_norm": 2.8024818897247314, + "learning_rate": 6.792929292929293e-05, + "loss": 0.8072, + "step": 4448 + }, + { + "epoch": 0.6964621164683782, + "grad_norm": 3.0274980068206787, + "learning_rate": 6.792114695340503e-05, + "loss": 0.5737, + "step": 4449 + }, + { + "epoch": 0.6966186599874765, + "grad_norm": 3.663559675216675, + "learning_rate": 6.791300097751711e-05, + "loss": 0.9462, + "step": 4450 + }, + { + "epoch": 0.6967752035065748, + "grad_norm": 0.794104814529419, + "learning_rate": 6.79048550016292e-05, + "loss": 0.3488, + "step": 4451 + }, + { + "epoch": 0.6969317470256732, + "grad_norm": 0.8464572429656982, + "learning_rate": 6.789670902574129e-05, + "loss": 0.2485, + "step": 4452 + }, + { + "epoch": 0.6970882905447714, + "grad_norm": 0.632779598236084, + "learning_rate": 6.788856304985338e-05, + "loss": 0.2507, + "step": 4453 + }, + { + "epoch": 0.6972448340638697, + "grad_norm": 1.1157773733139038, + "learning_rate": 6.788041707396546e-05, + "loss": 0.4119, + "step": 4454 + }, + { + "epoch": 0.697401377582968, + "grad_norm": 0.6431015133857727, + "learning_rate": 6.787227109807756e-05, + "loss": 0.2867, + "step": 4455 + }, + { + "epoch": 0.6975579211020664, + "grad_norm": 1.218506097793579, + "learning_rate": 6.786412512218964e-05, + "loss": 0.2994, + "step": 4456 + }, + { + "epoch": 0.6977144646211647, + "grad_norm": 1.0360254049301147, + "learning_rate": 6.785597914630172e-05, + "loss": 0.4063, + "step": 4457 + }, + { + "epoch": 0.697871008140263, + "grad_norm": 0.9618489146232605, + "learning_rate": 6.784783317041382e-05, + "loss": 0.2495, + "step": 4458 + }, + { + "epoch": 0.6980275516593613, + "grad_norm": 1.6079397201538086, + "learning_rate": 6.78396871945259e-05, + "loss": 0.5211, + "step": 4459 + }, + { + "epoch": 0.6981840951784596, + "grad_norm": 1.0736277103424072, + "learning_rate": 
6.783154121863799e-05, + "loss": 0.3917, + "step": 4460 + }, + { + "epoch": 0.6983406386975579, + "grad_norm": 1.5302294492721558, + "learning_rate": 6.782339524275009e-05, + "loss": 0.2544, + "step": 4461 + }, + { + "epoch": 0.6984971822166562, + "grad_norm": 2.2953782081604004, + "learning_rate": 6.781524926686217e-05, + "loss": 0.3216, + "step": 4462 + }, + { + "epoch": 0.6986537257357546, + "grad_norm": 2.000795841217041, + "learning_rate": 6.780710329097427e-05, + "loss": 0.3668, + "step": 4463 + }, + { + "epoch": 0.6988102692548529, + "grad_norm": 1.4406143426895142, + "learning_rate": 6.779895731508635e-05, + "loss": 0.3635, + "step": 4464 + }, + { + "epoch": 0.6989668127739511, + "grad_norm": 0.6982142925262451, + "learning_rate": 6.779081133919843e-05, + "loss": 0.2632, + "step": 4465 + }, + { + "epoch": 0.6991233562930494, + "grad_norm": 1.446283221244812, + "learning_rate": 6.778266536331053e-05, + "loss": 0.5531, + "step": 4466 + }, + { + "epoch": 0.6992798998121478, + "grad_norm": 1.2501206398010254, + "learning_rate": 6.777451938742262e-05, + "loss": 0.4297, + "step": 4467 + }, + { + "epoch": 0.6994364433312461, + "grad_norm": 2.414593458175659, + "learning_rate": 6.77663734115347e-05, + "loss": 0.4709, + "step": 4468 + }, + { + "epoch": 0.6995929868503444, + "grad_norm": 2.031153440475464, + "learning_rate": 6.77582274356468e-05, + "loss": 0.9353, + "step": 4469 + }, + { + "epoch": 0.6997495303694427, + "grad_norm": 4.054030895233154, + "learning_rate": 6.775008145975888e-05, + "loss": 0.5699, + "step": 4470 + }, + { + "epoch": 0.699906073888541, + "grad_norm": 2.6509463787078857, + "learning_rate": 6.774193548387096e-05, + "loss": 0.787, + "step": 4471 + }, + { + "epoch": 0.7000626174076393, + "grad_norm": 1.584572672843933, + "learning_rate": 6.773378950798306e-05, + "loss": 0.8247, + "step": 4472 + }, + { + "epoch": 0.7002191609267376, + "grad_norm": 1.6638438701629639, + "learning_rate": 6.772564353209515e-05, + "loss": 0.4325, + "step": 4473 + 
}, + { + "epoch": 0.700375704445836, + "grad_norm": 1.6743277311325073, + "learning_rate": 6.771749755620723e-05, + "loss": 0.3052, + "step": 4474 + }, + { + "epoch": 0.7005322479649343, + "grad_norm": 1.3474088907241821, + "learning_rate": 6.770935158031933e-05, + "loss": 0.6968, + "step": 4475 + }, + { + "epoch": 0.7006887914840325, + "grad_norm": 2.3388173580169678, + "learning_rate": 6.770120560443141e-05, + "loss": 1.1405, + "step": 4476 + }, + { + "epoch": 0.7008453350031308, + "grad_norm": 4.039053440093994, + "learning_rate": 6.76930596285435e-05, + "loss": 1.4002, + "step": 4477 + }, + { + "epoch": 0.7010018785222292, + "grad_norm": 2.2212297916412354, + "learning_rate": 6.768491365265559e-05, + "loss": 0.6915, + "step": 4478 + }, + { + "epoch": 0.7011584220413275, + "grad_norm": 2.2442827224731445, + "learning_rate": 6.767676767676769e-05, + "loss": 0.6261, + "step": 4479 + }, + { + "epoch": 0.7013149655604258, + "grad_norm": 2.243187427520752, + "learning_rate": 6.766862170087976e-05, + "loss": 0.7698, + "step": 4480 + }, + { + "epoch": 0.7014715090795242, + "grad_norm": 2.7173266410827637, + "learning_rate": 6.766047572499186e-05, + "loss": 0.6297, + "step": 4481 + }, + { + "epoch": 0.7016280525986224, + "grad_norm": 2.762281894683838, + "learning_rate": 6.765232974910395e-05, + "loss": 1.0099, + "step": 4482 + }, + { + "epoch": 0.7017845961177207, + "grad_norm": 3.3382763862609863, + "learning_rate": 6.764418377321602e-05, + "loss": 1.2769, + "step": 4483 + }, + { + "epoch": 0.701941139636819, + "grad_norm": 1.9162719249725342, + "learning_rate": 6.763603779732812e-05, + "loss": 0.5806, + "step": 4484 + }, + { + "epoch": 0.7020976831559174, + "grad_norm": 2.5812859535217285, + "learning_rate": 6.762789182144022e-05, + "loss": 1.1229, + "step": 4485 + }, + { + "epoch": 0.7022542266750157, + "grad_norm": 2.3034353256225586, + "learning_rate": 6.76197458455523e-05, + "loss": 1.0733, + "step": 4486 + }, + { + "epoch": 0.7024107701941139, + "grad_norm": 
3.6082699298858643, + "learning_rate": 6.761159986966439e-05, + "loss": 0.7708, + "step": 4487 + }, + { + "epoch": 0.7025673137132122, + "grad_norm": 3.9631216526031494, + "learning_rate": 6.760345389377648e-05, + "loss": 0.8621, + "step": 4488 + }, + { + "epoch": 0.7027238572323106, + "grad_norm": 1.9447300434112549, + "learning_rate": 6.759530791788857e-05, + "loss": 0.628, + "step": 4489 + }, + { + "epoch": 0.7028804007514089, + "grad_norm": 2.5721356868743896, + "learning_rate": 6.758716194200065e-05, + "loss": 1.2505, + "step": 4490 + }, + { + "epoch": 0.7030369442705072, + "grad_norm": 3.6780452728271484, + "learning_rate": 6.757901596611275e-05, + "loss": 1.3873, + "step": 4491 + }, + { + "epoch": 0.7031934877896056, + "grad_norm": 4.5944318771362305, + "learning_rate": 6.757086999022483e-05, + "loss": 1.3905, + "step": 4492 + }, + { + "epoch": 0.7033500313087038, + "grad_norm": 3.164154052734375, + "learning_rate": 6.756272401433692e-05, + "loss": 0.8415, + "step": 4493 + }, + { + "epoch": 0.7035065748278021, + "grad_norm": 2.729268789291382, + "learning_rate": 6.755457803844901e-05, + "loss": 0.9409, + "step": 4494 + }, + { + "epoch": 0.7036631183469004, + "grad_norm": 1.995418906211853, + "learning_rate": 6.75464320625611e-05, + "loss": 0.6934, + "step": 4495 + }, + { + "epoch": 0.7038196618659988, + "grad_norm": 3.1199913024902344, + "learning_rate": 6.753828608667318e-05, + "loss": 0.8596, + "step": 4496 + }, + { + "epoch": 0.7039762053850971, + "grad_norm": 2.592271566390991, + "learning_rate": 6.753014011078528e-05, + "loss": 1.0215, + "step": 4497 + }, + { + "epoch": 0.7041327489041954, + "grad_norm": 2.247290849685669, + "learning_rate": 6.752199413489736e-05, + "loss": 0.7624, + "step": 4498 + }, + { + "epoch": 0.7042892924232936, + "grad_norm": 3.487016201019287, + "learning_rate": 6.751384815900946e-05, + "loss": 1.3416, + "step": 4499 + }, + { + "epoch": 0.704445835942392, + "grad_norm": 2.7323129177093506, + "learning_rate": 
6.750570218312154e-05, + "loss": 0.8073, + "step": 4500 + }, + { + "epoch": 0.7046023794614903, + "grad_norm": 0.5901132225990295, + "learning_rate": 6.749755620723363e-05, + "loss": 0.2115, + "step": 4501 + }, + { + "epoch": 0.7047589229805886, + "grad_norm": 0.8102548122406006, + "learning_rate": 6.748941023134572e-05, + "loss": 0.3013, + "step": 4502 + }, + { + "epoch": 0.704915466499687, + "grad_norm": 0.619346022605896, + "learning_rate": 6.748126425545781e-05, + "loss": 0.3295, + "step": 4503 + }, + { + "epoch": 0.7050720100187852, + "grad_norm": 0.9394071102142334, + "learning_rate": 6.747311827956989e-05, + "loss": 0.2808, + "step": 4504 + }, + { + "epoch": 0.7052285535378835, + "grad_norm": 0.6291738152503967, + "learning_rate": 6.746497230368199e-05, + "loss": 0.2933, + "step": 4505 + }, + { + "epoch": 0.7053850970569818, + "grad_norm": 0.7566792964935303, + "learning_rate": 6.745682632779407e-05, + "loss": 0.2953, + "step": 4506 + }, + { + "epoch": 0.7055416405760802, + "grad_norm": 0.9165009260177612, + "learning_rate": 6.744868035190616e-05, + "loss": 0.3463, + "step": 4507 + }, + { + "epoch": 0.7056981840951785, + "grad_norm": 0.8587214350700378, + "learning_rate": 6.744053437601825e-05, + "loss": 0.4082, + "step": 4508 + }, + { + "epoch": 0.7058547276142768, + "grad_norm": 1.5487347841262817, + "learning_rate": 6.743238840013034e-05, + "loss": 0.3678, + "step": 4509 + }, + { + "epoch": 0.706011271133375, + "grad_norm": 0.638279378414154, + "learning_rate": 6.742424242424242e-05, + "loss": 0.3586, + "step": 4510 + }, + { + "epoch": 0.7061678146524734, + "grad_norm": 1.31162691116333, + "learning_rate": 6.741609644835452e-05, + "loss": 0.3573, + "step": 4511 + }, + { + "epoch": 0.7063243581715717, + "grad_norm": 0.7423022389411926, + "learning_rate": 6.74079504724666e-05, + "loss": 0.2939, + "step": 4512 + }, + { + "epoch": 0.70648090169067, + "grad_norm": 1.175406575202942, + "learning_rate": 6.739980449657869e-05, + "loss": 0.3545, + "step": 4513 + 
}, + { + "epoch": 0.7066374452097683, + "grad_norm": 1.2868375778198242, + "learning_rate": 6.739165852069078e-05, + "loss": 0.6091, + "step": 4514 + }, + { + "epoch": 0.7067939887288667, + "grad_norm": 1.4250621795654297, + "learning_rate": 6.738351254480288e-05, + "loss": 0.4709, + "step": 4515 + }, + { + "epoch": 0.7069505322479649, + "grad_norm": 1.5997248888015747, + "learning_rate": 6.737536656891495e-05, + "loss": 0.5456, + "step": 4516 + }, + { + "epoch": 0.7071070757670632, + "grad_norm": 1.0502129793167114, + "learning_rate": 6.736722059302705e-05, + "loss": 0.5144, + "step": 4517 + }, + { + "epoch": 0.7072636192861615, + "grad_norm": 1.5181663036346436, + "learning_rate": 6.735907461713914e-05, + "loss": 0.4548, + "step": 4518 + }, + { + "epoch": 0.7074201628052599, + "grad_norm": 3.3124940395355225, + "learning_rate": 6.735092864125121e-05, + "loss": 0.6239, + "step": 4519 + }, + { + "epoch": 0.7075767063243582, + "grad_norm": 6.429731845855713, + "learning_rate": 6.734278266536331e-05, + "loss": 1.23, + "step": 4520 + }, + { + "epoch": 0.7077332498434565, + "grad_norm": 2.5387415885925293, + "learning_rate": 6.733463668947541e-05, + "loss": 0.9489, + "step": 4521 + }, + { + "epoch": 0.7078897933625548, + "grad_norm": 2.707570791244507, + "learning_rate": 6.732649071358749e-05, + "loss": 0.6071, + "step": 4522 + }, + { + "epoch": 0.7080463368816531, + "grad_norm": 3.4098827838897705, + "learning_rate": 6.731834473769958e-05, + "loss": 0.4595, + "step": 4523 + }, + { + "epoch": 0.7082028804007514, + "grad_norm": 3.2148051261901855, + "learning_rate": 6.731019876181167e-05, + "loss": 0.7428, + "step": 4524 + }, + { + "epoch": 0.7083594239198497, + "grad_norm": 2.382962226867676, + "learning_rate": 6.730205278592376e-05, + "loss": 0.79, + "step": 4525 + }, + { + "epoch": 0.7085159674389481, + "grad_norm": 1.0933892726898193, + "learning_rate": 6.729390681003584e-05, + "loss": 0.4544, + "step": 4526 + }, + { + "epoch": 0.7086725109580463, + "grad_norm": 
2.4809699058532715, + "learning_rate": 6.728576083414794e-05, + "loss": 1.0682, + "step": 4527 + }, + { + "epoch": 0.7088290544771446, + "grad_norm": 1.6338454484939575, + "learning_rate": 6.727761485826002e-05, + "loss": 0.6126, + "step": 4528 + }, + { + "epoch": 0.7089855979962429, + "grad_norm": 1.5515691041946411, + "learning_rate": 6.72694688823721e-05, + "loss": 0.5437, + "step": 4529 + }, + { + "epoch": 0.7091421415153413, + "grad_norm": 2.567842960357666, + "learning_rate": 6.72613229064842e-05, + "loss": 0.7508, + "step": 4530 + }, + { + "epoch": 0.7092986850344396, + "grad_norm": 1.4604039192199707, + "learning_rate": 6.725317693059629e-05, + "loss": 0.4363, + "step": 4531 + }, + { + "epoch": 0.7094552285535379, + "grad_norm": 2.9853522777557373, + "learning_rate": 6.724503095470837e-05, + "loss": 1.0414, + "step": 4532 + }, + { + "epoch": 0.7096117720726361, + "grad_norm": 3.6199662685394287, + "learning_rate": 6.723688497882047e-05, + "loss": 0.9244, + "step": 4533 + }, + { + "epoch": 0.7097683155917345, + "grad_norm": 3.685770273208618, + "learning_rate": 6.722873900293255e-05, + "loss": 0.9089, + "step": 4534 + }, + { + "epoch": 0.7099248591108328, + "grad_norm": 2.03233003616333, + "learning_rate": 6.722059302704464e-05, + "loss": 0.7917, + "step": 4535 + }, + { + "epoch": 0.7100814026299311, + "grad_norm": 2.8743557929992676, + "learning_rate": 6.721244705115673e-05, + "loss": 1.1568, + "step": 4536 + }, + { + "epoch": 0.7102379461490295, + "grad_norm": 3.0643622875213623, + "learning_rate": 6.720430107526882e-05, + "loss": 0.9175, + "step": 4537 + }, + { + "epoch": 0.7103944896681278, + "grad_norm": 3.216731071472168, + "learning_rate": 6.719615509938091e-05, + "loss": 1.0524, + "step": 4538 + }, + { + "epoch": 0.710551033187226, + "grad_norm": 4.503540992736816, + "learning_rate": 6.7188009123493e-05, + "loss": 1.3369, + "step": 4539 + }, + { + "epoch": 0.7107075767063243, + "grad_norm": 2.2522311210632324, + "learning_rate": 
6.717986314760508e-05, + "loss": 0.8473, + "step": 4540 + }, + { + "epoch": 0.7108641202254227, + "grad_norm": 2.6000595092773438, + "learning_rate": 6.717171717171718e-05, + "loss": 1.1985, + "step": 4541 + }, + { + "epoch": 0.711020663744521, + "grad_norm": 4.03325080871582, + "learning_rate": 6.716357119582926e-05, + "loss": 1.8613, + "step": 4542 + }, + { + "epoch": 0.7111772072636193, + "grad_norm": 4.666566848754883, + "learning_rate": 6.715542521994135e-05, + "loss": 1.3656, + "step": 4543 + }, + { + "epoch": 0.7113337507827175, + "grad_norm": 4.0898919105529785, + "learning_rate": 6.714727924405344e-05, + "loss": 1.4285, + "step": 4544 + }, + { + "epoch": 0.7114902943018159, + "grad_norm": 3.2154951095581055, + "learning_rate": 6.713913326816553e-05, + "loss": 1.3151, + "step": 4545 + }, + { + "epoch": 0.7116468378209142, + "grad_norm": 1.54813551902771, + "learning_rate": 6.713098729227761e-05, + "loss": 0.6946, + "step": 4546 + }, + { + "epoch": 0.7118033813400125, + "grad_norm": 2.015345335006714, + "learning_rate": 6.712284131638971e-05, + "loss": 0.6815, + "step": 4547 + }, + { + "epoch": 0.7119599248591109, + "grad_norm": 3.0418825149536133, + "learning_rate": 6.711469534050179e-05, + "loss": 1.046, + "step": 4548 + }, + { + "epoch": 0.7121164683782092, + "grad_norm": 1.9586329460144043, + "learning_rate": 6.710654936461388e-05, + "loss": 0.4312, + "step": 4549 + }, + { + "epoch": 0.7122730118973074, + "grad_norm": 2.3792312145233154, + "learning_rate": 6.709840338872597e-05, + "loss": 1.2584, + "step": 4550 + }, + { + "epoch": 0.7124295554164057, + "grad_norm": 2.061821460723877, + "learning_rate": 6.709025741283807e-05, + "loss": 0.7383, + "step": 4551 + }, + { + "epoch": 0.7125860989355041, + "grad_norm": 0.7015791535377502, + "learning_rate": 6.708211143695014e-05, + "loss": 0.2201, + "step": 4552 + }, + { + "epoch": 0.7127426424546024, + "grad_norm": 0.6468502879142761, + "learning_rate": 6.707396546106224e-05, + "loss": 0.2301, + "step": 4553 + 
}, + { + "epoch": 0.7128991859737007, + "grad_norm": 0.6800637245178223, + "learning_rate": 6.706581948517434e-05, + "loss": 0.3602, + "step": 4554 + }, + { + "epoch": 0.713055729492799, + "grad_norm": 1.1272120475769043, + "learning_rate": 6.70576735092864e-05, + "loss": 0.3606, + "step": 4555 + }, + { + "epoch": 0.7132122730118973, + "grad_norm": 0.9968860745429993, + "learning_rate": 6.70495275333985e-05, + "loss": 0.3102, + "step": 4556 + }, + { + "epoch": 0.7133688165309956, + "grad_norm": 0.8693810105323792, + "learning_rate": 6.70413815575106e-05, + "loss": 0.3064, + "step": 4557 + }, + { + "epoch": 0.7135253600500939, + "grad_norm": 0.7694476246833801, + "learning_rate": 6.703323558162268e-05, + "loss": 0.2822, + "step": 4558 + }, + { + "epoch": 0.7136819035691923, + "grad_norm": 2.6625993251800537, + "learning_rate": 6.702508960573477e-05, + "loss": 0.4168, + "step": 4559 + }, + { + "epoch": 0.7138384470882906, + "grad_norm": 1.2377609014511108, + "learning_rate": 6.701694362984687e-05, + "loss": 0.4319, + "step": 4560 + }, + { + "epoch": 0.7139949906073888, + "grad_norm": 1.0157783031463623, + "learning_rate": 6.700879765395895e-05, + "loss": 0.3973, + "step": 4561 + }, + { + "epoch": 0.7141515341264871, + "grad_norm": 2.355327606201172, + "learning_rate": 6.700065167807103e-05, + "loss": 0.6831, + "step": 4562 + }, + { + "epoch": 0.7143080776455855, + "grad_norm": 2.9206175804138184, + "learning_rate": 6.699250570218313e-05, + "loss": 0.5975, + "step": 4563 + }, + { + "epoch": 0.7144646211646838, + "grad_norm": 1.0744960308074951, + "learning_rate": 6.698435972629521e-05, + "loss": 0.4185, + "step": 4564 + }, + { + "epoch": 0.7146211646837821, + "grad_norm": 1.002577543258667, + "learning_rate": 6.69762137504073e-05, + "loss": 0.2077, + "step": 4565 + }, + { + "epoch": 0.7147777082028804, + "grad_norm": 2.3161442279815674, + "learning_rate": 6.69680677745194e-05, + "loss": 0.8131, + "step": 4566 + }, + { + "epoch": 0.7149342517219787, + "grad_norm": 
1.4823671579360962, + "learning_rate": 6.695992179863148e-05, + "loss": 0.499, + "step": 4567 + }, + { + "epoch": 0.715090795241077, + "grad_norm": 3.1765286922454834, + "learning_rate": 6.695177582274356e-05, + "loss": 0.5807, + "step": 4568 + }, + { + "epoch": 0.7152473387601753, + "grad_norm": 1.5027145147323608, + "learning_rate": 6.694362984685566e-05, + "loss": 0.6923, + "step": 4569 + }, + { + "epoch": 0.7154038822792737, + "grad_norm": 1.3950703144073486, + "learning_rate": 6.693548387096774e-05, + "loss": 0.3229, + "step": 4570 + }, + { + "epoch": 0.715560425798372, + "grad_norm": 4.720500469207764, + "learning_rate": 6.692733789507983e-05, + "loss": 0.7187, + "step": 4571 + }, + { + "epoch": 0.7157169693174703, + "grad_norm": 2.121323823928833, + "learning_rate": 6.691919191919192e-05, + "loss": 0.6477, + "step": 4572 + }, + { + "epoch": 0.7158735128365685, + "grad_norm": 1.6872276067733765, + "learning_rate": 6.691104594330401e-05, + "loss": 0.5412, + "step": 4573 + }, + { + "epoch": 0.7160300563556669, + "grad_norm": 5.68472957611084, + "learning_rate": 6.69028999674161e-05, + "loss": 1.1055, + "step": 4574 + }, + { + "epoch": 0.7161865998747652, + "grad_norm": 1.5992431640625, + "learning_rate": 6.689475399152819e-05, + "loss": 0.5579, + "step": 4575 + }, + { + "epoch": 0.7163431433938635, + "grad_norm": 1.6018445491790771, + "learning_rate": 6.688660801564027e-05, + "loss": 0.4274, + "step": 4576 + }, + { + "epoch": 0.7164996869129618, + "grad_norm": 1.8607207536697388, + "learning_rate": 6.687846203975237e-05, + "loss": 0.6753, + "step": 4577 + }, + { + "epoch": 0.7166562304320601, + "grad_norm": 1.880508303642273, + "learning_rate": 6.687031606386445e-05, + "loss": 0.8573, + "step": 4578 + }, + { + "epoch": 0.7168127739511584, + "grad_norm": 3.4472711086273193, + "learning_rate": 6.686217008797654e-05, + "loss": 0.8786, + "step": 4579 + }, + { + "epoch": 0.7169693174702567, + "grad_norm": 3.3110475540161133, + "learning_rate": 6.685402411208864e-05, 
+ "loss": 0.7371, + "step": 4580 + }, + { + "epoch": 0.717125860989355, + "grad_norm": 2.219878673553467, + "learning_rate": 6.684587813620072e-05, + "loss": 0.6419, + "step": 4581 + }, + { + "epoch": 0.7172824045084534, + "grad_norm": 1.754758358001709, + "learning_rate": 6.68377321603128e-05, + "loss": 0.8407, + "step": 4582 + }, + { + "epoch": 0.7174389480275517, + "grad_norm": 6.4166483879089355, + "learning_rate": 6.68295861844249e-05, + "loss": 0.9152, + "step": 4583 + }, + { + "epoch": 0.7175954915466499, + "grad_norm": 3.6035308837890625, + "learning_rate": 6.682144020853698e-05, + "loss": 1.206, + "step": 4584 + }, + { + "epoch": 0.7177520350657483, + "grad_norm": 3.6860532760620117, + "learning_rate": 6.681329423264907e-05, + "loss": 1.6324, + "step": 4585 + }, + { + "epoch": 0.7179085785848466, + "grad_norm": 2.8589601516723633, + "learning_rate": 6.680514825676116e-05, + "loss": 0.8848, + "step": 4586 + }, + { + "epoch": 0.7180651221039449, + "grad_norm": 1.8204675912857056, + "learning_rate": 6.679700228087326e-05, + "loss": 0.6501, + "step": 4587 + }, + { + "epoch": 0.7182216656230432, + "grad_norm": 3.3402719497680664, + "learning_rate": 6.678885630498533e-05, + "loss": 0.9024, + "step": 4588 + }, + { + "epoch": 0.7183782091421416, + "grad_norm": 5.371490001678467, + "learning_rate": 6.678071032909743e-05, + "loss": 1.1339, + "step": 4589 + }, + { + "epoch": 0.7185347526612398, + "grad_norm": 2.7088637351989746, + "learning_rate": 6.677256435320953e-05, + "loss": 1.5761, + "step": 4590 + }, + { + "epoch": 0.7186912961803381, + "grad_norm": 2.063610792160034, + "learning_rate": 6.67644183773216e-05, + "loss": 0.9973, + "step": 4591 + }, + { + "epoch": 0.7188478396994364, + "grad_norm": 3.0595123767852783, + "learning_rate": 6.67562724014337e-05, + "loss": 1.2204, + "step": 4592 + }, + { + "epoch": 0.7190043832185348, + "grad_norm": 3.0156033039093018, + "learning_rate": 6.674812642554579e-05, + "loss": 1.1144, + "step": 4593 + }, + { + "epoch": 
0.7191609267376331, + "grad_norm": 2.3352854251861572, + "learning_rate": 6.673998044965786e-05, + "loss": 1.0209, + "step": 4594 + }, + { + "epoch": 0.7193174702567313, + "grad_norm": 3.9630496501922607, + "learning_rate": 6.673183447376996e-05, + "loss": 1.2048, + "step": 4595 + }, + { + "epoch": 0.7194740137758296, + "grad_norm": 3.5117297172546387, + "learning_rate": 6.672368849788206e-05, + "loss": 1.2188, + "step": 4596 + }, + { + "epoch": 0.719630557294928, + "grad_norm": 1.7080022096633911, + "learning_rate": 6.671554252199414e-05, + "loss": 0.4845, + "step": 4597 + }, + { + "epoch": 0.7197871008140263, + "grad_norm": 2.837496280670166, + "learning_rate": 6.670739654610622e-05, + "loss": 0.7948, + "step": 4598 + }, + { + "epoch": 0.7199436443331246, + "grad_norm": 9.003721237182617, + "learning_rate": 6.669925057021832e-05, + "loss": 0.9053, + "step": 4599 + }, + { + "epoch": 0.720100187852223, + "grad_norm": 2.957827091217041, + "learning_rate": 6.66911045943304e-05, + "loss": 0.9889, + "step": 4600 + }, + { + "epoch": 0.7202567313713212, + "grad_norm": 0.4917665123939514, + "learning_rate": 6.668295861844249e-05, + "loss": 0.2554, + "step": 4601 + }, + { + "epoch": 0.7204132748904195, + "grad_norm": 1.1542333364486694, + "learning_rate": 6.667481264255459e-05, + "loss": 0.2797, + "step": 4602 + }, + { + "epoch": 0.7205698184095178, + "grad_norm": 0.9716810584068298, + "learning_rate": 6.666666666666667e-05, + "loss": 0.2912, + "step": 4603 + }, + { + "epoch": 0.7207263619286162, + "grad_norm": 0.7835025191307068, + "learning_rate": 6.665852069077875e-05, + "loss": 0.1774, + "step": 4604 + }, + { + "epoch": 0.7208829054477145, + "grad_norm": 0.704622209072113, + "learning_rate": 6.665037471489085e-05, + "loss": 0.2253, + "step": 4605 + }, + { + "epoch": 0.7210394489668128, + "grad_norm": 1.376958966255188, + "learning_rate": 6.664222873900293e-05, + "loss": 0.2503, + "step": 4606 + }, + { + "epoch": 0.721195992485911, + "grad_norm": 0.9551268219947815, + 
"learning_rate": 6.663408276311502e-05, + "loss": 0.3409, + "step": 4607 + }, + { + "epoch": 0.7213525360050094, + "grad_norm": 1.0896974802017212, + "learning_rate": 6.662593678722712e-05, + "loss": 0.2839, + "step": 4608 + }, + { + "epoch": 0.7215090795241077, + "grad_norm": 0.9676045179367065, + "learning_rate": 6.66177908113392e-05, + "loss": 0.2776, + "step": 4609 + }, + { + "epoch": 0.721665623043206, + "grad_norm": 1.2424070835113525, + "learning_rate": 6.66096448354513e-05, + "loss": 0.2973, + "step": 4610 + }, + { + "epoch": 0.7218221665623044, + "grad_norm": 2.451803207397461, + "learning_rate": 6.660149885956338e-05, + "loss": 0.6994, + "step": 4611 + }, + { + "epoch": 0.7219787100814026, + "grad_norm": 1.430544376373291, + "learning_rate": 6.659335288367546e-05, + "loss": 0.3816, + "step": 4612 + }, + { + "epoch": 0.7221352536005009, + "grad_norm": 1.2546181678771973, + "learning_rate": 6.658520690778756e-05, + "loss": 0.4097, + "step": 4613 + }, + { + "epoch": 0.7222917971195992, + "grad_norm": 1.1002241373062134, + "learning_rate": 6.657706093189965e-05, + "loss": 0.4003, + "step": 4614 + }, + { + "epoch": 0.7224483406386976, + "grad_norm": 1.422910213470459, + "learning_rate": 6.656891495601173e-05, + "loss": 0.4891, + "step": 4615 + }, + { + "epoch": 0.7226048841577959, + "grad_norm": 1.5784900188446045, + "learning_rate": 6.656076898012383e-05, + "loss": 0.4693, + "step": 4616 + }, + { + "epoch": 0.7227614276768942, + "grad_norm": 1.2246325016021729, + "learning_rate": 6.655262300423591e-05, + "loss": 0.2994, + "step": 4617 + }, + { + "epoch": 0.7229179711959924, + "grad_norm": 2.859010934829712, + "learning_rate": 6.6544477028348e-05, + "loss": 0.8597, + "step": 4618 + }, + { + "epoch": 0.7230745147150908, + "grad_norm": 1.3255746364593506, + "learning_rate": 6.653633105246009e-05, + "loss": 0.4115, + "step": 4619 + }, + { + "epoch": 0.7232310582341891, + "grad_norm": 1.4984878301620483, + "learning_rate": 6.652818507657217e-05, + "loss": 0.517, + 
"step": 4620 + }, + { + "epoch": 0.7233876017532874, + "grad_norm": 1.7940763235092163, + "learning_rate": 6.652003910068426e-05, + "loss": 0.5626, + "step": 4621 + }, + { + "epoch": 0.7235441452723858, + "grad_norm": 1.8023570775985718, + "learning_rate": 6.651189312479636e-05, + "loss": 0.6195, + "step": 4622 + }, + { + "epoch": 0.7237006887914841, + "grad_norm": 1.0469415187835693, + "learning_rate": 6.650374714890844e-05, + "loss": 0.3168, + "step": 4623 + }, + { + "epoch": 0.7238572323105823, + "grad_norm": 1.491673469543457, + "learning_rate": 6.649560117302052e-05, + "loss": 0.5322, + "step": 4624 + }, + { + "epoch": 0.7240137758296806, + "grad_norm": 2.1377389430999756, + "learning_rate": 6.648745519713262e-05, + "loss": 0.4816, + "step": 4625 + }, + { + "epoch": 0.724170319348779, + "grad_norm": 2.091294527053833, + "learning_rate": 6.647930922124472e-05, + "loss": 0.5199, + "step": 4626 + }, + { + "epoch": 0.7243268628678773, + "grad_norm": 1.7544718980789185, + "learning_rate": 6.647116324535679e-05, + "loss": 0.5104, + "step": 4627 + }, + { + "epoch": 0.7244834063869756, + "grad_norm": 4.000062465667725, + "learning_rate": 6.646301726946889e-05, + "loss": 0.459, + "step": 4628 + }, + { + "epoch": 0.7246399499060739, + "grad_norm": 2.6143932342529297, + "learning_rate": 6.645487129358098e-05, + "loss": 0.7478, + "step": 4629 + }, + { + "epoch": 0.7247964934251722, + "grad_norm": 1.9209630489349365, + "learning_rate": 6.644672531769305e-05, + "loss": 0.5052, + "step": 4630 + }, + { + "epoch": 0.7249530369442705, + "grad_norm": 5.988038063049316, + "learning_rate": 6.643857934180515e-05, + "loss": 1.1346, + "step": 4631 + }, + { + "epoch": 0.7251095804633688, + "grad_norm": 3.9235405921936035, + "learning_rate": 6.643043336591725e-05, + "loss": 1.4509, + "step": 4632 + }, + { + "epoch": 0.7252661239824671, + "grad_norm": 2.6590089797973633, + "learning_rate": 6.642228739002933e-05, + "loss": 0.8105, + "step": 4633 + }, + { + "epoch": 0.7254226675015655, + 
"grad_norm": 4.853567123413086, + "learning_rate": 6.641414141414142e-05, + "loss": 1.5265, + "step": 4634 + }, + { + "epoch": 0.7255792110206637, + "grad_norm": 4.043431282043457, + "learning_rate": 6.640599543825351e-05, + "loss": 0.7691, + "step": 4635 + }, + { + "epoch": 0.725735754539762, + "grad_norm": 3.192645311355591, + "learning_rate": 6.63978494623656e-05, + "loss": 0.8297, + "step": 4636 + }, + { + "epoch": 0.7258922980588604, + "grad_norm": 4.897477626800537, + "learning_rate": 6.638970348647768e-05, + "loss": 0.8393, + "step": 4637 + }, + { + "epoch": 0.7260488415779587, + "grad_norm": 3.359286069869995, + "learning_rate": 6.638155751058978e-05, + "loss": 0.5041, + "step": 4638 + }, + { + "epoch": 0.726205385097057, + "grad_norm": 2.5566673278808594, + "learning_rate": 6.637341153470186e-05, + "loss": 1.0753, + "step": 4639 + }, + { + "epoch": 0.7263619286161553, + "grad_norm": 5.9001665115356445, + "learning_rate": 6.636526555881394e-05, + "loss": 1.2578, + "step": 4640 + }, + { + "epoch": 0.7265184721352536, + "grad_norm": 1.739303708076477, + "learning_rate": 6.635711958292604e-05, + "loss": 1.1088, + "step": 4641 + }, + { + "epoch": 0.7266750156543519, + "grad_norm": 4.787917137145996, + "learning_rate": 6.634897360703813e-05, + "loss": 1.1005, + "step": 4642 + }, + { + "epoch": 0.7268315591734502, + "grad_norm": 3.120086669921875, + "learning_rate": 6.634082763115021e-05, + "loss": 0.7467, + "step": 4643 + }, + { + "epoch": 0.7269881026925485, + "grad_norm": 3.0654985904693604, + "learning_rate": 6.633268165526231e-05, + "loss": 1.3098, + "step": 4644 + }, + { + "epoch": 0.7271446462116469, + "grad_norm": 1.4052202701568604, + "learning_rate": 6.632453567937439e-05, + "loss": 0.5663, + "step": 4645 + }, + { + "epoch": 0.7273011897307452, + "grad_norm": 4.115591526031494, + "learning_rate": 6.631638970348649e-05, + "loss": 1.097, + "step": 4646 + }, + { + "epoch": 0.7274577332498434, + "grad_norm": 6.781321048736572, + "learning_rate": 
6.630824372759857e-05, + "loss": 0.8054, + "step": 4647 + }, + { + "epoch": 0.7276142767689417, + "grad_norm": 3.1392276287078857, + "learning_rate": 6.630009775171066e-05, + "loss": 0.8691, + "step": 4648 + }, + { + "epoch": 0.7277708202880401, + "grad_norm": 3.341214895248413, + "learning_rate": 6.629195177582275e-05, + "loss": 0.6943, + "step": 4649 + }, + { + "epoch": 0.7279273638071384, + "grad_norm": 3.279862642288208, + "learning_rate": 6.628380579993484e-05, + "loss": 1.6014, + "step": 4650 + }, + { + "epoch": 0.7280839073262367, + "grad_norm": 0.793935239315033, + "learning_rate": 6.627565982404692e-05, + "loss": 0.3091, + "step": 4651 + }, + { + "epoch": 0.728240450845335, + "grad_norm": 0.7119731903076172, + "learning_rate": 6.626751384815902e-05, + "loss": 0.2088, + "step": 4652 + }, + { + "epoch": 0.7283969943644333, + "grad_norm": 0.9595901370048523, + "learning_rate": 6.62593678722711e-05, + "loss": 0.3801, + "step": 4653 + }, + { + "epoch": 0.7285535378835316, + "grad_norm": 0.6090596318244934, + "learning_rate": 6.625122189638319e-05, + "loss": 0.2595, + "step": 4654 + }, + { + "epoch": 0.7287100814026299, + "grad_norm": 0.7971254587173462, + "learning_rate": 6.624307592049528e-05, + "loss": 0.3074, + "step": 4655 + }, + { + "epoch": 0.7288666249217283, + "grad_norm": 0.9841130971908569, + "learning_rate": 6.623492994460737e-05, + "loss": 0.4731, + "step": 4656 + }, + { + "epoch": 0.7290231684408266, + "grad_norm": 0.6675652861595154, + "learning_rate": 6.622678396871945e-05, + "loss": 0.2597, + "step": 4657 + }, + { + "epoch": 0.7291797119599248, + "grad_norm": 0.6159108281135559, + "learning_rate": 6.621863799283155e-05, + "loss": 0.2205, + "step": 4658 + }, + { + "epoch": 0.7293362554790231, + "grad_norm": 0.9080237150192261, + "learning_rate": 6.621049201694363e-05, + "loss": 0.3099, + "step": 4659 + }, + { + "epoch": 0.7294927989981215, + "grad_norm": 1.2737823724746704, + "learning_rate": 6.620234604105571e-05, + "loss": 0.3551, + "step": 
4660 + }, + { + "epoch": 0.7296493425172198, + "grad_norm": 0.922956645488739, + "learning_rate": 6.619420006516781e-05, + "loss": 0.2833, + "step": 4661 + }, + { + "epoch": 0.7298058860363181, + "grad_norm": 1.649587631225586, + "learning_rate": 6.618605408927991e-05, + "loss": 0.3743, + "step": 4662 + }, + { + "epoch": 0.7299624295554165, + "grad_norm": 1.5690643787384033, + "learning_rate": 6.617790811339198e-05, + "loss": 0.3669, + "step": 4663 + }, + { + "epoch": 0.7301189730745147, + "grad_norm": 1.238051414489746, + "learning_rate": 6.616976213750408e-05, + "loss": 0.5206, + "step": 4664 + }, + { + "epoch": 0.730275516593613, + "grad_norm": 2.0355186462402344, + "learning_rate": 6.616161616161617e-05, + "loss": 0.4332, + "step": 4665 + }, + { + "epoch": 0.7304320601127113, + "grad_norm": 1.6368257999420166, + "learning_rate": 6.615347018572824e-05, + "loss": 0.4088, + "step": 4666 + }, + { + "epoch": 0.7305886036318097, + "grad_norm": 5.554535388946533, + "learning_rate": 6.614532420984034e-05, + "loss": 2.7865, + "step": 4667 + }, + { + "epoch": 0.730745147150908, + "grad_norm": 1.5197654962539673, + "learning_rate": 6.613717823395244e-05, + "loss": 0.7608, + "step": 4668 + }, + { + "epoch": 0.7309016906700062, + "grad_norm": 1.4725568294525146, + "learning_rate": 6.612903225806452e-05, + "loss": 0.602, + "step": 4669 + }, + { + "epoch": 0.7310582341891045, + "grad_norm": 2.7572665214538574, + "learning_rate": 6.61208862821766e-05, + "loss": 0.6377, + "step": 4670 + }, + { + "epoch": 0.7312147777082029, + "grad_norm": 3.283994674682617, + "learning_rate": 6.61127403062887e-05, + "loss": 0.8424, + "step": 4671 + }, + { + "epoch": 0.7313713212273012, + "grad_norm": 1.3180993795394897, + "learning_rate": 6.610459433040079e-05, + "loss": 0.4101, + "step": 4672 + }, + { + "epoch": 0.7315278647463995, + "grad_norm": 2.1216559410095215, + "learning_rate": 6.609644835451287e-05, + "loss": 0.5512, + "step": 4673 + }, + { + "epoch": 0.7316844082654979, + "grad_norm": 
1.4616084098815918, + "learning_rate": 6.608830237862497e-05, + "loss": 0.4529, + "step": 4674 + }, + { + "epoch": 0.7318409517845961, + "grad_norm": 2.967432975769043, + "learning_rate": 6.608015640273705e-05, + "loss": 0.5378, + "step": 4675 + }, + { + "epoch": 0.7319974953036944, + "grad_norm": 2.3110125064849854, + "learning_rate": 6.607201042684914e-05, + "loss": 0.675, + "step": 4676 + }, + { + "epoch": 0.7321540388227927, + "grad_norm": 5.924403190612793, + "learning_rate": 6.606386445096123e-05, + "loss": 1.1536, + "step": 4677 + }, + { + "epoch": 0.7323105823418911, + "grad_norm": 2.107023239135742, + "learning_rate": 6.605571847507332e-05, + "loss": 0.9309, + "step": 4678 + }, + { + "epoch": 0.7324671258609894, + "grad_norm": 10.09967041015625, + "learning_rate": 6.60475724991854e-05, + "loss": 0.8307, + "step": 4679 + }, + { + "epoch": 0.7326236693800877, + "grad_norm": 3.561817169189453, + "learning_rate": 6.60394265232975e-05, + "loss": 0.8847, + "step": 4680 + }, + { + "epoch": 0.7327802128991859, + "grad_norm": 3.995798110961914, + "learning_rate": 6.603128054740958e-05, + "loss": 0.9537, + "step": 4681 + }, + { + "epoch": 0.7329367564182843, + "grad_norm": 1.7014974355697632, + "learning_rate": 6.602313457152167e-05, + "loss": 0.6394, + "step": 4682 + }, + { + "epoch": 0.7330932999373826, + "grad_norm": 1.6752828359603882, + "learning_rate": 6.601498859563376e-05, + "loss": 0.514, + "step": 4683 + }, + { + "epoch": 0.7332498434564809, + "grad_norm": 2.39339542388916, + "learning_rate": 6.600684261974585e-05, + "loss": 0.9367, + "step": 4684 + }, + { + "epoch": 0.7334063869755792, + "grad_norm": 2.928898811340332, + "learning_rate": 6.599869664385794e-05, + "loss": 0.9028, + "step": 4685 + }, + { + "epoch": 0.7335629304946775, + "grad_norm": 3.999894618988037, + "learning_rate": 6.599055066797003e-05, + "loss": 0.8042, + "step": 4686 + }, + { + "epoch": 0.7337194740137758, + "grad_norm": 5.74957799911499, + "learning_rate": 6.598240469208211e-05, + 
"loss": 0.6269, + "step": 4687 + }, + { + "epoch": 0.7338760175328741, + "grad_norm": 2.315056085586548, + "learning_rate": 6.597425871619421e-05, + "loss": 0.9987, + "step": 4688 + }, + { + "epoch": 0.7340325610519725, + "grad_norm": 3.978837490081787, + "learning_rate": 6.596611274030629e-05, + "loss": 1.4979, + "step": 4689 + }, + { + "epoch": 0.7341891045710708, + "grad_norm": 7.354511737823486, + "learning_rate": 6.595796676441838e-05, + "loss": 1.257, + "step": 4690 + }, + { + "epoch": 0.7343456480901691, + "grad_norm": 2.5705740451812744, + "learning_rate": 6.594982078853047e-05, + "loss": 0.8818, + "step": 4691 + }, + { + "epoch": 0.7345021916092673, + "grad_norm": 2.982900381088257, + "learning_rate": 6.594167481264256e-05, + "loss": 1.3352, + "step": 4692 + }, + { + "epoch": 0.7346587351283657, + "grad_norm": 2.021130323410034, + "learning_rate": 6.593352883675464e-05, + "loss": 0.8331, + "step": 4693 + }, + { + "epoch": 0.734815278647464, + "grad_norm": 2.425612688064575, + "learning_rate": 6.592538286086674e-05, + "loss": 1.5831, + "step": 4694 + }, + { + "epoch": 0.7349718221665623, + "grad_norm": 2.4118194580078125, + "learning_rate": 6.591723688497882e-05, + "loss": 0.7423, + "step": 4695 + }, + { + "epoch": 0.7351283656856606, + "grad_norm": 3.1054391860961914, + "learning_rate": 6.59090909090909e-05, + "loss": 0.7271, + "step": 4696 + }, + { + "epoch": 0.735284909204759, + "grad_norm": 1.7798104286193848, + "learning_rate": 6.5900944933203e-05, + "loss": 0.5951, + "step": 4697 + }, + { + "epoch": 0.7354414527238572, + "grad_norm": 2.329460620880127, + "learning_rate": 6.58927989573151e-05, + "loss": 1.2699, + "step": 4698 + }, + { + "epoch": 0.7355979962429555, + "grad_norm": 2.3901195526123047, + "learning_rate": 6.588465298142717e-05, + "loss": 0.7689, + "step": 4699 + }, + { + "epoch": 0.7357545397620538, + "grad_norm": 3.240715503692627, + "learning_rate": 6.587650700553927e-05, + "loss": 1.4439, + "step": 4700 + }, + { + "epoch": 
0.7359110832811522, + "grad_norm": 0.7688095569610596, + "learning_rate": 6.586836102965137e-05, + "loss": 0.2034, + "step": 4701 + }, + { + "epoch": 0.7360676268002505, + "grad_norm": 0.5488224029541016, + "learning_rate": 6.586021505376344e-05, + "loss": 0.2661, + "step": 4702 + }, + { + "epoch": 0.7362241703193487, + "grad_norm": 0.8780460953712463, + "learning_rate": 6.585206907787553e-05, + "loss": 0.2339, + "step": 4703 + }, + { + "epoch": 0.736380713838447, + "grad_norm": 0.7592791318893433, + "learning_rate": 6.584392310198763e-05, + "loss": 0.4153, + "step": 4704 + }, + { + "epoch": 0.7365372573575454, + "grad_norm": 0.5927779674530029, + "learning_rate": 6.58357771260997e-05, + "loss": 0.3377, + "step": 4705 + }, + { + "epoch": 0.7366938008766437, + "grad_norm": 1.0158073902130127, + "learning_rate": 6.58276311502118e-05, + "loss": 0.3325, + "step": 4706 + }, + { + "epoch": 0.736850344395742, + "grad_norm": 0.6805522441864014, + "learning_rate": 6.58194851743239e-05, + "loss": 0.3212, + "step": 4707 + }, + { + "epoch": 0.7370068879148404, + "grad_norm": 0.6450332999229431, + "learning_rate": 6.581133919843598e-05, + "loss": 0.2266, + "step": 4708 + }, + { + "epoch": 0.7371634314339386, + "grad_norm": 1.6810818910598755, + "learning_rate": 6.580319322254806e-05, + "loss": 0.353, + "step": 4709 + }, + { + "epoch": 0.7373199749530369, + "grad_norm": 0.6583768129348755, + "learning_rate": 6.579504724666016e-05, + "loss": 0.3323, + "step": 4710 + }, + { + "epoch": 0.7374765184721352, + "grad_norm": 1.4601815938949585, + "learning_rate": 6.578690127077224e-05, + "loss": 0.4062, + "step": 4711 + }, + { + "epoch": 0.7376330619912336, + "grad_norm": 1.0899896621704102, + "learning_rate": 6.577875529488433e-05, + "loss": 0.4321, + "step": 4712 + }, + { + "epoch": 0.7377896055103319, + "grad_norm": 1.4773411750793457, + "learning_rate": 6.577060931899642e-05, + "loss": 0.6728, + "step": 4713 + }, + { + "epoch": 0.7379461490294302, + "grad_norm": 2.240180730819702, + 
"learning_rate": 6.576246334310851e-05, + "loss": 0.688, + "step": 4714 + }, + { + "epoch": 0.7381026925485284, + "grad_norm": 2.1533327102661133, + "learning_rate": 6.575431736722059e-05, + "loss": 0.5501, + "step": 4715 + }, + { + "epoch": 0.7382592360676268, + "grad_norm": 1.0487756729125977, + "learning_rate": 6.574617139133269e-05, + "loss": 0.3445, + "step": 4716 + }, + { + "epoch": 0.7384157795867251, + "grad_norm": 1.1764057874679565, + "learning_rate": 6.573802541544477e-05, + "loss": 0.4786, + "step": 4717 + }, + { + "epoch": 0.7385723231058234, + "grad_norm": 3.8479886054992676, + "learning_rate": 6.572987943955686e-05, + "loss": 0.5149, + "step": 4718 + }, + { + "epoch": 0.7387288666249218, + "grad_norm": 2.167663335800171, + "learning_rate": 6.572173346366895e-05, + "loss": 0.5442, + "step": 4719 + }, + { + "epoch": 0.73888541014402, + "grad_norm": 1.2980501651763916, + "learning_rate": 6.571358748778104e-05, + "loss": 0.4829, + "step": 4720 + }, + { + "epoch": 0.7390419536631183, + "grad_norm": 1.848788857460022, + "learning_rate": 6.570544151189314e-05, + "loss": 0.564, + "step": 4721 + }, + { + "epoch": 0.7391984971822166, + "grad_norm": 0.9088883996009827, + "learning_rate": 6.569729553600522e-05, + "loss": 0.3049, + "step": 4722 + }, + { + "epoch": 0.739355040701315, + "grad_norm": 1.597267508506775, + "learning_rate": 6.56891495601173e-05, + "loss": 0.2177, + "step": 4723 + }, + { + "epoch": 0.7395115842204133, + "grad_norm": 2.298304557800293, + "learning_rate": 6.56810035842294e-05, + "loss": 0.9116, + "step": 4724 + }, + { + "epoch": 0.7396681277395116, + "grad_norm": 1.7569562196731567, + "learning_rate": 6.567285760834148e-05, + "loss": 0.6576, + "step": 4725 + }, + { + "epoch": 0.7398246712586098, + "grad_norm": 1.9308221340179443, + "learning_rate": 6.566471163245357e-05, + "loss": 0.6671, + "step": 4726 + }, + { + "epoch": 0.7399812147777082, + "grad_norm": 2.0804152488708496, + "learning_rate": 6.565656565656566e-05, + "loss": 0.509, + 
"step": 4727 + }, + { + "epoch": 0.7401377582968065, + "grad_norm": 5.8397111892700195, + "learning_rate": 6.564841968067775e-05, + "loss": 0.8094, + "step": 4728 + }, + { + "epoch": 0.7402943018159048, + "grad_norm": 2.985219717025757, + "learning_rate": 6.564027370478983e-05, + "loss": 0.6479, + "step": 4729 + }, + { + "epoch": 0.7404508453350032, + "grad_norm": 2.475395679473877, + "learning_rate": 6.563212772890193e-05, + "loss": 1.1029, + "step": 4730 + }, + { + "epoch": 0.7406073888541015, + "grad_norm": 3.9644997119903564, + "learning_rate": 6.562398175301401e-05, + "loss": 0.6553, + "step": 4731 + }, + { + "epoch": 0.7407639323731997, + "grad_norm": 5.173982620239258, + "learning_rate": 6.56158357771261e-05, + "loss": 1.0543, + "step": 4732 + }, + { + "epoch": 0.740920475892298, + "grad_norm": 3.656726837158203, + "learning_rate": 6.56076898012382e-05, + "loss": 0.8394, + "step": 4733 + }, + { + "epoch": 0.7410770194113964, + "grad_norm": 4.616371154785156, + "learning_rate": 6.559954382535028e-05, + "loss": 0.7162, + "step": 4734 + }, + { + "epoch": 0.7412335629304947, + "grad_norm": 3.4403295516967773, + "learning_rate": 6.559139784946236e-05, + "loss": 1.1081, + "step": 4735 + }, + { + "epoch": 0.741390106449593, + "grad_norm": 2.613797664642334, + "learning_rate": 6.558325187357446e-05, + "loss": 0.9742, + "step": 4736 + }, + { + "epoch": 0.7415466499686914, + "grad_norm": 3.041602373123169, + "learning_rate": 6.557510589768656e-05, + "loss": 1.0684, + "step": 4737 + }, + { + "epoch": 0.7417031934877896, + "grad_norm": 2.2397079467773438, + "learning_rate": 6.556695992179863e-05, + "loss": 0.8803, + "step": 4738 + }, + { + "epoch": 0.7418597370068879, + "grad_norm": 2.8632075786590576, + "learning_rate": 6.555881394591072e-05, + "loss": 1.063, + "step": 4739 + }, + { + "epoch": 0.7420162805259862, + "grad_norm": 4.082852840423584, + "learning_rate": 6.555066797002282e-05, + "loss": 1.2206, + "step": 4740 + }, + { + "epoch": 0.7421728240450846, + 
"grad_norm": 3.948521375656128, + "learning_rate": 6.554252199413489e-05, + "loss": 1.0654, + "step": 4741 + }, + { + "epoch": 0.7423293675641829, + "grad_norm": 2.840710401535034, + "learning_rate": 6.553437601824699e-05, + "loss": 1.3062, + "step": 4742 + }, + { + "epoch": 0.7424859110832811, + "grad_norm": 2.1377036571502686, + "learning_rate": 6.552623004235909e-05, + "loss": 0.6315, + "step": 4743 + }, + { + "epoch": 0.7426424546023794, + "grad_norm": 2.190688371658325, + "learning_rate": 6.551808406647117e-05, + "loss": 0.8288, + "step": 4744 + }, + { + "epoch": 0.7427989981214778, + "grad_norm": 3.033433437347412, + "learning_rate": 6.550993809058325e-05, + "loss": 1.4012, + "step": 4745 + }, + { + "epoch": 0.7429555416405761, + "grad_norm": 4.043979167938232, + "learning_rate": 6.550179211469535e-05, + "loss": 0.5211, + "step": 4746 + }, + { + "epoch": 0.7431120851596744, + "grad_norm": 2.824323892593384, + "learning_rate": 6.549364613880743e-05, + "loss": 0.6292, + "step": 4747 + }, + { + "epoch": 0.7432686286787727, + "grad_norm": 2.0989389419555664, + "learning_rate": 6.548550016291952e-05, + "loss": 0.9071, + "step": 4748 + }, + { + "epoch": 0.743425172197871, + "grad_norm": 3.9108405113220215, + "learning_rate": 6.547735418703162e-05, + "loss": 1.0703, + "step": 4749 + }, + { + "epoch": 0.7435817157169693, + "grad_norm": 3.472896099090576, + "learning_rate": 6.54692082111437e-05, + "loss": 0.8256, + "step": 4750 + }, + { + "epoch": 0.7437382592360676, + "grad_norm": 0.8685640692710876, + "learning_rate": 6.546106223525578e-05, + "loss": 0.2925, + "step": 4751 + }, + { + "epoch": 0.743894802755166, + "grad_norm": 0.5130553841590881, + "learning_rate": 6.545291625936788e-05, + "loss": 0.2486, + "step": 4752 + }, + { + "epoch": 0.7440513462742643, + "grad_norm": 0.6523815393447876, + "learning_rate": 6.544477028347996e-05, + "loss": 0.2785, + "step": 4753 + }, + { + "epoch": 0.7442078897933626, + "grad_norm": 0.6810485124588013, + "learning_rate": 
6.543662430759205e-05, + "loss": 0.3274, + "step": 4754 + }, + { + "epoch": 0.7443644333124608, + "grad_norm": 0.4992976486682892, + "learning_rate": 6.542847833170415e-05, + "loss": 0.1515, + "step": 4755 + }, + { + "epoch": 0.7445209768315592, + "grad_norm": 0.6672161221504211, + "learning_rate": 6.542033235581623e-05, + "loss": 0.2749, + "step": 4756 + }, + { + "epoch": 0.7446775203506575, + "grad_norm": 1.2143253087997437, + "learning_rate": 6.541218637992833e-05, + "loss": 0.2779, + "step": 4757 + }, + { + "epoch": 0.7448340638697558, + "grad_norm": 0.856577455997467, + "learning_rate": 6.540404040404041e-05, + "loss": 0.3519, + "step": 4758 + }, + { + "epoch": 0.7449906073888541, + "grad_norm": 0.8528262972831726, + "learning_rate": 6.53958944281525e-05, + "loss": 0.3374, + "step": 4759 + }, + { + "epoch": 0.7451471509079524, + "grad_norm": 1.0364097356796265, + "learning_rate": 6.538774845226459e-05, + "loss": 0.3946, + "step": 4760 + }, + { + "epoch": 0.7453036944270507, + "grad_norm": 1.1756216287612915, + "learning_rate": 6.537960247637667e-05, + "loss": 0.3586, + "step": 4761 + }, + { + "epoch": 0.745460237946149, + "grad_norm": 0.614716112613678, + "learning_rate": 6.537145650048876e-05, + "loss": 0.2929, + "step": 4762 + }, + { + "epoch": 0.7456167814652473, + "grad_norm": 3.524451732635498, + "learning_rate": 6.536331052460086e-05, + "loss": 0.5482, + "step": 4763 + }, + { + "epoch": 0.7457733249843457, + "grad_norm": 1.1706409454345703, + "learning_rate": 6.535516454871294e-05, + "loss": 0.4711, + "step": 4764 + }, + { + "epoch": 0.745929868503444, + "grad_norm": 0.9206258058547974, + "learning_rate": 6.534701857282502e-05, + "loss": 0.3078, + "step": 4765 + }, + { + "epoch": 0.7460864120225422, + "grad_norm": 3.309708833694458, + "learning_rate": 6.533887259693712e-05, + "loss": 1.1878, + "step": 4766 + }, + { + "epoch": 0.7462429555416406, + "grad_norm": 1.3692095279693604, + "learning_rate": 6.53307266210492e-05, + "loss": 0.5237, + "step": 4767 + 
}, + { + "epoch": 0.7463994990607389, + "grad_norm": 1.8444077968597412, + "learning_rate": 6.532258064516129e-05, + "loss": 0.5782, + "step": 4768 + }, + { + "epoch": 0.7465560425798372, + "grad_norm": 1.5097472667694092, + "learning_rate": 6.531443466927339e-05, + "loss": 0.8246, + "step": 4769 + }, + { + "epoch": 0.7467125860989355, + "grad_norm": 2.1871895790100098, + "learning_rate": 6.530628869338547e-05, + "loss": 0.4813, + "step": 4770 + }, + { + "epoch": 0.7468691296180339, + "grad_norm": 1.6015490293502808, + "learning_rate": 6.529814271749755e-05, + "loss": 0.6424, + "step": 4771 + }, + { + "epoch": 0.7470256731371321, + "grad_norm": 1.5530624389648438, + "learning_rate": 6.528999674160965e-05, + "loss": 0.5851, + "step": 4772 + }, + { + "epoch": 0.7471822166562304, + "grad_norm": 1.2208997011184692, + "learning_rate": 6.528185076572175e-05, + "loss": 0.555, + "step": 4773 + }, + { + "epoch": 0.7473387601753287, + "grad_norm": 1.9978177547454834, + "learning_rate": 6.527370478983382e-05, + "loss": 0.5138, + "step": 4774 + }, + { + "epoch": 0.7474953036944271, + "grad_norm": 1.5212359428405762, + "learning_rate": 6.526555881394592e-05, + "loss": 0.8343, + "step": 4775 + }, + { + "epoch": 0.7476518472135254, + "grad_norm": 1.6767089366912842, + "learning_rate": 6.525741283805801e-05, + "loss": 0.5526, + "step": 4776 + }, + { + "epoch": 0.7478083907326236, + "grad_norm": 1.751574158668518, + "learning_rate": 6.524926686217008e-05, + "loss": 0.8577, + "step": 4777 + }, + { + "epoch": 0.747964934251722, + "grad_norm": 2.014150619506836, + "learning_rate": 6.524112088628218e-05, + "loss": 0.4131, + "step": 4778 + }, + { + "epoch": 0.7481214777708203, + "grad_norm": 1.6118807792663574, + "learning_rate": 6.523297491039428e-05, + "loss": 0.4824, + "step": 4779 + }, + { + "epoch": 0.7482780212899186, + "grad_norm": 2.5635480880737305, + "learning_rate": 6.522482893450636e-05, + "loss": 0.7305, + "step": 4780 + }, + { + "epoch": 0.7484345648090169, + "grad_norm": 
2.8880295753479004, + "learning_rate": 6.521668295861844e-05, + "loss": 1.2592, + "step": 4781 + }, + { + "epoch": 0.7485911083281153, + "grad_norm": 2.3325014114379883, + "learning_rate": 6.520853698273054e-05, + "loss": 0.8738, + "step": 4782 + }, + { + "epoch": 0.7487476518472135, + "grad_norm": 2.357480525970459, + "learning_rate": 6.520039100684263e-05, + "loss": 0.9353, + "step": 4783 + }, + { + "epoch": 0.7489041953663118, + "grad_norm": 2.400451898574829, + "learning_rate": 6.519224503095471e-05, + "loss": 1.1273, + "step": 4784 + }, + { + "epoch": 0.7490607388854101, + "grad_norm": 3.804110288619995, + "learning_rate": 6.51840990550668e-05, + "loss": 1.1341, + "step": 4785 + }, + { + "epoch": 0.7492172824045085, + "grad_norm": 2.084383964538574, + "learning_rate": 6.517595307917889e-05, + "loss": 1.3952, + "step": 4786 + }, + { + "epoch": 0.7493738259236068, + "grad_norm": 3.265181064605713, + "learning_rate": 6.516780710329097e-05, + "loss": 1.1462, + "step": 4787 + }, + { + "epoch": 0.7495303694427051, + "grad_norm": 2.0713424682617188, + "learning_rate": 6.515966112740307e-05, + "loss": 0.9793, + "step": 4788 + }, + { + "epoch": 0.7496869129618033, + "grad_norm": 2.4927051067352295, + "learning_rate": 6.515151515151516e-05, + "loss": 0.9051, + "step": 4789 + }, + { + "epoch": 0.7498434564809017, + "grad_norm": 2.5559167861938477, + "learning_rate": 6.514336917562724e-05, + "loss": 1.5661, + "step": 4790 + }, + { + "epoch": 0.75, + "grad_norm": 3.6723506450653076, + "learning_rate": 6.513522319973934e-05, + "loss": 1.042, + "step": 4791 + }, + { + "epoch": 0.7501565435190983, + "grad_norm": 2.8272545337677, + "learning_rate": 6.512707722385142e-05, + "loss": 1.3083, + "step": 4792 + }, + { + "epoch": 0.7503130870381967, + "grad_norm": 2.4287972450256348, + "learning_rate": 6.51189312479635e-05, + "loss": 1.4264, + "step": 4793 + }, + { + "epoch": 0.7504696305572949, + "grad_norm": 3.6377670764923096, + "learning_rate": 6.51107852720756e-05, + "loss": 
1.603, + "step": 4794 + }, + { + "epoch": 0.7506261740763932, + "grad_norm": 3.492624521255493, + "learning_rate": 6.510263929618768e-05, + "loss": 1.2109, + "step": 4795 + }, + { + "epoch": 0.7507827175954915, + "grad_norm": 2.3533551692962646, + "learning_rate": 6.509449332029978e-05, + "loss": 0.4569, + "step": 4796 + }, + { + "epoch": 0.7509392611145899, + "grad_norm": 1.829797387123108, + "learning_rate": 6.508634734441187e-05, + "loss": 0.6655, + "step": 4797 + }, + { + "epoch": 0.7510958046336882, + "grad_norm": 4.232069492340088, + "learning_rate": 6.507820136852395e-05, + "loss": 1.0733, + "step": 4798 + }, + { + "epoch": 0.7512523481527865, + "grad_norm": 3.512695789337158, + "learning_rate": 6.507005539263605e-05, + "loss": 0.916, + "step": 4799 + }, + { + "epoch": 0.7514088916718847, + "grad_norm": 4.111003398895264, + "learning_rate": 6.506190941674813e-05, + "loss": 1.2821, + "step": 4800 + }, + { + "epoch": 0.7515654351909831, + "grad_norm": 0.6085000038146973, + "learning_rate": 6.505376344086021e-05, + "loss": 0.2784, + "step": 4801 + }, + { + "epoch": 0.7517219787100814, + "grad_norm": 0.4919755160808563, + "learning_rate": 6.504561746497231e-05, + "loss": 0.2174, + "step": 4802 + }, + { + "epoch": 0.7518785222291797, + "grad_norm": 0.6014512181282043, + "learning_rate": 6.50374714890844e-05, + "loss": 0.3156, + "step": 4803 + }, + { + "epoch": 0.752035065748278, + "grad_norm": 0.6035397052764893, + "learning_rate": 6.502932551319648e-05, + "loss": 0.2356, + "step": 4804 + }, + { + "epoch": 0.7521916092673764, + "grad_norm": 0.45744192600250244, + "learning_rate": 6.502117953730858e-05, + "loss": 0.2753, + "step": 4805 + }, + { + "epoch": 0.7523481527864746, + "grad_norm": 0.7924423813819885, + "learning_rate": 6.501303356142066e-05, + "loss": 0.3303, + "step": 4806 + }, + { + "epoch": 0.7525046963055729, + "grad_norm": 1.1337898969650269, + "learning_rate": 6.500488758553274e-05, + "loss": 0.3087, + "step": 4807 + }, + { + "epoch": 
0.7526612398246713, + "grad_norm": 1.081886887550354, + "learning_rate": 6.499674160964484e-05, + "loss": 0.3836, + "step": 4808 + }, + { + "epoch": 0.7528177833437696, + "grad_norm": 0.7402124404907227, + "learning_rate": 6.498859563375694e-05, + "loss": 0.3486, + "step": 4809 + }, + { + "epoch": 0.7529743268628679, + "grad_norm": 1.0022863149642944, + "learning_rate": 6.498044965786901e-05, + "loss": 0.2138, + "step": 4810 + }, + { + "epoch": 0.7531308703819661, + "grad_norm": 1.1986711025238037, + "learning_rate": 6.49723036819811e-05, + "loss": 0.4848, + "step": 4811 + }, + { + "epoch": 0.7532874139010645, + "grad_norm": 1.126493215560913, + "learning_rate": 6.49641577060932e-05, + "loss": 0.3719, + "step": 4812 + }, + { + "epoch": 0.7534439574201628, + "grad_norm": 0.9378973245620728, + "learning_rate": 6.495601173020527e-05, + "loss": 0.2974, + "step": 4813 + }, + { + "epoch": 0.7536005009392611, + "grad_norm": 1.924163579940796, + "learning_rate": 6.494786575431737e-05, + "loss": 0.3075, + "step": 4814 + }, + { + "epoch": 0.7537570444583594, + "grad_norm": 1.2932161092758179, + "learning_rate": 6.493971977842947e-05, + "loss": 0.4706, + "step": 4815 + }, + { + "epoch": 0.7539135879774578, + "grad_norm": 1.4578534364700317, + "learning_rate": 6.493157380254155e-05, + "loss": 0.513, + "step": 4816 + }, + { + "epoch": 0.754070131496556, + "grad_norm": 1.4161067008972168, + "learning_rate": 6.492342782665364e-05, + "loss": 0.4447, + "step": 4817 + }, + { + "epoch": 0.7542266750156543, + "grad_norm": 2.9668524265289307, + "learning_rate": 6.491528185076573e-05, + "loss": 0.5216, + "step": 4818 + }, + { + "epoch": 0.7543832185347527, + "grad_norm": 1.7271867990493774, + "learning_rate": 6.490713587487782e-05, + "loss": 0.3663, + "step": 4819 + }, + { + "epoch": 0.754539762053851, + "grad_norm": 1.7852696180343628, + "learning_rate": 6.48989898989899e-05, + "loss": 0.3113, + "step": 4820 + }, + { + "epoch": 0.7546963055729493, + "grad_norm": 2.116375207901001, + 
"learning_rate": 6.4890843923102e-05, + "loss": 0.4494, + "step": 4821 + }, + { + "epoch": 0.7548528490920476, + "grad_norm": 1.6538002490997314, + "learning_rate": 6.488269794721408e-05, + "loss": 0.5377, + "step": 4822 + }, + { + "epoch": 0.7550093926111459, + "grad_norm": 2.1452198028564453, + "learning_rate": 6.487455197132617e-05, + "loss": 0.529, + "step": 4823 + }, + { + "epoch": 0.7551659361302442, + "grad_norm": 2.1898319721221924, + "learning_rate": 6.486640599543826e-05, + "loss": 0.5898, + "step": 4824 + }, + { + "epoch": 0.7553224796493425, + "grad_norm": 2.1289799213409424, + "learning_rate": 6.485826001955035e-05, + "loss": 0.6889, + "step": 4825 + }, + { + "epoch": 0.7554790231684408, + "grad_norm": 2.324491024017334, + "learning_rate": 6.485011404366243e-05, + "loss": 0.6788, + "step": 4826 + }, + { + "epoch": 0.7556355666875392, + "grad_norm": 2.0605876445770264, + "learning_rate": 6.484196806777453e-05, + "loss": 0.6205, + "step": 4827 + }, + { + "epoch": 0.7557921102066374, + "grad_norm": 1.7951353788375854, + "learning_rate": 6.483382209188661e-05, + "loss": 0.6071, + "step": 4828 + }, + { + "epoch": 0.7559486537257357, + "grad_norm": 3.1372671127319336, + "learning_rate": 6.48256761159987e-05, + "loss": 0.5193, + "step": 4829 + }, + { + "epoch": 0.756105197244834, + "grad_norm": 4.107729911804199, + "learning_rate": 6.481753014011079e-05, + "loss": 0.8734, + "step": 4830 + }, + { + "epoch": 0.7562617407639324, + "grad_norm": 2.425840377807617, + "learning_rate": 6.480938416422288e-05, + "loss": 0.996, + "step": 4831 + }, + { + "epoch": 0.7564182842830307, + "grad_norm": 3.020906448364258, + "learning_rate": 6.480123818833497e-05, + "loss": 1.0177, + "step": 4832 + }, + { + "epoch": 0.756574827802129, + "grad_norm": 2.7140252590179443, + "learning_rate": 6.479309221244706e-05, + "loss": 0.9197, + "step": 4833 + }, + { + "epoch": 0.7567313713212273, + "grad_norm": 2.8187766075134277, + "learning_rate": 6.478494623655914e-05, + "loss": 0.6507, + 
"step": 4834 + }, + { + "epoch": 0.7568879148403256, + "grad_norm": 2.216740608215332, + "learning_rate": 6.477680026067124e-05, + "loss": 0.863, + "step": 4835 + }, + { + "epoch": 0.7570444583594239, + "grad_norm": 2.024322271347046, + "learning_rate": 6.476865428478332e-05, + "loss": 0.9116, + "step": 4836 + }, + { + "epoch": 0.7572010018785222, + "grad_norm": 3.397472620010376, + "learning_rate": 6.47605083088954e-05, + "loss": 1.1172, + "step": 4837 + }, + { + "epoch": 0.7573575453976206, + "grad_norm": 2.8809151649475098, + "learning_rate": 6.47523623330075e-05, + "loss": 0.9919, + "step": 4838 + }, + { + "epoch": 0.7575140889167189, + "grad_norm": 5.613738536834717, + "learning_rate": 6.474421635711959e-05, + "loss": 0.8303, + "step": 4839 + }, + { + "epoch": 0.7576706324358171, + "grad_norm": 4.941376209259033, + "learning_rate": 6.473607038123167e-05, + "loss": 1.0777, + "step": 4840 + }, + { + "epoch": 0.7578271759549154, + "grad_norm": 3.8242385387420654, + "learning_rate": 6.472792440534377e-05, + "loss": 1.2319, + "step": 4841 + }, + { + "epoch": 0.7579837194740138, + "grad_norm": 2.8608696460723877, + "learning_rate": 6.471977842945585e-05, + "loss": 1.2837, + "step": 4842 + }, + { + "epoch": 0.7581402629931121, + "grad_norm": 4.452634334564209, + "learning_rate": 6.471163245356794e-05, + "loss": 1.8232, + "step": 4843 + }, + { + "epoch": 0.7582968065122104, + "grad_norm": 1.9955354928970337, + "learning_rate": 6.470348647768003e-05, + "loss": 1.4785, + "step": 4844 + }, + { + "epoch": 0.7584533500313086, + "grad_norm": 3.534079074859619, + "learning_rate": 6.469534050179213e-05, + "loss": 1.292, + "step": 4845 + }, + { + "epoch": 0.758609893550407, + "grad_norm": 6.85938835144043, + "learning_rate": 6.46871945259042e-05, + "loss": 0.4599, + "step": 4846 + }, + { + "epoch": 0.7587664370695053, + "grad_norm": 2.686095714569092, + "learning_rate": 6.46790485500163e-05, + "loss": 0.9724, + "step": 4847 + }, + { + "epoch": 0.7589229805886036, + 
"grad_norm": 5.025091648101807, + "learning_rate": 6.46709025741284e-05, + "loss": 0.7952, + "step": 4848 + }, + { + "epoch": 0.759079524107702, + "grad_norm": 3.660395622253418, + "learning_rate": 6.466275659824046e-05, + "loss": 1.0058, + "step": 4849 + }, + { + "epoch": 0.7592360676268003, + "grad_norm": 2.1809301376342773, + "learning_rate": 6.465461062235256e-05, + "loss": 0.8774, + "step": 4850 + }, + { + "epoch": 0.7593926111458985, + "grad_norm": 0.6096743941307068, + "learning_rate": 6.464646464646466e-05, + "loss": 0.2264, + "step": 4851 + }, + { + "epoch": 0.7595491546649968, + "grad_norm": 0.4532240033149719, + "learning_rate": 6.463831867057673e-05, + "loss": 0.236, + "step": 4852 + }, + { + "epoch": 0.7597056981840952, + "grad_norm": 0.40098637342453003, + "learning_rate": 6.463017269468883e-05, + "loss": 0.2394, + "step": 4853 + }, + { + "epoch": 0.7598622417031935, + "grad_norm": 0.44564446806907654, + "learning_rate": 6.462202671880092e-05, + "loss": 0.1843, + "step": 4854 + }, + { + "epoch": 0.7600187852222918, + "grad_norm": 1.0658234357833862, + "learning_rate": 6.461388074291301e-05, + "loss": 0.4509, + "step": 4855 + }, + { + "epoch": 0.7601753287413902, + "grad_norm": 0.5774534940719604, + "learning_rate": 6.460573476702509e-05, + "loss": 0.2592, + "step": 4856 + }, + { + "epoch": 0.7603318722604884, + "grad_norm": 0.5653113722801208, + "learning_rate": 6.459758879113719e-05, + "loss": 0.2718, + "step": 4857 + }, + { + "epoch": 0.7604884157795867, + "grad_norm": 0.7485427856445312, + "learning_rate": 6.458944281524927e-05, + "loss": 0.2292, + "step": 4858 + }, + { + "epoch": 0.760644959298685, + "grad_norm": 0.7683144807815552, + "learning_rate": 6.458129683936136e-05, + "loss": 0.3597, + "step": 4859 + }, + { + "epoch": 0.7608015028177834, + "grad_norm": 0.8077671527862549, + "learning_rate": 6.457315086347344e-05, + "loss": 0.4272, + "step": 4860 + }, + { + "epoch": 0.7609580463368817, + "grad_norm": 0.9202790260314941, + "learning_rate": 
6.456500488758554e-05, + "loss": 0.4369, + "step": 4861 + }, + { + "epoch": 0.76111458985598, + "grad_norm": 1.1608312129974365, + "learning_rate": 6.455685891169762e-05, + "loss": 0.4509, + "step": 4862 + }, + { + "epoch": 0.7612711333750782, + "grad_norm": 2.2180731296539307, + "learning_rate": 6.45487129358097e-05, + "loss": 0.4772, + "step": 4863 + }, + { + "epoch": 0.7614276768941766, + "grad_norm": 1.9489948749542236, + "learning_rate": 6.45405669599218e-05, + "loss": 0.4496, + "step": 4864 + }, + { + "epoch": 0.7615842204132749, + "grad_norm": 1.5381250381469727, + "learning_rate": 6.453242098403389e-05, + "loss": 0.6925, + "step": 4865 + }, + { + "epoch": 0.7617407639323732, + "grad_norm": 1.3331910371780396, + "learning_rate": 6.452427500814597e-05, + "loss": 0.3162, + "step": 4866 + }, + { + "epoch": 0.7618973074514716, + "grad_norm": 1.4507498741149902, + "learning_rate": 6.451612903225807e-05, + "loss": 0.5176, + "step": 4867 + }, + { + "epoch": 0.7620538509705698, + "grad_norm": 2.1762688159942627, + "learning_rate": 6.450798305637016e-05, + "loss": 0.6366, + "step": 4868 + }, + { + "epoch": 0.7622103944896681, + "grad_norm": 2.504751682281494, + "learning_rate": 6.449983708048223e-05, + "loss": 0.7117, + "step": 4869 + }, + { + "epoch": 0.7623669380087664, + "grad_norm": 1.6231107711791992, + "learning_rate": 6.449169110459433e-05, + "loss": 0.5526, + "step": 4870 + }, + { + "epoch": 0.7625234815278648, + "grad_norm": 2.183051824569702, + "learning_rate": 6.448354512870643e-05, + "loss": 0.4324, + "step": 4871 + }, + { + "epoch": 0.7626800250469631, + "grad_norm": 1.2351131439208984, + "learning_rate": 6.44753991528185e-05, + "loss": 0.3661, + "step": 4872 + }, + { + "epoch": 0.7628365685660614, + "grad_norm": 1.4733872413635254, + "learning_rate": 6.44672531769306e-05, + "loss": 0.4601, + "step": 4873 + }, + { + "epoch": 0.7629931120851596, + "grad_norm": 3.0454819202423096, + "learning_rate": 6.44591072010427e-05, + "loss": 0.8904, + "step": 4874 + 
}, + { + "epoch": 0.763149655604258, + "grad_norm": 1.8598064184188843, + "learning_rate": 6.445096122515478e-05, + "loss": 0.5661, + "step": 4875 + }, + { + "epoch": 0.7633061991233563, + "grad_norm": 1.177088737487793, + "learning_rate": 6.444281524926686e-05, + "loss": 0.4304, + "step": 4876 + }, + { + "epoch": 0.7634627426424546, + "grad_norm": 3.347585678100586, + "learning_rate": 6.443466927337896e-05, + "loss": 0.6821, + "step": 4877 + }, + { + "epoch": 0.763619286161553, + "grad_norm": 1.6210945844650269, + "learning_rate": 6.442652329749104e-05, + "loss": 0.4725, + "step": 4878 + }, + { + "epoch": 0.7637758296806513, + "grad_norm": 3.683291435241699, + "learning_rate": 6.441837732160313e-05, + "loss": 0.7417, + "step": 4879 + }, + { + "epoch": 0.7639323731997495, + "grad_norm": 2.9106688499450684, + "learning_rate": 6.441023134571522e-05, + "loss": 0.8232, + "step": 4880 + }, + { + "epoch": 0.7640889167188478, + "grad_norm": 2.3701815605163574, + "learning_rate": 6.440208536982731e-05, + "loss": 0.74, + "step": 4881 + }, + { + "epoch": 0.7642454602379462, + "grad_norm": 3.109884023666382, + "learning_rate": 6.439393939393939e-05, + "loss": 0.9258, + "step": 4882 + }, + { + "epoch": 0.7644020037570445, + "grad_norm": 3.0632503032684326, + "learning_rate": 6.438579341805149e-05, + "loss": 1.1097, + "step": 4883 + }, + { + "epoch": 0.7645585472761428, + "grad_norm": 3.366041898727417, + "learning_rate": 6.437764744216357e-05, + "loss": 1.0313, + "step": 4884 + }, + { + "epoch": 0.764715090795241, + "grad_norm": 9.39246654510498, + "learning_rate": 6.436950146627566e-05, + "loss": 0.917, + "step": 4885 + }, + { + "epoch": 0.7648716343143394, + "grad_norm": 5.430665016174316, + "learning_rate": 6.436135549038775e-05, + "loss": 0.7755, + "step": 4886 + }, + { + "epoch": 0.7650281778334377, + "grad_norm": 2.7117135524749756, + "learning_rate": 6.435320951449984e-05, + "loss": 0.8336, + "step": 4887 + }, + { + "epoch": 0.765184721352536, + "grad_norm": 
2.2599639892578125, + "learning_rate": 6.434506353861192e-05, + "loss": 1.1356, + "step": 4888 + }, + { + "epoch": 0.7653412648716343, + "grad_norm": 3.0391757488250732, + "learning_rate": 6.433691756272402e-05, + "loss": 1.1681, + "step": 4889 + }, + { + "epoch": 0.7654978083907327, + "grad_norm": 4.346817970275879, + "learning_rate": 6.43287715868361e-05, + "loss": 0.7632, + "step": 4890 + }, + { + "epoch": 0.7656543519098309, + "grad_norm": 2.938584089279175, + "learning_rate": 6.43206256109482e-05, + "loss": 1.3043, + "step": 4891 + }, + { + "epoch": 0.7658108954289292, + "grad_norm": 2.0257248878479004, + "learning_rate": 6.431247963506028e-05, + "loss": 0.7329, + "step": 4892 + }, + { + "epoch": 0.7659674389480275, + "grad_norm": 2.089578866958618, + "learning_rate": 6.430433365917237e-05, + "loss": 1.087, + "step": 4893 + }, + { + "epoch": 0.7661239824671259, + "grad_norm": 2.492358684539795, + "learning_rate": 6.429618768328446e-05, + "loss": 1.3927, + "step": 4894 + }, + { + "epoch": 0.7662805259862242, + "grad_norm": 3.2081849575042725, + "learning_rate": 6.428804170739655e-05, + "loss": 1.9312, + "step": 4895 + }, + { + "epoch": 0.7664370695053225, + "grad_norm": 1.829896330833435, + "learning_rate": 6.427989573150863e-05, + "loss": 0.8145, + "step": 4896 + }, + { + "epoch": 0.7665936130244208, + "grad_norm": NaN, + "learning_rate": 6.427989573150863e-05, + "loss": 0.0, + "step": 4897 + }, + { + "epoch": 0.7667501565435191, + "grad_norm": 2.590881586074829, + "learning_rate": 6.427174975562073e-05, + "loss": 0.8922, + "step": 4898 + }, + { + "epoch": 0.7669067000626174, + "grad_norm": 4.2230753898620605, + "learning_rate": 6.426360377973281e-05, + "loss": 0.753, + "step": 4899 + }, + { + "epoch": 0.7670632435817157, + "grad_norm": 2.305063486099243, + "learning_rate": 6.42554578038449e-05, + "loss": 0.5332, + "step": 4900 + }, + { + "epoch": 0.7672197871008141, + "grad_norm": 0.7517115473747253, + "learning_rate": 6.4247311827957e-05, + "loss": 0.32, + 
"step": 4901 + }, + { + "epoch": 0.7673763306199123, + "grad_norm": 0.5889120697975159, + "learning_rate": 6.423916585206908e-05, + "loss": 0.3047, + "step": 4902 + }, + { + "epoch": 0.7675328741390106, + "grad_norm": 0.6595581769943237, + "learning_rate": 6.423101987618116e-05, + "loss": 0.2838, + "step": 4903 + }, + { + "epoch": 0.7676894176581089, + "grad_norm": 2.008385419845581, + "learning_rate": 6.422287390029326e-05, + "loss": 0.5608, + "step": 4904 + }, + { + "epoch": 0.7678459611772073, + "grad_norm": 0.7125493884086609, + "learning_rate": 6.421472792440536e-05, + "loss": 0.2508, + "step": 4905 + }, + { + "epoch": 0.7680025046963056, + "grad_norm": 1.2356780767440796, + "learning_rate": 6.420658194851743e-05, + "loss": 0.3922, + "step": 4906 + }, + { + "epoch": 0.7681590482154039, + "grad_norm": 1.720711588859558, + "learning_rate": 6.419843597262952e-05, + "loss": 0.2207, + "step": 4907 + }, + { + "epoch": 0.7683155917345021, + "grad_norm": 0.7716761827468872, + "learning_rate": 6.419028999674162e-05, + "loss": 0.3234, + "step": 4908 + }, + { + "epoch": 0.7684721352536005, + "grad_norm": 0.6555934548377991, + "learning_rate": 6.418214402085369e-05, + "loss": 0.2672, + "step": 4909 + }, + { + "epoch": 0.7686286787726988, + "grad_norm": 0.9979533553123474, + "learning_rate": 6.417399804496579e-05, + "loss": 0.4912, + "step": 4910 + }, + { + "epoch": 0.7687852222917971, + "grad_norm": 0.7542836666107178, + "learning_rate": 6.416585206907789e-05, + "loss": 0.3585, + "step": 4911 + }, + { + "epoch": 0.7689417658108955, + "grad_norm": 1.205335259437561, + "learning_rate": 6.415770609318996e-05, + "loss": 0.491, + "step": 4912 + }, + { + "epoch": 0.7690983093299938, + "grad_norm": 1.4040656089782715, + "learning_rate": 6.414956011730205e-05, + "loss": 0.4164, + "step": 4913 + }, + { + "epoch": 0.769254852849092, + "grad_norm": 0.7652807235717773, + "learning_rate": 6.414141414141415e-05, + "loss": 0.2663, + "step": 4914 + }, + { + "epoch": 0.7694113963681903, + 
"grad_norm": 1.7695695161819458, + "learning_rate": 6.413326816552623e-05, + "loss": 0.3779, + "step": 4915 + }, + { + "epoch": 0.7695679398872887, + "grad_norm": 1.196907639503479, + "learning_rate": 6.412512218963832e-05, + "loss": 0.2942, + "step": 4916 + }, + { + "epoch": 0.769724483406387, + "grad_norm": 1.3101022243499756, + "learning_rate": 6.411697621375041e-05, + "loss": 0.4524, + "step": 4917 + }, + { + "epoch": 0.7698810269254853, + "grad_norm": 1.93929123878479, + "learning_rate": 6.41088302378625e-05, + "loss": 0.6556, + "step": 4918 + }, + { + "epoch": 0.7700375704445835, + "grad_norm": 1.8864033222198486, + "learning_rate": 6.410068426197458e-05, + "loss": 0.4298, + "step": 4919 + }, + { + "epoch": 0.7701941139636819, + "grad_norm": 1.5498677492141724, + "learning_rate": 6.409253828608668e-05, + "loss": 0.4391, + "step": 4920 + }, + { + "epoch": 0.7703506574827802, + "grad_norm": 2.040987968444824, + "learning_rate": 6.408439231019876e-05, + "loss": 0.5407, + "step": 4921 + }, + { + "epoch": 0.7705072010018785, + "grad_norm": 2.1546471118927, + "learning_rate": 6.407624633431085e-05, + "loss": 0.5596, + "step": 4922 + }, + { + "epoch": 0.7706637445209769, + "grad_norm": 2.234776735305786, + "learning_rate": 6.406810035842294e-05, + "loss": 0.7307, + "step": 4923 + }, + { + "epoch": 0.7708202880400752, + "grad_norm": 2.2175040245056152, + "learning_rate": 6.405995438253503e-05, + "loss": 0.5681, + "step": 4924 + }, + { + "epoch": 0.7709768315591734, + "grad_norm": 2.1105751991271973, + "learning_rate": 6.405180840664711e-05, + "loss": 0.3475, + "step": 4925 + }, + { + "epoch": 0.7711333750782717, + "grad_norm": 1.8572533130645752, + "learning_rate": 6.404366243075921e-05, + "loss": 0.8765, + "step": 4926 + }, + { + "epoch": 0.7712899185973701, + "grad_norm": 2.215799331665039, + "learning_rate": 6.403551645487129e-05, + "loss": 0.9286, + "step": 4927 + }, + { + "epoch": 0.7714464621164684, + "grad_norm": 9.3485107421875, + "learning_rate": 
6.402737047898339e-05, + "loss": 1.2464, + "step": 4928 + }, + { + "epoch": 0.7716030056355667, + "grad_norm": 1.4729753732681274, + "learning_rate": 6.401922450309547e-05, + "loss": 0.6221, + "step": 4929 + }, + { + "epoch": 0.771759549154665, + "grad_norm": 1.8866816759109497, + "learning_rate": 6.401107852720756e-05, + "loss": 0.4825, + "step": 4930 + }, + { + "epoch": 0.7719160926737633, + "grad_norm": 3.88224458694458, + "learning_rate": 6.400293255131966e-05, + "loss": 0.912, + "step": 4931 + }, + { + "epoch": 0.7720726361928616, + "grad_norm": 3.0478696823120117, + "learning_rate": 6.399478657543174e-05, + "loss": 0.8617, + "step": 4932 + }, + { + "epoch": 0.7722291797119599, + "grad_norm": 2.57456636428833, + "learning_rate": 6.398664059954382e-05, + "loss": 0.5162, + "step": 4933 + }, + { + "epoch": 0.7723857232310583, + "grad_norm": 4.4192328453063965, + "learning_rate": 6.397849462365592e-05, + "loss": 1.2102, + "step": 4934 + }, + { + "epoch": 0.7725422667501566, + "grad_norm": 2.173511028289795, + "learning_rate": 6.3970348647768e-05, + "loss": 0.7131, + "step": 4935 + }, + { + "epoch": 0.7726988102692548, + "grad_norm": 2.907986640930176, + "learning_rate": 6.396220267188009e-05, + "loss": 0.8409, + "step": 4936 + }, + { + "epoch": 0.7728553537883531, + "grad_norm": 2.880525827407837, + "learning_rate": 6.395405669599218e-05, + "loss": 1.2662, + "step": 4937 + }, + { + "epoch": 0.7730118973074515, + "grad_norm": 2.610398054122925, + "learning_rate": 6.394591072010427e-05, + "loss": 1.1771, + "step": 4938 + }, + { + "epoch": 0.7731684408265498, + "grad_norm": 2.119020462036133, + "learning_rate": 6.393776474421635e-05, + "loss": 0.7328, + "step": 4939 + }, + { + "epoch": 0.7733249843456481, + "grad_norm": 2.1824069023132324, + "learning_rate": 6.392961876832845e-05, + "loss": 0.9327, + "step": 4940 + }, + { + "epoch": 0.7734815278647464, + "grad_norm": 2.942899703979492, + "learning_rate": 6.392147279244053e-05, + "loss": 1.0775, + "step": 4941 + }, + 
{ + "epoch": 0.7736380713838447, + "grad_norm": 2.6131272315979004, + "learning_rate": 6.391332681655262e-05, + "loss": 1.0736, + "step": 4942 + }, + { + "epoch": 0.773794614902943, + "grad_norm": 3.4046826362609863, + "learning_rate": 6.390518084066471e-05, + "loss": 1.0089, + "step": 4943 + }, + { + "epoch": 0.7739511584220413, + "grad_norm": 4.588470458984375, + "learning_rate": 6.389703486477681e-05, + "loss": 1.2714, + "step": 4944 + }, + { + "epoch": 0.7741077019411396, + "grad_norm": 2.40347957611084, + "learning_rate": 6.388888888888888e-05, + "loss": 1.6361, + "step": 4945 + }, + { + "epoch": 0.774264245460238, + "grad_norm": 1.9076412916183472, + "learning_rate": 6.388074291300098e-05, + "loss": 0.6412, + "step": 4946 + }, + { + "epoch": 0.7744207889793363, + "grad_norm": 2.846426248550415, + "learning_rate": 6.387259693711308e-05, + "loss": 0.8389, + "step": 4947 + }, + { + "epoch": 0.7745773324984345, + "grad_norm": 4.058146953582764, + "learning_rate": 6.386445096122515e-05, + "loss": 0.8888, + "step": 4948 + }, + { + "epoch": 0.7747338760175329, + "grad_norm": 3.269162178039551, + "learning_rate": 6.385630498533724e-05, + "loss": 0.8558, + "step": 4949 + }, + { + "epoch": 0.7748904195366312, + "grad_norm": 1.6662254333496094, + "learning_rate": 6.384815900944934e-05, + "loss": 0.5839, + "step": 4950 + }, + { + "epoch": 0.7750469630557295, + "grad_norm": 0.6740942597389221, + "learning_rate": 6.384001303356142e-05, + "loss": 0.2996, + "step": 4951 + }, + { + "epoch": 0.7752035065748278, + "grad_norm": 1.2687697410583496, + "learning_rate": 6.383186705767351e-05, + "loss": 0.6998, + "step": 4952 + }, + { + "epoch": 0.7753600500939261, + "grad_norm": 0.8134473562240601, + "learning_rate": 6.38237210817856e-05, + "loss": 0.3369, + "step": 4953 + }, + { + "epoch": 0.7755165936130244, + "grad_norm": 0.4649645686149597, + "learning_rate": 6.381557510589769e-05, + "loss": 0.1808, + "step": 4954 + }, + { + "epoch": 0.7756731371321227, + "grad_norm": 
1.0419833660125732, + "learning_rate": 6.380742913000977e-05, + "loss": 0.4444, + "step": 4955 + }, + { + "epoch": 0.775829680651221, + "grad_norm": 0.8530073165893555, + "learning_rate": 6.379928315412187e-05, + "loss": 0.3997, + "step": 4956 + }, + { + "epoch": 0.7759862241703194, + "grad_norm": 1.3897801637649536, + "learning_rate": 6.379113717823395e-05, + "loss": 0.38, + "step": 4957 + }, + { + "epoch": 0.7761427676894177, + "grad_norm": 0.9212905764579773, + "learning_rate": 6.378299120234604e-05, + "loss": 0.3362, + "step": 4958 + }, + { + "epoch": 0.7762993112085159, + "grad_norm": 0.973595917224884, + "learning_rate": 6.377484522645814e-05, + "loss": 0.4317, + "step": 4959 + }, + { + "epoch": 0.7764558547276142, + "grad_norm": 0.8121359348297119, + "learning_rate": 6.376669925057022e-05, + "loss": 0.2593, + "step": 4960 + }, + { + "epoch": 0.7766123982467126, + "grad_norm": 1.2019002437591553, + "learning_rate": 6.37585532746823e-05, + "loss": 0.3388, + "step": 4961 + }, + { + "epoch": 0.7767689417658109, + "grad_norm": 1.0539467334747314, + "learning_rate": 6.37504072987944e-05, + "loss": 0.3663, + "step": 4962 + }, + { + "epoch": 0.7769254852849092, + "grad_norm": 0.9664640426635742, + "learning_rate": 6.374226132290648e-05, + "loss": 0.4636, + "step": 4963 + }, + { + "epoch": 0.7770820288040076, + "grad_norm": 7.855664253234863, + "learning_rate": 6.373411534701858e-05, + "loss": 1.4173, + "step": 4964 + }, + { + "epoch": 0.7772385723231058, + "grad_norm": 1.5810894966125488, + "learning_rate": 6.372596937113067e-05, + "loss": 0.4415, + "step": 4965 + }, + { + "epoch": 0.7773951158422041, + "grad_norm": 2.227142572402954, + "learning_rate": 6.371782339524275e-05, + "loss": 0.3489, + "step": 4966 + }, + { + "epoch": 0.7775516593613024, + "grad_norm": 1.2681872844696045, + "learning_rate": 6.370967741935485e-05, + "loss": 0.4835, + "step": 4967 + }, + { + "epoch": 0.7777082028804008, + "grad_norm": 2.072190046310425, + "learning_rate": 
6.370153144346693e-05, + "loss": 0.3894, + "step": 4968 + }, + { + "epoch": 0.7778647463994991, + "grad_norm": 1.5362502336502075, + "learning_rate": 6.369338546757901e-05, + "loss": 0.4358, + "step": 4969 + }, + { + "epoch": 0.7780212899185974, + "grad_norm": 2.1481027603149414, + "learning_rate": 6.368523949169111e-05, + "loss": 0.4104, + "step": 4970 + }, + { + "epoch": 0.7781778334376956, + "grad_norm": 2.56472110748291, + "learning_rate": 6.36770935158032e-05, + "loss": 0.6535, + "step": 4971 + }, + { + "epoch": 0.778334376956794, + "grad_norm": 0.9203495383262634, + "learning_rate": 6.366894753991528e-05, + "loss": 0.4544, + "step": 4972 + }, + { + "epoch": 0.7784909204758923, + "grad_norm": 1.153438925743103, + "learning_rate": 6.366080156402738e-05, + "loss": 0.4292, + "step": 4973 + }, + { + "epoch": 0.7786474639949906, + "grad_norm": 2.0540740489959717, + "learning_rate": 6.365265558813946e-05, + "loss": 0.3867, + "step": 4974 + }, + { + "epoch": 0.778804007514089, + "grad_norm": 2.4637601375579834, + "learning_rate": 6.364450961225154e-05, + "loss": 0.3917, + "step": 4975 + }, + { + "epoch": 0.7789605510331872, + "grad_norm": 3.519160032272339, + "learning_rate": 6.363636363636364e-05, + "loss": 1.3033, + "step": 4976 + }, + { + "epoch": 0.7791170945522855, + "grad_norm": 2.331962823867798, + "learning_rate": 6.362821766047572e-05, + "loss": 0.7409, + "step": 4977 + }, + { + "epoch": 0.7792736380713838, + "grad_norm": 2.5789718627929688, + "learning_rate": 6.362007168458781e-05, + "loss": 0.8158, + "step": 4978 + }, + { + "epoch": 0.7794301815904822, + "grad_norm": 2.785104751586914, + "learning_rate": 6.36119257086999e-05, + "loss": 1.1613, + "step": 4979 + }, + { + "epoch": 0.7795867251095805, + "grad_norm": 2.911665678024292, + "learning_rate": 6.3603779732812e-05, + "loss": 0.6027, + "step": 4980 + }, + { + "epoch": 0.7797432686286788, + "grad_norm": 3.0279858112335205, + "learning_rate": 6.359563375692407e-05, + "loss": 0.8664, + "step": 4981 + }, + 
{ + "epoch": 0.779899812147777, + "grad_norm": 3.167090654373169, + "learning_rate": 6.358748778103617e-05, + "loss": 0.8662, + "step": 4982 + }, + { + "epoch": 0.7800563556668754, + "grad_norm": 1.9860167503356934, + "learning_rate": 6.357934180514827e-05, + "loss": 0.5197, + "step": 4983 + }, + { + "epoch": 0.7802128991859737, + "grad_norm": 4.838427543640137, + "learning_rate": 6.357119582926034e-05, + "loss": 1.2965, + "step": 4984 + }, + { + "epoch": 0.780369442705072, + "grad_norm": 5.077131748199463, + "learning_rate": 6.356304985337244e-05, + "loss": 1.022, + "step": 4985 + }, + { + "epoch": 0.7805259862241704, + "grad_norm": 2.986581563949585, + "learning_rate": 6.355490387748453e-05, + "loss": 0.9619, + "step": 4986 + }, + { + "epoch": 0.7806825297432687, + "grad_norm": 2.59447979927063, + "learning_rate": 6.354675790159662e-05, + "loss": 1.2224, + "step": 4987 + }, + { + "epoch": 0.7808390732623669, + "grad_norm": 3.351933002471924, + "learning_rate": 6.35386119257087e-05, + "loss": 1.3135, + "step": 4988 + }, + { + "epoch": 0.7809956167814652, + "grad_norm": 3.4256985187530518, + "learning_rate": 6.35304659498208e-05, + "loss": 1.1606, + "step": 4989 + }, + { + "epoch": 0.7811521603005636, + "grad_norm": 2.6116297245025635, + "learning_rate": 6.352231997393288e-05, + "loss": 0.918, + "step": 4990 + }, + { + "epoch": 0.7813087038196619, + "grad_norm": 3.58426833152771, + "learning_rate": 6.351417399804496e-05, + "loss": 1.2197, + "step": 4991 + }, + { + "epoch": 0.7814652473387602, + "grad_norm": 3.7549238204956055, + "learning_rate": 6.350602802215706e-05, + "loss": 1.4176, + "step": 4992 + }, + { + "epoch": 0.7816217908578584, + "grad_norm": 5.245760917663574, + "learning_rate": 6.349788204626915e-05, + "loss": 0.83, + "step": 4993 + }, + { + "epoch": 0.7817783343769568, + "grad_norm": 2.7488627433776855, + "learning_rate": 6.348973607038123e-05, + "loss": 1.2137, + "step": 4994 + }, + { + "epoch": 0.7819348778960551, + "grad_norm": 2.4260141849517822, 
+ "learning_rate": 6.348159009449333e-05, + "loss": 1.2441, + "step": 4995 + }, + { + "epoch": 0.7820914214151534, + "grad_norm": 4.675817489624023, + "learning_rate": 6.347344411860541e-05, + "loss": 0.7472, + "step": 4996 + }, + { + "epoch": 0.7822479649342517, + "grad_norm": 5.772819995880127, + "learning_rate": 6.34652981427175e-05, + "loss": 0.764, + "step": 4997 + }, + { + "epoch": 0.7824045084533501, + "grad_norm": 3.326655149459839, + "learning_rate": 6.345715216682959e-05, + "loss": 0.8156, + "step": 4998 + }, + { + "epoch": 0.7825610519724483, + "grad_norm": 2.0741796493530273, + "learning_rate": 6.344900619094168e-05, + "loss": 0.6492, + "step": 4999 + }, + { + "epoch": 0.7827175954915466, + "grad_norm": 2.43733811378479, + "learning_rate": 6.344086021505376e-05, + "loss": 0.8011, + "step": 5000 + }, + { + "epoch": 0.7827175954915466, + "eval_loss": 0.5288154482841492, + "eval_runtime": 204.0938, + "eval_samples_per_second": 60.673, + "eval_steps_per_second": 3.792, + "eval_wer": 0.3381537284569777, + "step": 5000 + }, + { + "epoch": 0.782874139010645, + "grad_norm": 0.8928443789482117, + "learning_rate": 6.343271423916586e-05, + "loss": 0.4537, + "step": 5001 + }, + { + "epoch": 0.7830306825297433, + "grad_norm": 0.5538212060928345, + "learning_rate": 6.342456826327794e-05, + "loss": 0.2446, + "step": 5002 + }, + { + "epoch": 0.7831872260488416, + "grad_norm": 0.6514049768447876, + "learning_rate": 6.341642228739004e-05, + "loss": 0.2307, + "step": 5003 + }, + { + "epoch": 0.7833437695679399, + "grad_norm": 0.7799990773200989, + "learning_rate": 6.340827631150212e-05, + "loss": 0.2149, + "step": 5004 + }, + { + "epoch": 0.7835003130870382, + "grad_norm": 0.83309006690979, + "learning_rate": 6.34001303356142e-05, + "loss": 0.3174, + "step": 5005 + }, + { + "epoch": 0.7836568566061365, + "grad_norm": 0.6411677598953247, + "learning_rate": 6.33919843597263e-05, + "loss": 0.1866, + "step": 5006 + }, + { + "epoch": 0.7838134001252348, + "grad_norm": 
0.8295854330062866, + "learning_rate": 6.338383838383839e-05, + "loss": 0.2523, + "step": 5007 + }, + { + "epoch": 0.7839699436443331, + "grad_norm": 0.7609614729881287, + "learning_rate": 6.337569240795047e-05, + "loss": 0.2887, + "step": 5008 + }, + { + "epoch": 0.7841264871634315, + "grad_norm": 0.9332055449485779, + "learning_rate": 6.336754643206257e-05, + "loss": 0.3538, + "step": 5009 + }, + { + "epoch": 0.7842830306825297, + "grad_norm": 0.7943589687347412, + "learning_rate": 6.335940045617465e-05, + "loss": 0.2595, + "step": 5010 + }, + { + "epoch": 0.784439574201628, + "grad_norm": 0.7718414068222046, + "learning_rate": 6.335125448028673e-05, + "loss": 0.3088, + "step": 5011 + }, + { + "epoch": 0.7845961177207263, + "grad_norm": 2.2170569896698, + "learning_rate": 6.334310850439883e-05, + "loss": 0.6548, + "step": 5012 + }, + { + "epoch": 0.7847526612398247, + "grad_norm": 0.9952752590179443, + "learning_rate": 6.333496252851092e-05, + "loss": 0.529, + "step": 5013 + }, + { + "epoch": 0.784909204758923, + "grad_norm": 1.1256723403930664, + "learning_rate": 6.3326816552623e-05, + "loss": 0.4328, + "step": 5014 + }, + { + "epoch": 0.7850657482780213, + "grad_norm": 1.1400662660598755, + "learning_rate": 6.33186705767351e-05, + "loss": 0.4821, + "step": 5015 + }, + { + "epoch": 0.7852222917971196, + "grad_norm": 2.2570650577545166, + "learning_rate": 6.33105246008472e-05, + "loss": 0.5994, + "step": 5016 + }, + { + "epoch": 0.7853788353162179, + "grad_norm": 0.7826789021492004, + "learning_rate": 6.330237862495926e-05, + "loss": 0.2492, + "step": 5017 + }, + { + "epoch": 0.7855353788353162, + "grad_norm": 2.862241268157959, + "learning_rate": 6.329423264907136e-05, + "loss": 0.4616, + "step": 5018 + }, + { + "epoch": 0.7856919223544145, + "grad_norm": 1.410860538482666, + "learning_rate": 6.328608667318346e-05, + "loss": 0.5328, + "step": 5019 + }, + { + "epoch": 0.7858484658735129, + "grad_norm": 1.5059651136398315, + "learning_rate": 6.327794069729553e-05, 
+ "loss": 0.5831, + "step": 5020 + }, + { + "epoch": 0.7860050093926112, + "grad_norm": 1.6701918840408325, + "learning_rate": 6.326979472140763e-05, + "loss": 0.4997, + "step": 5021 + }, + { + "epoch": 0.7861615529117094, + "grad_norm": 1.7379719018936157, + "learning_rate": 6.326164874551972e-05, + "loss": 0.3692, + "step": 5022 + }, + { + "epoch": 0.7863180964308077, + "grad_norm": 2.5276622772216797, + "learning_rate": 6.325350276963181e-05, + "loss": 0.6604, + "step": 5023 + }, + { + "epoch": 0.7864746399499061, + "grad_norm": 1.8474514484405518, + "learning_rate": 6.324535679374389e-05, + "loss": 0.4008, + "step": 5024 + }, + { + "epoch": 0.7866311834690044, + "grad_norm": 2.4855966567993164, + "learning_rate": 6.323721081785599e-05, + "loss": 0.6807, + "step": 5025 + }, + { + "epoch": 0.7867877269881027, + "grad_norm": 3.7951228618621826, + "learning_rate": 6.322906484196807e-05, + "loss": 0.8768, + "step": 5026 + }, + { + "epoch": 0.786944270507201, + "grad_norm": 1.8104325532913208, + "learning_rate": 6.322091886608016e-05, + "loss": 0.5704, + "step": 5027 + }, + { + "epoch": 0.7871008140262993, + "grad_norm": 2.525326728820801, + "learning_rate": 6.321277289019225e-05, + "loss": 1.001, + "step": 5028 + }, + { + "epoch": 0.7872573575453976, + "grad_norm": 2.0678927898406982, + "learning_rate": 6.320462691430434e-05, + "loss": 0.6141, + "step": 5029 + }, + { + "epoch": 0.7874139010644959, + "grad_norm": 1.6791964769363403, + "learning_rate": 6.319648093841642e-05, + "loss": 0.3488, + "step": 5030 + }, + { + "epoch": 0.7875704445835943, + "grad_norm": 3.6276705265045166, + "learning_rate": 6.318833496252852e-05, + "loss": 0.632, + "step": 5031 + }, + { + "epoch": 0.7877269881026926, + "grad_norm": 1.9610886573791504, + "learning_rate": 6.31801889866406e-05, + "loss": 0.8963, + "step": 5032 + }, + { + "epoch": 0.7878835316217908, + "grad_norm": 4.563727855682373, + "learning_rate": 6.317204301075269e-05, + "loss": 0.8166, + "step": 5033 + }, + { + "epoch": 
0.7880400751408891, + "grad_norm": 1.9971166849136353, + "learning_rate": 6.316389703486478e-05, + "loss": 0.9544, + "step": 5034 + }, + { + "epoch": 0.7881966186599875, + "grad_norm": 2.704655885696411, + "learning_rate": 6.315575105897687e-05, + "loss": 0.7647, + "step": 5035 + }, + { + "epoch": 0.7883531621790858, + "grad_norm": 4.8408308029174805, + "learning_rate": 6.314760508308895e-05, + "loss": 1.2418, + "step": 5036 + }, + { + "epoch": 0.7885097056981841, + "grad_norm": 10.681609153747559, + "learning_rate": 6.313945910720105e-05, + "loss": 1.5581, + "step": 5037 + }, + { + "epoch": 0.7886662492172825, + "grad_norm": 3.453049421310425, + "learning_rate": 6.313131313131313e-05, + "loss": 0.9122, + "step": 5038 + }, + { + "epoch": 0.7888227927363807, + "grad_norm": 2.410040855407715, + "learning_rate": 6.312316715542523e-05, + "loss": 0.7854, + "step": 5039 + }, + { + "epoch": 0.788979336255479, + "grad_norm": 2.6402151584625244, + "learning_rate": 6.311502117953731e-05, + "loss": 1.1231, + "step": 5040 + }, + { + "epoch": 0.7891358797745773, + "grad_norm": 3.4769527912139893, + "learning_rate": 6.31068752036494e-05, + "loss": 0.9825, + "step": 5041 + }, + { + "epoch": 0.7892924232936757, + "grad_norm": 3.831864833831787, + "learning_rate": 6.30987292277615e-05, + "loss": 1.2863, + "step": 5042 + }, + { + "epoch": 0.789448966812774, + "grad_norm": 5.428854465484619, + "learning_rate": 6.309058325187358e-05, + "loss": 1.9608, + "step": 5043 + }, + { + "epoch": 0.7896055103318722, + "grad_norm": 2.174945592880249, + "learning_rate": 6.308243727598566e-05, + "loss": 0.8358, + "step": 5044 + }, + { + "epoch": 0.7897620538509705, + "grad_norm": 2.6053144931793213, + "learning_rate": 6.307429130009776e-05, + "loss": 1.1809, + "step": 5045 + }, + { + "epoch": 0.7899185973700689, + "grad_norm": 2.970327854156494, + "learning_rate": 6.306614532420984e-05, + "loss": 0.925, + "step": 5046 + }, + { + "epoch": 0.7900751408891672, + "grad_norm": 4.339375019073486, + 
"learning_rate": 6.305799934832193e-05, + "loss": 0.8735, + "step": 5047 + }, + { + "epoch": 0.7902316844082655, + "grad_norm": 2.2157442569732666, + "learning_rate": 6.304985337243402e-05, + "loss": 0.7265, + "step": 5048 + }, + { + "epoch": 0.7903882279273639, + "grad_norm": 4.371285438537598, + "learning_rate": 6.30417073965461e-05, + "loss": 1.1509, + "step": 5049 + }, + { + "epoch": 0.7905447714464621, + "grad_norm": 4.964325428009033, + "learning_rate": 6.303356142065819e-05, + "loss": 1.094, + "step": 5050 + }, + { + "epoch": 0.7907013149655604, + "grad_norm": 0.5630809664726257, + "learning_rate": 6.302541544477029e-05, + "loss": 0.2434, + "step": 5051 + }, + { + "epoch": 0.7908578584846587, + "grad_norm": 0.45341452956199646, + "learning_rate": 6.301726946888239e-05, + "loss": 0.2617, + "step": 5052 + }, + { + "epoch": 0.7910144020037571, + "grad_norm": 0.63944411277771, + "learning_rate": 6.300912349299446e-05, + "loss": 0.2906, + "step": 5053 + }, + { + "epoch": 0.7911709455228554, + "grad_norm": 0.7956841588020325, + "learning_rate": 6.300097751710655e-05, + "loss": 0.3581, + "step": 5054 + }, + { + "epoch": 0.7913274890419537, + "grad_norm": 0.522815465927124, + "learning_rate": 6.299283154121865e-05, + "loss": 0.3022, + "step": 5055 + }, + { + "epoch": 0.7914840325610519, + "grad_norm": 0.6130359172821045, + "learning_rate": 6.298468556533072e-05, + "loss": 0.3912, + "step": 5056 + }, + { + "epoch": 0.7916405760801503, + "grad_norm": 0.7449454069137573, + "learning_rate": 6.297653958944282e-05, + "loss": 0.289, + "step": 5057 + }, + { + "epoch": 0.7917971195992486, + "grad_norm": 0.7996970415115356, + "learning_rate": 6.296839361355491e-05, + "loss": 0.3883, + "step": 5058 + }, + { + "epoch": 0.7919536631183469, + "grad_norm": 1.928208827972412, + "learning_rate": 6.296024763766698e-05, + "loss": 0.6691, + "step": 5059 + }, + { + "epoch": 0.7921102066374452, + "grad_norm": 0.810681164264679, + "learning_rate": 6.295210166177908e-05, + "loss": 0.3702, 
+ "step": 5060 + }, + { + "epoch": 0.7922667501565435, + "grad_norm": 1.1188267469406128, + "learning_rate": 6.294395568589118e-05, + "loss": 0.3114, + "step": 5061 + }, + { + "epoch": 0.7924232936756418, + "grad_norm": 2.4603874683380127, + "learning_rate": 6.293580971000326e-05, + "loss": 0.3388, + "step": 5062 + }, + { + "epoch": 0.7925798371947401, + "grad_norm": 1.590019702911377, + "learning_rate": 6.292766373411535e-05, + "loss": 0.4942, + "step": 5063 + }, + { + "epoch": 0.7927363807138385, + "grad_norm": 2.2806010246276855, + "learning_rate": 6.291951775822744e-05, + "loss": 0.7018, + "step": 5064 + }, + { + "epoch": 0.7928929242329368, + "grad_norm": 0.9622505307197571, + "learning_rate": 6.291137178233953e-05, + "loss": 0.4578, + "step": 5065 + }, + { + "epoch": 0.7930494677520351, + "grad_norm": 1.0235636234283447, + "learning_rate": 6.290322580645161e-05, + "loss": 0.3506, + "step": 5066 + }, + { + "epoch": 0.7932060112711333, + "grad_norm": 1.3954724073410034, + "learning_rate": 6.289507983056371e-05, + "loss": 0.4535, + "step": 5067 + }, + { + "epoch": 0.7933625547902317, + "grad_norm": 1.709902048110962, + "learning_rate": 6.288693385467579e-05, + "loss": 0.5728, + "step": 5068 + }, + { + "epoch": 0.79351909830933, + "grad_norm": 2.8813834190368652, + "learning_rate": 6.287878787878788e-05, + "loss": 0.5725, + "step": 5069 + }, + { + "epoch": 0.7936756418284283, + "grad_norm": 1.9602422714233398, + "learning_rate": 6.287064190289997e-05, + "loss": 0.615, + "step": 5070 + }, + { + "epoch": 0.7938321853475266, + "grad_norm": 3.3098692893981934, + "learning_rate": 6.286249592701206e-05, + "loss": 0.9936, + "step": 5071 + }, + { + "epoch": 0.793988728866625, + "grad_norm": 1.0330898761749268, + "learning_rate": 6.285434995112414e-05, + "loss": 0.4345, + "step": 5072 + }, + { + "epoch": 0.7941452723857232, + "grad_norm": 1.5182857513427734, + "learning_rate": 6.284620397523624e-05, + "loss": 0.4801, + "step": 5073 + }, + { + "epoch": 0.7943018159048215, 
+ "grad_norm": 3.6843984127044678, + "learning_rate": 6.283805799934832e-05, + "loss": 0.5554, + "step": 5074 + }, + { + "epoch": 0.7944583594239198, + "grad_norm": 1.6782314777374268, + "learning_rate": 6.282991202346042e-05, + "loss": 0.5653, + "step": 5075 + }, + { + "epoch": 0.7946149029430182, + "grad_norm": 2.8625223636627197, + "learning_rate": 6.28217660475725e-05, + "loss": 1.245, + "step": 5076 + }, + { + "epoch": 0.7947714464621165, + "grad_norm": 2.8648698329925537, + "learning_rate": 6.281362007168459e-05, + "loss": 0.9547, + "step": 5077 + }, + { + "epoch": 0.7949279899812148, + "grad_norm": 1.9560805559158325, + "learning_rate": 6.280547409579668e-05, + "loss": 0.7727, + "step": 5078 + }, + { + "epoch": 0.795084533500313, + "grad_norm": 1.374638557434082, + "learning_rate": 6.279732811990877e-05, + "loss": 0.4597, + "step": 5079 + }, + { + "epoch": 0.7952410770194114, + "grad_norm": 1.639078974723816, + "learning_rate": 6.278918214402085e-05, + "loss": 0.6681, + "step": 5080 + }, + { + "epoch": 0.7953976205385097, + "grad_norm": 2.6539037227630615, + "learning_rate": 6.278103616813295e-05, + "loss": 0.6142, + "step": 5081 + }, + { + "epoch": 0.795554164057608, + "grad_norm": 1.8508052825927734, + "learning_rate": 6.277289019224503e-05, + "loss": 0.7843, + "step": 5082 + }, + { + "epoch": 0.7957107075767064, + "grad_norm": 4.024486541748047, + "learning_rate": 6.276474421635712e-05, + "loss": 0.5752, + "step": 5083 + }, + { + "epoch": 0.7958672510958046, + "grad_norm": 3.339578866958618, + "learning_rate": 6.275659824046921e-05, + "loss": 0.728, + "step": 5084 + }, + { + "epoch": 0.7960237946149029, + "grad_norm": 1.641694188117981, + "learning_rate": 6.27484522645813e-05, + "loss": 0.6977, + "step": 5085 + }, + { + "epoch": 0.7961803381340012, + "grad_norm": 3.4622838497161865, + "learning_rate": 6.274030628869338e-05, + "loss": 0.9919, + "step": 5086 + }, + { + "epoch": 0.7963368816530996, + "grad_norm": 2.0422229766845703, + "learning_rate": 
6.273216031280548e-05, + "loss": 0.9538, + "step": 5087 + }, + { + "epoch": 0.7964934251721979, + "grad_norm": 3.2952167987823486, + "learning_rate": 6.272401433691756e-05, + "loss": 0.8718, + "step": 5088 + }, + { + "epoch": 0.7966499686912962, + "grad_norm": 2.668931722640991, + "learning_rate": 6.271586836102965e-05, + "loss": 1.3747, + "step": 5089 + }, + { + "epoch": 0.7968065122103944, + "grad_norm": 7.05169153213501, + "learning_rate": 6.270772238514174e-05, + "loss": 1.8604, + "step": 5090 + }, + { + "epoch": 0.7969630557294928, + "grad_norm": 4.695873260498047, + "learning_rate": 6.269957640925384e-05, + "loss": 1.5307, + "step": 5091 + }, + { + "epoch": 0.7971195992485911, + "grad_norm": 2.515012502670288, + "learning_rate": 6.269143043336591e-05, + "loss": 0.9385, + "step": 5092 + }, + { + "epoch": 0.7972761427676894, + "grad_norm": 1.703904390335083, + "learning_rate": 6.268328445747801e-05, + "loss": 1.2225, + "step": 5093 + }, + { + "epoch": 0.7974326862867878, + "grad_norm": 2.864043951034546, + "learning_rate": 6.26751384815901e-05, + "loss": 1.0783, + "step": 5094 + }, + { + "epoch": 0.7975892298058861, + "grad_norm": 3.065922498703003, + "learning_rate": 6.266699250570218e-05, + "loss": 1.1086, + "step": 5095 + }, + { + "epoch": 0.7977457733249843, + "grad_norm": 2.8458445072174072, + "learning_rate": 6.265884652981427e-05, + "loss": 0.8584, + "step": 5096 + }, + { + "epoch": 0.7979023168440826, + "grad_norm": 1.8227765560150146, + "learning_rate": 6.265070055392637e-05, + "loss": 0.3353, + "step": 5097 + }, + { + "epoch": 0.798058860363181, + "grad_norm": 4.517581939697266, + "learning_rate": 6.264255457803845e-05, + "loss": 0.8851, + "step": 5098 + }, + { + "epoch": 0.7982154038822793, + "grad_norm": 1.938895344734192, + "learning_rate": 6.263440860215054e-05, + "loss": 0.5042, + "step": 5099 + }, + { + "epoch": 0.7983719474013776, + "grad_norm": 2.2861714363098145, + "learning_rate": 6.262626262626264e-05, + "loss": 0.7505, + "step": 5100 + }, 
+ { + "epoch": 0.7985284909204758, + "grad_norm": 1.7086668014526367, + "learning_rate": 6.261811665037472e-05, + "loss": 0.4722, + "step": 5101 + }, + { + "epoch": 0.7986850344395742, + "grad_norm": 0.9649326801300049, + "learning_rate": 6.26099706744868e-05, + "loss": 0.4012, + "step": 5102 + }, + { + "epoch": 0.7988415779586725, + "grad_norm": 0.6233301758766174, + "learning_rate": 6.26018246985989e-05, + "loss": 0.2775, + "step": 5103 + }, + { + "epoch": 0.7989981214777708, + "grad_norm": 0.6370360851287842, + "learning_rate": 6.259367872271098e-05, + "loss": 0.2154, + "step": 5104 + }, + { + "epoch": 0.7991546649968692, + "grad_norm": 0.7721182107925415, + "learning_rate": 6.258553274682307e-05, + "loss": 0.208, + "step": 5105 + }, + { + "epoch": 0.7993112085159675, + "grad_norm": 0.6881215572357178, + "learning_rate": 6.257738677093517e-05, + "loss": 0.2581, + "step": 5106 + }, + { + "epoch": 0.7994677520350657, + "grad_norm": 0.5637189745903015, + "learning_rate": 6.256924079504725e-05, + "loss": 0.2154, + "step": 5107 + }, + { + "epoch": 0.799624295554164, + "grad_norm": 0.6382756233215332, + "learning_rate": 6.256109481915933e-05, + "loss": 0.2476, + "step": 5108 + }, + { + "epoch": 0.7997808390732624, + "grad_norm": 1.2556431293487549, + "learning_rate": 6.255294884327143e-05, + "loss": 0.4462, + "step": 5109 + }, + { + "epoch": 0.7999373825923607, + "grad_norm": 0.9714632630348206, + "learning_rate": 6.254480286738351e-05, + "loss": 0.3589, + "step": 5110 + }, + { + "epoch": 0.800093926111459, + "grad_norm": 0.9368165135383606, + "learning_rate": 6.253665689149561e-05, + "loss": 0.3885, + "step": 5111 + }, + { + "epoch": 0.8002504696305573, + "grad_norm": 7.254343509674072, + "learning_rate": 6.25285109156077e-05, + "loss": 0.711, + "step": 5112 + }, + { + "epoch": 0.8004070131496556, + "grad_norm": 0.8671680688858032, + "learning_rate": 6.252036493971978e-05, + "loss": 0.2544, + "step": 5113 + }, + { + "epoch": 0.8005635566687539, + "grad_norm": 
1.135000228881836, + "learning_rate": 6.251221896383188e-05, + "loss": 0.3659, + "step": 5114 + }, + { + "epoch": 0.8007201001878522, + "grad_norm": 1.1863824129104614, + "learning_rate": 6.250407298794396e-05, + "loss": 0.4078, + "step": 5115 + }, + { + "epoch": 0.8008766437069506, + "grad_norm": 1.2954200506210327, + "learning_rate": 6.249592701205604e-05, + "loss": 0.4201, + "step": 5116 + }, + { + "epoch": 0.8010331872260489, + "grad_norm": 1.096360445022583, + "learning_rate": 6.248778103616814e-05, + "loss": 0.4353, + "step": 5117 + }, + { + "epoch": 0.8011897307451471, + "grad_norm": 1.5964641571044922, + "learning_rate": 6.247963506028022e-05, + "loss": 0.5519, + "step": 5118 + }, + { + "epoch": 0.8013462742642454, + "grad_norm": 1.9996333122253418, + "learning_rate": 6.247148908439231e-05, + "loss": 0.3962, + "step": 5119 + }, + { + "epoch": 0.8015028177833438, + "grad_norm": 2.1622207164764404, + "learning_rate": 6.24633431085044e-05, + "loss": 0.819, + "step": 5120 + }, + { + "epoch": 0.8016593613024421, + "grad_norm": 1.5558791160583496, + "learning_rate": 6.245519713261649e-05, + "loss": 0.6535, + "step": 5121 + }, + { + "epoch": 0.8018159048215404, + "grad_norm": 1.8924232721328735, + "learning_rate": 6.244705115672857e-05, + "loss": 0.5546, + "step": 5122 + }, + { + "epoch": 0.8019724483406387, + "grad_norm": 1.4775395393371582, + "learning_rate": 6.243890518084067e-05, + "loss": 0.5717, + "step": 5123 + }, + { + "epoch": 0.802128991859737, + "grad_norm": 2.02616286277771, + "learning_rate": 6.243075920495275e-05, + "loss": 0.6567, + "step": 5124 + }, + { + "epoch": 0.8022855353788353, + "grad_norm": 1.605916976928711, + "learning_rate": 6.242261322906484e-05, + "loss": 0.7509, + "step": 5125 + }, + { + "epoch": 0.8024420788979336, + "grad_norm": 1.308473825454712, + "learning_rate": 6.241446725317693e-05, + "loss": 0.4833, + "step": 5126 + }, + { + "epoch": 0.802598622417032, + "grad_norm": 2.0987865924835205, + "learning_rate": 
6.240632127728903e-05, + "loss": 0.4719, + "step": 5127 + }, + { + "epoch": 0.8027551659361303, + "grad_norm": 2.246732473373413, + "learning_rate": 6.23981753014011e-05, + "loss": 0.7996, + "step": 5128 + }, + { + "epoch": 0.8029117094552286, + "grad_norm": 1.329953908920288, + "learning_rate": 6.23900293255132e-05, + "loss": 0.8771, + "step": 5129 + }, + { + "epoch": 0.8030682529743268, + "grad_norm": 1.8251405954360962, + "learning_rate": 6.23818833496253e-05, + "loss": 0.4261, + "step": 5130 + }, + { + "epoch": 0.8032247964934252, + "grad_norm": 2.030503749847412, + "learning_rate": 6.237373737373737e-05, + "loss": 0.9651, + "step": 5131 + }, + { + "epoch": 0.8033813400125235, + "grad_norm": 3.046326160430908, + "learning_rate": 6.236559139784946e-05, + "loss": 0.7856, + "step": 5132 + }, + { + "epoch": 0.8035378835316218, + "grad_norm": 1.9184679985046387, + "learning_rate": 6.235744542196156e-05, + "loss": 0.5555, + "step": 5133 + }, + { + "epoch": 0.8036944270507201, + "grad_norm": 2.6761085987091064, + "learning_rate": 6.234929944607365e-05, + "loss": 0.6727, + "step": 5134 + }, + { + "epoch": 0.8038509705698184, + "grad_norm": 3.5750551223754883, + "learning_rate": 6.234115347018573e-05, + "loss": 1.1722, + "step": 5135 + }, + { + "epoch": 0.8040075140889167, + "grad_norm": 2.8153672218322754, + "learning_rate": 6.233300749429783e-05, + "loss": 0.9666, + "step": 5136 + }, + { + "epoch": 0.804164057608015, + "grad_norm": 3.592055559158325, + "learning_rate": 6.232486151840991e-05, + "loss": 1.6105, + "step": 5137 + }, + { + "epoch": 0.8043206011271133, + "grad_norm": 1.2772399187088013, + "learning_rate": 6.2316715542522e-05, + "loss": 0.5696, + "step": 5138 + }, + { + "epoch": 0.8044771446462117, + "grad_norm": 3.1485698223114014, + "learning_rate": 6.230856956663409e-05, + "loss": 1.5273, + "step": 5139 + }, + { + "epoch": 0.80463368816531, + "grad_norm": 2.024648427963257, + "learning_rate": 6.230042359074618e-05, + "loss": 1.0977, + "step": 5140 + }, + 
{ + "epoch": 0.8047902316844082, + "grad_norm": 3.102140188217163, + "learning_rate": 6.229227761485826e-05, + "loss": 0.7277, + "step": 5141 + }, + { + "epoch": 0.8049467752035065, + "grad_norm": 3.47141170501709, + "learning_rate": 6.228413163897036e-05, + "loss": 0.604, + "step": 5142 + }, + { + "epoch": 0.8051033187226049, + "grad_norm": 3.387033462524414, + "learning_rate": 6.227598566308244e-05, + "loss": 1.3995, + "step": 5143 + }, + { + "epoch": 0.8052598622417032, + "grad_norm": 3.9833240509033203, + "learning_rate": 6.226783968719452e-05, + "loss": 1.3781, + "step": 5144 + }, + { + "epoch": 0.8054164057608015, + "grad_norm": 1.5175559520721436, + "learning_rate": 6.225969371130662e-05, + "loss": 0.5136, + "step": 5145 + }, + { + "epoch": 0.8055729492798999, + "grad_norm": 3.528369426727295, + "learning_rate": 6.22515477354187e-05, + "loss": 1.0215, + "step": 5146 + }, + { + "epoch": 0.8057294927989981, + "grad_norm": 3.7512574195861816, + "learning_rate": 6.224340175953079e-05, + "loss": 1.0599, + "step": 5147 + }, + { + "epoch": 0.8058860363180964, + "grad_norm": 3.6728758811950684, + "learning_rate": 6.223525578364289e-05, + "loss": 0.9463, + "step": 5148 + }, + { + "epoch": 0.8060425798371947, + "grad_norm": 3.5168604850769043, + "learning_rate": 6.222710980775497e-05, + "loss": 0.8556, + "step": 5149 + }, + { + "epoch": 0.8061991233562931, + "grad_norm": 2.2791383266448975, + "learning_rate": 6.221896383186707e-05, + "loss": 0.7931, + "step": 5150 + }, + { + "epoch": 0.8063556668753914, + "grad_norm": 0.6642107367515564, + "learning_rate": 6.221081785597915e-05, + "loss": 0.1972, + "step": 5151 + }, + { + "epoch": 0.8065122103944896, + "grad_norm": 1.2314287424087524, + "learning_rate": 6.220267188009123e-05, + "loss": 0.2898, + "step": 5152 + }, + { + "epoch": 0.8066687539135879, + "grad_norm": 0.5869191884994507, + "learning_rate": 6.219452590420333e-05, + "loss": 0.2908, + "step": 5153 + }, + { + "epoch": 0.8068252974326863, + "grad_norm": 
0.7790637016296387, + "learning_rate": 6.218637992831542e-05, + "loss": 0.2885, + "step": 5154 + }, + { + "epoch": 0.8069818409517846, + "grad_norm": 0.6538547277450562, + "learning_rate": 6.21782339524275e-05, + "loss": 0.2983, + "step": 5155 + }, + { + "epoch": 0.8071383844708829, + "grad_norm": 1.153669834136963, + "learning_rate": 6.21700879765396e-05, + "loss": 0.4271, + "step": 5156 + }, + { + "epoch": 0.8072949279899813, + "grad_norm": 1.9424060583114624, + "learning_rate": 6.216194200065168e-05, + "loss": 0.3716, + "step": 5157 + }, + { + "epoch": 0.8074514715090795, + "grad_norm": 0.6700789332389832, + "learning_rate": 6.215379602476376e-05, + "loss": 0.2426, + "step": 5158 + }, + { + "epoch": 0.8076080150281778, + "grad_norm": 0.6152874827384949, + "learning_rate": 6.214565004887586e-05, + "loss": 0.2984, + "step": 5159 + }, + { + "epoch": 0.8077645585472761, + "grad_norm": 1.4087196588516235, + "learning_rate": 6.213750407298794e-05, + "loss": 0.3262, + "step": 5160 + }, + { + "epoch": 0.8079211020663745, + "grad_norm": 0.7439823746681213, + "learning_rate": 6.212935809710003e-05, + "loss": 0.389, + "step": 5161 + }, + { + "epoch": 0.8080776455854728, + "grad_norm": 1.0830724239349365, + "learning_rate": 6.212121212121213e-05, + "loss": 0.2793, + "step": 5162 + }, + { + "epoch": 0.8082341891045711, + "grad_norm": 1.3322657346725464, + "learning_rate": 6.211306614532422e-05, + "loss": 0.2808, + "step": 5163 + }, + { + "epoch": 0.8083907326236693, + "grad_norm": 14.3956298828125, + "learning_rate": 6.21049201694363e-05, + "loss": 1.6581, + "step": 5164 + }, + { + "epoch": 0.8085472761427677, + "grad_norm": 1.1760270595550537, + "learning_rate": 6.209677419354839e-05, + "loss": 0.5013, + "step": 5165 + }, + { + "epoch": 0.808703819661866, + "grad_norm": 1.509891152381897, + "learning_rate": 6.208862821766049e-05, + "loss": 0.4066, + "step": 5166 + }, + { + "epoch": 0.8088603631809643, + "grad_norm": 1.2697254419326782, + "learning_rate": 
6.208048224177256e-05, + "loss": 0.4421, + "step": 5167 + }, + { + "epoch": 0.8090169067000627, + "grad_norm": 2.6287477016448975, + "learning_rate": 6.207233626588466e-05, + "loss": 0.9845, + "step": 5168 + }, + { + "epoch": 0.8091734502191609, + "grad_norm": 1.3080765008926392, + "learning_rate": 6.206419028999675e-05, + "loss": 0.4687, + "step": 5169 + }, + { + "epoch": 0.8093299937382592, + "grad_norm": 2.3372604846954346, + "learning_rate": 6.205604431410882e-05, + "loss": 0.9924, + "step": 5170 + }, + { + "epoch": 0.8094865372573575, + "grad_norm": 2.2267704010009766, + "learning_rate": 6.204789833822092e-05, + "loss": 0.4617, + "step": 5171 + }, + { + "epoch": 0.8096430807764559, + "grad_norm": 2.0063843727111816, + "learning_rate": 6.203975236233302e-05, + "loss": 0.5388, + "step": 5172 + }, + { + "epoch": 0.8097996242955542, + "grad_norm": 1.3539295196533203, + "learning_rate": 6.20316063864451e-05, + "loss": 0.2313, + "step": 5173 + }, + { + "epoch": 0.8099561678146525, + "grad_norm": 1.658872127532959, + "learning_rate": 6.202346041055719e-05, + "loss": 0.74, + "step": 5174 + }, + { + "epoch": 0.8101127113337507, + "grad_norm": 5.083950519561768, + "learning_rate": 6.201531443466928e-05, + "loss": 1.0668, + "step": 5175 + }, + { + "epoch": 0.8102692548528491, + "grad_norm": 4.854441165924072, + "learning_rate": 6.200716845878137e-05, + "loss": 0.7227, + "step": 5176 + }, + { + "epoch": 0.8104257983719474, + "grad_norm": 4.989040374755859, + "learning_rate": 6.199902248289345e-05, + "loss": 0.6226, + "step": 5177 + }, + { + "epoch": 0.8105823418910457, + "grad_norm": 2.306349754333496, + "learning_rate": 6.199087650700555e-05, + "loss": 0.6739, + "step": 5178 + }, + { + "epoch": 0.810738885410144, + "grad_norm": 2.7024402618408203, + "learning_rate": 6.198273053111763e-05, + "loss": 0.7363, + "step": 5179 + }, + { + "epoch": 0.8108954289292424, + "grad_norm": 2.111072063446045, + "learning_rate": 6.197458455522971e-05, + "loss": 0.7655, + "step": 5180 + 
}, + { + "epoch": 0.8110519724483406, + "grad_norm": 1.7654629945755005, + "learning_rate": 6.196643857934181e-05, + "loss": 0.68, + "step": 5181 + }, + { + "epoch": 0.8112085159674389, + "grad_norm": 2.2726595401763916, + "learning_rate": 6.19582926034539e-05, + "loss": 0.9862, + "step": 5182 + }, + { + "epoch": 0.8113650594865373, + "grad_norm": 1.8245066404342651, + "learning_rate": 6.195014662756598e-05, + "loss": 0.7579, + "step": 5183 + }, + { + "epoch": 0.8115216030056356, + "grad_norm": 3.4480128288269043, + "learning_rate": 6.194200065167808e-05, + "loss": 0.8905, + "step": 5184 + }, + { + "epoch": 0.8116781465247339, + "grad_norm": 5.122439384460449, + "learning_rate": 6.193385467579016e-05, + "loss": 1.2761, + "step": 5185 + }, + { + "epoch": 0.8118346900438321, + "grad_norm": 2.0798394680023193, + "learning_rate": 6.192570869990226e-05, + "loss": 0.5302, + "step": 5186 + }, + { + "epoch": 0.8119912335629305, + "grad_norm": 4.454853057861328, + "learning_rate": 6.191756272401434e-05, + "loss": 1.0401, + "step": 5187 + }, + { + "epoch": 0.8121477770820288, + "grad_norm": 4.144859313964844, + "learning_rate": 6.190941674812643e-05, + "loss": 0.9434, + "step": 5188 + }, + { + "epoch": 0.8123043206011271, + "grad_norm": 14.000391960144043, + "learning_rate": 6.190127077223852e-05, + "loss": 1.7155, + "step": 5189 + }, + { + "epoch": 0.8124608641202254, + "grad_norm": 2.7173001766204834, + "learning_rate": 6.18931247963506e-05, + "loss": 1.1619, + "step": 5190 + }, + { + "epoch": 0.8126174076393238, + "grad_norm": 4.3457136154174805, + "learning_rate": 6.188497882046269e-05, + "loss": 1.0895, + "step": 5191 + }, + { + "epoch": 0.812773951158422, + "grad_norm": 2.681278944015503, + "learning_rate": 6.187683284457479e-05, + "loss": 1.0953, + "step": 5192 + }, + { + "epoch": 0.8129304946775203, + "grad_norm": 5.55991792678833, + "learning_rate": 6.186868686868687e-05, + "loss": 1.2873, + "step": 5193 + }, + { + "epoch": 0.8130870381966186, + "grad_norm": 
3.7444751262664795, + "learning_rate": 6.186054089279896e-05, + "loss": 0.8723, + "step": 5194 + }, + { + "epoch": 0.813243581715717, + "grad_norm": 1.9623128175735474, + "learning_rate": 6.185239491691105e-05, + "loss": 1.082, + "step": 5195 + }, + { + "epoch": 0.8134001252348153, + "grad_norm": 3.0476202964782715, + "learning_rate": 6.184424894102314e-05, + "loss": 0.8617, + "step": 5196 + }, + { + "epoch": 0.8135566687539136, + "grad_norm": 4.060564041137695, + "learning_rate": 6.183610296513522e-05, + "loss": 0.4784, + "step": 5197 + }, + { + "epoch": 0.8137132122730119, + "grad_norm": 5.373463153839111, + "learning_rate": 6.182795698924732e-05, + "loss": 0.8146, + "step": 5198 + }, + { + "epoch": 0.8138697557921102, + "grad_norm": 9.525053024291992, + "learning_rate": 6.181981101335941e-05, + "loss": 0.8353, + "step": 5199 + }, + { + "epoch": 0.8140262993112085, + "grad_norm": 4.723721027374268, + "learning_rate": 6.181166503747148e-05, + "loss": 1.3896, + "step": 5200 + }, + { + "epoch": 0.8141828428303068, + "grad_norm": 0.6140694618225098, + "learning_rate": 6.180351906158358e-05, + "loss": 0.2691, + "step": 5201 + }, + { + "epoch": 0.8143393863494052, + "grad_norm": 0.749832272529602, + "learning_rate": 6.179537308569568e-05, + "loss": 0.1926, + "step": 5202 + }, + { + "epoch": 0.8144959298685035, + "grad_norm": 0.7941885590553284, + "learning_rate": 6.178722710980775e-05, + "loss": 0.2688, + "step": 5203 + }, + { + "epoch": 0.8146524733876017, + "grad_norm": 0.7258235812187195, + "learning_rate": 6.177908113391985e-05, + "loss": 0.2596, + "step": 5204 + }, + { + "epoch": 0.8148090169067, + "grad_norm": 0.7868902683258057, + "learning_rate": 6.177093515803194e-05, + "loss": 0.3096, + "step": 5205 + }, + { + "epoch": 0.8149655604257984, + "grad_norm": 0.6499333381652832, + "learning_rate": 6.176278918214401e-05, + "loss": 0.2367, + "step": 5206 + }, + { + "epoch": 0.8151221039448967, + "grad_norm": 0.7214033007621765, + "learning_rate": 
6.175464320625611e-05, + "loss": 0.2363, + "step": 5207 + }, + { + "epoch": 0.815278647463995, + "grad_norm": 0.8899503946304321, + "learning_rate": 6.174649723036821e-05, + "loss": 0.3142, + "step": 5208 + }, + { + "epoch": 0.8154351909830932, + "grad_norm": 1.3864123821258545, + "learning_rate": 6.173835125448029e-05, + "loss": 0.318, + "step": 5209 + }, + { + "epoch": 0.8155917345021916, + "grad_norm": 0.678581178188324, + "learning_rate": 6.173020527859238e-05, + "loss": 0.1754, + "step": 5210 + }, + { + "epoch": 0.8157482780212899, + "grad_norm": 1.000651240348816, + "learning_rate": 6.172205930270447e-05, + "loss": 0.3301, + "step": 5211 + }, + { + "epoch": 0.8159048215403882, + "grad_norm": 0.8735445141792297, + "learning_rate": 6.171391332681656e-05, + "loss": 0.2668, + "step": 5212 + }, + { + "epoch": 0.8160613650594866, + "grad_norm": 1.2199441194534302, + "learning_rate": 6.170576735092864e-05, + "loss": 0.3766, + "step": 5213 + }, + { + "epoch": 0.8162179085785849, + "grad_norm": 1.5821715593338013, + "learning_rate": 6.169762137504074e-05, + "loss": 0.5652, + "step": 5214 + }, + { + "epoch": 0.8163744520976831, + "grad_norm": 1.018648386001587, + "learning_rate": 6.168947539915282e-05, + "loss": 0.3232, + "step": 5215 + }, + { + "epoch": 0.8165309956167814, + "grad_norm": 0.9888678193092346, + "learning_rate": 6.16813294232649e-05, + "loss": 0.4402, + "step": 5216 + }, + { + "epoch": 0.8166875391358798, + "grad_norm": 0.8414931893348694, + "learning_rate": 6.1673183447377e-05, + "loss": 0.3982, + "step": 5217 + }, + { + "epoch": 0.8168440826549781, + "grad_norm": 1.0033010244369507, + "learning_rate": 6.166503747148909e-05, + "loss": 0.4337, + "step": 5218 + }, + { + "epoch": 0.8170006261740764, + "grad_norm": 1.3958560228347778, + "learning_rate": 6.165689149560117e-05, + "loss": 0.59, + "step": 5219 + }, + { + "epoch": 0.8171571696931748, + "grad_norm": 2.2653136253356934, + "learning_rate": 6.164874551971327e-05, + "loss": 0.629, + "step": 5220 + }, 
+ { + "epoch": 0.817313713212273, + "grad_norm": 1.1855946779251099, + "learning_rate": 6.164059954382535e-05, + "loss": 0.5347, + "step": 5221 + }, + { + "epoch": 0.8174702567313713, + "grad_norm": 1.4344104528427124, + "learning_rate": 6.163245356793745e-05, + "loss": 0.5636, + "step": 5222 + }, + { + "epoch": 0.8176268002504696, + "grad_norm": 1.8822580575942993, + "learning_rate": 6.162430759204953e-05, + "loss": 0.5435, + "step": 5223 + }, + { + "epoch": 0.817783343769568, + "grad_norm": 2.3126745223999023, + "learning_rate": 6.161616161616162e-05, + "loss": 0.6364, + "step": 5224 + }, + { + "epoch": 0.8179398872886663, + "grad_norm": 2.8798370361328125, + "learning_rate": 6.160801564027371e-05, + "loss": 0.8797, + "step": 5225 + }, + { + "epoch": 0.8180964308077645, + "grad_norm": 2.0018234252929688, + "learning_rate": 6.15998696643858e-05, + "loss": 0.7881, + "step": 5226 + }, + { + "epoch": 0.8182529743268628, + "grad_norm": 2.852816343307495, + "learning_rate": 6.159172368849788e-05, + "loss": 0.5535, + "step": 5227 + }, + { + "epoch": 0.8184095178459612, + "grad_norm": 3.1242082118988037, + "learning_rate": 6.158357771260998e-05, + "loss": 1.0081, + "step": 5228 + }, + { + "epoch": 0.8185660613650595, + "grad_norm": 2.5085649490356445, + "learning_rate": 6.157543173672206e-05, + "loss": 0.596, + "step": 5229 + }, + { + "epoch": 0.8187226048841578, + "grad_norm": 3.8994970321655273, + "learning_rate": 6.156728576083415e-05, + "loss": 0.9707, + "step": 5230 + }, + { + "epoch": 0.8188791484032562, + "grad_norm": 2.6675243377685547, + "learning_rate": 6.155913978494624e-05, + "loss": 0.639, + "step": 5231 + }, + { + "epoch": 0.8190356919223544, + "grad_norm": 4.391568183898926, + "learning_rate": 6.155099380905833e-05, + "loss": 0.721, + "step": 5232 + }, + { + "epoch": 0.8191922354414527, + "grad_norm": 1.932989478111267, + "learning_rate": 6.154284783317041e-05, + "loss": 0.8073, + "step": 5233 + }, + { + "epoch": 0.819348778960551, + "grad_norm": 
2.2673749923706055, + "learning_rate": 6.153470185728251e-05, + "loss": 1.0787, + "step": 5234 + }, + { + "epoch": 0.8195053224796494, + "grad_norm": 1.9027347564697266, + "learning_rate": 6.152655588139459e-05, + "loss": 0.6395, + "step": 5235 + }, + { + "epoch": 0.8196618659987477, + "grad_norm": 2.4586291313171387, + "learning_rate": 6.151840990550668e-05, + "loss": 0.7244, + "step": 5236 + }, + { + "epoch": 0.819818409517846, + "grad_norm": 1.8931399583816528, + "learning_rate": 6.151026392961877e-05, + "loss": 0.5823, + "step": 5237 + }, + { + "epoch": 0.8199749530369442, + "grad_norm": 2.859834909439087, + "learning_rate": 6.150211795373087e-05, + "loss": 1.0467, + "step": 5238 + }, + { + "epoch": 0.8201314965560426, + "grad_norm": 3.1207375526428223, + "learning_rate": 6.149397197784294e-05, + "loss": 1.4148, + "step": 5239 + }, + { + "epoch": 0.8202880400751409, + "grad_norm": 2.8431215286254883, + "learning_rate": 6.148582600195504e-05, + "loss": 1.3275, + "step": 5240 + }, + { + "epoch": 0.8204445835942392, + "grad_norm": 3.0533735752105713, + "learning_rate": 6.147768002606714e-05, + "loss": 1.2395, + "step": 5241 + }, + { + "epoch": 0.8206011271133375, + "grad_norm": 3.8717195987701416, + "learning_rate": 6.14695340501792e-05, + "loss": 1.3207, + "step": 5242 + }, + { + "epoch": 0.8207576706324358, + "grad_norm": 2.1968164443969727, + "learning_rate": 6.14613880742913e-05, + "loss": 1.4475, + "step": 5243 + }, + { + "epoch": 0.8209142141515341, + "grad_norm": 2.448904275894165, + "learning_rate": 6.14532420984034e-05, + "loss": 1.5251, + "step": 5244 + }, + { + "epoch": 0.8210707576706324, + "grad_norm": 3.337311267852783, + "learning_rate": 6.144509612251548e-05, + "loss": 2.0702, + "step": 5245 + }, + { + "epoch": 0.8212273011897308, + "grad_norm": 3.1176674365997314, + "learning_rate": 6.143695014662757e-05, + "loss": 0.9166, + "step": 5246 + }, + { + "epoch": 0.8213838447088291, + "grad_norm": 1.9876058101654053, + "learning_rate": 
6.142880417073966e-05, + "loss": 0.526, + "step": 5247 + }, + { + "epoch": 0.8215403882279274, + "grad_norm": 3.276524305343628, + "learning_rate": 6.142065819485175e-05, + "loss": 0.6404, + "step": 5248 + }, + { + "epoch": 0.8216969317470256, + "grad_norm": 3.130098581314087, + "learning_rate": 6.141251221896383e-05, + "loss": 1.3618, + "step": 5249 + }, + { + "epoch": 0.821853475266124, + "grad_norm": 2.4291348457336426, + "learning_rate": 6.140436624307593e-05, + "loss": 1.139, + "step": 5250 + }, + { + "epoch": 0.8220100187852223, + "grad_norm": 0.4404025375843048, + "learning_rate": 6.139622026718801e-05, + "loss": 0.299, + "step": 5251 + }, + { + "epoch": 0.8221665623043206, + "grad_norm": 0.7779234647750854, + "learning_rate": 6.13880742913001e-05, + "loss": 0.3324, + "step": 5252 + }, + { + "epoch": 0.8223231058234189, + "grad_norm": 0.49647995829582214, + "learning_rate": 6.13799283154122e-05, + "loss": 0.2399, + "step": 5253 + }, + { + "epoch": 0.8224796493425173, + "grad_norm": 0.5757866501808167, + "learning_rate": 6.137178233952428e-05, + "loss": 0.3437, + "step": 5254 + }, + { + "epoch": 0.8226361928616155, + "grad_norm": 0.7377982139587402, + "learning_rate": 6.136363636363636e-05, + "loss": 0.3153, + "step": 5255 + }, + { + "epoch": 0.8227927363807138, + "grad_norm": 0.6753270626068115, + "learning_rate": 6.135549038774846e-05, + "loss": 0.3672, + "step": 5256 + }, + { + "epoch": 0.8229492798998121, + "grad_norm": 0.8196328282356262, + "learning_rate": 6.134734441186054e-05, + "loss": 0.485, + "step": 5257 + }, + { + "epoch": 0.8231058234189105, + "grad_norm": 0.8253426551818848, + "learning_rate": 6.133919843597263e-05, + "loss": 0.3302, + "step": 5258 + }, + { + "epoch": 0.8232623669380088, + "grad_norm": 1.1898819208145142, + "learning_rate": 6.133105246008472e-05, + "loss": 0.4202, + "step": 5259 + }, + { + "epoch": 0.823418910457107, + "grad_norm": 1.2410742044448853, + "learning_rate": 6.132290648419681e-05, + "loss": 0.4575, + "step": 5260 + 
}, + { + "epoch": 0.8235754539762054, + "grad_norm": 0.978538990020752, + "learning_rate": 6.13147605083089e-05, + "loss": 0.3527, + "step": 5261 + }, + { + "epoch": 0.8237319974953037, + "grad_norm": 1.4939621686935425, + "learning_rate": 6.130661453242099e-05, + "loss": 0.4017, + "step": 5262 + }, + { + "epoch": 0.823888541014402, + "grad_norm": 0.8201844692230225, + "learning_rate": 6.129846855653307e-05, + "loss": 0.2589, + "step": 5263 + }, + { + "epoch": 0.8240450845335003, + "grad_norm": 0.627465009689331, + "learning_rate": 6.129032258064517e-05, + "loss": 0.2198, + "step": 5264 + }, + { + "epoch": 0.8242016280525987, + "grad_norm": 0.7719655632972717, + "learning_rate": 6.128217660475725e-05, + "loss": 0.3934, + "step": 5265 + }, + { + "epoch": 0.8243581715716969, + "grad_norm": 1.5124768018722534, + "learning_rate": 6.127403062886934e-05, + "loss": 0.3943, + "step": 5266 + }, + { + "epoch": 0.8245147150907952, + "grad_norm": 1.007308840751648, + "learning_rate": 6.126588465298143e-05, + "loss": 0.3011, + "step": 5267 + }, + { + "epoch": 0.8246712586098935, + "grad_norm": 0.9941921830177307, + "learning_rate": 6.125773867709352e-05, + "loss": 0.4402, + "step": 5268 + }, + { + "epoch": 0.8248278021289919, + "grad_norm": 2.4098832607269287, + "learning_rate": 6.12495927012056e-05, + "loss": 0.5368, + "step": 5269 + }, + { + "epoch": 0.8249843456480902, + "grad_norm": 1.365856647491455, + "learning_rate": 6.12414467253177e-05, + "loss": 0.2952, + "step": 5270 + }, + { + "epoch": 0.8251408891671885, + "grad_norm": 1.4020335674285889, + "learning_rate": 6.123330074942978e-05, + "loss": 0.5026, + "step": 5271 + }, + { + "epoch": 0.8252974326862867, + "grad_norm": 1.3973968029022217, + "learning_rate": 6.122515477354187e-05, + "loss": 0.3618, + "step": 5272 + }, + { + "epoch": 0.8254539762053851, + "grad_norm": 3.137390375137329, + "learning_rate": 6.121700879765396e-05, + "loss": 0.5625, + "step": 5273 + }, + { + "epoch": 0.8256105197244834, + "grad_norm": 
2.8088769912719727, + "learning_rate": 6.120886282176606e-05, + "loss": 0.9254, + "step": 5274 + }, + { + "epoch": 0.8257670632435817, + "grad_norm": 1.8520309925079346, + "learning_rate": 6.120071684587813e-05, + "loss": 0.5714, + "step": 5275 + }, + { + "epoch": 0.8259236067626801, + "grad_norm": 2.985860824584961, + "learning_rate": 6.119257086999023e-05, + "loss": 0.9238, + "step": 5276 + }, + { + "epoch": 0.8260801502817783, + "grad_norm": 2.4794557094573975, + "learning_rate": 6.118442489410233e-05, + "loss": 0.8004, + "step": 5277 + }, + { + "epoch": 0.8262366938008766, + "grad_norm": 2.7719407081604004, + "learning_rate": 6.11762789182144e-05, + "loss": 1.0327, + "step": 5278 + }, + { + "epoch": 0.8263932373199749, + "grad_norm": 4.306914806365967, + "learning_rate": 6.11681329423265e-05, + "loss": 0.6782, + "step": 5279 + }, + { + "epoch": 0.8265497808390733, + "grad_norm": 3.3036375045776367, + "learning_rate": 6.115998696643859e-05, + "loss": 1.316, + "step": 5280 + }, + { + "epoch": 0.8267063243581716, + "grad_norm": 1.727888584136963, + "learning_rate": 6.115184099055067e-05, + "loss": 0.6945, + "step": 5281 + }, + { + "epoch": 0.8268628678772699, + "grad_norm": 2.726290225982666, + "learning_rate": 6.114369501466276e-05, + "loss": 1.2745, + "step": 5282 + }, + { + "epoch": 0.8270194113963681, + "grad_norm": 1.5891977548599243, + "learning_rate": 6.113554903877486e-05, + "loss": 0.6925, + "step": 5283 + }, + { + "epoch": 0.8271759549154665, + "grad_norm": 3.0548410415649414, + "learning_rate": 6.112740306288694e-05, + "loss": 0.9126, + "step": 5284 + }, + { + "epoch": 0.8273324984345648, + "grad_norm": 2.4095051288604736, + "learning_rate": 6.111925708699902e-05, + "loss": 1.2881, + "step": 5285 + }, + { + "epoch": 0.8274890419536631, + "grad_norm": 2.9456393718719482, + "learning_rate": 6.111111111111112e-05, + "loss": 0.5087, + "step": 5286 + }, + { + "epoch": 0.8276455854727615, + "grad_norm": 2.607278347015381, + "learning_rate": 
6.11029651352232e-05, + "loss": 0.6267, + "step": 5287 + }, + { + "epoch": 0.8278021289918598, + "grad_norm": 2.630280017852783, + "learning_rate": 6.109481915933529e-05, + "loss": 1.24, + "step": 5288 + }, + { + "epoch": 0.827958672510958, + "grad_norm": 3.9374468326568604, + "learning_rate": 6.108667318344739e-05, + "loss": 1.1015, + "step": 5289 + }, + { + "epoch": 0.8281152160300563, + "grad_norm": 2.1209938526153564, + "learning_rate": 6.107852720755947e-05, + "loss": 0.9197, + "step": 5290 + }, + { + "epoch": 0.8282717595491547, + "grad_norm": 6.662008762359619, + "learning_rate": 6.107038123167155e-05, + "loss": 1.3589, + "step": 5291 + }, + { + "epoch": 0.828428303068253, + "grad_norm": 8.11575984954834, + "learning_rate": 6.106223525578365e-05, + "loss": 1.4188, + "step": 5292 + }, + { + "epoch": 0.8285848465873513, + "grad_norm": 3.0771565437316895, + "learning_rate": 6.105408927989573e-05, + "loss": 0.9405, + "step": 5293 + }, + { + "epoch": 0.8287413901064495, + "grad_norm": 2.288696765899658, + "learning_rate": 6.104594330400782e-05, + "loss": 1.3566, + "step": 5294 + }, + { + "epoch": 0.8288979336255479, + "grad_norm": 3.234102487564087, + "learning_rate": 6.103779732811992e-05, + "loss": 1.405, + "step": 5295 + }, + { + "epoch": 0.8290544771446462, + "grad_norm": 3.665799379348755, + "learning_rate": 6.1029651352232e-05, + "loss": 0.9603, + "step": 5296 + }, + { + "epoch": 0.8292110206637445, + "grad_norm": 2.4358675479888916, + "learning_rate": 6.102150537634409e-05, + "loss": 0.5643, + "step": 5297 + }, + { + "epoch": 0.8293675641828429, + "grad_norm": 2.0331332683563232, + "learning_rate": 6.101335940045618e-05, + "loss": 0.7653, + "step": 5298 + }, + { + "epoch": 0.8295241077019412, + "grad_norm": 6.727940082550049, + "learning_rate": 6.1005213424568264e-05, + "loss": 1.3614, + "step": 5299 + }, + { + "epoch": 0.8296806512210394, + "grad_norm": 3.0956966876983643, + "learning_rate": 6.0997067448680354e-05, + "loss": 1.6866, + "step": 5300 + }, + 
{ + "epoch": 0.8298371947401377, + "grad_norm": 0.5027851462364197, + "learning_rate": 6.098892147279245e-05, + "loss": 0.2493, + "step": 5301 + }, + { + "epoch": 0.8299937382592361, + "grad_norm": 0.6843994855880737, + "learning_rate": 6.098077549690453e-05, + "loss": 0.2823, + "step": 5302 + }, + { + "epoch": 0.8301502817783344, + "grad_norm": 0.5217085480690002, + "learning_rate": 6.097262952101662e-05, + "loss": 0.2491, + "step": 5303 + }, + { + "epoch": 0.8303068252974327, + "grad_norm": 0.7930935621261597, + "learning_rate": 6.0964483545128716e-05, + "loss": 0.33, + "step": 5304 + }, + { + "epoch": 0.830463368816531, + "grad_norm": 0.5468308329582214, + "learning_rate": 6.095633756924079e-05, + "loss": 0.3076, + "step": 5305 + }, + { + "epoch": 0.8306199123356293, + "grad_norm": 1.1770238876342773, + "learning_rate": 6.094819159335289e-05, + "loss": 0.3726, + "step": 5306 + }, + { + "epoch": 0.8307764558547276, + "grad_norm": 0.8294389247894287, + "learning_rate": 6.094004561746498e-05, + "loss": 0.3437, + "step": 5307 + }, + { + "epoch": 0.8309329993738259, + "grad_norm": 1.2726783752441406, + "learning_rate": 6.093189964157706e-05, + "loss": 0.3841, + "step": 5308 + }, + { + "epoch": 0.8310895428929242, + "grad_norm": 1.655905842781067, + "learning_rate": 6.0923753665689155e-05, + "loss": 0.6141, + "step": 5309 + }, + { + "epoch": 0.8312460864120226, + "grad_norm": 0.8553209900856018, + "learning_rate": 6.0915607689801246e-05, + "loss": 0.3122, + "step": 5310 + }, + { + "epoch": 0.8314026299311209, + "grad_norm": 1.1314302682876587, + "learning_rate": 6.090746171391333e-05, + "loss": 0.4688, + "step": 5311 + }, + { + "epoch": 0.8315591734502191, + "grad_norm": 0.8515439033508301, + "learning_rate": 6.089931573802542e-05, + "loss": 0.4307, + "step": 5312 + }, + { + "epoch": 0.8317157169693175, + "grad_norm": 1.206177830696106, + "learning_rate": 6.089116976213751e-05, + "loss": 0.3795, + "step": 5313 + }, + { + "epoch": 0.8318722604884158, + "grad_norm": 
1.2328027486801147, + "learning_rate": 6.0883023786249595e-05, + "loss": 0.5286, + "step": 5314 + }, + { + "epoch": 0.8320288040075141, + "grad_norm": 1.4709664583206177, + "learning_rate": 6.0874877810361685e-05, + "loss": 0.4599, + "step": 5315 + }, + { + "epoch": 0.8321853475266124, + "grad_norm": 1.2622349262237549, + "learning_rate": 6.0866731834473776e-05, + "loss": 0.5391, + "step": 5316 + }, + { + "epoch": 0.8323418910457107, + "grad_norm": 0.8720753192901611, + "learning_rate": 6.085858585858586e-05, + "loss": 0.3589, + "step": 5317 + }, + { + "epoch": 0.832498434564809, + "grad_norm": 1.2046051025390625, + "learning_rate": 6.085043988269795e-05, + "loss": 0.3458, + "step": 5318 + }, + { + "epoch": 0.8326549780839073, + "grad_norm": 1.2181085348129272, + "learning_rate": 6.084229390681005e-05, + "loss": 0.803, + "step": 5319 + }, + { + "epoch": 0.8328115216030056, + "grad_norm": 1.1022517681121826, + "learning_rate": 6.0834147930922124e-05, + "loss": 0.4858, + "step": 5320 + }, + { + "epoch": 0.832968065122104, + "grad_norm": 1.637634515762329, + "learning_rate": 6.0826001955034215e-05, + "loss": 0.4766, + "step": 5321 + }, + { + "epoch": 0.8331246086412023, + "grad_norm": 2.7707488536834717, + "learning_rate": 6.081785597914631e-05, + "loss": 0.8294, + "step": 5322 + }, + { + "epoch": 0.8332811521603005, + "grad_norm": 2.6427834033966064, + "learning_rate": 6.080971000325839e-05, + "loss": 0.728, + "step": 5323 + }, + { + "epoch": 0.8334376956793988, + "grad_norm": 3.0325303077697754, + "learning_rate": 6.0801564027370486e-05, + "loss": 0.7306, + "step": 5324 + }, + { + "epoch": 0.8335942391984972, + "grad_norm": 2.115546226501465, + "learning_rate": 6.079341805148258e-05, + "loss": 0.5867, + "step": 5325 + }, + { + "epoch": 0.8337507827175955, + "grad_norm": 3.994058132171631, + "learning_rate": 6.0785272075594654e-05, + "loss": 0.7642, + "step": 5326 + }, + { + "epoch": 0.8339073262366938, + "grad_norm": 3.897233724594116, + "learning_rate": 
6.077712609970675e-05, + "loss": 0.6015, + "step": 5327 + }, + { + "epoch": 0.8340638697557922, + "grad_norm": 2.956090211868286, + "learning_rate": 6.076898012381884e-05, + "loss": 0.8439, + "step": 5328 + }, + { + "epoch": 0.8342204132748904, + "grad_norm": 2.1760990619659424, + "learning_rate": 6.0760834147930925e-05, + "loss": 0.5667, + "step": 5329 + }, + { + "epoch": 0.8343769567939887, + "grad_norm": 3.4608137607574463, + "learning_rate": 6.0752688172043016e-05, + "loss": 0.6594, + "step": 5330 + }, + { + "epoch": 0.834533500313087, + "grad_norm": 1.424245834350586, + "learning_rate": 6.0744542196155106e-05, + "loss": 0.6264, + "step": 5331 + }, + { + "epoch": 0.8346900438321854, + "grad_norm": 3.9865870475769043, + "learning_rate": 6.073639622026719e-05, + "loss": 0.6111, + "step": 5332 + }, + { + "epoch": 0.8348465873512837, + "grad_norm": 2.0970871448516846, + "learning_rate": 6.072825024437928e-05, + "loss": 0.7201, + "step": 5333 + }, + { + "epoch": 0.8350031308703819, + "grad_norm": 2.3476569652557373, + "learning_rate": 6.072010426849137e-05, + "loss": 0.912, + "step": 5334 + }, + { + "epoch": 0.8351596743894802, + "grad_norm": 2.7833664417266846, + "learning_rate": 6.0711958292603455e-05, + "loss": 0.8076, + "step": 5335 + }, + { + "epoch": 0.8353162179085786, + "grad_norm": 2.2423288822174072, + "learning_rate": 6.0703812316715545e-05, + "loss": 0.6664, + "step": 5336 + }, + { + "epoch": 0.8354727614276769, + "grad_norm": 3.1082794666290283, + "learning_rate": 6.0695666340827636e-05, + "loss": 0.9382, + "step": 5337 + }, + { + "epoch": 0.8356293049467752, + "grad_norm": 4.014952182769775, + "learning_rate": 6.068752036493972e-05, + "loss": 0.9751, + "step": 5338 + }, + { + "epoch": 0.8357858484658736, + "grad_norm": 5.697469711303711, + "learning_rate": 6.067937438905181e-05, + "loss": 0.8851, + "step": 5339 + }, + { + "epoch": 0.8359423919849718, + "grad_norm": 2.3953723907470703, + "learning_rate": 6.067122841316391e-05, + "loss": 0.6277, + 
"step": 5340 + }, + { + "epoch": 0.8360989355040701, + "grad_norm": 5.9300456047058105, + "learning_rate": 6.0663082437275985e-05, + "loss": 0.7335, + "step": 5341 + }, + { + "epoch": 0.8362554790231684, + "grad_norm": 3.5478675365448, + "learning_rate": 6.065493646138808e-05, + "loss": 0.9485, + "step": 5342 + }, + { + "epoch": 0.8364120225422668, + "grad_norm": 7.171238899230957, + "learning_rate": 6.064679048550017e-05, + "loss": 1.1059, + "step": 5343 + }, + { + "epoch": 0.8365685660613651, + "grad_norm": 2.4004721641540527, + "learning_rate": 6.063864450961225e-05, + "loss": 1.0421, + "step": 5344 + }, + { + "epoch": 0.8367251095804634, + "grad_norm": 2.809591054916382, + "learning_rate": 6.063049853372435e-05, + "loss": 0.7284, + "step": 5345 + }, + { + "epoch": 0.8368816530995616, + "grad_norm": 3.8347232341766357, + "learning_rate": 6.062235255783644e-05, + "loss": 0.6712, + "step": 5346 + }, + { + "epoch": 0.83703819661866, + "grad_norm": 3.5065743923187256, + "learning_rate": 6.061420658194852e-05, + "loss": 1.2916, + "step": 5347 + }, + { + "epoch": 0.8371947401377583, + "grad_norm": 1.3483152389526367, + "learning_rate": 6.060606060606061e-05, + "loss": 0.3717, + "step": 5348 + }, + { + "epoch": 0.8373512836568566, + "grad_norm": 3.3616669178009033, + "learning_rate": 6.05979146301727e-05, + "loss": 0.7193, + "step": 5349 + }, + { + "epoch": 0.837507827175955, + "grad_norm": 2.857607364654541, + "learning_rate": 6.0589768654284786e-05, + "loss": 0.8992, + "step": 5350 + }, + { + "epoch": 0.8376643706950532, + "grad_norm": 0.5286772847175598, + "learning_rate": 6.0581622678396876e-05, + "loss": 0.2153, + "step": 5351 + }, + { + "epoch": 0.8378209142141515, + "grad_norm": 0.8061527609825134, + "learning_rate": 6.057347670250897e-05, + "loss": 0.3042, + "step": 5352 + }, + { + "epoch": 0.8379774577332498, + "grad_norm": 0.5429078936576843, + "learning_rate": 6.056533072662105e-05, + "loss": 0.2596, + "step": 5353 + }, + { + "epoch": 0.8381340012523482, + 
"grad_norm": 1.119554042816162, + "learning_rate": 6.055718475073314e-05, + "loss": 0.3439, + "step": 5354 + }, + { + "epoch": 0.8382905447714465, + "grad_norm": 0.640008807182312, + "learning_rate": 6.054903877484523e-05, + "loss": 0.2159, + "step": 5355 + }, + { + "epoch": 0.8384470882905448, + "grad_norm": 0.5420833826065063, + "learning_rate": 6.0540892798957315e-05, + "loss": 0.3568, + "step": 5356 + }, + { + "epoch": 0.838603631809643, + "grad_norm": 0.6244762539863586, + "learning_rate": 6.0532746823069406e-05, + "loss": 0.2203, + "step": 5357 + }, + { + "epoch": 0.8387601753287414, + "grad_norm": 1.6230146884918213, + "learning_rate": 6.05246008471815e-05, + "loss": 0.2759, + "step": 5358 + }, + { + "epoch": 0.8389167188478397, + "grad_norm": 0.9389179348945618, + "learning_rate": 6.051645487129358e-05, + "loss": 0.3135, + "step": 5359 + }, + { + "epoch": 0.839073262366938, + "grad_norm": 0.8818756341934204, + "learning_rate": 6.050830889540568e-05, + "loss": 0.266, + "step": 5360 + }, + { + "epoch": 0.8392298058860364, + "grad_norm": 1.0641034841537476, + "learning_rate": 6.050016291951777e-05, + "loss": 0.3352, + "step": 5361 + }, + { + "epoch": 0.8393863494051347, + "grad_norm": 1.4963665008544922, + "learning_rate": 6.0492016943629845e-05, + "loss": 0.4422, + "step": 5362 + }, + { + "epoch": 0.8395428929242329, + "grad_norm": 1.4346555471420288, + "learning_rate": 6.048387096774194e-05, + "loss": 0.5504, + "step": 5363 + }, + { + "epoch": 0.8396994364433312, + "grad_norm": 1.3006185293197632, + "learning_rate": 6.047572499185403e-05, + "loss": 0.3781, + "step": 5364 + }, + { + "epoch": 0.8398559799624296, + "grad_norm": 1.846401572227478, + "learning_rate": 6.0467579015966116e-05, + "loss": 0.2837, + "step": 5365 + }, + { + "epoch": 0.8400125234815279, + "grad_norm": 1.544712781906128, + "learning_rate": 6.045943304007821e-05, + "loss": 0.5468, + "step": 5366 + }, + { + "epoch": 0.8401690670006262, + "grad_norm": 1.6740056276321411, + "learning_rate": 
6.04512870641903e-05, + "loss": 0.4657, + "step": 5367 + }, + { + "epoch": 0.8403256105197244, + "grad_norm": 1.0058600902557373, + "learning_rate": 6.044314108830238e-05, + "loss": 0.3633, + "step": 5368 + }, + { + "epoch": 0.8404821540388228, + "grad_norm": 1.7011468410491943, + "learning_rate": 6.043499511241447e-05, + "loss": 0.3932, + "step": 5369 + }, + { + "epoch": 0.8406386975579211, + "grad_norm": 1.7930282354354858, + "learning_rate": 6.042684913652656e-05, + "loss": 0.5111, + "step": 5370 + }, + { + "epoch": 0.8407952410770194, + "grad_norm": 1.4254179000854492, + "learning_rate": 6.0418703160638646e-05, + "loss": 0.3925, + "step": 5371 + }, + { + "epoch": 0.8409517845961177, + "grad_norm": 2.5728864669799805, + "learning_rate": 6.0410557184750737e-05, + "loss": 0.6847, + "step": 5372 + }, + { + "epoch": 0.8411083281152161, + "grad_norm": 1.6146100759506226, + "learning_rate": 6.040241120886282e-05, + "loss": 0.7509, + "step": 5373 + }, + { + "epoch": 0.8412648716343143, + "grad_norm": 3.6361477375030518, + "learning_rate": 6.039426523297491e-05, + "loss": 1.1826, + "step": 5374 + }, + { + "epoch": 0.8414214151534126, + "grad_norm": 1.476425290107727, + "learning_rate": 6.0386119257087e-05, + "loss": 0.4404, + "step": 5375 + }, + { + "epoch": 0.841577958672511, + "grad_norm": 2.603708028793335, + "learning_rate": 6.0377973281199085e-05, + "loss": 0.489, + "step": 5376 + }, + { + "epoch": 0.8417345021916093, + "grad_norm": 1.5301849842071533, + "learning_rate": 6.0369827305311176e-05, + "loss": 0.6892, + "step": 5377 + }, + { + "epoch": 0.8418910457107076, + "grad_norm": 2.5470283031463623, + "learning_rate": 6.036168132942327e-05, + "loss": 0.7763, + "step": 5378 + }, + { + "epoch": 0.8420475892298059, + "grad_norm": 1.1967774629592896, + "learning_rate": 6.035353535353535e-05, + "loss": 0.3853, + "step": 5379 + }, + { + "epoch": 0.8422041327489042, + "grad_norm": 1.5100548267364502, + "learning_rate": 6.034538937764744e-05, + "loss": 0.4114, + "step": 
5380 + }, + { + "epoch": 0.8423606762680025, + "grad_norm": 1.6263009309768677, + "learning_rate": 6.033724340175954e-05, + "loss": 0.9804, + "step": 5381 + }, + { + "epoch": 0.8425172197871008, + "grad_norm": 2.5686964988708496, + "learning_rate": 6.0329097425871615e-05, + "loss": 0.7735, + "step": 5382 + }, + { + "epoch": 0.8426737633061991, + "grad_norm": 2.644500732421875, + "learning_rate": 6.032095144998371e-05, + "loss": 1.0138, + "step": 5383 + }, + { + "epoch": 0.8428303068252975, + "grad_norm": 2.6102468967437744, + "learning_rate": 6.03128054740958e-05, + "loss": 0.6708, + "step": 5384 + }, + { + "epoch": 0.8429868503443957, + "grad_norm": 2.3857808113098145, + "learning_rate": 6.030465949820788e-05, + "loss": 1.0056, + "step": 5385 + }, + { + "epoch": 0.843143393863494, + "grad_norm": 5.273211479187012, + "learning_rate": 6.029651352231998e-05, + "loss": 2.0149, + "step": 5386 + }, + { + "epoch": 0.8432999373825923, + "grad_norm": 3.4087677001953125, + "learning_rate": 6.028836754643207e-05, + "loss": 0.7637, + "step": 5387 + }, + { + "epoch": 0.8434564809016907, + "grad_norm": 2.771296739578247, + "learning_rate": 6.028022157054415e-05, + "loss": 1.1621, + "step": 5388 + }, + { + "epoch": 0.843613024420789, + "grad_norm": 2.9216625690460205, + "learning_rate": 6.027207559465624e-05, + "loss": 0.8789, + "step": 5389 + }, + { + "epoch": 0.8437695679398873, + "grad_norm": 5.07627010345459, + "learning_rate": 6.026392961876833e-05, + "loss": 1.754, + "step": 5390 + }, + { + "epoch": 0.8439261114589856, + "grad_norm": 3.0921220779418945, + "learning_rate": 6.0255783642880416e-05, + "loss": 1.1952, + "step": 5391 + }, + { + "epoch": 0.8440826549780839, + "grad_norm": 7.292037010192871, + "learning_rate": 6.0247637666992506e-05, + "loss": 1.7851, + "step": 5392 + }, + { + "epoch": 0.8442391984971822, + "grad_norm": 3.0476233959198, + "learning_rate": 6.02394916911046e-05, + "loss": 1.2791, + "step": 5393 + }, + { + "epoch": 0.8443957420162805, + "grad_norm": 
5.9591593742370605, + "learning_rate": 6.023134571521668e-05, + "loss": 0.8269, + "step": 5394 + }, + { + "epoch": 0.8445522855353789, + "grad_norm": 1.9713138341903687, + "learning_rate": 6.022319973932877e-05, + "loss": 0.6119, + "step": 5395 + }, + { + "epoch": 0.8447088290544772, + "grad_norm": 1.622549057006836, + "learning_rate": 6.021505376344086e-05, + "loss": 0.4119, + "step": 5396 + }, + { + "epoch": 0.8448653725735754, + "grad_norm": 4.068728923797607, + "learning_rate": 6.0206907787552946e-05, + "loss": 0.9249, + "step": 5397 + }, + { + "epoch": 0.8450219160926737, + "grad_norm": 7.709217548370361, + "learning_rate": 6.0198761811665036e-05, + "loss": 1.0799, + "step": 5398 + }, + { + "epoch": 0.8451784596117721, + "grad_norm": 3.36576247215271, + "learning_rate": 6.019061583577713e-05, + "loss": 1.2573, + "step": 5399 + }, + { + "epoch": 0.8453350031308704, + "grad_norm": 3.096303939819336, + "learning_rate": 6.018246985988921e-05, + "loss": 1.0201, + "step": 5400 + }, + { + "epoch": 0.8454915466499687, + "grad_norm": 0.3892497718334198, + "learning_rate": 6.017432388400131e-05, + "loss": 0.2403, + "step": 5401 + }, + { + "epoch": 0.845648090169067, + "grad_norm": 0.4642884433269501, + "learning_rate": 6.01661779081134e-05, + "loss": 0.2832, + "step": 5402 + }, + { + "epoch": 0.8458046336881653, + "grad_norm": 0.4376092553138733, + "learning_rate": 6.0158031932225475e-05, + "loss": 0.2374, + "step": 5403 + }, + { + "epoch": 0.8459611772072636, + "grad_norm": 1.271323800086975, + "learning_rate": 6.014988595633757e-05, + "loss": 0.3886, + "step": 5404 + }, + { + "epoch": 0.8461177207263619, + "grad_norm": 0.48567450046539307, + "learning_rate": 6.014173998044966e-05, + "loss": 0.1715, + "step": 5405 + }, + { + "epoch": 0.8462742642454603, + "grad_norm": 0.5607938766479492, + "learning_rate": 6.013359400456175e-05, + "loss": 0.2589, + "step": 5406 + }, + { + "epoch": 0.8464308077645586, + "grad_norm": 0.8393260836601257, + "learning_rate": 
6.012544802867384e-05, + "loss": 0.2672, + "step": 5407 + }, + { + "epoch": 0.8465873512836568, + "grad_norm": 0.6491692662239075, + "learning_rate": 6.011730205278593e-05, + "loss": 0.2409, + "step": 5408 + }, + { + "epoch": 0.8467438948027551, + "grad_norm": 0.9220317006111145, + "learning_rate": 6.010915607689801e-05, + "loss": 0.535, + "step": 5409 + }, + { + "epoch": 0.8469004383218535, + "grad_norm": 0.9282836318016052, + "learning_rate": 6.01010101010101e-05, + "loss": 0.387, + "step": 5410 + }, + { + "epoch": 0.8470569818409518, + "grad_norm": 0.6408154964447021, + "learning_rate": 6.009286412512219e-05, + "loss": 0.2758, + "step": 5411 + }, + { + "epoch": 0.8472135253600501, + "grad_norm": 0.9305775165557861, + "learning_rate": 6.0084718149234276e-05, + "loss": 0.3105, + "step": 5412 + }, + { + "epoch": 0.8473700688791485, + "grad_norm": 1.3824748992919922, + "learning_rate": 6.007657217334637e-05, + "loss": 0.4381, + "step": 5413 + }, + { + "epoch": 0.8475266123982467, + "grad_norm": 1.060451626777649, + "learning_rate": 6.006842619745846e-05, + "loss": 0.2449, + "step": 5414 + }, + { + "epoch": 0.847683155917345, + "grad_norm": 1.3796744346618652, + "learning_rate": 6.006028022157054e-05, + "loss": 0.3819, + "step": 5415 + }, + { + "epoch": 0.8478396994364433, + "grad_norm": 1.4318833351135254, + "learning_rate": 6.005213424568263e-05, + "loss": 0.8912, + "step": 5416 + }, + { + "epoch": 0.8479962429555417, + "grad_norm": 1.1379272937774658, + "learning_rate": 6.004398826979473e-05, + "loss": 0.5477, + "step": 5417 + }, + { + "epoch": 0.84815278647464, + "grad_norm": 1.4896435737609863, + "learning_rate": 6.0035842293906806e-05, + "loss": 0.582, + "step": 5418 + }, + { + "epoch": 0.8483093299937383, + "grad_norm": 3.1661972999572754, + "learning_rate": 6.00276963180189e-05, + "loss": 0.8752, + "step": 5419 + }, + { + "epoch": 0.8484658735128365, + "grad_norm": 0.8245986700057983, + "learning_rate": 6.0019550342130994e-05, + "loss": 0.399, + "step": 5420 
+ }, + { + "epoch": 0.8486224170319349, + "grad_norm": 1.3120903968811035, + "learning_rate": 6.001140436624307e-05, + "loss": 0.4597, + "step": 5421 + }, + { + "epoch": 0.8487789605510332, + "grad_norm": 1.4632010459899902, + "learning_rate": 6.000325839035517e-05, + "loss": 0.4656, + "step": 5422 + }, + { + "epoch": 0.8489355040701315, + "grad_norm": 1.078107476234436, + "learning_rate": 5.999511241446726e-05, + "loss": 0.4484, + "step": 5423 + }, + { + "epoch": 0.8490920475892298, + "grad_norm": 2.023293972015381, + "learning_rate": 5.998696643857934e-05, + "loss": 0.7947, + "step": 5424 + }, + { + "epoch": 0.8492485911083281, + "grad_norm": 2.5043177604675293, + "learning_rate": 5.997882046269143e-05, + "loss": 0.8959, + "step": 5425 + }, + { + "epoch": 0.8494051346274264, + "grad_norm": 5.2309136390686035, + "learning_rate": 5.997067448680352e-05, + "loss": 1.3844, + "step": 5426 + }, + { + "epoch": 0.8495616781465247, + "grad_norm": 1.6641303300857544, + "learning_rate": 5.996252851091561e-05, + "loss": 0.8358, + "step": 5427 + }, + { + "epoch": 0.849718221665623, + "grad_norm": 1.8463335037231445, + "learning_rate": 5.99543825350277e-05, + "loss": 0.4781, + "step": 5428 + }, + { + "epoch": 0.8498747651847214, + "grad_norm": 1.8724762201309204, + "learning_rate": 5.994623655913979e-05, + "loss": 0.531, + "step": 5429 + }, + { + "epoch": 0.8500313087038197, + "grad_norm": 2.480299472808838, + "learning_rate": 5.993809058325187e-05, + "loss": 0.7346, + "step": 5430 + }, + { + "epoch": 0.8501878522229179, + "grad_norm": 1.658918857574463, + "learning_rate": 5.992994460736396e-05, + "loss": 0.9755, + "step": 5431 + }, + { + "epoch": 0.8503443957420163, + "grad_norm": 2.265087366104126, + "learning_rate": 5.992179863147605e-05, + "loss": 0.6529, + "step": 5432 + }, + { + "epoch": 0.8505009392611146, + "grad_norm": 2.218214273452759, + "learning_rate": 5.991365265558814e-05, + "loss": 1.0925, + "step": 5433 + }, + { + "epoch": 0.8506574827802129, + "grad_norm": 
1.6540828943252563, + "learning_rate": 5.990550667970023e-05, + "loss": 0.6818, + "step": 5434 + }, + { + "epoch": 0.8508140262993112, + "grad_norm": 4.167775630950928, + "learning_rate": 5.9897360703812325e-05, + "loss": 0.9295, + "step": 5435 + }, + { + "epoch": 0.8509705698184096, + "grad_norm": 1.9077893495559692, + "learning_rate": 5.98892147279244e-05, + "loss": 0.4971, + "step": 5436 + }, + { + "epoch": 0.8511271133375078, + "grad_norm": 3.6881799697875977, + "learning_rate": 5.98810687520365e-05, + "loss": 0.986, + "step": 5437 + }, + { + "epoch": 0.8512836568566061, + "grad_norm": 4.474174976348877, + "learning_rate": 5.987292277614859e-05, + "loss": 1.2331, + "step": 5438 + }, + { + "epoch": 0.8514402003757044, + "grad_norm": 5.37129545211792, + "learning_rate": 5.9864776800260666e-05, + "loss": 1.4407, + "step": 5439 + }, + { + "epoch": 0.8515967438948028, + "grad_norm": 2.6918904781341553, + "learning_rate": 5.9856630824372764e-05, + "loss": 1.1923, + "step": 5440 + }, + { + "epoch": 0.8517532874139011, + "grad_norm": 2.0565409660339355, + "learning_rate": 5.9848484848484854e-05, + "loss": 0.7909, + "step": 5441 + }, + { + "epoch": 0.8519098309329993, + "grad_norm": 4.735820770263672, + "learning_rate": 5.984033887259694e-05, + "loss": 1.2377, + "step": 5442 + }, + { + "epoch": 0.8520663744520977, + "grad_norm": 4.168967247009277, + "learning_rate": 5.983219289670903e-05, + "loss": 1.5254, + "step": 5443 + }, + { + "epoch": 0.852222917971196, + "grad_norm": 5.895458221435547, + "learning_rate": 5.982404692082112e-05, + "loss": 1.3735, + "step": 5444 + }, + { + "epoch": 0.8523794614902943, + "grad_norm": 2.454702854156494, + "learning_rate": 5.98159009449332e-05, + "loss": 0.8208, + "step": 5445 + }, + { + "epoch": 0.8525360050093926, + "grad_norm": 2.998230457305908, + "learning_rate": 5.980775496904529e-05, + "loss": 1.1169, + "step": 5446 + }, + { + "epoch": 0.852692548528491, + "grad_norm": 2.541281223297119, + "learning_rate": 
5.9799608993157384e-05, + "loss": 0.5445, + "step": 5447 + }, + { + "epoch": 0.8528490920475892, + "grad_norm": 1.3784314393997192, + "learning_rate": 5.979146301726947e-05, + "loss": 0.4074, + "step": 5448 + }, + { + "epoch": 0.8530056355666875, + "grad_norm": 3.8045036792755127, + "learning_rate": 5.978331704138156e-05, + "loss": 0.9726, + "step": 5449 + }, + { + "epoch": 0.8531621790857858, + "grad_norm": 3.7253432273864746, + "learning_rate": 5.977517106549365e-05, + "loss": 1.0571, + "step": 5450 + }, + { + "epoch": 0.8533187226048842, + "grad_norm": 0.3533252477645874, + "learning_rate": 5.976702508960573e-05, + "loss": 0.1919, + "step": 5451 + }, + { + "epoch": 0.8534752661239825, + "grad_norm": 0.9153391718864441, + "learning_rate": 5.975887911371782e-05, + "loss": 0.3715, + "step": 5452 + }, + { + "epoch": 0.8536318096430808, + "grad_norm": 0.5268846154212952, + "learning_rate": 5.975073313782992e-05, + "loss": 0.2432, + "step": 5453 + }, + { + "epoch": 0.853788353162179, + "grad_norm": 0.6055665016174316, + "learning_rate": 5.9742587161942e-05, + "loss": 0.2523, + "step": 5454 + }, + { + "epoch": 0.8539448966812774, + "grad_norm": 0.6184440851211548, + "learning_rate": 5.973444118605409e-05, + "loss": 0.3346, + "step": 5455 + }, + { + "epoch": 0.8541014402003757, + "grad_norm": 0.9909890294075012, + "learning_rate": 5.9726295210166185e-05, + "loss": 0.4776, + "step": 5456 + }, + { + "epoch": 0.854257983719474, + "grad_norm": 1.0755350589752197, + "learning_rate": 5.971814923427826e-05, + "loss": 0.5381, + "step": 5457 + }, + { + "epoch": 0.8544145272385724, + "grad_norm": 0.7765876054763794, + "learning_rate": 5.971000325839036e-05, + "loss": 0.4243, + "step": 5458 + }, + { + "epoch": 0.8545710707576706, + "grad_norm": 0.8233168125152588, + "learning_rate": 5.970185728250245e-05, + "loss": 0.2383, + "step": 5459 + }, + { + "epoch": 0.8547276142767689, + "grad_norm": 0.6127341389656067, + "learning_rate": 5.9693711306614533e-05, + "loss": 0.4137, + "step": 
5460 + }, + { + "epoch": 0.8548841577958672, + "grad_norm": 0.598900318145752, + "learning_rate": 5.9685565330726624e-05, + "loss": 0.4161, + "step": 5461 + }, + { + "epoch": 0.8550407013149656, + "grad_norm": 0.8214806318283081, + "learning_rate": 5.9677419354838715e-05, + "loss": 0.3643, + "step": 5462 + }, + { + "epoch": 0.8551972448340639, + "grad_norm": 2.7143466472625732, + "learning_rate": 5.96692733789508e-05, + "loss": 0.4585, + "step": 5463 + }, + { + "epoch": 0.8553537883531622, + "grad_norm": 1.9841428995132446, + "learning_rate": 5.966112740306289e-05, + "loss": 0.5751, + "step": 5464 + }, + { + "epoch": 0.8555103318722604, + "grad_norm": 0.9186218976974487, + "learning_rate": 5.965298142717498e-05, + "loss": 0.3759, + "step": 5465 + }, + { + "epoch": 0.8556668753913588, + "grad_norm": 1.317680835723877, + "learning_rate": 5.964483545128706e-05, + "loss": 0.5381, + "step": 5466 + }, + { + "epoch": 0.8558234189104571, + "grad_norm": 1.6145505905151367, + "learning_rate": 5.9636689475399154e-05, + "loss": 0.6974, + "step": 5467 + }, + { + "epoch": 0.8559799624295554, + "grad_norm": 1.795633316040039, + "learning_rate": 5.9628543499511244e-05, + "loss": 0.6894, + "step": 5468 + }, + { + "epoch": 0.8561365059486538, + "grad_norm": 1.0849848985671997, + "learning_rate": 5.962039752362333e-05, + "loss": 0.3506, + "step": 5469 + }, + { + "epoch": 0.8562930494677521, + "grad_norm": 1.01227867603302, + "learning_rate": 5.961225154773542e-05, + "loss": 0.5117, + "step": 5470 + }, + { + "epoch": 0.8564495929868503, + "grad_norm": 2.7633676528930664, + "learning_rate": 5.9604105571847516e-05, + "loss": 0.6361, + "step": 5471 + }, + { + "epoch": 0.8566061365059486, + "grad_norm": 2.1956052780151367, + "learning_rate": 5.959595959595959e-05, + "loss": 0.8255, + "step": 5472 + }, + { + "epoch": 0.856762680025047, + "grad_norm": 1.7503818273544312, + "learning_rate": 5.958781362007168e-05, + "loss": 0.5332, + "step": 5473 + }, + { + "epoch": 0.8569192235441453, + 
"grad_norm": 1.5323725938796997, + "learning_rate": 5.957966764418378e-05, + "loss": 0.3837, + "step": 5474 + }, + { + "epoch": 0.8570757670632436, + "grad_norm": 2.647382974624634, + "learning_rate": 5.957152166829586e-05, + "loss": 0.9867, + "step": 5475 + }, + { + "epoch": 0.8572323105823418, + "grad_norm": 1.7415530681610107, + "learning_rate": 5.9563375692407955e-05, + "loss": 1.0799, + "step": 5476 + }, + { + "epoch": 0.8573888541014402, + "grad_norm": 2.146803617477417, + "learning_rate": 5.9555229716520045e-05, + "loss": 0.6686, + "step": 5477 + }, + { + "epoch": 0.8575453976205385, + "grad_norm": 1.4132221937179565, + "learning_rate": 5.954708374063213e-05, + "loss": 0.6071, + "step": 5478 + }, + { + "epoch": 0.8577019411396368, + "grad_norm": 4.554912567138672, + "learning_rate": 5.953893776474422e-05, + "loss": 0.4333, + "step": 5479 + }, + { + "epoch": 0.8578584846587352, + "grad_norm": 1.6843363046646118, + "learning_rate": 5.953079178885631e-05, + "loss": 0.8017, + "step": 5480 + }, + { + "epoch": 0.8580150281778335, + "grad_norm": 2.8403491973876953, + "learning_rate": 5.9522645812968394e-05, + "loss": 0.9211, + "step": 5481 + }, + { + "epoch": 0.8581715716969317, + "grad_norm": 1.4705026149749756, + "learning_rate": 5.9514499837080484e-05, + "loss": 0.7697, + "step": 5482 + }, + { + "epoch": 0.85832811521603, + "grad_norm": 4.832894325256348, + "learning_rate": 5.9506353861192575e-05, + "loss": 1.4454, + "step": 5483 + }, + { + "epoch": 0.8584846587351284, + "grad_norm": 3.6842331886291504, + "learning_rate": 5.949820788530466e-05, + "loss": 1.0483, + "step": 5484 + }, + { + "epoch": 0.8586412022542267, + "grad_norm": 2.9061782360076904, + "learning_rate": 5.949006190941675e-05, + "loss": 0.5994, + "step": 5485 + }, + { + "epoch": 0.858797745773325, + "grad_norm": 2.161362648010254, + "learning_rate": 5.948191593352884e-05, + "loss": 1.1584, + "step": 5486 + }, + { + "epoch": 0.8589542892924233, + "grad_norm": 2.9222826957702637, + "learning_rate": 
5.9473769957640923e-05, + "loss": 1.3209, + "step": 5487 + }, + { + "epoch": 0.8591108328115216, + "grad_norm": 3.043940544128418, + "learning_rate": 5.9465623981753014e-05, + "loss": 1.2094, + "step": 5488 + }, + { + "epoch": 0.8592673763306199, + "grad_norm": 3.8373889923095703, + "learning_rate": 5.945747800586511e-05, + "loss": 1.1848, + "step": 5489 + }, + { + "epoch": 0.8594239198497182, + "grad_norm": 4.030364036560059, + "learning_rate": 5.944933202997719e-05, + "loss": 1.7806, + "step": 5490 + }, + { + "epoch": 0.8595804633688165, + "grad_norm": 2.5214076042175293, + "learning_rate": 5.944118605408928e-05, + "loss": 1.028, + "step": 5491 + }, + { + "epoch": 0.8597370068879149, + "grad_norm": 1.5299099683761597, + "learning_rate": 5.9433040078201376e-05, + "loss": 0.5957, + "step": 5492 + }, + { + "epoch": 0.8598935504070131, + "grad_norm": 2.0262839794158936, + "learning_rate": 5.942489410231345e-05, + "loss": 0.3756, + "step": 5493 + }, + { + "epoch": 0.8600500939261114, + "grad_norm": 2.380892276763916, + "learning_rate": 5.941674812642555e-05, + "loss": 1.5598, + "step": 5494 + }, + { + "epoch": 0.8602066374452098, + "grad_norm": 2.7562849521636963, + "learning_rate": 5.940860215053764e-05, + "loss": 0.7994, + "step": 5495 + }, + { + "epoch": 0.8603631809643081, + "grad_norm": 2.3159022331237793, + "learning_rate": 5.9400456174649725e-05, + "loss": 0.8852, + "step": 5496 + }, + { + "epoch": 0.8605197244834064, + "grad_norm": 4.1425652503967285, + "learning_rate": 5.9392310198761815e-05, + "loss": 0.4073, + "step": 5497 + }, + { + "epoch": 0.8606762680025047, + "grad_norm": 1.591884970664978, + "learning_rate": 5.9384164222873906e-05, + "loss": 0.7525, + "step": 5498 + }, + { + "epoch": 0.860832811521603, + "grad_norm": 2.577976703643799, + "learning_rate": 5.937601824698599e-05, + "loss": 0.813, + "step": 5499 + }, + { + "epoch": 0.8609893550407013, + "grad_norm": 3.479261875152588, + "learning_rate": 5.936787227109808e-05, + "loss": 1.6574, + "step": 
5500 + }, + { + "epoch": 0.8611458985597996, + "grad_norm": 0.48172926902770996, + "learning_rate": 5.935972629521017e-05, + "loss": 0.2249, + "step": 5501 + }, + { + "epoch": 0.861302442078898, + "grad_norm": 0.7737534642219543, + "learning_rate": 5.9351580319322254e-05, + "loss": 0.234, + "step": 5502 + }, + { + "epoch": 0.8614589855979963, + "grad_norm": 0.6840766668319702, + "learning_rate": 5.9343434343434345e-05, + "loss": 0.2058, + "step": 5503 + }, + { + "epoch": 0.8616155291170946, + "grad_norm": 0.8967920541763306, + "learning_rate": 5.9335288367546435e-05, + "loss": 0.4156, + "step": 5504 + }, + { + "epoch": 0.8617720726361928, + "grad_norm": 0.8657261729240417, + "learning_rate": 5.932714239165852e-05, + "loss": 0.4209, + "step": 5505 + }, + { + "epoch": 0.8619286161552911, + "grad_norm": 0.6968880891799927, + "learning_rate": 5.931899641577061e-05, + "loss": 0.2901, + "step": 5506 + }, + { + "epoch": 0.8620851596743895, + "grad_norm": 0.8164982795715332, + "learning_rate": 5.931085043988271e-05, + "loss": 0.3196, + "step": 5507 + }, + { + "epoch": 0.8622417031934878, + "grad_norm": 0.611218273639679, + "learning_rate": 5.9302704463994784e-05, + "loss": 0.2863, + "step": 5508 + }, + { + "epoch": 0.8623982467125861, + "grad_norm": 0.5277093052864075, + "learning_rate": 5.9294558488106874e-05, + "loss": 0.1928, + "step": 5509 + }, + { + "epoch": 0.8625547902316844, + "grad_norm": 1.0876168012619019, + "learning_rate": 5.928641251221897e-05, + "loss": 0.2831, + "step": 5510 + }, + { + "epoch": 0.8627113337507827, + "grad_norm": 0.9069687724113464, + "learning_rate": 5.927826653633105e-05, + "loss": 0.3199, + "step": 5511 + }, + { + "epoch": 0.862867877269881, + "grad_norm": 1.0563366413116455, + "learning_rate": 5.9270120560443146e-05, + "loss": 0.2922, + "step": 5512 + }, + { + "epoch": 0.8630244207889793, + "grad_norm": 1.1015454530715942, + "learning_rate": 5.9261974584555236e-05, + "loss": 0.3683, + "step": 5513 + }, + { + "epoch": 0.8631809643080777, 
+ "grad_norm": 0.9517279863357544, + "learning_rate": 5.9253828608667313e-05, + "loss": 0.3215, + "step": 5514 + }, + { + "epoch": 0.863337507827176, + "grad_norm": 1.7878706455230713, + "learning_rate": 5.924568263277941e-05, + "loss": 0.4443, + "step": 5515 + }, + { + "epoch": 0.8634940513462742, + "grad_norm": 1.77521812915802, + "learning_rate": 5.92375366568915e-05, + "loss": 0.575, + "step": 5516 + }, + { + "epoch": 0.8636505948653725, + "grad_norm": 2.2090823650360107, + "learning_rate": 5.9229390681003585e-05, + "loss": 0.5984, + "step": 5517 + }, + { + "epoch": 0.8638071383844709, + "grad_norm": 1.2394325733184814, + "learning_rate": 5.9221244705115676e-05, + "loss": 0.4943, + "step": 5518 + }, + { + "epoch": 0.8639636819035692, + "grad_norm": 1.3861702680587769, + "learning_rate": 5.9213098729227766e-05, + "loss": 0.3762, + "step": 5519 + }, + { + "epoch": 0.8641202254226675, + "grad_norm": 1.8215223550796509, + "learning_rate": 5.920495275333985e-05, + "loss": 0.5202, + "step": 5520 + }, + { + "epoch": 0.8642767689417659, + "grad_norm": 1.9948828220367432, + "learning_rate": 5.919680677745194e-05, + "loss": 0.5168, + "step": 5521 + }, + { + "epoch": 0.8644333124608641, + "grad_norm": 1.412211298942566, + "learning_rate": 5.918866080156403e-05, + "loss": 0.5749, + "step": 5522 + }, + { + "epoch": 0.8645898559799624, + "grad_norm": 2.1438796520233154, + "learning_rate": 5.9180514825676115e-05, + "loss": 0.7083, + "step": 5523 + }, + { + "epoch": 0.8647463994990607, + "grad_norm": 1.920267105102539, + "learning_rate": 5.9172368849788205e-05, + "loss": 0.6837, + "step": 5524 + }, + { + "epoch": 0.8649029430181591, + "grad_norm": 1.5543715953826904, + "learning_rate": 5.91642228739003e-05, + "loss": 0.5249, + "step": 5525 + }, + { + "epoch": 0.8650594865372574, + "grad_norm": 1.9056282043457031, + "learning_rate": 5.915607689801238e-05, + "loss": 0.509, + "step": 5526 + }, + { + "epoch": 0.8652160300563556, + "grad_norm": 2.1754462718963623, + 
"learning_rate": 5.914793092212447e-05, + "loss": 0.9442, + "step": 5527 + }, + { + "epoch": 0.8653725735754539, + "grad_norm": 1.6455159187316895, + "learning_rate": 5.913978494623657e-05, + "loss": 0.8949, + "step": 5528 + }, + { + "epoch": 0.8655291170945523, + "grad_norm": 2.502504587173462, + "learning_rate": 5.9131638970348644e-05, + "loss": 1.0707, + "step": 5529 + }, + { + "epoch": 0.8656856606136506, + "grad_norm": 2.3944480419158936, + "learning_rate": 5.912349299446074e-05, + "loss": 0.6534, + "step": 5530 + }, + { + "epoch": 0.8658422041327489, + "grad_norm": 3.050482749938965, + "learning_rate": 5.911534701857283e-05, + "loss": 0.7171, + "step": 5531 + }, + { + "epoch": 0.8659987476518473, + "grad_norm": 2.2033979892730713, + "learning_rate": 5.910720104268491e-05, + "loss": 0.7803, + "step": 5532 + }, + { + "epoch": 0.8661552911709455, + "grad_norm": 2.0699005126953125, + "learning_rate": 5.9099055066797006e-05, + "loss": 0.8444, + "step": 5533 + }, + { + "epoch": 0.8663118346900438, + "grad_norm": 3.5027740001678467, + "learning_rate": 5.90909090909091e-05, + "loss": 0.711, + "step": 5534 + }, + { + "epoch": 0.8664683782091421, + "grad_norm": 4.875735759735107, + "learning_rate": 5.908276311502118e-05, + "loss": 0.8425, + "step": 5535 + }, + { + "epoch": 0.8666249217282405, + "grad_norm": 2.649096727371216, + "learning_rate": 5.907461713913327e-05, + "loss": 1.0038, + "step": 5536 + }, + { + "epoch": 0.8667814652473388, + "grad_norm": 2.35786771774292, + "learning_rate": 5.906647116324536e-05, + "loss": 0.9495, + "step": 5537 + }, + { + "epoch": 0.8669380087664371, + "grad_norm": 3.479048252105713, + "learning_rate": 5.9058325187357445e-05, + "loss": 1.3289, + "step": 5538 + }, + { + "epoch": 0.8670945522855353, + "grad_norm": 3.687445878982544, + "learning_rate": 5.9050179211469536e-05, + "loss": 1.3214, + "step": 5539 + }, + { + "epoch": 0.8672510958046337, + "grad_norm": 2.7467174530029297, + "learning_rate": 5.9042033235581626e-05, + "loss": 
0.8662, + "step": 5540 + }, + { + "epoch": 0.867407639323732, + "grad_norm": 5.100554943084717, + "learning_rate": 5.903388725969371e-05, + "loss": 1.4328, + "step": 5541 + }, + { + "epoch": 0.8675641828428303, + "grad_norm": 2.867222309112549, + "learning_rate": 5.90257412838058e-05, + "loss": 1.2009, + "step": 5542 + }, + { + "epoch": 0.8677207263619287, + "grad_norm": 1.7521190643310547, + "learning_rate": 5.901759530791789e-05, + "loss": 0.9047, + "step": 5543 + }, + { + "epoch": 0.867877269881027, + "grad_norm": 2.207857608795166, + "learning_rate": 5.9009449332029975e-05, + "loss": 0.9379, + "step": 5544 + }, + { + "epoch": 0.8680338134001252, + "grad_norm": 3.548516273498535, + "learning_rate": 5.9001303356142065e-05, + "loss": 1.241, + "step": 5545 + }, + { + "epoch": 0.8681903569192235, + "grad_norm": 3.3841984272003174, + "learning_rate": 5.899315738025416e-05, + "loss": 1.4431, + "step": 5546 + }, + { + "epoch": 0.8683469004383219, + "grad_norm": 3.936304807662964, + "learning_rate": 5.898501140436624e-05, + "loss": 0.9425, + "step": 5547 + }, + { + "epoch": 0.8685034439574202, + "grad_norm": 3.3981528282165527, + "learning_rate": 5.897686542847834e-05, + "loss": 0.564, + "step": 5548 + }, + { + "epoch": 0.8686599874765185, + "grad_norm": 2.6525893211364746, + "learning_rate": 5.896871945259043e-05, + "loss": 0.7714, + "step": 5549 + }, + { + "epoch": 0.8688165309956167, + "grad_norm": 2.0061185359954834, + "learning_rate": 5.8960573476702505e-05, + "loss": 0.8728, + "step": 5550 + }, + { + "epoch": 0.8689730745147151, + "grad_norm": 0.7613506317138672, + "learning_rate": 5.89524275008146e-05, + "loss": 0.334, + "step": 5551 + }, + { + "epoch": 0.8691296180338134, + "grad_norm": 0.6146905422210693, + "learning_rate": 5.894428152492669e-05, + "loss": 0.2919, + "step": 5552 + }, + { + "epoch": 0.8692861615529117, + "grad_norm": 0.5351287126541138, + "learning_rate": 5.8936135549038776e-05, + "loss": 0.2244, + "step": 5553 + }, + { + "epoch": 
0.86944270507201, + "grad_norm": 0.7883973717689514, + "learning_rate": 5.892798957315087e-05, + "loss": 0.3242, + "step": 5554 + }, + { + "epoch": 0.8695992485911084, + "grad_norm": 0.746200442314148, + "learning_rate": 5.891984359726296e-05, + "loss": 0.2891, + "step": 5555 + }, + { + "epoch": 0.8697557921102066, + "grad_norm": 0.7840794324874878, + "learning_rate": 5.891169762137504e-05, + "loss": 0.2925, + "step": 5556 + }, + { + "epoch": 0.8699123356293049, + "grad_norm": 0.6303369402885437, + "learning_rate": 5.890355164548713e-05, + "loss": 0.2389, + "step": 5557 + }, + { + "epoch": 0.8700688791484033, + "grad_norm": 0.9928824305534363, + "learning_rate": 5.889540566959922e-05, + "loss": 0.343, + "step": 5558 + }, + { + "epoch": 0.8702254226675016, + "grad_norm": 1.0401079654693604, + "learning_rate": 5.8887259693711306e-05, + "loss": 0.2272, + "step": 5559 + }, + { + "epoch": 0.8703819661865999, + "grad_norm": 0.7101280093193054, + "learning_rate": 5.8879113717823396e-05, + "loss": 0.3374, + "step": 5560 + }, + { + "epoch": 0.8705385097056982, + "grad_norm": 1.0101439952850342, + "learning_rate": 5.887096774193549e-05, + "loss": 0.2207, + "step": 5561 + }, + { + "epoch": 0.8706950532247965, + "grad_norm": 0.9707580208778381, + "learning_rate": 5.886282176604757e-05, + "loss": 0.272, + "step": 5562 + }, + { + "epoch": 0.8708515967438948, + "grad_norm": 1.5482279062271118, + "learning_rate": 5.885467579015966e-05, + "loss": 0.3946, + "step": 5563 + }, + { + "epoch": 0.8710081402629931, + "grad_norm": 1.1519665718078613, + "learning_rate": 5.884652981427176e-05, + "loss": 0.3319, + "step": 5564 + }, + { + "epoch": 0.8711646837820914, + "grad_norm": 1.0253418684005737, + "learning_rate": 5.8838383838383835e-05, + "loss": 0.5775, + "step": 5565 + }, + { + "epoch": 0.8713212273011898, + "grad_norm": 0.8758780360221863, + "learning_rate": 5.883023786249593e-05, + "loss": 0.3563, + "step": 5566 + }, + { + "epoch": 0.871477770820288, + "grad_norm": 
1.2263768911361694, + "learning_rate": 5.882209188660802e-05, + "loss": 0.4474, + "step": 5567 + }, + { + "epoch": 0.8716343143393863, + "grad_norm": 1.9731618165969849, + "learning_rate": 5.88139459107201e-05, + "loss": 0.776, + "step": 5568 + }, + { + "epoch": 0.8717908578584846, + "grad_norm": 1.0311064720153809, + "learning_rate": 5.88057999348322e-05, + "loss": 0.3965, + "step": 5569 + }, + { + "epoch": 0.871947401377583, + "grad_norm": 1.8176010847091675, + "learning_rate": 5.879765395894429e-05, + "loss": 0.5234, + "step": 5570 + }, + { + "epoch": 0.8721039448966813, + "grad_norm": 2.634796380996704, + "learning_rate": 5.878950798305637e-05, + "loss": 0.4796, + "step": 5571 + }, + { + "epoch": 0.8722604884157796, + "grad_norm": 4.555521488189697, + "learning_rate": 5.878136200716846e-05, + "loss": 0.4825, + "step": 5572 + }, + { + "epoch": 0.8724170319348779, + "grad_norm": 2.611691474914551, + "learning_rate": 5.877321603128055e-05, + "loss": 0.6148, + "step": 5573 + }, + { + "epoch": 0.8725735754539762, + "grad_norm": 1.9506736993789673, + "learning_rate": 5.8765070055392637e-05, + "loss": 0.3155, + "step": 5574 + }, + { + "epoch": 0.8727301189730745, + "grad_norm": 2.6600797176361084, + "learning_rate": 5.875692407950473e-05, + "loss": 0.7015, + "step": 5575 + }, + { + "epoch": 0.8728866624921728, + "grad_norm": 1.3806215524673462, + "learning_rate": 5.874877810361682e-05, + "loss": 0.3839, + "step": 5576 + }, + { + "epoch": 0.8730432060112712, + "grad_norm": 1.6722749471664429, + "learning_rate": 5.87406321277289e-05, + "loss": 0.4725, + "step": 5577 + }, + { + "epoch": 0.8731997495303695, + "grad_norm": 3.17425799369812, + "learning_rate": 5.873248615184099e-05, + "loss": 0.616, + "step": 5578 + }, + { + "epoch": 0.8733562930494677, + "grad_norm": 5.947484016418457, + "learning_rate": 5.872434017595308e-05, + "loss": 1.2217, + "step": 5579 + }, + { + "epoch": 0.873512836568566, + "grad_norm": 1.1996161937713623, + "learning_rate": 
5.8716194200065166e-05, + "loss": 0.5405, + "step": 5580 + }, + { + "epoch": 0.8736693800876644, + "grad_norm": 3.291942834854126, + "learning_rate": 5.870804822417726e-05, + "loss": 0.7252, + "step": 5581 + }, + { + "epoch": 0.8738259236067627, + "grad_norm": 4.42039680480957, + "learning_rate": 5.8699902248289354e-05, + "loss": 1.3133, + "step": 5582 + }, + { + "epoch": 0.873982467125861, + "grad_norm": 2.2321555614471436, + "learning_rate": 5.869175627240143e-05, + "loss": 0.8139, + "step": 5583 + }, + { + "epoch": 0.8741390106449592, + "grad_norm": 5.462028980255127, + "learning_rate": 5.868361029651352e-05, + "loss": 0.9876, + "step": 5584 + }, + { + "epoch": 0.8742955541640576, + "grad_norm": 2.9468157291412354, + "learning_rate": 5.867546432062562e-05, + "loss": 1.1577, + "step": 5585 + }, + { + "epoch": 0.8744520976831559, + "grad_norm": 3.9236979484558105, + "learning_rate": 5.8667318344737696e-05, + "loss": 0.581, + "step": 5586 + }, + { + "epoch": 0.8746086412022542, + "grad_norm": 5.916179180145264, + "learning_rate": 5.865917236884979e-05, + "loss": 1.5488, + "step": 5587 + }, + { + "epoch": 0.8747651847213526, + "grad_norm": 2.869581699371338, + "learning_rate": 5.8651026392961884e-05, + "loss": 0.8473, + "step": 5588 + }, + { + "epoch": 0.8749217282404509, + "grad_norm": 2.6398251056671143, + "learning_rate": 5.864288041707397e-05, + "loss": 0.9443, + "step": 5589 + }, + { + "epoch": 0.8750782717595491, + "grad_norm": 2.5502991676330566, + "learning_rate": 5.863473444118606e-05, + "loss": 0.9664, + "step": 5590 + }, + { + "epoch": 0.8752348152786474, + "grad_norm": 3.1175637245178223, + "learning_rate": 5.862658846529815e-05, + "loss": 1.2969, + "step": 5591 + }, + { + "epoch": 0.8753913587977458, + "grad_norm": 3.3992087841033936, + "learning_rate": 5.861844248941023e-05, + "loss": 1.5228, + "step": 5592 + }, + { + "epoch": 0.8755479023168441, + "grad_norm": 3.840141534805298, + "learning_rate": 5.861029651352232e-05, + "loss": 1.5067, + "step": 
5593 + }, + { + "epoch": 0.8757044458359424, + "grad_norm": 1.5250256061553955, + "learning_rate": 5.860215053763441e-05, + "loss": 0.7011, + "step": 5594 + }, + { + "epoch": 0.8758609893550408, + "grad_norm": 2.907444477081299, + "learning_rate": 5.85940045617465e-05, + "loss": 1.0911, + "step": 5595 + }, + { + "epoch": 0.876017532874139, + "grad_norm": 2.3179149627685547, + "learning_rate": 5.858585858585859e-05, + "loss": 1.1743, + "step": 5596 + }, + { + "epoch": 0.8761740763932373, + "grad_norm": 4.73147439956665, + "learning_rate": 5.857771260997068e-05, + "loss": 0.9891, + "step": 5597 + }, + { + "epoch": 0.8763306199123356, + "grad_norm": 2.005425453186035, + "learning_rate": 5.856956663408276e-05, + "loss": 0.5013, + "step": 5598 + }, + { + "epoch": 0.876487163431434, + "grad_norm": 4.088343620300293, + "learning_rate": 5.856142065819485e-05, + "loss": 1.5133, + "step": 5599 + }, + { + "epoch": 0.8766437069505323, + "grad_norm": 2.9430007934570312, + "learning_rate": 5.855327468230695e-05, + "loss": 1.1424, + "step": 5600 + }, + { + "epoch": 0.8768002504696305, + "grad_norm": 0.5713847279548645, + "learning_rate": 5.8545128706419027e-05, + "loss": 0.2732, + "step": 5601 + }, + { + "epoch": 0.8769567939887288, + "grad_norm": 0.6552407145500183, + "learning_rate": 5.853698273053112e-05, + "loss": 0.2871, + "step": 5602 + }, + { + "epoch": 0.8771133375078272, + "grad_norm": 1.9758774042129517, + "learning_rate": 5.8528836754643214e-05, + "loss": 0.33, + "step": 5603 + }, + { + "epoch": 0.8772698810269255, + "grad_norm": 0.616115927696228, + "learning_rate": 5.852069077875529e-05, + "loss": 0.2011, + "step": 5604 + }, + { + "epoch": 0.8774264245460238, + "grad_norm": 1.1900806427001953, + "learning_rate": 5.851254480286739e-05, + "loss": 0.4573, + "step": 5605 + }, + { + "epoch": 0.8775829680651221, + "grad_norm": 0.8411769270896912, + "learning_rate": 5.850439882697948e-05, + "loss": 0.3418, + "step": 5606 + }, + { + "epoch": 0.8777395115842204, + 
"grad_norm": 0.7348935604095459, + "learning_rate": 5.849625285109156e-05, + "loss": 0.3825, + "step": 5607 + }, + { + "epoch": 0.8778960551033187, + "grad_norm": 0.9572919607162476, + "learning_rate": 5.8488106875203653e-05, + "loss": 0.3857, + "step": 5608 + }, + { + "epoch": 0.878052598622417, + "grad_norm": 0.8478525876998901, + "learning_rate": 5.8479960899315744e-05, + "loss": 0.4707, + "step": 5609 + }, + { + "epoch": 0.8782091421415154, + "grad_norm": 0.5869054198265076, + "learning_rate": 5.847181492342783e-05, + "loss": 0.2316, + "step": 5610 + }, + { + "epoch": 0.8783656856606137, + "grad_norm": 1.0997898578643799, + "learning_rate": 5.846366894753992e-05, + "loss": 0.3272, + "step": 5611 + }, + { + "epoch": 0.878522229179712, + "grad_norm": 0.9716269969940186, + "learning_rate": 5.845552297165201e-05, + "loss": 0.2887, + "step": 5612 + }, + { + "epoch": 0.8786787726988102, + "grad_norm": 1.098987102508545, + "learning_rate": 5.844737699576409e-05, + "loss": 0.3705, + "step": 5613 + }, + { + "epoch": 0.8788353162179086, + "grad_norm": 2.404444932937622, + "learning_rate": 5.843923101987618e-05, + "loss": 0.3096, + "step": 5614 + }, + { + "epoch": 0.8789918597370069, + "grad_norm": 1.046128273010254, + "learning_rate": 5.8431085043988274e-05, + "loss": 0.3913, + "step": 5615 + }, + { + "epoch": 0.8791484032561052, + "grad_norm": 1.8709193468093872, + "learning_rate": 5.842293906810036e-05, + "loss": 0.3729, + "step": 5616 + }, + { + "epoch": 0.8793049467752035, + "grad_norm": 2.1944234371185303, + "learning_rate": 5.841479309221245e-05, + "loss": 0.6856, + "step": 5617 + }, + { + "epoch": 0.8794614902943018, + "grad_norm": 1.8170089721679688, + "learning_rate": 5.8406647116324545e-05, + "loss": 0.689, + "step": 5618 + }, + { + "epoch": 0.8796180338134001, + "grad_norm": 1.7294507026672363, + "learning_rate": 5.839850114043662e-05, + "loss": 0.5214, + "step": 5619 + }, + { + "epoch": 0.8797745773324984, + "grad_norm": 1.0675519704818726, + "learning_rate": 
5.839035516454871e-05, + "loss": 0.4147, + "step": 5620 + }, + { + "epoch": 0.8799311208515967, + "grad_norm": 1.1872717142105103, + "learning_rate": 5.838220918866081e-05, + "loss": 0.4247, + "step": 5621 + }, + { + "epoch": 0.8800876643706951, + "grad_norm": 1.938613772392273, + "learning_rate": 5.837406321277289e-05, + "loss": 0.3839, + "step": 5622 + }, + { + "epoch": 0.8802442078897934, + "grad_norm": 1.2566262483596802, + "learning_rate": 5.8365917236884984e-05, + "loss": 0.4388, + "step": 5623 + }, + { + "epoch": 0.8804007514088916, + "grad_norm": 2.4345316886901855, + "learning_rate": 5.8357771260997075e-05, + "loss": 0.7636, + "step": 5624 + }, + { + "epoch": 0.88055729492799, + "grad_norm": 3.225085735321045, + "learning_rate": 5.834962528510916e-05, + "loss": 0.5791, + "step": 5625 + }, + { + "epoch": 0.8807138384470883, + "grad_norm": 2.1173014640808105, + "learning_rate": 5.834147930922125e-05, + "loss": 0.5505, + "step": 5626 + }, + { + "epoch": 0.8808703819661866, + "grad_norm": 2.305084705352783, + "learning_rate": 5.833333333333334e-05, + "loss": 0.6246, + "step": 5627 + }, + { + "epoch": 0.8810269254852849, + "grad_norm": 4.080682754516602, + "learning_rate": 5.832518735744542e-05, + "loss": 0.9088, + "step": 5628 + }, + { + "epoch": 0.8811834690043833, + "grad_norm": 2.819668769836426, + "learning_rate": 5.8317041381557514e-05, + "loss": 0.487, + "step": 5629 + }, + { + "epoch": 0.8813400125234815, + "grad_norm": 2.6084089279174805, + "learning_rate": 5.8308895405669604e-05, + "loss": 0.6663, + "step": 5630 + }, + { + "epoch": 0.8814965560425798, + "grad_norm": 1.7253185510635376, + "learning_rate": 5.830074942978169e-05, + "loss": 0.6469, + "step": 5631 + }, + { + "epoch": 0.8816530995616781, + "grad_norm": 4.3553466796875, + "learning_rate": 5.829260345389378e-05, + "loss": 1.0115, + "step": 5632 + }, + { + "epoch": 0.8818096430807765, + "grad_norm": 4.60039758682251, + "learning_rate": 5.828445747800587e-05, + "loss": 0.8019, + "step": 5633 + 
}, + { + "epoch": 0.8819661865998748, + "grad_norm": 2.2935051918029785, + "learning_rate": 5.827631150211795e-05, + "loss": 0.9182, + "step": 5634 + }, + { + "epoch": 0.882122730118973, + "grad_norm": 2.370966672897339, + "learning_rate": 5.8268165526230043e-05, + "loss": 0.5442, + "step": 5635 + }, + { + "epoch": 0.8822792736380713, + "grad_norm": 2.4622228145599365, + "learning_rate": 5.826001955034214e-05, + "loss": 0.8381, + "step": 5636 + }, + { + "epoch": 0.8824358171571697, + "grad_norm": 4.109498500823975, + "learning_rate": 5.825187357445422e-05, + "loss": 0.8809, + "step": 5637 + }, + { + "epoch": 0.882592360676268, + "grad_norm": 4.741673946380615, + "learning_rate": 5.824372759856631e-05, + "loss": 1.2479, + "step": 5638 + }, + { + "epoch": 0.8827489041953663, + "grad_norm": 2.735490560531616, + "learning_rate": 5.8235581622678405e-05, + "loss": 0.8752, + "step": 5639 + }, + { + "epoch": 0.8829054477144647, + "grad_norm": 8.83154582977295, + "learning_rate": 5.822743564679048e-05, + "loss": 0.8857, + "step": 5640 + }, + { + "epoch": 0.8830619912335629, + "grad_norm": 3.055189371109009, + "learning_rate": 5.821928967090258e-05, + "loss": 1.6545, + "step": 5641 + }, + { + "epoch": 0.8832185347526612, + "grad_norm": 3.004971981048584, + "learning_rate": 5.821114369501467e-05, + "loss": 1.3897, + "step": 5642 + }, + { + "epoch": 0.8833750782717595, + "grad_norm": 2.5267226696014404, + "learning_rate": 5.820299771912675e-05, + "loss": 1.2875, + "step": 5643 + }, + { + "epoch": 0.8835316217908579, + "grad_norm": 6.716701030731201, + "learning_rate": 5.8194851743238845e-05, + "loss": 1.1921, + "step": 5644 + }, + { + "epoch": 0.8836881653099562, + "grad_norm": 6.098626136779785, + "learning_rate": 5.8186705767350935e-05, + "loss": 1.7093, + "step": 5645 + }, + { + "epoch": 0.8838447088290545, + "grad_norm": 2.538290023803711, + "learning_rate": 5.817855979146302e-05, + "loss": 0.5694, + "step": 5646 + }, + { + "epoch": 0.8840012523481527, + "grad_norm": 
2.384411334991455, + "learning_rate": 5.817041381557511e-05, + "loss": 0.7579, + "step": 5647 + }, + { + "epoch": 0.8841577958672511, + "grad_norm": 1.4556760787963867, + "learning_rate": 5.81622678396872e-05, + "loss": 0.2363, + "step": 5648 + }, + { + "epoch": 0.8843143393863494, + "grad_norm": 3.1603636741638184, + "learning_rate": 5.8154121863799284e-05, + "loss": 0.7423, + "step": 5649 + }, + { + "epoch": 0.8844708829054477, + "grad_norm": 3.221339702606201, + "learning_rate": 5.8145975887911374e-05, + "loss": 0.8497, + "step": 5650 + }, + { + "epoch": 0.8846274264245461, + "grad_norm": 0.5123854875564575, + "learning_rate": 5.8137829912023465e-05, + "loss": 0.2025, + "step": 5651 + }, + { + "epoch": 0.8847839699436444, + "grad_norm": 0.4051559865474701, + "learning_rate": 5.812968393613555e-05, + "loss": 0.1697, + "step": 5652 + }, + { + "epoch": 0.8849405134627426, + "grad_norm": 0.6000675559043884, + "learning_rate": 5.812153796024764e-05, + "loss": 0.2402, + "step": 5653 + }, + { + "epoch": 0.8850970569818409, + "grad_norm": 0.7476035356521606, + "learning_rate": 5.8113391984359736e-05, + "loss": 0.2063, + "step": 5654 + }, + { + "epoch": 0.8852536005009393, + "grad_norm": 0.9231533408164978, + "learning_rate": 5.810524600847181e-05, + "loss": 0.4397, + "step": 5655 + }, + { + "epoch": 0.8854101440200376, + "grad_norm": 0.8962478041648865, + "learning_rate": 5.8097100032583904e-05, + "loss": 0.4544, + "step": 5656 + }, + { + "epoch": 0.8855666875391359, + "grad_norm": 0.8312439322471619, + "learning_rate": 5.8088954056696e-05, + "loss": 0.2307, + "step": 5657 + }, + { + "epoch": 0.8857232310582341, + "grad_norm": 0.6206727027893066, + "learning_rate": 5.808080808080808e-05, + "loss": 0.2269, + "step": 5658 + }, + { + "epoch": 0.8858797745773325, + "grad_norm": 1.356205940246582, + "learning_rate": 5.8072662104920175e-05, + "loss": 0.3693, + "step": 5659 + }, + { + "epoch": 0.8860363180964308, + "grad_norm": 1.1686919927597046, + "learning_rate": 
5.8064516129032266e-05, + "loss": 0.2211, + "step": 5660 + }, + { + "epoch": 0.8861928616155291, + "grad_norm": 1.5235484838485718, + "learning_rate": 5.805637015314434e-05, + "loss": 0.4592, + "step": 5661 + }, + { + "epoch": 0.8863494051346275, + "grad_norm": 1.2184727191925049, + "learning_rate": 5.804822417725644e-05, + "loss": 0.3428, + "step": 5662 + }, + { + "epoch": 0.8865059486537258, + "grad_norm": 0.9925685524940491, + "learning_rate": 5.804007820136853e-05, + "loss": 0.3733, + "step": 5663 + }, + { + "epoch": 0.886662492172824, + "grad_norm": 1.5476540327072144, + "learning_rate": 5.8031932225480614e-05, + "loss": 0.4568, + "step": 5664 + }, + { + "epoch": 0.8868190356919223, + "grad_norm": 1.3099669218063354, + "learning_rate": 5.8023786249592705e-05, + "loss": 0.404, + "step": 5665 + }, + { + "epoch": 0.8869755792110207, + "grad_norm": 1.371342420578003, + "learning_rate": 5.8015640273704795e-05, + "loss": 0.4452, + "step": 5666 + }, + { + "epoch": 0.887132122730119, + "grad_norm": 1.6890991926193237, + "learning_rate": 5.800749429781688e-05, + "loss": 0.436, + "step": 5667 + }, + { + "epoch": 0.8872886662492173, + "grad_norm": 1.7460452318191528, + "learning_rate": 5.799934832192897e-05, + "loss": 0.5188, + "step": 5668 + }, + { + "epoch": 0.8874452097683156, + "grad_norm": 1.8611011505126953, + "learning_rate": 5.799120234604106e-05, + "loss": 0.43, + "step": 5669 + }, + { + "epoch": 0.8876017532874139, + "grad_norm": 1.5318129062652588, + "learning_rate": 5.7983056370153144e-05, + "loss": 0.5457, + "step": 5670 + }, + { + "epoch": 0.8877582968065122, + "grad_norm": 2.046391010284424, + "learning_rate": 5.7974910394265235e-05, + "loss": 0.4271, + "step": 5671 + }, + { + "epoch": 0.8879148403256105, + "grad_norm": 2.5477263927459717, + "learning_rate": 5.7966764418377325e-05, + "loss": 0.9109, + "step": 5672 + }, + { + "epoch": 0.8880713838447089, + "grad_norm": 3.3841185569763184, + "learning_rate": 5.795861844248941e-05, + "loss": 1.1078, + "step": 
5673 + }, + { + "epoch": 0.8882279273638072, + "grad_norm": 1.9520622491836548, + "learning_rate": 5.79504724666015e-05, + "loss": 0.7024, + "step": 5674 + }, + { + "epoch": 0.8883844708829054, + "grad_norm": 2.0057179927825928, + "learning_rate": 5.79423264907136e-05, + "loss": 0.8157, + "step": 5675 + }, + { + "epoch": 0.8885410144020037, + "grad_norm": 2.9027462005615234, + "learning_rate": 5.7934180514825674e-05, + "loss": 0.5594, + "step": 5676 + }, + { + "epoch": 0.888697557921102, + "grad_norm": 2.0265984535217285, + "learning_rate": 5.792603453893777e-05, + "loss": 0.6476, + "step": 5677 + }, + { + "epoch": 0.8888541014402004, + "grad_norm": 1.8752539157867432, + "learning_rate": 5.791788856304986e-05, + "loss": 0.5354, + "step": 5678 + }, + { + "epoch": 0.8890106449592987, + "grad_norm": 3.339463472366333, + "learning_rate": 5.790974258716194e-05, + "loss": 0.8288, + "step": 5679 + }, + { + "epoch": 0.889167188478397, + "grad_norm": 1.4405149221420288, + "learning_rate": 5.7901596611274036e-05, + "loss": 0.3915, + "step": 5680 + }, + { + "epoch": 0.8893237319974953, + "grad_norm": 2.129316806793213, + "learning_rate": 5.7893450635386126e-05, + "loss": 0.5446, + "step": 5681 + }, + { + "epoch": 0.8894802755165936, + "grad_norm": 2.448796033859253, + "learning_rate": 5.788530465949821e-05, + "loss": 0.9986, + "step": 5682 + }, + { + "epoch": 0.8896368190356919, + "grad_norm": 3.2974514961242676, + "learning_rate": 5.78771586836103e-05, + "loss": 0.7891, + "step": 5683 + }, + { + "epoch": 0.8897933625547902, + "grad_norm": 2.4935545921325684, + "learning_rate": 5.786901270772239e-05, + "loss": 0.7224, + "step": 5684 + }, + { + "epoch": 0.8899499060738886, + "grad_norm": 3.501852512359619, + "learning_rate": 5.7860866731834475e-05, + "loss": 1.3054, + "step": 5685 + }, + { + "epoch": 0.8901064495929869, + "grad_norm": 2.3779783248901367, + "learning_rate": 5.7852720755946565e-05, + "loss": 0.8252, + "step": 5686 + }, + { + "epoch": 0.8902629931120851, + 
"grad_norm": 5.2571258544921875, + "learning_rate": 5.7844574780058656e-05, + "loss": 1.1986, + "step": 5687 + }, + { + "epoch": 0.8904195366311835, + "grad_norm": 5.050364971160889, + "learning_rate": 5.783642880417074e-05, + "loss": 1.5808, + "step": 5688 + }, + { + "epoch": 0.8905760801502818, + "grad_norm": 4.385239601135254, + "learning_rate": 5.782828282828283e-05, + "loss": 1.2334, + "step": 5689 + }, + { + "epoch": 0.8907326236693801, + "grad_norm": 2.8168227672576904, + "learning_rate": 5.782013685239492e-05, + "loss": 0.8889, + "step": 5690 + }, + { + "epoch": 0.8908891671884784, + "grad_norm": 4.924570083618164, + "learning_rate": 5.7811990876507004e-05, + "loss": 1.4488, + "step": 5691 + }, + { + "epoch": 0.8910457107075767, + "grad_norm": 1.8961448669433594, + "learning_rate": 5.7803844900619095e-05, + "loss": 1.0301, + "step": 5692 + }, + { + "epoch": 0.891202254226675, + "grad_norm": 3.4480092525482178, + "learning_rate": 5.779569892473119e-05, + "loss": 0.6567, + "step": 5693 + }, + { + "epoch": 0.8913587977457733, + "grad_norm": 2.773325204849243, + "learning_rate": 5.778755294884327e-05, + "loss": 0.8133, + "step": 5694 + }, + { + "epoch": 0.8915153412648716, + "grad_norm": 2.156409978866577, + "learning_rate": 5.7779406972955367e-05, + "loss": 0.5714, + "step": 5695 + }, + { + "epoch": 0.89167188478397, + "grad_norm": 3.1263296604156494, + "learning_rate": 5.777126099706746e-05, + "loss": 0.903, + "step": 5696 + }, + { + "epoch": 0.8918284283030683, + "grad_norm": 2.064736843109131, + "learning_rate": 5.7763115021179534e-05, + "loss": 1.0752, + "step": 5697 + }, + { + "epoch": 0.8919849718221665, + "grad_norm": 1.6839841604232788, + "learning_rate": 5.775496904529163e-05, + "loss": 0.3253, + "step": 5698 + }, + { + "epoch": 0.8921415153412648, + "grad_norm": 2.39098858833313, + "learning_rate": 5.774682306940372e-05, + "loss": 0.8859, + "step": 5699 + }, + { + "epoch": 0.8922980588603632, + "grad_norm": 3.523895502090454, + "learning_rate": 
5.7738677093515806e-05, + "loss": 0.9221, + "step": 5700 + }, + { + "epoch": 0.8924546023794615, + "grad_norm": 0.5077540278434753, + "learning_rate": 5.7730531117627896e-05, + "loss": 0.2531, + "step": 5701 + }, + { + "epoch": 0.8926111458985598, + "grad_norm": 1.3468024730682373, + "learning_rate": 5.772238514173999e-05, + "loss": 0.3465, + "step": 5702 + }, + { + "epoch": 0.8927676894176582, + "grad_norm": 0.46694883704185486, + "learning_rate": 5.771423916585207e-05, + "loss": 0.1835, + "step": 5703 + }, + { + "epoch": 0.8929242329367564, + "grad_norm": 1.4088585376739502, + "learning_rate": 5.770609318996416e-05, + "loss": 0.3807, + "step": 5704 + }, + { + "epoch": 0.8930807764558547, + "grad_norm": 0.9732994437217712, + "learning_rate": 5.769794721407625e-05, + "loss": 0.2601, + "step": 5705 + }, + { + "epoch": 0.893237319974953, + "grad_norm": 0.7927038073539734, + "learning_rate": 5.7689801238188335e-05, + "loss": 0.2396, + "step": 5706 + }, + { + "epoch": 0.8933938634940514, + "grad_norm": 0.977576494216919, + "learning_rate": 5.7681655262300426e-05, + "loss": 0.3125, + "step": 5707 + }, + { + "epoch": 0.8935504070131497, + "grad_norm": 1.5142171382904053, + "learning_rate": 5.7673509286412516e-05, + "loss": 0.354, + "step": 5708 + }, + { + "epoch": 0.8937069505322479, + "grad_norm": 3.0933449268341064, + "learning_rate": 5.76653633105246e-05, + "loss": 0.3367, + "step": 5709 + }, + { + "epoch": 0.8938634940513462, + "grad_norm": 0.9718630313873291, + "learning_rate": 5.765721733463669e-05, + "loss": 0.3626, + "step": 5710 + }, + { + "epoch": 0.8940200375704446, + "grad_norm": 1.5996646881103516, + "learning_rate": 5.764907135874879e-05, + "loss": 0.507, + "step": 5711 + }, + { + "epoch": 0.8941765810895429, + "grad_norm": 1.1600375175476074, + "learning_rate": 5.7640925382860865e-05, + "loss": 0.3435, + "step": 5712 + }, + { + "epoch": 0.8943331246086412, + "grad_norm": 1.1197341680526733, + "learning_rate": 5.763277940697296e-05, + "loss": 0.3283, + 
"step": 5713 + }, + { + "epoch": 0.8944896681277396, + "grad_norm": 1.2241231203079224, + "learning_rate": 5.762463343108505e-05, + "loss": 0.324, + "step": 5714 + }, + { + "epoch": 0.8946462116468378, + "grad_norm": 1.6064132452011108, + "learning_rate": 5.761648745519713e-05, + "loss": 0.4884, + "step": 5715 + }, + { + "epoch": 0.8948027551659361, + "grad_norm": 1.2963098287582397, + "learning_rate": 5.760834147930923e-05, + "loss": 0.3113, + "step": 5716 + }, + { + "epoch": 0.8949592986850344, + "grad_norm": 2.1202645301818848, + "learning_rate": 5.760019550342132e-05, + "loss": 0.4752, + "step": 5717 + }, + { + "epoch": 0.8951158422041328, + "grad_norm": 4.790133476257324, + "learning_rate": 5.75920495275334e-05, + "loss": 0.8416, + "step": 5718 + }, + { + "epoch": 0.8952723857232311, + "grad_norm": 2.2087514400482178, + "learning_rate": 5.758390355164549e-05, + "loss": 0.572, + "step": 5719 + }, + { + "epoch": 0.8954289292423294, + "grad_norm": 1.7621793746948242, + "learning_rate": 5.757575757575758e-05, + "loss": 0.4176, + "step": 5720 + }, + { + "epoch": 0.8955854727614276, + "grad_norm": 2.787997007369995, + "learning_rate": 5.7567611599869666e-05, + "loss": 0.7036, + "step": 5721 + }, + { + "epoch": 0.895742016280526, + "grad_norm": 1.5720093250274658, + "learning_rate": 5.7559465623981756e-05, + "loss": 0.442, + "step": 5722 + }, + { + "epoch": 0.8958985597996243, + "grad_norm": 1.3821911811828613, + "learning_rate": 5.755131964809385e-05, + "loss": 0.5028, + "step": 5723 + }, + { + "epoch": 0.8960551033187226, + "grad_norm": 2.3956687450408936, + "learning_rate": 5.754317367220593e-05, + "loss": 0.8633, + "step": 5724 + }, + { + "epoch": 0.896211646837821, + "grad_norm": 2.5609207153320312, + "learning_rate": 5.753502769631802e-05, + "loss": 0.5668, + "step": 5725 + }, + { + "epoch": 0.8963681903569192, + "grad_norm": 2.324817419052124, + "learning_rate": 5.752688172043011e-05, + "loss": 0.3799, + "step": 5726 + }, + { + "epoch": 0.8965247338760175, + 
"grad_norm": 2.7727701663970947, + "learning_rate": 5.7518735744542196e-05, + "loss": 0.7745, + "step": 5727 + }, + { + "epoch": 0.8966812773951158, + "grad_norm": 1.8464268445968628, + "learning_rate": 5.7510589768654286e-05, + "loss": 0.4613, + "step": 5728 + }, + { + "epoch": 0.8968378209142142, + "grad_norm": 1.9112999439239502, + "learning_rate": 5.7502443792766383e-05, + "loss": 1.033, + "step": 5729 + }, + { + "epoch": 0.8969943644333125, + "grad_norm": 2.8692572116851807, + "learning_rate": 5.749429781687846e-05, + "loss": 0.698, + "step": 5730 + }, + { + "epoch": 0.8971509079524108, + "grad_norm": 2.602889060974121, + "learning_rate": 5.748615184099055e-05, + "loss": 0.752, + "step": 5731 + }, + { + "epoch": 0.897307451471509, + "grad_norm": 1.9638316631317139, + "learning_rate": 5.747800586510265e-05, + "loss": 0.6217, + "step": 5732 + }, + { + "epoch": 0.8974639949906074, + "grad_norm": 3.626715898513794, + "learning_rate": 5.7469859889214725e-05, + "loss": 1.1124, + "step": 5733 + }, + { + "epoch": 0.8976205385097057, + "grad_norm": 2.321315050125122, + "learning_rate": 5.746171391332682e-05, + "loss": 0.519, + "step": 5734 + }, + { + "epoch": 0.897777082028804, + "grad_norm": 2.4858341217041016, + "learning_rate": 5.745356793743891e-05, + "loss": 1.104, + "step": 5735 + }, + { + "epoch": 0.8979336255479023, + "grad_norm": 2.414111614227295, + "learning_rate": 5.7445421961551e-05, + "loss": 1.2624, + "step": 5736 + }, + { + "epoch": 0.8980901690670007, + "grad_norm": 3.516148805618286, + "learning_rate": 5.743727598566309e-05, + "loss": 0.6762, + "step": 5737 + }, + { + "epoch": 0.8982467125860989, + "grad_norm": 2.0621910095214844, + "learning_rate": 5.742913000977518e-05, + "loss": 0.7611, + "step": 5738 + }, + { + "epoch": 0.8984032561051972, + "grad_norm": 2.8742783069610596, + "learning_rate": 5.742098403388726e-05, + "loss": 1.0297, + "step": 5739 + }, + { + "epoch": 0.8985597996242956, + "grad_norm": 3.0231213569641113, + "learning_rate": 
5.741283805799935e-05, + "loss": 1.4686, + "step": 5740 + }, + { + "epoch": 0.8987163431433939, + "grad_norm": 3.669046640396118, + "learning_rate": 5.740469208211144e-05, + "loss": 1.1212, + "step": 5741 + }, + { + "epoch": 0.8988728866624922, + "grad_norm": 3.101304054260254, + "learning_rate": 5.7396546106223526e-05, + "loss": 1.0878, + "step": 5742 + }, + { + "epoch": 0.8990294301815904, + "grad_norm": 2.8977560997009277, + "learning_rate": 5.738840013033562e-05, + "loss": 0.8868, + "step": 5743 + }, + { + "epoch": 0.8991859737006888, + "grad_norm": 2.906952381134033, + "learning_rate": 5.738025415444771e-05, + "loss": 1.3266, + "step": 5744 + }, + { + "epoch": 0.8993425172197871, + "grad_norm": 2.569636583328247, + "learning_rate": 5.737210817855979e-05, + "loss": 1.6096, + "step": 5745 + }, + { + "epoch": 0.8994990607388854, + "grad_norm": 3.3320376873016357, + "learning_rate": 5.736396220267188e-05, + "loss": 0.9821, + "step": 5746 + }, + { + "epoch": 0.8996556042579837, + "grad_norm": 1.213990569114685, + "learning_rate": 5.735581622678398e-05, + "loss": 0.2825, + "step": 5747 + }, + { + "epoch": 0.8998121477770821, + "grad_norm": 2.900756597518921, + "learning_rate": 5.7347670250896056e-05, + "loss": 0.4689, + "step": 5748 + }, + { + "epoch": 0.8999686912961803, + "grad_norm": 3.5605342388153076, + "learning_rate": 5.7339524275008146e-05, + "loss": 1.308, + "step": 5749 + }, + { + "epoch": 0.9001252348152786, + "grad_norm": 1.81029212474823, + "learning_rate": 5.7331378299120244e-05, + "loss": 0.7572, + "step": 5750 + }, + { + "epoch": 0.900281778334377, + "grad_norm": 0.6420179605484009, + "learning_rate": 5.732323232323232e-05, + "loss": 0.3053, + "step": 5751 + }, + { + "epoch": 0.9004383218534753, + "grad_norm": 0.40381482243537903, + "learning_rate": 5.731508634734442e-05, + "loss": 0.1861, + "step": 5752 + }, + { + "epoch": 0.9005948653725736, + "grad_norm": 0.6334764957427979, + "learning_rate": 5.730694037145651e-05, + "loss": 0.2078, + "step": 
5753 + }, + { + "epoch": 0.9007514088916719, + "grad_norm": 0.8252048492431641, + "learning_rate": 5.729879439556859e-05, + "loss": 0.3206, + "step": 5754 + }, + { + "epoch": 0.9009079524107702, + "grad_norm": 0.5334571003913879, + "learning_rate": 5.729064841968068e-05, + "loss": 0.2654, + "step": 5755 + }, + { + "epoch": 0.9010644959298685, + "grad_norm": 0.9224761724472046, + "learning_rate": 5.728250244379277e-05, + "loss": 0.3509, + "step": 5756 + }, + { + "epoch": 0.9012210394489668, + "grad_norm": 1.119659423828125, + "learning_rate": 5.727435646790486e-05, + "loss": 0.4404, + "step": 5757 + }, + { + "epoch": 0.9013775829680651, + "grad_norm": 1.2681474685668945, + "learning_rate": 5.726621049201695e-05, + "loss": 0.3548, + "step": 5758 + }, + { + "epoch": 0.9015341264871635, + "grad_norm": 1.404576301574707, + "learning_rate": 5.725806451612904e-05, + "loss": 0.421, + "step": 5759 + }, + { + "epoch": 0.9016906700062617, + "grad_norm": 0.8436529040336609, + "learning_rate": 5.724991854024112e-05, + "loss": 0.3602, + "step": 5760 + }, + { + "epoch": 0.90184721352536, + "grad_norm": 1.3830260038375854, + "learning_rate": 5.724177256435321e-05, + "loss": 0.459, + "step": 5761 + }, + { + "epoch": 0.9020037570444583, + "grad_norm": 0.9702096581459045, + "learning_rate": 5.72336265884653e-05, + "loss": 0.413, + "step": 5762 + }, + { + "epoch": 0.9021603005635567, + "grad_norm": 1.4060574769973755, + "learning_rate": 5.722548061257739e-05, + "loss": 0.4304, + "step": 5763 + }, + { + "epoch": 0.902316844082655, + "grad_norm": 1.5202380418777466, + "learning_rate": 5.721733463668948e-05, + "loss": 0.4704, + "step": 5764 + }, + { + "epoch": 0.9024733876017533, + "grad_norm": 1.0202263593673706, + "learning_rate": 5.7209188660801575e-05, + "loss": 0.4839, + "step": 5765 + }, + { + "epoch": 0.9026299311208515, + "grad_norm": 1.1681538820266724, + "learning_rate": 5.720104268491365e-05, + "loss": 0.6356, + "step": 5766 + }, + { + "epoch": 0.9027864746399499, + 
"grad_norm": 1.1084274053573608, + "learning_rate": 5.719289670902574e-05, + "loss": 0.5448, + "step": 5767 + }, + { + "epoch": 0.9029430181590482, + "grad_norm": 1.4460532665252686, + "learning_rate": 5.718475073313784e-05, + "loss": 0.4466, + "step": 5768 + }, + { + "epoch": 0.9030995616781465, + "grad_norm": 1.9046310186386108, + "learning_rate": 5.7176604757249916e-05, + "loss": 0.537, + "step": 5769 + }, + { + "epoch": 0.9032561051972449, + "grad_norm": 2.0393190383911133, + "learning_rate": 5.7168458781362014e-05, + "loss": 0.9071, + "step": 5770 + }, + { + "epoch": 0.9034126487163432, + "grad_norm": 1.9265525341033936, + "learning_rate": 5.7160312805474104e-05, + "loss": 0.6384, + "step": 5771 + }, + { + "epoch": 0.9035691922354414, + "grad_norm": 3.9877982139587402, + "learning_rate": 5.715216682958619e-05, + "loss": 0.5851, + "step": 5772 + }, + { + "epoch": 0.9037257357545397, + "grad_norm": 1.244064450263977, + "learning_rate": 5.714402085369828e-05, + "loss": 0.4648, + "step": 5773 + }, + { + "epoch": 0.9038822792736381, + "grad_norm": 1.7872674465179443, + "learning_rate": 5.713587487781037e-05, + "loss": 0.5048, + "step": 5774 + }, + { + "epoch": 0.9040388227927364, + "grad_norm": 2.1904892921447754, + "learning_rate": 5.712772890192245e-05, + "loss": 0.615, + "step": 5775 + }, + { + "epoch": 0.9041953663118347, + "grad_norm": 2.8697376251220703, + "learning_rate": 5.711958292603454e-05, + "loss": 0.9014, + "step": 5776 + }, + { + "epoch": 0.904351909830933, + "grad_norm": 1.342934489250183, + "learning_rate": 5.7111436950146634e-05, + "loss": 0.9071, + "step": 5777 + }, + { + "epoch": 0.9045084533500313, + "grad_norm": 2.9823124408721924, + "learning_rate": 5.710329097425872e-05, + "loss": 0.969, + "step": 5778 + }, + { + "epoch": 0.9046649968691296, + "grad_norm": 1.696518898010254, + "learning_rate": 5.709514499837081e-05, + "loss": 0.393, + "step": 5779 + }, + { + "epoch": 0.9048215403882279, + "grad_norm": 4.400091171264648, + "learning_rate": 
5.70869990224829e-05, + "loss": 1.0396, + "step": 5780 + }, + { + "epoch": 0.9049780839073263, + "grad_norm": 3.5593554973602295, + "learning_rate": 5.707885304659498e-05, + "loss": 0.9426, + "step": 5781 + }, + { + "epoch": 0.9051346274264246, + "grad_norm": 2.4696385860443115, + "learning_rate": 5.707070707070707e-05, + "loss": 0.9763, + "step": 5782 + }, + { + "epoch": 0.9052911709455228, + "grad_norm": 2.264039993286133, + "learning_rate": 5.706256109481917e-05, + "loss": 0.8595, + "step": 5783 + }, + { + "epoch": 0.9054477144646211, + "grad_norm": 3.5313825607299805, + "learning_rate": 5.705441511893125e-05, + "loss": 0.6919, + "step": 5784 + }, + { + "epoch": 0.9056042579837195, + "grad_norm": 2.26444935798645, + "learning_rate": 5.704626914304334e-05, + "loss": 0.7239, + "step": 5785 + }, + { + "epoch": 0.9057608015028178, + "grad_norm": 4.820674896240234, + "learning_rate": 5.7038123167155435e-05, + "loss": 0.9216, + "step": 5786 + }, + { + "epoch": 0.9059173450219161, + "grad_norm": 3.6055870056152344, + "learning_rate": 5.702997719126751e-05, + "loss": 1.1406, + "step": 5787 + }, + { + "epoch": 0.9060738885410144, + "grad_norm": 4.752426624298096, + "learning_rate": 5.702183121537961e-05, + "loss": 1.2616, + "step": 5788 + }, + { + "epoch": 0.9062304320601127, + "grad_norm": 3.7912864685058594, + "learning_rate": 5.70136852394917e-05, + "loss": 0.7686, + "step": 5789 + }, + { + "epoch": 0.906386975579211, + "grad_norm": 4.708919525146484, + "learning_rate": 5.700553926360378e-05, + "loss": 1.013, + "step": 5790 + }, + { + "epoch": 0.9065435190983093, + "grad_norm": 3.422611713409424, + "learning_rate": 5.6997393287715874e-05, + "loss": 0.9476, + "step": 5791 + }, + { + "epoch": 0.9067000626174077, + "grad_norm": 2.840277910232544, + "learning_rate": 5.6989247311827965e-05, + "loss": 0.9371, + "step": 5792 + }, + { + "epoch": 0.906856606136506, + "grad_norm": 2.5797152519226074, + "learning_rate": 5.698110133594005e-05, + "loss": 1.543, + "step": 5793 + }, 
+ { + "epoch": 0.9070131496556043, + "grad_norm": 7.267298698425293, + "learning_rate": 5.697295536005214e-05, + "loss": 0.843, + "step": 5794 + }, + { + "epoch": 0.9071696931747025, + "grad_norm": 4.591012954711914, + "learning_rate": 5.696480938416423e-05, + "loss": 0.9035, + "step": 5795 + }, + { + "epoch": 0.9073262366938009, + "grad_norm": 4.006809711456299, + "learning_rate": 5.695666340827631e-05, + "loss": 0.6884, + "step": 5796 + }, + { + "epoch": 0.9074827802128992, + "grad_norm": 5.466050624847412, + "learning_rate": 5.6948517432388404e-05, + "loss": 0.539, + "step": 5797 + }, + { + "epoch": 0.9076393237319975, + "grad_norm": 9.21789836883545, + "learning_rate": 5.6940371456500494e-05, + "loss": 1.0256, + "step": 5798 + }, + { + "epoch": 0.9077958672510958, + "grad_norm": 2.7037904262542725, + "learning_rate": 5.693222548061258e-05, + "loss": 0.7152, + "step": 5799 + }, + { + "epoch": 0.9079524107701941, + "grad_norm": 16.705434799194336, + "learning_rate": 5.692407950472467e-05, + "loss": 1.7485, + "step": 5800 + }, + { + "epoch": 0.9081089542892924, + "grad_norm": 0.7188997268676758, + "learning_rate": 5.6915933528836766e-05, + "loss": 0.3556, + "step": 5801 + }, + { + "epoch": 0.9082654978083907, + "grad_norm": 0.7125795483589172, + "learning_rate": 5.690778755294884e-05, + "loss": 0.2428, + "step": 5802 + }, + { + "epoch": 0.908422041327489, + "grad_norm": 0.6038082838058472, + "learning_rate": 5.689964157706093e-05, + "loss": 0.2451, + "step": 5803 + }, + { + "epoch": 0.9085785848465874, + "grad_norm": 0.6841592192649841, + "learning_rate": 5.689149560117303e-05, + "loss": 0.3175, + "step": 5804 + }, + { + "epoch": 0.9087351283656857, + "grad_norm": 0.7800334095954895, + "learning_rate": 5.688334962528511e-05, + "loss": 0.2935, + "step": 5805 + }, + { + "epoch": 0.9088916718847839, + "grad_norm": 1.123820185661316, + "learning_rate": 5.6875203649397205e-05, + "loss": 0.3883, + "step": 5806 + }, + { + "epoch": 0.9090482154038823, + "grad_norm": 
0.6562023758888245, + "learning_rate": 5.6867057673509295e-05, + "loss": 0.3225, + "step": 5807 + }, + { + "epoch": 0.9092047589229806, + "grad_norm": 1.1559545993804932, + "learning_rate": 5.685891169762137e-05, + "loss": 0.318, + "step": 5808 + }, + { + "epoch": 0.9093613024420789, + "grad_norm": 0.8748825788497925, + "learning_rate": 5.685076572173347e-05, + "loss": 0.351, + "step": 5809 + }, + { + "epoch": 0.9095178459611772, + "grad_norm": 0.7582481503486633, + "learning_rate": 5.684261974584556e-05, + "loss": 0.197, + "step": 5810 + }, + { + "epoch": 0.9096743894802756, + "grad_norm": 1.3697614669799805, + "learning_rate": 5.6834473769957644e-05, + "loss": 0.3824, + "step": 5811 + }, + { + "epoch": 0.9098309329993738, + "grad_norm": 0.7281429171562195, + "learning_rate": 5.6826327794069734e-05, + "loss": 0.2442, + "step": 5812 + }, + { + "epoch": 0.9099874765184721, + "grad_norm": 0.9868363738059998, + "learning_rate": 5.6818181818181825e-05, + "loss": 0.4497, + "step": 5813 + }, + { + "epoch": 0.9101440200375704, + "grad_norm": 1.1798285245895386, + "learning_rate": 5.681003584229391e-05, + "loss": 0.3653, + "step": 5814 + }, + { + "epoch": 0.9103005635566688, + "grad_norm": 1.1077758073806763, + "learning_rate": 5.6801889866406e-05, + "loss": 0.5075, + "step": 5815 + }, + { + "epoch": 0.9104571070757671, + "grad_norm": 2.1031486988067627, + "learning_rate": 5.679374389051809e-05, + "loss": 0.6143, + "step": 5816 + }, + { + "epoch": 0.9106136505948653, + "grad_norm": 0.8568233847618103, + "learning_rate": 5.6785597914630173e-05, + "loss": 0.275, + "step": 5817 + }, + { + "epoch": 0.9107701941139636, + "grad_norm": 1.8614147901535034, + "learning_rate": 5.6777451938742264e-05, + "loss": 0.6013, + "step": 5818 + }, + { + "epoch": 0.910926737633062, + "grad_norm": 1.8825385570526123, + "learning_rate": 5.6769305962854355e-05, + "loss": 0.3679, + "step": 5819 + }, + { + "epoch": 0.9110832811521603, + "grad_norm": 2.185786724090576, + "learning_rate": 
5.676115998696644e-05, + "loss": 0.8472, + "step": 5820 + }, + { + "epoch": 0.9112398246712586, + "grad_norm": 1.5637785196304321, + "learning_rate": 5.675301401107853e-05, + "loss": 0.5734, + "step": 5821 + }, + { + "epoch": 0.911396368190357, + "grad_norm": 1.5636290311813354, + "learning_rate": 5.6744868035190626e-05, + "loss": 0.4598, + "step": 5822 + }, + { + "epoch": 0.9115529117094552, + "grad_norm": 1.5320894718170166, + "learning_rate": 5.67367220593027e-05, + "loss": 0.5548, + "step": 5823 + }, + { + "epoch": 0.9117094552285535, + "grad_norm": 3.831382989883423, + "learning_rate": 5.67285760834148e-05, + "loss": 0.7703, + "step": 5824 + }, + { + "epoch": 0.9118659987476518, + "grad_norm": 1.672237515449524, + "learning_rate": 5.672043010752689e-05, + "loss": 0.4007, + "step": 5825 + }, + { + "epoch": 0.9120225422667502, + "grad_norm": 1.8232954740524292, + "learning_rate": 5.671228413163897e-05, + "loss": 0.5219, + "step": 5826 + }, + { + "epoch": 0.9121790857858485, + "grad_norm": 2.2931876182556152, + "learning_rate": 5.6704138155751065e-05, + "loss": 0.7425, + "step": 5827 + }, + { + "epoch": 0.9123356293049468, + "grad_norm": 2.695343494415283, + "learning_rate": 5.6695992179863156e-05, + "loss": 0.6545, + "step": 5828 + }, + { + "epoch": 0.912492172824045, + "grad_norm": 2.3940255641937256, + "learning_rate": 5.668784620397524e-05, + "loss": 0.6287, + "step": 5829 + }, + { + "epoch": 0.9126487163431434, + "grad_norm": 4.439887046813965, + "learning_rate": 5.667970022808733e-05, + "loss": 0.9861, + "step": 5830 + }, + { + "epoch": 0.9128052598622417, + "grad_norm": 3.632812738418579, + "learning_rate": 5.667155425219942e-05, + "loss": 0.6878, + "step": 5831 + }, + { + "epoch": 0.91296180338134, + "grad_norm": 1.620123028755188, + "learning_rate": 5.6663408276311504e-05, + "loss": 0.745, + "step": 5832 + }, + { + "epoch": 0.9131183469004384, + "grad_norm": 2.003901481628418, + "learning_rate": 5.6655262300423595e-05, + "loss": 0.5943, + "step": 5833 + 
}, + { + "epoch": 0.9132748904195366, + "grad_norm": 2.97859787940979, + "learning_rate": 5.6647116324535685e-05, + "loss": 0.7812, + "step": 5834 + }, + { + "epoch": 0.9134314339386349, + "grad_norm": 2.5605907440185547, + "learning_rate": 5.663897034864777e-05, + "loss": 0.71, + "step": 5835 + }, + { + "epoch": 0.9135879774577332, + "grad_norm": 7.032586574554443, + "learning_rate": 5.663082437275986e-05, + "loss": 1.6155, + "step": 5836 + }, + { + "epoch": 0.9137445209768316, + "grad_norm": 4.495550632476807, + "learning_rate": 5.662267839687195e-05, + "loss": 0.9735, + "step": 5837 + }, + { + "epoch": 0.9139010644959299, + "grad_norm": 2.5938167572021484, + "learning_rate": 5.6614532420984034e-05, + "loss": 0.8463, + "step": 5838 + }, + { + "epoch": 0.9140576080150282, + "grad_norm": 3.4164140224456787, + "learning_rate": 5.6606386445096124e-05, + "loss": 0.9419, + "step": 5839 + }, + { + "epoch": 0.9142141515341264, + "grad_norm": 4.6837639808654785, + "learning_rate": 5.659824046920822e-05, + "loss": 1.0194, + "step": 5840 + }, + { + "epoch": 0.9143706950532248, + "grad_norm": 2.179004192352295, + "learning_rate": 5.65900944933203e-05, + "loss": 1.0824, + "step": 5841 + }, + { + "epoch": 0.9145272385723231, + "grad_norm": 5.017429828643799, + "learning_rate": 5.6581948517432396e-05, + "loss": 0.8023, + "step": 5842 + }, + { + "epoch": 0.9146837820914214, + "grad_norm": 2.7589974403381348, + "learning_rate": 5.6573802541544486e-05, + "loss": 1.4619, + "step": 5843 + }, + { + "epoch": 0.9148403256105198, + "grad_norm": 3.2170026302337646, + "learning_rate": 5.6565656565656563e-05, + "loss": 1.3938, + "step": 5844 + }, + { + "epoch": 0.9149968691296181, + "grad_norm": 3.978886365890503, + "learning_rate": 5.655751058976866e-05, + "loss": 1.3953, + "step": 5845 + }, + { + "epoch": 0.9151534126487163, + "grad_norm": 3.528461217880249, + "learning_rate": 5.654936461388075e-05, + "loss": 0.892, + "step": 5846 + }, + { + "epoch": 0.9153099561678146, + "grad_norm": 
3.551738739013672, + "learning_rate": 5.6541218637992835e-05, + "loss": 0.8167, + "step": 5847 + }, + { + "epoch": 0.915466499686913, + "grad_norm": 4.376492500305176, + "learning_rate": 5.6533072662104926e-05, + "loss": 0.8565, + "step": 5848 + }, + { + "epoch": 0.9156230432060113, + "grad_norm": 3.9032742977142334, + "learning_rate": 5.6524926686217016e-05, + "loss": 1.2621, + "step": 5849 + }, + { + "epoch": 0.9157795867251096, + "grad_norm": 1.9891350269317627, + "learning_rate": 5.65167807103291e-05, + "loss": 0.6706, + "step": 5850 + }, + { + "epoch": 0.9159361302442078, + "grad_norm": 0.4979400932788849, + "learning_rate": 5.650863473444119e-05, + "loss": 0.3348, + "step": 5851 + }, + { + "epoch": 0.9160926737633062, + "grad_norm": 0.8353685736656189, + "learning_rate": 5.650048875855328e-05, + "loss": 0.27, + "step": 5852 + }, + { + "epoch": 0.9162492172824045, + "grad_norm": 1.6350167989730835, + "learning_rate": 5.6492342782665365e-05, + "loss": 0.4818, + "step": 5853 + }, + { + "epoch": 0.9164057608015028, + "grad_norm": 0.7375616431236267, + "learning_rate": 5.6484196806777455e-05, + "loss": 0.3181, + "step": 5854 + }, + { + "epoch": 0.9165623043206012, + "grad_norm": 0.6710753440856934, + "learning_rate": 5.6476050830889546e-05, + "loss": 0.3299, + "step": 5855 + }, + { + "epoch": 0.9167188478396995, + "grad_norm": 0.7548393607139587, + "learning_rate": 5.646790485500163e-05, + "loss": 0.2573, + "step": 5856 + }, + { + "epoch": 0.9168753913587977, + "grad_norm": 0.825143575668335, + "learning_rate": 5.645975887911372e-05, + "loss": 0.2547, + "step": 5857 + }, + { + "epoch": 0.917031934877896, + "grad_norm": 0.7915762066841125, + "learning_rate": 5.645161290322582e-05, + "loss": 0.3633, + "step": 5858 + }, + { + "epoch": 0.9171884783969944, + "grad_norm": 0.8717302680015564, + "learning_rate": 5.6443466927337894e-05, + "loss": 0.2708, + "step": 5859 + }, + { + "epoch": 0.9173450219160927, + "grad_norm": 0.79691082239151, + "learning_rate": 
5.6435320951449985e-05, + "loss": 0.2493, + "step": 5860 + }, + { + "epoch": 0.917501565435191, + "grad_norm": 0.9627877473831177, + "learning_rate": 5.642717497556208e-05, + "loss": 0.2499, + "step": 5861 + }, + { + "epoch": 0.9176581089542893, + "grad_norm": 1.2067431211471558, + "learning_rate": 5.641902899967416e-05, + "loss": 0.5234, + "step": 5862 + }, + { + "epoch": 0.9178146524733876, + "grad_norm": 0.9127941131591797, + "learning_rate": 5.6410883023786256e-05, + "loss": 0.2624, + "step": 5863 + }, + { + "epoch": 0.9179711959924859, + "grad_norm": 1.1872403621673584, + "learning_rate": 5.640273704789835e-05, + "loss": 0.4549, + "step": 5864 + }, + { + "epoch": 0.9181277395115842, + "grad_norm": 1.162492036819458, + "learning_rate": 5.639459107201043e-05, + "loss": 0.4651, + "step": 5865 + }, + { + "epoch": 0.9182842830306825, + "grad_norm": 1.9813909530639648, + "learning_rate": 5.638644509612252e-05, + "loss": 0.435, + "step": 5866 + }, + { + "epoch": 0.9184408265497809, + "grad_norm": 1.8703290224075317, + "learning_rate": 5.637829912023461e-05, + "loss": 0.4423, + "step": 5867 + }, + { + "epoch": 0.9185973700688791, + "grad_norm": 8.12370491027832, + "learning_rate": 5.6370153144346695e-05, + "loss": 0.7439, + "step": 5868 + }, + { + "epoch": 0.9187539135879774, + "grad_norm": 2.163707971572876, + "learning_rate": 5.6362007168458786e-05, + "loss": 0.5096, + "step": 5869 + }, + { + "epoch": 0.9189104571070758, + "grad_norm": 1.7443746328353882, + "learning_rate": 5.6353861192570876e-05, + "loss": 0.8183, + "step": 5870 + }, + { + "epoch": 0.9190670006261741, + "grad_norm": 1.430751919746399, + "learning_rate": 5.634571521668296e-05, + "loss": 0.2961, + "step": 5871 + }, + { + "epoch": 0.9192235441452724, + "grad_norm": 1.7890346050262451, + "learning_rate": 5.633756924079505e-05, + "loss": 0.4602, + "step": 5872 + }, + { + "epoch": 0.9193800876643707, + "grad_norm": 2.0673258304595947, + "learning_rate": 5.632942326490714e-05, + "loss": 0.9098, + "step": 
5873 + }, + { + "epoch": 0.919536631183469, + "grad_norm": 1.1336114406585693, + "learning_rate": 5.6321277289019225e-05, + "loss": 0.4313, + "step": 5874 + }, + { + "epoch": 0.9196931747025673, + "grad_norm": 3.2422988414764404, + "learning_rate": 5.6313131313131316e-05, + "loss": 0.4289, + "step": 5875 + }, + { + "epoch": 0.9198497182216656, + "grad_norm": 1.536336064338684, + "learning_rate": 5.630498533724341e-05, + "loss": 0.4026, + "step": 5876 + }, + { + "epoch": 0.9200062617407639, + "grad_norm": 3.5071606636047363, + "learning_rate": 5.629683936135549e-05, + "loss": 0.7245, + "step": 5877 + }, + { + "epoch": 0.9201628052598623, + "grad_norm": 3.9274885654449463, + "learning_rate": 5.628869338546758e-05, + "loss": 0.9597, + "step": 5878 + }, + { + "epoch": 0.9203193487789606, + "grad_norm": 2.311702013015747, + "learning_rate": 5.628054740957968e-05, + "loss": 0.7638, + "step": 5879 + }, + { + "epoch": 0.9204758922980588, + "grad_norm": 2.5375020503997803, + "learning_rate": 5.6272401433691755e-05, + "loss": 1.1626, + "step": 5880 + }, + { + "epoch": 0.9206324358171571, + "grad_norm": 2.8421969413757324, + "learning_rate": 5.626425545780385e-05, + "loss": 0.6257, + "step": 5881 + }, + { + "epoch": 0.9207889793362555, + "grad_norm": 3.7737224102020264, + "learning_rate": 5.625610948191594e-05, + "loss": 0.6454, + "step": 5882 + }, + { + "epoch": 0.9209455228553538, + "grad_norm": 2.3813657760620117, + "learning_rate": 5.6247963506028026e-05, + "loss": 0.6717, + "step": 5883 + }, + { + "epoch": 0.9211020663744521, + "grad_norm": 3.0000627040863037, + "learning_rate": 5.623981753014012e-05, + "loss": 1.0373, + "step": 5884 + }, + { + "epoch": 0.9212586098935505, + "grad_norm": 2.0780389308929443, + "learning_rate": 5.6231671554252194e-05, + "loss": 0.5852, + "step": 5885 + }, + { + "epoch": 0.9214151534126487, + "grad_norm": 3.3037562370300293, + "learning_rate": 5.622352557836429e-05, + "loss": 1.0639, + "step": 5886 + }, + { + "epoch": 0.921571696931747, + 
"grad_norm": 3.1686434745788574, + "learning_rate": 5.621537960247638e-05, + "loss": 1.0296, + "step": 5887 + }, + { + "epoch": 0.9217282404508453, + "grad_norm": 5.579136848449707, + "learning_rate": 5.6207233626588465e-05, + "loss": 1.1994, + "step": 5888 + }, + { + "epoch": 0.9218847839699437, + "grad_norm": 3.030250310897827, + "learning_rate": 5.6199087650700556e-05, + "loss": 1.3882, + "step": 5889 + }, + { + "epoch": 0.922041327489042, + "grad_norm": 2.1622378826141357, + "learning_rate": 5.6190941674812646e-05, + "loss": 1.1412, + "step": 5890 + }, + { + "epoch": 0.9221978710081402, + "grad_norm": 1.8687773942947388, + "learning_rate": 5.618279569892473e-05, + "loss": 1.1695, + "step": 5891 + }, + { + "epoch": 0.9223544145272385, + "grad_norm": 2.60247802734375, + "learning_rate": 5.617464972303682e-05, + "loss": 1.0183, + "step": 5892 + }, + { + "epoch": 0.9225109580463369, + "grad_norm": 3.9566094875335693, + "learning_rate": 5.616650374714891e-05, + "loss": 1.3807, + "step": 5893 + }, + { + "epoch": 0.9226675015654352, + "grad_norm": 3.0139660835266113, + "learning_rate": 5.6158357771260995e-05, + "loss": 0.9324, + "step": 5894 + }, + { + "epoch": 0.9228240450845335, + "grad_norm": 2.6354336738586426, + "learning_rate": 5.6150211795373085e-05, + "loss": 0.6322, + "step": 5895 + }, + { + "epoch": 0.9229805886036319, + "grad_norm": 2.0620462894439697, + "learning_rate": 5.6142065819485176e-05, + "loss": 0.3755, + "step": 5896 + }, + { + "epoch": 0.9231371321227301, + "grad_norm": 3.5353946685791016, + "learning_rate": 5.613391984359726e-05, + "loss": 1.006, + "step": 5897 + }, + { + "epoch": 0.9232936756418284, + "grad_norm": 4.043470859527588, + "learning_rate": 5.612577386770935e-05, + "loss": 0.9942, + "step": 5898 + }, + { + "epoch": 0.9234502191609267, + "grad_norm": 6.177516460418701, + "learning_rate": 5.611762789182145e-05, + "loss": 0.5055, + "step": 5899 + }, + { + "epoch": 0.9236067626800251, + "grad_norm": 5.454159736633301, + "learning_rate": 
5.6109481915933524e-05, + "loss": 1.292, + "step": 5900 + }, + { + "epoch": 0.9237633061991234, + "grad_norm": 0.585024356842041, + "learning_rate": 5.610133594004562e-05, + "loss": 0.3113, + "step": 5901 + }, + { + "epoch": 0.9239198497182217, + "grad_norm": 0.9380548596382141, + "learning_rate": 5.609318996415771e-05, + "loss": 0.3265, + "step": 5902 + }, + { + "epoch": 0.9240763932373199, + "grad_norm": 1.278571605682373, + "learning_rate": 5.608504398826979e-05, + "loss": 0.1969, + "step": 5903 + }, + { + "epoch": 0.9242329367564183, + "grad_norm": 0.9056649208068848, + "learning_rate": 5.6076898012381887e-05, + "loss": 0.2719, + "step": 5904 + }, + { + "epoch": 0.9243894802755166, + "grad_norm": 0.6293573379516602, + "learning_rate": 5.606875203649398e-05, + "loss": 0.2557, + "step": 5905 + }, + { + "epoch": 0.9245460237946149, + "grad_norm": 1.038644790649414, + "learning_rate": 5.606060606060606e-05, + "loss": 0.2815, + "step": 5906 + }, + { + "epoch": 0.9247025673137133, + "grad_norm": 0.7282165884971619, + "learning_rate": 5.605246008471815e-05, + "loss": 0.1976, + "step": 5907 + }, + { + "epoch": 0.9248591108328115, + "grad_norm": 2.783550262451172, + "learning_rate": 5.604431410883024e-05, + "loss": 0.6033, + "step": 5908 + }, + { + "epoch": 0.9250156543519098, + "grad_norm": 0.8846545219421387, + "learning_rate": 5.6036168132942326e-05, + "loss": 0.3496, + "step": 5909 + }, + { + "epoch": 0.9251721978710081, + "grad_norm": 1.1276614665985107, + "learning_rate": 5.6028022157054416e-05, + "loss": 0.362, + "step": 5910 + }, + { + "epoch": 0.9253287413901065, + "grad_norm": 1.3047834634780884, + "learning_rate": 5.601987618116651e-05, + "loss": 0.2713, + "step": 5911 + }, + { + "epoch": 0.9254852849092048, + "grad_norm": 2.8469998836517334, + "learning_rate": 5.601173020527859e-05, + "loss": 0.3653, + "step": 5912 + }, + { + "epoch": 0.9256418284283031, + "grad_norm": 1.128003716468811, + "learning_rate": 5.600358422939068e-05, + "loss": 0.7109, + "step": 
5913 + }, + { + "epoch": 0.9257983719474013, + "grad_norm": 1.6421388387680054, + "learning_rate": 5.599543825350277e-05, + "loss": 0.3094, + "step": 5914 + }, + { + "epoch": 0.9259549154664997, + "grad_norm": 1.1446313858032227, + "learning_rate": 5.5987292277614855e-05, + "loss": 0.4439, + "step": 5915 + }, + { + "epoch": 0.926111458985598, + "grad_norm": 2.218585968017578, + "learning_rate": 5.5979146301726946e-05, + "loss": 0.6278, + "step": 5916 + }, + { + "epoch": 0.9262680025046963, + "grad_norm": 1.4253005981445312, + "learning_rate": 5.597100032583904e-05, + "loss": 0.5594, + "step": 5917 + }, + { + "epoch": 0.9264245460237946, + "grad_norm": 1.429388403892517, + "learning_rate": 5.596285434995112e-05, + "loss": 0.2642, + "step": 5918 + }, + { + "epoch": 0.926581089542893, + "grad_norm": 0.7703977823257446, + "learning_rate": 5.595470837406321e-05, + "loss": 0.2749, + "step": 5919 + }, + { + "epoch": 0.9267376330619912, + "grad_norm": 1.7231454849243164, + "learning_rate": 5.594656239817531e-05, + "loss": 0.5604, + "step": 5920 + }, + { + "epoch": 0.9268941765810895, + "grad_norm": 1.4617310762405396, + "learning_rate": 5.5938416422287385e-05, + "loss": 0.4875, + "step": 5921 + }, + { + "epoch": 0.9270507201001879, + "grad_norm": 1.296615719795227, + "learning_rate": 5.593027044639948e-05, + "loss": 0.4031, + "step": 5922 + }, + { + "epoch": 0.9272072636192862, + "grad_norm": 3.796126127243042, + "learning_rate": 5.592212447051157e-05, + "loss": 0.51, + "step": 5923 + }, + { + "epoch": 0.9273638071383845, + "grad_norm": 2.4103543758392334, + "learning_rate": 5.5913978494623656e-05, + "loss": 0.5423, + "step": 5924 + }, + { + "epoch": 0.9275203506574827, + "grad_norm": 3.329827070236206, + "learning_rate": 5.590583251873575e-05, + "loss": 0.4415, + "step": 5925 + }, + { + "epoch": 0.9276768941765811, + "grad_norm": 1.438841462135315, + "learning_rate": 5.589768654284784e-05, + "loss": 0.5618, + "step": 5926 + }, + { + "epoch": 0.9278334376956794, + 
"grad_norm": 2.1622979640960693, + "learning_rate": 5.588954056695992e-05, + "loss": 0.7428, + "step": 5927 + }, + { + "epoch": 0.9279899812147777, + "grad_norm": 1.740425944328308, + "learning_rate": 5.588139459107201e-05, + "loss": 0.5345, + "step": 5928 + }, + { + "epoch": 0.928146524733876, + "grad_norm": 1.751548409461975, + "learning_rate": 5.58732486151841e-05, + "loss": 0.5165, + "step": 5929 + }, + { + "epoch": 0.9283030682529744, + "grad_norm": 1.1133090257644653, + "learning_rate": 5.5865102639296186e-05, + "loss": 0.3222, + "step": 5930 + }, + { + "epoch": 0.9284596117720726, + "grad_norm": 1.8430182933807373, + "learning_rate": 5.5856956663408277e-05, + "loss": 0.6739, + "step": 5931 + }, + { + "epoch": 0.9286161552911709, + "grad_norm": 2.1542317867279053, + "learning_rate": 5.584881068752037e-05, + "loss": 0.7937, + "step": 5932 + }, + { + "epoch": 0.9287726988102692, + "grad_norm": 1.9720276594161987, + "learning_rate": 5.584066471163245e-05, + "loss": 0.8902, + "step": 5933 + }, + { + "epoch": 0.9289292423293676, + "grad_norm": 2.3973569869995117, + "learning_rate": 5.583251873574454e-05, + "loss": 0.8929, + "step": 5934 + }, + { + "epoch": 0.9290857858484659, + "grad_norm": 3.0641469955444336, + "learning_rate": 5.582437275985664e-05, + "loss": 0.6421, + "step": 5935 + }, + { + "epoch": 0.9292423293675642, + "grad_norm": 2.3113574981689453, + "learning_rate": 5.5816226783968716e-05, + "loss": 1.2535, + "step": 5936 + }, + { + "epoch": 0.9293988728866625, + "grad_norm": 3.4048542976379395, + "learning_rate": 5.5808080808080806e-05, + "loss": 1.0383, + "step": 5937 + }, + { + "epoch": 0.9295554164057608, + "grad_norm": 5.482643127441406, + "learning_rate": 5.5799934832192903e-05, + "loss": 0.9394, + "step": 5938 + }, + { + "epoch": 0.9297119599248591, + "grad_norm": 4.407021999359131, + "learning_rate": 5.579178885630498e-05, + "loss": 1.8432, + "step": 5939 + }, + { + "epoch": 0.9298685034439574, + "grad_norm": 3.810326099395752, + "learning_rate": 
5.578364288041708e-05, + "loss": 0.9262, + "step": 5940 + }, + { + "epoch": 0.9300250469630558, + "grad_norm": 1.6734158992767334, + "learning_rate": 5.577549690452917e-05, + "loss": 0.5605, + "step": 5941 + }, + { + "epoch": 0.930181590482154, + "grad_norm": 4.337081432342529, + "learning_rate": 5.576735092864125e-05, + "loss": 1.0746, + "step": 5942 + }, + { + "epoch": 0.9303381340012523, + "grad_norm": 3.22505259513855, + "learning_rate": 5.575920495275334e-05, + "loss": 1.0478, + "step": 5943 + }, + { + "epoch": 0.9304946775203506, + "grad_norm": 3.250194787979126, + "learning_rate": 5.575105897686543e-05, + "loss": 1.0596, + "step": 5944 + }, + { + "epoch": 0.930651221039449, + "grad_norm": 4.560720443725586, + "learning_rate": 5.574291300097752e-05, + "loss": 1.3013, + "step": 5945 + }, + { + "epoch": 0.9308077645585473, + "grad_norm": 5.742722988128662, + "learning_rate": 5.573476702508961e-05, + "loss": 1.0605, + "step": 5946 + }, + { + "epoch": 0.9309643080776456, + "grad_norm": 1.9229604005813599, + "learning_rate": 5.57266210492017e-05, + "loss": 1.2312, + "step": 5947 + }, + { + "epoch": 0.9311208515967438, + "grad_norm": 3.0936131477355957, + "learning_rate": 5.571847507331378e-05, + "loss": 0.5623, + "step": 5948 + }, + { + "epoch": 0.9312773951158422, + "grad_norm": 3.424055337905884, + "learning_rate": 5.571032909742587e-05, + "loss": 0.7434, + "step": 5949 + }, + { + "epoch": 0.9314339386349405, + "grad_norm": 2.844520330429077, + "learning_rate": 5.570218312153796e-05, + "loss": 0.8376, + "step": 5950 + }, + { + "epoch": 0.9315904821540388, + "grad_norm": 0.5454773902893066, + "learning_rate": 5.5694037145650046e-05, + "loss": 0.3196, + "step": 5951 + }, + { + "epoch": 0.9317470256731372, + "grad_norm": 0.5380678176879883, + "learning_rate": 5.568589116976214e-05, + "loss": 0.2121, + "step": 5952 + }, + { + "epoch": 0.9319035691922355, + "grad_norm": 0.7230846881866455, + "learning_rate": 5.5677745193874234e-05, + "loss": 0.1761, + "step": 5953 + 
}, + { + "epoch": 0.9320601127113337, + "grad_norm": 0.6854438781738281, + "learning_rate": 5.566959921798631e-05, + "loss": 0.3018, + "step": 5954 + }, + { + "epoch": 0.932216656230432, + "grad_norm": 2.9334025382995605, + "learning_rate": 5.56614532420984e-05, + "loss": 0.4555, + "step": 5955 + }, + { + "epoch": 0.9323731997495304, + "grad_norm": 0.5883039236068726, + "learning_rate": 5.56533072662105e-05, + "loss": 0.2992, + "step": 5956 + }, + { + "epoch": 0.9325297432686287, + "grad_norm": 2.600187301635742, + "learning_rate": 5.5645161290322576e-05, + "loss": 0.5274, + "step": 5957 + }, + { + "epoch": 0.932686286787727, + "grad_norm": 0.5567541718482971, + "learning_rate": 5.563701531443467e-05, + "loss": 0.2221, + "step": 5958 + }, + { + "epoch": 0.9328428303068252, + "grad_norm": 0.6628239154815674, + "learning_rate": 5.5628869338546764e-05, + "loss": 0.2585, + "step": 5959 + }, + { + "epoch": 0.9329993738259236, + "grad_norm": 1.0294798612594604, + "learning_rate": 5.562072336265885e-05, + "loss": 0.392, + "step": 5960 + }, + { + "epoch": 0.9331559173450219, + "grad_norm": 1.9239901304244995, + "learning_rate": 5.561257738677094e-05, + "loss": 0.3537, + "step": 5961 + }, + { + "epoch": 0.9333124608641202, + "grad_norm": 1.2613896131515503, + "learning_rate": 5.560443141088303e-05, + "loss": 0.5455, + "step": 5962 + }, + { + "epoch": 0.9334690043832186, + "grad_norm": 1.2586708068847656, + "learning_rate": 5.559628543499511e-05, + "loss": 0.2743, + "step": 5963 + }, + { + "epoch": 0.9336255479023169, + "grad_norm": 1.7663739919662476, + "learning_rate": 5.55881394591072e-05, + "loss": 0.4733, + "step": 5964 + }, + { + "epoch": 0.9337820914214151, + "grad_norm": 6.41594123840332, + "learning_rate": 5.5579993483219293e-05, + "loss": 1.1611, + "step": 5965 + }, + { + "epoch": 0.9339386349405134, + "grad_norm": 0.7718617916107178, + "learning_rate": 5.557184750733138e-05, + "loss": 0.2932, + "step": 5966 + }, + { + "epoch": 0.9340951784596118, + "grad_norm": 
1.2264654636383057, + "learning_rate": 5.556370153144347e-05, + "loss": 0.5549, + "step": 5967 + }, + { + "epoch": 0.9342517219787101, + "grad_norm": 1.3138896226882935, + "learning_rate": 5.555555555555556e-05, + "loss": 0.3374, + "step": 5968 + }, + { + "epoch": 0.9344082654978084, + "grad_norm": 2.041210412979126, + "learning_rate": 5.554740957966764e-05, + "loss": 0.478, + "step": 5969 + }, + { + "epoch": 0.9345648090169068, + "grad_norm": 2.11377215385437, + "learning_rate": 5.553926360377973e-05, + "loss": 0.4747, + "step": 5970 + }, + { + "epoch": 0.934721352536005, + "grad_norm": 1.8172807693481445, + "learning_rate": 5.553111762789183e-05, + "loss": 0.5648, + "step": 5971 + }, + { + "epoch": 0.9348778960551033, + "grad_norm": 0.9809459447860718, + "learning_rate": 5.552297165200391e-05, + "loss": 0.4599, + "step": 5972 + }, + { + "epoch": 0.9350344395742016, + "grad_norm": 2.937228202819824, + "learning_rate": 5.5514825676116e-05, + "loss": 0.5481, + "step": 5973 + }, + { + "epoch": 0.9351909830933, + "grad_norm": 2.1043498516082764, + "learning_rate": 5.5506679700228095e-05, + "loss": 0.614, + "step": 5974 + }, + { + "epoch": 0.9353475266123983, + "grad_norm": 2.7475788593292236, + "learning_rate": 5.549853372434017e-05, + "loss": 0.5803, + "step": 5975 + }, + { + "epoch": 0.9355040701314965, + "grad_norm": 4.213901042938232, + "learning_rate": 5.549038774845227e-05, + "loss": 0.7818, + "step": 5976 + }, + { + "epoch": 0.9356606136505948, + "grad_norm": 2.592068672180176, + "learning_rate": 5.548224177256436e-05, + "loss": 0.6025, + "step": 5977 + }, + { + "epoch": 0.9358171571696932, + "grad_norm": 3.213937759399414, + "learning_rate": 5.5474095796676436e-05, + "loss": 0.7889, + "step": 5978 + }, + { + "epoch": 0.9359737006887915, + "grad_norm": 1.779938817024231, + "learning_rate": 5.5465949820788534e-05, + "loss": 0.4813, + "step": 5979 + }, + { + "epoch": 0.9361302442078898, + "grad_norm": 3.2269694805145264, + "learning_rate": 5.5457803844900624e-05, 
+ "loss": 0.7326, + "step": 5980 + }, + { + "epoch": 0.9362867877269881, + "grad_norm": 2.009551763534546, + "learning_rate": 5.544965786901271e-05, + "loss": 0.824, + "step": 5981 + }, + { + "epoch": 0.9364433312460864, + "grad_norm": 1.9612061977386475, + "learning_rate": 5.54415118931248e-05, + "loss": 0.4312, + "step": 5982 + }, + { + "epoch": 0.9365998747651847, + "grad_norm": 5.861486434936523, + "learning_rate": 5.543336591723689e-05, + "loss": 1.14, + "step": 5983 + }, + { + "epoch": 0.936756418284283, + "grad_norm": 5.8112053871154785, + "learning_rate": 5.542521994134897e-05, + "loss": 1.0509, + "step": 5984 + }, + { + "epoch": 0.9369129618033814, + "grad_norm": 2.867262363433838, + "learning_rate": 5.541707396546106e-05, + "loss": 0.9429, + "step": 5985 + }, + { + "epoch": 0.9370695053224797, + "grad_norm": 1.93150794506073, + "learning_rate": 5.5408927989573154e-05, + "loss": 0.4645, + "step": 5986 + }, + { + "epoch": 0.937226048841578, + "grad_norm": 6.300821304321289, + "learning_rate": 5.540078201368524e-05, + "loss": 1.3439, + "step": 5987 + }, + { + "epoch": 0.9373825923606762, + "grad_norm": 4.322120189666748, + "learning_rate": 5.539263603779733e-05, + "loss": 0.8987, + "step": 5988 + }, + { + "epoch": 0.9375391358797746, + "grad_norm": 4.383589267730713, + "learning_rate": 5.5384490061909425e-05, + "loss": 1.3041, + "step": 5989 + }, + { + "epoch": 0.9376956793988729, + "grad_norm": 3.6460044384002686, + "learning_rate": 5.53763440860215e-05, + "loss": 0.6958, + "step": 5990 + }, + { + "epoch": 0.9378522229179712, + "grad_norm": 3.467339038848877, + "learning_rate": 5.536819811013359e-05, + "loss": 1.206, + "step": 5991 + }, + { + "epoch": 0.9380087664370695, + "grad_norm": 3.4302709102630615, + "learning_rate": 5.536005213424569e-05, + "loss": 0.651, + "step": 5992 + }, + { + "epoch": 0.9381653099561679, + "grad_norm": 3.1747353076934814, + "learning_rate": 5.535190615835777e-05, + "loss": 0.8834, + "step": 5993 + }, + { + "epoch": 
0.9383218534752661, + "grad_norm": 4.789120674133301, + "learning_rate": 5.5343760182469864e-05, + "loss": 1.1253, + "step": 5994 + }, + { + "epoch": 0.9384783969943644, + "grad_norm": 4.885626792907715, + "learning_rate": 5.5335614206581955e-05, + "loss": 1.1139, + "step": 5995 + }, + { + "epoch": 0.9386349405134627, + "grad_norm": 4.155054092407227, + "learning_rate": 5.532746823069403e-05, + "loss": 0.8002, + "step": 5996 + }, + { + "epoch": 0.9387914840325611, + "grad_norm": 1.278029441833496, + "learning_rate": 5.531932225480613e-05, + "loss": 0.6504, + "step": 5997 + }, + { + "epoch": 0.9389480275516594, + "grad_norm": 6.005850315093994, + "learning_rate": 5.531117627891822e-05, + "loss": 1.0767, + "step": 5998 + }, + { + "epoch": 0.9391045710707576, + "grad_norm": 2.9828882217407227, + "learning_rate": 5.5303030303030304e-05, + "loss": 1.2831, + "step": 5999 + }, + { + "epoch": 0.939261114589856, + "grad_norm": 6.20111083984375, + "learning_rate": 5.5294884327142394e-05, + "loss": 1.4331, + "step": 6000 + }, + { + "epoch": 0.939261114589856, + "eval_loss": 0.5385640263557434, + "eval_runtime": 202.7076, + "eval_samples_per_second": 61.088, + "eval_steps_per_second": 3.818, + "eval_wer": 0.3279745840151306, + "step": 6000 + }, + { + "epoch": 0.9394176581089543, + "grad_norm": 0.45739564299583435, + "learning_rate": 5.5286738351254485e-05, + "loss": 0.2359, + "step": 6001 + }, + { + "epoch": 0.9395742016280526, + "grad_norm": 0.7036822438240051, + "learning_rate": 5.527859237536657e-05, + "loss": 0.2401, + "step": 6002 + }, + { + "epoch": 0.9397307451471509, + "grad_norm": 0.6641222834587097, + "learning_rate": 5.527044639947866e-05, + "loss": 0.2783, + "step": 6003 + }, + { + "epoch": 0.9398872886662493, + "grad_norm": 0.7817454934120178, + "learning_rate": 5.526230042359075e-05, + "loss": 0.2506, + "step": 6004 + }, + { + "epoch": 0.9400438321853475, + "grad_norm": 0.7956076860427856, + "learning_rate": 5.525415444770283e-05, + "loss": 0.377, + "step": 6005 
+ }, + { + "epoch": 0.9402003757044458, + "grad_norm": 0.8571197986602783, + "learning_rate": 5.5246008471814924e-05, + "loss": 0.2795, + "step": 6006 + }, + { + "epoch": 0.9403569192235441, + "grad_norm": 0.44949957728385925, + "learning_rate": 5.5237862495927014e-05, + "loss": 0.2569, + "step": 6007 + }, + { + "epoch": 0.9405134627426425, + "grad_norm": 0.8823639154434204, + "learning_rate": 5.52297165200391e-05, + "loss": 0.2454, + "step": 6008 + }, + { + "epoch": 0.9406700062617408, + "grad_norm": 0.5709412097930908, + "learning_rate": 5.522157054415119e-05, + "loss": 0.2297, + "step": 6009 + }, + { + "epoch": 0.9408265497808391, + "grad_norm": 1.0281107425689697, + "learning_rate": 5.5213424568263286e-05, + "loss": 0.3125, + "step": 6010 + }, + { + "epoch": 0.9409830932999373, + "grad_norm": 0.5475720167160034, + "learning_rate": 5.520527859237536e-05, + "loss": 0.2388, + "step": 6011 + }, + { + "epoch": 0.9411396368190357, + "grad_norm": 1.5307281017303467, + "learning_rate": 5.519713261648746e-05, + "loss": 0.2881, + "step": 6012 + }, + { + "epoch": 0.941296180338134, + "grad_norm": 1.1175769567489624, + "learning_rate": 5.518898664059955e-05, + "loss": 0.4067, + "step": 6013 + }, + { + "epoch": 0.9414527238572323, + "grad_norm": 1.1175353527069092, + "learning_rate": 5.518084066471163e-05, + "loss": 0.3125, + "step": 6014 + }, + { + "epoch": 0.9416092673763307, + "grad_norm": 1.4056910276412964, + "learning_rate": 5.5172694688823725e-05, + "loss": 0.3649, + "step": 6015 + }, + { + "epoch": 0.9417658108954289, + "grad_norm": 1.4787929058074951, + "learning_rate": 5.5164548712935815e-05, + "loss": 0.371, + "step": 6016 + }, + { + "epoch": 0.9419223544145272, + "grad_norm": 1.4776294231414795, + "learning_rate": 5.51564027370479e-05, + "loss": 0.3554, + "step": 6017 + }, + { + "epoch": 0.9420788979336255, + "grad_norm": 2.6284024715423584, + "learning_rate": 5.514825676115999e-05, + "loss": 0.741, + "step": 6018 + }, + { + "epoch": 0.9422354414527239, + 
"grad_norm": 1.3238672018051147, + "learning_rate": 5.514011078527208e-05, + "loss": 0.5631, + "step": 6019 + }, + { + "epoch": 0.9423919849718222, + "grad_norm": 1.462795615196228, + "learning_rate": 5.5131964809384164e-05, + "loss": 0.4937, + "step": 6020 + }, + { + "epoch": 0.9425485284909205, + "grad_norm": 2.699808120727539, + "learning_rate": 5.5123818833496254e-05, + "loss": 0.3548, + "step": 6021 + }, + { + "epoch": 0.9427050720100187, + "grad_norm": 1.9005050659179688, + "learning_rate": 5.5115672857608345e-05, + "loss": 0.4075, + "step": 6022 + }, + { + "epoch": 0.9428616155291171, + "grad_norm": 2.1674396991729736, + "learning_rate": 5.510752688172043e-05, + "loss": 0.4306, + "step": 6023 + }, + { + "epoch": 0.9430181590482154, + "grad_norm": 2.107959270477295, + "learning_rate": 5.509938090583252e-05, + "loss": 0.7261, + "step": 6024 + }, + { + "epoch": 0.9431747025673137, + "grad_norm": 1.6260392665863037, + "learning_rate": 5.509123492994461e-05, + "loss": 0.4477, + "step": 6025 + }, + { + "epoch": 0.9433312460864121, + "grad_norm": 5.39668083190918, + "learning_rate": 5.5083088954056694e-05, + "loss": 0.6708, + "step": 6026 + }, + { + "epoch": 0.9434877896055104, + "grad_norm": 4.290079593658447, + "learning_rate": 5.5074942978168784e-05, + "loss": 0.7285, + "step": 6027 + }, + { + "epoch": 0.9436443331246086, + "grad_norm": 2.2853217124938965, + "learning_rate": 5.506679700228088e-05, + "loss": 0.7657, + "step": 6028 + }, + { + "epoch": 0.9438008766437069, + "grad_norm": 1.9476042985916138, + "learning_rate": 5.505865102639296e-05, + "loss": 0.8039, + "step": 6029 + }, + { + "epoch": 0.9439574201628053, + "grad_norm": 2.741121768951416, + "learning_rate": 5.5050505050505056e-05, + "loss": 0.7319, + "step": 6030 + }, + { + "epoch": 0.9441139636819036, + "grad_norm": 3.0978474617004395, + "learning_rate": 5.5042359074617146e-05, + "loss": 1.0628, + "step": 6031 + }, + { + "epoch": 0.9442705072010019, + "grad_norm": 2.030190944671631, + 
"learning_rate": 5.503421309872922e-05, + "loss": 0.5555, + "step": 6032 + }, + { + "epoch": 0.9444270507201001, + "grad_norm": 3.3656437397003174, + "learning_rate": 5.502606712284132e-05, + "loss": 1.5035, + "step": 6033 + }, + { + "epoch": 0.9445835942391985, + "grad_norm": 3.867708921432495, + "learning_rate": 5.501792114695341e-05, + "loss": 0.6184, + "step": 6034 + }, + { + "epoch": 0.9447401377582968, + "grad_norm": 3.208928346633911, + "learning_rate": 5.5009775171065495e-05, + "loss": 0.7782, + "step": 6035 + }, + { + "epoch": 0.9448966812773951, + "grad_norm": 4.077434539794922, + "learning_rate": 5.5001629195177585e-05, + "loss": 1.2491, + "step": 6036 + }, + { + "epoch": 0.9450532247964935, + "grad_norm": 2.53130841255188, + "learning_rate": 5.4993483219289676e-05, + "loss": 1.1192, + "step": 6037 + }, + { + "epoch": 0.9452097683155918, + "grad_norm": 2.4781649112701416, + "learning_rate": 5.498533724340176e-05, + "loss": 1.1069, + "step": 6038 + }, + { + "epoch": 0.94536631183469, + "grad_norm": 3.171342372894287, + "learning_rate": 5.497719126751385e-05, + "loss": 1.3417, + "step": 6039 + }, + { + "epoch": 0.9455228553537883, + "grad_norm": 3.2505016326904297, + "learning_rate": 5.496904529162594e-05, + "loss": 0.9691, + "step": 6040 + }, + { + "epoch": 0.9456793988728867, + "grad_norm": 3.048201560974121, + "learning_rate": 5.4960899315738024e-05, + "loss": 0.8175, + "step": 6041 + }, + { + "epoch": 0.945835942391985, + "grad_norm": 3.754199504852295, + "learning_rate": 5.4952753339850115e-05, + "loss": 1.1609, + "step": 6042 + }, + { + "epoch": 0.9459924859110833, + "grad_norm": 2.358083486557007, + "learning_rate": 5.4944607363962205e-05, + "loss": 1.3812, + "step": 6043 + }, + { + "epoch": 0.9461490294301816, + "grad_norm": 2.8667285442352295, + "learning_rate": 5.493646138807429e-05, + "loss": 1.3263, + "step": 6044 + }, + { + "epoch": 0.9463055729492799, + "grad_norm": 3.5335445404052734, + "learning_rate": 5.492831541218638e-05, + "loss": 
1.1956, + "step": 6045 + }, + { + "epoch": 0.9464621164683782, + "grad_norm": 4.139008045196533, + "learning_rate": 5.492016943629848e-05, + "loss": 1.0, + "step": 6046 + }, + { + "epoch": 0.9466186599874765, + "grad_norm": 1.881716012954712, + "learning_rate": 5.4912023460410554e-05, + "loss": 0.4364, + "step": 6047 + }, + { + "epoch": 0.9467752035065748, + "grad_norm": 4.235720157623291, + "learning_rate": 5.4903877484522644e-05, + "loss": 0.9895, + "step": 6048 + }, + { + "epoch": 0.9469317470256732, + "grad_norm": 4.037186622619629, + "learning_rate": 5.489573150863474e-05, + "loss": 1.1095, + "step": 6049 + }, + { + "epoch": 0.9470882905447714, + "grad_norm": 2.019585132598877, + "learning_rate": 5.488758553274682e-05, + "loss": 0.6792, + "step": 6050 + }, + { + "epoch": 0.9472448340638697, + "grad_norm": 0.5855453014373779, + "learning_rate": 5.4879439556858916e-05, + "loss": 0.3423, + "step": 6051 + }, + { + "epoch": 0.947401377582968, + "grad_norm": 0.5344957113265991, + "learning_rate": 5.4871293580971007e-05, + "loss": 0.282, + "step": 6052 + }, + { + "epoch": 0.9475579211020664, + "grad_norm": 0.5378496646881104, + "learning_rate": 5.486314760508309e-05, + "loss": 0.2461, + "step": 6053 + }, + { + "epoch": 0.9477144646211647, + "grad_norm": 0.6376375555992126, + "learning_rate": 5.485500162919518e-05, + "loss": 0.2114, + "step": 6054 + }, + { + "epoch": 0.947871008140263, + "grad_norm": 0.8486429452896118, + "learning_rate": 5.484685565330727e-05, + "loss": 0.3516, + "step": 6055 + }, + { + "epoch": 0.9480275516593613, + "grad_norm": 0.491794228553772, + "learning_rate": 5.4838709677419355e-05, + "loss": 0.2708, + "step": 6056 + }, + { + "epoch": 0.9481840951784596, + "grad_norm": 0.6281721591949463, + "learning_rate": 5.4830563701531446e-05, + "loss": 0.359, + "step": 6057 + }, + { + "epoch": 0.9483406386975579, + "grad_norm": 0.7014753818511963, + "learning_rate": 5.4822417725643536e-05, + "loss": 0.3143, + "step": 6058 + }, + { + "epoch": 
0.9484971822166562, + "grad_norm": 0.7170886993408203, + "learning_rate": 5.481427174975562e-05, + "loss": 0.2533, + "step": 6059 + }, + { + "epoch": 0.9486537257357546, + "grad_norm": 1.3650339841842651, + "learning_rate": 5.480612577386771e-05, + "loss": 0.3036, + "step": 6060 + }, + { + "epoch": 0.9488102692548529, + "grad_norm": 0.9287976026535034, + "learning_rate": 5.47979797979798e-05, + "loss": 0.31, + "step": 6061 + }, + { + "epoch": 0.9489668127739511, + "grad_norm": 0.6770702600479126, + "learning_rate": 5.4789833822091885e-05, + "loss": 0.3883, + "step": 6062 + }, + { + "epoch": 0.9491233562930494, + "grad_norm": 0.9984747767448425, + "learning_rate": 5.4781687846203975e-05, + "loss": 0.3608, + "step": 6063 + }, + { + "epoch": 0.9492798998121478, + "grad_norm": 0.8738959431648254, + "learning_rate": 5.477354187031607e-05, + "loss": 0.3658, + "step": 6064 + }, + { + "epoch": 0.9494364433312461, + "grad_norm": 1.0981100797653198, + "learning_rate": 5.476539589442815e-05, + "loss": 0.4033, + "step": 6065 + }, + { + "epoch": 0.9495929868503444, + "grad_norm": 1.185802698135376, + "learning_rate": 5.475724991854024e-05, + "loss": 0.4332, + "step": 6066 + }, + { + "epoch": 0.9497495303694427, + "grad_norm": 1.9600502252578735, + "learning_rate": 5.474910394265234e-05, + "loss": 0.5061, + "step": 6067 + }, + { + "epoch": 0.949906073888541, + "grad_norm": 1.2545747756958008, + "learning_rate": 5.4740957966764414e-05, + "loss": 0.4034, + "step": 6068 + }, + { + "epoch": 0.9500626174076393, + "grad_norm": 2.085923671722412, + "learning_rate": 5.473281199087651e-05, + "loss": 0.5676, + "step": 6069 + }, + { + "epoch": 0.9502191609267376, + "grad_norm": 2.020521879196167, + "learning_rate": 5.47246660149886e-05, + "loss": 0.5537, + "step": 6070 + }, + { + "epoch": 0.950375704445836, + "grad_norm": 2.156144142150879, + "learning_rate": 5.4716520039100686e-05, + "loss": 0.5866, + "step": 6071 + }, + { + "epoch": 0.9505322479649343, + "grad_norm": 1.085339903831482, + 
"learning_rate": 5.4708374063212776e-05, + "loss": 0.5793, + "step": 6072 + }, + { + "epoch": 0.9506887914840325, + "grad_norm": 2.1549618244171143, + "learning_rate": 5.470022808732487e-05, + "loss": 0.6602, + "step": 6073 + }, + { + "epoch": 0.9508453350031308, + "grad_norm": 1.2073723077774048, + "learning_rate": 5.469208211143695e-05, + "loss": 0.5402, + "step": 6074 + }, + { + "epoch": 0.9510018785222292, + "grad_norm": 1.8182886838912964, + "learning_rate": 5.468393613554904e-05, + "loss": 0.5222, + "step": 6075 + }, + { + "epoch": 0.9511584220413275, + "grad_norm": 1.1683801412582397, + "learning_rate": 5.467579015966113e-05, + "loss": 0.5614, + "step": 6076 + }, + { + "epoch": 0.9513149655604258, + "grad_norm": 2.431330919265747, + "learning_rate": 5.4667644183773215e-05, + "loss": 0.5472, + "step": 6077 + }, + { + "epoch": 0.9514715090795242, + "grad_norm": 2.0304226875305176, + "learning_rate": 5.4659498207885306e-05, + "loss": 0.5318, + "step": 6078 + }, + { + "epoch": 0.9516280525986224, + "grad_norm": 1.866745114326477, + "learning_rate": 5.4651352231997396e-05, + "loss": 0.6678, + "step": 6079 + }, + { + "epoch": 0.9517845961177207, + "grad_norm": 2.2654240131378174, + "learning_rate": 5.464320625610948e-05, + "loss": 0.6614, + "step": 6080 + }, + { + "epoch": 0.951941139636819, + "grad_norm": 4.580554962158203, + "learning_rate": 5.463506028022157e-05, + "loss": 0.93, + "step": 6081 + }, + { + "epoch": 0.9520976831559174, + "grad_norm": 3.026287078857422, + "learning_rate": 5.462691430433367e-05, + "loss": 0.8998, + "step": 6082 + }, + { + "epoch": 0.9522542266750157, + "grad_norm": 5.884927749633789, + "learning_rate": 5.4618768328445745e-05, + "loss": 0.9745, + "step": 6083 + }, + { + "epoch": 0.9524107701941139, + "grad_norm": 5.203183174133301, + "learning_rate": 5.4610622352557836e-05, + "loss": 1.1751, + "step": 6084 + }, + { + "epoch": 0.9525673137132122, + "grad_norm": 3.9105796813964844, + "learning_rate": 5.460247637666993e-05, + "loss": 
1.0047, + "step": 6085 + }, + { + "epoch": 0.9527238572323106, + "grad_norm": 2.9104368686676025, + "learning_rate": 5.459433040078201e-05, + "loss": 1.13, + "step": 6086 + }, + { + "epoch": 0.9528804007514089, + "grad_norm": 1.3325798511505127, + "learning_rate": 5.458618442489411e-05, + "loss": 0.5441, + "step": 6087 + }, + { + "epoch": 0.9530369442705072, + "grad_norm": 3.1526544094085693, + "learning_rate": 5.45780384490062e-05, + "loss": 0.7441, + "step": 6088 + }, + { + "epoch": 0.9531934877896056, + "grad_norm": 3.779604434967041, + "learning_rate": 5.456989247311828e-05, + "loss": 1.0196, + "step": 6089 + }, + { + "epoch": 0.9533500313087038, + "grad_norm": 12.101778984069824, + "learning_rate": 5.456174649723037e-05, + "loss": 1.2448, + "step": 6090 + }, + { + "epoch": 0.9535065748278021, + "grad_norm": 5.820189476013184, + "learning_rate": 5.455360052134246e-05, + "loss": 1.0976, + "step": 6091 + }, + { + "epoch": 0.9536631183469004, + "grad_norm": 5.1457905769348145, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0244, + "step": 6092 + }, + { + "epoch": 0.9538196618659988, + "grad_norm": 5.173978328704834, + "learning_rate": 5.453730856956664e-05, + "loss": 1.2082, + "step": 6093 + }, + { + "epoch": 0.9539762053850971, + "grad_norm": 2.209398031234741, + "learning_rate": 5.452916259367873e-05, + "loss": 0.7369, + "step": 6094 + }, + { + "epoch": 0.9541327489041954, + "grad_norm": 6.519552230834961, + "learning_rate": 5.452101661779081e-05, + "loss": 1.2341, + "step": 6095 + }, + { + "epoch": 0.9542892924232936, + "grad_norm": 5.321483135223389, + "learning_rate": 5.45128706419029e-05, + "loss": 1.058, + "step": 6096 + }, + { + "epoch": 0.954445835942392, + "grad_norm": 3.593118667602539, + "learning_rate": 5.450472466601499e-05, + "loss": 0.3445, + "step": 6097 + }, + { + "epoch": 0.9546023794614903, + "grad_norm": 4.471179008483887, + "learning_rate": 5.4496578690127076e-05, + "loss": 1.535, + "step": 6098 + }, + { + "epoch": 0.9547589229805886, 
+ "grad_norm": 1.9580414295196533, + "learning_rate": 5.4488432714239166e-05, + "loss": 0.6723, + "step": 6099 + }, + { + "epoch": 0.954915466499687, + "grad_norm": 3.8541243076324463, + "learning_rate": 5.4480286738351264e-05, + "loss": 1.53, + "step": 6100 + }, + { + "epoch": 0.9550720100187852, + "grad_norm": 0.815633237361908, + "learning_rate": 5.447214076246334e-05, + "loss": 0.3053, + "step": 6101 + }, + { + "epoch": 0.9552285535378835, + "grad_norm": 0.7964222431182861, + "learning_rate": 5.446399478657543e-05, + "loss": 0.3006, + "step": 6102 + }, + { + "epoch": 0.9553850970569818, + "grad_norm": 0.8210471272468567, + "learning_rate": 5.445584881068753e-05, + "loss": 0.3399, + "step": 6103 + }, + { + "epoch": 0.9555416405760802, + "grad_norm": 0.6854082345962524, + "learning_rate": 5.4447702834799605e-05, + "loss": 0.2859, + "step": 6104 + }, + { + "epoch": 0.9556981840951785, + "grad_norm": 0.8238632678985596, + "learning_rate": 5.44395568589117e-05, + "loss": 0.2644, + "step": 6105 + }, + { + "epoch": 0.9558547276142768, + "grad_norm": 0.8514614701271057, + "learning_rate": 5.443141088302379e-05, + "loss": 0.2692, + "step": 6106 + }, + { + "epoch": 0.956011271133375, + "grad_norm": 0.6705734133720398, + "learning_rate": 5.442326490713587e-05, + "loss": 0.2396, + "step": 6107 + }, + { + "epoch": 0.9561678146524734, + "grad_norm": 0.8184763193130493, + "learning_rate": 5.441511893124797e-05, + "loss": 0.2958, + "step": 6108 + }, + { + "epoch": 0.9563243581715717, + "grad_norm": 0.6683715581893921, + "learning_rate": 5.440697295536006e-05, + "loss": 0.3257, + "step": 6109 + }, + { + "epoch": 0.95648090169067, + "grad_norm": 0.7657026052474976, + "learning_rate": 5.439882697947214e-05, + "loss": 0.2528, + "step": 6110 + }, + { + "epoch": 0.9566374452097683, + "grad_norm": 0.8463659882545471, + "learning_rate": 5.439068100358423e-05, + "loss": 0.3537, + "step": 6111 + }, + { + "epoch": 0.9567939887288667, + "grad_norm": 1.5671477317810059, + "learning_rate": 
5.438253502769632e-05, + "loss": 0.3774, + "step": 6112 + }, + { + "epoch": 0.9569505322479649, + "grad_norm": 1.354697346687317, + "learning_rate": 5.4374389051808407e-05, + "loss": 0.3835, + "step": 6113 + }, + { + "epoch": 0.9571070757670632, + "grad_norm": 1.1292951107025146, + "learning_rate": 5.43662430759205e-05, + "loss": 0.4401, + "step": 6114 + }, + { + "epoch": 0.9572636192861615, + "grad_norm": 0.7961772680282593, + "learning_rate": 5.435809710003259e-05, + "loss": 0.2987, + "step": 6115 + }, + { + "epoch": 0.9574201628052599, + "grad_norm": 1.9348281621932983, + "learning_rate": 5.434995112414467e-05, + "loss": 0.3214, + "step": 6116 + }, + { + "epoch": 0.9575767063243582, + "grad_norm": 2.482940912246704, + "learning_rate": 5.434180514825676e-05, + "loss": 0.651, + "step": 6117 + }, + { + "epoch": 0.9577332498434565, + "grad_norm": 1.6560194492340088, + "learning_rate": 5.433365917236886e-05, + "loss": 0.5567, + "step": 6118 + }, + { + "epoch": 0.9578897933625548, + "grad_norm": 1.2727673053741455, + "learning_rate": 5.4325513196480936e-05, + "loss": 0.5164, + "step": 6119 + }, + { + "epoch": 0.9580463368816531, + "grad_norm": 4.31023645401001, + "learning_rate": 5.431736722059303e-05, + "loss": 1.0016, + "step": 6120 + }, + { + "epoch": 0.9582028804007514, + "grad_norm": 1.6571321487426758, + "learning_rate": 5.4309221244705124e-05, + "loss": 0.4481, + "step": 6121 + }, + { + "epoch": 0.9583594239198497, + "grad_norm": 2.523447036743164, + "learning_rate": 5.43010752688172e-05, + "loss": 0.7219, + "step": 6122 + }, + { + "epoch": 0.9585159674389481, + "grad_norm": 1.7725909948349, + "learning_rate": 5.42929292929293e-05, + "loss": 0.6874, + "step": 6123 + }, + { + "epoch": 0.9586725109580463, + "grad_norm": 1.8680260181427002, + "learning_rate": 5.428478331704139e-05, + "loss": 0.353, + "step": 6124 + }, + { + "epoch": 0.9588290544771446, + "grad_norm": 2.0811450481414795, + "learning_rate": 5.4276637341153466e-05, + "loss": 0.9684, + "step": 6125 + 
}, + { + "epoch": 0.9589855979962429, + "grad_norm": 1.707165241241455, + "learning_rate": 5.426849136526556e-05, + "loss": 0.4368, + "step": 6126 + }, + { + "epoch": 0.9591421415153413, + "grad_norm": 4.497917175292969, + "learning_rate": 5.4260345389377654e-05, + "loss": 1.0949, + "step": 6127 + }, + { + "epoch": 0.9592986850344396, + "grad_norm": 1.305927038192749, + "learning_rate": 5.425219941348974e-05, + "loss": 0.8586, + "step": 6128 + }, + { + "epoch": 0.9594552285535379, + "grad_norm": 2.3248562812805176, + "learning_rate": 5.424405343760183e-05, + "loss": 0.8113, + "step": 6129 + }, + { + "epoch": 0.9596117720726361, + "grad_norm": 2.83687686920166, + "learning_rate": 5.423590746171392e-05, + "loss": 0.8443, + "step": 6130 + }, + { + "epoch": 0.9597683155917345, + "grad_norm": 1.875752568244934, + "learning_rate": 5.4227761485826e-05, + "loss": 0.8292, + "step": 6131 + }, + { + "epoch": 0.9599248591108328, + "grad_norm": 1.7559168338775635, + "learning_rate": 5.421961550993809e-05, + "loss": 0.4858, + "step": 6132 + }, + { + "epoch": 0.9600814026299311, + "grad_norm": 1.7225316762924194, + "learning_rate": 5.421146953405018e-05, + "loss": 0.8854, + "step": 6133 + }, + { + "epoch": 0.9602379461490295, + "grad_norm": 2.782503604888916, + "learning_rate": 5.420332355816227e-05, + "loss": 0.968, + "step": 6134 + }, + { + "epoch": 0.9603944896681278, + "grad_norm": 2.175476312637329, + "learning_rate": 5.419517758227436e-05, + "loss": 0.9988, + "step": 6135 + }, + { + "epoch": 0.960551033187226, + "grad_norm": 3.492811679840088, + "learning_rate": 5.418703160638645e-05, + "loss": 1.03, + "step": 6136 + }, + { + "epoch": 0.9607075767063243, + "grad_norm": 2.9522173404693604, + "learning_rate": 5.417888563049853e-05, + "loss": 0.6228, + "step": 6137 + }, + { + "epoch": 0.9608641202254227, + "grad_norm": 2.6148457527160645, + "learning_rate": 5.417073965461062e-05, + "loss": 0.6255, + "step": 6138 + }, + { + "epoch": 0.961020663744521, + "grad_norm": 
3.409109354019165, + "learning_rate": 5.416259367872272e-05, + "loss": 1.208, + "step": 6139 + }, + { + "epoch": 0.9611772072636193, + "grad_norm": 2.1727287769317627, + "learning_rate": 5.4154447702834797e-05, + "loss": 1.15, + "step": 6140 + }, + { + "epoch": 0.9613337507827175, + "grad_norm": 5.304561614990234, + "learning_rate": 5.4146301726946894e-05, + "loss": 1.7204, + "step": 6141 + }, + { + "epoch": 0.9614902943018159, + "grad_norm": 4.356844902038574, + "learning_rate": 5.4138155751058984e-05, + "loss": 1.2534, + "step": 6142 + }, + { + "epoch": 0.9616468378209142, + "grad_norm": 2.164963960647583, + "learning_rate": 5.413000977517106e-05, + "loss": 1.1577, + "step": 6143 + }, + { + "epoch": 0.9618033813400125, + "grad_norm": 2.7476749420166016, + "learning_rate": 5.412186379928316e-05, + "loss": 1.183, + "step": 6144 + }, + { + "epoch": 0.9619599248591109, + "grad_norm": 5.879080295562744, + "learning_rate": 5.411371782339525e-05, + "loss": 1.3033, + "step": 6145 + }, + { + "epoch": 0.9621164683782092, + "grad_norm": 3.4051928520202637, + "learning_rate": 5.410557184750733e-05, + "loss": 1.3204, + "step": 6146 + }, + { + "epoch": 0.9622730118973074, + "grad_norm": 4.87657356262207, + "learning_rate": 5.4097425871619423e-05, + "loss": 0.8309, + "step": 6147 + }, + { + "epoch": 0.9624295554164057, + "grad_norm": 1.8144965171813965, + "learning_rate": 5.4089279895731514e-05, + "loss": 0.5891, + "step": 6148 + }, + { + "epoch": 0.9625860989355041, + "grad_norm": 3.290811777114868, + "learning_rate": 5.40811339198436e-05, + "loss": 0.5198, + "step": 6149 + }, + { + "epoch": 0.9627426424546024, + "grad_norm": 1.8109855651855469, + "learning_rate": 5.407298794395569e-05, + "loss": 0.9454, + "step": 6150 + }, + { + "epoch": 0.9628991859737007, + "grad_norm": 0.58514404296875, + "learning_rate": 5.406484196806778e-05, + "loss": 0.2455, + "step": 6151 + }, + { + "epoch": 0.963055729492799, + "grad_norm": 0.6561605334281921, + "learning_rate": 
5.405669599217986e-05, + "loss": 0.3631, + "step": 6152 + }, + { + "epoch": 0.9632122730118973, + "grad_norm": 0.4310479462146759, + "learning_rate": 5.404855001629195e-05, + "loss": 0.2234, + "step": 6153 + }, + { + "epoch": 0.9633688165309956, + "grad_norm": 0.9630250334739685, + "learning_rate": 5.4040404040404044e-05, + "loss": 0.3531, + "step": 6154 + }, + { + "epoch": 0.9635253600500939, + "grad_norm": 1.332391381263733, + "learning_rate": 5.403225806451613e-05, + "loss": 0.3841, + "step": 6155 + }, + { + "epoch": 0.9636819035691923, + "grad_norm": 0.8622978925704956, + "learning_rate": 5.402411208862822e-05, + "loss": 0.386, + "step": 6156 + }, + { + "epoch": 0.9638384470882906, + "grad_norm": 1.7819268703460693, + "learning_rate": 5.4015966112740315e-05, + "loss": 0.2367, + "step": 6157 + }, + { + "epoch": 0.9639949906073888, + "grad_norm": 0.9107612371444702, + "learning_rate": 5.400782013685239e-05, + "loss": 0.3141, + "step": 6158 + }, + { + "epoch": 0.9641515341264871, + "grad_norm": 1.409263253211975, + "learning_rate": 5.399967416096449e-05, + "loss": 0.2749, + "step": 6159 + }, + { + "epoch": 0.9643080776455855, + "grad_norm": 0.9187862277030945, + "learning_rate": 5.399152818507658e-05, + "loss": 0.31, + "step": 6160 + }, + { + "epoch": 0.9644646211646838, + "grad_norm": 0.6966629028320312, + "learning_rate": 5.398338220918866e-05, + "loss": 0.2227, + "step": 6161 + }, + { + "epoch": 0.9646211646837821, + "grad_norm": 1.951658010482788, + "learning_rate": 5.3975236233300754e-05, + "loss": 0.4876, + "step": 6162 + }, + { + "epoch": 0.9647777082028804, + "grad_norm": 1.404180645942688, + "learning_rate": 5.3967090257412845e-05, + "loss": 0.4373, + "step": 6163 + }, + { + "epoch": 0.9649342517219787, + "grad_norm": 1.3852275609970093, + "learning_rate": 5.395894428152493e-05, + "loss": 0.382, + "step": 6164 + }, + { + "epoch": 0.965090795241077, + "grad_norm": 0.9810996055603027, + "learning_rate": 5.395079830563702e-05, + "loss": 0.2675, + "step": 
6165 + }, + { + "epoch": 0.9652473387601753, + "grad_norm": 1.2185840606689453, + "learning_rate": 5.394265232974911e-05, + "loss": 0.3078, + "step": 6166 + }, + { + "epoch": 0.9654038822792737, + "grad_norm": 1.9976465702056885, + "learning_rate": 5.393450635386119e-05, + "loss": 0.5144, + "step": 6167 + }, + { + "epoch": 0.965560425798372, + "grad_norm": 3.026080846786499, + "learning_rate": 5.3926360377973284e-05, + "loss": 0.4272, + "step": 6168 + }, + { + "epoch": 0.9657169693174703, + "grad_norm": 1.1126292943954468, + "learning_rate": 5.3918214402085374e-05, + "loss": 0.4676, + "step": 6169 + }, + { + "epoch": 0.9658735128365685, + "grad_norm": 1.7893247604370117, + "learning_rate": 5.391006842619746e-05, + "loss": 0.3578, + "step": 6170 + }, + { + "epoch": 0.9660300563556669, + "grad_norm": 0.9957181215286255, + "learning_rate": 5.390192245030955e-05, + "loss": 0.4002, + "step": 6171 + }, + { + "epoch": 0.9661865998747652, + "grad_norm": 1.2214800119400024, + "learning_rate": 5.389377647442164e-05, + "loss": 0.376, + "step": 6172 + }, + { + "epoch": 0.9663431433938635, + "grad_norm": 1.9289844036102295, + "learning_rate": 5.388563049853372e-05, + "loss": 0.5235, + "step": 6173 + }, + { + "epoch": 0.9664996869129618, + "grad_norm": 6.206414699554443, + "learning_rate": 5.3877484522645813e-05, + "loss": 0.6596, + "step": 6174 + }, + { + "epoch": 0.9666562304320601, + "grad_norm": 1.8601716756820679, + "learning_rate": 5.386933854675791e-05, + "loss": 0.8663, + "step": 6175 + }, + { + "epoch": 0.9668127739511584, + "grad_norm": 3.135791301727295, + "learning_rate": 5.386119257086999e-05, + "loss": 0.8972, + "step": 6176 + }, + { + "epoch": 0.9669693174702567, + "grad_norm": 2.4721782207489014, + "learning_rate": 5.3853046594982085e-05, + "loss": 0.7027, + "step": 6177 + }, + { + "epoch": 0.967125860989355, + "grad_norm": 2.465195655822754, + "learning_rate": 5.3844900619094176e-05, + "loss": 0.6497, + "step": 6178 + }, + { + "epoch": 0.9672824045084534, + 
"grad_norm": 1.6721711158752441, + "learning_rate": 5.383675464320625e-05, + "loss": 0.9321, + "step": 6179 + }, + { + "epoch": 0.9674389480275517, + "grad_norm": 1.906428575515747, + "learning_rate": 5.382860866731835e-05, + "loss": 0.7006, + "step": 6180 + }, + { + "epoch": 0.9675954915466499, + "grad_norm": 2.4535622596740723, + "learning_rate": 5.382046269143044e-05, + "loss": 0.7045, + "step": 6181 + }, + { + "epoch": 0.9677520350657483, + "grad_norm": 2.036137580871582, + "learning_rate": 5.3812316715542524e-05, + "loss": 0.733, + "step": 6182 + }, + { + "epoch": 0.9679085785848466, + "grad_norm": 4.3599066734313965, + "learning_rate": 5.3804170739654615e-05, + "loss": 0.9817, + "step": 6183 + }, + { + "epoch": 0.9680651221039449, + "grad_norm": 3.715348958969116, + "learning_rate": 5.3796024763766705e-05, + "loss": 1.5623, + "step": 6184 + }, + { + "epoch": 0.9682216656230432, + "grad_norm": 2.8665544986724854, + "learning_rate": 5.378787878787879e-05, + "loss": 0.9657, + "step": 6185 + }, + { + "epoch": 0.9683782091421416, + "grad_norm": 2.9131133556365967, + "learning_rate": 5.377973281199088e-05, + "loss": 0.8994, + "step": 6186 + }, + { + "epoch": 0.9685347526612398, + "grad_norm": 2.8016703128814697, + "learning_rate": 5.377158683610297e-05, + "loss": 0.9422, + "step": 6187 + }, + { + "epoch": 0.9686912961803381, + "grad_norm": 2.4871556758880615, + "learning_rate": 5.3763440860215054e-05, + "loss": 1.1561, + "step": 6188 + }, + { + "epoch": 0.9688478396994364, + "grad_norm": 2.471621513366699, + "learning_rate": 5.3755294884327144e-05, + "loss": 0.8829, + "step": 6189 + }, + { + "epoch": 0.9690043832185348, + "grad_norm": 3.025010824203491, + "learning_rate": 5.3747148908439235e-05, + "loss": 1.1001, + "step": 6190 + }, + { + "epoch": 0.9691609267376331, + "grad_norm": 3.3834481239318848, + "learning_rate": 5.373900293255132e-05, + "loss": 0.9078, + "step": 6191 + }, + { + "epoch": 0.9693174702567313, + "grad_norm": 5.352244853973389, + 
"learning_rate": 5.373085695666341e-05, + "loss": 1.3083, + "step": 6192 + }, + { + "epoch": 0.9694740137758296, + "grad_norm": 3.93146014213562, + "learning_rate": 5.3722710980775506e-05, + "loss": 1.4014, + "step": 6193 + }, + { + "epoch": 0.969630557294928, + "grad_norm": 2.0004422664642334, + "learning_rate": 5.371456500488758e-05, + "loss": 0.7373, + "step": 6194 + }, + { + "epoch": 0.9697871008140263, + "grad_norm": 2.594266653060913, + "learning_rate": 5.3706419028999674e-05, + "loss": 1.0937, + "step": 6195 + }, + { + "epoch": 0.9699436443331246, + "grad_norm": 2.0084402561187744, + "learning_rate": 5.369827305311177e-05, + "loss": 0.4896, + "step": 6196 + }, + { + "epoch": 0.970100187852223, + "grad_norm": 2.6967530250549316, + "learning_rate": 5.369012707722385e-05, + "loss": 0.5904, + "step": 6197 + }, + { + "epoch": 0.9702567313713212, + "grad_norm": 4.651576995849609, + "learning_rate": 5.3681981101335945e-05, + "loss": 0.7844, + "step": 6198 + }, + { + "epoch": 0.9704132748904195, + "grad_norm": 3.375757932662964, + "learning_rate": 5.3673835125448036e-05, + "loss": 0.7257, + "step": 6199 + }, + { + "epoch": 0.9705698184095178, + "grad_norm": 4.032326698303223, + "learning_rate": 5.366568914956012e-05, + "loss": 0.7741, + "step": 6200 + }, + { + "epoch": 0.9707263619286162, + "grad_norm": 0.5251049399375916, + "learning_rate": 5.365754317367221e-05, + "loss": 0.307, + "step": 6201 + }, + { + "epoch": 0.9708829054477145, + "grad_norm": 0.43933144211769104, + "learning_rate": 5.36493971977843e-05, + "loss": 0.2197, + "step": 6202 + }, + { + "epoch": 0.9710394489668128, + "grad_norm": 0.6492868661880493, + "learning_rate": 5.3641251221896384e-05, + "loss": 0.3342, + "step": 6203 + }, + { + "epoch": 0.971195992485911, + "grad_norm": 0.5122601985931396, + "learning_rate": 5.3633105246008475e-05, + "loss": 0.2158, + "step": 6204 + }, + { + "epoch": 0.9713525360050094, + "grad_norm": 0.8021095991134644, + "learning_rate": 5.3624959270120566e-05, + "loss": 
0.2567, + "step": 6205 + }, + { + "epoch": 0.9715090795241077, + "grad_norm": 0.36425158381462097, + "learning_rate": 5.361681329423265e-05, + "loss": 0.2041, + "step": 6206 + }, + { + "epoch": 0.971665623043206, + "grad_norm": 1.2175166606903076, + "learning_rate": 5.360866731834474e-05, + "loss": 0.2127, + "step": 6207 + }, + { + "epoch": 0.9718221665623044, + "grad_norm": 0.7628523707389832, + "learning_rate": 5.360052134245683e-05, + "loss": 0.3752, + "step": 6208 + }, + { + "epoch": 0.9719787100814026, + "grad_norm": 1.4054656028747559, + "learning_rate": 5.3592375366568914e-05, + "loss": 0.2851, + "step": 6209 + }, + { + "epoch": 0.9721352536005009, + "grad_norm": 2.210369110107422, + "learning_rate": 5.3584229390681005e-05, + "loss": 0.5025, + "step": 6210 + }, + { + "epoch": 0.9722917971195992, + "grad_norm": 0.9684845209121704, + "learning_rate": 5.35760834147931e-05, + "loss": 0.5691, + "step": 6211 + }, + { + "epoch": 0.9724483406386976, + "grad_norm": 1.3074193000793457, + "learning_rate": 5.356793743890518e-05, + "loss": 0.4951, + "step": 6212 + }, + { + "epoch": 0.9726048841577959, + "grad_norm": 1.0090785026550293, + "learning_rate": 5.355979146301727e-05, + "loss": 0.4583, + "step": 6213 + }, + { + "epoch": 0.9727614276768942, + "grad_norm": 0.8254941701889038, + "learning_rate": 5.355164548712937e-05, + "loss": 0.3174, + "step": 6214 + }, + { + "epoch": 0.9729179711959924, + "grad_norm": 1.288428544998169, + "learning_rate": 5.3543499511241444e-05, + "loss": 0.2987, + "step": 6215 + }, + { + "epoch": 0.9730745147150908, + "grad_norm": 5.458842754364014, + "learning_rate": 5.353535353535354e-05, + "loss": 0.5298, + "step": 6216 + }, + { + "epoch": 0.9732310582341891, + "grad_norm": 1.452070713043213, + "learning_rate": 5.352720755946563e-05, + "loss": 0.6011, + "step": 6217 + }, + { + "epoch": 0.9733876017532874, + "grad_norm": 1.6317099332809448, + "learning_rate": 5.3519061583577715e-05, + "loss": 0.5085, + "step": 6218 + }, + { + "epoch": 
0.9735441452723858, + "grad_norm": 1.085327386856079, + "learning_rate": 5.3510915607689806e-05, + "loss": 0.4272, + "step": 6219 + }, + { + "epoch": 0.9737006887914841, + "grad_norm": 2.039574384689331, + "learning_rate": 5.3502769631801896e-05, + "loss": 0.3443, + "step": 6220 + }, + { + "epoch": 0.9738572323105823, + "grad_norm": 1.2183775901794434, + "learning_rate": 5.349462365591398e-05, + "loss": 0.3136, + "step": 6221 + }, + { + "epoch": 0.9740137758296806, + "grad_norm": 2.1368536949157715, + "learning_rate": 5.348647768002607e-05, + "loss": 0.6431, + "step": 6222 + }, + { + "epoch": 0.974170319348779, + "grad_norm": 1.8204090595245361, + "learning_rate": 5.347833170413816e-05, + "loss": 0.6956, + "step": 6223 + }, + { + "epoch": 0.9743268628678773, + "grad_norm": 1.8916419744491577, + "learning_rate": 5.3470185728250245e-05, + "loss": 0.414, + "step": 6224 + }, + { + "epoch": 0.9744834063869756, + "grad_norm": 1.6738173961639404, + "learning_rate": 5.3462039752362335e-05, + "loss": 0.3837, + "step": 6225 + }, + { + "epoch": 0.9746399499060739, + "grad_norm": 1.9560691118240356, + "learning_rate": 5.3453893776474426e-05, + "loss": 0.3843, + "step": 6226 + }, + { + "epoch": 0.9747964934251722, + "grad_norm": 1.9134124517440796, + "learning_rate": 5.344574780058651e-05, + "loss": 0.7935, + "step": 6227 + }, + { + "epoch": 0.9749530369442705, + "grad_norm": 2.0428993701934814, + "learning_rate": 5.34376018246986e-05, + "loss": 0.7331, + "step": 6228 + }, + { + "epoch": 0.9751095804633688, + "grad_norm": 1.6960750818252563, + "learning_rate": 5.34294558488107e-05, + "loss": 0.4258, + "step": 6229 + }, + { + "epoch": 0.9752661239824671, + "grad_norm": 3.177854061126709, + "learning_rate": 5.3421309872922774e-05, + "loss": 0.793, + "step": 6230 + }, + { + "epoch": 0.9754226675015655, + "grad_norm": 2.2938034534454346, + "learning_rate": 5.3413163897034865e-05, + "loss": 0.5263, + "step": 6231 + }, + { + "epoch": 0.9755792110206637, + "grad_norm": 
3.137784242630005, + "learning_rate": 5.340501792114696e-05, + "loss": 0.7294, + "step": 6232 + }, + { + "epoch": 0.975735754539762, + "grad_norm": 1.9419035911560059, + "learning_rate": 5.339687194525904e-05, + "loss": 0.3644, + "step": 6233 + }, + { + "epoch": 0.9758922980588604, + "grad_norm": 4.391149044036865, + "learning_rate": 5.3388725969371137e-05, + "loss": 0.8974, + "step": 6234 + }, + { + "epoch": 0.9760488415779587, + "grad_norm": 2.6669795513153076, + "learning_rate": 5.338057999348323e-05, + "loss": 0.4477, + "step": 6235 + }, + { + "epoch": 0.976205385097057, + "grad_norm": 2.2228808403015137, + "learning_rate": 5.337243401759531e-05, + "loss": 0.9199, + "step": 6236 + }, + { + "epoch": 0.9763619286161553, + "grad_norm": 3.3278636932373047, + "learning_rate": 5.33642880417074e-05, + "loss": 0.9817, + "step": 6237 + }, + { + "epoch": 0.9765184721352536, + "grad_norm": 4.683374881744385, + "learning_rate": 5.335614206581949e-05, + "loss": 1.1879, + "step": 6238 + }, + { + "epoch": 0.9766750156543519, + "grad_norm": 3.6261773109436035, + "learning_rate": 5.3347996089931576e-05, + "loss": 0.8649, + "step": 6239 + }, + { + "epoch": 0.9768315591734502, + "grad_norm": 4.163529396057129, + "learning_rate": 5.3339850114043666e-05, + "loss": 1.8066, + "step": 6240 + }, + { + "epoch": 0.9769881026925485, + "grad_norm": 3.5933308601379395, + "learning_rate": 5.333170413815576e-05, + "loss": 1.132, + "step": 6241 + }, + { + "epoch": 0.9771446462116469, + "grad_norm": 1.6846421957015991, + "learning_rate": 5.332355816226784e-05, + "loss": 0.992, + "step": 6242 + }, + { + "epoch": 0.9773011897307452, + "grad_norm": 3.6472795009613037, + "learning_rate": 5.331541218637993e-05, + "loss": 1.083, + "step": 6243 + }, + { + "epoch": 0.9774577332498434, + "grad_norm": 2.1556003093719482, + "learning_rate": 5.330726621049202e-05, + "loss": 0.9381, + "step": 6244 + }, + { + "epoch": 0.9776142767689417, + "grad_norm": 2.8695013523101807, + "learning_rate": 
5.3299120234604105e-05, + "loss": 0.6569, + "step": 6245 + }, + { + "epoch": 0.9777708202880401, + "grad_norm": 4.105661869049072, + "learning_rate": 5.3290974258716196e-05, + "loss": 1.3522, + "step": 6246 + }, + { + "epoch": 0.9779273638071384, + "grad_norm": 4.950375556945801, + "learning_rate": 5.328282828282829e-05, + "loss": 1.3833, + "step": 6247 + }, + { + "epoch": 0.9780839073262367, + "grad_norm": 2.9692625999450684, + "learning_rate": 5.327468230694037e-05, + "loss": 1.2247, + "step": 6248 + }, + { + "epoch": 0.978240450845335, + "grad_norm": 5.4289631843566895, + "learning_rate": 5.326653633105246e-05, + "loss": 0.8253, + "step": 6249 + }, + { + "epoch": 0.9783969943644333, + "grad_norm": 3.727268695831299, + "learning_rate": 5.325839035516456e-05, + "loss": 1.1448, + "step": 6250 + }, + { + "epoch": 0.9785535378835316, + "grad_norm": 0.5546844601631165, + "learning_rate": 5.3250244379276635e-05, + "loss": 0.3042, + "step": 6251 + }, + { + "epoch": 0.9787100814026299, + "grad_norm": 0.5170332193374634, + "learning_rate": 5.324209840338873e-05, + "loss": 0.239, + "step": 6252 + }, + { + "epoch": 0.9788666249217283, + "grad_norm": 0.6411470174789429, + "learning_rate": 5.323395242750082e-05, + "loss": 0.2682, + "step": 6253 + }, + { + "epoch": 0.9790231684408266, + "grad_norm": 0.6074718832969666, + "learning_rate": 5.32258064516129e-05, + "loss": 0.2083, + "step": 6254 + }, + { + "epoch": 0.9791797119599248, + "grad_norm": 0.7093194127082825, + "learning_rate": 5.3217660475725e-05, + "loss": 0.2199, + "step": 6255 + }, + { + "epoch": 0.9793362554790231, + "grad_norm": 0.5651874542236328, + "learning_rate": 5.320951449983709e-05, + "loss": 0.2272, + "step": 6256 + }, + { + "epoch": 0.9794927989981215, + "grad_norm": 0.5808147192001343, + "learning_rate": 5.320136852394917e-05, + "loss": 0.2406, + "step": 6257 + }, + { + "epoch": 0.9796493425172198, + "grad_norm": 0.9861701130867004, + "learning_rate": 5.319322254806126e-05, + "loss": 0.3747, + "step": 
6258 + }, + { + "epoch": 0.9798058860363181, + "grad_norm": 2.314011573791504, + "learning_rate": 5.318507657217335e-05, + "loss": 0.2498, + "step": 6259 + }, + { + "epoch": 0.9799624295554165, + "grad_norm": 2.124988555908203, + "learning_rate": 5.3176930596285436e-05, + "loss": 0.5024, + "step": 6260 + }, + { + "epoch": 0.9801189730745147, + "grad_norm": 0.8064952492713928, + "learning_rate": 5.3168784620397527e-05, + "loss": 0.3032, + "step": 6261 + }, + { + "epoch": 0.980275516593613, + "grad_norm": 0.7486231923103333, + "learning_rate": 5.316063864450962e-05, + "loss": 0.2881, + "step": 6262 + }, + { + "epoch": 0.9804320601127113, + "grad_norm": 1.3491332530975342, + "learning_rate": 5.31524926686217e-05, + "loss": 0.4531, + "step": 6263 + }, + { + "epoch": 0.9805886036318097, + "grad_norm": 2.470372200012207, + "learning_rate": 5.314434669273379e-05, + "loss": 0.4992, + "step": 6264 + }, + { + "epoch": 0.980745147150908, + "grad_norm": 0.9447457790374756, + "learning_rate": 5.313620071684589e-05, + "loss": 0.4262, + "step": 6265 + }, + { + "epoch": 0.9809016906700062, + "grad_norm": 1.1542634963989258, + "learning_rate": 5.3128054740957966e-05, + "loss": 0.369, + "step": 6266 + }, + { + "epoch": 0.9810582341891045, + "grad_norm": 2.5110297203063965, + "learning_rate": 5.3119908765070056e-05, + "loss": 0.4338, + "step": 6267 + }, + { + "epoch": 0.9812147777082029, + "grad_norm": 1.6778266429901123, + "learning_rate": 5.3111762789182153e-05, + "loss": 0.4859, + "step": 6268 + }, + { + "epoch": 0.9813713212273012, + "grad_norm": 2.4591615200042725, + "learning_rate": 5.310361681329423e-05, + "loss": 0.6113, + "step": 6269 + }, + { + "epoch": 0.9815278647463995, + "grad_norm": 1.4726178646087646, + "learning_rate": 5.309547083740633e-05, + "loss": 0.6057, + "step": 6270 + }, + { + "epoch": 0.9816844082654979, + "grad_norm": 1.1679779291152954, + "learning_rate": 5.308732486151842e-05, + "loss": 0.349, + "step": 6271 + }, + { + "epoch": 0.9818409517845961, + 
"grad_norm": 2.513794422149658, + "learning_rate": 5.3079178885630495e-05, + "loss": 0.6433, + "step": 6272 + }, + { + "epoch": 0.9819974953036944, + "grad_norm": 1.3962961435317993, + "learning_rate": 5.307103290974259e-05, + "loss": 0.3674, + "step": 6273 + }, + { + "epoch": 0.9821540388227927, + "grad_norm": 2.459113836288452, + "learning_rate": 5.306288693385468e-05, + "loss": 0.4433, + "step": 6274 + }, + { + "epoch": 0.9823105823418911, + "grad_norm": 1.5913830995559692, + "learning_rate": 5.305474095796677e-05, + "loss": 0.555, + "step": 6275 + }, + { + "epoch": 0.9824671258609894, + "grad_norm": 3.978336811065674, + "learning_rate": 5.304659498207886e-05, + "loss": 0.6247, + "step": 6276 + }, + { + "epoch": 0.9826236693800877, + "grad_norm": 2.653175115585327, + "learning_rate": 5.303844900619095e-05, + "loss": 0.7433, + "step": 6277 + }, + { + "epoch": 0.9827802128991859, + "grad_norm": 4.027853965759277, + "learning_rate": 5.303030303030303e-05, + "loss": 0.9498, + "step": 6278 + }, + { + "epoch": 0.9829367564182843, + "grad_norm": 2.0566391944885254, + "learning_rate": 5.302215705441512e-05, + "loss": 0.5884, + "step": 6279 + }, + { + "epoch": 0.9830932999373826, + "grad_norm": 1.4902759790420532, + "learning_rate": 5.301401107852721e-05, + "loss": 0.573, + "step": 6280 + }, + { + "epoch": 0.9832498434564809, + "grad_norm": 2.907083034515381, + "learning_rate": 5.3005865102639296e-05, + "loss": 0.9744, + "step": 6281 + }, + { + "epoch": 0.9834063869755792, + "grad_norm": 1.751982569694519, + "learning_rate": 5.299771912675139e-05, + "loss": 0.3195, + "step": 6282 + }, + { + "epoch": 0.9835629304946775, + "grad_norm": 2.9293994903564453, + "learning_rate": 5.298957315086348e-05, + "loss": 0.7187, + "step": 6283 + }, + { + "epoch": 0.9837194740137758, + "grad_norm": 2.9749984741210938, + "learning_rate": 5.298142717497556e-05, + "loss": 0.956, + "step": 6284 + }, + { + "epoch": 0.9838760175328741, + "grad_norm": 3.788294792175293, + "learning_rate": 
5.297328119908765e-05, + "loss": 0.7454, + "step": 6285 + }, + { + "epoch": 0.9840325610519725, + "grad_norm": 7.279014587402344, + "learning_rate": 5.296513522319975e-05, + "loss": 0.9063, + "step": 6286 + }, + { + "epoch": 0.9841891045710708, + "grad_norm": 4.103922367095947, + "learning_rate": 5.2956989247311826e-05, + "loss": 0.8321, + "step": 6287 + }, + { + "epoch": 0.9843456480901691, + "grad_norm": 3.3531363010406494, + "learning_rate": 5.294884327142392e-05, + "loss": 0.4936, + "step": 6288 + }, + { + "epoch": 0.9845021916092673, + "grad_norm": 2.205991744995117, + "learning_rate": 5.2940697295536014e-05, + "loss": 0.9578, + "step": 6289 + }, + { + "epoch": 0.9846587351283657, + "grad_norm": 3.112929582595825, + "learning_rate": 5.293255131964809e-05, + "loss": 1.3485, + "step": 6290 + }, + { + "epoch": 0.984815278647464, + "grad_norm": 5.545471668243408, + "learning_rate": 5.292440534376019e-05, + "loss": 1.304, + "step": 6291 + }, + { + "epoch": 0.9849718221665623, + "grad_norm": 1.9816195964813232, + "learning_rate": 5.291625936787228e-05, + "loss": 0.8888, + "step": 6292 + }, + { + "epoch": 0.9851283656856606, + "grad_norm": 2.9957804679870605, + "learning_rate": 5.290811339198436e-05, + "loss": 0.7084, + "step": 6293 + }, + { + "epoch": 0.985284909204759, + "grad_norm": 4.34970760345459, + "learning_rate": 5.289996741609645e-05, + "loss": 1.2371, + "step": 6294 + }, + { + "epoch": 0.9854414527238572, + "grad_norm": 2.757340908050537, + "learning_rate": 5.2891821440208543e-05, + "loss": 1.4999, + "step": 6295 + }, + { + "epoch": 0.9855979962429555, + "grad_norm": 3.137096405029297, + "learning_rate": 5.288367546432063e-05, + "loss": 0.9133, + "step": 6296 + }, + { + "epoch": 0.9857545397620538, + "grad_norm": 5.948956489562988, + "learning_rate": 5.287552948843272e-05, + "loss": 0.7545, + "step": 6297 + }, + { + "epoch": 0.9859110832811522, + "grad_norm": 3.743398666381836, + "learning_rate": 5.286738351254481e-05, + "loss": 1.1177, + "step": 6298 + }, 
+ { + "epoch": 0.9860676268002505, + "grad_norm": 3.1029443740844727, + "learning_rate": 5.285923753665689e-05, + "loss": 0.5795, + "step": 6299 + }, + { + "epoch": 0.9862241703193487, + "grad_norm": 1.4705368280410767, + "learning_rate": 5.285109156076898e-05, + "loss": 0.562, + "step": 6300 + }, + { + "epoch": 0.986380713838447, + "grad_norm": 0.8839526176452637, + "learning_rate": 5.284294558488107e-05, + "loss": 0.3511, + "step": 6301 + }, + { + "epoch": 0.9865372573575454, + "grad_norm": 0.5678699016571045, + "learning_rate": 5.283479960899316e-05, + "loss": 0.2379, + "step": 6302 + }, + { + "epoch": 0.9866938008766437, + "grad_norm": 0.7273358702659607, + "learning_rate": 5.282665363310525e-05, + "loss": 0.297, + "step": 6303 + }, + { + "epoch": 0.986850344395742, + "grad_norm": 0.42303794622421265, + "learning_rate": 5.2818507657217345e-05, + "loss": 0.233, + "step": 6304 + }, + { + "epoch": 0.9870068879148404, + "grad_norm": 0.48728495836257935, + "learning_rate": 5.281036168132942e-05, + "loss": 0.1759, + "step": 6305 + }, + { + "epoch": 0.9871634314339386, + "grad_norm": 0.9152201414108276, + "learning_rate": 5.280221570544152e-05, + "loss": 0.3198, + "step": 6306 + }, + { + "epoch": 0.9873199749530369, + "grad_norm": 0.7592007517814636, + "learning_rate": 5.279406972955361e-05, + "loss": 0.3827, + "step": 6307 + }, + { + "epoch": 0.9874765184721352, + "grad_norm": 0.6978158950805664, + "learning_rate": 5.2785923753665686e-05, + "loss": 0.3064, + "step": 6308 + }, + { + "epoch": 0.9876330619912336, + "grad_norm": 2.6744179725646973, + "learning_rate": 5.2777777777777784e-05, + "loss": 0.3331, + "step": 6309 + }, + { + "epoch": 0.9877896055103319, + "grad_norm": 0.9452757239341736, + "learning_rate": 5.2769631801889874e-05, + "loss": 0.2836, + "step": 6310 + }, + { + "epoch": 0.9879461490294302, + "grad_norm": 1.1801416873931885, + "learning_rate": 5.276148582600196e-05, + "loss": 0.2489, + "step": 6311 + }, + { + "epoch": 0.9881026925485284, + 
"grad_norm": 1.6981850862503052, + "learning_rate": 5.275333985011405e-05, + "loss": 0.3923, + "step": 6312 + }, + { + "epoch": 0.9882592360676268, + "grad_norm": 1.288787841796875, + "learning_rate": 5.274519387422614e-05, + "loss": 0.3063, + "step": 6313 + }, + { + "epoch": 0.9884157795867251, + "grad_norm": 1.1000502109527588, + "learning_rate": 5.273704789833822e-05, + "loss": 0.4541, + "step": 6314 + }, + { + "epoch": 0.9885723231058234, + "grad_norm": 1.716823935508728, + "learning_rate": 5.272890192245031e-05, + "loss": 0.4176, + "step": 6315 + }, + { + "epoch": 0.9887288666249218, + "grad_norm": 1.871440052986145, + "learning_rate": 5.2720755946562404e-05, + "loss": 0.4938, + "step": 6316 + }, + { + "epoch": 0.98888541014402, + "grad_norm": 1.2430906295776367, + "learning_rate": 5.271260997067449e-05, + "loss": 0.2822, + "step": 6317 + }, + { + "epoch": 0.9890419536631183, + "grad_norm": 5.997138023376465, + "learning_rate": 5.270446399478658e-05, + "loss": 0.8798, + "step": 6318 + }, + { + "epoch": 0.9891984971822166, + "grad_norm": 1.4383291006088257, + "learning_rate": 5.269631801889867e-05, + "loss": 0.5445, + "step": 6319 + }, + { + "epoch": 0.989355040701315, + "grad_norm": 1.3459978103637695, + "learning_rate": 5.268817204301075e-05, + "loss": 0.3405, + "step": 6320 + }, + { + "epoch": 0.9895115842204133, + "grad_norm": 1.1621296405792236, + "learning_rate": 5.268002606712284e-05, + "loss": 0.3061, + "step": 6321 + }, + { + "epoch": 0.9896681277395116, + "grad_norm": 2.7933435440063477, + "learning_rate": 5.267188009123494e-05, + "loss": 0.9078, + "step": 6322 + }, + { + "epoch": 0.9898246712586098, + "grad_norm": 2.2307493686676025, + "learning_rate": 5.266373411534702e-05, + "loss": 0.6005, + "step": 6323 + }, + { + "epoch": 0.9899812147777082, + "grad_norm": 2.7686495780944824, + "learning_rate": 5.265558813945911e-05, + "loss": 0.4916, + "step": 6324 + }, + { + "epoch": 0.9901377582968065, + "grad_norm": 3.216503143310547, + "learning_rate": 
5.2647442163571205e-05, + "loss": 0.5527, + "step": 6325 + }, + { + "epoch": 0.9902943018159048, + "grad_norm": 2.2012248039245605, + "learning_rate": 5.263929618768328e-05, + "loss": 0.454, + "step": 6326 + }, + { + "epoch": 0.9904508453350032, + "grad_norm": 1.627835750579834, + "learning_rate": 5.263115021179538e-05, + "loss": 0.5205, + "step": 6327 + }, + { + "epoch": 0.9906073888541015, + "grad_norm": 1.9550319910049438, + "learning_rate": 5.262300423590747e-05, + "loss": 0.7902, + "step": 6328 + }, + { + "epoch": 0.9907639323731997, + "grad_norm": 2.0032527446746826, + "learning_rate": 5.2614858260019554e-05, + "loss": 0.5143, + "step": 6329 + }, + { + "epoch": 0.990920475892298, + "grad_norm": 3.2040085792541504, + "learning_rate": 5.2606712284131644e-05, + "loss": 0.8005, + "step": 6330 + }, + { + "epoch": 0.9910770194113964, + "grad_norm": 2.652134895324707, + "learning_rate": 5.2598566308243735e-05, + "loss": 0.4152, + "step": 6331 + }, + { + "epoch": 0.9912335629304947, + "grad_norm": 3.71842098236084, + "learning_rate": 5.259042033235582e-05, + "loss": 0.7756, + "step": 6332 + }, + { + "epoch": 0.991390106449593, + "grad_norm": 3.174973964691162, + "learning_rate": 5.258227435646791e-05, + "loss": 0.9111, + "step": 6333 + }, + { + "epoch": 0.9915466499686914, + "grad_norm": 5.226846218109131, + "learning_rate": 5.257412838058e-05, + "loss": 0.5267, + "step": 6334 + }, + { + "epoch": 0.9917031934877896, + "grad_norm": 1.8572748899459839, + "learning_rate": 5.256598240469208e-05, + "loss": 0.7916, + "step": 6335 + }, + { + "epoch": 0.9918597370068879, + "grad_norm": 2.6535792350769043, + "learning_rate": 5.2557836428804174e-05, + "loss": 0.8172, + "step": 6336 + }, + { + "epoch": 0.9920162805259862, + "grad_norm": 2.0554020404815674, + "learning_rate": 5.2549690452916264e-05, + "loss": 0.6396, + "step": 6337 + }, + { + "epoch": 0.9921728240450846, + "grad_norm": 4.514573574066162, + "learning_rate": 5.254154447702835e-05, + "loss": 1.3195, + "step": 6338 
+ }, + { + "epoch": 0.9923293675641829, + "grad_norm": 8.749300003051758, + "learning_rate": 5.253339850114044e-05, + "loss": 1.3317, + "step": 6339 + }, + { + "epoch": 0.9924859110832811, + "grad_norm": 3.8025007247924805, + "learning_rate": 5.2525252525252536e-05, + "loss": 0.9188, + "step": 6340 + }, + { + "epoch": 0.9926424546023794, + "grad_norm": 4.563300132751465, + "learning_rate": 5.251710654936461e-05, + "loss": 1.0612, + "step": 6341 + }, + { + "epoch": 0.9927989981214778, + "grad_norm": 2.5941476821899414, + "learning_rate": 5.25089605734767e-05, + "loss": 0.7113, + "step": 6342 + }, + { + "epoch": 0.9929555416405761, + "grad_norm": 4.095574378967285, + "learning_rate": 5.25008145975888e-05, + "loss": 1.1405, + "step": 6343 + }, + { + "epoch": 0.9931120851596744, + "grad_norm": 2.6061956882476807, + "learning_rate": 5.249266862170088e-05, + "loss": 1.374, + "step": 6344 + }, + { + "epoch": 0.9932686286787727, + "grad_norm": 2.9273109436035156, + "learning_rate": 5.2484522645812975e-05, + "loss": 1.0698, + "step": 6345 + }, + { + "epoch": 0.993425172197871, + "grad_norm": 2.786961317062378, + "learning_rate": 5.2476376669925065e-05, + "loss": 0.7303, + "step": 6346 + }, + { + "epoch": 0.9935817157169693, + "grad_norm": 7.7008233070373535, + "learning_rate": 5.246823069403715e-05, + "loss": 1.0253, + "step": 6347 + }, + { + "epoch": 0.9937382592360676, + "grad_norm": 2.1601271629333496, + "learning_rate": 5.246008471814924e-05, + "loss": 0.4077, + "step": 6348 + }, + { + "epoch": 0.993894802755166, + "grad_norm": 3.5852668285369873, + "learning_rate": 5.245193874226133e-05, + "loss": 1.0957, + "step": 6349 + }, + { + "epoch": 0.9940513462742643, + "grad_norm": 4.761088848114014, + "learning_rate": 5.2443792766373414e-05, + "loss": 1.6317, + "step": 6350 + }, + { + "epoch": 0.9942078897933626, + "grad_norm": 0.6365141868591309, + "learning_rate": 5.2435646790485504e-05, + "loss": 0.2198, + "step": 6351 + }, + { + "epoch": 0.9943644333124608, + "grad_norm": 
1.775795578956604, + "learning_rate": 5.2427500814597595e-05, + "loss": 0.3368, + "step": 6352 + }, + { + "epoch": 0.9945209768315592, + "grad_norm": 0.6703605651855469, + "learning_rate": 5.241935483870968e-05, + "loss": 0.2632, + "step": 6353 + }, + { + "epoch": 0.9946775203506575, + "grad_norm": 0.6997856497764587, + "learning_rate": 5.241120886282177e-05, + "loss": 0.2458, + "step": 6354 + }, + { + "epoch": 0.9948340638697558, + "grad_norm": 0.9928366541862488, + "learning_rate": 5.240306288693386e-05, + "loss": 0.2819, + "step": 6355 + }, + { + "epoch": 0.9949906073888541, + "grad_norm": 0.834804356098175, + "learning_rate": 5.2394916911045944e-05, + "loss": 0.3273, + "step": 6356 + }, + { + "epoch": 0.9951471509079524, + "grad_norm": 0.765377938747406, + "learning_rate": 5.2386770935158034e-05, + "loss": 0.2566, + "step": 6357 + }, + { + "epoch": 0.9953036944270507, + "grad_norm": 0.9819781184196472, + "learning_rate": 5.237862495927013e-05, + "loss": 0.2805, + "step": 6358 + }, + { + "epoch": 0.995460237946149, + "grad_norm": 1.2002366781234741, + "learning_rate": 5.237047898338221e-05, + "loss": 0.3549, + "step": 6359 + }, + { + "epoch": 0.9956167814652473, + "grad_norm": 1.0426875352859497, + "learning_rate": 5.23623330074943e-05, + "loss": 0.4118, + "step": 6360 + }, + { + "epoch": 0.9957733249843457, + "grad_norm": 0.9702119827270508, + "learning_rate": 5.2354187031606396e-05, + "loss": 0.4415, + "step": 6361 + }, + { + "epoch": 0.995929868503444, + "grad_norm": 1.366406798362732, + "learning_rate": 5.234604105571847e-05, + "loss": 0.6206, + "step": 6362 + }, + { + "epoch": 0.9960864120225422, + "grad_norm": 2.292806386947632, + "learning_rate": 5.233789507983057e-05, + "loss": 0.5298, + "step": 6363 + }, + { + "epoch": 0.9962429555416406, + "grad_norm": 2.9922564029693604, + "learning_rate": 5.232974910394266e-05, + "loss": 0.4761, + "step": 6364 + }, + { + "epoch": 0.9963994990607389, + "grad_norm": 1.8331172466278076, + "learning_rate": 
5.2321603128054745e-05, + "loss": 0.6217, + "step": 6365 + }, + { + "epoch": 0.9965560425798372, + "grad_norm": 2.2292113304138184, + "learning_rate": 5.2313457152166835e-05, + "loss": 0.3351, + "step": 6366 + }, + { + "epoch": 0.9967125860989355, + "grad_norm": 1.964722752571106, + "learning_rate": 5.2305311176278926e-05, + "loss": 0.5092, + "step": 6367 + }, + { + "epoch": 0.9968691296180339, + "grad_norm": 2.579782485961914, + "learning_rate": 5.229716520039101e-05, + "loss": 0.4554, + "step": 6368 + }, + { + "epoch": 0.9970256731371321, + "grad_norm": 1.815104365348816, + "learning_rate": 5.22890192245031e-05, + "loss": 0.5769, + "step": 6369 + }, + { + "epoch": 0.9971822166562304, + "grad_norm": 1.8351359367370605, + "learning_rate": 5.228087324861519e-05, + "loss": 0.6557, + "step": 6370 + }, + { + "epoch": 0.9973387601753287, + "grad_norm": 1.935080885887146, + "learning_rate": 5.2272727272727274e-05, + "loss": 0.5604, + "step": 6371 + }, + { + "epoch": 0.9974953036944271, + "grad_norm": 2.9325780868530273, + "learning_rate": 5.2264581296839365e-05, + "loss": 0.7199, + "step": 6372 + }, + { + "epoch": 0.9976518472135254, + "grad_norm": 2.5574018955230713, + "learning_rate": 5.2256435320951455e-05, + "loss": 0.9383, + "step": 6373 + }, + { + "epoch": 0.9978083907326236, + "grad_norm": 3.0656120777130127, + "learning_rate": 5.224828934506354e-05, + "loss": 1.1333, + "step": 6374 + }, + { + "epoch": 0.997964934251722, + "grad_norm": 2.586146593093872, + "learning_rate": 5.224014336917563e-05, + "loss": 0.7367, + "step": 6375 + }, + { + "epoch": 0.9981214777708203, + "grad_norm": 4.800915718078613, + "learning_rate": 5.223199739328773e-05, + "loss": 0.8591, + "step": 6376 + }, + { + "epoch": 0.9982780212899186, + "grad_norm": 2.818758726119995, + "learning_rate": 5.2223851417399804e-05, + "loss": 1.0466, + "step": 6377 + }, + { + "epoch": 0.9984345648090169, + "grad_norm": 2.6604623794555664, + "learning_rate": 5.2215705441511894e-05, + "loss": 0.4844, + "step": 
6378 + }, + { + "epoch": 0.9985911083281153, + "grad_norm": 2.3141117095947266, + "learning_rate": 5.220755946562399e-05, + "loss": 1.0606, + "step": 6379 + }, + { + "epoch": 0.9987476518472135, + "grad_norm": 3.6309938430786133, + "learning_rate": 5.219941348973607e-05, + "loss": 0.8984, + "step": 6380 + }, + { + "epoch": 0.9989041953663118, + "grad_norm": 6.009674549102783, + "learning_rate": 5.2191267513848166e-05, + "loss": 1.5284, + "step": 6381 + }, + { + "epoch": 0.9990607388854101, + "grad_norm": 4.902308940887451, + "learning_rate": 5.2183121537960257e-05, + "loss": 0.873, + "step": 6382 + }, + { + "epoch": 0.9992172824045085, + "grad_norm": 2.413064479827881, + "learning_rate": 5.2174975562072334e-05, + "loss": 1.2835, + "step": 6383 + }, + { + "epoch": 0.9993738259236068, + "grad_norm": 2.246983766555786, + "learning_rate": 5.216682958618443e-05, + "loss": 1.0306, + "step": 6384 + }, + { + "epoch": 0.9995303694427051, + "grad_norm": 2.670351505279541, + "learning_rate": 5.215868361029652e-05, + "loss": 0.5182, + "step": 6385 + }, + { + "epoch": 0.9996869129618033, + "grad_norm": 1.8352153301239014, + "learning_rate": 5.2150537634408605e-05, + "loss": 0.8853, + "step": 6386 + }, + { + "epoch": 0.9998434564809017, + "grad_norm": 4.802717685699463, + "learning_rate": 5.2142391658520696e-05, + "loss": 1.6502, + "step": 6387 + }, + { + "epoch": 1.0, + "grad_norm": 5.534973621368408, + "learning_rate": 5.2134245682632786e-05, + "loss": 1.0462, + "step": 6388 + }, + { + "epoch": 1.0001565435190982, + "grad_norm": 0.44646477699279785, + "learning_rate": 5.212609970674487e-05, + "loss": 0.2331, + "step": 6389 + }, + { + "epoch": 1.0003130870381967, + "grad_norm": 0.6952627897262573, + "learning_rate": 5.211795373085696e-05, + "loss": 0.1864, + "step": 6390 + }, + { + "epoch": 1.0004696305572949, + "grad_norm": 0.6646711826324463, + "learning_rate": 5.210980775496905e-05, + "loss": 0.1911, + "step": 6391 + }, + { + "epoch": 1.0006261740763933, + "grad_norm": 
0.6875047087669373, + "learning_rate": 5.2101661779081135e-05, + "loss": 0.2753, + "step": 6392 + }, + { + "epoch": 1.0007827175954915, + "grad_norm": 0.4401170313358307, + "learning_rate": 5.2093515803193225e-05, + "loss": 0.2439, + "step": 6393 + }, + { + "epoch": 1.0009392611145898, + "grad_norm": 0.743806004524231, + "learning_rate": 5.208536982730532e-05, + "loss": 0.2519, + "step": 6394 + }, + { + "epoch": 1.0010958046336882, + "grad_norm": 0.7445011138916016, + "learning_rate": 5.20772238514174e-05, + "loss": 0.3562, + "step": 6395 + }, + { + "epoch": 1.0012523481527864, + "grad_norm": 0.7056273221969604, + "learning_rate": 5.206907787552949e-05, + "loss": 0.2531, + "step": 6396 + }, + { + "epoch": 1.0014088916718848, + "grad_norm": 0.8415284752845764, + "learning_rate": 5.2060931899641574e-05, + "loss": 0.3519, + "step": 6397 + }, + { + "epoch": 1.001565435190983, + "grad_norm": 0.5610738396644592, + "learning_rate": 5.2052785923753664e-05, + "loss": 0.1767, + "step": 6398 + }, + { + "epoch": 1.0017219787100815, + "grad_norm": 1.275416374206543, + "learning_rate": 5.204463994786576e-05, + "loss": 0.3685, + "step": 6399 + }, + { + "epoch": 1.0018785222291797, + "grad_norm": 0.7698217034339905, + "learning_rate": 5.203649397197784e-05, + "loss": 0.2677, + "step": 6400 + }, + { + "epoch": 1.002035065748278, + "grad_norm": 1.1420444250106812, + "learning_rate": 5.202834799608993e-05, + "loss": 0.449, + "step": 6401 + }, + { + "epoch": 1.0021916092673764, + "grad_norm": 1.5269135236740112, + "learning_rate": 5.2020202020202026e-05, + "loss": 0.6345, + "step": 6402 + }, + { + "epoch": 1.0023481527864746, + "grad_norm": 1.16241455078125, + "learning_rate": 5.20120560443141e-05, + "loss": 0.2923, + "step": 6403 + }, + { + "epoch": 1.002504696305573, + "grad_norm": 1.8474535942077637, + "learning_rate": 5.20039100684262e-05, + "loss": 0.5143, + "step": 6404 + }, + { + "epoch": 1.0026612398246713, + "grad_norm": 1.480614185333252, + "learning_rate": 
5.199576409253829e-05, + "loss": 0.3496, + "step": 6405 + }, + { + "epoch": 1.0028177833437695, + "grad_norm": 1.2145130634307861, + "learning_rate": 5.1987618116650375e-05, + "loss": 0.3695, + "step": 6406 + }, + { + "epoch": 1.002974326862868, + "grad_norm": 2.06465220451355, + "learning_rate": 5.1979472140762465e-05, + "loss": 0.4223, + "step": 6407 + }, + { + "epoch": 1.0031308703819661, + "grad_norm": 4.258430004119873, + "learning_rate": 5.1971326164874556e-05, + "loss": 0.3094, + "step": 6408 + }, + { + "epoch": 1.0032874139010646, + "grad_norm": 0.9647040367126465, + "learning_rate": 5.196318018898664e-05, + "loss": 0.3286, + "step": 6409 + }, + { + "epoch": 1.0034439574201628, + "grad_norm": 1.3660646677017212, + "learning_rate": 5.195503421309873e-05, + "loss": 0.4495, + "step": 6410 + }, + { + "epoch": 1.003600500939261, + "grad_norm": 1.300477385520935, + "learning_rate": 5.194688823721082e-05, + "loss": 0.3833, + "step": 6411 + }, + { + "epoch": 1.0037570444583594, + "grad_norm": 2.3913612365722656, + "learning_rate": 5.1938742261322905e-05, + "loss": 0.3753, + "step": 6412 + }, + { + "epoch": 1.0039135879774577, + "grad_norm": 1.642021656036377, + "learning_rate": 5.1930596285434995e-05, + "loss": 0.6121, + "step": 6413 + }, + { + "epoch": 1.004070131496556, + "grad_norm": 1.3553133010864258, + "learning_rate": 5.1922450309547086e-05, + "loss": 0.4259, + "step": 6414 + }, + { + "epoch": 1.0042266750156543, + "grad_norm": 3.2899296283721924, + "learning_rate": 5.191430433365917e-05, + "loss": 0.7713, + "step": 6415 + }, + { + "epoch": 1.0043832185347528, + "grad_norm": 1.9336705207824707, + "learning_rate": 5.190615835777126e-05, + "loss": 0.5176, + "step": 6416 + }, + { + "epoch": 1.004539762053851, + "grad_norm": 3.0473525524139404, + "learning_rate": 5.189801238188336e-05, + "loss": 0.6637, + "step": 6417 + }, + { + "epoch": 1.0046963055729492, + "grad_norm": 3.6838417053222656, + "learning_rate": 5.1889866405995434e-05, + "loss": 0.9123, + "step": 
6418 + }, + { + "epoch": 1.0048528490920476, + "grad_norm": 3.1711621284484863, + "learning_rate": 5.1881720430107525e-05, + "loss": 0.6221, + "step": 6419 + }, + { + "epoch": 1.0050093926111459, + "grad_norm": 4.010916709899902, + "learning_rate": 5.187357445421962e-05, + "loss": 1.0548, + "step": 6420 + }, + { + "epoch": 1.0051659361302443, + "grad_norm": 3.223778009414673, + "learning_rate": 5.18654284783317e-05, + "loss": 0.7543, + "step": 6421 + }, + { + "epoch": 1.0053224796493425, + "grad_norm": 2.4866480827331543, + "learning_rate": 5.1857282502443796e-05, + "loss": 0.9201, + "step": 6422 + }, + { + "epoch": 1.0054790231684407, + "grad_norm": 1.8812668323516846, + "learning_rate": 5.184913652655589e-05, + "loss": 0.6145, + "step": 6423 + }, + { + "epoch": 1.0056355666875392, + "grad_norm": 2.2162387371063232, + "learning_rate": 5.184099055066797e-05, + "loss": 0.9398, + "step": 6424 + }, + { + "epoch": 1.0057921102066374, + "grad_norm": 4.88568115234375, + "learning_rate": 5.183284457478006e-05, + "loss": 0.6559, + "step": 6425 + }, + { + "epoch": 1.0059486537257358, + "grad_norm": 3.6865124702453613, + "learning_rate": 5.182469859889215e-05, + "loss": 1.1089, + "step": 6426 + }, + { + "epoch": 1.006105197244834, + "grad_norm": 3.203242540359497, + "learning_rate": 5.1816552623004235e-05, + "loss": 1.0181, + "step": 6427 + }, + { + "epoch": 1.0062617407639323, + "grad_norm": 3.230801582336426, + "learning_rate": 5.1808406647116326e-05, + "loss": 0.9768, + "step": 6428 + }, + { + "epoch": 1.0064182842830307, + "grad_norm": 4.661052227020264, + "learning_rate": 5.1800260671228416e-05, + "loss": 1.0266, + "step": 6429 + }, + { + "epoch": 1.006574827802129, + "grad_norm": 3.0104620456695557, + "learning_rate": 5.17921146953405e-05, + "loss": 1.5348, + "step": 6430 + }, + { + "epoch": 1.0067313713212274, + "grad_norm": 3.1573593616485596, + "learning_rate": 5.178396871945259e-05, + "loss": 1.4116, + "step": 6431 + }, + { + "epoch": 1.0068879148403256, + 
"grad_norm": 3.9143972396850586, + "learning_rate": 5.177582274356468e-05, + "loss": 1.5685, + "step": 6432 + }, + { + "epoch": 1.007044458359424, + "grad_norm": 1.9968611001968384, + "learning_rate": 5.1767676767676765e-05, + "loss": 1.0756, + "step": 6433 + }, + { + "epoch": 1.0072010018785222, + "grad_norm": 3.1806540489196777, + "learning_rate": 5.1759530791788855e-05, + "loss": 1.1874, + "step": 6434 + }, + { + "epoch": 1.0073575453976205, + "grad_norm": 2.2111480236053467, + "learning_rate": 5.175138481590095e-05, + "loss": 0.3959, + "step": 6435 + }, + { + "epoch": 1.007514088916719, + "grad_norm": 1.4430058002471924, + "learning_rate": 5.174323884001303e-05, + "loss": 0.5381, + "step": 6436 + }, + { + "epoch": 1.0076706324358171, + "grad_norm": 3.3847568035125732, + "learning_rate": 5.173509286412512e-05, + "loss": 0.6922, + "step": 6437 + }, + { + "epoch": 1.0078271759549156, + "grad_norm": 5.115494728088379, + "learning_rate": 5.172694688823722e-05, + "loss": 1.7523, + "step": 6438 + }, + { + "epoch": 1.0079837194740138, + "grad_norm": 0.5558544993400574, + "learning_rate": 5.1718800912349295e-05, + "loss": 0.2338, + "step": 6439 + }, + { + "epoch": 1.008140262993112, + "grad_norm": 0.40718337893486023, + "learning_rate": 5.171065493646139e-05, + "loss": 0.1833, + "step": 6440 + }, + { + "epoch": 1.0082968065122104, + "grad_norm": 0.6938580274581909, + "learning_rate": 5.170250896057348e-05, + "loss": 0.1485, + "step": 6441 + }, + { + "epoch": 1.0084533500313086, + "grad_norm": 0.6344759464263916, + "learning_rate": 5.169436298468556e-05, + "loss": 0.1619, + "step": 6442 + }, + { + "epoch": 1.008609893550407, + "grad_norm": 0.38678082823753357, + "learning_rate": 5.168621700879766e-05, + "loss": 0.1344, + "step": 6443 + }, + { + "epoch": 1.0087664370695053, + "grad_norm": 0.5225772857666016, + "learning_rate": 5.167807103290975e-05, + "loss": 0.2037, + "step": 6444 + }, + { + "epoch": 1.0089229805886035, + "grad_norm": 0.8443629741668701, + 
"learning_rate": 5.166992505702183e-05, + "loss": 0.1773, + "step": 6445 + }, + { + "epoch": 1.009079524107702, + "grad_norm": 0.732243537902832, + "learning_rate": 5.166177908113392e-05, + "loss": 0.222, + "step": 6446 + }, + { + "epoch": 1.0092360676268002, + "grad_norm": 0.799583911895752, + "learning_rate": 5.165363310524601e-05, + "loss": 0.2052, + "step": 6447 + }, + { + "epoch": 1.0093926111458986, + "grad_norm": 0.920010507106781, + "learning_rate": 5.1645487129358096e-05, + "loss": 0.333, + "step": 6448 + }, + { + "epoch": 1.0095491546649968, + "grad_norm": 0.8178783059120178, + "learning_rate": 5.1637341153470186e-05, + "loss": 0.2442, + "step": 6449 + }, + { + "epoch": 1.0097056981840953, + "grad_norm": 1.169918417930603, + "learning_rate": 5.162919517758228e-05, + "loss": 0.3169, + "step": 6450 + }, + { + "epoch": 1.0098622417031935, + "grad_norm": 0.6244511604309082, + "learning_rate": 5.162104920169436e-05, + "loss": 0.2666, + "step": 6451 + }, + { + "epoch": 1.0100187852222917, + "grad_norm": 1.6202548742294312, + "learning_rate": 5.161290322580645e-05, + "loss": 0.564, + "step": 6452 + }, + { + "epoch": 1.0101753287413902, + "grad_norm": 1.075972080230713, + "learning_rate": 5.160475724991855e-05, + "loss": 0.3216, + "step": 6453 + }, + { + "epoch": 1.0103318722604884, + "grad_norm": 1.8658288717269897, + "learning_rate": 5.1596611274030625e-05, + "loss": 0.4163, + "step": 6454 + }, + { + "epoch": 1.0104884157795868, + "grad_norm": 2.323573112487793, + "learning_rate": 5.1588465298142716e-05, + "loss": 0.3993, + "step": 6455 + }, + { + "epoch": 1.010644959298685, + "grad_norm": 2.097219467163086, + "learning_rate": 5.158031932225481e-05, + "loss": 0.4224, + "step": 6456 + }, + { + "epoch": 1.0108015028177832, + "grad_norm": 1.5247915983200073, + "learning_rate": 5.157217334636689e-05, + "loss": 0.401, + "step": 6457 + }, + { + "epoch": 1.0109580463368817, + "grad_norm": 1.358276128768921, + "learning_rate": 5.156402737047899e-05, + "loss": 0.3695, + 
"step": 6458 + }, + { + "epoch": 1.01111458985598, + "grad_norm": 1.0023305416107178, + "learning_rate": 5.155588139459108e-05, + "loss": 0.2094, + "step": 6459 + }, + { + "epoch": 1.0112711333750783, + "grad_norm": 2.0206191539764404, + "learning_rate": 5.1547735418703155e-05, + "loss": 0.4256, + "step": 6460 + }, + { + "epoch": 1.0114276768941766, + "grad_norm": 2.0373613834381104, + "learning_rate": 5.153958944281525e-05, + "loss": 0.5785, + "step": 6461 + }, + { + "epoch": 1.0115842204132748, + "grad_norm": 1.8475041389465332, + "learning_rate": 5.153144346692734e-05, + "loss": 0.4691, + "step": 6462 + }, + { + "epoch": 1.0117407639323732, + "grad_norm": 1.6636425256729126, + "learning_rate": 5.1523297491039426e-05, + "loss": 0.358, + "step": 6463 + }, + { + "epoch": 1.0118973074514714, + "grad_norm": 1.327019214630127, + "learning_rate": 5.151515151515152e-05, + "loss": 0.3868, + "step": 6464 + }, + { + "epoch": 1.0120538509705699, + "grad_norm": 2.910613775253296, + "learning_rate": 5.150700553926361e-05, + "loss": 0.6738, + "step": 6465 + }, + { + "epoch": 1.012210394489668, + "grad_norm": 6.60212516784668, + "learning_rate": 5.149885956337569e-05, + "loss": 0.9203, + "step": 6466 + }, + { + "epoch": 1.0123669380087665, + "grad_norm": 1.188012719154358, + "learning_rate": 5.149071358748778e-05, + "loss": 0.578, + "step": 6467 + }, + { + "epoch": 1.0125234815278648, + "grad_norm": 2.549516201019287, + "learning_rate": 5.148256761159987e-05, + "loss": 0.8863, + "step": 6468 + }, + { + "epoch": 1.012680025046963, + "grad_norm": 4.689101696014404, + "learning_rate": 5.1474421635711956e-05, + "loss": 0.6894, + "step": 6469 + }, + { + "epoch": 1.0128365685660614, + "grad_norm": 3.251169204711914, + "learning_rate": 5.1466275659824047e-05, + "loss": 0.9513, + "step": 6470 + }, + { + "epoch": 1.0129931120851596, + "grad_norm": 3.0921826362609863, + "learning_rate": 5.145812968393614e-05, + "loss": 0.8237, + "step": 6471 + }, + { + "epoch": 1.013149655604258, + 
"grad_norm": 2.4306607246398926, + "learning_rate": 5.144998370804822e-05, + "loss": 0.749, + "step": 6472 + }, + { + "epoch": 1.0133061991233563, + "grad_norm": 2.987457036972046, + "learning_rate": 5.144183773216031e-05, + "loss": 0.7419, + "step": 6473 + }, + { + "epoch": 1.0134627426424545, + "grad_norm": 4.21177339553833, + "learning_rate": 5.143369175627241e-05, + "loss": 1.1512, + "step": 6474 + }, + { + "epoch": 1.013619286161553, + "grad_norm": 4.13670015335083, + "learning_rate": 5.1425545780384486e-05, + "loss": 0.9932, + "step": 6475 + }, + { + "epoch": 1.0137758296806512, + "grad_norm": 4.21058464050293, + "learning_rate": 5.141739980449658e-05, + "loss": 1.0303, + "step": 6476 + }, + { + "epoch": 1.0139323731997496, + "grad_norm": 2.0108046531677246, + "learning_rate": 5.1409253828608674e-05, + "loss": 0.6589, + "step": 6477 + }, + { + "epoch": 1.0140889167188478, + "grad_norm": 6.968317031860352, + "learning_rate": 5.140110785272075e-05, + "loss": 1.4498, + "step": 6478 + }, + { + "epoch": 1.014245460237946, + "grad_norm": 2.5853500366210938, + "learning_rate": 5.139296187683285e-05, + "loss": 0.6867, + "step": 6479 + }, + { + "epoch": 1.0144020037570445, + "grad_norm": 3.090435743331909, + "learning_rate": 5.138481590094494e-05, + "loss": 1.2936, + "step": 6480 + }, + { + "epoch": 1.0145585472761427, + "grad_norm": 2.820812463760376, + "learning_rate": 5.137666992505702e-05, + "loss": 1.3696, + "step": 6481 + }, + { + "epoch": 1.0147150907952411, + "grad_norm": 3.0980496406555176, + "learning_rate": 5.136852394916911e-05, + "loss": 1.3321, + "step": 6482 + }, + { + "epoch": 1.0148716343143394, + "grad_norm": 1.9641499519348145, + "learning_rate": 5.13603779732812e-05, + "loss": 0.5944, + "step": 6483 + }, + { + "epoch": 1.0150281778334378, + "grad_norm": 3.3694238662719727, + "learning_rate": 5.135223199739329e-05, + "loss": 0.5091, + "step": 6484 + }, + { + "epoch": 1.015184721352536, + "grad_norm": 4.473544597625732, + "learning_rate": 
5.134408602150538e-05, + "loss": 0.5589, + "step": 6485 + }, + { + "epoch": 1.0153412648716342, + "grad_norm": 5.464939594268799, + "learning_rate": 5.133594004561747e-05, + "loss": 0.9626, + "step": 6486 + }, + { + "epoch": 1.0154978083907327, + "grad_norm": 1.6060478687286377, + "learning_rate": 5.132779406972955e-05, + "loss": 0.3868, + "step": 6487 + }, + { + "epoch": 1.0156543519098309, + "grad_norm": 3.539865255355835, + "learning_rate": 5.131964809384164e-05, + "loss": 1.0912, + "step": 6488 + }, + { + "epoch": 1.0158108954289293, + "grad_norm": 0.6577065587043762, + "learning_rate": 5.131150211795373e-05, + "loss": 0.2683, + "step": 6489 + }, + { + "epoch": 1.0159674389480275, + "grad_norm": 0.3845237195491791, + "learning_rate": 5.1303356142065816e-05, + "loss": 0.1681, + "step": 6490 + }, + { + "epoch": 1.0161239824671258, + "grad_norm": 0.32245373725891113, + "learning_rate": 5.129521016617791e-05, + "loss": 0.168, + "step": 6491 + }, + { + "epoch": 1.0162805259862242, + "grad_norm": 0.7575495839118958, + "learning_rate": 5.1287064190290004e-05, + "loss": 0.3375, + "step": 6492 + }, + { + "epoch": 1.0164370695053224, + "grad_norm": 0.601871907711029, + "learning_rate": 5.127891821440208e-05, + "loss": 0.1759, + "step": 6493 + }, + { + "epoch": 1.0165936130244209, + "grad_norm": 0.4575227200984955, + "learning_rate": 5.127077223851418e-05, + "loss": 0.1741, + "step": 6494 + }, + { + "epoch": 1.016750156543519, + "grad_norm": 1.0045688152313232, + "learning_rate": 5.126262626262627e-05, + "loss": 0.327, + "step": 6495 + }, + { + "epoch": 1.0169067000626173, + "grad_norm": 0.6271325945854187, + "learning_rate": 5.1254480286738346e-05, + "loss": 0.2964, + "step": 6496 + }, + { + "epoch": 1.0170632435817157, + "grad_norm": 0.8027560710906982, + "learning_rate": 5.124633431085044e-05, + "loss": 0.2349, + "step": 6497 + }, + { + "epoch": 1.017219787100814, + "grad_norm": 0.835831344127655, + "learning_rate": 5.1238188334962534e-05, + "loss": 0.2638, + "step": 
6498 + }, + { + "epoch": 1.0173763306199124, + "grad_norm": 1.9164248704910278, + "learning_rate": 5.123004235907462e-05, + "loss": 0.2788, + "step": 6499 + }, + { + "epoch": 1.0175328741390106, + "grad_norm": 3.649179697036743, + "learning_rate": 5.122189638318671e-05, + "loss": 1.091, + "step": 6500 + }, + { + "epoch": 1.017689417658109, + "grad_norm": 0.8312178254127502, + "learning_rate": 5.12137504072988e-05, + "loss": 0.2326, + "step": 6501 + }, + { + "epoch": 1.0178459611772073, + "grad_norm": 1.9861102104187012, + "learning_rate": 5.120560443141088e-05, + "loss": 0.4665, + "step": 6502 + }, + { + "epoch": 1.0180025046963055, + "grad_norm": 0.7803061604499817, + "learning_rate": 5.119745845552297e-05, + "loss": 0.3122, + "step": 6503 + }, + { + "epoch": 1.018159048215404, + "grad_norm": 1.3569579124450684, + "learning_rate": 5.1189312479635063e-05, + "loss": 0.461, + "step": 6504 + }, + { + "epoch": 1.0183155917345021, + "grad_norm": 4.8883185386657715, + "learning_rate": 5.118116650374715e-05, + "loss": 0.2913, + "step": 6505 + }, + { + "epoch": 1.0184721352536006, + "grad_norm": 0.7378587126731873, + "learning_rate": 5.117302052785924e-05, + "loss": 0.4026, + "step": 6506 + }, + { + "epoch": 1.0186286787726988, + "grad_norm": 2.4150452613830566, + "learning_rate": 5.116487455197133e-05, + "loss": 0.4282, + "step": 6507 + }, + { + "epoch": 1.018785222291797, + "grad_norm": 1.1827623844146729, + "learning_rate": 5.115672857608341e-05, + "loss": 0.2891, + "step": 6508 + }, + { + "epoch": 1.0189417658108955, + "grad_norm": 1.3880053758621216, + "learning_rate": 5.11485826001955e-05, + "loss": 0.3938, + "step": 6509 + }, + { + "epoch": 1.0190983093299937, + "grad_norm": 1.3361334800720215, + "learning_rate": 5.11404366243076e-05, + "loss": 0.4128, + "step": 6510 + }, + { + "epoch": 1.0192548528490921, + "grad_norm": 2.54728627204895, + "learning_rate": 5.113229064841968e-05, + "loss": 0.4863, + "step": 6511 + }, + { + "epoch": 1.0194113963681903, + "grad_norm": 
1.262649416923523, + "learning_rate": 5.1124144672531774e-05, + "loss": 0.4776, + "step": 6512 + }, + { + "epoch": 1.0195679398872888, + "grad_norm": 0.9820156693458557, + "learning_rate": 5.1115998696643865e-05, + "loss": 0.2884, + "step": 6513 + }, + { + "epoch": 1.019724483406387, + "grad_norm": 4.298229694366455, + "learning_rate": 5.110785272075594e-05, + "loss": 1.5198, + "step": 6514 + }, + { + "epoch": 1.0198810269254852, + "grad_norm": 2.7332186698913574, + "learning_rate": 5.109970674486804e-05, + "loss": 0.4988, + "step": 6515 + }, + { + "epoch": 1.0200375704445837, + "grad_norm": 2.1346867084503174, + "learning_rate": 5.109156076898013e-05, + "loss": 0.679, + "step": 6516 + }, + { + "epoch": 1.0201941139636819, + "grad_norm": 1.1560571193695068, + "learning_rate": 5.108341479309221e-05, + "loss": 0.3312, + "step": 6517 + }, + { + "epoch": 1.0203506574827803, + "grad_norm": 1.8341444730758667, + "learning_rate": 5.1075268817204304e-05, + "loss": 0.5427, + "step": 6518 + }, + { + "epoch": 1.0205072010018785, + "grad_norm": 2.53837251663208, + "learning_rate": 5.1067122841316394e-05, + "loss": 0.874, + "step": 6519 + }, + { + "epoch": 1.0206637445209767, + "grad_norm": 2.647571325302124, + "learning_rate": 5.105897686542848e-05, + "loss": 0.592, + "step": 6520 + }, + { + "epoch": 1.0208202880400752, + "grad_norm": 6.803788661956787, + "learning_rate": 5.105083088954057e-05, + "loss": 0.9078, + "step": 6521 + }, + { + "epoch": 1.0209768315591734, + "grad_norm": 4.859503746032715, + "learning_rate": 5.104268491365266e-05, + "loss": 0.4769, + "step": 6522 + }, + { + "epoch": 1.0211333750782718, + "grad_norm": 2.294985771179199, + "learning_rate": 5.103453893776474e-05, + "loss": 0.6814, + "step": 6523 + }, + { + "epoch": 1.02128991859737, + "grad_norm": 3.679522752761841, + "learning_rate": 5.102639296187683e-05, + "loss": 1.0656, + "step": 6524 + }, + { + "epoch": 1.0214464621164683, + "grad_norm": 4.422848701477051, + "learning_rate": 
5.1018246985988924e-05, + "loss": 0.7903, + "step": 6525 + }, + { + "epoch": 1.0216030056355667, + "grad_norm": 4.791163921356201, + "learning_rate": 5.101010101010101e-05, + "loss": 0.9927, + "step": 6526 + }, + { + "epoch": 1.021759549154665, + "grad_norm": 2.3407394886016846, + "learning_rate": 5.10019550342131e-05, + "loss": 1.1812, + "step": 6527 + }, + { + "epoch": 1.0219160926737634, + "grad_norm": 4.2075090408325195, + "learning_rate": 5.0993809058325195e-05, + "loss": 1.0285, + "step": 6528 + }, + { + "epoch": 1.0220726361928616, + "grad_norm": 1.6374648809432983, + "learning_rate": 5.098566308243727e-05, + "loss": 0.3604, + "step": 6529 + }, + { + "epoch": 1.0222291797119598, + "grad_norm": 4.218658924102783, + "learning_rate": 5.097751710654936e-05, + "loss": 2.024, + "step": 6530 + }, + { + "epoch": 1.0223857232310583, + "grad_norm": 2.9478394985198975, + "learning_rate": 5.096937113066146e-05, + "loss": 1.2133, + "step": 6531 + }, + { + "epoch": 1.0225422667501565, + "grad_norm": 5.02390193939209, + "learning_rate": 5.096122515477354e-05, + "loss": 2.1396, + "step": 6532 + }, + { + "epoch": 1.022698810269255, + "grad_norm": 4.369184494018555, + "learning_rate": 5.0953079178885635e-05, + "loss": 1.1621, + "step": 6533 + }, + { + "epoch": 1.0228553537883531, + "grad_norm": 2.914919137954712, + "learning_rate": 5.0944933202997725e-05, + "loss": 0.4609, + "step": 6534 + }, + { + "epoch": 1.0230118973074516, + "grad_norm": 2.8801920413970947, + "learning_rate": 5.093678722710981e-05, + "loss": 0.4465, + "step": 6535 + }, + { + "epoch": 1.0231684408265498, + "grad_norm": 2.6521406173706055, + "learning_rate": 5.09286412512219e-05, + "loss": 0.5954, + "step": 6536 + }, + { + "epoch": 1.023324984345648, + "grad_norm": 5.296048641204834, + "learning_rate": 5.092049527533399e-05, + "loss": 0.9958, + "step": 6537 + }, + { + "epoch": 1.0234815278647464, + "grad_norm": 3.5684502124786377, + "learning_rate": 5.0912349299446074e-05, + "loss": 1.0751, + "step": 6538 + 
}, + { + "epoch": 1.0236380713838447, + "grad_norm": 0.6658557057380676, + "learning_rate": 5.0904203323558164e-05, + "loss": 0.305, + "step": 6539 + }, + { + "epoch": 1.023794614902943, + "grad_norm": 0.7545821666717529, + "learning_rate": 5.0896057347670255e-05, + "loss": 0.1633, + "step": 6540 + }, + { + "epoch": 1.0239511584220413, + "grad_norm": 0.6108189821243286, + "learning_rate": 5.088791137178234e-05, + "loss": 0.2915, + "step": 6541 + }, + { + "epoch": 1.0241077019411395, + "grad_norm": 0.4476344585418701, + "learning_rate": 5.087976539589443e-05, + "loss": 0.1848, + "step": 6542 + }, + { + "epoch": 1.024264245460238, + "grad_norm": 0.4871160089969635, + "learning_rate": 5.087161942000652e-05, + "loss": 0.1892, + "step": 6543 + }, + { + "epoch": 1.0244207889793362, + "grad_norm": 0.4665106236934662, + "learning_rate": 5.08634734441186e-05, + "loss": 0.1375, + "step": 6544 + }, + { + "epoch": 1.0245773324984346, + "grad_norm": 0.8740967512130737, + "learning_rate": 5.0855327468230694e-05, + "loss": 0.3076, + "step": 6545 + }, + { + "epoch": 1.0247338760175329, + "grad_norm": 1.4955202341079712, + "learning_rate": 5.084718149234279e-05, + "loss": 0.1909, + "step": 6546 + }, + { + "epoch": 1.0248904195366313, + "grad_norm": 0.6252956390380859, + "learning_rate": 5.083903551645487e-05, + "loss": 0.1766, + "step": 6547 + }, + { + "epoch": 1.0250469630557295, + "grad_norm": 0.9013951420783997, + "learning_rate": 5.083088954056696e-05, + "loss": 0.289, + "step": 6548 + }, + { + "epoch": 1.0252035065748277, + "grad_norm": 0.7472125291824341, + "learning_rate": 5.0822743564679056e-05, + "loss": 0.2001, + "step": 6549 + }, + { + "epoch": 1.0253600500939262, + "grad_norm": 0.6622899174690247, + "learning_rate": 5.081459758879113e-05, + "loss": 0.2324, + "step": 6550 + }, + { + "epoch": 1.0255165936130244, + "grad_norm": 0.9896902441978455, + "learning_rate": 5.080645161290323e-05, + "loss": 0.3246, + "step": 6551 + }, + { + "epoch": 1.0256731371321228, + 
"grad_norm": 1.865359902381897, + "learning_rate": 5.079830563701532e-05, + "loss": 0.246, + "step": 6552 + }, + { + "epoch": 1.025829680651221, + "grad_norm": 1.2624621391296387, + "learning_rate": 5.0790159661127404e-05, + "loss": 0.3133, + "step": 6553 + }, + { + "epoch": 1.0259862241703193, + "grad_norm": 1.7104816436767578, + "learning_rate": 5.0782013685239495e-05, + "loss": 0.3138, + "step": 6554 + }, + { + "epoch": 1.0261427676894177, + "grad_norm": 2.0678064823150635, + "learning_rate": 5.0773867709351585e-05, + "loss": 0.6791, + "step": 6555 + }, + { + "epoch": 1.026299311208516, + "grad_norm": 1.4145028591156006, + "learning_rate": 5.076572173346367e-05, + "loss": 0.445, + "step": 6556 + }, + { + "epoch": 1.0264558547276144, + "grad_norm": 1.202193021774292, + "learning_rate": 5.075757575757576e-05, + "loss": 0.4151, + "step": 6557 + }, + { + "epoch": 1.0266123982467126, + "grad_norm": 1.779117465019226, + "learning_rate": 5.074942978168785e-05, + "loss": 0.3557, + "step": 6558 + }, + { + "epoch": 1.0267689417658108, + "grad_norm": 0.933120846748352, + "learning_rate": 5.0741283805799934e-05, + "loss": 0.2397, + "step": 6559 + }, + { + "epoch": 1.0269254852849092, + "grad_norm": 3.003127336502075, + "learning_rate": 5.0733137829912025e-05, + "loss": 0.4677, + "step": 6560 + }, + { + "epoch": 1.0270820288040075, + "grad_norm": 1.0892728567123413, + "learning_rate": 5.0724991854024115e-05, + "loss": 0.4849, + "step": 6561 + }, + { + "epoch": 1.027238572323106, + "grad_norm": 2.282942295074463, + "learning_rate": 5.07168458781362e-05, + "loss": 0.5137, + "step": 6562 + }, + { + "epoch": 1.027395115842204, + "grad_norm": 2.256998300552368, + "learning_rate": 5.070869990224829e-05, + "loss": 0.4327, + "step": 6563 + }, + { + "epoch": 1.0275516593613025, + "grad_norm": 2.931816816329956, + "learning_rate": 5.0700553926360387e-05, + "loss": 0.7402, + "step": 6564 + }, + { + "epoch": 1.0277082028804008, + "grad_norm": 2.2234859466552734, + "learning_rate": 
5.0692407950472464e-05, + "loss": 0.6394, + "step": 6565 + }, + { + "epoch": 1.027864746399499, + "grad_norm": 3.9679665565490723, + "learning_rate": 5.0684261974584554e-05, + "loss": 0.4438, + "step": 6566 + }, + { + "epoch": 1.0280212899185974, + "grad_norm": 2.771491765975952, + "learning_rate": 5.067611599869665e-05, + "loss": 0.9748, + "step": 6567 + }, + { + "epoch": 1.0281778334376956, + "grad_norm": 2.59289288520813, + "learning_rate": 5.066797002280873e-05, + "loss": 0.9514, + "step": 6568 + }, + { + "epoch": 1.028334376956794, + "grad_norm": 1.9834150075912476, + "learning_rate": 5.0659824046920826e-05, + "loss": 0.6484, + "step": 6569 + }, + { + "epoch": 1.0284909204758923, + "grad_norm": 5.006476879119873, + "learning_rate": 5.0651678071032916e-05, + "loss": 1.1482, + "step": 6570 + }, + { + "epoch": 1.0286474639949905, + "grad_norm": 2.7423977851867676, + "learning_rate": 5.064353209514499e-05, + "loss": 0.7995, + "step": 6571 + }, + { + "epoch": 1.028804007514089, + "grad_norm": 2.940101385116577, + "learning_rate": 5.063538611925709e-05, + "loss": 0.9063, + "step": 6572 + }, + { + "epoch": 1.0289605510331872, + "grad_norm": 2.2754197120666504, + "learning_rate": 5.062724014336918e-05, + "loss": 0.9592, + "step": 6573 + }, + { + "epoch": 1.0291170945522856, + "grad_norm": 3.1628077030181885, + "learning_rate": 5.0619094167481265e-05, + "loss": 0.8427, + "step": 6574 + }, + { + "epoch": 1.0292736380713838, + "grad_norm": 4.82814359664917, + "learning_rate": 5.0610948191593355e-05, + "loss": 1.6216, + "step": 6575 + }, + { + "epoch": 1.029430181590482, + "grad_norm": 4.856509685516357, + "learning_rate": 5.0602802215705446e-05, + "loss": 1.5833, + "step": 6576 + }, + { + "epoch": 1.0295867251095805, + "grad_norm": 1.9453966617584229, + "learning_rate": 5.059465623981753e-05, + "loss": 0.9935, + "step": 6577 + }, + { + "epoch": 1.0297432686286787, + "grad_norm": 4.165056228637695, + "learning_rate": 5.058651026392962e-05, + "loss": 0.9377, + "step": 6578 
+ }, + { + "epoch": 1.0298998121477771, + "grad_norm": 4.736178398132324, + "learning_rate": 5.057836428804171e-05, + "loss": 1.199, + "step": 6579 + }, + { + "epoch": 1.0300563556668754, + "grad_norm": 2.4817047119140625, + "learning_rate": 5.0570218312153794e-05, + "loss": 1.168, + "step": 6580 + }, + { + "epoch": 1.0302128991859738, + "grad_norm": 2.4407622814178467, + "learning_rate": 5.0562072336265885e-05, + "loss": 1.04, + "step": 6581 + }, + { + "epoch": 1.030369442705072, + "grad_norm": 3.3002634048461914, + "learning_rate": 5.055392636037798e-05, + "loss": 1.3289, + "step": 6582 + }, + { + "epoch": 1.0305259862241702, + "grad_norm": 2.3797760009765625, + "learning_rate": 5.054578038449006e-05, + "loss": 1.433, + "step": 6583 + }, + { + "epoch": 1.0306825297432687, + "grad_norm": 2.4937233924865723, + "learning_rate": 5.053763440860215e-05, + "loss": 0.4442, + "step": 6584 + }, + { + "epoch": 1.030839073262367, + "grad_norm": 3.9731931686401367, + "learning_rate": 5.052948843271425e-05, + "loss": 0.8842, + "step": 6585 + }, + { + "epoch": 1.0309956167814653, + "grad_norm": 2.3832530975341797, + "learning_rate": 5.0521342456826324e-05, + "loss": 0.8296, + "step": 6586 + }, + { + "epoch": 1.0311521603005636, + "grad_norm": 1.9602853059768677, + "learning_rate": 5.051319648093842e-05, + "loss": 0.757, + "step": 6587 + }, + { + "epoch": 1.0313087038196618, + "grad_norm": 1.5627907514572144, + "learning_rate": 5.050505050505051e-05, + "loss": 0.6394, + "step": 6588 + }, + { + "epoch": 1.0314652473387602, + "grad_norm": 0.5925785303115845, + "learning_rate": 5.049690452916259e-05, + "loss": 0.2496, + "step": 6589 + }, + { + "epoch": 1.0316217908578584, + "grad_norm": 0.5217147469520569, + "learning_rate": 5.0488758553274686e-05, + "loss": 0.2807, + "step": 6590 + }, + { + "epoch": 1.0317783343769569, + "grad_norm": 0.8166399002075195, + "learning_rate": 5.0480612577386777e-05, + "loss": 0.3049, + "step": 6591 + }, + { + "epoch": 1.031934877896055, + "grad_norm": 
0.6484920382499695, + "learning_rate": 5.047246660149886e-05, + "loss": 0.2387, + "step": 6592 + }, + { + "epoch": 1.0320914214151533, + "grad_norm": 0.656051754951477, + "learning_rate": 5.046432062561095e-05, + "loss": 0.3074, + "step": 6593 + }, + { + "epoch": 1.0322479649342517, + "grad_norm": 0.7739185690879822, + "learning_rate": 5.045617464972304e-05, + "loss": 0.2082, + "step": 6594 + }, + { + "epoch": 1.03240450845335, + "grad_norm": 0.5686241388320923, + "learning_rate": 5.0448028673835125e-05, + "loss": 0.2521, + "step": 6595 + }, + { + "epoch": 1.0325610519724484, + "grad_norm": 1.7825051546096802, + "learning_rate": 5.0439882697947216e-05, + "loss": 0.349, + "step": 6596 + }, + { + "epoch": 1.0327175954915466, + "grad_norm": 0.6790578365325928, + "learning_rate": 5.0431736722059306e-05, + "loss": 0.2292, + "step": 6597 + }, + { + "epoch": 1.032874139010645, + "grad_norm": 0.6622328758239746, + "learning_rate": 5.042359074617139e-05, + "loss": 0.339, + "step": 6598 + }, + { + "epoch": 1.0330306825297433, + "grad_norm": 1.0825217962265015, + "learning_rate": 5.041544477028348e-05, + "loss": 0.2357, + "step": 6599 + }, + { + "epoch": 1.0331872260488415, + "grad_norm": 0.787962019443512, + "learning_rate": 5.040729879439557e-05, + "loss": 0.2566, + "step": 6600 + }, + { + "epoch": 1.03334376956794, + "grad_norm": 1.4425798654556274, + "learning_rate": 5.0399152818507655e-05, + "loss": 0.2059, + "step": 6601 + }, + { + "epoch": 1.0335003130870382, + "grad_norm": 1.2877237796783447, + "learning_rate": 5.0391006842619745e-05, + "loss": 0.2803, + "step": 6602 + }, + { + "epoch": 1.0336568566061366, + "grad_norm": 1.3921213150024414, + "learning_rate": 5.038286086673184e-05, + "loss": 0.3144, + "step": 6603 + }, + { + "epoch": 1.0338134001252348, + "grad_norm": 2.162383556365967, + "learning_rate": 5.037471489084392e-05, + "loss": 0.5041, + "step": 6604 + }, + { + "epoch": 1.033969943644333, + "grad_norm": 1.7939164638519287, + "learning_rate": 
5.036656891495602e-05, + "loss": 0.3183, + "step": 6605 + }, + { + "epoch": 1.0341264871634315, + "grad_norm": 1.38788640499115, + "learning_rate": 5.035842293906811e-05, + "loss": 0.4543, + "step": 6606 + }, + { + "epoch": 1.0342830306825297, + "grad_norm": 1.2259414196014404, + "learning_rate": 5.0350276963180184e-05, + "loss": 0.1861, + "step": 6607 + }, + { + "epoch": 1.0344395742016281, + "grad_norm": 1.0719993114471436, + "learning_rate": 5.034213098729228e-05, + "loss": 0.355, + "step": 6608 + }, + { + "epoch": 1.0345961177207263, + "grad_norm": 2.2205703258514404, + "learning_rate": 5.033398501140437e-05, + "loss": 0.4091, + "step": 6609 + }, + { + "epoch": 1.0347526612398246, + "grad_norm": 3.5455265045166016, + "learning_rate": 5.0325839035516456e-05, + "loss": 0.9612, + "step": 6610 + }, + { + "epoch": 1.034909204758923, + "grad_norm": 2.6302850246429443, + "learning_rate": 5.0317693059628546e-05, + "loss": 0.7796, + "step": 6611 + }, + { + "epoch": 1.0350657482780212, + "grad_norm": 3.8716084957122803, + "learning_rate": 5.030954708374064e-05, + "loss": 0.5411, + "step": 6612 + }, + { + "epoch": 1.0352222917971197, + "grad_norm": 2.2961063385009766, + "learning_rate": 5.030140110785272e-05, + "loss": 0.6048, + "step": 6613 + }, + { + "epoch": 1.0353788353162179, + "grad_norm": 4.32797908782959, + "learning_rate": 5.029325513196481e-05, + "loss": 0.5995, + "step": 6614 + }, + { + "epoch": 1.0355353788353163, + "grad_norm": 2.706387996673584, + "learning_rate": 5.02851091560769e-05, + "loss": 0.4835, + "step": 6615 + }, + { + "epoch": 1.0356919223544145, + "grad_norm": 2.383793354034424, + "learning_rate": 5.0276963180188986e-05, + "loss": 0.8129, + "step": 6616 + }, + { + "epoch": 1.0358484658735128, + "grad_norm": 1.6074126958847046, + "learning_rate": 5.0268817204301076e-05, + "loss": 0.4871, + "step": 6617 + }, + { + "epoch": 1.0360050093926112, + "grad_norm": 4.210323810577393, + "learning_rate": 5.0260671228413167e-05, + "loss": 0.9725, + "step": 
6618 + }, + { + "epoch": 1.0361615529117094, + "grad_norm": 2.06913161277771, + "learning_rate": 5.025252525252525e-05, + "loss": 0.7499, + "step": 6619 + }, + { + "epoch": 1.0363180964308079, + "grad_norm": 3.098050832748413, + "learning_rate": 5.024437927663734e-05, + "loss": 1.2645, + "step": 6620 + }, + { + "epoch": 1.036474639949906, + "grad_norm": 4.34983491897583, + "learning_rate": 5.023623330074944e-05, + "loss": 0.7463, + "step": 6621 + }, + { + "epoch": 1.0366311834690043, + "grad_norm": 2.5730996131896973, + "learning_rate": 5.0228087324861515e-05, + "loss": 0.5065, + "step": 6622 + }, + { + "epoch": 1.0367877269881027, + "grad_norm": 3.289973735809326, + "learning_rate": 5.021994134897361e-05, + "loss": 0.781, + "step": 6623 + }, + { + "epoch": 1.036944270507201, + "grad_norm": 1.6288220882415771, + "learning_rate": 5.02117953730857e-05, + "loss": 0.7551, + "step": 6624 + }, + { + "epoch": 1.0371008140262994, + "grad_norm": 4.031460285186768, + "learning_rate": 5.020364939719778e-05, + "loss": 0.7209, + "step": 6625 + }, + { + "epoch": 1.0372573575453976, + "grad_norm": 2.3460779190063477, + "learning_rate": 5.019550342130988e-05, + "loss": 0.8699, + "step": 6626 + }, + { + "epoch": 1.0374139010644958, + "grad_norm": 3.1547651290893555, + "learning_rate": 5.018735744542197e-05, + "loss": 0.6809, + "step": 6627 + }, + { + "epoch": 1.0375704445835943, + "grad_norm": 3.4516754150390625, + "learning_rate": 5.017921146953405e-05, + "loss": 0.5602, + "step": 6628 + }, + { + "epoch": 1.0377269881026925, + "grad_norm": 4.485353946685791, + "learning_rate": 5.017106549364614e-05, + "loss": 0.836, + "step": 6629 + }, + { + "epoch": 1.037883531621791, + "grad_norm": 3.5356812477111816, + "learning_rate": 5.016291951775823e-05, + "loss": 1.3076, + "step": 6630 + }, + { + "epoch": 1.0380400751408891, + "grad_norm": 3.5349292755126953, + "learning_rate": 5.0154773541870316e-05, + "loss": 1.6122, + "step": 6631 + }, + { + "epoch": 1.0381966186599876, + "grad_norm": 
3.4962515830993652, + "learning_rate": 5.014662756598241e-05, + "loss": 1.0545, + "step": 6632 + }, + { + "epoch": 1.0383531621790858, + "grad_norm": 7.21666145324707, + "learning_rate": 5.01384815900945e-05, + "loss": 1.2403, + "step": 6633 + }, + { + "epoch": 1.038509705698184, + "grad_norm": 3.0118675231933594, + "learning_rate": 5.013033561420658e-05, + "loss": 0.5193, + "step": 6634 + }, + { + "epoch": 1.0386662492172825, + "grad_norm": 2.8659937381744385, + "learning_rate": 5.012218963831867e-05, + "loss": 0.7124, + "step": 6635 + }, + { + "epoch": 1.0388227927363807, + "grad_norm": 5.115139961242676, + "learning_rate": 5.011404366243076e-05, + "loss": 0.4808, + "step": 6636 + }, + { + "epoch": 1.0389793362554791, + "grad_norm": 3.370532989501953, + "learning_rate": 5.0105897686542846e-05, + "loss": 0.2383, + "step": 6637 + }, + { + "epoch": 1.0391358797745773, + "grad_norm": 2.266979455947876, + "learning_rate": 5.0097751710654936e-05, + "loss": 0.6944, + "step": 6638 + }, + { + "epoch": 1.0392924232936755, + "grad_norm": 0.6112035512924194, + "learning_rate": 5.0089605734767034e-05, + "loss": 0.2087, + "step": 6639 + }, + { + "epoch": 1.039448966812774, + "grad_norm": 0.5600243806838989, + "learning_rate": 5.008145975887911e-05, + "loss": 0.2115, + "step": 6640 + }, + { + "epoch": 1.0396055103318722, + "grad_norm": 0.34493255615234375, + "learning_rate": 5.007331378299121e-05, + "loss": 0.1727, + "step": 6641 + }, + { + "epoch": 1.0397620538509706, + "grad_norm": 0.7079585790634155, + "learning_rate": 5.00651678071033e-05, + "loss": 0.1958, + "step": 6642 + }, + { + "epoch": 1.0399185973700689, + "grad_norm": 0.5595355033874512, + "learning_rate": 5.0057021831215375e-05, + "loss": 0.2015, + "step": 6643 + }, + { + "epoch": 1.040075140889167, + "grad_norm": 0.9041721224784851, + "learning_rate": 5.004887585532747e-05, + "loss": 0.2896, + "step": 6644 + }, + { + "epoch": 1.0402316844082655, + "grad_norm": 0.7162913680076599, + "learning_rate": 
5.004072987943956e-05, + "loss": 0.2643, + "step": 6645 + }, + { + "epoch": 1.0403882279273637, + "grad_norm": 1.269806981086731, + "learning_rate": 5.003258390355165e-05, + "loss": 0.2715, + "step": 6646 + }, + { + "epoch": 1.0405447714464622, + "grad_norm": 6.513917922973633, + "learning_rate": 5.002443792766374e-05, + "loss": 0.9485, + "step": 6647 + }, + { + "epoch": 1.0407013149655604, + "grad_norm": 1.1703436374664307, + "learning_rate": 5.001629195177583e-05, + "loss": 0.2291, + "step": 6648 + }, + { + "epoch": 1.0408578584846588, + "grad_norm": 0.8256023526191711, + "learning_rate": 5.000814597588791e-05, + "loss": 0.3079, + "step": 6649 + }, + { + "epoch": 1.041014402003757, + "grad_norm": 0.9510347843170166, + "learning_rate": 5e-05, + "loss": 0.2622, + "step": 6650 + }, + { + "epoch": 1.0411709455228553, + "grad_norm": 0.908298671245575, + "learning_rate": 4.9991854024112086e-05, + "loss": 0.2417, + "step": 6651 + }, + { + "epoch": 1.0413274890419537, + "grad_norm": 0.7860861420631409, + "learning_rate": 4.9983708048224183e-05, + "loss": 0.3605, + "step": 6652 + }, + { + "epoch": 1.041484032561052, + "grad_norm": 1.2359915971755981, + "learning_rate": 4.997556207233627e-05, + "loss": 0.2966, + "step": 6653 + }, + { + "epoch": 1.0416405760801504, + "grad_norm": 1.5685316324234009, + "learning_rate": 4.996741609644836e-05, + "loss": 0.5617, + "step": 6654 + }, + { + "epoch": 1.0417971195992486, + "grad_norm": 0.8780500292778015, + "learning_rate": 4.995927012056045e-05, + "loss": 0.324, + "step": 6655 + }, + { + "epoch": 1.0419536631183468, + "grad_norm": 1.1632583141326904, + "learning_rate": 4.995112414467253e-05, + "loss": 0.4745, + "step": 6656 + }, + { + "epoch": 1.0421102066374452, + "grad_norm": 1.7200939655303955, + "learning_rate": 4.994297816878462e-05, + "loss": 0.4605, + "step": 6657 + }, + { + "epoch": 1.0422667501565435, + "grad_norm": 1.833055019378662, + "learning_rate": 4.993483219289671e-05, + "loss": 0.4432, + "step": 6658 + }, + { + 
"epoch": 1.042423293675642, + "grad_norm": 1.0805412530899048, + "learning_rate": 4.99266862170088e-05, + "loss": 0.3479, + "step": 6659 + }, + { + "epoch": 1.0425798371947401, + "grad_norm": 1.3027489185333252, + "learning_rate": 4.991854024112089e-05, + "loss": 0.2329, + "step": 6660 + }, + { + "epoch": 1.0427363807138383, + "grad_norm": 2.1370339393615723, + "learning_rate": 4.991039426523298e-05, + "loss": 0.4049, + "step": 6661 + }, + { + "epoch": 1.0428929242329368, + "grad_norm": 1.4715862274169922, + "learning_rate": 4.990224828934507e-05, + "loss": 0.4389, + "step": 6662 + }, + { + "epoch": 1.043049467752035, + "grad_norm": 2.7009165287017822, + "learning_rate": 4.989410231345715e-05, + "loss": 0.5555, + "step": 6663 + }, + { + "epoch": 1.0432060112711334, + "grad_norm": 3.2517948150634766, + "learning_rate": 4.988595633756924e-05, + "loss": 0.792, + "step": 6664 + }, + { + "epoch": 1.0433625547902317, + "grad_norm": 3.356839895248413, + "learning_rate": 4.987781036168133e-05, + "loss": 0.5749, + "step": 6665 + }, + { + "epoch": 1.04351909830933, + "grad_norm": 1.3069401979446411, + "learning_rate": 4.986966438579342e-05, + "loss": 0.5203, + "step": 6666 + }, + { + "epoch": 1.0436756418284283, + "grad_norm": 2.157499074935913, + "learning_rate": 4.9861518409905514e-05, + "loss": 0.4903, + "step": 6667 + }, + { + "epoch": 1.0438321853475265, + "grad_norm": 2.7970659732818604, + "learning_rate": 4.98533724340176e-05, + "loss": 0.5319, + "step": 6668 + }, + { + "epoch": 1.043988728866625, + "grad_norm": 4.686150074005127, + "learning_rate": 4.984522645812968e-05, + "loss": 0.6786, + "step": 6669 + }, + { + "epoch": 1.0441452723857232, + "grad_norm": 1.4876434803009033, + "learning_rate": 4.983708048224178e-05, + "loss": 0.2979, + "step": 6670 + }, + { + "epoch": 1.0443018159048216, + "grad_norm": 2.70512056350708, + "learning_rate": 4.982893450635386e-05, + "loss": 0.5909, + "step": 6671 + }, + { + "epoch": 1.0444583594239198, + "grad_norm": 
4.516408443450928, + "learning_rate": 4.982078853046595e-05, + "loss": 1.0476, + "step": 6672 + }, + { + "epoch": 1.044614902943018, + "grad_norm": 2.5637316703796387, + "learning_rate": 4.9812642554578044e-05, + "loss": 0.8341, + "step": 6673 + }, + { + "epoch": 1.0447714464621165, + "grad_norm": 2.988163948059082, + "learning_rate": 4.980449657869013e-05, + "loss": 1.053, + "step": 6674 + }, + { + "epoch": 1.0449279899812147, + "grad_norm": 3.953814744949341, + "learning_rate": 4.979635060280222e-05, + "loss": 0.7796, + "step": 6675 + }, + { + "epoch": 1.0450845335003132, + "grad_norm": 4.234143257141113, + "learning_rate": 4.978820462691431e-05, + "loss": 1.2284, + "step": 6676 + }, + { + "epoch": 1.0452410770194114, + "grad_norm": 3.1846773624420166, + "learning_rate": 4.978005865102639e-05, + "loss": 1.3192, + "step": 6677 + }, + { + "epoch": 1.0453976205385098, + "grad_norm": 4.99560022354126, + "learning_rate": 4.977191267513848e-05, + "loss": 1.1922, + "step": 6678 + }, + { + "epoch": 1.045554164057608, + "grad_norm": 5.184391975402832, + "learning_rate": 4.9763766699250573e-05, + "loss": 1.3004, + "step": 6679 + }, + { + "epoch": 1.0457107075767063, + "grad_norm": 7.28325080871582, + "learning_rate": 4.9755620723362664e-05, + "loss": 0.7086, + "step": 6680 + }, + { + "epoch": 1.0458672510958047, + "grad_norm": 3.0515623092651367, + "learning_rate": 4.974747474747475e-05, + "loss": 1.1059, + "step": 6681 + }, + { + "epoch": 1.046023794614903, + "grad_norm": 4.713110446929932, + "learning_rate": 4.973932877158684e-05, + "loss": 1.5569, + "step": 6682 + }, + { + "epoch": 1.0461803381340014, + "grad_norm": 1.9445428848266602, + "learning_rate": 4.973118279569893e-05, + "loss": 1.3306, + "step": 6683 + }, + { + "epoch": 1.0463368816530996, + "grad_norm": 1.7326585054397583, + "learning_rate": 4.972303681981101e-05, + "loss": 0.4203, + "step": 6684 + }, + { + "epoch": 1.0464934251721978, + "grad_norm": 2.6604175567626953, + "learning_rate": 
4.971489084392311e-05, + "loss": 0.5763, + "step": 6685 + }, + { + "epoch": 1.0466499686912962, + "grad_norm": 3.105912685394287, + "learning_rate": 4.9706744868035194e-05, + "loss": 1.1151, + "step": 6686 + }, + { + "epoch": 1.0468065122103944, + "grad_norm": 3.5240981578826904, + "learning_rate": 4.969859889214728e-05, + "loss": 0.5894, + "step": 6687 + }, + { + "epoch": 1.0469630557294929, + "grad_norm": 2.7693464756011963, + "learning_rate": 4.9690452916259375e-05, + "loss": 0.4594, + "step": 6688 + }, + { + "epoch": 1.047119599248591, + "grad_norm": 0.5213939547538757, + "learning_rate": 4.968230694037146e-05, + "loss": 0.2224, + "step": 6689 + }, + { + "epoch": 1.0472761427676893, + "grad_norm": 0.5196644067764282, + "learning_rate": 4.967416096448355e-05, + "loss": 0.2283, + "step": 6690 + }, + { + "epoch": 1.0474326862867878, + "grad_norm": 0.8004798293113708, + "learning_rate": 4.966601498859564e-05, + "loss": 0.2167, + "step": 6691 + }, + { + "epoch": 1.047589229805886, + "grad_norm": 0.6141131520271301, + "learning_rate": 4.965786901270772e-05, + "loss": 0.2111, + "step": 6692 + }, + { + "epoch": 1.0477457733249844, + "grad_norm": 0.7516882419586182, + "learning_rate": 4.9649723036819814e-05, + "loss": 0.189, + "step": 6693 + }, + { + "epoch": 1.0479023168440826, + "grad_norm": 1.048161506652832, + "learning_rate": 4.9641577060931904e-05, + "loss": 0.2938, + "step": 6694 + }, + { + "epoch": 1.0480588603631809, + "grad_norm": 0.5566027164459229, + "learning_rate": 4.963343108504399e-05, + "loss": 0.2245, + "step": 6695 + }, + { + "epoch": 1.0482154038822793, + "grad_norm": 0.5382580757141113, + "learning_rate": 4.962528510915608e-05, + "loss": 0.1999, + "step": 6696 + }, + { + "epoch": 1.0483719474013775, + "grad_norm": 1.0812475681304932, + "learning_rate": 4.961713913326817e-05, + "loss": 0.2364, + "step": 6697 + }, + { + "epoch": 1.048528490920476, + "grad_norm": 0.8649194836616516, + "learning_rate": 4.960899315738026e-05, + "loss": 0.224, + "step": 
6698 + }, + { + "epoch": 1.0486850344395742, + "grad_norm": 0.8285524845123291, + "learning_rate": 4.960084718149234e-05, + "loss": 0.2622, + "step": 6699 + }, + { + "epoch": 1.0488415779586726, + "grad_norm": 2.0754034519195557, + "learning_rate": 4.9592701205604434e-05, + "loss": 0.2968, + "step": 6700 + }, + { + "epoch": 1.0489981214777708, + "grad_norm": 0.5280343294143677, + "learning_rate": 4.9584555229716524e-05, + "loss": 0.1436, + "step": 6701 + }, + { + "epoch": 1.049154664996869, + "grad_norm": 1.13711678981781, + "learning_rate": 4.957640925382861e-05, + "loss": 0.4196, + "step": 6702 + }, + { + "epoch": 1.0493112085159675, + "grad_norm": 1.9892593622207642, + "learning_rate": 4.95682632779407e-05, + "loss": 0.3256, + "step": 6703 + }, + { + "epoch": 1.0494677520350657, + "grad_norm": 1.100034236907959, + "learning_rate": 4.956011730205279e-05, + "loss": 0.3691, + "step": 6704 + }, + { + "epoch": 1.0496242955541641, + "grad_norm": 2.1389739513397217, + "learning_rate": 4.955197132616487e-05, + "loss": 0.2876, + "step": 6705 + }, + { + "epoch": 1.0497808390732624, + "grad_norm": 1.3459503650665283, + "learning_rate": 4.954382535027697e-05, + "loss": 0.4902, + "step": 6706 + }, + { + "epoch": 1.0499373825923606, + "grad_norm": 1.258462905883789, + "learning_rate": 4.9535679374389054e-05, + "loss": 0.3253, + "step": 6707 + }, + { + "epoch": 1.050093926111459, + "grad_norm": 4.665914058685303, + "learning_rate": 4.9527533398501144e-05, + "loss": 0.2867, + "step": 6708 + }, + { + "epoch": 1.0502504696305572, + "grad_norm": 2.9002885818481445, + "learning_rate": 4.9519387422613235e-05, + "loss": 0.632, + "step": 6709 + }, + { + "epoch": 1.0504070131496557, + "grad_norm": 3.2716166973114014, + "learning_rate": 4.951124144672532e-05, + "loss": 0.6216, + "step": 6710 + }, + { + "epoch": 1.050563556668754, + "grad_norm": 1.5183744430541992, + "learning_rate": 4.950309547083741e-05, + "loss": 0.4617, + "step": 6711 + }, + { + "epoch": 1.0507201001878523, + 
"grad_norm": 1.3296854496002197, + "learning_rate": 4.94949494949495e-05, + "loss": 0.3841, + "step": 6712 + }, + { + "epoch": 1.0508766437069506, + "grad_norm": 2.082188606262207, + "learning_rate": 4.9486803519061584e-05, + "loss": 0.6278, + "step": 6713 + }, + { + "epoch": 1.0510331872260488, + "grad_norm": 3.051767587661743, + "learning_rate": 4.9478657543173674e-05, + "loss": 0.4719, + "step": 6714 + }, + { + "epoch": 1.0511897307451472, + "grad_norm": 2.9229235649108887, + "learning_rate": 4.9470511567285765e-05, + "loss": 0.9153, + "step": 6715 + }, + { + "epoch": 1.0513462742642454, + "grad_norm": 3.0239248275756836, + "learning_rate": 4.9462365591397855e-05, + "loss": 0.5968, + "step": 6716 + }, + { + "epoch": 1.0515028177833439, + "grad_norm": 5.21114444732666, + "learning_rate": 4.945421961550994e-05, + "loss": 0.6764, + "step": 6717 + }, + { + "epoch": 1.051659361302442, + "grad_norm": 1.855782151222229, + "learning_rate": 4.944607363962203e-05, + "loss": 0.8069, + "step": 6718 + }, + { + "epoch": 1.0518159048215403, + "grad_norm": 2.1413767337799072, + "learning_rate": 4.943792766373412e-05, + "loss": 1.2068, + "step": 6719 + }, + { + "epoch": 1.0519724483406387, + "grad_norm": 1.6339367628097534, + "learning_rate": 4.9429781687846204e-05, + "loss": 0.7228, + "step": 6720 + }, + { + "epoch": 1.052128991859737, + "grad_norm": 2.5088329315185547, + "learning_rate": 4.9421635711958294e-05, + "loss": 0.8869, + "step": 6721 + }, + { + "epoch": 1.0522855353788354, + "grad_norm": 4.1419501304626465, + "learning_rate": 4.9413489736070385e-05, + "loss": 0.7675, + "step": 6722 + }, + { + "epoch": 1.0524420788979336, + "grad_norm": 2.452402114868164, + "learning_rate": 4.940534376018247e-05, + "loss": 0.9393, + "step": 6723 + }, + { + "epoch": 1.0525986224170318, + "grad_norm": 5.489727973937988, + "learning_rate": 4.9397197784294566e-05, + "loss": 1.0705, + "step": 6724 + }, + { + "epoch": 1.0527551659361303, + "grad_norm": 3.9477028846740723, + "learning_rate": 
4.938905180840665e-05, + "loss": 0.6862, + "step": 6725 + }, + { + "epoch": 1.0529117094552285, + "grad_norm": 2.376145362854004, + "learning_rate": 4.938090583251874e-05, + "loss": 0.7596, + "step": 6726 + }, + { + "epoch": 1.053068252974327, + "grad_norm": 3.339566707611084, + "learning_rate": 4.937275985663083e-05, + "loss": 1.1442, + "step": 6727 + }, + { + "epoch": 1.0532247964934252, + "grad_norm": 3.727616786956787, + "learning_rate": 4.9364613880742914e-05, + "loss": 1.5521, + "step": 6728 + }, + { + "epoch": 1.0533813400125234, + "grad_norm": 4.805947303771973, + "learning_rate": 4.9356467904855005e-05, + "loss": 1.0923, + "step": 6729 + }, + { + "epoch": 1.0535378835316218, + "grad_norm": 2.9245474338531494, + "learning_rate": 4.9348321928967095e-05, + "loss": 1.2729, + "step": 6730 + }, + { + "epoch": 1.05369442705072, + "grad_norm": 3.7413411140441895, + "learning_rate": 4.934017595307918e-05, + "loss": 1.1353, + "step": 6731 + }, + { + "epoch": 1.0538509705698185, + "grad_norm": 4.752077579498291, + "learning_rate": 4.933202997719127e-05, + "loss": 1.0396, + "step": 6732 + }, + { + "epoch": 1.0540075140889167, + "grad_norm": 4.5017547607421875, + "learning_rate": 4.932388400130336e-05, + "loss": 1.2242, + "step": 6733 + }, + { + "epoch": 1.0541640576080151, + "grad_norm": 3.8800249099731445, + "learning_rate": 4.931573802541545e-05, + "loss": 0.7253, + "step": 6734 + }, + { + "epoch": 1.0543206011271133, + "grad_norm": 3.549839496612549, + "learning_rate": 4.9307592049527534e-05, + "loss": 1.4446, + "step": 6735 + }, + { + "epoch": 1.0544771446462116, + "grad_norm": 2.2889926433563232, + "learning_rate": 4.9299446073639625e-05, + "loss": 0.5819, + "step": 6736 + }, + { + "epoch": 1.05463368816531, + "grad_norm": 3.030524492263794, + "learning_rate": 4.9291300097751715e-05, + "loss": 1.1587, + "step": 6737 + }, + { + "epoch": 1.0547902316844082, + "grad_norm": 3.0870912075042725, + "learning_rate": 4.92831541218638e-05, + "loss": 0.5602, + "step": 6738 
+ }, + { + "epoch": 1.0549467752035067, + "grad_norm": 0.7909827828407288, + "learning_rate": 4.927500814597589e-05, + "loss": 0.243, + "step": 6739 + }, + { + "epoch": 1.0551033187226049, + "grad_norm": 0.5814756751060486, + "learning_rate": 4.926686217008798e-05, + "loss": 0.2855, + "step": 6740 + }, + { + "epoch": 1.055259862241703, + "grad_norm": 0.571857213973999, + "learning_rate": 4.9258716194200064e-05, + "loss": 0.1856, + "step": 6741 + }, + { + "epoch": 1.0554164057608015, + "grad_norm": 0.4724878668785095, + "learning_rate": 4.925057021831216e-05, + "loss": 0.2267, + "step": 6742 + }, + { + "epoch": 1.0555729492798998, + "grad_norm": 0.5813170075416565, + "learning_rate": 4.9242424242424245e-05, + "loss": 0.3143, + "step": 6743 + }, + { + "epoch": 1.0557294927989982, + "grad_norm": 0.7668220400810242, + "learning_rate": 4.9234278266536336e-05, + "loss": 0.3442, + "step": 6744 + }, + { + "epoch": 1.0558860363180964, + "grad_norm": 0.8871173858642578, + "learning_rate": 4.9226132290648426e-05, + "loss": 0.1965, + "step": 6745 + }, + { + "epoch": 1.0560425798371949, + "grad_norm": 1.2818444967269897, + "learning_rate": 4.921798631476051e-05, + "loss": 0.2485, + "step": 6746 + }, + { + "epoch": 1.056199123356293, + "grad_norm": 0.8827008605003357, + "learning_rate": 4.92098403388726e-05, + "loss": 0.321, + "step": 6747 + }, + { + "epoch": 1.0563556668753913, + "grad_norm": 1.1395732164382935, + "learning_rate": 4.920169436298469e-05, + "loss": 0.4879, + "step": 6748 + }, + { + "epoch": 1.0565122103944897, + "grad_norm": 0.8035351634025574, + "learning_rate": 4.9193548387096775e-05, + "loss": 0.256, + "step": 6749 + }, + { + "epoch": 1.056668753913588, + "grad_norm": 1.021380066871643, + "learning_rate": 4.9185402411208865e-05, + "loss": 0.2577, + "step": 6750 + }, + { + "epoch": 1.0568252974326864, + "grad_norm": 1.3839352130889893, + "learning_rate": 4.9177256435320956e-05, + "loss": 0.1877, + "step": 6751 + }, + { + "epoch": 1.0569818409517846, + 
"grad_norm": 1.411385416984558, + "learning_rate": 4.9169110459433046e-05, + "loss": 0.413, + "step": 6752 + }, + { + "epoch": 1.0571383844708828, + "grad_norm": 1.1416454315185547, + "learning_rate": 4.916096448354513e-05, + "loss": 0.2255, + "step": 6753 + }, + { + "epoch": 1.0572949279899813, + "grad_norm": 0.7078889012336731, + "learning_rate": 4.915281850765722e-05, + "loss": 0.304, + "step": 6754 + }, + { + "epoch": 1.0574514715090795, + "grad_norm": 0.940355122089386, + "learning_rate": 4.914467253176931e-05, + "loss": 0.4063, + "step": 6755 + }, + { + "epoch": 1.057608015028178, + "grad_norm": 2.1265530586242676, + "learning_rate": 4.9136526555881395e-05, + "loss": 0.4522, + "step": 6756 + }, + { + "epoch": 1.0577645585472761, + "grad_norm": 1.6780152320861816, + "learning_rate": 4.9128380579993485e-05, + "loss": 0.3762, + "step": 6757 + }, + { + "epoch": 1.0579211020663744, + "grad_norm": 1.1242471933364868, + "learning_rate": 4.9120234604105576e-05, + "loss": 0.4183, + "step": 6758 + }, + { + "epoch": 1.0580776455854728, + "grad_norm": 1.114508032798767, + "learning_rate": 4.911208862821766e-05, + "loss": 0.3805, + "step": 6759 + }, + { + "epoch": 1.058234189104571, + "grad_norm": 2.6315956115722656, + "learning_rate": 4.910394265232976e-05, + "loss": 0.5363, + "step": 6760 + }, + { + "epoch": 1.0583907326236695, + "grad_norm": 1.9601577520370483, + "learning_rate": 4.909579667644184e-05, + "loss": 0.7805, + "step": 6761 + }, + { + "epoch": 1.0585472761427677, + "grad_norm": 1.504913568496704, + "learning_rate": 4.9087650700553924e-05, + "loss": 0.181, + "step": 6762 + }, + { + "epoch": 1.0587038196618659, + "grad_norm": 5.147781848907471, + "learning_rate": 4.907950472466602e-05, + "loss": 1.1409, + "step": 6763 + }, + { + "epoch": 1.0588603631809643, + "grad_norm": 1.9039956331253052, + "learning_rate": 4.9071358748778105e-05, + "loss": 0.516, + "step": 6764 + }, + { + "epoch": 1.0590169067000625, + "grad_norm": 2.0624444484710693, + "learning_rate": 
4.9063212772890196e-05, + "loss": 0.549, + "step": 6765 + }, + { + "epoch": 1.059173450219161, + "grad_norm": 1.513196587562561, + "learning_rate": 4.9055066797002287e-05, + "loss": 0.524, + "step": 6766 + }, + { + "epoch": 1.0593299937382592, + "grad_norm": 2.6370248794555664, + "learning_rate": 4.904692082111437e-05, + "loss": 0.6807, + "step": 6767 + }, + { + "epoch": 1.0594865372573576, + "grad_norm": 2.6152403354644775, + "learning_rate": 4.903877484522646e-05, + "loss": 0.6255, + "step": 6768 + }, + { + "epoch": 1.0596430807764559, + "grad_norm": 4.643380165100098, + "learning_rate": 4.903062886933855e-05, + "loss": 1.3625, + "step": 6769 + }, + { + "epoch": 1.059799624295554, + "grad_norm": 6.062804698944092, + "learning_rate": 4.902248289345064e-05, + "loss": 0.7233, + "step": 6770 + }, + { + "epoch": 1.0599561678146525, + "grad_norm": 8.514850616455078, + "learning_rate": 4.9014336917562726e-05, + "loss": 0.8532, + "step": 6771 + }, + { + "epoch": 1.0601127113337507, + "grad_norm": 2.5692520141601562, + "learning_rate": 4.9006190941674816e-05, + "loss": 0.8265, + "step": 6772 + }, + { + "epoch": 1.0602692548528492, + "grad_norm": 3.187839984893799, + "learning_rate": 4.899804496578691e-05, + "loss": 1.1954, + "step": 6773 + }, + { + "epoch": 1.0604257983719474, + "grad_norm": 2.942547082901001, + "learning_rate": 4.898989898989899e-05, + "loss": 1.0381, + "step": 6774 + }, + { + "epoch": 1.0605823418910456, + "grad_norm": 3.837843656539917, + "learning_rate": 4.898175301401108e-05, + "loss": 1.6482, + "step": 6775 + }, + { + "epoch": 1.060738885410144, + "grad_norm": 3.651784896850586, + "learning_rate": 4.897360703812317e-05, + "loss": 1.0089, + "step": 6776 + }, + { + "epoch": 1.0608954289292423, + "grad_norm": 2.431555986404419, + "learning_rate": 4.8965461062235255e-05, + "loss": 0.7505, + "step": 6777 + }, + { + "epoch": 1.0610519724483407, + "grad_norm": 4.5881123542785645, + "learning_rate": 4.8957315086347346e-05, + "loss": 0.9132, + "step": 6778 + 
}, + { + "epoch": 1.061208515967439, + "grad_norm": 2.178025722503662, + "learning_rate": 4.8949169110459436e-05, + "loss": 1.1276, + "step": 6779 + }, + { + "epoch": 1.0613650594865374, + "grad_norm": 3.0926101207733154, + "learning_rate": 4.894102313457152e-05, + "loss": 0.9213, + "step": 6780 + }, + { + "epoch": 1.0615216030056356, + "grad_norm": 6.249564170837402, + "learning_rate": 4.893287715868361e-05, + "loss": 1.5745, + "step": 6781 + }, + { + "epoch": 1.0616781465247338, + "grad_norm": 2.3513906002044678, + "learning_rate": 4.89247311827957e-05, + "loss": 0.812, + "step": 6782 + }, + { + "epoch": 1.0618346900438322, + "grad_norm": 2.0852646827697754, + "learning_rate": 4.891658520690779e-05, + "loss": 1.1183, + "step": 6783 + }, + { + "epoch": 1.0619912335629305, + "grad_norm": 2.8138318061828613, + "learning_rate": 4.8908439231019875e-05, + "loss": 0.5087, + "step": 6784 + }, + { + "epoch": 1.062147777082029, + "grad_norm": 2.8632090091705322, + "learning_rate": 4.8900293255131966e-05, + "loss": 0.8794, + "step": 6785 + }, + { + "epoch": 1.0623043206011271, + "grad_norm": 4.016355514526367, + "learning_rate": 4.8892147279244056e-05, + "loss": 0.6256, + "step": 6786 + }, + { + "epoch": 1.0624608641202253, + "grad_norm": 3.723677158355713, + "learning_rate": 4.888400130335614e-05, + "loss": 1.1446, + "step": 6787 + }, + { + "epoch": 1.0626174076393238, + "grad_norm": 4.918694496154785, + "learning_rate": 4.887585532746823e-05, + "loss": 0.8213, + "step": 6788 + }, + { + "epoch": 1.062773951158422, + "grad_norm": 0.877237856388092, + "learning_rate": 4.886770935158032e-05, + "loss": 0.3141, + "step": 6789 + }, + { + "epoch": 1.0629304946775204, + "grad_norm": 0.6738548278808594, + "learning_rate": 4.8859563375692405e-05, + "loss": 0.2808, + "step": 6790 + }, + { + "epoch": 1.0630870381966186, + "grad_norm": 0.5218839049339294, + "learning_rate": 4.88514173998045e-05, + "loss": 0.2675, + "step": 6791 + }, + { + "epoch": 1.0632435817157169, + "grad_norm": 
0.5013272166252136, + "learning_rate": 4.8843271423916586e-05, + "loss": 0.2127, + "step": 6792 + }, + { + "epoch": 1.0634001252348153, + "grad_norm": 0.581167459487915, + "learning_rate": 4.8835125448028677e-05, + "loss": 0.2212, + "step": 6793 + }, + { + "epoch": 1.0635566687539135, + "grad_norm": 0.5769028663635254, + "learning_rate": 4.882697947214077e-05, + "loss": 0.2934, + "step": 6794 + }, + { + "epoch": 1.063713212273012, + "grad_norm": 0.910311758518219, + "learning_rate": 4.881883349625285e-05, + "loss": 0.3238, + "step": 6795 + }, + { + "epoch": 1.0638697557921102, + "grad_norm": 2.146749496459961, + "learning_rate": 4.881068752036494e-05, + "loss": 0.4033, + "step": 6796 + }, + { + "epoch": 1.0640262993112084, + "grad_norm": 1.062942385673523, + "learning_rate": 4.880254154447703e-05, + "loss": 0.3196, + "step": 6797 + }, + { + "epoch": 1.0641828428303068, + "grad_norm": 0.6565183401107788, + "learning_rate": 4.8794395568589116e-05, + "loss": 0.2167, + "step": 6798 + }, + { + "epoch": 1.064339386349405, + "grad_norm": 1.172593116760254, + "learning_rate": 4.8786249592701206e-05, + "loss": 0.2917, + "step": 6799 + }, + { + "epoch": 1.0644959298685035, + "grad_norm": 1.303518533706665, + "learning_rate": 4.87781036168133e-05, + "loss": 0.3858, + "step": 6800 + }, + { + "epoch": 1.0646524733876017, + "grad_norm": 0.7082028388977051, + "learning_rate": 4.876995764092539e-05, + "loss": 0.1744, + "step": 6801 + }, + { + "epoch": 1.0648090169067002, + "grad_norm": 0.8552493453025818, + "learning_rate": 4.876181166503747e-05, + "loss": 0.3118, + "step": 6802 + }, + { + "epoch": 1.0649655604257984, + "grad_norm": 1.3199973106384277, + "learning_rate": 4.875366568914956e-05, + "loss": 0.4782, + "step": 6803 + }, + { + "epoch": 1.0651221039448966, + "grad_norm": 0.9895820021629333, + "learning_rate": 4.874551971326165e-05, + "loss": 0.2432, + "step": 6804 + }, + { + "epoch": 1.065278647463995, + "grad_norm": 1.1748993396759033, + "learning_rate": 
4.8737373737373736e-05, + "loss": 0.3518, + "step": 6805 + }, + { + "epoch": 1.0654351909830932, + "grad_norm": 1.1158841848373413, + "learning_rate": 4.8729227761485826e-05, + "loss": 0.2376, + "step": 6806 + }, + { + "epoch": 1.0655917345021917, + "grad_norm": 1.2025699615478516, + "learning_rate": 4.872108178559792e-05, + "loss": 0.4535, + "step": 6807 + }, + { + "epoch": 1.06574827802129, + "grad_norm": 1.1096374988555908, + "learning_rate": 4.871293580971e-05, + "loss": 0.5053, + "step": 6808 + }, + { + "epoch": 1.0659048215403881, + "grad_norm": 1.7945232391357422, + "learning_rate": 4.87047898338221e-05, + "loss": 0.5466, + "step": 6809 + }, + { + "epoch": 1.0660613650594866, + "grad_norm": 2.7603957653045654, + "learning_rate": 4.869664385793418e-05, + "loss": 0.3258, + "step": 6810 + }, + { + "epoch": 1.0662179085785848, + "grad_norm": 1.8197838068008423, + "learning_rate": 4.868849788204627e-05, + "loss": 0.2955, + "step": 6811 + }, + { + "epoch": 1.0663744520976832, + "grad_norm": 1.68156898021698, + "learning_rate": 4.868035190615836e-05, + "loss": 0.4522, + "step": 6812 + }, + { + "epoch": 1.0665309956167814, + "grad_norm": 1.4183430671691895, + "learning_rate": 4.8672205930270446e-05, + "loss": 0.5002, + "step": 6813 + }, + { + "epoch": 1.0666875391358799, + "grad_norm": 2.6327524185180664, + "learning_rate": 4.866405995438254e-05, + "loss": 0.6865, + "step": 6814 + }, + { + "epoch": 1.066844082654978, + "grad_norm": 1.8330765962600708, + "learning_rate": 4.865591397849463e-05, + "loss": 0.729, + "step": 6815 + }, + { + "epoch": 1.0670006261740763, + "grad_norm": 3.4433417320251465, + "learning_rate": 4.864776800260671e-05, + "loss": 0.66, + "step": 6816 + }, + { + "epoch": 1.0671571696931748, + "grad_norm": 1.331876277923584, + "learning_rate": 4.86396220267188e-05, + "loss": 0.3788, + "step": 6817 + }, + { + "epoch": 1.067313713212273, + "grad_norm": 3.1101908683776855, + "learning_rate": 4.863147605083089e-05, + "loss": 0.6454, + "step": 6818 + }, 
+ { + "epoch": 1.0674702567313714, + "grad_norm": 2.420680284500122, + "learning_rate": 4.862333007494298e-05, + "loss": 0.6873, + "step": 6819 + }, + { + "epoch": 1.0676268002504696, + "grad_norm": 2.1393580436706543, + "learning_rate": 4.8615184099055066e-05, + "loss": 0.665, + "step": 6820 + }, + { + "epoch": 1.0677833437695678, + "grad_norm": 2.1372129917144775, + "learning_rate": 4.860703812316716e-05, + "loss": 0.8046, + "step": 6821 + }, + { + "epoch": 1.0679398872886663, + "grad_norm": 3.244813919067383, + "learning_rate": 4.859889214727925e-05, + "loss": 0.7903, + "step": 6822 + }, + { + "epoch": 1.0680964308077645, + "grad_norm": 3.2950539588928223, + "learning_rate": 4.859074617139133e-05, + "loss": 1.1653, + "step": 6823 + }, + { + "epoch": 1.068252974326863, + "grad_norm": 2.7189440727233887, + "learning_rate": 4.858260019550342e-05, + "loss": 0.6442, + "step": 6824 + }, + { + "epoch": 1.0684095178459612, + "grad_norm": 3.235990285873413, + "learning_rate": 4.857445421961551e-05, + "loss": 0.5187, + "step": 6825 + }, + { + "epoch": 1.0685660613650594, + "grad_norm": 5.094785213470459, + "learning_rate": 4.8566308243727596e-05, + "loss": 0.7299, + "step": 6826 + }, + { + "epoch": 1.0687226048841578, + "grad_norm": 3.930760145187378, + "learning_rate": 4.855816226783969e-05, + "loss": 1.0198, + "step": 6827 + }, + { + "epoch": 1.068879148403256, + "grad_norm": 2.0470924377441406, + "learning_rate": 4.855001629195178e-05, + "loss": 0.719, + "step": 6828 + }, + { + "epoch": 1.0690356919223545, + "grad_norm": 2.979560136795044, + "learning_rate": 4.854187031606387e-05, + "loss": 1.3104, + "step": 6829 + }, + { + "epoch": 1.0691922354414527, + "grad_norm": 3.4131393432617188, + "learning_rate": 4.853372434017596e-05, + "loss": 0.8223, + "step": 6830 + }, + { + "epoch": 1.069348778960551, + "grad_norm": 4.642034530639648, + "learning_rate": 4.852557836428804e-05, + "loss": 0.9159, + "step": 6831 + }, + { + "epoch": 1.0695053224796494, + "grad_norm": 
2.865239381790161, + "learning_rate": 4.851743238840013e-05, + "loss": 0.744, + "step": 6832 + }, + { + "epoch": 1.0696618659987476, + "grad_norm": 4.736968517303467, + "learning_rate": 4.850928641251222e-05, + "loss": 0.8852, + "step": 6833 + }, + { + "epoch": 1.069818409517846, + "grad_norm": 3.0678436756134033, + "learning_rate": 4.850114043662431e-05, + "loss": 0.7275, + "step": 6834 + }, + { + "epoch": 1.0699749530369442, + "grad_norm": 1.9453117847442627, + "learning_rate": 4.84929944607364e-05, + "loss": 0.5078, + "step": 6835 + }, + { + "epoch": 1.0701314965560427, + "grad_norm": 2.9627764225006104, + "learning_rate": 4.848484848484849e-05, + "loss": 1.2258, + "step": 6836 + }, + { + "epoch": 1.070288040075141, + "grad_norm": 2.671152114868164, + "learning_rate": 4.847670250896058e-05, + "loss": 0.4451, + "step": 6837 + }, + { + "epoch": 1.070444583594239, + "grad_norm": 4.325310707092285, + "learning_rate": 4.846855653307266e-05, + "loss": 0.6096, + "step": 6838 + }, + { + "epoch": 1.0706011271133375, + "grad_norm": 0.5964329242706299, + "learning_rate": 4.846041055718475e-05, + "loss": 0.3169, + "step": 6839 + }, + { + "epoch": 1.0707576706324358, + "grad_norm": 0.40914592146873474, + "learning_rate": 4.845226458129684e-05, + "loss": 0.1426, + "step": 6840 + }, + { + "epoch": 1.0709142141515342, + "grad_norm": 0.4691919982433319, + "learning_rate": 4.844411860540893e-05, + "loss": 0.235, + "step": 6841 + }, + { + "epoch": 1.0710707576706324, + "grad_norm": 0.6279969811439514, + "learning_rate": 4.843597262952102e-05, + "loss": 0.1306, + "step": 6842 + }, + { + "epoch": 1.0712273011897309, + "grad_norm": 1.0746411085128784, + "learning_rate": 4.842782665363311e-05, + "loss": 0.4056, + "step": 6843 + }, + { + "epoch": 1.071383844708829, + "grad_norm": 0.5242482423782349, + "learning_rate": 4.841968067774519e-05, + "loss": 0.1821, + "step": 6844 + }, + { + "epoch": 1.0715403882279273, + "grad_norm": 0.5771825313568115, + "learning_rate": 
4.841153470185729e-05, + "loss": 0.2363, + "step": 6845 + }, + { + "epoch": 1.0716969317470257, + "grad_norm": 0.681204080581665, + "learning_rate": 4.840338872596937e-05, + "loss": 0.2136, + "step": 6846 + }, + { + "epoch": 1.071853475266124, + "grad_norm": 0.8976181149482727, + "learning_rate": 4.8395242750081456e-05, + "loss": 0.1958, + "step": 6847 + }, + { + "epoch": 1.0720100187852224, + "grad_norm": 0.8619001507759094, + "learning_rate": 4.8387096774193554e-05, + "loss": 0.2367, + "step": 6848 + }, + { + "epoch": 1.0721665623043206, + "grad_norm": 0.8263822793960571, + "learning_rate": 4.837895079830564e-05, + "loss": 0.1895, + "step": 6849 + }, + { + "epoch": 1.0723231058234188, + "grad_norm": 1.0600205659866333, + "learning_rate": 4.837080482241773e-05, + "loss": 0.3205, + "step": 6850 + }, + { + "epoch": 1.0724796493425173, + "grad_norm": 1.9708024263381958, + "learning_rate": 4.836265884652982e-05, + "loss": 0.4525, + "step": 6851 + }, + { + "epoch": 1.0726361928616155, + "grad_norm": 1.2705374956130981, + "learning_rate": 4.83545128706419e-05, + "loss": 0.3878, + "step": 6852 + }, + { + "epoch": 1.072792736380714, + "grad_norm": 0.9221144914627075, + "learning_rate": 4.834636689475399e-05, + "loss": 0.2958, + "step": 6853 + }, + { + "epoch": 1.0729492798998121, + "grad_norm": 1.265500545501709, + "learning_rate": 4.833822091886608e-05, + "loss": 0.4911, + "step": 6854 + }, + { + "epoch": 1.0731058234189104, + "grad_norm": 0.9922137260437012, + "learning_rate": 4.8330074942978174e-05, + "loss": 0.3487, + "step": 6855 + }, + { + "epoch": 1.0732623669380088, + "grad_norm": 2.4258639812469482, + "learning_rate": 4.832192896709026e-05, + "loss": 0.5438, + "step": 6856 + }, + { + "epoch": 1.073418910457107, + "grad_norm": 1.2685762643814087, + "learning_rate": 4.831378299120235e-05, + "loss": 0.3721, + "step": 6857 + }, + { + "epoch": 1.0735754539762055, + "grad_norm": 1.6824244260787964, + "learning_rate": 4.830563701531444e-05, + "loss": 0.3719, + "step": 
6858 + }, + { + "epoch": 1.0737319974953037, + "grad_norm": 1.280900239944458, + "learning_rate": 4.829749103942652e-05, + "loss": 0.3036, + "step": 6859 + }, + { + "epoch": 1.073888541014402, + "grad_norm": 2.112321615219116, + "learning_rate": 4.828934506353861e-05, + "loss": 0.4047, + "step": 6860 + }, + { + "epoch": 1.0740450845335003, + "grad_norm": 2.192340135574341, + "learning_rate": 4.8281199087650703e-05, + "loss": 0.5004, + "step": 6861 + }, + { + "epoch": 1.0742016280525986, + "grad_norm": 2.2779247760772705, + "learning_rate": 4.827305311176279e-05, + "loss": 0.5282, + "step": 6862 + }, + { + "epoch": 1.074358171571697, + "grad_norm": 2.1559882164001465, + "learning_rate": 4.8264907135874885e-05, + "loss": 0.6913, + "step": 6863 + }, + { + "epoch": 1.0745147150907952, + "grad_norm": 1.8766790628433228, + "learning_rate": 4.825676115998697e-05, + "loss": 0.4102, + "step": 6864 + }, + { + "epoch": 1.0746712586098937, + "grad_norm": 2.494537591934204, + "learning_rate": 4.824861518409905e-05, + "loss": 0.7491, + "step": 6865 + }, + { + "epoch": 1.0748278021289919, + "grad_norm": 2.574007034301758, + "learning_rate": 4.824046920821115e-05, + "loss": 0.4448, + "step": 6866 + }, + { + "epoch": 1.07498434564809, + "grad_norm": 2.12300443649292, + "learning_rate": 4.823232323232323e-05, + "loss": 0.5627, + "step": 6867 + }, + { + "epoch": 1.0751408891671885, + "grad_norm": 2.8537344932556152, + "learning_rate": 4.8224177256435324e-05, + "loss": 0.8907, + "step": 6868 + }, + { + "epoch": 1.0752974326862867, + "grad_norm": 3.584474563598633, + "learning_rate": 4.8216031280547414e-05, + "loss": 0.5294, + "step": 6869 + }, + { + "epoch": 1.0754539762053852, + "grad_norm": 3.263317584991455, + "learning_rate": 4.82078853046595e-05, + "loss": 0.622, + "step": 6870 + }, + { + "epoch": 1.0756105197244834, + "grad_norm": 2.000978469848633, + "learning_rate": 4.819973932877159e-05, + "loss": 0.4999, + "step": 6871 + }, + { + "epoch": 1.0757670632435816, + "grad_norm": 
2.828787326812744, + "learning_rate": 4.819159335288368e-05, + "loss": 0.9799, + "step": 6872 + }, + { + "epoch": 1.07592360676268, + "grad_norm": 3.242637872695923, + "learning_rate": 4.818344737699577e-05, + "loss": 1.0103, + "step": 6873 + }, + { + "epoch": 1.0760801502817783, + "grad_norm": 3.468906879425049, + "learning_rate": 4.817530140110785e-05, + "loss": 0.6955, + "step": 6874 + }, + { + "epoch": 1.0762366938008767, + "grad_norm": 4.24753475189209, + "learning_rate": 4.8167155425219944e-05, + "loss": 0.9113, + "step": 6875 + }, + { + "epoch": 1.076393237319975, + "grad_norm": 3.2321040630340576, + "learning_rate": 4.8159009449332034e-05, + "loss": 0.7864, + "step": 6876 + }, + { + "epoch": 1.0765497808390734, + "grad_norm": 4.477276802062988, + "learning_rate": 4.815086347344412e-05, + "loss": 1.473, + "step": 6877 + }, + { + "epoch": 1.0767063243581716, + "grad_norm": 3.578273296356201, + "learning_rate": 4.814271749755621e-05, + "loss": 0.555, + "step": 6878 + }, + { + "epoch": 1.0768628678772698, + "grad_norm": 2.6200077533721924, + "learning_rate": 4.81345715216683e-05, + "loss": 1.267, + "step": 6879 + }, + { + "epoch": 1.0770194113963683, + "grad_norm": 25.329809188842773, + "learning_rate": 4.812642554578038e-05, + "loss": 1.1929, + "step": 6880 + }, + { + "epoch": 1.0771759549154665, + "grad_norm": 6.0253825187683105, + "learning_rate": 4.811827956989248e-05, + "loss": 0.9742, + "step": 6881 + }, + { + "epoch": 1.077332498434565, + "grad_norm": 1.6082556247711182, + "learning_rate": 4.8110133594004564e-05, + "loss": 0.693, + "step": 6882 + }, + { + "epoch": 1.0774890419536631, + "grad_norm": 2.11550235748291, + "learning_rate": 4.810198761811665e-05, + "loss": 0.502, + "step": 6883 + }, + { + "epoch": 1.0776455854727613, + "grad_norm": 3.245015859603882, + "learning_rate": 4.8093841642228745e-05, + "loss": 0.871, + "step": 6884 + }, + { + "epoch": 1.0778021289918598, + "grad_norm": 2.401534080505371, + "learning_rate": 4.808569566634083e-05, + 
"loss": 0.4188, + "step": 6885 + }, + { + "epoch": 1.077958672510958, + "grad_norm": 3.9887232780456543, + "learning_rate": 4.807754969045292e-05, + "loss": 0.8584, + "step": 6886 + }, + { + "epoch": 1.0781152160300564, + "grad_norm": 2.569413423538208, + "learning_rate": 4.806940371456501e-05, + "loss": 0.4928, + "step": 6887 + }, + { + "epoch": 1.0782717595491547, + "grad_norm": 3.0119240283966064, + "learning_rate": 4.8061257738677093e-05, + "loss": 0.6635, + "step": 6888 + }, + { + "epoch": 1.0784283030682529, + "grad_norm": 0.45537278056144714, + "learning_rate": 4.8053111762789184e-05, + "loss": 0.2335, + "step": 6889 + }, + { + "epoch": 1.0785848465873513, + "grad_norm": 0.4793911576271057, + "learning_rate": 4.8044965786901275e-05, + "loss": 0.2101, + "step": 6890 + }, + { + "epoch": 1.0787413901064495, + "grad_norm": 0.4080857038497925, + "learning_rate": 4.803681981101336e-05, + "loss": 0.1588, + "step": 6891 + }, + { + "epoch": 1.078897933625548, + "grad_norm": 1.0198720693588257, + "learning_rate": 4.802867383512545e-05, + "loss": 0.2603, + "step": 6892 + }, + { + "epoch": 1.0790544771446462, + "grad_norm": 0.7360960841178894, + "learning_rate": 4.802052785923754e-05, + "loss": 0.2137, + "step": 6893 + }, + { + "epoch": 1.0792110206637444, + "grad_norm": 0.6191814541816711, + "learning_rate": 4.801238188334963e-05, + "loss": 0.2409, + "step": 6894 + }, + { + "epoch": 1.0793675641828429, + "grad_norm": 0.9502469301223755, + "learning_rate": 4.8004235907461714e-05, + "loss": 0.346, + "step": 6895 + }, + { + "epoch": 1.079524107701941, + "grad_norm": 0.7830762267112732, + "learning_rate": 4.7996089931573804e-05, + "loss": 0.1688, + "step": 6896 + }, + { + "epoch": 1.0796806512210395, + "grad_norm": 0.971585214138031, + "learning_rate": 4.7987943955685895e-05, + "loss": 0.2827, + "step": 6897 + }, + { + "epoch": 1.0798371947401377, + "grad_norm": 0.9651789665222168, + "learning_rate": 4.797979797979798e-05, + "loss": 0.2943, + "step": 6898 + }, + { + 
"epoch": 1.0799937382592362, + "grad_norm": 0.6673857569694519, + "learning_rate": 4.7971652003910076e-05, + "loss": 0.2456, + "step": 6899 + }, + { + "epoch": 1.0801502817783344, + "grad_norm": 1.0560170412063599, + "learning_rate": 4.796350602802216e-05, + "loss": 0.2605, + "step": 6900 + }, + { + "epoch": 1.0803068252974326, + "grad_norm": 1.4736162424087524, + "learning_rate": 4.795536005213424e-05, + "loss": 0.3816, + "step": 6901 + }, + { + "epoch": 1.080463368816531, + "grad_norm": 1.2288436889648438, + "learning_rate": 4.794721407624634e-05, + "loss": 0.6036, + "step": 6902 + }, + { + "epoch": 1.0806199123356293, + "grad_norm": 0.7832455635070801, + "learning_rate": 4.7939068100358424e-05, + "loss": 0.267, + "step": 6903 + }, + { + "epoch": 1.0807764558547277, + "grad_norm": 1.7882494926452637, + "learning_rate": 4.7930922124470515e-05, + "loss": 0.2924, + "step": 6904 + }, + { + "epoch": 1.080932999373826, + "grad_norm": 1.11783766746521, + "learning_rate": 4.7922776148582605e-05, + "loss": 0.2749, + "step": 6905 + }, + { + "epoch": 1.0810895428929241, + "grad_norm": 0.8063753247261047, + "learning_rate": 4.791463017269469e-05, + "loss": 0.2971, + "step": 6906 + }, + { + "epoch": 1.0812460864120226, + "grad_norm": 2.4267823696136475, + "learning_rate": 4.790648419680678e-05, + "loss": 0.5115, + "step": 6907 + }, + { + "epoch": 1.0814026299311208, + "grad_norm": 2.2946929931640625, + "learning_rate": 4.789833822091887e-05, + "loss": 0.3627, + "step": 6908 + }, + { + "epoch": 1.0815591734502192, + "grad_norm": 1.356031894683838, + "learning_rate": 4.7890192245030954e-05, + "loss": 0.386, + "step": 6909 + }, + { + "epoch": 1.0817157169693175, + "grad_norm": 2.905677556991577, + "learning_rate": 4.7882046269143044e-05, + "loss": 0.6916, + "step": 6910 + }, + { + "epoch": 1.081872260488416, + "grad_norm": 5.2382988929748535, + "learning_rate": 4.7873900293255135e-05, + "loss": 0.9345, + "step": 6911 + }, + { + "epoch": 1.0820288040075141, + "grad_norm": 
4.306769371032715, + "learning_rate": 4.7865754317367225e-05, + "loss": 0.8691, + "step": 6912 + }, + { + "epoch": 1.0821853475266123, + "grad_norm": 2.1271374225616455, + "learning_rate": 4.785760834147931e-05, + "loss": 0.5895, + "step": 6913 + }, + { + "epoch": 1.0823418910457108, + "grad_norm": 2.4963855743408203, + "learning_rate": 4.78494623655914e-05, + "loss": 0.5591, + "step": 6914 + }, + { + "epoch": 1.082498434564809, + "grad_norm": 4.370108604431152, + "learning_rate": 4.784131638970349e-05, + "loss": 0.6943, + "step": 6915 + }, + { + "epoch": 1.0826549780839074, + "grad_norm": 2.665440082550049, + "learning_rate": 4.7833170413815574e-05, + "loss": 0.7252, + "step": 6916 + }, + { + "epoch": 1.0828115216030056, + "grad_norm": 3.9221231937408447, + "learning_rate": 4.782502443792767e-05, + "loss": 0.912, + "step": 6917 + }, + { + "epoch": 1.0829680651221039, + "grad_norm": 2.410745143890381, + "learning_rate": 4.7816878462039755e-05, + "loss": 1.0796, + "step": 6918 + }, + { + "epoch": 1.0831246086412023, + "grad_norm": 4.316107749938965, + "learning_rate": 4.780873248615184e-05, + "loss": 0.7056, + "step": 6919 + }, + { + "epoch": 1.0832811521603005, + "grad_norm": 3.7121410369873047, + "learning_rate": 4.7800586510263936e-05, + "loss": 0.9123, + "step": 6920 + }, + { + "epoch": 1.083437695679399, + "grad_norm": 3.3907828330993652, + "learning_rate": 4.779244053437602e-05, + "loss": 1.1279, + "step": 6921 + }, + { + "epoch": 1.0835942391984972, + "grad_norm": 3.277078866958618, + "learning_rate": 4.778429455848811e-05, + "loss": 0.9135, + "step": 6922 + }, + { + "epoch": 1.0837507827175954, + "grad_norm": 2.793943166732788, + "learning_rate": 4.77761485826002e-05, + "loss": 0.8357, + "step": 6923 + }, + { + "epoch": 1.0839073262366938, + "grad_norm": 2.951786518096924, + "learning_rate": 4.7768002606712285e-05, + "loss": 0.9736, + "step": 6924 + }, + { + "epoch": 1.084063869755792, + "grad_norm": 5.774934768676758, + "learning_rate": 
4.7759856630824375e-05, + "loss": 1.0674, + "step": 6925 + }, + { + "epoch": 1.0842204132748905, + "grad_norm": 2.9892711639404297, + "learning_rate": 4.7751710654936466e-05, + "loss": 0.876, + "step": 6926 + }, + { + "epoch": 1.0843769567939887, + "grad_norm": 2.8682878017425537, + "learning_rate": 4.774356467904855e-05, + "loss": 0.886, + "step": 6927 + }, + { + "epoch": 1.084533500313087, + "grad_norm": 2.688657283782959, + "learning_rate": 4.773541870316064e-05, + "loss": 0.93, + "step": 6928 + }, + { + "epoch": 1.0846900438321854, + "grad_norm": 2.829371452331543, + "learning_rate": 4.772727272727273e-05, + "loss": 1.103, + "step": 6929 + }, + { + "epoch": 1.0848465873512836, + "grad_norm": 3.5289573669433594, + "learning_rate": 4.771912675138482e-05, + "loss": 1.1429, + "step": 6930 + }, + { + "epoch": 1.085003130870382, + "grad_norm": 2.9953837394714355, + "learning_rate": 4.7710980775496905e-05, + "loss": 1.4431, + "step": 6931 + }, + { + "epoch": 1.0851596743894802, + "grad_norm": 2.281331777572632, + "learning_rate": 4.7702834799608995e-05, + "loss": 1.3537, + "step": 6932 + }, + { + "epoch": 1.0853162179085787, + "grad_norm": 2.8594393730163574, + "learning_rate": 4.7694688823721086e-05, + "loss": 1.5825, + "step": 6933 + }, + { + "epoch": 1.085472761427677, + "grad_norm": 3.7858548164367676, + "learning_rate": 4.768654284783317e-05, + "loss": 0.2768, + "step": 6934 + }, + { + "epoch": 1.0856293049467751, + "grad_norm": 2.329890727996826, + "learning_rate": 4.767839687194526e-05, + "loss": 0.9226, + "step": 6935 + }, + { + "epoch": 1.0857858484658736, + "grad_norm": 3.329467296600342, + "learning_rate": 4.767025089605735e-05, + "loss": 0.9218, + "step": 6936 + }, + { + "epoch": 1.0859423919849718, + "grad_norm": 4.156576156616211, + "learning_rate": 4.7662104920169434e-05, + "loss": 0.8344, + "step": 6937 + }, + { + "epoch": 1.0860989355040702, + "grad_norm": 3.3976616859436035, + "learning_rate": 4.765395894428153e-05, + "loss": 0.8931, + "step": 6938 + 
}, + { + "epoch": 1.0862554790231684, + "grad_norm": 0.34758007526397705, + "learning_rate": 4.7645812968393615e-05, + "loss": 0.1885, + "step": 6939 + }, + { + "epoch": 1.0864120225422667, + "grad_norm": 0.36093318462371826, + "learning_rate": 4.7637666992505706e-05, + "loss": 0.2027, + "step": 6940 + }, + { + "epoch": 1.086568566061365, + "grad_norm": 0.5725318789482117, + "learning_rate": 4.7629521016617796e-05, + "loss": 0.2282, + "step": 6941 + }, + { + "epoch": 1.0867251095804633, + "grad_norm": 0.4558492600917816, + "learning_rate": 4.762137504072988e-05, + "loss": 0.191, + "step": 6942 + }, + { + "epoch": 1.0868816530995618, + "grad_norm": 0.6670626997947693, + "learning_rate": 4.761322906484197e-05, + "loss": 0.2413, + "step": 6943 + }, + { + "epoch": 1.08703819661866, + "grad_norm": 0.45883703231811523, + "learning_rate": 4.760508308895406e-05, + "loss": 0.191, + "step": 6944 + }, + { + "epoch": 1.0871947401377584, + "grad_norm": 0.46798765659332275, + "learning_rate": 4.7596937113066145e-05, + "loss": 0.1956, + "step": 6945 + }, + { + "epoch": 1.0873512836568566, + "grad_norm": 0.9541764855384827, + "learning_rate": 4.7588791137178236e-05, + "loss": 0.3475, + "step": 6946 + }, + { + "epoch": 1.0875078271759548, + "grad_norm": 0.7809955477714539, + "learning_rate": 4.7580645161290326e-05, + "loss": 0.2305, + "step": 6947 + }, + { + "epoch": 1.0876643706950533, + "grad_norm": 0.8210253715515137, + "learning_rate": 4.7572499185402417e-05, + "loss": 0.27, + "step": 6948 + }, + { + "epoch": 1.0878209142141515, + "grad_norm": 0.6892174482345581, + "learning_rate": 4.75643532095145e-05, + "loss": 0.2707, + "step": 6949 + }, + { + "epoch": 1.08797745773325, + "grad_norm": 1.2915009260177612, + "learning_rate": 4.755620723362659e-05, + "loss": 0.3388, + "step": 6950 + }, + { + "epoch": 1.0881340012523482, + "grad_norm": 1.6901679039001465, + "learning_rate": 4.754806125773868e-05, + "loss": 0.3802, + "step": 6951 + }, + { + "epoch": 1.0882905447714464, + 
"grad_norm": 1.70040762424469, + "learning_rate": 4.7539915281850765e-05, + "loss": 0.2882, + "step": 6952 + }, + { + "epoch": 1.0884470882905448, + "grad_norm": 1.0617280006408691, + "learning_rate": 4.7531769305962856e-05, + "loss": 0.3515, + "step": 6953 + }, + { + "epoch": 1.088603631809643, + "grad_norm": 1.5180904865264893, + "learning_rate": 4.7523623330074946e-05, + "loss": 0.237, + "step": 6954 + }, + { + "epoch": 1.0887601753287415, + "grad_norm": 1.0653403997421265, + "learning_rate": 4.751547735418703e-05, + "loss": 0.3719, + "step": 6955 + }, + { + "epoch": 1.0889167188478397, + "grad_norm": 2.392138719558716, + "learning_rate": 4.750733137829913e-05, + "loss": 0.333, + "step": 6956 + }, + { + "epoch": 1.089073262366938, + "grad_norm": 1.7555619478225708, + "learning_rate": 4.749918540241121e-05, + "loss": 0.6222, + "step": 6957 + }, + { + "epoch": 1.0892298058860364, + "grad_norm": 1.6956660747528076, + "learning_rate": 4.74910394265233e-05, + "loss": 0.4916, + "step": 6958 + }, + { + "epoch": 1.0893863494051346, + "grad_norm": 1.7112665176391602, + "learning_rate": 4.748289345063539e-05, + "loss": 0.6313, + "step": 6959 + }, + { + "epoch": 1.089542892924233, + "grad_norm": 1.4841006994247437, + "learning_rate": 4.7474747474747476e-05, + "loss": 0.588, + "step": 6960 + }, + { + "epoch": 1.0896994364433312, + "grad_norm": 1.1675784587860107, + "learning_rate": 4.7466601498859566e-05, + "loss": 0.3909, + "step": 6961 + }, + { + "epoch": 1.0898559799624294, + "grad_norm": 1.506224274635315, + "learning_rate": 4.745845552297166e-05, + "loss": 0.4365, + "step": 6962 + }, + { + "epoch": 1.0900125234815279, + "grad_norm": 2.1182355880737305, + "learning_rate": 4.745030954708374e-05, + "loss": 0.5029, + "step": 6963 + }, + { + "epoch": 1.090169067000626, + "grad_norm": 2.058540105819702, + "learning_rate": 4.744216357119583e-05, + "loss": 0.4965, + "step": 6964 + }, + { + "epoch": 1.0903256105197245, + "grad_norm": 2.7665724754333496, + "learning_rate": 
4.743401759530792e-05, + "loss": 0.7332, + "step": 6965 + }, + { + "epoch": 1.0904821540388228, + "grad_norm": 1.3612734079360962, + "learning_rate": 4.742587161942001e-05, + "loss": 0.4001, + "step": 6966 + }, + { + "epoch": 1.0906386975579212, + "grad_norm": 3.9983832836151123, + "learning_rate": 4.7417725643532096e-05, + "loss": 0.4484, + "step": 6967 + }, + { + "epoch": 1.0907952410770194, + "grad_norm": 2.969397783279419, + "learning_rate": 4.7409579667644186e-05, + "loss": 1.0361, + "step": 6968 + }, + { + "epoch": 1.0909517845961176, + "grad_norm": 2.55652117729187, + "learning_rate": 4.740143369175628e-05, + "loss": 0.6784, + "step": 6969 + }, + { + "epoch": 1.091108328115216, + "grad_norm": 2.742945432662964, + "learning_rate": 4.739328771586836e-05, + "loss": 0.5686, + "step": 6970 + }, + { + "epoch": 1.0912648716343143, + "grad_norm": 3.0690629482269287, + "learning_rate": 4.738514173998045e-05, + "loss": 0.8781, + "step": 6971 + }, + { + "epoch": 1.0914214151534127, + "grad_norm": 2.8628926277160645, + "learning_rate": 4.737699576409254e-05, + "loss": 0.8698, + "step": 6972 + }, + { + "epoch": 1.091577958672511, + "grad_norm": 5.635262489318848, + "learning_rate": 4.7368849788204626e-05, + "loss": 1.0364, + "step": 6973 + }, + { + "epoch": 1.0917345021916092, + "grad_norm": 1.5599803924560547, + "learning_rate": 4.736070381231672e-05, + "loss": 0.5204, + "step": 6974 + }, + { + "epoch": 1.0918910457107076, + "grad_norm": 1.9783178567886353, + "learning_rate": 4.7352557836428807e-05, + "loss": 1.1666, + "step": 6975 + }, + { + "epoch": 1.0920475892298058, + "grad_norm": 7.3416595458984375, + "learning_rate": 4.73444118605409e-05, + "loss": 1.6979, + "step": 6976 + }, + { + "epoch": 1.0922041327489043, + "grad_norm": 2.3318259716033936, + "learning_rate": 4.733626588465299e-05, + "loss": 0.673, + "step": 6977 + }, + { + "epoch": 1.0923606762680025, + "grad_norm": 3.252856969833374, + "learning_rate": 4.732811990876507e-05, + "loss": 0.976, + "step": 6978 
+ }, + { + "epoch": 1.092517219787101, + "grad_norm": 3.784106492996216, + "learning_rate": 4.731997393287716e-05, + "loss": 0.6695, + "step": 6979 + }, + { + "epoch": 1.0926737633061991, + "grad_norm": 3.4905519485473633, + "learning_rate": 4.731182795698925e-05, + "loss": 1.0711, + "step": 6980 + }, + { + "epoch": 1.0928303068252974, + "grad_norm": 3.225107192993164, + "learning_rate": 4.7303681981101336e-05, + "loss": 1.0253, + "step": 6981 + }, + { + "epoch": 1.0929868503443958, + "grad_norm": 1.9015222787857056, + "learning_rate": 4.729553600521343e-05, + "loss": 0.9101, + "step": 6982 + }, + { + "epoch": 1.093143393863494, + "grad_norm": 1.6202189922332764, + "learning_rate": 4.728739002932552e-05, + "loss": 1.0893, + "step": 6983 + }, + { + "epoch": 1.0932999373825925, + "grad_norm": 4.562743663787842, + "learning_rate": 4.727924405343761e-05, + "loss": 1.1005, + "step": 6984 + }, + { + "epoch": 1.0934564809016907, + "grad_norm": 3.4571828842163086, + "learning_rate": 4.727109807754969e-05, + "loss": 0.8342, + "step": 6985 + }, + { + "epoch": 1.093613024420789, + "grad_norm": 1.505638837814331, + "learning_rate": 4.726295210166178e-05, + "loss": 0.4878, + "step": 6986 + }, + { + "epoch": 1.0937695679398873, + "grad_norm": 1.9155659675598145, + "learning_rate": 4.725480612577387e-05, + "loss": 0.3804, + "step": 6987 + }, + { + "epoch": 1.0939261114589856, + "grad_norm": 2.2810349464416504, + "learning_rate": 4.7246660149885956e-05, + "loss": 0.6696, + "step": 6988 + }, + { + "epoch": 1.094082654978084, + "grad_norm": 0.4229256510734558, + "learning_rate": 4.723851417399805e-05, + "loss": 0.2413, + "step": 6989 + }, + { + "epoch": 1.0942391984971822, + "grad_norm": 0.6127775311470032, + "learning_rate": 4.723036819811014e-05, + "loss": 0.1663, + "step": 6990 + }, + { + "epoch": 1.0943957420162804, + "grad_norm": 1.1207201480865479, + "learning_rate": 4.722222222222222e-05, + "loss": 0.2084, + "step": 6991 + }, + { + "epoch": 1.0945522855353789, + "grad_norm": 
0.731033444404602, + "learning_rate": 4.721407624633432e-05, + "loss": 0.2607, + "step": 6992 + }, + { + "epoch": 1.094708829054477, + "grad_norm": 0.6186383962631226, + "learning_rate": 4.72059302704464e-05, + "loss": 0.2822, + "step": 6993 + }, + { + "epoch": 1.0948653725735755, + "grad_norm": 1.075435996055603, + "learning_rate": 4.7197784294558486e-05, + "loss": 0.2707, + "step": 6994 + }, + { + "epoch": 1.0950219160926737, + "grad_norm": 0.48871439695358276, + "learning_rate": 4.718963831867058e-05, + "loss": 0.1774, + "step": 6995 + }, + { + "epoch": 1.095178459611772, + "grad_norm": 1.068093180656433, + "learning_rate": 4.718149234278267e-05, + "loss": 0.247, + "step": 6996 + }, + { + "epoch": 1.0953350031308704, + "grad_norm": 0.618746817111969, + "learning_rate": 4.717334636689476e-05, + "loss": 0.2656, + "step": 6997 + }, + { + "epoch": 1.0954915466499686, + "grad_norm": 0.6828145980834961, + "learning_rate": 4.716520039100685e-05, + "loss": 0.213, + "step": 6998 + }, + { + "epoch": 1.095648090169067, + "grad_norm": 0.8708493113517761, + "learning_rate": 4.715705441511893e-05, + "loss": 0.3665, + "step": 6999 + }, + { + "epoch": 1.0958046336881653, + "grad_norm": 1.228941559791565, + "learning_rate": 4.714890843923102e-05, + "loss": 0.2201, + "step": 7000 + }, + { + "epoch": 1.0958046336881653, + "eval_loss": 0.5153822302818298, + "eval_runtime": 203.7267, + "eval_samples_per_second": 60.782, + "eval_steps_per_second": 3.799, + "eval_wer": 0.3198461440380815, + "step": 7000 + }, + { + "epoch": 1.0959611772072637, + "grad_norm": 1.1535078287124634, + "learning_rate": 4.714076246334311e-05, + "loss": 0.319, + "step": 7001 + }, + { + "epoch": 1.096117720726362, + "grad_norm": 7.56706428527832, + "learning_rate": 4.71326164874552e-05, + "loss": 0.366, + "step": 7002 + }, + { + "epoch": 1.0962742642454602, + "grad_norm": 1.9455337524414062, + "learning_rate": 4.712447051156729e-05, + "loss": 0.6983, + "step": 7003 + }, + { + "epoch": 1.0964308077645586, + 
"grad_norm": 1.7844760417938232, + "learning_rate": 4.711632453567938e-05, + "loss": 0.3671, + "step": 7004 + }, + { + "epoch": 1.0965873512836568, + "grad_norm": 2.190796375274658, + "learning_rate": 4.710817855979147e-05, + "loss": 0.6487, + "step": 7005 + }, + { + "epoch": 1.0967438948027552, + "grad_norm": 0.894835352897644, + "learning_rate": 4.710003258390355e-05, + "loss": 0.3787, + "step": 7006 + }, + { + "epoch": 1.0969004383218535, + "grad_norm": 1.064031720161438, + "learning_rate": 4.709188660801564e-05, + "loss": 0.4345, + "step": 7007 + }, + { + "epoch": 1.0970569818409517, + "grad_norm": 1.4196325540542603, + "learning_rate": 4.708374063212773e-05, + "loss": 0.3998, + "step": 7008 + }, + { + "epoch": 1.0972135253600501, + "grad_norm": 3.02227520942688, + "learning_rate": 4.707559465623982e-05, + "loss": 0.5372, + "step": 7009 + }, + { + "epoch": 1.0973700688791483, + "grad_norm": 2.3961074352264404, + "learning_rate": 4.7067448680351914e-05, + "loss": 0.755, + "step": 7010 + }, + { + "epoch": 1.0975266123982468, + "grad_norm": 1.722723364830017, + "learning_rate": 4.7059302704464e-05, + "loss": 0.5098, + "step": 7011 + }, + { + "epoch": 1.097683155917345, + "grad_norm": 4.46126127243042, + "learning_rate": 4.705115672857608e-05, + "loss": 0.8616, + "step": 7012 + }, + { + "epoch": 1.0978396994364434, + "grad_norm": 1.1746712923049927, + "learning_rate": 4.704301075268818e-05, + "loss": 0.3193, + "step": 7013 + }, + { + "epoch": 1.0979962429555417, + "grad_norm": 3.2180590629577637, + "learning_rate": 4.703486477680026e-05, + "loss": 0.6306, + "step": 7014 + }, + { + "epoch": 1.0981527864746399, + "grad_norm": 2.6652908325195312, + "learning_rate": 4.702671880091235e-05, + "loss": 0.7449, + "step": 7015 + }, + { + "epoch": 1.0983093299937383, + "grad_norm": 1.6503469944000244, + "learning_rate": 4.7018572825024444e-05, + "loss": 0.4651, + "step": 7016 + }, + { + "epoch": 1.0984658735128365, + "grad_norm": 3.861362934112549, + "learning_rate": 
4.701042684913653e-05, + "loss": 0.4408, + "step": 7017 + }, + { + "epoch": 1.098622417031935, + "grad_norm": 1.6205487251281738, + "learning_rate": 4.700228087324862e-05, + "loss": 0.4838, + "step": 7018 + }, + { + "epoch": 1.0987789605510332, + "grad_norm": 1.5987690687179565, + "learning_rate": 4.699413489736071e-05, + "loss": 0.2886, + "step": 7019 + }, + { + "epoch": 1.0989355040701314, + "grad_norm": 3.050769329071045, + "learning_rate": 4.69859889214728e-05, + "loss": 0.7449, + "step": 7020 + }, + { + "epoch": 1.0990920475892298, + "grad_norm": 2.864924669265747, + "learning_rate": 4.697784294558488e-05, + "loss": 0.9881, + "step": 7021 + }, + { + "epoch": 1.099248591108328, + "grad_norm": 2.9133360385894775, + "learning_rate": 4.696969696969697e-05, + "loss": 0.7884, + "step": 7022 + }, + { + "epoch": 1.0994051346274265, + "grad_norm": 4.701315879821777, + "learning_rate": 4.6961550993809064e-05, + "loss": 0.8224, + "step": 7023 + }, + { + "epoch": 1.0995616781465247, + "grad_norm": 1.6192275285720825, + "learning_rate": 4.695340501792115e-05, + "loss": 0.3847, + "step": 7024 + }, + { + "epoch": 1.099718221665623, + "grad_norm": 3.2774534225463867, + "learning_rate": 4.694525904203324e-05, + "loss": 0.7898, + "step": 7025 + }, + { + "epoch": 1.0998747651847214, + "grad_norm": 5.875033855438232, + "learning_rate": 4.693711306614533e-05, + "loss": 1.1833, + "step": 7026 + }, + { + "epoch": 1.1000313087038196, + "grad_norm": 8.686802864074707, + "learning_rate": 4.692896709025741e-05, + "loss": 1.1346, + "step": 7027 + }, + { + "epoch": 1.100187852222918, + "grad_norm": 3.232365608215332, + "learning_rate": 4.692082111436951e-05, + "loss": 1.0687, + "step": 7028 + }, + { + "epoch": 1.1003443957420163, + "grad_norm": 3.109034538269043, + "learning_rate": 4.691267513848159e-05, + "loss": 1.2344, + "step": 7029 + }, + { + "epoch": 1.1005009392611145, + "grad_norm": 5.093968868255615, + "learning_rate": 4.690452916259368e-05, + "loss": 1.6468, + "step": 7030 + }, 
+ { + "epoch": 1.100657482780213, + "grad_norm": 6.434152126312256, + "learning_rate": 4.6896383186705774e-05, + "loss": 1.0864, + "step": 7031 + }, + { + "epoch": 1.1008140262993111, + "grad_norm": 4.261313438415527, + "learning_rate": 4.688823721081786e-05, + "loss": 1.1566, + "step": 7032 + }, + { + "epoch": 1.1009705698184096, + "grad_norm": 2.9876596927642822, + "learning_rate": 4.688009123492995e-05, + "loss": 0.8711, + "step": 7033 + }, + { + "epoch": 1.1011271133375078, + "grad_norm": 3.193394422531128, + "learning_rate": 4.687194525904203e-05, + "loss": 0.5338, + "step": 7034 + }, + { + "epoch": 1.1012836568566062, + "grad_norm": 3.902446985244751, + "learning_rate": 4.686379928315412e-05, + "loss": 0.6753, + "step": 7035 + }, + { + "epoch": 1.1014402003757044, + "grad_norm": 3.5356178283691406, + "learning_rate": 4.6855653307266213e-05, + "loss": 0.6102, + "step": 7036 + }, + { + "epoch": 1.1015967438948027, + "grad_norm": 5.445432662963867, + "learning_rate": 4.68475073313783e-05, + "loss": 1.3331, + "step": 7037 + }, + { + "epoch": 1.101753287413901, + "grad_norm": 4.391554355621338, + "learning_rate": 4.683936135549039e-05, + "loss": 1.0643, + "step": 7038 + }, + { + "epoch": 1.1019098309329993, + "grad_norm": 0.4559873342514038, + "learning_rate": 4.683121537960248e-05, + "loss": 0.2244, + "step": 7039 + }, + { + "epoch": 1.1020663744520978, + "grad_norm": 0.30352526903152466, + "learning_rate": 4.682306940371456e-05, + "loss": 0.1403, + "step": 7040 + }, + { + "epoch": 1.102222917971196, + "grad_norm": 0.6196094751358032, + "learning_rate": 4.681492342782666e-05, + "loss": 0.2283, + "step": 7041 + }, + { + "epoch": 1.1023794614902942, + "grad_norm": 0.5372360348701477, + "learning_rate": 4.680677745193874e-05, + "loss": 0.2141, + "step": 7042 + }, + { + "epoch": 1.1025360050093926, + "grad_norm": 0.798531711101532, + "learning_rate": 4.6798631476050834e-05, + "loss": 0.4554, + "step": 7043 + }, + { + "epoch": 1.1026925485284909, + "grad_norm": 
1.1791584491729736, + "learning_rate": 4.6790485500162924e-05, + "loss": 0.3124, + "step": 7044 + }, + { + "epoch": 1.1028490920475893, + "grad_norm": 0.8606584668159485, + "learning_rate": 4.678233952427501e-05, + "loss": 0.3035, + "step": 7045 + }, + { + "epoch": 1.1030056355666875, + "grad_norm": 0.90922611951828, + "learning_rate": 4.67741935483871e-05, + "loss": 0.2944, + "step": 7046 + }, + { + "epoch": 1.103162179085786, + "grad_norm": 1.5816376209259033, + "learning_rate": 4.676604757249919e-05, + "loss": 0.1845, + "step": 7047 + }, + { + "epoch": 1.1033187226048842, + "grad_norm": 1.1496223211288452, + "learning_rate": 4.675790159661127e-05, + "loss": 0.3387, + "step": 7048 + }, + { + "epoch": 1.1034752661239824, + "grad_norm": 3.2025508880615234, + "learning_rate": 4.674975562072336e-05, + "loss": 0.2889, + "step": 7049 + }, + { + "epoch": 1.1036318096430808, + "grad_norm": 2.7516517639160156, + "learning_rate": 4.6741609644835454e-05, + "loss": 0.7589, + "step": 7050 + }, + { + "epoch": 1.103788353162179, + "grad_norm": 1.1529078483581543, + "learning_rate": 4.6733463668947544e-05, + "loss": 0.3222, + "step": 7051 + }, + { + "epoch": 1.1039448966812775, + "grad_norm": 0.7960324287414551, + "learning_rate": 4.672531769305963e-05, + "loss": 0.2752, + "step": 7052 + }, + { + "epoch": 1.1041014402003757, + "grad_norm": 0.8990991711616516, + "learning_rate": 4.671717171717172e-05, + "loss": 0.2725, + "step": 7053 + }, + { + "epoch": 1.104257983719474, + "grad_norm": 0.7447043061256409, + "learning_rate": 4.670902574128381e-05, + "loss": 0.219, + "step": 7054 + }, + { + "epoch": 1.1044145272385724, + "grad_norm": 2.2172889709472656, + "learning_rate": 4.670087976539589e-05, + "loss": 0.7758, + "step": 7055 + }, + { + "epoch": 1.1045710707576706, + "grad_norm": 2.0655455589294434, + "learning_rate": 4.669273378950798e-05, + "loss": 0.2823, + "step": 7056 + }, + { + "epoch": 1.104727614276769, + "grad_norm": 1.5848236083984375, + "learning_rate": 
4.6684587813620074e-05, + "loss": 0.385, + "step": 7057 + }, + { + "epoch": 1.1048841577958672, + "grad_norm": 1.5397604703903198, + "learning_rate": 4.667644183773216e-05, + "loss": 0.4368, + "step": 7058 + }, + { + "epoch": 1.1050407013149655, + "grad_norm": 1.5375547409057617, + "learning_rate": 4.6668295861844255e-05, + "loss": 0.5212, + "step": 7059 + }, + { + "epoch": 1.105197244834064, + "grad_norm": 1.468423843383789, + "learning_rate": 4.666014988595634e-05, + "loss": 0.2964, + "step": 7060 + }, + { + "epoch": 1.1053537883531621, + "grad_norm": 1.8820159435272217, + "learning_rate": 4.665200391006843e-05, + "loss": 0.7394, + "step": 7061 + }, + { + "epoch": 1.1055103318722606, + "grad_norm": 1.5423916578292847, + "learning_rate": 4.664385793418052e-05, + "loss": 0.3806, + "step": 7062 + }, + { + "epoch": 1.1056668753913588, + "grad_norm": 1.5163826942443848, + "learning_rate": 4.6635711958292603e-05, + "loss": 0.3924, + "step": 7063 + }, + { + "epoch": 1.105823418910457, + "grad_norm": 3.7105672359466553, + "learning_rate": 4.6627565982404694e-05, + "loss": 0.581, + "step": 7064 + }, + { + "epoch": 1.1059799624295554, + "grad_norm": 1.2961264848709106, + "learning_rate": 4.6619420006516784e-05, + "loss": 0.3503, + "step": 7065 + }, + { + "epoch": 1.1061365059486536, + "grad_norm": 1.1067790985107422, + "learning_rate": 4.661127403062887e-05, + "loss": 0.3759, + "step": 7066 + }, + { + "epoch": 1.106293049467752, + "grad_norm": 4.105103969573975, + "learning_rate": 4.660312805474096e-05, + "loss": 0.3574, + "step": 7067 + }, + { + "epoch": 1.1064495929868503, + "grad_norm": 2.4762988090515137, + "learning_rate": 4.659498207885305e-05, + "loss": 0.5804, + "step": 7068 + }, + { + "epoch": 1.1066061365059487, + "grad_norm": 2.5807371139526367, + "learning_rate": 4.658683610296514e-05, + "loss": 0.649, + "step": 7069 + }, + { + "epoch": 1.106762680025047, + "grad_norm": 1.779807448387146, + "learning_rate": 4.6578690127077224e-05, + "loss": 0.5814, + "step": 
7070 + }, + { + "epoch": 1.1069192235441452, + "grad_norm": 1.2901296615600586, + "learning_rate": 4.6570544151189314e-05, + "loss": 0.5239, + "step": 7071 + }, + { + "epoch": 1.1070757670632436, + "grad_norm": 1.963424801826477, + "learning_rate": 4.6562398175301405e-05, + "loss": 0.8488, + "step": 7072 + }, + { + "epoch": 1.1072323105823418, + "grad_norm": 2.8892982006073, + "learning_rate": 4.655425219941349e-05, + "loss": 0.8589, + "step": 7073 + }, + { + "epoch": 1.1073888541014403, + "grad_norm": 2.866197347640991, + "learning_rate": 4.654610622352558e-05, + "loss": 0.9592, + "step": 7074 + }, + { + "epoch": 1.1075453976205385, + "grad_norm": 4.427245616912842, + "learning_rate": 4.653796024763767e-05, + "loss": 1.0343, + "step": 7075 + }, + { + "epoch": 1.107701941139637, + "grad_norm": 2.9783236980438232, + "learning_rate": 4.652981427174975e-05, + "loss": 0.7955, + "step": 7076 + }, + { + "epoch": 1.1078584846587352, + "grad_norm": 4.315681457519531, + "learning_rate": 4.652166829586185e-05, + "loss": 0.9758, + "step": 7077 + }, + { + "epoch": 1.1080150281778334, + "grad_norm": 8.329524040222168, + "learning_rate": 4.6513522319973934e-05, + "loss": 1.2671, + "step": 7078 + }, + { + "epoch": 1.1081715716969318, + "grad_norm": 1.3844133615493774, + "learning_rate": 4.650537634408602e-05, + "loss": 0.6036, + "step": 7079 + }, + { + "epoch": 1.10832811521603, + "grad_norm": 1.7545969486236572, + "learning_rate": 4.6497230368198115e-05, + "loss": 0.5266, + "step": 7080 + }, + { + "epoch": 1.1084846587351285, + "grad_norm": 3.8193390369415283, + "learning_rate": 4.64890843923102e-05, + "loss": 1.031, + "step": 7081 + }, + { + "epoch": 1.1086412022542267, + "grad_norm": 3.835456132888794, + "learning_rate": 4.648093841642229e-05, + "loss": 1.166, + "step": 7082 + }, + { + "epoch": 1.108797745773325, + "grad_norm": 5.260733604431152, + "learning_rate": 4.647279244053438e-05, + "loss": 1.2187, + "step": 7083 + }, + { + "epoch": 1.1089542892924233, + "grad_norm": 
2.831048011779785, + "learning_rate": 4.6464646464646464e-05, + "loss": 0.8378, + "step": 7084 + }, + { + "epoch": 1.1091108328115216, + "grad_norm": 3.6090757846832275, + "learning_rate": 4.6456500488758554e-05, + "loss": 0.8138, + "step": 7085 + }, + { + "epoch": 1.10926737633062, + "grad_norm": 6.53856897354126, + "learning_rate": 4.6448354512870645e-05, + "loss": 0.782, + "step": 7086 + }, + { + "epoch": 1.1094239198497182, + "grad_norm": 3.3461496829986572, + "learning_rate": 4.6440208536982735e-05, + "loss": 1.0771, + "step": 7087 + }, + { + "epoch": 1.1095804633688164, + "grad_norm": 2.021716833114624, + "learning_rate": 4.643206256109482e-05, + "loss": 0.7377, + "step": 7088 + }, + { + "epoch": 1.1097370068879149, + "grad_norm": 0.5516185760498047, + "learning_rate": 4.642391658520691e-05, + "loss": 0.1727, + "step": 7089 + }, + { + "epoch": 1.109893550407013, + "grad_norm": 0.3653852045536041, + "learning_rate": 4.6415770609319e-05, + "loss": 0.14, + "step": 7090 + }, + { + "epoch": 1.1100500939261115, + "grad_norm": 0.5755681395530701, + "learning_rate": 4.6407624633431084e-05, + "loss": 0.1925, + "step": 7091 + }, + { + "epoch": 1.1102066374452098, + "grad_norm": 0.5842314958572388, + "learning_rate": 4.6399478657543174e-05, + "loss": 0.1665, + "step": 7092 + }, + { + "epoch": 1.110363180964308, + "grad_norm": 0.6899935603141785, + "learning_rate": 4.6391332681655265e-05, + "loss": 0.1903, + "step": 7093 + }, + { + "epoch": 1.1105197244834064, + "grad_norm": 1.1011712551116943, + "learning_rate": 4.638318670576735e-05, + "loss": 0.2829, + "step": 7094 + }, + { + "epoch": 1.1106762680025046, + "grad_norm": 0.8421292901039124, + "learning_rate": 4.6375040729879446e-05, + "loss": 0.29, + "step": 7095 + }, + { + "epoch": 1.110832811521603, + "grad_norm": 1.0830127000808716, + "learning_rate": 4.636689475399153e-05, + "loss": 0.2676, + "step": 7096 + }, + { + "epoch": 1.1109893550407013, + "grad_norm": 1.097837209701538, + "learning_rate": 
4.6358748778103614e-05, + "loss": 0.2075, + "step": 7097 + }, + { + "epoch": 1.1111458985597997, + "grad_norm": 1.0159848928451538, + "learning_rate": 4.635060280221571e-05, + "loss": 0.2806, + "step": 7098 + }, + { + "epoch": 1.111302442078898, + "grad_norm": 1.3572096824645996, + "learning_rate": 4.6342456826327795e-05, + "loss": 0.2629, + "step": 7099 + }, + { + "epoch": 1.1114589855979962, + "grad_norm": 0.6321014165878296, + "learning_rate": 4.6334310850439885e-05, + "loss": 0.2053, + "step": 7100 + }, + { + "epoch": 1.1116155291170946, + "grad_norm": 1.4243448972702026, + "learning_rate": 4.6326164874551976e-05, + "loss": 0.2945, + "step": 7101 + }, + { + "epoch": 1.1117720726361928, + "grad_norm": 2.947360038757324, + "learning_rate": 4.631801889866406e-05, + "loss": 0.3154, + "step": 7102 + }, + { + "epoch": 1.1119286161552913, + "grad_norm": 1.4162098169326782, + "learning_rate": 4.630987292277615e-05, + "loss": 0.3458, + "step": 7103 + }, + { + "epoch": 1.1120851596743895, + "grad_norm": 1.3673771619796753, + "learning_rate": 4.630172694688824e-05, + "loss": 0.5472, + "step": 7104 + }, + { + "epoch": 1.1122417031934877, + "grad_norm": 2.3980460166931152, + "learning_rate": 4.629358097100033e-05, + "loss": 0.4659, + "step": 7105 + }, + { + "epoch": 1.1123982467125861, + "grad_norm": 1.1924399137496948, + "learning_rate": 4.6285434995112415e-05, + "loss": 0.4357, + "step": 7106 + }, + { + "epoch": 1.1125547902316844, + "grad_norm": 1.6689929962158203, + "learning_rate": 4.6277289019224505e-05, + "loss": 0.4128, + "step": 7107 + }, + { + "epoch": 1.1127113337507828, + "grad_norm": 2.0398330688476562, + "learning_rate": 4.6269143043336596e-05, + "loss": 0.7666, + "step": 7108 + }, + { + "epoch": 1.112867877269881, + "grad_norm": 0.8472825288772583, + "learning_rate": 4.626099706744868e-05, + "loss": 0.1971, + "step": 7109 + }, + { + "epoch": 1.1130244207889795, + "grad_norm": 1.7129729986190796, + "learning_rate": 4.625285109156077e-05, + "loss": 0.4093, + 
"step": 7110 + }, + { + "epoch": 1.1131809643080777, + "grad_norm": 2.038397789001465, + "learning_rate": 4.624470511567286e-05, + "loss": 0.5852, + "step": 7111 + }, + { + "epoch": 1.1133375078271759, + "grad_norm": 5.977364540100098, + "learning_rate": 4.6236559139784944e-05, + "loss": 0.4225, + "step": 7112 + }, + { + "epoch": 1.1134940513462743, + "grad_norm": 1.558032512664795, + "learning_rate": 4.622841316389704e-05, + "loss": 0.4995, + "step": 7113 + }, + { + "epoch": 1.1136505948653725, + "grad_norm": 2.192225456237793, + "learning_rate": 4.6220267188009125e-05, + "loss": 0.4356, + "step": 7114 + }, + { + "epoch": 1.113807138384471, + "grad_norm": 2.037269353866577, + "learning_rate": 4.621212121212121e-05, + "loss": 0.4674, + "step": 7115 + }, + { + "epoch": 1.1139636819035692, + "grad_norm": 2.6507985591888428, + "learning_rate": 4.6203975236233306e-05, + "loss": 0.4938, + "step": 7116 + }, + { + "epoch": 1.1141202254226674, + "grad_norm": 1.3299022912979126, + "learning_rate": 4.619582926034539e-05, + "loss": 0.5128, + "step": 7117 + }, + { + "epoch": 1.1142767689417659, + "grad_norm": 2.847087860107422, + "learning_rate": 4.618768328445748e-05, + "loss": 0.7897, + "step": 7118 + }, + { + "epoch": 1.114433312460864, + "grad_norm": 3.7999584674835205, + "learning_rate": 4.617953730856957e-05, + "loss": 0.9757, + "step": 7119 + }, + { + "epoch": 1.1145898559799625, + "grad_norm": 2.7040839195251465, + "learning_rate": 4.6171391332681655e-05, + "loss": 1.0091, + "step": 7120 + }, + { + "epoch": 1.1147463994990607, + "grad_norm": 2.204098701477051, + "learning_rate": 4.6163245356793745e-05, + "loss": 0.5642, + "step": 7121 + }, + { + "epoch": 1.114902943018159, + "grad_norm": 5.365701675415039, + "learning_rate": 4.6155099380905836e-05, + "loss": 0.612, + "step": 7122 + }, + { + "epoch": 1.1150594865372574, + "grad_norm": 3.4516818523406982, + "learning_rate": 4.614695340501792e-05, + "loss": 0.8587, + "step": 7123 + }, + { + "epoch": 1.1152160300563556, + 
"grad_norm": 5.880587577819824, + "learning_rate": 4.613880742913001e-05, + "loss": 0.7704, + "step": 7124 + }, + { + "epoch": 1.115372573575454, + "grad_norm": 2.1779985427856445, + "learning_rate": 4.61306614532421e-05, + "loss": 1.0851, + "step": 7125 + }, + { + "epoch": 1.1155291170945523, + "grad_norm": 2.1487934589385986, + "learning_rate": 4.612251547735419e-05, + "loss": 0.6087, + "step": 7126 + }, + { + "epoch": 1.1156856606136505, + "grad_norm": 3.358794689178467, + "learning_rate": 4.6114369501466275e-05, + "loss": 0.8562, + "step": 7127 + }, + { + "epoch": 1.115842204132749, + "grad_norm": 2.000138282775879, + "learning_rate": 4.6106223525578366e-05, + "loss": 0.6507, + "step": 7128 + }, + { + "epoch": 1.1159987476518471, + "grad_norm": 3.79778790473938, + "learning_rate": 4.6098077549690456e-05, + "loss": 1.0059, + "step": 7129 + }, + { + "epoch": 1.1161552911709456, + "grad_norm": 1.825524091720581, + "learning_rate": 4.608993157380254e-05, + "loss": 0.9447, + "step": 7130 + }, + { + "epoch": 1.1163118346900438, + "grad_norm": 2.9331791400909424, + "learning_rate": 4.608178559791464e-05, + "loss": 0.8713, + "step": 7131 + }, + { + "epoch": 1.1164683782091422, + "grad_norm": 4.321588516235352, + "learning_rate": 4.607363962202672e-05, + "loss": 0.7135, + "step": 7132 + }, + { + "epoch": 1.1166249217282405, + "grad_norm": 2.942323923110962, + "learning_rate": 4.6065493646138805e-05, + "loss": 0.9277, + "step": 7133 + }, + { + "epoch": 1.1167814652473387, + "grad_norm": 2.069410800933838, + "learning_rate": 4.60573476702509e-05, + "loss": 1.1073, + "step": 7134 + }, + { + "epoch": 1.1169380087664371, + "grad_norm": 2.256408452987671, + "learning_rate": 4.6049201694362986e-05, + "loss": 0.3672, + "step": 7135 + }, + { + "epoch": 1.1170945522855353, + "grad_norm": 2.2868542671203613, + "learning_rate": 4.6041055718475076e-05, + "loss": 0.411, + "step": 7136 + }, + { + "epoch": 1.1172510958046338, + "grad_norm": 1.8388230800628662, + "learning_rate": 
4.603290974258717e-05, + "loss": 0.6567, + "step": 7137 + }, + { + "epoch": 1.117407639323732, + "grad_norm": 3.1684420108795166, + "learning_rate": 4.602476376669925e-05, + "loss": 1.71, + "step": 7138 + }, + { + "epoch": 1.1175641828428302, + "grad_norm": 0.6478560566902161, + "learning_rate": 4.601661779081134e-05, + "loss": 0.3497, + "step": 7139 + }, + { + "epoch": 1.1177207263619287, + "grad_norm": 0.930634081363678, + "learning_rate": 4.600847181492343e-05, + "loss": 0.2404, + "step": 7140 + }, + { + "epoch": 1.1178772698810269, + "grad_norm": 0.9225629568099976, + "learning_rate": 4.6000325839035515e-05, + "loss": 0.2608, + "step": 7141 + }, + { + "epoch": 1.1180338134001253, + "grad_norm": 0.7274706363677979, + "learning_rate": 4.5992179863147606e-05, + "loss": 0.2171, + "step": 7142 + }, + { + "epoch": 1.1181903569192235, + "grad_norm": 0.9984283447265625, + "learning_rate": 4.5984033887259696e-05, + "loss": 0.3153, + "step": 7143 + }, + { + "epoch": 1.118346900438322, + "grad_norm": 0.9379408955574036, + "learning_rate": 4.597588791137179e-05, + "loss": 0.2757, + "step": 7144 + }, + { + "epoch": 1.1185034439574202, + "grad_norm": 3.7494313716888428, + "learning_rate": 4.596774193548387e-05, + "loss": 0.4747, + "step": 7145 + }, + { + "epoch": 1.1186599874765184, + "grad_norm": 0.7913817763328552, + "learning_rate": 4.595959595959596e-05, + "loss": 0.2186, + "step": 7146 + }, + { + "epoch": 1.1188165309956168, + "grad_norm": 0.7229637503623962, + "learning_rate": 4.595144998370805e-05, + "loss": 0.1688, + "step": 7147 + }, + { + "epoch": 1.118973074514715, + "grad_norm": 1.0048013925552368, + "learning_rate": 4.5943304007820135e-05, + "loss": 0.1893, + "step": 7148 + }, + { + "epoch": 1.1191296180338135, + "grad_norm": 0.7359488010406494, + "learning_rate": 4.593515803193223e-05, + "loss": 0.2854, + "step": 7149 + }, + { + "epoch": 1.1192861615529117, + "grad_norm": 12.950104713439941, + "learning_rate": 4.5927012056044317e-05, + "loss": 1.9085, + "step": 
7150 + }, + { + "epoch": 1.11944270507201, + "grad_norm": 0.8763856887817383, + "learning_rate": 4.59188660801564e-05, + "loss": 0.351, + "step": 7151 + }, + { + "epoch": 1.1195992485911084, + "grad_norm": 6.72769832611084, + "learning_rate": 4.59107201042685e-05, + "loss": 0.3218, + "step": 7152 + }, + { + "epoch": 1.1197557921102066, + "grad_norm": 1.0807136297225952, + "learning_rate": 4.590257412838058e-05, + "loss": 0.2734, + "step": 7153 + }, + { + "epoch": 1.119912335629305, + "grad_norm": 0.8965975046157837, + "learning_rate": 4.589442815249267e-05, + "loss": 0.304, + "step": 7154 + }, + { + "epoch": 1.1200688791484033, + "grad_norm": 2.191037178039551, + "learning_rate": 4.588628217660476e-05, + "loss": 0.5582, + "step": 7155 + }, + { + "epoch": 1.1202254226675015, + "grad_norm": 2.3321971893310547, + "learning_rate": 4.5878136200716846e-05, + "loss": 0.4989, + "step": 7156 + }, + { + "epoch": 1.1203819661866, + "grad_norm": 1.3948644399642944, + "learning_rate": 4.586999022482894e-05, + "loss": 0.3601, + "step": 7157 + }, + { + "epoch": 1.1205385097056981, + "grad_norm": 1.3596835136413574, + "learning_rate": 4.586184424894103e-05, + "loss": 0.5445, + "step": 7158 + }, + { + "epoch": 1.1206950532247966, + "grad_norm": 1.2232849597930908, + "learning_rate": 4.585369827305311e-05, + "loss": 0.3297, + "step": 7159 + }, + { + "epoch": 1.1208515967438948, + "grad_norm": 1.914535641670227, + "learning_rate": 4.58455522971652e-05, + "loss": 0.4055, + "step": 7160 + }, + { + "epoch": 1.121008140262993, + "grad_norm": 1.4152475595474243, + "learning_rate": 4.583740632127729e-05, + "loss": 0.4957, + "step": 7161 + }, + { + "epoch": 1.1211646837820914, + "grad_norm": 1.3770763874053955, + "learning_rate": 4.582926034538938e-05, + "loss": 0.5751, + "step": 7162 + }, + { + "epoch": 1.1213212273011897, + "grad_norm": 1.4857916831970215, + "learning_rate": 4.5821114369501466e-05, + "loss": 0.4258, + "step": 7163 + }, + { + "epoch": 1.121477770820288, + "grad_norm": 
4.447564125061035, + "learning_rate": 4.581296839361356e-05, + "loss": 0.3939, + "step": 7164 + }, + { + "epoch": 1.1216343143393863, + "grad_norm": 2.5845446586608887, + "learning_rate": 4.580482241772565e-05, + "loss": 0.5922, + "step": 7165 + }, + { + "epoch": 1.1217908578584848, + "grad_norm": 2.0831427574157715, + "learning_rate": 4.579667644183773e-05, + "loss": 0.639, + "step": 7166 + }, + { + "epoch": 1.121947401377583, + "grad_norm": 1.4776822328567505, + "learning_rate": 4.578853046594982e-05, + "loss": 0.433, + "step": 7167 + }, + { + "epoch": 1.1221039448966812, + "grad_norm": 1.899320125579834, + "learning_rate": 4.578038449006191e-05, + "loss": 0.879, + "step": 7168 + }, + { + "epoch": 1.1222604884157796, + "grad_norm": 2.068946123123169, + "learning_rate": 4.5772238514173996e-05, + "loss": 0.8693, + "step": 7169 + }, + { + "epoch": 1.1224170319348779, + "grad_norm": 5.375672817230225, + "learning_rate": 4.576409253828609e-05, + "loss": 0.7295, + "step": 7170 + }, + { + "epoch": 1.1225735754539763, + "grad_norm": 2.7109763622283936, + "learning_rate": 4.575594656239818e-05, + "loss": 1.0536, + "step": 7171 + }, + { + "epoch": 1.1227301189730745, + "grad_norm": 4.810489177703857, + "learning_rate": 4.574780058651027e-05, + "loss": 0.9598, + "step": 7172 + }, + { + "epoch": 1.1228866624921727, + "grad_norm": 2.1912343502044678, + "learning_rate": 4.573965461062236e-05, + "loss": 0.7028, + "step": 7173 + }, + { + "epoch": 1.1230432060112712, + "grad_norm": 4.499035835266113, + "learning_rate": 4.573150863473444e-05, + "loss": 1.0861, + "step": 7174 + }, + { + "epoch": 1.1231997495303694, + "grad_norm": 3.324098825454712, + "learning_rate": 4.572336265884653e-05, + "loss": 1.2494, + "step": 7175 + }, + { + "epoch": 1.1233562930494678, + "grad_norm": 3.025256395339966, + "learning_rate": 4.571521668295862e-05, + "loss": 1.4055, + "step": 7176 + }, + { + "epoch": 1.123512836568566, + "grad_norm": 2.6878857612609863, + "learning_rate": 
4.5707070707070706e-05, + "loss": 0.8052, + "step": 7177 + }, + { + "epoch": 1.1236693800876645, + "grad_norm": 3.3607401847839355, + "learning_rate": 4.56989247311828e-05, + "loss": 0.7636, + "step": 7178 + }, + { + "epoch": 1.1238259236067627, + "grad_norm": 3.2659730911254883, + "learning_rate": 4.569077875529489e-05, + "loss": 1.3717, + "step": 7179 + }, + { + "epoch": 1.123982467125861, + "grad_norm": 3.287301778793335, + "learning_rate": 4.568263277940698e-05, + "loss": 0.8257, + "step": 7180 + }, + { + "epoch": 1.1241390106449594, + "grad_norm": 2.1066598892211914, + "learning_rate": 4.567448680351906e-05, + "loss": 0.9785, + "step": 7181 + }, + { + "epoch": 1.1242955541640576, + "grad_norm": 2.7645657062530518, + "learning_rate": 4.566634082763115e-05, + "loss": 1.2946, + "step": 7182 + }, + { + "epoch": 1.124452097683156, + "grad_norm": 2.7541561126708984, + "learning_rate": 4.565819485174324e-05, + "loss": 0.8036, + "step": 7183 + }, + { + "epoch": 1.1246086412022542, + "grad_norm": 2.8610188961029053, + "learning_rate": 4.565004887585533e-05, + "loss": 1.1046, + "step": 7184 + }, + { + "epoch": 1.1247651847213525, + "grad_norm": 2.434833526611328, + "learning_rate": 4.564190289996742e-05, + "loss": 1.0374, + "step": 7185 + }, + { + "epoch": 1.124921728240451, + "grad_norm": 5.348953723907471, + "learning_rate": 4.563375692407951e-05, + "loss": 1.1444, + "step": 7186 + }, + { + "epoch": 1.125078271759549, + "grad_norm": 2.9084174633026123, + "learning_rate": 4.562561094819159e-05, + "loss": 1.0731, + "step": 7187 + }, + { + "epoch": 1.1252348152786475, + "grad_norm": 3.6478400230407715, + "learning_rate": 4.561746497230369e-05, + "loss": 1.2675, + "step": 7188 + }, + { + "epoch": 1.1253913587977458, + "grad_norm": 0.4602870047092438, + "learning_rate": 4.560931899641577e-05, + "loss": 0.1754, + "step": 7189 + }, + { + "epoch": 1.125547902316844, + "grad_norm": 1.3090803623199463, + "learning_rate": 4.560117302052786e-05, + "loss": 0.32, + "step": 7190 + 
}, + { + "epoch": 1.1257044458359424, + "grad_norm": 0.5287513732910156, + "learning_rate": 4.5593027044639954e-05, + "loss": 0.1776, + "step": 7191 + }, + { + "epoch": 1.1258609893550406, + "grad_norm": 0.33093491196632385, + "learning_rate": 4.558488106875204e-05, + "loss": 0.1073, + "step": 7192 + }, + { + "epoch": 1.126017532874139, + "grad_norm": 0.6471360325813293, + "learning_rate": 4.557673509286413e-05, + "loss": 0.2136, + "step": 7193 + }, + { + "epoch": 1.1261740763932373, + "grad_norm": 0.7471775412559509, + "learning_rate": 4.556858911697622e-05, + "loss": 0.3062, + "step": 7194 + }, + { + "epoch": 1.1263306199123355, + "grad_norm": 0.9636616110801697, + "learning_rate": 4.55604431410883e-05, + "loss": 0.6307, + "step": 7195 + }, + { + "epoch": 1.126487163431434, + "grad_norm": 0.94664466381073, + "learning_rate": 4.555229716520039e-05, + "loss": 0.2236, + "step": 7196 + }, + { + "epoch": 1.1266437069505322, + "grad_norm": 0.49701762199401855, + "learning_rate": 4.554415118931248e-05, + "loss": 0.1539, + "step": 7197 + }, + { + "epoch": 1.1268002504696306, + "grad_norm": 0.4673086106777191, + "learning_rate": 4.5536005213424574e-05, + "loss": 0.2662, + "step": 7198 + }, + { + "epoch": 1.1269567939887288, + "grad_norm": 1.206229329109192, + "learning_rate": 4.552785923753666e-05, + "loss": 0.2948, + "step": 7199 + }, + { + "epoch": 1.127113337507827, + "grad_norm": 0.8740424513816833, + "learning_rate": 4.551971326164875e-05, + "loss": 0.297, + "step": 7200 + }, + { + "epoch": 1.1272698810269255, + "grad_norm": 0.8812323212623596, + "learning_rate": 4.551156728576084e-05, + "loss": 0.2882, + "step": 7201 + }, + { + "epoch": 1.1274264245460237, + "grad_norm": 0.8207396864891052, + "learning_rate": 4.550342130987292e-05, + "loss": 0.1838, + "step": 7202 + }, + { + "epoch": 1.1275829680651221, + "grad_norm": 0.8367976546287537, + "learning_rate": 4.549527533398501e-05, + "loss": 0.3311, + "step": 7203 + }, + { + "epoch": 1.1277395115842204, + "grad_norm": 
2.763134717941284, + "learning_rate": 4.54871293580971e-05, + "loss": 0.3882, + "step": 7204 + }, + { + "epoch": 1.1278960551033188, + "grad_norm": 1.5264462232589722, + "learning_rate": 4.547898338220919e-05, + "loss": 0.5311, + "step": 7205 + }, + { + "epoch": 1.128052598622417, + "grad_norm": 2.8884873390197754, + "learning_rate": 4.5470837406321284e-05, + "loss": 0.3717, + "step": 7206 + }, + { + "epoch": 1.1282091421415155, + "grad_norm": 0.8529811501502991, + "learning_rate": 4.546269143043337e-05, + "loss": 0.2836, + "step": 7207 + }, + { + "epoch": 1.1283656856606137, + "grad_norm": 1.525913119316101, + "learning_rate": 4.545454545454546e-05, + "loss": 0.5135, + "step": 7208 + }, + { + "epoch": 1.128522229179712, + "grad_norm": 1.354903221130371, + "learning_rate": 4.544639947865755e-05, + "loss": 0.4537, + "step": 7209 + }, + { + "epoch": 1.1286787726988103, + "grad_norm": 2.4607443809509277, + "learning_rate": 4.543825350276963e-05, + "loss": 0.4068, + "step": 7210 + }, + { + "epoch": 1.1288353162179086, + "grad_norm": 1.5670439004898071, + "learning_rate": 4.543010752688172e-05, + "loss": 0.605, + "step": 7211 + }, + { + "epoch": 1.128991859737007, + "grad_norm": 5.147032260894775, + "learning_rate": 4.5421961550993814e-05, + "loss": 0.4721, + "step": 7212 + }, + { + "epoch": 1.1291484032561052, + "grad_norm": 1.633934736251831, + "learning_rate": 4.54138155751059e-05, + "loss": 0.5701, + "step": 7213 + }, + { + "epoch": 1.1293049467752034, + "grad_norm": 2.388125419616699, + "learning_rate": 4.540566959921799e-05, + "loss": 0.7287, + "step": 7214 + }, + { + "epoch": 1.1294614902943019, + "grad_norm": 1.2497475147247314, + "learning_rate": 4.539752362333008e-05, + "loss": 0.2562, + "step": 7215 + }, + { + "epoch": 1.1296180338134, + "grad_norm": 6.578226089477539, + "learning_rate": 4.538937764744217e-05, + "loss": 0.7953, + "step": 7216 + }, + { + "epoch": 1.1297745773324985, + "grad_norm": 2.505445957183838, + "learning_rate": 4.538123167155425e-05, + 
"loss": 0.9136, + "step": 7217 + }, + { + "epoch": 1.1299311208515967, + "grad_norm": 1.9233968257904053, + "learning_rate": 4.5373085695666343e-05, + "loss": 0.7208, + "step": 7218 + }, + { + "epoch": 1.130087664370695, + "grad_norm": 2.0586371421813965, + "learning_rate": 4.5364939719778434e-05, + "loss": 0.6284, + "step": 7219 + }, + { + "epoch": 1.1302442078897934, + "grad_norm": 1.548795461654663, + "learning_rate": 4.535679374389052e-05, + "loss": 0.2899, + "step": 7220 + }, + { + "epoch": 1.1304007514088916, + "grad_norm": 2.774446725845337, + "learning_rate": 4.534864776800261e-05, + "loss": 0.5868, + "step": 7221 + }, + { + "epoch": 1.13055729492799, + "grad_norm": 2.045945405960083, + "learning_rate": 4.53405017921147e-05, + "loss": 0.9583, + "step": 7222 + }, + { + "epoch": 1.1307138384470883, + "grad_norm": 2.851270914077759, + "learning_rate": 4.533235581622678e-05, + "loss": 0.6221, + "step": 7223 + }, + { + "epoch": 1.1308703819661865, + "grad_norm": 4.664710998535156, + "learning_rate": 4.532420984033888e-05, + "loss": 0.6698, + "step": 7224 + }, + { + "epoch": 1.131026925485285, + "grad_norm": 3.4294888973236084, + "learning_rate": 4.5316063864450964e-05, + "loss": 0.7561, + "step": 7225 + }, + { + "epoch": 1.1311834690043832, + "grad_norm": 3.6397135257720947, + "learning_rate": 4.530791788856305e-05, + "loss": 0.5877, + "step": 7226 + }, + { + "epoch": 1.1313400125234816, + "grad_norm": 2.83855938911438, + "learning_rate": 4.5299771912675145e-05, + "loss": 1.0846, + "step": 7227 + }, + { + "epoch": 1.1314965560425798, + "grad_norm": 2.2647147178649902, + "learning_rate": 4.529162593678723e-05, + "loss": 0.8003, + "step": 7228 + }, + { + "epoch": 1.131653099561678, + "grad_norm": 2.7726197242736816, + "learning_rate": 4.528347996089932e-05, + "loss": 0.8513, + "step": 7229 + }, + { + "epoch": 1.1318096430807765, + "grad_norm": 1.653808355331421, + "learning_rate": 4.527533398501141e-05, + "loss": 0.8802, + "step": 7230 + }, + { + "epoch": 
1.1319661865998747, + "grad_norm": 2.2269833087921143, + "learning_rate": 4.526718800912349e-05, + "loss": 1.0952, + "step": 7231 + }, + { + "epoch": 1.1321227301189731, + "grad_norm": 7.560417175292969, + "learning_rate": 4.5259042033235584e-05, + "loss": 1.499, + "step": 7232 + }, + { + "epoch": 1.1322792736380713, + "grad_norm": 3.106497287750244, + "learning_rate": 4.5250896057347674e-05, + "loss": 1.8063, + "step": 7233 + }, + { + "epoch": 1.1324358171571698, + "grad_norm": 3.935227155685425, + "learning_rate": 4.5242750081459765e-05, + "loss": 1.2236, + "step": 7234 + }, + { + "epoch": 1.132592360676268, + "grad_norm": 3.85542368888855, + "learning_rate": 4.523460410557185e-05, + "loss": 0.6654, + "step": 7235 + }, + { + "epoch": 1.1327489041953662, + "grad_norm": 6.605512619018555, + "learning_rate": 4.522645812968394e-05, + "loss": 0.8985, + "step": 7236 + }, + { + "epoch": 1.1329054477144647, + "grad_norm": 3.4731953144073486, + "learning_rate": 4.521831215379603e-05, + "loss": 0.8335, + "step": 7237 + }, + { + "epoch": 1.1330619912335629, + "grad_norm": 4.650855541229248, + "learning_rate": 4.521016617790811e-05, + "loss": 1.3957, + "step": 7238 + }, + { + "epoch": 1.1332185347526613, + "grad_norm": 0.48180267214775085, + "learning_rate": 4.5202020202020204e-05, + "loss": 0.2153, + "step": 7239 + }, + { + "epoch": 1.1333750782717595, + "grad_norm": 0.5351779460906982, + "learning_rate": 4.5193874226132294e-05, + "loss": 0.1529, + "step": 7240 + }, + { + "epoch": 1.133531621790858, + "grad_norm": 0.919869065284729, + "learning_rate": 4.518572825024438e-05, + "loss": 0.36, + "step": 7241 + }, + { + "epoch": 1.1336881653099562, + "grad_norm": 1.0457878112792969, + "learning_rate": 4.5177582274356475e-05, + "loss": 0.2068, + "step": 7242 + }, + { + "epoch": 1.1338447088290544, + "grad_norm": 0.5122427344322205, + "learning_rate": 4.516943629846856e-05, + "loss": 0.1726, + "step": 7243 + }, + { + "epoch": 1.1340012523481529, + "grad_norm": 0.6453258395195007, 
+ "learning_rate": 4.516129032258064e-05, + "loss": 0.2723, + "step": 7244 + }, + { + "epoch": 1.134157795867251, + "grad_norm": 0.5325981378555298, + "learning_rate": 4.515314434669274e-05, + "loss": 0.2379, + "step": 7245 + }, + { + "epoch": 1.1343143393863495, + "grad_norm": 1.3837199211120605, + "learning_rate": 4.5144998370804824e-05, + "loss": 0.2189, + "step": 7246 + }, + { + "epoch": 1.1344708829054477, + "grad_norm": 0.5799726843833923, + "learning_rate": 4.5136852394916915e-05, + "loss": 0.15, + "step": 7247 + }, + { + "epoch": 1.134627426424546, + "grad_norm": 0.9805989265441895, + "learning_rate": 4.5128706419029005e-05, + "loss": 0.2697, + "step": 7248 + }, + { + "epoch": 1.1347839699436444, + "grad_norm": 1.0309746265411377, + "learning_rate": 4.512056044314109e-05, + "loss": 0.3232, + "step": 7249 + }, + { + "epoch": 1.1349405134627426, + "grad_norm": 1.250929594039917, + "learning_rate": 4.511241446725318e-05, + "loss": 0.3248, + "step": 7250 + }, + { + "epoch": 1.135097056981841, + "grad_norm": 1.3732529878616333, + "learning_rate": 4.510426849136527e-05, + "loss": 0.3169, + "step": 7251 + }, + { + "epoch": 1.1352536005009393, + "grad_norm": 0.8241703510284424, + "learning_rate": 4.509612251547736e-05, + "loss": 0.1705, + "step": 7252 + }, + { + "epoch": 1.1354101440200375, + "grad_norm": 1.16002357006073, + "learning_rate": 4.5087976539589444e-05, + "loss": 0.3674, + "step": 7253 + }, + { + "epoch": 1.135566687539136, + "grad_norm": 2.4571971893310547, + "learning_rate": 4.5079830563701535e-05, + "loss": 0.4575, + "step": 7254 + }, + { + "epoch": 1.1357232310582341, + "grad_norm": 2.654797077178955, + "learning_rate": 4.5071684587813625e-05, + "loss": 0.5011, + "step": 7255 + }, + { + "epoch": 1.1358797745773326, + "grad_norm": 1.6870514154434204, + "learning_rate": 4.506353861192571e-05, + "loss": 0.3169, + "step": 7256 + }, + { + "epoch": 1.1360363180964308, + "grad_norm": 1.6694329977035522, + "learning_rate": 4.50553926360378e-05, + "loss": 
0.2778, + "step": 7257 + }, + { + "epoch": 1.136192861615529, + "grad_norm": 1.1390973329544067, + "learning_rate": 4.504724666014989e-05, + "loss": 0.3786, + "step": 7258 + }, + { + "epoch": 1.1363494051346275, + "grad_norm": 6.726223468780518, + "learning_rate": 4.5039100684261974e-05, + "loss": 0.7595, + "step": 7259 + }, + { + "epoch": 1.1365059486537257, + "grad_norm": 2.584972381591797, + "learning_rate": 4.503095470837407e-05, + "loss": 0.4372, + "step": 7260 + }, + { + "epoch": 1.1366624921728241, + "grad_norm": 5.261322498321533, + "learning_rate": 4.5022808732486155e-05, + "loss": 0.634, + "step": 7261 + }, + { + "epoch": 1.1368190356919223, + "grad_norm": 1.6840064525604248, + "learning_rate": 4.501466275659824e-05, + "loss": 0.3288, + "step": 7262 + }, + { + "epoch": 1.1369755792110205, + "grad_norm": 2.1878674030303955, + "learning_rate": 4.5006516780710336e-05, + "loss": 0.3439, + "step": 7263 + }, + { + "epoch": 1.137132122730119, + "grad_norm": 2.218515396118164, + "learning_rate": 4.499837080482242e-05, + "loss": 0.4945, + "step": 7264 + }, + { + "epoch": 1.1372886662492172, + "grad_norm": 2.215306520462036, + "learning_rate": 4.499022482893451e-05, + "loss": 0.6144, + "step": 7265 + }, + { + "epoch": 1.1374452097683156, + "grad_norm": 1.5415308475494385, + "learning_rate": 4.49820788530466e-05, + "loss": 0.3637, + "step": 7266 + }, + { + "epoch": 1.1376017532874139, + "grad_norm": 2.8170719146728516, + "learning_rate": 4.4973932877158684e-05, + "loss": 0.4521, + "step": 7267 + }, + { + "epoch": 1.1377582968065123, + "grad_norm": 1.865453839302063, + "learning_rate": 4.4965786901270775e-05, + "loss": 0.5107, + "step": 7268 + }, + { + "epoch": 1.1379148403256105, + "grad_norm": 1.3356142044067383, + "learning_rate": 4.4957640925382865e-05, + "loss": 0.4599, + "step": 7269 + }, + { + "epoch": 1.1380713838447087, + "grad_norm": 1.8151419162750244, + "learning_rate": 4.494949494949495e-05, + "loss": 0.7115, + "step": 7270 + }, + { + "epoch": 
1.1382279273638072, + "grad_norm": 2.207143545150757, + "learning_rate": 4.494134897360704e-05, + "loss": 0.6508, + "step": 7271 + }, + { + "epoch": 1.1383844708829054, + "grad_norm": 2.8030431270599365, + "learning_rate": 4.493320299771913e-05, + "loss": 0.7656, + "step": 7272 + }, + { + "epoch": 1.1385410144020038, + "grad_norm": 3.6412196159362793, + "learning_rate": 4.492505702183122e-05, + "loss": 0.8112, + "step": 7273 + }, + { + "epoch": 1.138697557921102, + "grad_norm": 2.7121756076812744, + "learning_rate": 4.4916911045943305e-05, + "loss": 0.9212, + "step": 7274 + }, + { + "epoch": 1.1388541014402005, + "grad_norm": 4.243899345397949, + "learning_rate": 4.4908765070055395e-05, + "loss": 1.5668, + "step": 7275 + }, + { + "epoch": 1.1390106449592987, + "grad_norm": 2.812617778778076, + "learning_rate": 4.4900619094167486e-05, + "loss": 0.8326, + "step": 7276 + }, + { + "epoch": 1.139167188478397, + "grad_norm": 5.445814609527588, + "learning_rate": 4.489247311827957e-05, + "loss": 0.9859, + "step": 7277 + }, + { + "epoch": 1.1393237319974954, + "grad_norm": 3.0909717082977295, + "learning_rate": 4.488432714239167e-05, + "loss": 1.1357, + "step": 7278 + }, + { + "epoch": 1.1394802755165936, + "grad_norm": 2.686894655227661, + "learning_rate": 4.487618116650375e-05, + "loss": 1.0457, + "step": 7279 + }, + { + "epoch": 1.139636819035692, + "grad_norm": 5.899528503417969, + "learning_rate": 4.4868035190615834e-05, + "loss": 1.6448, + "step": 7280 + }, + { + "epoch": 1.1397933625547902, + "grad_norm": 3.4505701065063477, + "learning_rate": 4.485988921472793e-05, + "loss": 1.2562, + "step": 7281 + }, + { + "epoch": 1.1399499060738885, + "grad_norm": 5.522470951080322, + "learning_rate": 4.4851743238840015e-05, + "loss": 1.5828, + "step": 7282 + }, + { + "epoch": 1.140106449592987, + "grad_norm": 2.21687912940979, + "learning_rate": 4.4843597262952106e-05, + "loss": 0.6982, + "step": 7283 + }, + { + "epoch": 1.1402629931120851, + "grad_norm": 3.9291961193084717, + 
"learning_rate": 4.4835451287064196e-05, + "loss": 0.8267, + "step": 7284 + }, + { + "epoch": 1.1404195366311836, + "grad_norm": 1.8007020950317383, + "learning_rate": 4.482730531117628e-05, + "loss": 0.701, + "step": 7285 + }, + { + "epoch": 1.1405760801502818, + "grad_norm": 2.513591766357422, + "learning_rate": 4.481915933528837e-05, + "loss": 0.3378, + "step": 7286 + }, + { + "epoch": 1.14073262366938, + "grad_norm": 2.372490167617798, + "learning_rate": 4.481101335940046e-05, + "loss": 0.7745, + "step": 7287 + }, + { + "epoch": 1.1408891671884784, + "grad_norm": 3.523176908493042, + "learning_rate": 4.4802867383512545e-05, + "loss": 0.315, + "step": 7288 + }, + { + "epoch": 1.1410457107075767, + "grad_norm": 0.4556964337825775, + "learning_rate": 4.4794721407624635e-05, + "loss": 0.2318, + "step": 7289 + }, + { + "epoch": 1.141202254226675, + "grad_norm": 0.7262635231018066, + "learning_rate": 4.478657543173672e-05, + "loss": 0.2355, + "step": 7290 + }, + { + "epoch": 1.1413587977457733, + "grad_norm": 0.7299020886421204, + "learning_rate": 4.4778429455848816e-05, + "loss": 0.2184, + "step": 7291 + }, + { + "epoch": 1.1415153412648715, + "grad_norm": 0.7772171497344971, + "learning_rate": 4.47702834799609e-05, + "loss": 0.1919, + "step": 7292 + }, + { + "epoch": 1.14167188478397, + "grad_norm": 0.6234022974967957, + "learning_rate": 4.476213750407299e-05, + "loss": 0.2707, + "step": 7293 + }, + { + "epoch": 1.1418284283030682, + "grad_norm": 0.608502984046936, + "learning_rate": 4.475399152818508e-05, + "loss": 0.223, + "step": 7294 + }, + { + "epoch": 1.1419849718221666, + "grad_norm": 1.0083324909210205, + "learning_rate": 4.4745845552297165e-05, + "loss": 0.3349, + "step": 7295 + }, + { + "epoch": 1.1421415153412648, + "grad_norm": 0.6245504021644592, + "learning_rate": 4.4737699576409255e-05, + "loss": 0.2115, + "step": 7296 + }, + { + "epoch": 1.142298058860363, + "grad_norm": 0.6965034604072571, + "learning_rate": 4.4729553600521346e-05, + "loss": 
0.2182, + "step": 7297 + }, + { + "epoch": 1.1424546023794615, + "grad_norm": 1.333763599395752, + "learning_rate": 4.472140762463343e-05, + "loss": 0.19, + "step": 7298 + }, + { + "epoch": 1.1426111458985597, + "grad_norm": 0.6153396964073181, + "learning_rate": 4.471326164874552e-05, + "loss": 0.1871, + "step": 7299 + }, + { + "epoch": 1.1427676894176582, + "grad_norm": 1.097135305404663, + "learning_rate": 4.470511567285761e-05, + "loss": 0.3531, + "step": 7300 + }, + { + "epoch": 1.1429242329367564, + "grad_norm": 0.8024317622184753, + "learning_rate": 4.46969696969697e-05, + "loss": 0.3647, + "step": 7301 + }, + { + "epoch": 1.1430807764558548, + "grad_norm": 1.363620400428772, + "learning_rate": 4.4688823721081785e-05, + "loss": 0.2567, + "step": 7302 + }, + { + "epoch": 1.143237319974953, + "grad_norm": 1.2441742420196533, + "learning_rate": 4.4680677745193876e-05, + "loss": 0.3655, + "step": 7303 + }, + { + "epoch": 1.1433938634940513, + "grad_norm": 1.34625244140625, + "learning_rate": 4.4672531769305966e-05, + "loss": 0.2461, + "step": 7304 + }, + { + "epoch": 1.1435504070131497, + "grad_norm": 1.5352834463119507, + "learning_rate": 4.466438579341805e-05, + "loss": 0.3887, + "step": 7305 + }, + { + "epoch": 1.143706950532248, + "grad_norm": 1.568723440170288, + "learning_rate": 4.465623981753014e-05, + "loss": 0.5388, + "step": 7306 + }, + { + "epoch": 1.1438634940513464, + "grad_norm": 1.2335896492004395, + "learning_rate": 4.464809384164223e-05, + "loss": 0.4743, + "step": 7307 + }, + { + "epoch": 1.1440200375704446, + "grad_norm": 3.1800618171691895, + "learning_rate": 4.4639947865754315e-05, + "loss": 0.4994, + "step": 7308 + }, + { + "epoch": 1.144176581089543, + "grad_norm": 1.5825868844985962, + "learning_rate": 4.463180188986641e-05, + "loss": 0.4082, + "step": 7309 + }, + { + "epoch": 1.1443331246086412, + "grad_norm": 2.244006872177124, + "learning_rate": 4.4623655913978496e-05, + "loss": 0.771, + "step": 7310 + }, + { + "epoch": 
1.1444896681277394, + "grad_norm": 1.1375467777252197, + "learning_rate": 4.461550993809058e-05, + "loss": 0.2905, + "step": 7311 + }, + { + "epoch": 1.1446462116468379, + "grad_norm": 3.4926326274871826, + "learning_rate": 4.460736396220268e-05, + "loss": 0.6095, + "step": 7312 + }, + { + "epoch": 1.144802755165936, + "grad_norm": 2.5832858085632324, + "learning_rate": 4.459921798631476e-05, + "loss": 0.5719, + "step": 7313 + }, + { + "epoch": 1.1449592986850345, + "grad_norm": 2.9117021560668945, + "learning_rate": 4.459107201042685e-05, + "loss": 0.4807, + "step": 7314 + }, + { + "epoch": 1.1451158422041328, + "grad_norm": 1.920396089553833, + "learning_rate": 4.458292603453894e-05, + "loss": 0.5037, + "step": 7315 + }, + { + "epoch": 1.145272385723231, + "grad_norm": 3.290886640548706, + "learning_rate": 4.4574780058651025e-05, + "loss": 0.7784, + "step": 7316 + }, + { + "epoch": 1.1454289292423294, + "grad_norm": 1.4350833892822266, + "learning_rate": 4.4566634082763116e-05, + "loss": 0.5027, + "step": 7317 + }, + { + "epoch": 1.1455854727614276, + "grad_norm": 2.5722649097442627, + "learning_rate": 4.4558488106875206e-05, + "loss": 0.8406, + "step": 7318 + }, + { + "epoch": 1.145742016280526, + "grad_norm": 1.3408175706863403, + "learning_rate": 4.45503421309873e-05, + "loss": 0.4607, + "step": 7319 + }, + { + "epoch": 1.1458985597996243, + "grad_norm": 2.088860511779785, + "learning_rate": 4.454219615509938e-05, + "loss": 0.7195, + "step": 7320 + }, + { + "epoch": 1.1460551033187225, + "grad_norm": 3.4659061431884766, + "learning_rate": 4.453405017921147e-05, + "loss": 0.8817, + "step": 7321 + }, + { + "epoch": 1.146211646837821, + "grad_norm": 3.5971364974975586, + "learning_rate": 4.452590420332356e-05, + "loss": 0.3939, + "step": 7322 + }, + { + "epoch": 1.1463681903569192, + "grad_norm": 3.105269193649292, + "learning_rate": 4.4517758227435645e-05, + "loss": 0.9098, + "step": 7323 + }, + { + "epoch": 1.1465247338760176, + "grad_norm": 3.4811038970947266, 
+ "learning_rate": 4.4509612251547736e-05, + "loss": 0.915, + "step": 7324 + }, + { + "epoch": 1.1466812773951158, + "grad_norm": 1.8780028820037842, + "learning_rate": 4.4501466275659826e-05, + "loss": 0.6238, + "step": 7325 + }, + { + "epoch": 1.146837820914214, + "grad_norm": 2.0529115200042725, + "learning_rate": 4.449332029977191e-05, + "loss": 0.9713, + "step": 7326 + }, + { + "epoch": 1.1469943644333125, + "grad_norm": 3.8605687618255615, + "learning_rate": 4.448517432388401e-05, + "loss": 0.7343, + "step": 7327 + }, + { + "epoch": 1.1471509079524107, + "grad_norm": 7.069092273712158, + "learning_rate": 4.447702834799609e-05, + "loss": 0.8285, + "step": 7328 + }, + { + "epoch": 1.1473074514715091, + "grad_norm": 2.2536985874176025, + "learning_rate": 4.4468882372108175e-05, + "loss": 1.0183, + "step": 7329 + }, + { + "epoch": 1.1474639949906074, + "grad_norm": 5.852692127227783, + "learning_rate": 4.446073639622027e-05, + "loss": 0.771, + "step": 7330 + }, + { + "epoch": 1.1476205385097056, + "grad_norm": 3.110602617263794, + "learning_rate": 4.4452590420332356e-05, + "loss": 1.2486, + "step": 7331 + }, + { + "epoch": 1.147777082028804, + "grad_norm": 3.2110061645507812, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.1836, + "step": 7332 + }, + { + "epoch": 1.1479336255479022, + "grad_norm": 5.160244464874268, + "learning_rate": 4.443629846855654e-05, + "loss": 1.1039, + "step": 7333 + }, + { + "epoch": 1.1480901690670007, + "grad_norm": 7.7566657066345215, + "learning_rate": 4.442815249266862e-05, + "loss": 1.5931, + "step": 7334 + }, + { + "epoch": 1.148246712586099, + "grad_norm": 1.6389583349227905, + "learning_rate": 4.442000651678071e-05, + "loss": 0.2269, + "step": 7335 + }, + { + "epoch": 1.1484032561051973, + "grad_norm": 5.9702043533325195, + "learning_rate": 4.44118605408928e-05, + "loss": 0.8387, + "step": 7336 + }, + { + "epoch": 1.1485597996242956, + "grad_norm": 5.6341776847839355, + "learning_rate": 4.440371456500489e-05, + "loss": 
0.3158, + "step": 7337 + }, + { + "epoch": 1.1487163431433938, + "grad_norm": 2.4342403411865234, + "learning_rate": 4.4395568589116976e-05, + "loss": 0.9368, + "step": 7338 + }, + { + "epoch": 1.1488728866624922, + "grad_norm": 0.8099783062934875, + "learning_rate": 4.438742261322907e-05, + "loss": 0.2298, + "step": 7339 + }, + { + "epoch": 1.1490294301815904, + "grad_norm": 0.3929459750652313, + "learning_rate": 4.437927663734116e-05, + "loss": 0.1477, + "step": 7340 + }, + { + "epoch": 1.1491859737006889, + "grad_norm": 0.38002824783325195, + "learning_rate": 4.437113066145324e-05, + "loss": 0.187, + "step": 7341 + }, + { + "epoch": 1.149342517219787, + "grad_norm": 0.6055118441581726, + "learning_rate": 4.436298468556533e-05, + "loss": 0.2217, + "step": 7342 + }, + { + "epoch": 1.1494990607388855, + "grad_norm": 0.4108304977416992, + "learning_rate": 4.435483870967742e-05, + "loss": 0.1409, + "step": 7343 + }, + { + "epoch": 1.1496556042579837, + "grad_norm": 0.6558899879455566, + "learning_rate": 4.4346692733789506e-05, + "loss": 0.258, + "step": 7344 + }, + { + "epoch": 1.149812147777082, + "grad_norm": 1.0400702953338623, + "learning_rate": 4.43385467579016e-05, + "loss": 0.2157, + "step": 7345 + }, + { + "epoch": 1.1499686912961804, + "grad_norm": 0.7942031621932983, + "learning_rate": 4.433040078201369e-05, + "loss": 0.277, + "step": 7346 + }, + { + "epoch": 1.1501252348152786, + "grad_norm": 0.6176934242248535, + "learning_rate": 4.432225480612577e-05, + "loss": 0.1928, + "step": 7347 + }, + { + "epoch": 1.150281778334377, + "grad_norm": 0.7902481555938721, + "learning_rate": 4.431410883023787e-05, + "loss": 0.2432, + "step": 7348 + }, + { + "epoch": 1.1504383218534753, + "grad_norm": 1.116546392440796, + "learning_rate": 4.430596285434995e-05, + "loss": 0.1918, + "step": 7349 + }, + { + "epoch": 1.1505948653725735, + "grad_norm": 0.8673053979873657, + "learning_rate": 4.429781687846204e-05, + "loss": 0.3269, + "step": 7350 + }, + { + "epoch": 
1.150751408891672, + "grad_norm": 0.9393253922462463, + "learning_rate": 4.428967090257413e-05, + "loss": 0.4055, + "step": 7351 + }, + { + "epoch": 1.1509079524107702, + "grad_norm": 2.010387659072876, + "learning_rate": 4.4281524926686216e-05, + "loss": 0.4114, + "step": 7352 + }, + { + "epoch": 1.1510644959298686, + "grad_norm": 1.2237019538879395, + "learning_rate": 4.427337895079831e-05, + "loss": 0.2913, + "step": 7353 + }, + { + "epoch": 1.1512210394489668, + "grad_norm": 0.88564532995224, + "learning_rate": 4.42652329749104e-05, + "loss": 0.3375, + "step": 7354 + }, + { + "epoch": 1.151377582968065, + "grad_norm": 1.088997721672058, + "learning_rate": 4.425708699902248e-05, + "loss": 0.2878, + "step": 7355 + }, + { + "epoch": 1.1515341264871635, + "grad_norm": 1.866581678390503, + "learning_rate": 4.424894102313457e-05, + "loss": 0.4082, + "step": 7356 + }, + { + "epoch": 1.1516906700062617, + "grad_norm": 2.173382043838501, + "learning_rate": 4.424079504724666e-05, + "loss": 0.5789, + "step": 7357 + }, + { + "epoch": 1.1518472135253601, + "grad_norm": 1.0355849266052246, + "learning_rate": 4.423264907135875e-05, + "loss": 0.4111, + "step": 7358 + }, + { + "epoch": 1.1520037570444583, + "grad_norm": 2.454167604446411, + "learning_rate": 4.4224503095470837e-05, + "loss": 0.295, + "step": 7359 + }, + { + "epoch": 1.1521603005635566, + "grad_norm": 2.2854068279266357, + "learning_rate": 4.421635711958293e-05, + "loss": 0.5612, + "step": 7360 + }, + { + "epoch": 1.152316844082655, + "grad_norm": 1.7729896306991577, + "learning_rate": 4.420821114369502e-05, + "loss": 0.54, + "step": 7361 + }, + { + "epoch": 1.1524733876017532, + "grad_norm": 2.142289161682129, + "learning_rate": 4.42000651678071e-05, + "loss": 0.5374, + "step": 7362 + }, + { + "epoch": 1.1526299311208517, + "grad_norm": 1.8909095525741577, + "learning_rate": 4.41919191919192e-05, + "loss": 0.5223, + "step": 7363 + }, + { + "epoch": 1.1527864746399499, + "grad_norm": 2.314936637878418, + 
"learning_rate": 4.418377321603128e-05, + "loss": 0.6095, + "step": 7364 + }, + { + "epoch": 1.152943018159048, + "grad_norm": 2.016695976257324, + "learning_rate": 4.4175627240143366e-05, + "loss": 0.4566, + "step": 7365 + }, + { + "epoch": 1.1530995616781465, + "grad_norm": 1.5262436866760254, + "learning_rate": 4.4167481264255463e-05, + "loss": 0.4189, + "step": 7366 + }, + { + "epoch": 1.1532561051972448, + "grad_norm": 3.2376813888549805, + "learning_rate": 4.415933528836755e-05, + "loss": 0.8654, + "step": 7367 + }, + { + "epoch": 1.1534126487163432, + "grad_norm": 2.292970895767212, + "learning_rate": 4.415118931247964e-05, + "loss": 0.7473, + "step": 7368 + }, + { + "epoch": 1.1535691922354414, + "grad_norm": 1.1853928565979004, + "learning_rate": 4.414304333659173e-05, + "loss": 0.335, + "step": 7369 + }, + { + "epoch": 1.1537257357545398, + "grad_norm": 2.8809306621551514, + "learning_rate": 4.413489736070381e-05, + "loss": 0.8826, + "step": 7370 + }, + { + "epoch": 1.153882279273638, + "grad_norm": 3.58487868309021, + "learning_rate": 4.41267513848159e-05, + "loss": 0.5258, + "step": 7371 + }, + { + "epoch": 1.1540388227927363, + "grad_norm": 1.618719458580017, + "learning_rate": 4.411860540892799e-05, + "loss": 0.5741, + "step": 7372 + }, + { + "epoch": 1.1541953663118347, + "grad_norm": 3.884047508239746, + "learning_rate": 4.411045943304008e-05, + "loss": 1.1439, + "step": 7373 + }, + { + "epoch": 1.154351909830933, + "grad_norm": 3.2586042881011963, + "learning_rate": 4.410231345715217e-05, + "loss": 0.5261, + "step": 7374 + }, + { + "epoch": 1.1545084533500314, + "grad_norm": 2.7470972537994385, + "learning_rate": 4.409416748126426e-05, + "loss": 1.0121, + "step": 7375 + }, + { + "epoch": 1.1546649968691296, + "grad_norm": 2.0722053050994873, + "learning_rate": 4.408602150537635e-05, + "loss": 0.6812, + "step": 7376 + }, + { + "epoch": 1.154821540388228, + "grad_norm": 3.313063383102417, + "learning_rate": 4.407787552948843e-05, + "loss": 1.0474, + 
"step": 7377 + }, + { + "epoch": 1.1549780839073263, + "grad_norm": 3.281834840774536, + "learning_rate": 4.406972955360052e-05, + "loss": 0.7429, + "step": 7378 + }, + { + "epoch": 1.1551346274264245, + "grad_norm": 8.199694633483887, + "learning_rate": 4.406158357771261e-05, + "loss": 0.7714, + "step": 7379 + }, + { + "epoch": 1.155291170945523, + "grad_norm": 3.4682552814483643, + "learning_rate": 4.40534376018247e-05, + "loss": 1.0348, + "step": 7380 + }, + { + "epoch": 1.1554477144646211, + "grad_norm": 2.88739013671875, + "learning_rate": 4.4045291625936794e-05, + "loss": 1.1844, + "step": 7381 + }, + { + "epoch": 1.1556042579837196, + "grad_norm": 2.9808948040008545, + "learning_rate": 4.403714565004888e-05, + "loss": 1.1782, + "step": 7382 + }, + { + "epoch": 1.1557608015028178, + "grad_norm": 3.1850578784942627, + "learning_rate": 4.402899967416096e-05, + "loss": 0.9267, + "step": 7383 + }, + { + "epoch": 1.155917345021916, + "grad_norm": 4.631824970245361, + "learning_rate": 4.402085369827306e-05, + "loss": 0.6829, + "step": 7384 + }, + { + "epoch": 1.1560738885410144, + "grad_norm": 2.5808918476104736, + "learning_rate": 4.401270772238514e-05, + "loss": 0.536, + "step": 7385 + }, + { + "epoch": 1.1562304320601127, + "grad_norm": 4.396053314208984, + "learning_rate": 4.400456174649723e-05, + "loss": 0.6492, + "step": 7386 + }, + { + "epoch": 1.156386975579211, + "grad_norm": 1.2842758893966675, + "learning_rate": 4.3996415770609324e-05, + "loss": 0.1745, + "step": 7387 + }, + { + "epoch": 1.1565435190983093, + "grad_norm": 3.5225210189819336, + "learning_rate": 4.398826979472141e-05, + "loss": 0.8927, + "step": 7388 + }, + { + "epoch": 1.1567000626174075, + "grad_norm": 0.4123728573322296, + "learning_rate": 4.39801238188335e-05, + "loss": 0.2357, + "step": 7389 + }, + { + "epoch": 1.156856606136506, + "grad_norm": 0.5017610788345337, + "learning_rate": 4.397197784294559e-05, + "loss": 0.208, + "step": 7390 + }, + { + "epoch": 1.1570131496556042, + 
"grad_norm": 0.6419593095779419, + "learning_rate": 4.396383186705767e-05, + "loss": 0.2089, + "step": 7391 + }, + { + "epoch": 1.1571696931747026, + "grad_norm": 0.7808046936988831, + "learning_rate": 4.395568589116976e-05, + "loss": 0.1917, + "step": 7392 + }, + { + "epoch": 1.1573262366938009, + "grad_norm": 0.9203245639801025, + "learning_rate": 4.3947539915281853e-05, + "loss": 0.2477, + "step": 7393 + }, + { + "epoch": 1.157482780212899, + "grad_norm": 0.8702841997146606, + "learning_rate": 4.3939393939393944e-05, + "loss": 0.3116, + "step": 7394 + }, + { + "epoch": 1.1576393237319975, + "grad_norm": 0.8706940412521362, + "learning_rate": 4.393124796350603e-05, + "loss": 0.1867, + "step": 7395 + }, + { + "epoch": 1.1577958672510957, + "grad_norm": 1.2878637313842773, + "learning_rate": 4.392310198761812e-05, + "loss": 0.2197, + "step": 7396 + }, + { + "epoch": 1.1579524107701942, + "grad_norm": 0.7053172588348389, + "learning_rate": 4.391495601173021e-05, + "loss": 0.2309, + "step": 7397 + }, + { + "epoch": 1.1581089542892924, + "grad_norm": 1.4703402519226074, + "learning_rate": 4.390681003584229e-05, + "loss": 0.2251, + "step": 7398 + }, + { + "epoch": 1.1582654978083906, + "grad_norm": 0.6635390520095825, + "learning_rate": 4.389866405995438e-05, + "loss": 0.1751, + "step": 7399 + }, + { + "epoch": 1.158422041327489, + "grad_norm": 1.7713674306869507, + "learning_rate": 4.3890518084066474e-05, + "loss": 0.3676, + "step": 7400 + }, + { + "epoch": 1.1585785848465873, + "grad_norm": 1.3054169416427612, + "learning_rate": 4.388237210817856e-05, + "loss": 0.3265, + "step": 7401 + }, + { + "epoch": 1.1587351283656857, + "grad_norm": 1.0266096591949463, + "learning_rate": 4.3874226132290655e-05, + "loss": 0.1901, + "step": 7402 + }, + { + "epoch": 1.158891671884784, + "grad_norm": 1.2729531526565552, + "learning_rate": 4.386608015640274e-05, + "loss": 0.2782, + "step": 7403 + }, + { + "epoch": 1.1590482154038824, + "grad_norm": 2.0501837730407715, + 
"learning_rate": 4.385793418051483e-05, + "loss": 0.3598, + "step": 7404 + }, + { + "epoch": 1.1592047589229806, + "grad_norm": 1.8013752698898315, + "learning_rate": 4.384978820462692e-05, + "loss": 0.4475, + "step": 7405 + }, + { + "epoch": 1.159361302442079, + "grad_norm": 0.7897927165031433, + "learning_rate": 4.3841642228739e-05, + "loss": 0.2157, + "step": 7406 + }, + { + "epoch": 1.1595178459611772, + "grad_norm": 0.9069951772689819, + "learning_rate": 4.3833496252851094e-05, + "loss": 0.2346, + "step": 7407 + }, + { + "epoch": 1.1596743894802755, + "grad_norm": 0.9619044661521912, + "learning_rate": 4.3825350276963184e-05, + "loss": 0.2987, + "step": 7408 + }, + { + "epoch": 1.159830932999374, + "grad_norm": 2.3944473266601562, + "learning_rate": 4.381720430107527e-05, + "loss": 0.9433, + "step": 7409 + }, + { + "epoch": 1.1599874765184721, + "grad_norm": 2.1403722763061523, + "learning_rate": 4.380905832518736e-05, + "loss": 0.4557, + "step": 7410 + }, + { + "epoch": 1.1601440200375706, + "grad_norm": 2.9070818424224854, + "learning_rate": 4.380091234929945e-05, + "loss": 0.384, + "step": 7411 + }, + { + "epoch": 1.1603005635566688, + "grad_norm": 2.2654614448547363, + "learning_rate": 4.379276637341154e-05, + "loss": 0.3155, + "step": 7412 + }, + { + "epoch": 1.160457107075767, + "grad_norm": 2.515272855758667, + "learning_rate": 4.378462039752362e-05, + "loss": 0.7071, + "step": 7413 + }, + { + "epoch": 1.1606136505948654, + "grad_norm": 3.1775381565093994, + "learning_rate": 4.3776474421635714e-05, + "loss": 0.6442, + "step": 7414 + }, + { + "epoch": 1.1607701941139636, + "grad_norm": 2.618929147720337, + "learning_rate": 4.3768328445747804e-05, + "loss": 0.8001, + "step": 7415 + }, + { + "epoch": 1.160926737633062, + "grad_norm": 3.2422475814819336, + "learning_rate": 4.376018246985989e-05, + "loss": 0.5077, + "step": 7416 + }, + { + "epoch": 1.1610832811521603, + "grad_norm": 0.9044103622436523, + "learning_rate": 4.375203649397198e-05, + "loss": 
0.266, + "step": 7417 + }, + { + "epoch": 1.1612398246712585, + "grad_norm": 4.24599552154541, + "learning_rate": 4.374389051808407e-05, + "loss": 0.5401, + "step": 7418 + }, + { + "epoch": 1.161396368190357, + "grad_norm": 1.5097661018371582, + "learning_rate": 4.373574454219615e-05, + "loss": 0.5958, + "step": 7419 + }, + { + "epoch": 1.1615529117094552, + "grad_norm": 1.769010066986084, + "learning_rate": 4.372759856630825e-05, + "loss": 0.4212, + "step": 7420 + }, + { + "epoch": 1.1617094552285536, + "grad_norm": 3.4853315353393555, + "learning_rate": 4.3719452590420334e-05, + "loss": 0.4874, + "step": 7421 + }, + { + "epoch": 1.1618659987476518, + "grad_norm": 2.514618396759033, + "learning_rate": 4.3711306614532424e-05, + "loss": 0.7844, + "step": 7422 + }, + { + "epoch": 1.16202254226675, + "grad_norm": 1.7896952629089355, + "learning_rate": 4.3703160638644515e-05, + "loss": 0.7573, + "step": 7423 + }, + { + "epoch": 1.1621790857858485, + "grad_norm": 2.755934476852417, + "learning_rate": 4.36950146627566e-05, + "loss": 0.7776, + "step": 7424 + }, + { + "epoch": 1.1623356293049467, + "grad_norm": 4.440371990203857, + "learning_rate": 4.368686868686869e-05, + "loss": 1.2375, + "step": 7425 + }, + { + "epoch": 1.1624921728240452, + "grad_norm": 3.78739333152771, + "learning_rate": 4.367872271098078e-05, + "loss": 0.8858, + "step": 7426 + }, + { + "epoch": 1.1626487163431434, + "grad_norm": 2.59503436088562, + "learning_rate": 4.3670576735092864e-05, + "loss": 0.9644, + "step": 7427 + }, + { + "epoch": 1.1628052598622416, + "grad_norm": 2.6772429943084717, + "learning_rate": 4.3662430759204954e-05, + "loss": 1.0924, + "step": 7428 + }, + { + "epoch": 1.16296180338134, + "grad_norm": 1.8951579332351685, + "learning_rate": 4.3654284783317045e-05, + "loss": 0.9897, + "step": 7429 + }, + { + "epoch": 1.1631183469004382, + "grad_norm": 2.649754762649536, + "learning_rate": 4.3646138807429135e-05, + "loss": 1.688, + "step": 7430 + }, + { + "epoch": 
1.1632748904195367, + "grad_norm": 3.4860384464263916, + "learning_rate": 4.363799283154122e-05, + "loss": 1.6089, + "step": 7431 + }, + { + "epoch": 1.163431433938635, + "grad_norm": 3.382145404815674, + "learning_rate": 4.362984685565331e-05, + "loss": 0.9638, + "step": 7432 + }, + { + "epoch": 1.1635879774577333, + "grad_norm": 4.255252838134766, + "learning_rate": 4.36217008797654e-05, + "loss": 1.0226, + "step": 7433 + }, + { + "epoch": 1.1637445209768316, + "grad_norm": 2.0940427780151367, + "learning_rate": 4.3613554903877484e-05, + "loss": 0.3136, + "step": 7434 + }, + { + "epoch": 1.1639010644959298, + "grad_norm": 5.167916297912598, + "learning_rate": 4.3605408927989574e-05, + "loss": 0.9197, + "step": 7435 + }, + { + "epoch": 1.1640576080150282, + "grad_norm": 4.102384567260742, + "learning_rate": 4.3597262952101665e-05, + "loss": 0.9614, + "step": 7436 + }, + { + "epoch": 1.1642141515341264, + "grad_norm": 2.4266879558563232, + "learning_rate": 4.358911697621375e-05, + "loss": 0.353, + "step": 7437 + }, + { + "epoch": 1.1643706950532249, + "grad_norm": 2.508066177368164, + "learning_rate": 4.3580971000325846e-05, + "loss": 0.7013, + "step": 7438 + }, + { + "epoch": 1.164527238572323, + "grad_norm": 0.9760140180587769, + "learning_rate": 4.357282502443793e-05, + "loss": 0.1744, + "step": 7439 + }, + { + "epoch": 1.1646837820914215, + "grad_norm": 0.4670180380344391, + "learning_rate": 4.356467904855002e-05, + "loss": 0.1909, + "step": 7440 + }, + { + "epoch": 1.1648403256105198, + "grad_norm": 0.5468526482582092, + "learning_rate": 4.355653307266211e-05, + "loss": 0.2133, + "step": 7441 + }, + { + "epoch": 1.164996869129618, + "grad_norm": 0.54132479429245, + "learning_rate": 4.3548387096774194e-05, + "loss": 0.1539, + "step": 7442 + }, + { + "epoch": 1.1651534126487164, + "grad_norm": 0.6784156560897827, + "learning_rate": 4.3540241120886285e-05, + "loss": 0.1858, + "step": 7443 + }, + { + "epoch": 1.1653099561678146, + "grad_norm": 0.4071294963359833, 
+ "learning_rate": 4.3532095144998375e-05, + "loss": 0.1864, + "step": 7444 + }, + { + "epoch": 1.165466499686913, + "grad_norm": 0.8760849237442017, + "learning_rate": 4.352394916911046e-05, + "loss": 0.2662, + "step": 7445 + }, + { + "epoch": 1.1656230432060113, + "grad_norm": 0.67447429895401, + "learning_rate": 4.351580319322255e-05, + "loss": 0.2102, + "step": 7446 + }, + { + "epoch": 1.1657795867251095, + "grad_norm": 0.691402018070221, + "learning_rate": 4.350765721733464e-05, + "loss": 0.2341, + "step": 7447 + }, + { + "epoch": 1.165936130244208, + "grad_norm": 2.439497709274292, + "learning_rate": 4.349951124144673e-05, + "loss": 0.314, + "step": 7448 + }, + { + "epoch": 1.1660926737633062, + "grad_norm": 0.9053082466125488, + "learning_rate": 4.3491365265558814e-05, + "loss": 0.2674, + "step": 7449 + }, + { + "epoch": 1.1662492172824046, + "grad_norm": 0.9000219106674194, + "learning_rate": 4.3483219289670905e-05, + "loss": 0.3085, + "step": 7450 + }, + { + "epoch": 1.1664057608015028, + "grad_norm": 0.7985740900039673, + "learning_rate": 4.3475073313782996e-05, + "loss": 0.2196, + "step": 7451 + }, + { + "epoch": 1.166562304320601, + "grad_norm": 1.0354197025299072, + "learning_rate": 4.346692733789508e-05, + "loss": 0.2803, + "step": 7452 + }, + { + "epoch": 1.1667188478396995, + "grad_norm": 1.6833173036575317, + "learning_rate": 4.345878136200717e-05, + "loss": 0.3323, + "step": 7453 + }, + { + "epoch": 1.1668753913587977, + "grad_norm": 1.764115333557129, + "learning_rate": 4.345063538611926e-05, + "loss": 0.6394, + "step": 7454 + }, + { + "epoch": 1.1670319348778961, + "grad_norm": 0.8617987036705017, + "learning_rate": 4.3442489410231344e-05, + "loss": 0.1818, + "step": 7455 + }, + { + "epoch": 1.1671884783969944, + "grad_norm": 1.1596835851669312, + "learning_rate": 4.343434343434344e-05, + "loss": 0.3488, + "step": 7456 + }, + { + "epoch": 1.1673450219160926, + "grad_norm": 1.041130542755127, + "learning_rate": 4.3426197458455525e-05, + "loss": 
0.2502, + "step": 7457 + }, + { + "epoch": 1.167501565435191, + "grad_norm": 1.652297019958496, + "learning_rate": 4.341805148256761e-05, + "loss": 0.5116, + "step": 7458 + }, + { + "epoch": 1.1676581089542892, + "grad_norm": 1.4556409120559692, + "learning_rate": 4.3409905506679706e-05, + "loss": 0.4459, + "step": 7459 + }, + { + "epoch": 1.1678146524733877, + "grad_norm": 1.9038128852844238, + "learning_rate": 4.340175953079179e-05, + "loss": 0.8404, + "step": 7460 + }, + { + "epoch": 1.1679711959924859, + "grad_norm": 1.0308804512023926, + "learning_rate": 4.339361355490388e-05, + "loss": 0.2316, + "step": 7461 + }, + { + "epoch": 1.168127739511584, + "grad_norm": 2.1830906867980957, + "learning_rate": 4.338546757901597e-05, + "loss": 0.8563, + "step": 7462 + }, + { + "epoch": 1.1682842830306825, + "grad_norm": 1.4305647611618042, + "learning_rate": 4.3377321603128055e-05, + "loss": 0.324, + "step": 7463 + }, + { + "epoch": 1.1684408265497808, + "grad_norm": 2.033592462539673, + "learning_rate": 4.3369175627240145e-05, + "loss": 0.482, + "step": 7464 + }, + { + "epoch": 1.1685973700688792, + "grad_norm": 5.867396354675293, + "learning_rate": 4.3361029651352236e-05, + "loss": 0.7696, + "step": 7465 + }, + { + "epoch": 1.1687539135879774, + "grad_norm": 2.2162790298461914, + "learning_rate": 4.3352883675464326e-05, + "loss": 0.4435, + "step": 7466 + }, + { + "epoch": 1.1689104571070759, + "grad_norm": 2.5263688564300537, + "learning_rate": 4.334473769957641e-05, + "loss": 0.7696, + "step": 7467 + }, + { + "epoch": 1.169067000626174, + "grad_norm": 7.515866756439209, + "learning_rate": 4.33365917236885e-05, + "loss": 0.6673, + "step": 7468 + }, + { + "epoch": 1.1692235441452723, + "grad_norm": 2.7848362922668457, + "learning_rate": 4.332844574780059e-05, + "loss": 1.1392, + "step": 7469 + }, + { + "epoch": 1.1693800876643707, + "grad_norm": 1.9060924053192139, + "learning_rate": 4.3320299771912675e-05, + "loss": 0.358, + "step": 7470 + }, + { + "epoch": 
1.169536631183469, + "grad_norm": 1.4615267515182495, + "learning_rate": 4.3312153796024765e-05, + "loss": 0.553, + "step": 7471 + }, + { + "epoch": 1.1696931747025674, + "grad_norm": 5.306066989898682, + "learning_rate": 4.3304007820136856e-05, + "loss": 0.7455, + "step": 7472 + }, + { + "epoch": 1.1698497182216656, + "grad_norm": 2.3401408195495605, + "learning_rate": 4.329586184424894e-05, + "loss": 0.5275, + "step": 7473 + }, + { + "epoch": 1.170006261740764, + "grad_norm": 2.5843100547790527, + "learning_rate": 4.328771586836104e-05, + "loss": 0.8604, + "step": 7474 + }, + { + "epoch": 1.1701628052598623, + "grad_norm": 2.5978500843048096, + "learning_rate": 4.327956989247312e-05, + "loss": 0.6654, + "step": 7475 + }, + { + "epoch": 1.1703193487789605, + "grad_norm": 1.7436059713363647, + "learning_rate": 4.3271423916585204e-05, + "loss": 0.6179, + "step": 7476 + }, + { + "epoch": 1.170475892298059, + "grad_norm": 2.125009059906006, + "learning_rate": 4.32632779406973e-05, + "loss": 0.7193, + "step": 7477 + }, + { + "epoch": 1.1706324358171571, + "grad_norm": 4.716000556945801, + "learning_rate": 4.3255131964809385e-05, + "loss": 1.3822, + "step": 7478 + }, + { + "epoch": 1.1707889793362556, + "grad_norm": 2.420305013656616, + "learning_rate": 4.3246985988921476e-05, + "loss": 0.6889, + "step": 7479 + }, + { + "epoch": 1.1709455228553538, + "grad_norm": 2.528611421585083, + "learning_rate": 4.3238840013033567e-05, + "loss": 1.0534, + "step": 7480 + }, + { + "epoch": 1.171102066374452, + "grad_norm": 1.9380764961242676, + "learning_rate": 4.323069403714565e-05, + "loss": 1.1575, + "step": 7481 + }, + { + "epoch": 1.1712586098935505, + "grad_norm": 2.891427516937256, + "learning_rate": 4.322254806125774e-05, + "loss": 1.267, + "step": 7482 + }, + { + "epoch": 1.1714151534126487, + "grad_norm": 2.6554548740386963, + "learning_rate": 4.321440208536983e-05, + "loss": 1.4943, + "step": 7483 + }, + { + "epoch": 1.1715716969317471, + "grad_norm": 2.54551100730896, + 
"learning_rate": 4.320625610948192e-05, + "loss": 0.9845, + "step": 7484 + }, + { + "epoch": 1.1717282404508453, + "grad_norm": 1.8217066526412964, + "learning_rate": 4.3198110133594006e-05, + "loss": 1.0278, + "step": 7485 + }, + { + "epoch": 1.1718847839699436, + "grad_norm": 2.3901453018188477, + "learning_rate": 4.3189964157706096e-05, + "loss": 0.9356, + "step": 7486 + }, + { + "epoch": 1.172041327489042, + "grad_norm": 3.2649734020233154, + "learning_rate": 4.318181818181819e-05, + "loss": 0.6292, + "step": 7487 + }, + { + "epoch": 1.1721978710081402, + "grad_norm": 2.075179100036621, + "learning_rate": 4.317367220593027e-05, + "loss": 0.4637, + "step": 7488 + }, + { + "epoch": 1.1723544145272387, + "grad_norm": 0.9878961443901062, + "learning_rate": 4.316552623004236e-05, + "loss": 0.2463, + "step": 7489 + }, + { + "epoch": 1.1725109580463369, + "grad_norm": 0.6367024183273315, + "learning_rate": 4.315738025415445e-05, + "loss": 0.1925, + "step": 7490 + }, + { + "epoch": 1.172667501565435, + "grad_norm": 1.1499682664871216, + "learning_rate": 4.3149234278266535e-05, + "loss": 0.6663, + "step": 7491 + }, + { + "epoch": 1.1728240450845335, + "grad_norm": 1.2073107957839966, + "learning_rate": 4.314108830237863e-05, + "loss": 0.3479, + "step": 7492 + }, + { + "epoch": 1.1729805886036317, + "grad_norm": 0.6336411833763123, + "learning_rate": 4.3132942326490716e-05, + "loss": 0.2178, + "step": 7493 + }, + { + "epoch": 1.1731371321227302, + "grad_norm": 0.736139714717865, + "learning_rate": 4.31247963506028e-05, + "loss": 0.2522, + "step": 7494 + }, + { + "epoch": 1.1732936756418284, + "grad_norm": 0.7840383052825928, + "learning_rate": 4.31166503747149e-05, + "loss": 0.1973, + "step": 7495 + }, + { + "epoch": 1.1734502191609266, + "grad_norm": 0.7892981171607971, + "learning_rate": 4.310850439882698e-05, + "loss": 0.35, + "step": 7496 + }, + { + "epoch": 1.173606762680025, + "grad_norm": 0.799587070941925, + "learning_rate": 4.310035842293907e-05, + "loss": 
0.4496, + "step": 7497 + }, + { + "epoch": 1.1737633061991233, + "grad_norm": 1.9302626848220825, + "learning_rate": 4.309221244705116e-05, + "loss": 0.2257, + "step": 7498 + }, + { + "epoch": 1.1739198497182217, + "grad_norm": 0.8015849590301514, + "learning_rate": 4.3084066471163246e-05, + "loss": 0.319, + "step": 7499 + }, + { + "epoch": 1.17407639323732, + "grad_norm": 0.9374929070472717, + "learning_rate": 4.3075920495275336e-05, + "loss": 0.3655, + "step": 7500 + }, + { + "epoch": 1.1742329367564184, + "grad_norm": 3.3480637073516846, + "learning_rate": 4.306777451938743e-05, + "loss": 0.3127, + "step": 7501 + }, + { + "epoch": 1.1743894802755166, + "grad_norm": 1.3478431701660156, + "learning_rate": 4.305962854349951e-05, + "loss": 0.2634, + "step": 7502 + }, + { + "epoch": 1.1745460237946148, + "grad_norm": 1.468350887298584, + "learning_rate": 4.30514825676116e-05, + "loss": 0.3269, + "step": 7503 + }, + { + "epoch": 1.1747025673137133, + "grad_norm": 0.903861939907074, + "learning_rate": 4.304333659172369e-05, + "loss": 0.2745, + "step": 7504 + }, + { + "epoch": 1.1748591108328115, + "grad_norm": 0.7582395672798157, + "learning_rate": 4.303519061583578e-05, + "loss": 0.2302, + "step": 7505 + }, + { + "epoch": 1.17501565435191, + "grad_norm": 1.234514594078064, + "learning_rate": 4.3027044639947866e-05, + "loss": 0.2592, + "step": 7506 + }, + { + "epoch": 1.1751721978710081, + "grad_norm": 1.2538691759109497, + "learning_rate": 4.3018898664059957e-05, + "loss": 0.4967, + "step": 7507 + }, + { + "epoch": 1.1753287413901066, + "grad_norm": 1.834860920906067, + "learning_rate": 4.301075268817205e-05, + "loss": 0.5318, + "step": 7508 + }, + { + "epoch": 1.1754852849092048, + "grad_norm": 2.4056074619293213, + "learning_rate": 4.300260671228413e-05, + "loss": 0.5198, + "step": 7509 + }, + { + "epoch": 1.175641828428303, + "grad_norm": 1.557142972946167, + "learning_rate": 4.299446073639623e-05, + "loss": 0.4099, + "step": 7510 + }, + { + "epoch": 
1.1757983719474014, + "grad_norm": 0.9414952397346497, + "learning_rate": 4.298631476050831e-05, + "loss": 0.3774, + "step": 7511 + }, + { + "epoch": 1.1759549154664997, + "grad_norm": 1.3344507217407227, + "learning_rate": 4.2978168784620396e-05, + "loss": 0.4017, + "step": 7512 + }, + { + "epoch": 1.176111458985598, + "grad_norm": 1.1285439729690552, + "learning_rate": 4.297002280873249e-05, + "loss": 0.31, + "step": 7513 + }, + { + "epoch": 1.1762680025046963, + "grad_norm": 2.28080415725708, + "learning_rate": 4.296187683284458e-05, + "loss": 0.7338, + "step": 7514 + }, + { + "epoch": 1.1764245460237945, + "grad_norm": 1.200590968132019, + "learning_rate": 4.295373085695667e-05, + "loss": 0.4944, + "step": 7515 + }, + { + "epoch": 1.176581089542893, + "grad_norm": 4.957828521728516, + "learning_rate": 4.294558488106876e-05, + "loss": 0.5961, + "step": 7516 + }, + { + "epoch": 1.1767376330619912, + "grad_norm": 2.644402027130127, + "learning_rate": 4.293743890518084e-05, + "loss": 0.8586, + "step": 7517 + }, + { + "epoch": 1.1768941765810896, + "grad_norm": 1.5281527042388916, + "learning_rate": 4.292929292929293e-05, + "loss": 0.4925, + "step": 7518 + }, + { + "epoch": 1.1770507201001879, + "grad_norm": 2.340204954147339, + "learning_rate": 4.292114695340502e-05, + "loss": 0.5871, + "step": 7519 + }, + { + "epoch": 1.177207263619286, + "grad_norm": 4.4752116203308105, + "learning_rate": 4.2913000977517106e-05, + "loss": 0.3558, + "step": 7520 + }, + { + "epoch": 1.1773638071383845, + "grad_norm": 1.0764799118041992, + "learning_rate": 4.29048550016292e-05, + "loss": 0.3815, + "step": 7521 + }, + { + "epoch": 1.1775203506574827, + "grad_norm": 1.7586995363235474, + "learning_rate": 4.289670902574129e-05, + "loss": 0.5577, + "step": 7522 + }, + { + "epoch": 1.1776768941765812, + "grad_norm": 2.3260207176208496, + "learning_rate": 4.288856304985338e-05, + "loss": 0.7018, + "step": 7523 + }, + { + "epoch": 1.1778334376956794, + "grad_norm": 2.9735751152038574, + 
"learning_rate": 4.288041707396546e-05, + "loss": 0.7303, + "step": 7524 + }, + { + "epoch": 1.1779899812147776, + "grad_norm": 2.603829860687256, + "learning_rate": 4.287227109807755e-05, + "loss": 0.9858, + "step": 7525 + }, + { + "epoch": 1.178146524733876, + "grad_norm": 2.290743589401245, + "learning_rate": 4.286412512218964e-05, + "loss": 0.7772, + "step": 7526 + }, + { + "epoch": 1.1783030682529743, + "grad_norm": 2.4948980808258057, + "learning_rate": 4.2855979146301726e-05, + "loss": 0.9955, + "step": 7527 + }, + { + "epoch": 1.1784596117720727, + "grad_norm": 2.999537229537964, + "learning_rate": 4.284783317041382e-05, + "loss": 1.0257, + "step": 7528 + }, + { + "epoch": 1.178616155291171, + "grad_norm": 2.274686098098755, + "learning_rate": 4.283968719452591e-05, + "loss": 0.9094, + "step": 7529 + }, + { + "epoch": 1.1787726988102691, + "grad_norm": 4.768954277038574, + "learning_rate": 4.283154121863799e-05, + "loss": 1.3417, + "step": 7530 + }, + { + "epoch": 1.1789292423293676, + "grad_norm": 2.554898262023926, + "learning_rate": 4.282339524275009e-05, + "loss": 1.2539, + "step": 7531 + }, + { + "epoch": 1.1790857858484658, + "grad_norm": 5.034427165985107, + "learning_rate": 4.281524926686217e-05, + "loss": 1.3201, + "step": 7532 + }, + { + "epoch": 1.1792423293675642, + "grad_norm": 6.147922515869141, + "learning_rate": 4.280710329097426e-05, + "loss": 1.7705, + "step": 7533 + }, + { + "epoch": 1.1793988728866625, + "grad_norm": 3.999133348464966, + "learning_rate": 4.279895731508635e-05, + "loss": 1.0784, + "step": 7534 + }, + { + "epoch": 1.179555416405761, + "grad_norm": 2.1029393672943115, + "learning_rate": 4.279081133919844e-05, + "loss": 0.3066, + "step": 7535 + }, + { + "epoch": 1.179711959924859, + "grad_norm": 3.796429395675659, + "learning_rate": 4.278266536331053e-05, + "loss": 0.7959, + "step": 7536 + }, + { + "epoch": 1.1798685034439573, + "grad_norm": 3.516425848007202, + "learning_rate": 4.277451938742262e-05, + "loss": 0.8636, + 
"step": 7537 + }, + { + "epoch": 1.1800250469630558, + "grad_norm": 2.732954263687134, + "learning_rate": 4.27663734115347e-05, + "loss": 0.7287, + "step": 7538 + }, + { + "epoch": 1.180181590482154, + "grad_norm": 0.44687405228614807, + "learning_rate": 4.275822743564679e-05, + "loss": 0.1774, + "step": 7539 + }, + { + "epoch": 1.1803381340012524, + "grad_norm": 0.4619537591934204, + "learning_rate": 4.275008145975888e-05, + "loss": 0.1685, + "step": 7540 + }, + { + "epoch": 1.1804946775203506, + "grad_norm": 0.3845154345035553, + "learning_rate": 4.2741935483870973e-05, + "loss": 0.1851, + "step": 7541 + }, + { + "epoch": 1.180651221039449, + "grad_norm": 0.8656876087188721, + "learning_rate": 4.273378950798306e-05, + "loss": 0.214, + "step": 7542 + }, + { + "epoch": 1.1808077645585473, + "grad_norm": 0.7153392434120178, + "learning_rate": 4.272564353209515e-05, + "loss": 0.1627, + "step": 7543 + }, + { + "epoch": 1.1809643080776455, + "grad_norm": 1.0597116947174072, + "learning_rate": 4.271749755620724e-05, + "loss": 0.2014, + "step": 7544 + }, + { + "epoch": 1.181120851596744, + "grad_norm": 0.7091361284255981, + "learning_rate": 4.270935158031932e-05, + "loss": 0.2799, + "step": 7545 + }, + { + "epoch": 1.1812773951158422, + "grad_norm": 0.8970938920974731, + "learning_rate": 4.270120560443141e-05, + "loss": 0.2621, + "step": 7546 + }, + { + "epoch": 1.1814339386349406, + "grad_norm": 0.9690564274787903, + "learning_rate": 4.26930596285435e-05, + "loss": 0.2079, + "step": 7547 + }, + { + "epoch": 1.1815904821540388, + "grad_norm": 0.8498902916908264, + "learning_rate": 4.268491365265559e-05, + "loss": 0.2867, + "step": 7548 + }, + { + "epoch": 1.181747025673137, + "grad_norm": 1.0017342567443848, + "learning_rate": 4.267676767676768e-05, + "loss": 0.2523, + "step": 7549 + }, + { + "epoch": 1.1819035691922355, + "grad_norm": 0.9274699091911316, + "learning_rate": 4.266862170087977e-05, + "loss": 0.294, + "step": 7550 + }, + { + "epoch": 1.1820601127113337, + 
"grad_norm": 1.2203940153121948, + "learning_rate": 4.266047572499186e-05, + "loss": 0.4233, + "step": 7551 + }, + { + "epoch": 1.1822166562304322, + "grad_norm": 2.0257444381713867, + "learning_rate": 4.265232974910394e-05, + "loss": 0.6151, + "step": 7552 + }, + { + "epoch": 1.1823731997495304, + "grad_norm": 1.4854422807693481, + "learning_rate": 4.264418377321603e-05, + "loss": 0.304, + "step": 7553 + }, + { + "epoch": 1.1825297432686286, + "grad_norm": 1.086285948753357, + "learning_rate": 4.263603779732812e-05, + "loss": 0.3463, + "step": 7554 + }, + { + "epoch": 1.182686286787727, + "grad_norm": 1.57077956199646, + "learning_rate": 4.262789182144021e-05, + "loss": 0.3122, + "step": 7555 + }, + { + "epoch": 1.1828428303068252, + "grad_norm": 4.426021099090576, + "learning_rate": 4.26197458455523e-05, + "loss": 0.5565, + "step": 7556 + }, + { + "epoch": 1.1829993738259237, + "grad_norm": 2.1761510372161865, + "learning_rate": 4.261159986966439e-05, + "loss": 0.3925, + "step": 7557 + }, + { + "epoch": 1.183155917345022, + "grad_norm": 3.154581069946289, + "learning_rate": 4.260345389377647e-05, + "loss": 0.4786, + "step": 7558 + }, + { + "epoch": 1.1833124608641201, + "grad_norm": 1.9408434629440308, + "learning_rate": 4.259530791788857e-05, + "loss": 0.4233, + "step": 7559 + }, + { + "epoch": 1.1834690043832186, + "grad_norm": 0.95510333776474, + "learning_rate": 4.258716194200065e-05, + "loss": 0.3312, + "step": 7560 + }, + { + "epoch": 1.1836255479023168, + "grad_norm": 1.131394386291504, + "learning_rate": 4.2579015966112736e-05, + "loss": 0.3757, + "step": 7561 + }, + { + "epoch": 1.1837820914214152, + "grad_norm": 1.476610541343689, + "learning_rate": 4.2570869990224834e-05, + "loss": 0.4369, + "step": 7562 + }, + { + "epoch": 1.1839386349405134, + "grad_norm": 1.099660873413086, + "learning_rate": 4.256272401433692e-05, + "loss": 0.3252, + "step": 7563 + }, + { + "epoch": 1.1840951784596117, + "grad_norm": 3.2687742710113525, + "learning_rate": 
4.255457803844901e-05, + "loss": 0.7641, + "step": 7564 + }, + { + "epoch": 1.18425172197871, + "grad_norm": 2.4303886890411377, + "learning_rate": 4.25464320625611e-05, + "loss": 0.634, + "step": 7565 + }, + { + "epoch": 1.1844082654978083, + "grad_norm": 2.1763527393341064, + "learning_rate": 4.253828608667318e-05, + "loss": 0.392, + "step": 7566 + }, + { + "epoch": 1.1845648090169068, + "grad_norm": 1.4100617170333862, + "learning_rate": 4.253014011078527e-05, + "loss": 0.444, + "step": 7567 + }, + { + "epoch": 1.184721352536005, + "grad_norm": 1.9989985227584839, + "learning_rate": 4.252199413489736e-05, + "loss": 0.4049, + "step": 7568 + }, + { + "epoch": 1.1848778960551034, + "grad_norm": 3.309465169906616, + "learning_rate": 4.2513848159009454e-05, + "loss": 0.6323, + "step": 7569 + }, + { + "epoch": 1.1850344395742016, + "grad_norm": 3.665440559387207, + "learning_rate": 4.250570218312154e-05, + "loss": 0.7811, + "step": 7570 + }, + { + "epoch": 1.1851909830932998, + "grad_norm": 2.244659423828125, + "learning_rate": 4.249755620723363e-05, + "loss": 0.7771, + "step": 7571 + }, + { + "epoch": 1.1853475266123983, + "grad_norm": 3.9708445072174072, + "learning_rate": 4.248941023134572e-05, + "loss": 0.8051, + "step": 7572 + }, + { + "epoch": 1.1855040701314965, + "grad_norm": 3.3093862533569336, + "learning_rate": 4.24812642554578e-05, + "loss": 0.7089, + "step": 7573 + }, + { + "epoch": 1.185660613650595, + "grad_norm": 1.988769769668579, + "learning_rate": 4.247311827956989e-05, + "loss": 0.6332, + "step": 7574 + }, + { + "epoch": 1.1858171571696932, + "grad_norm": 2.890474319458008, + "learning_rate": 4.2464972303681984e-05, + "loss": 0.7845, + "step": 7575 + }, + { + "epoch": 1.1859737006887916, + "grad_norm": 3.593125581741333, + "learning_rate": 4.245682632779407e-05, + "loss": 0.7604, + "step": 7576 + }, + { + "epoch": 1.1861302442078898, + "grad_norm": 3.802920341491699, + "learning_rate": 4.2448680351906165e-05, + "loss": 1.0373, + "step": 7577 + }, + 
{ + "epoch": 1.186286787726988, + "grad_norm": 2.985668659210205, + "learning_rate": 4.244053437601825e-05, + "loss": 0.8753, + "step": 7578 + }, + { + "epoch": 1.1864433312460865, + "grad_norm": 2.140315055847168, + "learning_rate": 4.243238840013033e-05, + "loss": 0.7321, + "step": 7579 + }, + { + "epoch": 1.1865998747651847, + "grad_norm": 2.770469903945923, + "learning_rate": 4.242424242424243e-05, + "loss": 0.8926, + "step": 7580 + }, + { + "epoch": 1.1867564182842831, + "grad_norm": 3.2940196990966797, + "learning_rate": 4.241609644835451e-05, + "loss": 1.1129, + "step": 7581 + }, + { + "epoch": 1.1869129618033814, + "grad_norm": 2.585655450820923, + "learning_rate": 4.2407950472466604e-05, + "loss": 0.9341, + "step": 7582 + }, + { + "epoch": 1.1870695053224796, + "grad_norm": 2.821709394454956, + "learning_rate": 4.2399804496578694e-05, + "loss": 1.0864, + "step": 7583 + }, + { + "epoch": 1.187226048841578, + "grad_norm": 3.369448661804199, + "learning_rate": 4.239165852069078e-05, + "loss": 1.1283, + "step": 7584 + }, + { + "epoch": 1.1873825923606762, + "grad_norm": 4.831813812255859, + "learning_rate": 4.238351254480287e-05, + "loss": 0.812, + "step": 7585 + }, + { + "epoch": 1.1875391358797747, + "grad_norm": 2.6275744438171387, + "learning_rate": 4.237536656891496e-05, + "loss": 0.4249, + "step": 7586 + }, + { + "epoch": 1.1876956793988729, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.236722059302704e-05, + "loss": 0.5715, + "step": 7587 + }, + { + "epoch": 1.187852222917971, + "grad_norm": 4.004086971282959, + "learning_rate": 4.235907461713913e-05, + "loss": 0.5431, + "step": 7588 + }, + { + "epoch": 1.1880087664370695, + "grad_norm": 0.5263730883598328, + "learning_rate": 4.2350928641251224e-05, + "loss": 0.2692, + "step": 7589 + }, + { + "epoch": 1.1881653099561678, + "grad_norm": 0.4318518042564392, + "learning_rate": 4.2342782665363314e-05, + "loss": 0.1918, + "step": 7590 + }, + { + "epoch": 1.1883218534752662, + "grad_norm": 
0.43381467461586, + "learning_rate": 4.23346366894754e-05, + "loss": 0.1391, + "step": 7591 + }, + { + "epoch": 1.1884783969943644, + "grad_norm": 0.5866572856903076, + "learning_rate": 4.232649071358749e-05, + "loss": 0.1688, + "step": 7592 + }, + { + "epoch": 1.1886349405134626, + "grad_norm": 0.8060342073440552, + "learning_rate": 4.231834473769958e-05, + "loss": 0.17, + "step": 7593 + }, + { + "epoch": 1.188791484032561, + "grad_norm": 1.0649102926254272, + "learning_rate": 4.231019876181166e-05, + "loss": 0.2065, + "step": 7594 + }, + { + "epoch": 1.1889480275516593, + "grad_norm": 1.5344916582107544, + "learning_rate": 4.230205278592376e-05, + "loss": 0.3634, + "step": 7595 + }, + { + "epoch": 1.1891045710707577, + "grad_norm": 0.5490397810935974, + "learning_rate": 4.2293906810035844e-05, + "loss": 0.1653, + "step": 7596 + }, + { + "epoch": 1.189261114589856, + "grad_norm": 0.971591055393219, + "learning_rate": 4.228576083414793e-05, + "loss": 0.2244, + "step": 7597 + }, + { + "epoch": 1.1894176581089542, + "grad_norm": 0.9644955992698669, + "learning_rate": 4.2277614858260025e-05, + "loss": 0.3087, + "step": 7598 + }, + { + "epoch": 1.1895742016280526, + "grad_norm": 1.1193548440933228, + "learning_rate": 4.226946888237211e-05, + "loss": 0.3417, + "step": 7599 + }, + { + "epoch": 1.1897307451471508, + "grad_norm": 0.9339085221290588, + "learning_rate": 4.22613229064842e-05, + "loss": 0.215, + "step": 7600 + }, + { + "epoch": 1.1898872886662493, + "grad_norm": 2.0619237422943115, + "learning_rate": 4.225317693059629e-05, + "loss": 0.2789, + "step": 7601 + }, + { + "epoch": 1.1900438321853475, + "grad_norm": 1.303120493888855, + "learning_rate": 4.2245030954708373e-05, + "loss": 0.3171, + "step": 7602 + }, + { + "epoch": 1.190200375704446, + "grad_norm": 1.816625952720642, + "learning_rate": 4.2236884978820464e-05, + "loss": 0.1865, + "step": 7603 + }, + { + "epoch": 1.1903569192235441, + "grad_norm": 0.7998942136764526, + "learning_rate": 
4.2228739002932555e-05, + "loss": 0.1979, + "step": 7604 + }, + { + "epoch": 1.1905134627426424, + "grad_norm": 3.5539703369140625, + "learning_rate": 4.222059302704464e-05, + "loss": 1.0957, + "step": 7605 + }, + { + "epoch": 1.1906700062617408, + "grad_norm": 1.6926579475402832, + "learning_rate": 4.221244705115673e-05, + "loss": 0.5385, + "step": 7606 + }, + { + "epoch": 1.190826549780839, + "grad_norm": 1.2153196334838867, + "learning_rate": 4.220430107526882e-05, + "loss": 0.2463, + "step": 7607 + }, + { + "epoch": 1.1909830932999375, + "grad_norm": 1.8795416355133057, + "learning_rate": 4.219615509938091e-05, + "loss": 0.3233, + "step": 7608 + }, + { + "epoch": 1.1911396368190357, + "grad_norm": 1.399431824684143, + "learning_rate": 4.2188009123492994e-05, + "loss": 0.2968, + "step": 7609 + }, + { + "epoch": 1.1912961803381341, + "grad_norm": 2.1480655670166016, + "learning_rate": 4.2179863147605084e-05, + "loss": 0.3829, + "step": 7610 + }, + { + "epoch": 1.1914527238572323, + "grad_norm": 1.4547255039215088, + "learning_rate": 4.2171717171717175e-05, + "loss": 0.6957, + "step": 7611 + }, + { + "epoch": 1.1916092673763305, + "grad_norm": 2.007030963897705, + "learning_rate": 4.216357119582926e-05, + "loss": 0.3885, + "step": 7612 + }, + { + "epoch": 1.191765810895429, + "grad_norm": 3.19069504737854, + "learning_rate": 4.2155425219941356e-05, + "loss": 0.8718, + "step": 7613 + }, + { + "epoch": 1.1919223544145272, + "grad_norm": 2.0014445781707764, + "learning_rate": 4.214727924405344e-05, + "loss": 0.402, + "step": 7614 + }, + { + "epoch": 1.1920788979336256, + "grad_norm": 2.973073720932007, + "learning_rate": 4.213913326816552e-05, + "loss": 0.8571, + "step": 7615 + }, + { + "epoch": 1.1922354414527239, + "grad_norm": 6.137259006500244, + "learning_rate": 4.213098729227762e-05, + "loss": 0.8556, + "step": 7616 + }, + { + "epoch": 1.192391984971822, + "grad_norm": 6.407308578491211, + "learning_rate": 4.2122841316389704e-05, + "loss": 0.7542, + "step": 
7617 + }, + { + "epoch": 1.1925485284909205, + "grad_norm": 3.9009881019592285, + "learning_rate": 4.2114695340501795e-05, + "loss": 0.8565, + "step": 7618 + }, + { + "epoch": 1.1927050720100187, + "grad_norm": 2.694136381149292, + "learning_rate": 4.2106549364613885e-05, + "loss": 0.5213, + "step": 7619 + }, + { + "epoch": 1.1928616155291172, + "grad_norm": 1.9465793371200562, + "learning_rate": 4.209840338872597e-05, + "loss": 0.462, + "step": 7620 + }, + { + "epoch": 1.1930181590482154, + "grad_norm": 2.64078426361084, + "learning_rate": 4.209025741283806e-05, + "loss": 0.8483, + "step": 7621 + }, + { + "epoch": 1.1931747025673136, + "grad_norm": 32.5177116394043, + "learning_rate": 4.208211143695015e-05, + "loss": 0.9153, + "step": 7622 + }, + { + "epoch": 1.193331246086412, + "grad_norm": 2.8427765369415283, + "learning_rate": 4.2073965461062234e-05, + "loss": 0.5415, + "step": 7623 + }, + { + "epoch": 1.1934877896055103, + "grad_norm": 4.098700046539307, + "learning_rate": 4.2065819485174324e-05, + "loss": 1.0393, + "step": 7624 + }, + { + "epoch": 1.1936443331246087, + "grad_norm": 3.4022436141967773, + "learning_rate": 4.2057673509286415e-05, + "loss": 0.7776, + "step": 7625 + }, + { + "epoch": 1.193800876643707, + "grad_norm": 3.4092535972595215, + "learning_rate": 4.2049527533398505e-05, + "loss": 1.5598, + "step": 7626 + }, + { + "epoch": 1.1939574201628051, + "grad_norm": 3.45811128616333, + "learning_rate": 4.204138155751059e-05, + "loss": 1.3017, + "step": 7627 + }, + { + "epoch": 1.1941139636819036, + "grad_norm": 2.7732999324798584, + "learning_rate": 4.203323558162268e-05, + "loss": 1.6836, + "step": 7628 + }, + { + "epoch": 1.1942705072010018, + "grad_norm": 6.843001842498779, + "learning_rate": 4.202508960573477e-05, + "loss": 0.9943, + "step": 7629 + }, + { + "epoch": 1.1944270507201002, + "grad_norm": 4.7422637939453125, + "learning_rate": 4.2016943629846854e-05, + "loss": 1.1461, + "step": 7630 + }, + { + "epoch": 1.1945835942391985, + 
"grad_norm": 3.265285015106201, + "learning_rate": 4.2008797653958945e-05, + "loss": 0.9934, + "step": 7631 + }, + { + "epoch": 1.1947401377582967, + "grad_norm": 2.764425754547119, + "learning_rate": 4.2000651678071035e-05, + "loss": 0.8227, + "step": 7632 + }, + { + "epoch": 1.1948966812773951, + "grad_norm": 3.2310240268707275, + "learning_rate": 4.199250570218312e-05, + "loss": 1.222, + "step": 7633 + }, + { + "epoch": 1.1950532247964933, + "grad_norm": 3.656456470489502, + "learning_rate": 4.1984359726295216e-05, + "loss": 1.0176, + "step": 7634 + }, + { + "epoch": 1.1952097683155918, + "grad_norm": 3.8100972175598145, + "learning_rate": 4.19762137504073e-05, + "loss": 0.4094, + "step": 7635 + }, + { + "epoch": 1.19536631183469, + "grad_norm": 4.9295735359191895, + "learning_rate": 4.196806777451939e-05, + "loss": 0.7982, + "step": 7636 + }, + { + "epoch": 1.1955228553537884, + "grad_norm": 3.36177659034729, + "learning_rate": 4.195992179863148e-05, + "loss": 0.6729, + "step": 7637 + }, + { + "epoch": 1.1956793988728867, + "grad_norm": 4.4818596839904785, + "learning_rate": 4.1951775822743565e-05, + "loss": 0.5971, + "step": 7638 + }, + { + "epoch": 1.195835942391985, + "grad_norm": 0.43882909417152405, + "learning_rate": 4.1943629846855655e-05, + "loss": 0.2461, + "step": 7639 + }, + { + "epoch": 1.1959924859110833, + "grad_norm": 0.7095560431480408, + "learning_rate": 4.1935483870967746e-05, + "loss": 0.2568, + "step": 7640 + }, + { + "epoch": 1.1961490294301815, + "grad_norm": 0.6303721070289612, + "learning_rate": 4.192733789507983e-05, + "loss": 0.2072, + "step": 7641 + }, + { + "epoch": 1.19630557294928, + "grad_norm": 0.44955572485923767, + "learning_rate": 4.191919191919192e-05, + "loss": 0.1929, + "step": 7642 + }, + { + "epoch": 1.1964621164683782, + "grad_norm": 0.5120641589164734, + "learning_rate": 4.191104594330401e-05, + "loss": 0.186, + "step": 7643 + }, + { + "epoch": 1.1966186599874766, + "grad_norm": 0.822207510471344, + "learning_rate": 
4.19028999674161e-05, + "loss": 0.3269, + "step": 7644 + }, + { + "epoch": 1.1967752035065748, + "grad_norm": 0.6905671954154968, + "learning_rate": 4.1894753991528185e-05, + "loss": 0.2792, + "step": 7645 + }, + { + "epoch": 1.196931747025673, + "grad_norm": 0.5945687294006348, + "learning_rate": 4.1886608015640275e-05, + "loss": 0.236, + "step": 7646 + }, + { + "epoch": 1.1970882905447715, + "grad_norm": 0.6058511734008789, + "learning_rate": 4.1878462039752366e-05, + "loss": 0.2997, + "step": 7647 + }, + { + "epoch": 1.1972448340638697, + "grad_norm": 0.6999122500419617, + "learning_rate": 4.187031606386445e-05, + "loss": 0.1503, + "step": 7648 + }, + { + "epoch": 1.1974013775829682, + "grad_norm": 0.9258671402931213, + "learning_rate": 4.186217008797654e-05, + "loss": 0.2528, + "step": 7649 + }, + { + "epoch": 1.1975579211020664, + "grad_norm": 1.4196528196334839, + "learning_rate": 4.185402411208863e-05, + "loss": 0.5235, + "step": 7650 + }, + { + "epoch": 1.1977144646211646, + "grad_norm": 0.6400874257087708, + "learning_rate": 4.1845878136200714e-05, + "loss": 0.2496, + "step": 7651 + }, + { + "epoch": 1.197871008140263, + "grad_norm": 2.280231475830078, + "learning_rate": 4.183773216031281e-05, + "loss": 0.3672, + "step": 7652 + }, + { + "epoch": 1.1980275516593613, + "grad_norm": 0.9725130200386047, + "learning_rate": 4.1829586184424895e-05, + "loss": 0.3437, + "step": 7653 + }, + { + "epoch": 1.1981840951784597, + "grad_norm": 1.2294270992279053, + "learning_rate": 4.1821440208536986e-05, + "loss": 0.3396, + "step": 7654 + }, + { + "epoch": 1.198340638697558, + "grad_norm": 1.5381113290786743, + "learning_rate": 4.1813294232649076e-05, + "loss": 0.4702, + "step": 7655 + }, + { + "epoch": 1.1984971822166561, + "grad_norm": 1.268452525138855, + "learning_rate": 4.180514825676116e-05, + "loss": 0.3505, + "step": 7656 + }, + { + "epoch": 1.1986537257357546, + "grad_norm": 1.6208356618881226, + "learning_rate": 4.179700228087325e-05, + "loss": 0.606, + "step": 
7657 + }, + { + "epoch": 1.1988102692548528, + "grad_norm": 1.9072937965393066, + "learning_rate": 4.178885630498534e-05, + "loss": 0.4466, + "step": 7658 + }, + { + "epoch": 1.1989668127739512, + "grad_norm": 2.7985594272613525, + "learning_rate": 4.1780710329097425e-05, + "loss": 0.3942, + "step": 7659 + }, + { + "epoch": 1.1991233562930494, + "grad_norm": 1.8701616525650024, + "learning_rate": 4.1772564353209516e-05, + "loss": 0.3161, + "step": 7660 + }, + { + "epoch": 1.1992798998121477, + "grad_norm": 2.583343029022217, + "learning_rate": 4.1764418377321606e-05, + "loss": 0.2437, + "step": 7661 + }, + { + "epoch": 1.199436443331246, + "grad_norm": 2.622345447540283, + "learning_rate": 4.1756272401433697e-05, + "loss": 0.4964, + "step": 7662 + }, + { + "epoch": 1.1995929868503443, + "grad_norm": 2.423140048980713, + "learning_rate": 4.174812642554578e-05, + "loss": 0.8324, + "step": 7663 + }, + { + "epoch": 1.1997495303694428, + "grad_norm": 2.2859678268432617, + "learning_rate": 4.173998044965787e-05, + "loss": 0.473, + "step": 7664 + }, + { + "epoch": 1.199906073888541, + "grad_norm": 3.3984014987945557, + "learning_rate": 4.173183447376996e-05, + "loss": 0.4694, + "step": 7665 + }, + { + "epoch": 1.2000626174076394, + "grad_norm": 3.0700321197509766, + "learning_rate": 4.1723688497882045e-05, + "loss": 0.9999, + "step": 7666 + }, + { + "epoch": 1.2002191609267376, + "grad_norm": 2.3975508213043213, + "learning_rate": 4.1715542521994136e-05, + "loss": 0.3856, + "step": 7667 + }, + { + "epoch": 1.2003757044458359, + "grad_norm": 2.750429630279541, + "learning_rate": 4.1707396546106226e-05, + "loss": 0.7917, + "step": 7668 + }, + { + "epoch": 1.2005322479649343, + "grad_norm": 2.657573699951172, + "learning_rate": 4.169925057021831e-05, + "loss": 0.6501, + "step": 7669 + }, + { + "epoch": 1.2006887914840325, + "grad_norm": 2.0824294090270996, + "learning_rate": 4.169110459433041e-05, + "loss": 0.7014, + "step": 7670 + }, + { + "epoch": 1.200845335003131, + 
"grad_norm": 1.9251677989959717, + "learning_rate": 4.168295861844249e-05, + "loss": 0.5795, + "step": 7671 + }, + { + "epoch": 1.2010018785222292, + "grad_norm": 3.3631784915924072, + "learning_rate": 4.167481264255458e-05, + "loss": 0.8722, + "step": 7672 + }, + { + "epoch": 1.2011584220413276, + "grad_norm": 2.183046340942383, + "learning_rate": 4.166666666666667e-05, + "loss": 0.9207, + "step": 7673 + }, + { + "epoch": 1.2013149655604258, + "grad_norm": 3.4114108085632324, + "learning_rate": 4.1658520690778756e-05, + "loss": 0.7579, + "step": 7674 + }, + { + "epoch": 1.201471509079524, + "grad_norm": 3.367283344268799, + "learning_rate": 4.1650374714890846e-05, + "loss": 0.9558, + "step": 7675 + }, + { + "epoch": 1.2016280525986225, + "grad_norm": 2.6612329483032227, + "learning_rate": 4.164222873900294e-05, + "loss": 1.3329, + "step": 7676 + }, + { + "epoch": 1.2017845961177207, + "grad_norm": 3.23567795753479, + "learning_rate": 4.163408276311502e-05, + "loss": 0.9096, + "step": 7677 + }, + { + "epoch": 1.2019411396368191, + "grad_norm": 6.625858783721924, + "learning_rate": 4.162593678722711e-05, + "loss": 1.1582, + "step": 7678 + }, + { + "epoch": 1.2020976831559174, + "grad_norm": 1.653128743171692, + "learning_rate": 4.16177908113392e-05, + "loss": 0.6803, + "step": 7679 + }, + { + "epoch": 1.2022542266750156, + "grad_norm": 3.232273578643799, + "learning_rate": 4.160964483545129e-05, + "loss": 1.1741, + "step": 7680 + }, + { + "epoch": 1.202410770194114, + "grad_norm": 4.149262428283691, + "learning_rate": 4.1601498859563376e-05, + "loss": 1.4292, + "step": 7681 + }, + { + "epoch": 1.2025673137132122, + "grad_norm": 5.1398234367370605, + "learning_rate": 4.1593352883675466e-05, + "loss": 1.1714, + "step": 7682 + }, + { + "epoch": 1.2027238572323107, + "grad_norm": 4.262434482574463, + "learning_rate": 4.158520690778756e-05, + "loss": 0.6695, + "step": 7683 + }, + { + "epoch": 1.202880400751409, + "grad_norm": 4.86053466796875, + "learning_rate": 
4.157706093189964e-05, + "loss": 1.059, + "step": 7684 + }, + { + "epoch": 1.2030369442705071, + "grad_norm": 2.587500810623169, + "learning_rate": 4.156891495601173e-05, + "loss": 0.6476, + "step": 7685 + }, + { + "epoch": 1.2031934877896056, + "grad_norm": 6.372177600860596, + "learning_rate": 4.156076898012382e-05, + "loss": 1.3032, + "step": 7686 + }, + { + "epoch": 1.2033500313087038, + "grad_norm": 1.960640549659729, + "learning_rate": 4.1552623004235906e-05, + "loss": 1.1463, + "step": 7687 + }, + { + "epoch": 1.2035065748278022, + "grad_norm": 2.9804399013519287, + "learning_rate": 4.1544477028348e-05, + "loss": 0.6395, + "step": 7688 + }, + { + "epoch": 1.2036631183469004, + "grad_norm": 0.5319779515266418, + "learning_rate": 4.1536331052460087e-05, + "loss": 0.2195, + "step": 7689 + }, + { + "epoch": 1.2038196618659986, + "grad_norm": 0.38855433464050293, + "learning_rate": 4.152818507657217e-05, + "loss": 0.1904, + "step": 7690 + }, + { + "epoch": 1.203976205385097, + "grad_norm": 0.4212869703769684, + "learning_rate": 4.152003910068427e-05, + "loss": 0.1769, + "step": 7691 + }, + { + "epoch": 1.2041327489041953, + "grad_norm": 0.37668564915657043, + "learning_rate": 4.151189312479635e-05, + "loss": 0.1578, + "step": 7692 + }, + { + "epoch": 1.2042892924232937, + "grad_norm": 0.5703322887420654, + "learning_rate": 4.150374714890844e-05, + "loss": 0.1502, + "step": 7693 + }, + { + "epoch": 1.204445835942392, + "grad_norm": 0.8186959028244019, + "learning_rate": 4.149560117302053e-05, + "loss": 0.2603, + "step": 7694 + }, + { + "epoch": 1.2046023794614902, + "grad_norm": 0.6403840184211731, + "learning_rate": 4.1487455197132616e-05, + "loss": 0.2813, + "step": 7695 + }, + { + "epoch": 1.2047589229805886, + "grad_norm": 0.5678405165672302, + "learning_rate": 4.147930922124471e-05, + "loss": 0.2269, + "step": 7696 + }, + { + "epoch": 1.2049154664996868, + "grad_norm": 1.3642687797546387, + "learning_rate": 4.14711632453568e-05, + "loss": 0.2752, + "step": 
7697 + }, + { + "epoch": 1.2050720100187853, + "grad_norm": 1.3367599248886108, + "learning_rate": 4.146301726946889e-05, + "loss": 0.3223, + "step": 7698 + }, + { + "epoch": 1.2052285535378835, + "grad_norm": 2.3896889686584473, + "learning_rate": 4.145487129358097e-05, + "loss": 0.3722, + "step": 7699 + }, + { + "epoch": 1.205385097056982, + "grad_norm": 0.8341039419174194, + "learning_rate": 4.144672531769306e-05, + "loss": 0.3273, + "step": 7700 + }, + { + "epoch": 1.2055416405760802, + "grad_norm": 1.268674612045288, + "learning_rate": 4.143857934180515e-05, + "loss": 0.2032, + "step": 7701 + }, + { + "epoch": 1.2056981840951784, + "grad_norm": 2.14616060256958, + "learning_rate": 4.1430433365917236e-05, + "loss": 0.3478, + "step": 7702 + }, + { + "epoch": 1.2058547276142768, + "grad_norm": 1.7326555252075195, + "learning_rate": 4.142228739002933e-05, + "loss": 0.4818, + "step": 7703 + }, + { + "epoch": 1.206011271133375, + "grad_norm": 1.1842617988586426, + "learning_rate": 4.141414141414142e-05, + "loss": 0.3195, + "step": 7704 + }, + { + "epoch": 1.2061678146524735, + "grad_norm": 1.6756958961486816, + "learning_rate": 4.14059954382535e-05, + "loss": 0.6279, + "step": 7705 + }, + { + "epoch": 1.2063243581715717, + "grad_norm": 1.3794134855270386, + "learning_rate": 4.13978494623656e-05, + "loss": 0.3183, + "step": 7706 + }, + { + "epoch": 1.2064809016906701, + "grad_norm": 1.626779556274414, + "learning_rate": 4.138970348647768e-05, + "loss": 0.3589, + "step": 7707 + }, + { + "epoch": 1.2066374452097683, + "grad_norm": 1.256281852722168, + "learning_rate": 4.1381557510589766e-05, + "loss": 0.2221, + "step": 7708 + }, + { + "epoch": 1.2067939887288666, + "grad_norm": 2.1198232173919678, + "learning_rate": 4.137341153470186e-05, + "loss": 0.6175, + "step": 7709 + }, + { + "epoch": 1.206950532247965, + "grad_norm": 2.038159132003784, + "learning_rate": 4.136526555881395e-05, + "loss": 0.3623, + "step": 7710 + }, + { + "epoch": 1.2071070757670632, + 
"grad_norm": 1.8949168920516968, + "learning_rate": 4.135711958292604e-05, + "loss": 0.4867, + "step": 7711 + }, + { + "epoch": 1.2072636192861617, + "grad_norm": 1.6459636688232422, + "learning_rate": 4.134897360703813e-05, + "loss": 0.4235, + "step": 7712 + }, + { + "epoch": 1.2074201628052599, + "grad_norm": 1.2701637744903564, + "learning_rate": 4.134082763115021e-05, + "loss": 0.5145, + "step": 7713 + }, + { + "epoch": 1.207576706324358, + "grad_norm": 2.1027655601501465, + "learning_rate": 4.13326816552623e-05, + "loss": 0.4301, + "step": 7714 + }, + { + "epoch": 1.2077332498434565, + "grad_norm": 2.7878599166870117, + "learning_rate": 4.132453567937439e-05, + "loss": 0.4604, + "step": 7715 + }, + { + "epoch": 1.2078897933625548, + "grad_norm": 2.1717307567596436, + "learning_rate": 4.131638970348648e-05, + "loss": 0.6705, + "step": 7716 + }, + { + "epoch": 1.2080463368816532, + "grad_norm": 2.9694337844848633, + "learning_rate": 4.130824372759857e-05, + "loss": 0.4722, + "step": 7717 + }, + { + "epoch": 1.2082028804007514, + "grad_norm": 1.723144769668579, + "learning_rate": 4.130009775171066e-05, + "loss": 0.5225, + "step": 7718 + }, + { + "epoch": 1.2083594239198496, + "grad_norm": 2.7808971405029297, + "learning_rate": 4.129195177582275e-05, + "loss": 0.6331, + "step": 7719 + }, + { + "epoch": 1.208515967438948, + "grad_norm": 1.961945652961731, + "learning_rate": 4.128380579993483e-05, + "loss": 0.6381, + "step": 7720 + }, + { + "epoch": 1.2086725109580463, + "grad_norm": 2.970932722091675, + "learning_rate": 4.127565982404692e-05, + "loss": 0.7768, + "step": 7721 + }, + { + "epoch": 1.2088290544771447, + "grad_norm": 4.400678634643555, + "learning_rate": 4.126751384815901e-05, + "loss": 0.7974, + "step": 7722 + }, + { + "epoch": 1.208985597996243, + "grad_norm": 3.725374460220337, + "learning_rate": 4.12593678722711e-05, + "loss": 0.8052, + "step": 7723 + }, + { + "epoch": 1.2091421415153412, + "grad_norm": 2.585777997970581, + "learning_rate": 
4.1251221896383194e-05, + "loss": 1.1221, + "step": 7724 + }, + { + "epoch": 1.2092986850344396, + "grad_norm": 2.5500972270965576, + "learning_rate": 4.124307592049528e-05, + "loss": 0.9223, + "step": 7725 + }, + { + "epoch": 1.2094552285535378, + "grad_norm": 5.65903377532959, + "learning_rate": 4.123492994460736e-05, + "loss": 1.169, + "step": 7726 + }, + { + "epoch": 1.2096117720726363, + "grad_norm": 9.278945922851562, + "learning_rate": 4.122678396871946e-05, + "loss": 1.0089, + "step": 7727 + }, + { + "epoch": 1.2097683155917345, + "grad_norm": 2.4663612842559814, + "learning_rate": 4.121863799283154e-05, + "loss": 1.1025, + "step": 7728 + }, + { + "epoch": 1.2099248591108327, + "grad_norm": 3.2453651428222656, + "learning_rate": 4.121049201694363e-05, + "loss": 0.6987, + "step": 7729 + }, + { + "epoch": 1.2100814026299311, + "grad_norm": 2.521491765975952, + "learning_rate": 4.1202346041055724e-05, + "loss": 1.0417, + "step": 7730 + }, + { + "epoch": 1.2102379461490294, + "grad_norm": 4.287261486053467, + "learning_rate": 4.119420006516781e-05, + "loss": 1.1218, + "step": 7731 + }, + { + "epoch": 1.2103944896681278, + "grad_norm": 2.9399588108062744, + "learning_rate": 4.11860540892799e-05, + "loss": 1.2946, + "step": 7732 + }, + { + "epoch": 1.210551033187226, + "grad_norm": 1.8962292671203613, + "learning_rate": 4.117790811339199e-05, + "loss": 1.3899, + "step": 7733 + }, + { + "epoch": 1.2107075767063245, + "grad_norm": 1.5714776515960693, + "learning_rate": 4.116976213750407e-05, + "loss": 0.36, + "step": 7734 + }, + { + "epoch": 1.2108641202254227, + "grad_norm": 1.3168561458587646, + "learning_rate": 4.116161616161616e-05, + "loss": 0.5718, + "step": 7735 + }, + { + "epoch": 1.2110206637445209, + "grad_norm": 4.918756484985352, + "learning_rate": 4.115347018572825e-05, + "loss": 0.2696, + "step": 7736 + }, + { + "epoch": 1.2111772072636193, + "grad_norm": 2.3092098236083984, + "learning_rate": 4.1145324209840344e-05, + "loss": 0.6759, + "step": 7737 + 
}, + { + "epoch": 1.2113337507827175, + "grad_norm": 3.744291067123413, + "learning_rate": 4.113717823395243e-05, + "loss": 1.0529, + "step": 7738 + }, + { + "epoch": 1.211490294301816, + "grad_norm": 0.4708799123764038, + "learning_rate": 4.112903225806452e-05, + "loss": 0.2032, + "step": 7739 + }, + { + "epoch": 1.2116468378209142, + "grad_norm": 0.4233437180519104, + "learning_rate": 4.112088628217661e-05, + "loss": 0.1915, + "step": 7740 + }, + { + "epoch": 1.2118033813400126, + "grad_norm": 0.5874823331832886, + "learning_rate": 4.111274030628869e-05, + "loss": 0.209, + "step": 7741 + }, + { + "epoch": 1.2119599248591109, + "grad_norm": 0.6697525382041931, + "learning_rate": 4.110459433040079e-05, + "loss": 0.2473, + "step": 7742 + }, + { + "epoch": 1.212116468378209, + "grad_norm": 0.7392506003379822, + "learning_rate": 4.109644835451287e-05, + "loss": 0.2625, + "step": 7743 + }, + { + "epoch": 1.2122730118973075, + "grad_norm": 0.5947787165641785, + "learning_rate": 4.108830237862496e-05, + "loss": 0.3026, + "step": 7744 + }, + { + "epoch": 1.2124295554164057, + "grad_norm": 0.8696983456611633, + "learning_rate": 4.1080156402737054e-05, + "loss": 0.2564, + "step": 7745 + }, + { + "epoch": 1.2125860989355042, + "grad_norm": 1.1987943649291992, + "learning_rate": 4.107201042684914e-05, + "loss": 0.2033, + "step": 7746 + }, + { + "epoch": 1.2127426424546024, + "grad_norm": 1.576550841331482, + "learning_rate": 4.106386445096123e-05, + "loss": 0.3799, + "step": 7747 + }, + { + "epoch": 1.2128991859737006, + "grad_norm": 2.522412061691284, + "learning_rate": 4.105571847507332e-05, + "loss": 0.2286, + "step": 7748 + }, + { + "epoch": 1.213055729492799, + "grad_norm": 1.7665133476257324, + "learning_rate": 4.10475724991854e-05, + "loss": 0.3361, + "step": 7749 + }, + { + "epoch": 1.2132122730118973, + "grad_norm": 0.989568829536438, + "learning_rate": 4.1039426523297493e-05, + "loss": 0.2776, + "step": 7750 + }, + { + "epoch": 1.2133688165309957, + "grad_norm": 
1.2897595167160034, + "learning_rate": 4.1031280547409584e-05, + "loss": 0.2612, + "step": 7751 + }, + { + "epoch": 1.213525360050094, + "grad_norm": 1.473572015762329, + "learning_rate": 4.102313457152167e-05, + "loss": 0.31, + "step": 7752 + }, + { + "epoch": 1.2136819035691921, + "grad_norm": 0.8325414657592773, + "learning_rate": 4.101498859563376e-05, + "loss": 0.3213, + "step": 7753 + }, + { + "epoch": 1.2138384470882906, + "grad_norm": 1.151835322380066, + "learning_rate": 4.100684261974585e-05, + "loss": 0.3469, + "step": 7754 + }, + { + "epoch": 1.2139949906073888, + "grad_norm": 6.0455546379089355, + "learning_rate": 4.099869664385794e-05, + "loss": 0.6985, + "step": 7755 + }, + { + "epoch": 1.2141515341264872, + "grad_norm": 1.9716885089874268, + "learning_rate": 4.099055066797002e-05, + "loss": 0.5079, + "step": 7756 + }, + { + "epoch": 1.2143080776455855, + "grad_norm": 2.0643935203552246, + "learning_rate": 4.0982404692082114e-05, + "loss": 0.345, + "step": 7757 + }, + { + "epoch": 1.2144646211646837, + "grad_norm": 1.3890876770019531, + "learning_rate": 4.0974258716194204e-05, + "loss": 0.3456, + "step": 7758 + }, + { + "epoch": 1.2146211646837821, + "grad_norm": 2.3092429637908936, + "learning_rate": 4.096611274030629e-05, + "loss": 0.3427, + "step": 7759 + }, + { + "epoch": 1.2147777082028803, + "grad_norm": 3.4397523403167725, + "learning_rate": 4.0957966764418385e-05, + "loss": 0.7378, + "step": 7760 + }, + { + "epoch": 1.2149342517219788, + "grad_norm": 1.884817123413086, + "learning_rate": 4.094982078853047e-05, + "loss": 0.5763, + "step": 7761 + }, + { + "epoch": 1.215090795241077, + "grad_norm": 2.514066696166992, + "learning_rate": 4.094167481264255e-05, + "loss": 0.62, + "step": 7762 + }, + { + "epoch": 1.2152473387601752, + "grad_norm": 2.5409398078918457, + "learning_rate": 4.093352883675465e-05, + "loss": 0.7624, + "step": 7763 + }, + { + "epoch": 1.2154038822792737, + "grad_norm": 3.02388858795166, + "learning_rate": 
4.0925382860866734e-05, + "loss": 0.4105, + "step": 7764 + }, + { + "epoch": 1.2155604257983719, + "grad_norm": 2.419630527496338, + "learning_rate": 4.0917236884978824e-05, + "loss": 0.7355, + "step": 7765 + }, + { + "epoch": 1.2157169693174703, + "grad_norm": 2.5946998596191406, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.8353, + "step": 7766 + }, + { + "epoch": 1.2158735128365685, + "grad_norm": 8.0993070602417, + "learning_rate": 4.0900944933203e-05, + "loss": 1.1103, + "step": 7767 + }, + { + "epoch": 1.216030056355667, + "grad_norm": 2.267594337463379, + "learning_rate": 4.089279895731509e-05, + "loss": 1.045, + "step": 7768 + }, + { + "epoch": 1.2161865998747652, + "grad_norm": 3.1944103240966797, + "learning_rate": 4.088465298142718e-05, + "loss": 0.6499, + "step": 7769 + }, + { + "epoch": 1.2163431433938634, + "grad_norm": 5.0447492599487305, + "learning_rate": 4.087650700553926e-05, + "loss": 1.0153, + "step": 7770 + }, + { + "epoch": 1.2164996869129618, + "grad_norm": 5.2701334953308105, + "learning_rate": 4.0868361029651354e-05, + "loss": 1.5551, + "step": 7771 + }, + { + "epoch": 1.21665623043206, + "grad_norm": 2.198878526687622, + "learning_rate": 4.0860215053763444e-05, + "loss": 0.8622, + "step": 7772 + }, + { + "epoch": 1.2168127739511585, + "grad_norm": 2.117863178253174, + "learning_rate": 4.0852069077875535e-05, + "loss": 0.5248, + "step": 7773 + }, + { + "epoch": 1.2169693174702567, + "grad_norm": 6.46619176864624, + "learning_rate": 4.084392310198762e-05, + "loss": 1.0306, + "step": 7774 + }, + { + "epoch": 1.2171258609893552, + "grad_norm": 3.371569871902466, + "learning_rate": 4.083577712609971e-05, + "loss": 0.7815, + "step": 7775 + }, + { + "epoch": 1.2172824045084534, + "grad_norm": 2.6792821884155273, + "learning_rate": 4.08276311502118e-05, + "loss": 0.7627, + "step": 7776 + }, + { + "epoch": 1.2174389480275516, + "grad_norm": 2.2840776443481445, + "learning_rate": 4.0819485174323883e-05, + "loss": 0.8428, + "step": 7777 + 
}, + { + "epoch": 1.21759549154665, + "grad_norm": 3.254038095474243, + "learning_rate": 4.0811339198435974e-05, + "loss": 0.7331, + "step": 7778 + }, + { + "epoch": 1.2177520350657483, + "grad_norm": 2.1887731552124023, + "learning_rate": 4.0803193222548064e-05, + "loss": 0.5201, + "step": 7779 + }, + { + "epoch": 1.2179085785848467, + "grad_norm": 2.1577541828155518, + "learning_rate": 4.079504724666015e-05, + "loss": 0.6844, + "step": 7780 + }, + { + "epoch": 1.218065122103945, + "grad_norm": 6.390582084655762, + "learning_rate": 4.0786901270772246e-05, + "loss": 0.9124, + "step": 7781 + }, + { + "epoch": 1.2182216656230431, + "grad_norm": 3.0352532863616943, + "learning_rate": 4.077875529488433e-05, + "loss": 0.6756, + "step": 7782 + }, + { + "epoch": 1.2183782091421416, + "grad_norm": 3.915550947189331, + "learning_rate": 4.077060931899642e-05, + "loss": 1.1811, + "step": 7783 + }, + { + "epoch": 1.2185347526612398, + "grad_norm": 6.116030216217041, + "learning_rate": 4.076246334310851e-05, + "loss": 1.5198, + "step": 7784 + }, + { + "epoch": 1.2186912961803382, + "grad_norm": 3.0630221366882324, + "learning_rate": 4.0754317367220594e-05, + "loss": 0.9762, + "step": 7785 + }, + { + "epoch": 1.2188478396994364, + "grad_norm": 2.6783523559570312, + "learning_rate": 4.0746171391332685e-05, + "loss": 0.4049, + "step": 7786 + }, + { + "epoch": 1.2190043832185347, + "grad_norm": 1.3078604936599731, + "learning_rate": 4.0738025415444775e-05, + "loss": 0.2408, + "step": 7787 + }, + { + "epoch": 1.219160926737633, + "grad_norm": 3.3182995319366455, + "learning_rate": 4.072987943955686e-05, + "loss": 0.9497, + "step": 7788 + }, + { + "epoch": 1.2193174702567313, + "grad_norm": 0.311099112033844, + "learning_rate": 4.072173346366895e-05, + "loss": 0.1525, + "step": 7789 + }, + { + "epoch": 1.2194740137758298, + "grad_norm": 0.806235671043396, + "learning_rate": 4.071358748778104e-05, + "loss": 0.2104, + "step": 7790 + }, + { + "epoch": 1.219630557294928, + "grad_norm": 
0.7746682167053223, + "learning_rate": 4.070544151189313e-05, + "loss": 0.1704, + "step": 7791 + }, + { + "epoch": 1.2197871008140262, + "grad_norm": 1.7647334337234497, + "learning_rate": 4.0697295536005214e-05, + "loss": 0.2022, + "step": 7792 + }, + { + "epoch": 1.2199436443331246, + "grad_norm": 0.7966252565383911, + "learning_rate": 4.0689149560117305e-05, + "loss": 0.2216, + "step": 7793 + }, + { + "epoch": 1.2201001878522229, + "grad_norm": 0.6837196946144104, + "learning_rate": 4.0681003584229395e-05, + "loss": 0.2315, + "step": 7794 + }, + { + "epoch": 1.2202567313713213, + "grad_norm": 0.6758849620819092, + "learning_rate": 4.067285760834148e-05, + "loss": 0.2848, + "step": 7795 + }, + { + "epoch": 1.2204132748904195, + "grad_norm": 0.5614440441131592, + "learning_rate": 4.066471163245357e-05, + "loss": 0.1593, + "step": 7796 + }, + { + "epoch": 1.2205698184095177, + "grad_norm": 0.6038888096809387, + "learning_rate": 4.065656565656566e-05, + "loss": 0.1922, + "step": 7797 + }, + { + "epoch": 1.2207263619286162, + "grad_norm": 0.5941992402076721, + "learning_rate": 4.0648419680677744e-05, + "loss": 0.1867, + "step": 7798 + }, + { + "epoch": 1.2208829054477144, + "grad_norm": 0.8413442373275757, + "learning_rate": 4.064027370478984e-05, + "loss": 0.3542, + "step": 7799 + }, + { + "epoch": 1.2210394489668128, + "grad_norm": 1.120608925819397, + "learning_rate": 4.0632127728901925e-05, + "loss": 0.3245, + "step": 7800 + }, + { + "epoch": 1.221195992485911, + "grad_norm": 1.3813908100128174, + "learning_rate": 4.0623981753014015e-05, + "loss": 0.2454, + "step": 7801 + }, + { + "epoch": 1.2213525360050095, + "grad_norm": 2.6711037158966064, + "learning_rate": 4.06158357771261e-05, + "loss": 0.701, + "step": 7802 + }, + { + "epoch": 1.2215090795241077, + "grad_norm": 0.8298321962356567, + "learning_rate": 4.060768980123819e-05, + "loss": 0.2319, + "step": 7803 + }, + { + "epoch": 1.221665623043206, + "grad_norm": 0.8146275281906128, + "learning_rate": 
4.059954382535028e-05, + "loss": 0.2202, + "step": 7804 + }, + { + "epoch": 1.2218221665623044, + "grad_norm": 1.6893386840820312, + "learning_rate": 4.0591397849462364e-05, + "loss": 0.4887, + "step": 7805 + }, + { + "epoch": 1.2219787100814026, + "grad_norm": 1.2046064138412476, + "learning_rate": 4.0583251873574454e-05, + "loss": 0.4003, + "step": 7806 + }, + { + "epoch": 1.222135253600501, + "grad_norm": 2.0957634449005127, + "learning_rate": 4.0575105897686545e-05, + "loss": 0.7743, + "step": 7807 + }, + { + "epoch": 1.2222917971195992, + "grad_norm": 1.051973819732666, + "learning_rate": 4.056695992179863e-05, + "loss": 0.3978, + "step": 7808 + }, + { + "epoch": 1.2224483406386977, + "grad_norm": 1.6761633157730103, + "learning_rate": 4.0558813945910726e-05, + "loss": 0.4356, + "step": 7809 + }, + { + "epoch": 1.222604884157796, + "grad_norm": 1.6791043281555176, + "learning_rate": 4.055066797002281e-05, + "loss": 0.5342, + "step": 7810 + }, + { + "epoch": 1.222761427676894, + "grad_norm": 3.2050397396087646, + "learning_rate": 4.0542521994134894e-05, + "loss": 0.7169, + "step": 7811 + }, + { + "epoch": 1.2229179711959925, + "grad_norm": 2.078911304473877, + "learning_rate": 4.053437601824699e-05, + "loss": 0.4709, + "step": 7812 + }, + { + "epoch": 1.2230745147150908, + "grad_norm": 1.955700159072876, + "learning_rate": 4.0526230042359075e-05, + "loss": 0.6858, + "step": 7813 + }, + { + "epoch": 1.2232310582341892, + "grad_norm": 3.63061785697937, + "learning_rate": 4.0518084066471165e-05, + "loss": 0.8202, + "step": 7814 + }, + { + "epoch": 1.2233876017532874, + "grad_norm": 1.5994926691055298, + "learning_rate": 4.0509938090583256e-05, + "loss": 0.3921, + "step": 7815 + }, + { + "epoch": 1.2235441452723856, + "grad_norm": 2.169919729232788, + "learning_rate": 4.050179211469534e-05, + "loss": 0.3749, + "step": 7816 + }, + { + "epoch": 1.223700688791484, + "grad_norm": 3.3450517654418945, + "learning_rate": 4.049364613880743e-05, + "loss": 0.4125, + "step": 
7817 + }, + { + "epoch": 1.2238572323105823, + "grad_norm": 1.6765313148498535, + "learning_rate": 4.048550016291952e-05, + "loss": 0.3817, + "step": 7818 + }, + { + "epoch": 1.2240137758296807, + "grad_norm": 3.9646122455596924, + "learning_rate": 4.0477354187031604e-05, + "loss": 0.9811, + "step": 7819 + }, + { + "epoch": 1.224170319348779, + "grad_norm": 2.153231143951416, + "learning_rate": 4.0469208211143695e-05, + "loss": 0.7829, + "step": 7820 + }, + { + "epoch": 1.2243268628678772, + "grad_norm": 5.720675945281982, + "learning_rate": 4.0461062235255785e-05, + "loss": 1.0254, + "step": 7821 + }, + { + "epoch": 1.2244834063869756, + "grad_norm": 2.9187488555908203, + "learning_rate": 4.0452916259367876e-05, + "loss": 0.7179, + "step": 7822 + }, + { + "epoch": 1.2246399499060738, + "grad_norm": 1.7171374559402466, + "learning_rate": 4.044477028347996e-05, + "loss": 0.529, + "step": 7823 + }, + { + "epoch": 1.2247964934251723, + "grad_norm": 4.401496410369873, + "learning_rate": 4.043662430759205e-05, + "loss": 1.3406, + "step": 7824 + }, + { + "epoch": 1.2249530369442705, + "grad_norm": 7.417834758758545, + "learning_rate": 4.042847833170414e-05, + "loss": 0.7531, + "step": 7825 + }, + { + "epoch": 1.2251095804633687, + "grad_norm": 4.837009429931641, + "learning_rate": 4.0420332355816224e-05, + "loss": 0.8905, + "step": 7826 + }, + { + "epoch": 1.2252661239824671, + "grad_norm": 6.211047649383545, + "learning_rate": 4.041218637992832e-05, + "loss": 1.4173, + "step": 7827 + }, + { + "epoch": 1.2254226675015654, + "grad_norm": 1.9618042707443237, + "learning_rate": 4.0404040404040405e-05, + "loss": 0.7442, + "step": 7828 + }, + { + "epoch": 1.2255792110206638, + "grad_norm": 3.0650863647460938, + "learning_rate": 4.039589442815249e-05, + "loss": 0.9998, + "step": 7829 + }, + { + "epoch": 1.225735754539762, + "grad_norm": 2.427259683609009, + "learning_rate": 4.0387748452264586e-05, + "loss": 0.7956, + "step": 7830 + }, + { + "epoch": 1.2258922980588602, + 
"grad_norm": 7.408844470977783, + "learning_rate": 4.037960247637667e-05, + "loss": 1.1375, + "step": 7831 + }, + { + "epoch": 1.2260488415779587, + "grad_norm": 5.211965560913086, + "learning_rate": 4.037145650048876e-05, + "loss": 1.331, + "step": 7832 + }, + { + "epoch": 1.226205385097057, + "grad_norm": 3.3737967014312744, + "learning_rate": 4.036331052460085e-05, + "loss": 1.2589, + "step": 7833 + }, + { + "epoch": 1.2263619286161553, + "grad_norm": 8.366302490234375, + "learning_rate": 4.0355164548712935e-05, + "loss": 0.3737, + "step": 7834 + }, + { + "epoch": 1.2265184721352536, + "grad_norm": 4.789961814880371, + "learning_rate": 4.0347018572825025e-05, + "loss": 0.6587, + "step": 7835 + }, + { + "epoch": 1.226675015654352, + "grad_norm": 6.1868767738342285, + "learning_rate": 4.0338872596937116e-05, + "loss": 0.7832, + "step": 7836 + }, + { + "epoch": 1.2268315591734502, + "grad_norm": 2.5992801189422607, + "learning_rate": 4.03307266210492e-05, + "loss": 0.9146, + "step": 7837 + }, + { + "epoch": 1.2269881026925484, + "grad_norm": 3.3605499267578125, + "learning_rate": 4.032258064516129e-05, + "loss": 0.8179, + "step": 7838 + }, + { + "epoch": 1.2271446462116469, + "grad_norm": 0.5024340152740479, + "learning_rate": 4.031443466927338e-05, + "loss": 0.2011, + "step": 7839 + }, + { + "epoch": 1.227301189730745, + "grad_norm": 0.6244211196899414, + "learning_rate": 4.030628869338547e-05, + "loss": 0.189, + "step": 7840 + }, + { + "epoch": 1.2274577332498435, + "grad_norm": 0.553145706653595, + "learning_rate": 4.0298142717497555e-05, + "loss": 0.1954, + "step": 7841 + }, + { + "epoch": 1.2276142767689417, + "grad_norm": 0.49140286445617676, + "learning_rate": 4.0289996741609646e-05, + "loss": 0.1442, + "step": 7842 + }, + { + "epoch": 1.2277708202880402, + "grad_norm": 0.676360547542572, + "learning_rate": 4.0281850765721736e-05, + "loss": 0.2, + "step": 7843 + }, + { + "epoch": 1.2279273638071384, + "grad_norm": 0.5992287993431091, + "learning_rate": 
4.027370478983382e-05, + "loss": 0.2052, + "step": 7844 + }, + { + "epoch": 1.2280839073262366, + "grad_norm": 0.9023377299308777, + "learning_rate": 4.026555881394592e-05, + "loss": 0.2829, + "step": 7845 + }, + { + "epoch": 1.228240450845335, + "grad_norm": 1.1624482870101929, + "learning_rate": 4.0257412838058e-05, + "loss": 0.2626, + "step": 7846 + }, + { + "epoch": 1.2283969943644333, + "grad_norm": 0.6850337386131287, + "learning_rate": 4.0249266862170085e-05, + "loss": 0.325, + "step": 7847 + }, + { + "epoch": 1.2285535378835317, + "grad_norm": 1.4556047916412354, + "learning_rate": 4.024112088628218e-05, + "loss": 0.2753, + "step": 7848 + }, + { + "epoch": 1.22871008140263, + "grad_norm": 0.6493906378746033, + "learning_rate": 4.0232974910394266e-05, + "loss": 0.2718, + "step": 7849 + }, + { + "epoch": 1.2288666249217282, + "grad_norm": 1.6281062364578247, + "learning_rate": 4.0224828934506356e-05, + "loss": 0.483, + "step": 7850 + }, + { + "epoch": 1.2290231684408266, + "grad_norm": 1.227245807647705, + "learning_rate": 4.021668295861845e-05, + "loss": 0.3006, + "step": 7851 + }, + { + "epoch": 1.2291797119599248, + "grad_norm": 0.8473982214927673, + "learning_rate": 4.020853698273053e-05, + "loss": 0.1961, + "step": 7852 + }, + { + "epoch": 1.2293362554790233, + "grad_norm": 1.1954712867736816, + "learning_rate": 4.020039100684262e-05, + "loss": 0.3499, + "step": 7853 + }, + { + "epoch": 1.2294927989981215, + "grad_norm": 1.7953161001205444, + "learning_rate": 4.019224503095471e-05, + "loss": 0.3149, + "step": 7854 + }, + { + "epoch": 1.2296493425172197, + "grad_norm": 1.6158283948898315, + "learning_rate": 4.0184099055066795e-05, + "loss": 0.4611, + "step": 7855 + }, + { + "epoch": 1.2298058860363181, + "grad_norm": 0.6444686651229858, + "learning_rate": 4.0175953079178886e-05, + "loss": 0.2414, + "step": 7856 + }, + { + "epoch": 1.2299624295554163, + "grad_norm": 1.8256889581680298, + "learning_rate": 4.0167807103290976e-05, + "loss": 0.482, + "step": 
7857 + }, + { + "epoch": 1.2301189730745148, + "grad_norm": 1.05446195602417, + "learning_rate": 4.015966112740307e-05, + "loss": 0.2161, + "step": 7858 + }, + { + "epoch": 1.230275516593613, + "grad_norm": 2.2366483211517334, + "learning_rate": 4.015151515151515e-05, + "loss": 0.5199, + "step": 7859 + }, + { + "epoch": 1.2304320601127112, + "grad_norm": 1.1718478202819824, + "learning_rate": 4.014336917562724e-05, + "loss": 0.3534, + "step": 7860 + }, + { + "epoch": 1.2305886036318097, + "grad_norm": 1.83755362033844, + "learning_rate": 4.013522319973933e-05, + "loss": 0.3591, + "step": 7861 + }, + { + "epoch": 1.2307451471509079, + "grad_norm": 2.3541131019592285, + "learning_rate": 4.0127077223851415e-05, + "loss": 0.6277, + "step": 7862 + }, + { + "epoch": 1.2309016906700063, + "grad_norm": 2.7741587162017822, + "learning_rate": 4.0118931247963506e-05, + "loss": 0.623, + "step": 7863 + }, + { + "epoch": 1.2310582341891045, + "grad_norm": 1.3282216787338257, + "learning_rate": 4.0110785272075597e-05, + "loss": 0.3059, + "step": 7864 + }, + { + "epoch": 1.2312147777082028, + "grad_norm": 4.057281017303467, + "learning_rate": 4.010263929618768e-05, + "loss": 0.6453, + "step": 7865 + }, + { + "epoch": 1.2313713212273012, + "grad_norm": 2.1511647701263428, + "learning_rate": 4.009449332029978e-05, + "loss": 0.7755, + "step": 7866 + }, + { + "epoch": 1.2315278647463994, + "grad_norm": 2.595256805419922, + "learning_rate": 4.008634734441186e-05, + "loss": 0.5507, + "step": 7867 + }, + { + "epoch": 1.2316844082654979, + "grad_norm": 4.56450891494751, + "learning_rate": 4.007820136852395e-05, + "loss": 1.2931, + "step": 7868 + }, + { + "epoch": 1.231840951784596, + "grad_norm": 3.857823133468628, + "learning_rate": 4.007005539263604e-05, + "loss": 0.8998, + "step": 7869 + }, + { + "epoch": 1.2319974953036945, + "grad_norm": 2.082387924194336, + "learning_rate": 4.0061909416748126e-05, + "loss": 0.4713, + "step": 7870 + }, + { + "epoch": 1.2321540388227927, + 
"grad_norm": 1.6701544523239136, + "learning_rate": 4.005376344086022e-05, + "loss": 0.606, + "step": 7871 + }, + { + "epoch": 1.2323105823418912, + "grad_norm": 1.2754632234573364, + "learning_rate": 4.004561746497231e-05, + "loss": 0.5046, + "step": 7872 + }, + { + "epoch": 1.2324671258609894, + "grad_norm": 4.393087863922119, + "learning_rate": 4.003747148908439e-05, + "loss": 0.9505, + "step": 7873 + }, + { + "epoch": 1.2326236693800876, + "grad_norm": 5.92290735244751, + "learning_rate": 4.002932551319648e-05, + "loss": 1.0734, + "step": 7874 + }, + { + "epoch": 1.232780212899186, + "grad_norm": 2.567551374435425, + "learning_rate": 4.002117953730857e-05, + "loss": 0.6556, + "step": 7875 + }, + { + "epoch": 1.2329367564182843, + "grad_norm": 3.1955044269561768, + "learning_rate": 4.001303356142066e-05, + "loss": 1.0395, + "step": 7876 + }, + { + "epoch": 1.2330932999373827, + "grad_norm": 2.4793925285339355, + "learning_rate": 4.0004887585532746e-05, + "loss": 0.9491, + "step": 7877 + }, + { + "epoch": 1.233249843456481, + "grad_norm": 2.799659252166748, + "learning_rate": 3.999674160964484e-05, + "loss": 1.053, + "step": 7878 + }, + { + "epoch": 1.2334063869755791, + "grad_norm": 2.0097544193267822, + "learning_rate": 3.998859563375693e-05, + "loss": 0.5377, + "step": 7879 + }, + { + "epoch": 1.2335629304946776, + "grad_norm": 3.7643227577209473, + "learning_rate": 3.998044965786901e-05, + "loss": 1.189, + "step": 7880 + }, + { + "epoch": 1.2337194740137758, + "grad_norm": 4.8595404624938965, + "learning_rate": 3.99723036819811e-05, + "loss": 1.0655, + "step": 7881 + }, + { + "epoch": 1.2338760175328742, + "grad_norm": 2.1621181964874268, + "learning_rate": 3.996415770609319e-05, + "loss": 1.2644, + "step": 7882 + }, + { + "epoch": 1.2340325610519725, + "grad_norm": 2.503446578979492, + "learning_rate": 3.9956011730205276e-05, + "loss": 1.0142, + "step": 7883 + }, + { + "epoch": 1.2341891045710707, + "grad_norm": 3.6391000747680664, + "learning_rate": 
3.994786575431737e-05, + "loss": 0.7589, + "step": 7884 + }, + { + "epoch": 1.2343456480901691, + "grad_norm": 2.8267295360565186, + "learning_rate": 3.993971977842946e-05, + "loss": 0.7286, + "step": 7885 + }, + { + "epoch": 1.2345021916092673, + "grad_norm": 2.315208911895752, + "learning_rate": 3.993157380254155e-05, + "loss": 1.2628, + "step": 7886 + }, + { + "epoch": 1.2346587351283658, + "grad_norm": 4.627626895904541, + "learning_rate": 3.992342782665364e-05, + "loss": 0.9741, + "step": 7887 + }, + { + "epoch": 1.234815278647464, + "grad_norm": 2.86019229888916, + "learning_rate": 3.991528185076572e-05, + "loss": 1.3444, + "step": 7888 + }, + { + "epoch": 1.2349718221665622, + "grad_norm": 0.4920736849308014, + "learning_rate": 3.990713587487781e-05, + "loss": 0.2706, + "step": 7889 + }, + { + "epoch": 1.2351283656856606, + "grad_norm": 0.47822633385658264, + "learning_rate": 3.98989898989899e-05, + "loss": 0.2672, + "step": 7890 + }, + { + "epoch": 1.2352849092047589, + "grad_norm": 0.4706505835056305, + "learning_rate": 3.9890843923101986e-05, + "loss": 0.1525, + "step": 7891 + }, + { + "epoch": 1.2354414527238573, + "grad_norm": 0.6486652493476868, + "learning_rate": 3.988269794721408e-05, + "loss": 0.1995, + "step": 7892 + }, + { + "epoch": 1.2355979962429555, + "grad_norm": 0.5031141638755798, + "learning_rate": 3.987455197132617e-05, + "loss": 0.1501, + "step": 7893 + }, + { + "epoch": 1.2357545397620537, + "grad_norm": 0.43801987171173096, + "learning_rate": 3.986640599543826e-05, + "loss": 0.1788, + "step": 7894 + }, + { + "epoch": 1.2359110832811522, + "grad_norm": 0.6532099843025208, + "learning_rate": 3.985826001955034e-05, + "loss": 0.2272, + "step": 7895 + }, + { + "epoch": 1.2360676268002504, + "grad_norm": 0.9455823302268982, + "learning_rate": 3.985011404366243e-05, + "loss": 0.2456, + "step": 7896 + }, + { + "epoch": 1.2362241703193488, + "grad_norm": 0.7159378528594971, + "learning_rate": 3.984196806777452e-05, + "loss": 0.238, + "step": 
7897 + }, + { + "epoch": 1.236380713838447, + "grad_norm": 0.5336363911628723, + "learning_rate": 3.983382209188661e-05, + "loss": 0.1643, + "step": 7898 + }, + { + "epoch": 1.2365372573575455, + "grad_norm": 1.2905402183532715, + "learning_rate": 3.98256761159987e-05, + "loss": 0.3538, + "step": 7899 + }, + { + "epoch": 1.2366938008766437, + "grad_norm": 1.8717563152313232, + "learning_rate": 3.981753014011079e-05, + "loss": 0.5837, + "step": 7900 + }, + { + "epoch": 1.236850344395742, + "grad_norm": 1.1318551301956177, + "learning_rate": 3.980938416422287e-05, + "loss": 0.5888, + "step": 7901 + }, + { + "epoch": 1.2370068879148404, + "grad_norm": 0.7787688970565796, + "learning_rate": 3.980123818833497e-05, + "loss": 0.2621, + "step": 7902 + }, + { + "epoch": 1.2371634314339386, + "grad_norm": 0.7201224565505981, + "learning_rate": 3.979309221244705e-05, + "loss": 0.1896, + "step": 7903 + }, + { + "epoch": 1.237319974953037, + "grad_norm": 2.4650862216949463, + "learning_rate": 3.978494623655914e-05, + "loss": 0.413, + "step": 7904 + }, + { + "epoch": 1.2374765184721352, + "grad_norm": 7.3415632247924805, + "learning_rate": 3.9776800260671234e-05, + "loss": 1.0737, + "step": 7905 + }, + { + "epoch": 1.2376330619912337, + "grad_norm": 1.396141767501831, + "learning_rate": 3.976865428478332e-05, + "loss": 0.349, + "step": 7906 + }, + { + "epoch": 1.237789605510332, + "grad_norm": 1.3751226663589478, + "learning_rate": 3.976050830889541e-05, + "loss": 0.5223, + "step": 7907 + }, + { + "epoch": 1.2379461490294301, + "grad_norm": 1.9633326530456543, + "learning_rate": 3.97523623330075e-05, + "loss": 0.5157, + "step": 7908 + }, + { + "epoch": 1.2381026925485286, + "grad_norm": 1.4622845649719238, + "learning_rate": 3.974421635711958e-05, + "loss": 0.3378, + "step": 7909 + }, + { + "epoch": 1.2382592360676268, + "grad_norm": 2.3205630779266357, + "learning_rate": 3.973607038123167e-05, + "loss": 0.6295, + "step": 7910 + }, + { + "epoch": 1.2384157795867252, + 
"grad_norm": 1.6743199825286865, + "learning_rate": 3.972792440534376e-05, + "loss": 0.4383, + "step": 7911 + }, + { + "epoch": 1.2385723231058234, + "grad_norm": 1.5907877683639526, + "learning_rate": 3.9719778429455854e-05, + "loss": 0.7185, + "step": 7912 + }, + { + "epoch": 1.2387288666249217, + "grad_norm": 1.3310531377792358, + "learning_rate": 3.971163245356794e-05, + "loss": 0.3106, + "step": 7913 + }, + { + "epoch": 1.23888541014402, + "grad_norm": 1.4525573253631592, + "learning_rate": 3.970348647768003e-05, + "loss": 0.3577, + "step": 7914 + }, + { + "epoch": 1.2390419536631183, + "grad_norm": 2.19777250289917, + "learning_rate": 3.969534050179212e-05, + "loss": 0.6845, + "step": 7915 + }, + { + "epoch": 1.2391984971822168, + "grad_norm": 2.404719829559326, + "learning_rate": 3.96871945259042e-05, + "loss": 0.8738, + "step": 7916 + }, + { + "epoch": 1.239355040701315, + "grad_norm": 3.308324098587036, + "learning_rate": 3.967904855001629e-05, + "loss": 0.6673, + "step": 7917 + }, + { + "epoch": 1.2395115842204132, + "grad_norm": 2.9720957279205322, + "learning_rate": 3.967090257412838e-05, + "loss": 0.8264, + "step": 7918 + }, + { + "epoch": 1.2396681277395116, + "grad_norm": 1.9595211744308472, + "learning_rate": 3.966275659824047e-05, + "loss": 0.8824, + "step": 7919 + }, + { + "epoch": 1.2398246712586098, + "grad_norm": 3.9464612007141113, + "learning_rate": 3.9654610622352564e-05, + "loss": 1.2061, + "step": 7920 + }, + { + "epoch": 1.2399812147777083, + "grad_norm": 3.616525173187256, + "learning_rate": 3.964646464646465e-05, + "loss": 0.8467, + "step": 7921 + }, + { + "epoch": 1.2401377582968065, + "grad_norm": 2.215108871459961, + "learning_rate": 3.963831867057673e-05, + "loss": 0.9076, + "step": 7922 + }, + { + "epoch": 1.2402943018159047, + "grad_norm": 2.8966729640960693, + "learning_rate": 3.963017269468883e-05, + "loss": 1.4071, + "step": 7923 + }, + { + "epoch": 1.2404508453350032, + "grad_norm": 3.5867161750793457, + "learning_rate": 
3.962202671880091e-05, + "loss": 0.6189, + "step": 7924 + }, + { + "epoch": 1.2406073888541014, + "grad_norm": 2.023092031478882, + "learning_rate": 3.9613880742913e-05, + "loss": 0.8147, + "step": 7925 + }, + { + "epoch": 1.2407639323731998, + "grad_norm": 2.3457136154174805, + "learning_rate": 3.9605734767025094e-05, + "loss": 0.4868, + "step": 7926 + }, + { + "epoch": 1.240920475892298, + "grad_norm": 1.8363497257232666, + "learning_rate": 3.959758879113718e-05, + "loss": 0.9228, + "step": 7927 + }, + { + "epoch": 1.2410770194113963, + "grad_norm": 2.2233405113220215, + "learning_rate": 3.958944281524927e-05, + "loss": 0.5903, + "step": 7928 + }, + { + "epoch": 1.2412335629304947, + "grad_norm": 5.117366313934326, + "learning_rate": 3.958129683936136e-05, + "loss": 1.7283, + "step": 7929 + }, + { + "epoch": 1.241390106449593, + "grad_norm": 2.3053958415985107, + "learning_rate": 3.957315086347345e-05, + "loss": 1.1346, + "step": 7930 + }, + { + "epoch": 1.2415466499686914, + "grad_norm": 2.418189525604248, + "learning_rate": 3.956500488758553e-05, + "loss": 0.7948, + "step": 7931 + }, + { + "epoch": 1.2417031934877896, + "grad_norm": 2.2288455963134766, + "learning_rate": 3.9556858911697624e-05, + "loss": 0.6817, + "step": 7932 + }, + { + "epoch": 1.241859737006888, + "grad_norm": 2.4967315196990967, + "learning_rate": 3.9548712935809714e-05, + "loss": 0.6108, + "step": 7933 + }, + { + "epoch": 1.2420162805259862, + "grad_norm": 4.300291061401367, + "learning_rate": 3.95405669599218e-05, + "loss": 0.7037, + "step": 7934 + }, + { + "epoch": 1.2421728240450844, + "grad_norm": 5.373396396636963, + "learning_rate": 3.953242098403389e-05, + "loss": 0.8716, + "step": 7935 + }, + { + "epoch": 1.2423293675641829, + "grad_norm": 3.4109280109405518, + "learning_rate": 3.952427500814598e-05, + "loss": 0.9511, + "step": 7936 + }, + { + "epoch": 1.242485911083281, + "grad_norm": 3.0970077514648438, + "learning_rate": 3.951612903225806e-05, + "loss": 0.8151, + "step": 7937 + 
}, + { + "epoch": 1.2426424546023795, + "grad_norm": 4.269380569458008, + "learning_rate": 3.950798305637016e-05, + "loss": 1.4, + "step": 7938 + }, + { + "epoch": 1.2427989981214778, + "grad_norm": 0.5039753317832947, + "learning_rate": 3.9499837080482244e-05, + "loss": 0.2216, + "step": 7939 + }, + { + "epoch": 1.2429555416405762, + "grad_norm": 0.5269017815589905, + "learning_rate": 3.949169110459433e-05, + "loss": 0.23, + "step": 7940 + }, + { + "epoch": 1.2431120851596744, + "grad_norm": 0.3700673282146454, + "learning_rate": 3.9483545128706425e-05, + "loss": 0.183, + "step": 7941 + }, + { + "epoch": 1.2432686286787726, + "grad_norm": 0.4385509788990021, + "learning_rate": 3.947539915281851e-05, + "loss": 0.1642, + "step": 7942 + }, + { + "epoch": 1.243425172197871, + "grad_norm": 0.856357991695404, + "learning_rate": 3.94672531769306e-05, + "loss": 0.2368, + "step": 7943 + }, + { + "epoch": 1.2435817157169693, + "grad_norm": 0.7279939651489258, + "learning_rate": 3.945910720104269e-05, + "loss": 0.1797, + "step": 7944 + }, + { + "epoch": 1.2437382592360677, + "grad_norm": 0.9569168090820312, + "learning_rate": 3.945096122515477e-05, + "loss": 0.2225, + "step": 7945 + }, + { + "epoch": 1.243894802755166, + "grad_norm": 0.7991119623184204, + "learning_rate": 3.9442815249266864e-05, + "loss": 0.224, + "step": 7946 + }, + { + "epoch": 1.2440513462742642, + "grad_norm": 5.683965682983398, + "learning_rate": 3.9434669273378954e-05, + "loss": 0.291, + "step": 7947 + }, + { + "epoch": 1.2442078897933626, + "grad_norm": 0.9734595417976379, + "learning_rate": 3.9426523297491045e-05, + "loss": 0.2614, + "step": 7948 + }, + { + "epoch": 1.2443644333124608, + "grad_norm": 0.522993266582489, + "learning_rate": 3.941837732160313e-05, + "loss": 0.1292, + "step": 7949 + }, + { + "epoch": 1.2445209768315593, + "grad_norm": 1.1721924543380737, + "learning_rate": 3.941023134571522e-05, + "loss": 0.2743, + "step": 7950 + }, + { + "epoch": 1.2446775203506575, + "grad_norm": 
1.0499588251113892, + "learning_rate": 3.940208536982731e-05, + "loss": 0.2933, + "step": 7951 + }, + { + "epoch": 1.2448340638697557, + "grad_norm": 1.8475884199142456, + "learning_rate": 3.939393939393939e-05, + "loss": 0.3844, + "step": 7952 + }, + { + "epoch": 1.2449906073888541, + "grad_norm": 1.6360729932785034, + "learning_rate": 3.9385793418051484e-05, + "loss": 0.3214, + "step": 7953 + }, + { + "epoch": 1.2451471509079524, + "grad_norm": 1.185647964477539, + "learning_rate": 3.9377647442163574e-05, + "loss": 0.5131, + "step": 7954 + }, + { + "epoch": 1.2453036944270508, + "grad_norm": 1.0250483751296997, + "learning_rate": 3.936950146627566e-05, + "loss": 0.1632, + "step": 7955 + }, + { + "epoch": 1.245460237946149, + "grad_norm": 1.3814512491226196, + "learning_rate": 3.9361355490387755e-05, + "loss": 0.3237, + "step": 7956 + }, + { + "epoch": 1.2456167814652472, + "grad_norm": 1.9650413990020752, + "learning_rate": 3.935320951449984e-05, + "loss": 0.3462, + "step": 7957 + }, + { + "epoch": 1.2457733249843457, + "grad_norm": 2.437375545501709, + "learning_rate": 3.934506353861192e-05, + "loss": 0.3292, + "step": 7958 + }, + { + "epoch": 1.245929868503444, + "grad_norm": 2.2386679649353027, + "learning_rate": 3.933691756272402e-05, + "loss": 0.5734, + "step": 7959 + }, + { + "epoch": 1.2460864120225423, + "grad_norm": 2.048077344894409, + "learning_rate": 3.9328771586836104e-05, + "loss": 0.336, + "step": 7960 + }, + { + "epoch": 1.2462429555416406, + "grad_norm": 1.8403156995773315, + "learning_rate": 3.9320625610948195e-05, + "loss": 0.5719, + "step": 7961 + }, + { + "epoch": 1.2463994990607388, + "grad_norm": 2.108642339706421, + "learning_rate": 3.9312479635060285e-05, + "loss": 0.4394, + "step": 7962 + }, + { + "epoch": 1.2465560425798372, + "grad_norm": 2.062166929244995, + "learning_rate": 3.930433365917237e-05, + "loss": 0.4138, + "step": 7963 + }, + { + "epoch": 1.2467125860989354, + "grad_norm": 2.6170260906219482, + "learning_rate": 
3.929618768328446e-05, + "loss": 0.4877, + "step": 7964 + }, + { + "epoch": 1.2468691296180339, + "grad_norm": 1.8064284324645996, + "learning_rate": 3.928804170739655e-05, + "loss": 0.5191, + "step": 7965 + }, + { + "epoch": 1.247025673137132, + "grad_norm": 2.343088388442993, + "learning_rate": 3.9279895731508634e-05, + "loss": 0.6073, + "step": 7966 + }, + { + "epoch": 1.2471822166562305, + "grad_norm": 2.753354787826538, + "learning_rate": 3.9271749755620724e-05, + "loss": 0.8594, + "step": 7967 + }, + { + "epoch": 1.2473387601753287, + "grad_norm": 2.3910295963287354, + "learning_rate": 3.9263603779732815e-05, + "loss": 0.8853, + "step": 7968 + }, + { + "epoch": 1.247495303694427, + "grad_norm": 2.0719335079193115, + "learning_rate": 3.9255457803844905e-05, + "loss": 0.671, + "step": 7969 + }, + { + "epoch": 1.2476518472135254, + "grad_norm": 2.2218570709228516, + "learning_rate": 3.924731182795699e-05, + "loss": 0.7132, + "step": 7970 + }, + { + "epoch": 1.2478083907326236, + "grad_norm": 2.5646026134490967, + "learning_rate": 3.923916585206908e-05, + "loss": 0.578, + "step": 7971 + }, + { + "epoch": 1.247964934251722, + "grad_norm": 2.8402252197265625, + "learning_rate": 3.923101987618117e-05, + "loss": 0.5615, + "step": 7972 + }, + { + "epoch": 1.2481214777708203, + "grad_norm": 1.3216972351074219, + "learning_rate": 3.9222873900293254e-05, + "loss": 0.5693, + "step": 7973 + }, + { + "epoch": 1.2482780212899187, + "grad_norm": 5.667109489440918, + "learning_rate": 3.921472792440535e-05, + "loss": 0.7694, + "step": 7974 + }, + { + "epoch": 1.248434564809017, + "grad_norm": 3.6590027809143066, + "learning_rate": 3.9206581948517435e-05, + "loss": 0.8036, + "step": 7975 + }, + { + "epoch": 1.2485911083281152, + "grad_norm": 3.812190294265747, + "learning_rate": 3.919843597262952e-05, + "loss": 1.2035, + "step": 7976 + }, + { + "epoch": 1.2487476518472136, + "grad_norm": 3.8372960090637207, + "learning_rate": 3.9190289996741616e-05, + "loss": 0.969, + "step": 
7977 + }, + { + "epoch": 1.2489041953663118, + "grad_norm": 5.423093318939209, + "learning_rate": 3.91821440208537e-05, + "loss": 0.9299, + "step": 7978 + }, + { + "epoch": 1.2490607388854102, + "grad_norm": 3.409607172012329, + "learning_rate": 3.917399804496579e-05, + "loss": 0.7555, + "step": 7979 + }, + { + "epoch": 1.2492172824045085, + "grad_norm": 5.371906757354736, + "learning_rate": 3.916585206907788e-05, + "loss": 1.0024, + "step": 7980 + }, + { + "epoch": 1.2493738259236067, + "grad_norm": 3.99310040473938, + "learning_rate": 3.9157706093189964e-05, + "loss": 0.9721, + "step": 7981 + }, + { + "epoch": 1.2495303694427051, + "grad_norm": 4.269493579864502, + "learning_rate": 3.9149560117302055e-05, + "loss": 0.5878, + "step": 7982 + }, + { + "epoch": 1.2496869129618033, + "grad_norm": 4.091816425323486, + "learning_rate": 3.9141414141414145e-05, + "loss": 1.1688, + "step": 7983 + }, + { + "epoch": 1.2498434564809018, + "grad_norm": 2.8121039867401123, + "learning_rate": 3.913326816552623e-05, + "loss": 0.8693, + "step": 7984 + }, + { + "epoch": 1.25, + "grad_norm": 4.116065502166748, + "learning_rate": 3.912512218963832e-05, + "loss": 0.5065, + "step": 7985 + }, + { + "epoch": 1.2501565435190982, + "grad_norm": 2.249877691268921, + "learning_rate": 3.911697621375041e-05, + "loss": 0.7636, + "step": 7986 + }, + { + "epoch": 1.2503130870381967, + "grad_norm": 1.6179249286651611, + "learning_rate": 3.91088302378625e-05, + "loss": 0.4354, + "step": 7987 + }, + { + "epoch": 1.2504696305572949, + "grad_norm": 2.502060890197754, + "learning_rate": 3.9100684261974585e-05, + "loss": 0.6264, + "step": 7988 + }, + { + "epoch": 1.2506261740763933, + "grad_norm": 0.5363879203796387, + "learning_rate": 3.9092538286086675e-05, + "loss": 0.1664, + "step": 7989 + }, + { + "epoch": 1.2507827175954915, + "grad_norm": 0.623237133026123, + "learning_rate": 3.9084392310198766e-05, + "loss": 0.1679, + "step": 7990 + }, + { + "epoch": 1.2509392611145898, + "grad_norm": 
1.0310111045837402, + "learning_rate": 3.907624633431085e-05, + "loss": 0.2269, + "step": 7991 + }, + { + "epoch": 1.2510958046336882, + "grad_norm": 0.61732417345047, + "learning_rate": 3.906810035842295e-05, + "loss": 0.3016, + "step": 7992 + }, + { + "epoch": 1.2512523481527864, + "grad_norm": 0.575916051864624, + "learning_rate": 3.905995438253503e-05, + "loss": 0.2083, + "step": 7993 + }, + { + "epoch": 1.2514088916718848, + "grad_norm": 0.4413313567638397, + "learning_rate": 3.9051808406647114e-05, + "loss": 0.1763, + "step": 7994 + }, + { + "epoch": 1.251565435190983, + "grad_norm": 0.7492988705635071, + "learning_rate": 3.904366243075921e-05, + "loss": 0.2061, + "step": 7995 + }, + { + "epoch": 1.2517219787100813, + "grad_norm": 0.8468184471130371, + "learning_rate": 3.9035516454871295e-05, + "loss": 0.3037, + "step": 7996 + }, + { + "epoch": 1.2518785222291797, + "grad_norm": 1.3908134698867798, + "learning_rate": 3.9027370478983386e-05, + "loss": 0.512, + "step": 7997 + }, + { + "epoch": 1.252035065748278, + "grad_norm": 0.6503696441650391, + "learning_rate": 3.9019224503095476e-05, + "loss": 0.2035, + "step": 7998 + }, + { + "epoch": 1.2521916092673764, + "grad_norm": 1.5032057762145996, + "learning_rate": 3.901107852720756e-05, + "loss": 0.1975, + "step": 7999 + }, + { + "epoch": 1.2523481527864746, + "grad_norm": 0.8394721150398254, + "learning_rate": 3.900293255131965e-05, + "loss": 0.1934, + "step": 8000 + }, + { + "epoch": 1.2523481527864746, + "eval_loss": 0.4895005226135254, + "eval_runtime": 203.7811, + "eval_samples_per_second": 60.766, + "eval_steps_per_second": 3.798, + "eval_wer": 0.31306713135133984, + "step": 8000 + }, + { + "epoch": 1.2525046963055728, + "grad_norm": 1.4871906042099, + "learning_rate": 3.899478657543174e-05, + "loss": 0.4102, + "step": 8001 + }, + { + "epoch": 1.2526612398246713, + "grad_norm": 1.8475325107574463, + "learning_rate": 3.8986640599543825e-05, + "loss": 0.363, + "step": 8002 + }, + { + "epoch": 
1.2528177833437697, + "grad_norm": 1.3180683851242065, + "learning_rate": 3.8978494623655915e-05, + "loss": 0.2525, + "step": 8003 + }, + { + "epoch": 1.252974326862868, + "grad_norm": 1.691438913345337, + "learning_rate": 3.8970348647768006e-05, + "loss": 0.3047, + "step": 8004 + }, + { + "epoch": 1.2531308703819661, + "grad_norm": 1.334466576576233, + "learning_rate": 3.8962202671880096e-05, + "loss": 0.4316, + "step": 8005 + }, + { + "epoch": 1.2532874139010646, + "grad_norm": 0.9530674815177917, + "learning_rate": 3.895405669599218e-05, + "loss": 0.4036, + "step": 8006 + }, + { + "epoch": 1.2534439574201628, + "grad_norm": 1.6380890607833862, + "learning_rate": 3.894591072010427e-05, + "loss": 0.2969, + "step": 8007 + }, + { + "epoch": 1.2536005009392612, + "grad_norm": 2.118945360183716, + "learning_rate": 3.893776474421636e-05, + "loss": 0.4265, + "step": 8008 + }, + { + "epoch": 1.2537570444583594, + "grad_norm": 1.46430504322052, + "learning_rate": 3.8929618768328445e-05, + "loss": 0.4474, + "step": 8009 + }, + { + "epoch": 1.2539135879774577, + "grad_norm": 1.0656672716140747, + "learning_rate": 3.8921472792440535e-05, + "loss": 0.3869, + "step": 8010 + }, + { + "epoch": 1.254070131496556, + "grad_norm": 2.558854818344116, + "learning_rate": 3.8913326816552626e-05, + "loss": 0.2983, + "step": 8011 + }, + { + "epoch": 1.2542266750156543, + "grad_norm": 1.6094218492507935, + "learning_rate": 3.890518084066471e-05, + "loss": 0.5728, + "step": 8012 + }, + { + "epoch": 1.2543832185347528, + "grad_norm": 2.763535976409912, + "learning_rate": 3.889703486477681e-05, + "loss": 0.4181, + "step": 8013 + }, + { + "epoch": 1.254539762053851, + "grad_norm": 1.6084928512573242, + "learning_rate": 3.888888888888889e-05, + "loss": 0.4539, + "step": 8014 + }, + { + "epoch": 1.2546963055729492, + "grad_norm": 1.8146976232528687, + "learning_rate": 3.888074291300098e-05, + "loss": 0.5461, + "step": 8015 + }, + { + "epoch": 1.2548528490920476, + "grad_norm": 
0.9268945455551147, + "learning_rate": 3.887259693711307e-05, + "loss": 0.2323, + "step": 8016 + }, + { + "epoch": 1.2550093926111459, + "grad_norm": 1.5899856090545654, + "learning_rate": 3.8864450961225156e-05, + "loss": 0.5515, + "step": 8017 + }, + { + "epoch": 1.2551659361302443, + "grad_norm": 3.159527540206909, + "learning_rate": 3.8856304985337246e-05, + "loss": 0.5661, + "step": 8018 + }, + { + "epoch": 1.2553224796493425, + "grad_norm": 1.4854668378829956, + "learning_rate": 3.8848159009449337e-05, + "loss": 0.3545, + "step": 8019 + }, + { + "epoch": 1.2554790231684407, + "grad_norm": 1.8869200944900513, + "learning_rate": 3.884001303356142e-05, + "loss": 0.6861, + "step": 8020 + }, + { + "epoch": 1.2556355666875392, + "grad_norm": 2.297090768814087, + "learning_rate": 3.883186705767351e-05, + "loss": 0.6574, + "step": 8021 + }, + { + "epoch": 1.2557921102066374, + "grad_norm": 2.55340838432312, + "learning_rate": 3.88237210817856e-05, + "loss": 0.7185, + "step": 8022 + }, + { + "epoch": 1.2559486537257358, + "grad_norm": 2.4492573738098145, + "learning_rate": 3.881557510589769e-05, + "loss": 0.7787, + "step": 8023 + }, + { + "epoch": 1.256105197244834, + "grad_norm": 2.848492383956909, + "learning_rate": 3.8807429130009776e-05, + "loss": 0.5507, + "step": 8024 + }, + { + "epoch": 1.2562617407639323, + "grad_norm": 1.8587149381637573, + "learning_rate": 3.8799283154121866e-05, + "loss": 0.5982, + "step": 8025 + }, + { + "epoch": 1.2564182842830307, + "grad_norm": 3.3497982025146484, + "learning_rate": 3.879113717823396e-05, + "loss": 1.4058, + "step": 8026 + }, + { + "epoch": 1.256574827802129, + "grad_norm": 3.76898193359375, + "learning_rate": 3.878299120234604e-05, + "loss": 1.1773, + "step": 8027 + }, + { + "epoch": 1.2567313713212274, + "grad_norm": 6.088141441345215, + "learning_rate": 3.877484522645813e-05, + "loss": 1.1316, + "step": 8028 + }, + { + "epoch": 1.2568879148403256, + "grad_norm": 2.507021903991699, + "learning_rate": 
3.876669925057022e-05, + "loss": 0.8472, + "step": 8029 + }, + { + "epoch": 1.2570444583594238, + "grad_norm": 3.9785988330841064, + "learning_rate": 3.8758553274682305e-05, + "loss": 1.0487, + "step": 8030 + }, + { + "epoch": 1.2572010018785222, + "grad_norm": 2.093693971633911, + "learning_rate": 3.87504072987944e-05, + "loss": 0.9814, + "step": 8031 + }, + { + "epoch": 1.2573575453976205, + "grad_norm": 3.169156074523926, + "learning_rate": 3.8742261322906486e-05, + "loss": 0.6232, + "step": 8032 + }, + { + "epoch": 1.257514088916719, + "grad_norm": 1.8769004344940186, + "learning_rate": 3.873411534701858e-05, + "loss": 0.5145, + "step": 8033 + }, + { + "epoch": 1.2576706324358171, + "grad_norm": 3.5049068927764893, + "learning_rate": 3.872596937113067e-05, + "loss": 0.5845, + "step": 8034 + }, + { + "epoch": 1.2578271759549153, + "grad_norm": 3.390618324279785, + "learning_rate": 3.871782339524275e-05, + "loss": 0.5163, + "step": 8035 + }, + { + "epoch": 1.2579837194740138, + "grad_norm": 2.679562568664551, + "learning_rate": 3.870967741935484e-05, + "loss": 0.5249, + "step": 8036 + }, + { + "epoch": 1.2581402629931122, + "grad_norm": 3.390705108642578, + "learning_rate": 3.870153144346693e-05, + "loss": 0.7587, + "step": 8037 + }, + { + "epoch": 1.2582968065122104, + "grad_norm": 2.401384115219116, + "learning_rate": 3.8693385467579016e-05, + "loss": 0.6757, + "step": 8038 + }, + { + "epoch": 1.2584533500313086, + "grad_norm": 0.7522432804107666, + "learning_rate": 3.8685239491691106e-05, + "loss": 0.2388, + "step": 8039 + }, + { + "epoch": 1.258609893550407, + "grad_norm": 0.6442375779151917, + "learning_rate": 3.86770935158032e-05, + "loss": 0.3134, + "step": 8040 + }, + { + "epoch": 1.2587664370695053, + "grad_norm": 0.6556742191314697, + "learning_rate": 3.866894753991529e-05, + "loss": 0.2449, + "step": 8041 + }, + { + "epoch": 1.2589229805886037, + "grad_norm": 0.6328599452972412, + "learning_rate": 3.866080156402737e-05, + "loss": 0.1817, + "step": 8042 
+ }, + { + "epoch": 1.259079524107702, + "grad_norm": 0.7160307765007019, + "learning_rate": 3.865265558813946e-05, + "loss": 0.2113, + "step": 8043 + }, + { + "epoch": 1.2592360676268002, + "grad_norm": 0.9893240928649902, + "learning_rate": 3.864450961225155e-05, + "loss": 0.2146, + "step": 8044 + }, + { + "epoch": 1.2593926111458986, + "grad_norm": 2.0600433349609375, + "learning_rate": 3.8636363636363636e-05, + "loss": 0.2379, + "step": 8045 + }, + { + "epoch": 1.2595491546649968, + "grad_norm": 0.8536028861999512, + "learning_rate": 3.8628217660475727e-05, + "loss": 0.1791, + "step": 8046 + }, + { + "epoch": 1.2597056981840953, + "grad_norm": 1.2713578939437866, + "learning_rate": 3.862007168458782e-05, + "loss": 0.2191, + "step": 8047 + }, + { + "epoch": 1.2598622417031935, + "grad_norm": 0.6765434145927429, + "learning_rate": 3.86119257086999e-05, + "loss": 0.2098, + "step": 8048 + }, + { + "epoch": 1.2600187852222917, + "grad_norm": 0.9137091040611267, + "learning_rate": 3.8603779732812e-05, + "loss": 0.3454, + "step": 8049 + }, + { + "epoch": 1.2601753287413902, + "grad_norm": 0.9079806208610535, + "learning_rate": 3.859563375692408e-05, + "loss": 0.2829, + "step": 8050 + }, + { + "epoch": 1.2603318722604884, + "grad_norm": 1.2766894102096558, + "learning_rate": 3.8587487781036166e-05, + "loss": 0.2564, + "step": 8051 + }, + { + "epoch": 1.2604884157795868, + "grad_norm": 0.830506443977356, + "learning_rate": 3.857934180514826e-05, + "loss": 0.2457, + "step": 8052 + }, + { + "epoch": 1.260644959298685, + "grad_norm": 1.0213711261749268, + "learning_rate": 3.857119582926035e-05, + "loss": 0.1805, + "step": 8053 + }, + { + "epoch": 1.2608015028177832, + "grad_norm": 1.4553818702697754, + "learning_rate": 3.856304985337244e-05, + "loss": 0.7445, + "step": 8054 + }, + { + "epoch": 1.2609580463368817, + "grad_norm": 0.7930148243904114, + "learning_rate": 3.855490387748453e-05, + "loss": 0.2484, + "step": 8055 + }, + { + "epoch": 1.26111458985598, + "grad_norm": 
0.9578269124031067, + "learning_rate": 3.854675790159661e-05, + "loss": 0.2915, + "step": 8056 + }, + { + "epoch": 1.2612711333750783, + "grad_norm": 1.233955979347229, + "learning_rate": 3.85386119257087e-05, + "loss": 0.3098, + "step": 8057 + }, + { + "epoch": 1.2614276768941766, + "grad_norm": 1.4282591342926025, + "learning_rate": 3.8530465949820786e-05, + "loss": 0.497, + "step": 8058 + }, + { + "epoch": 1.2615842204132748, + "grad_norm": 2.206385374069214, + "learning_rate": 3.852231997393288e-05, + "loss": 0.7703, + "step": 8059 + }, + { + "epoch": 1.2617407639323732, + "grad_norm": 1.2983883619308472, + "learning_rate": 3.851417399804497e-05, + "loss": 0.6496, + "step": 8060 + }, + { + "epoch": 1.2618973074514714, + "grad_norm": 2.846193313598633, + "learning_rate": 3.850602802215705e-05, + "loss": 0.2393, + "step": 8061 + }, + { + "epoch": 1.2620538509705699, + "grad_norm": 2.879210948944092, + "learning_rate": 3.849788204626915e-05, + "loss": 0.7393, + "step": 8062 + }, + { + "epoch": 1.262210394489668, + "grad_norm": 2.1046836376190186, + "learning_rate": 3.848973607038123e-05, + "loss": 0.3197, + "step": 8063 + }, + { + "epoch": 1.2623669380087663, + "grad_norm": 2.2145674228668213, + "learning_rate": 3.848159009449332e-05, + "loss": 0.7453, + "step": 8064 + }, + { + "epoch": 1.2625234815278648, + "grad_norm": 2.113584518432617, + "learning_rate": 3.847344411860541e-05, + "loss": 0.6226, + "step": 8065 + }, + { + "epoch": 1.262680025046963, + "grad_norm": 1.870540738105774, + "learning_rate": 3.8465298142717496e-05, + "loss": 0.7862, + "step": 8066 + }, + { + "epoch": 1.2628365685660614, + "grad_norm": 6.934815406799316, + "learning_rate": 3.845715216682959e-05, + "loss": 1.059, + "step": 8067 + }, + { + "epoch": 1.2629931120851596, + "grad_norm": 2.1379897594451904, + "learning_rate": 3.844900619094168e-05, + "loss": 0.7958, + "step": 8068 + }, + { + "epoch": 1.2631496556042578, + "grad_norm": 1.788071632385254, + "learning_rate": 
3.844086021505376e-05, + "loss": 0.6915, + "step": 8069 + }, + { + "epoch": 1.2633061991233563, + "grad_norm": 1.702497124671936, + "learning_rate": 3.843271423916585e-05, + "loss": 0.4694, + "step": 8070 + }, + { + "epoch": 1.2634627426424547, + "grad_norm": 1.4374622106552124, + "learning_rate": 3.842456826327794e-05, + "loss": 0.3518, + "step": 8071 + }, + { + "epoch": 1.263619286161553, + "grad_norm": 2.7678065299987793, + "learning_rate": 3.841642228739003e-05, + "loss": 0.9313, + "step": 8072 + }, + { + "epoch": 1.2637758296806512, + "grad_norm": 2.9042587280273438, + "learning_rate": 3.8408276311502117e-05, + "loss": 0.69, + "step": 8073 + }, + { + "epoch": 1.2639323731997496, + "grad_norm": 3.12514591217041, + "learning_rate": 3.840013033561421e-05, + "loss": 0.954, + "step": 8074 + }, + { + "epoch": 1.2640889167188478, + "grad_norm": 2.984900951385498, + "learning_rate": 3.83919843597263e-05, + "loss": 0.5461, + "step": 8075 + }, + { + "epoch": 1.2642454602379463, + "grad_norm": 2.8380961418151855, + "learning_rate": 3.838383838383838e-05, + "loss": 0.8062, + "step": 8076 + }, + { + "epoch": 1.2644020037570445, + "grad_norm": 6.117973804473877, + "learning_rate": 3.837569240795048e-05, + "loss": 1.6907, + "step": 8077 + }, + { + "epoch": 1.2645585472761427, + "grad_norm": 3.6601719856262207, + "learning_rate": 3.836754643206256e-05, + "loss": 0.7193, + "step": 8078 + }, + { + "epoch": 1.2647150907952411, + "grad_norm": 7.531338214874268, + "learning_rate": 3.8359400456174646e-05, + "loss": 1.296, + "step": 8079 + }, + { + "epoch": 1.2648716343143394, + "grad_norm": 3.1144859790802, + "learning_rate": 3.8351254480286743e-05, + "loss": 0.8591, + "step": 8080 + }, + { + "epoch": 1.2650281778334378, + "grad_norm": 4.199008464813232, + "learning_rate": 3.834310850439883e-05, + "loss": 1.9279, + "step": 8081 + }, + { + "epoch": 1.265184721352536, + "grad_norm": 3.183302640914917, + "learning_rate": 3.833496252851092e-05, + "loss": 0.9562, + "step": 8082 + }, + { 
+ "epoch": 1.2653412648716342, + "grad_norm": 3.362990379333496, + "learning_rate": 3.832681655262301e-05, + "loss": 1.3106, + "step": 8083 + }, + { + "epoch": 1.2654978083907327, + "grad_norm": 2.4624454975128174, + "learning_rate": 3.831867057673509e-05, + "loss": 0.8464, + "step": 8084 + }, + { + "epoch": 1.2656543519098309, + "grad_norm": 2.19112229347229, + "learning_rate": 3.831052460084718e-05, + "loss": 0.3768, + "step": 8085 + }, + { + "epoch": 1.2658108954289293, + "grad_norm": 0.9861286878585815, + "learning_rate": 3.830237862495927e-05, + "loss": 0.095, + "step": 8086 + }, + { + "epoch": 1.2659674389480275, + "grad_norm": 2.9116809368133545, + "learning_rate": 3.829423264907136e-05, + "loss": 0.4899, + "step": 8087 + }, + { + "epoch": 1.2661239824671258, + "grad_norm": 1.878893256187439, + "learning_rate": 3.828608667318345e-05, + "loss": 0.5708, + "step": 8088 + }, + { + "epoch": 1.2662805259862242, + "grad_norm": 0.5216870903968811, + "learning_rate": 3.827794069729554e-05, + "loss": 0.2722, + "step": 8089 + }, + { + "epoch": 1.2664370695053224, + "grad_norm": 0.6317028403282166, + "learning_rate": 3.826979472140763e-05, + "loss": 0.247, + "step": 8090 + }, + { + "epoch": 1.2665936130244209, + "grad_norm": 0.7541641592979431, + "learning_rate": 3.826164874551971e-05, + "loss": 0.2592, + "step": 8091 + }, + { + "epoch": 1.266750156543519, + "grad_norm": 0.5593295097351074, + "learning_rate": 3.82535027696318e-05, + "loss": 0.207, + "step": 8092 + }, + { + "epoch": 1.2669067000626173, + "grad_norm": 0.6375908851623535, + "learning_rate": 3.824535679374389e-05, + "loss": 0.2294, + "step": 8093 + }, + { + "epoch": 1.2670632435817157, + "grad_norm": 0.8706074953079224, + "learning_rate": 3.823721081785598e-05, + "loss": 0.317, + "step": 8094 + }, + { + "epoch": 1.267219787100814, + "grad_norm": 0.7060977816581726, + "learning_rate": 3.822906484196807e-05, + "loss": 0.2918, + "step": 8095 + }, + { + "epoch": 1.2673763306199124, + "grad_norm": 
0.49593105912208557, + "learning_rate": 3.822091886608016e-05, + "loss": 0.1717, + "step": 8096 + }, + { + "epoch": 1.2675328741390106, + "grad_norm": 1.4731078147888184, + "learning_rate": 3.821277289019224e-05, + "loss": 0.3476, + "step": 8097 + }, + { + "epoch": 1.2676894176581088, + "grad_norm": 0.7835409045219421, + "learning_rate": 3.820462691430434e-05, + "loss": 0.3257, + "step": 8098 + }, + { + "epoch": 1.2678459611772073, + "grad_norm": 0.8455621004104614, + "learning_rate": 3.819648093841642e-05, + "loss": 0.3151, + "step": 8099 + }, + { + "epoch": 1.2680025046963057, + "grad_norm": 1.0854228734970093, + "learning_rate": 3.818833496252851e-05, + "loss": 0.2628, + "step": 8100 + }, + { + "epoch": 1.268159048215404, + "grad_norm": 2.5934884548187256, + "learning_rate": 3.8180188986640604e-05, + "loss": 0.628, + "step": 8101 + }, + { + "epoch": 1.2683155917345021, + "grad_norm": 0.8368053436279297, + "learning_rate": 3.817204301075269e-05, + "loss": 0.2254, + "step": 8102 + }, + { + "epoch": 1.2684721352536004, + "grad_norm": 0.9290556907653809, + "learning_rate": 3.816389703486478e-05, + "loss": 0.206, + "step": 8103 + }, + { + "epoch": 1.2686286787726988, + "grad_norm": 0.8159581422805786, + "learning_rate": 3.815575105897687e-05, + "loss": 0.4618, + "step": 8104 + }, + { + "epoch": 1.2687852222917972, + "grad_norm": 1.284554123878479, + "learning_rate": 3.814760508308895e-05, + "loss": 0.5535, + "step": 8105 + }, + { + "epoch": 1.2689417658108955, + "grad_norm": 1.7211121320724487, + "learning_rate": 3.813945910720104e-05, + "loss": 0.3277, + "step": 8106 + }, + { + "epoch": 1.2690983093299937, + "grad_norm": 1.7612743377685547, + "learning_rate": 3.8131313131313133e-05, + "loss": 0.4871, + "step": 8107 + }, + { + "epoch": 1.2692548528490921, + "grad_norm": 1.1259597539901733, + "learning_rate": 3.8123167155425224e-05, + "loss": 0.3582, + "step": 8108 + }, + { + "epoch": 1.2694113963681903, + "grad_norm": 1.1160728931427002, + "learning_rate": 
3.811502117953731e-05, + "loss": 0.4503, + "step": 8109 + }, + { + "epoch": 1.2695679398872888, + "grad_norm": 1.8098442554473877, + "learning_rate": 3.81068752036494e-05, + "loss": 0.5441, + "step": 8110 + }, + { + "epoch": 1.269724483406387, + "grad_norm": 1.4321788549423218, + "learning_rate": 3.809872922776149e-05, + "loss": 0.2284, + "step": 8111 + }, + { + "epoch": 1.2698810269254852, + "grad_norm": 2.7235727310180664, + "learning_rate": 3.809058325187357e-05, + "loss": 0.6684, + "step": 8112 + }, + { + "epoch": 1.2700375704445837, + "grad_norm": 1.6844605207443237, + "learning_rate": 3.808243727598566e-05, + "loss": 0.4852, + "step": 8113 + }, + { + "epoch": 1.2701941139636819, + "grad_norm": 2.1242153644561768, + "learning_rate": 3.8074291300097754e-05, + "loss": 0.6942, + "step": 8114 + }, + { + "epoch": 1.2703506574827803, + "grad_norm": 2.6160078048706055, + "learning_rate": 3.806614532420984e-05, + "loss": 0.5112, + "step": 8115 + }, + { + "epoch": 1.2705072010018785, + "grad_norm": 1.5263172388076782, + "learning_rate": 3.8057999348321935e-05, + "loss": 0.451, + "step": 8116 + }, + { + "epoch": 1.2706637445209767, + "grad_norm": 2.847381591796875, + "learning_rate": 3.804985337243402e-05, + "loss": 0.4981, + "step": 8117 + }, + { + "epoch": 1.2708202880400752, + "grad_norm": 6.708479881286621, + "learning_rate": 3.804170739654611e-05, + "loss": 0.9886, + "step": 8118 + }, + { + "epoch": 1.2709768315591734, + "grad_norm": 3.405620574951172, + "learning_rate": 3.80335614206582e-05, + "loss": 0.671, + "step": 8119 + }, + { + "epoch": 1.2711333750782718, + "grad_norm": 3.3337864875793457, + "learning_rate": 3.802541544477028e-05, + "loss": 0.8253, + "step": 8120 + }, + { + "epoch": 1.27128991859737, + "grad_norm": 2.6992087364196777, + "learning_rate": 3.8017269468882374e-05, + "loss": 0.686, + "step": 8121 + }, + { + "epoch": 1.2714464621164683, + "grad_norm": 2.0084950923919678, + "learning_rate": 3.8009123492994464e-05, + "loss": 0.6734, + "step": 8122 
+ }, + { + "epoch": 1.2716030056355667, + "grad_norm": 2.7462360858917236, + "learning_rate": 3.800097751710655e-05, + "loss": 0.8745, + "step": 8123 + }, + { + "epoch": 1.271759549154665, + "grad_norm": 2.299868106842041, + "learning_rate": 3.799283154121864e-05, + "loss": 1.3648, + "step": 8124 + }, + { + "epoch": 1.2719160926737634, + "grad_norm": 3.4578287601470947, + "learning_rate": 3.798468556533073e-05, + "loss": 0.9215, + "step": 8125 + }, + { + "epoch": 1.2720726361928616, + "grad_norm": 3.721229314804077, + "learning_rate": 3.797653958944282e-05, + "loss": 1.123, + "step": 8126 + }, + { + "epoch": 1.2722291797119598, + "grad_norm": 4.457612991333008, + "learning_rate": 3.79683936135549e-05, + "loss": 0.7678, + "step": 8127 + }, + { + "epoch": 1.2723857232310583, + "grad_norm": 4.358388900756836, + "learning_rate": 3.7960247637666994e-05, + "loss": 1.5638, + "step": 8128 + }, + { + "epoch": 1.2725422667501565, + "grad_norm": 10.28227710723877, + "learning_rate": 3.7952101661779084e-05, + "loss": 0.9173, + "step": 8129 + }, + { + "epoch": 1.272698810269255, + "grad_norm": 3.469728946685791, + "learning_rate": 3.794395568589117e-05, + "loss": 1.194, + "step": 8130 + }, + { + "epoch": 1.2728553537883531, + "grad_norm": 17.58767318725586, + "learning_rate": 3.793580971000326e-05, + "loss": 2.2115, + "step": 8131 + }, + { + "epoch": 1.2730118973074513, + "grad_norm": 3.296712875366211, + "learning_rate": 3.792766373411535e-05, + "loss": 1.0679, + "step": 8132 + }, + { + "epoch": 1.2731684408265498, + "grad_norm": 4.610718727111816, + "learning_rate": 3.791951775822743e-05, + "loss": 1.1891, + "step": 8133 + }, + { + "epoch": 1.2733249843456482, + "grad_norm": 2.6906161308288574, + "learning_rate": 3.791137178233953e-05, + "loss": 0.9093, + "step": 8134 + }, + { + "epoch": 1.2734815278647464, + "grad_norm": 5.330771446228027, + "learning_rate": 3.7903225806451614e-05, + "loss": 0.9538, + "step": 8135 + }, + { + "epoch": 1.2736380713838447, + "grad_norm": 
3.6233224868774414, + "learning_rate": 3.7895079830563704e-05, + "loss": 0.7792, + "step": 8136 + }, + { + "epoch": 1.273794614902943, + "grad_norm": 3.4868950843811035, + "learning_rate": 3.7886933854675795e-05, + "loss": 0.8091, + "step": 8137 + }, + { + "epoch": 1.2739511584220413, + "grad_norm": 2.325716495513916, + "learning_rate": 3.787878787878788e-05, + "loss": 0.5838, + "step": 8138 + }, + { + "epoch": 1.2741077019411398, + "grad_norm": 0.5783799886703491, + "learning_rate": 3.787064190289997e-05, + "loss": 0.2028, + "step": 8139 + }, + { + "epoch": 1.274264245460238, + "grad_norm": 0.8565154075622559, + "learning_rate": 3.786249592701206e-05, + "loss": 0.4153, + "step": 8140 + }, + { + "epoch": 1.2744207889793362, + "grad_norm": 0.6654836535453796, + "learning_rate": 3.7854349951124144e-05, + "loss": 0.2345, + "step": 8141 + }, + { + "epoch": 1.2745773324984346, + "grad_norm": 0.8324194550514221, + "learning_rate": 3.7846203975236234e-05, + "loss": 0.2442, + "step": 8142 + }, + { + "epoch": 1.2747338760175329, + "grad_norm": 0.5148590803146362, + "learning_rate": 3.7838057999348325e-05, + "loss": 0.1846, + "step": 8143 + }, + { + "epoch": 1.2748904195366313, + "grad_norm": 0.5895898342132568, + "learning_rate": 3.7829912023460415e-05, + "loss": 0.2746, + "step": 8144 + }, + { + "epoch": 1.2750469630557295, + "grad_norm": 0.6522838473320007, + "learning_rate": 3.78217660475725e-05, + "loss": 0.1933, + "step": 8145 + }, + { + "epoch": 1.2752035065748277, + "grad_norm": 0.7667749524116516, + "learning_rate": 3.781362007168459e-05, + "loss": 0.2314, + "step": 8146 + }, + { + "epoch": 1.2753600500939262, + "grad_norm": 0.8130273222923279, + "learning_rate": 3.780547409579668e-05, + "loss": 0.2161, + "step": 8147 + }, + { + "epoch": 1.2755165936130244, + "grad_norm": 0.8650034070014954, + "learning_rate": 3.7797328119908764e-05, + "loss": 0.1811, + "step": 8148 + }, + { + "epoch": 1.2756731371321228, + "grad_norm": 0.9838403463363647, + "learning_rate": 
3.7789182144020854e-05, + "loss": 0.2344, + "step": 8149 + }, + { + "epoch": 1.275829680651221, + "grad_norm": 0.678410530090332, + "learning_rate": 3.7781036168132945e-05, + "loss": 0.272, + "step": 8150 + }, + { + "epoch": 1.2759862241703193, + "grad_norm": 1.0979126691818237, + "learning_rate": 3.777289019224503e-05, + "loss": 0.3889, + "step": 8151 + }, + { + "epoch": 1.2761427676894177, + "grad_norm": 1.5956573486328125, + "learning_rate": 3.7764744216357126e-05, + "loss": 0.2584, + "step": 8152 + }, + { + "epoch": 1.276299311208516, + "grad_norm": 0.994066596031189, + "learning_rate": 3.775659824046921e-05, + "loss": 0.256, + "step": 8153 + }, + { + "epoch": 1.2764558547276144, + "grad_norm": 1.7044061422348022, + "learning_rate": 3.774845226458129e-05, + "loss": 0.4359, + "step": 8154 + }, + { + "epoch": 1.2766123982467126, + "grad_norm": 3.5214855670928955, + "learning_rate": 3.774030628869339e-05, + "loss": 0.7659, + "step": 8155 + }, + { + "epoch": 1.2767689417658108, + "grad_norm": 0.7349703907966614, + "learning_rate": 3.7732160312805474e-05, + "loss": 0.2288, + "step": 8156 + }, + { + "epoch": 1.2769254852849092, + "grad_norm": 1.2892565727233887, + "learning_rate": 3.7724014336917565e-05, + "loss": 0.4326, + "step": 8157 + }, + { + "epoch": 1.2770820288040075, + "grad_norm": 2.1205921173095703, + "learning_rate": 3.7715868361029655e-05, + "loss": 0.4073, + "step": 8158 + }, + { + "epoch": 1.277238572323106, + "grad_norm": 5.821913242340088, + "learning_rate": 3.770772238514174e-05, + "loss": 0.6394, + "step": 8159 + }, + { + "epoch": 1.277395115842204, + "grad_norm": 4.6258721351623535, + "learning_rate": 3.769957640925383e-05, + "loss": 0.8071, + "step": 8160 + }, + { + "epoch": 1.2775516593613023, + "grad_norm": 1.453572154045105, + "learning_rate": 3.769143043336592e-05, + "loss": 0.3391, + "step": 8161 + }, + { + "epoch": 1.2777082028804008, + "grad_norm": 1.7295039892196655, + "learning_rate": 3.768328445747801e-05, + "loss": 0.5717, + "step": 
8162 + }, + { + "epoch": 1.277864746399499, + "grad_norm": 1.3844341039657593, + "learning_rate": 3.7675138481590094e-05, + "loss": 0.3753, + "step": 8163 + }, + { + "epoch": 1.2780212899185974, + "grad_norm": 2.462177276611328, + "learning_rate": 3.7666992505702185e-05, + "loss": 0.6988, + "step": 8164 + }, + { + "epoch": 1.2781778334376956, + "grad_norm": 2.191981077194214, + "learning_rate": 3.7658846529814276e-05, + "loss": 0.6919, + "step": 8165 + }, + { + "epoch": 1.2783343769567939, + "grad_norm": 2.019040822982788, + "learning_rate": 3.765070055392636e-05, + "loss": 0.5186, + "step": 8166 + }, + { + "epoch": 1.2784909204758923, + "grad_norm": 1.8887407779693604, + "learning_rate": 3.764255457803845e-05, + "loss": 0.841, + "step": 8167 + }, + { + "epoch": 1.2786474639949907, + "grad_norm": 1.9255820512771606, + "learning_rate": 3.763440860215054e-05, + "loss": 0.6018, + "step": 8168 + }, + { + "epoch": 1.278804007514089, + "grad_norm": 2.453023910522461, + "learning_rate": 3.7626262626262624e-05, + "loss": 1.227, + "step": 8169 + }, + { + "epoch": 1.2789605510331872, + "grad_norm": 3.4767301082611084, + "learning_rate": 3.761811665037472e-05, + "loss": 0.8825, + "step": 8170 + }, + { + "epoch": 1.2791170945522856, + "grad_norm": 4.167951583862305, + "learning_rate": 3.7609970674486805e-05, + "loss": 0.7913, + "step": 8171 + }, + { + "epoch": 1.2792736380713838, + "grad_norm": 2.40873384475708, + "learning_rate": 3.760182469859889e-05, + "loss": 0.6308, + "step": 8172 + }, + { + "epoch": 1.2794301815904823, + "grad_norm": 3.1107287406921387, + "learning_rate": 3.7593678722710986e-05, + "loss": 0.9549, + "step": 8173 + }, + { + "epoch": 1.2795867251095805, + "grad_norm": 3.135653495788574, + "learning_rate": 3.758553274682307e-05, + "loss": 0.8798, + "step": 8174 + }, + { + "epoch": 1.2797432686286787, + "grad_norm": 3.22216534614563, + "learning_rate": 3.757738677093516e-05, + "loss": 1.0062, + "step": 8175 + }, + { + "epoch": 1.2798998121477771, + 
"grad_norm": 2.1665802001953125, + "learning_rate": 3.756924079504725e-05, + "loss": 1.0258, + "step": 8176 + }, + { + "epoch": 1.2800563556668754, + "grad_norm": 1.8157739639282227, + "learning_rate": 3.7561094819159335e-05, + "loss": 0.6806, + "step": 8177 + }, + { + "epoch": 1.2802128991859738, + "grad_norm": 5.391702651977539, + "learning_rate": 3.7552948843271425e-05, + "loss": 1.3071, + "step": 8178 + }, + { + "epoch": 1.280369442705072, + "grad_norm": 2.1497185230255127, + "learning_rate": 3.7544802867383516e-05, + "loss": 1.1733, + "step": 8179 + }, + { + "epoch": 1.2805259862241702, + "grad_norm": 3.53263521194458, + "learning_rate": 3.7536656891495606e-05, + "loss": 0.9357, + "step": 8180 + }, + { + "epoch": 1.2806825297432687, + "grad_norm": 7.4888129234313965, + "learning_rate": 3.752851091560769e-05, + "loss": 0.6715, + "step": 8181 + }, + { + "epoch": 1.280839073262367, + "grad_norm": 4.788745880126953, + "learning_rate": 3.752036493971978e-05, + "loss": 0.8623, + "step": 8182 + }, + { + "epoch": 1.2809956167814653, + "grad_norm": 3.723118305206299, + "learning_rate": 3.751221896383187e-05, + "loss": 0.6734, + "step": 8183 + }, + { + "epoch": 1.2811521603005636, + "grad_norm": 4.636291980743408, + "learning_rate": 3.7504072987943955e-05, + "loss": 0.8163, + "step": 8184 + }, + { + "epoch": 1.2813087038196618, + "grad_norm": 2.1254844665527344, + "learning_rate": 3.7495927012056045e-05, + "loss": 0.4324, + "step": 8185 + }, + { + "epoch": 1.2814652473387602, + "grad_norm": 1.6064908504486084, + "learning_rate": 3.7487781036168136e-05, + "loss": 0.5459, + "step": 8186 + }, + { + "epoch": 1.2816217908578584, + "grad_norm": 2.2636022567749023, + "learning_rate": 3.747963506028022e-05, + "loss": 0.8588, + "step": 8187 + }, + { + "epoch": 1.2817783343769569, + "grad_norm": 2.6124720573425293, + "learning_rate": 3.747148908439232e-05, + "loss": 0.635, + "step": 8188 + }, + { + "epoch": 1.281934877896055, + "grad_norm": 0.4595654010772705, + "learning_rate": 
3.74633431085044e-05, + "loss": 0.2167, + "step": 8189 + }, + { + "epoch": 1.2820914214151533, + "grad_norm": 2.2020394802093506, + "learning_rate": 3.7455197132616484e-05, + "loss": 0.9897, + "step": 8190 + }, + { + "epoch": 1.2822479649342517, + "grad_norm": 0.5967679023742676, + "learning_rate": 3.744705115672858e-05, + "loss": 0.1364, + "step": 8191 + }, + { + "epoch": 1.28240450845335, + "grad_norm": 0.4756145179271698, + "learning_rate": 3.7438905180840665e-05, + "loss": 0.1863, + "step": 8192 + }, + { + "epoch": 1.2825610519724484, + "grad_norm": 0.722423255443573, + "learning_rate": 3.7430759204952756e-05, + "loss": 0.3, + "step": 8193 + }, + { + "epoch": 1.2827175954915466, + "grad_norm": 1.0714317560195923, + "learning_rate": 3.7422613229064847e-05, + "loss": 0.2507, + "step": 8194 + }, + { + "epoch": 1.2828741390106448, + "grad_norm": 0.9064537882804871, + "learning_rate": 3.741446725317693e-05, + "loss": 0.2615, + "step": 8195 + }, + { + "epoch": 1.2830306825297433, + "grad_norm": 0.8969106078147888, + "learning_rate": 3.740632127728902e-05, + "loss": 0.2932, + "step": 8196 + }, + { + "epoch": 1.2831872260488415, + "grad_norm": 0.7680531144142151, + "learning_rate": 3.739817530140111e-05, + "loss": 0.2169, + "step": 8197 + }, + { + "epoch": 1.28334376956794, + "grad_norm": 1.0894557237625122, + "learning_rate": 3.7390029325513195e-05, + "loss": 0.2298, + "step": 8198 + }, + { + "epoch": 1.2835003130870382, + "grad_norm": 0.7047888040542603, + "learning_rate": 3.7381883349625286e-05, + "loss": 0.1585, + "step": 8199 + }, + { + "epoch": 1.2836568566061364, + "grad_norm": 0.6284682154655457, + "learning_rate": 3.7373737373737376e-05, + "loss": 0.2121, + "step": 8200 + }, + { + "epoch": 1.2838134001252348, + "grad_norm": 0.8962263464927673, + "learning_rate": 3.736559139784947e-05, + "loss": 0.2048, + "step": 8201 + }, + { + "epoch": 1.2839699436443333, + "grad_norm": 1.3182865381240845, + "learning_rate": 3.735744542196155e-05, + "loss": 0.4013, + "step": 
8202 + }, + { + "epoch": 1.2841264871634315, + "grad_norm": 1.3066051006317139, + "learning_rate": 3.734929944607364e-05, + "loss": 0.2115, + "step": 8203 + }, + { + "epoch": 1.2842830306825297, + "grad_norm": 1.4070496559143066, + "learning_rate": 3.734115347018573e-05, + "loss": 0.3406, + "step": 8204 + }, + { + "epoch": 1.2844395742016281, + "grad_norm": 2.31776762008667, + "learning_rate": 3.7333007494297815e-05, + "loss": 0.3909, + "step": 8205 + }, + { + "epoch": 1.2845961177207263, + "grad_norm": 1.0540151596069336, + "learning_rate": 3.732486151840991e-05, + "loss": 0.2301, + "step": 8206 + }, + { + "epoch": 1.2847526612398248, + "grad_norm": 1.5004738569259644, + "learning_rate": 3.7316715542521996e-05, + "loss": 0.3655, + "step": 8207 + }, + { + "epoch": 1.284909204758923, + "grad_norm": 3.1626126766204834, + "learning_rate": 3.730856956663408e-05, + "loss": 0.3212, + "step": 8208 + }, + { + "epoch": 1.2850657482780212, + "grad_norm": 3.8862380981445312, + "learning_rate": 3.730042359074618e-05, + "loss": 0.6632, + "step": 8209 + }, + { + "epoch": 1.2852222917971197, + "grad_norm": 2.3023641109466553, + "learning_rate": 3.729227761485826e-05, + "loss": 0.7375, + "step": 8210 + }, + { + "epoch": 1.2853788353162179, + "grad_norm": 2.199770212173462, + "learning_rate": 3.728413163897035e-05, + "loss": 0.5807, + "step": 8211 + }, + { + "epoch": 1.2855353788353163, + "grad_norm": 1.2923027276992798, + "learning_rate": 3.727598566308244e-05, + "loss": 0.302, + "step": 8212 + }, + { + "epoch": 1.2856919223544145, + "grad_norm": 1.5692330598831177, + "learning_rate": 3.7267839687194526e-05, + "loss": 0.5175, + "step": 8213 + }, + { + "epoch": 1.2858484658735128, + "grad_norm": 1.2270524501800537, + "learning_rate": 3.7259693711306616e-05, + "loss": 0.3237, + "step": 8214 + }, + { + "epoch": 1.2860050093926112, + "grad_norm": 3.185635566711426, + "learning_rate": 3.725154773541871e-05, + "loss": 0.5012, + "step": 8215 + }, + { + "epoch": 1.2861615529117094, + 
"grad_norm": 1.8321226835250854, + "learning_rate": 3.724340175953079e-05, + "loss": 0.4727, + "step": 8216 + }, + { + "epoch": 1.2863180964308079, + "grad_norm": 3.25809907913208, + "learning_rate": 3.723525578364288e-05, + "loss": 0.8064, + "step": 8217 + }, + { + "epoch": 1.286474639949906, + "grad_norm": 1.349090576171875, + "learning_rate": 3.722710980775497e-05, + "loss": 0.5349, + "step": 8218 + }, + { + "epoch": 1.2866311834690043, + "grad_norm": 2.5475761890411377, + "learning_rate": 3.721896383186706e-05, + "loss": 0.5843, + "step": 8219 + }, + { + "epoch": 1.2867877269881027, + "grad_norm": 3.1912875175476074, + "learning_rate": 3.7210817855979146e-05, + "loss": 0.8117, + "step": 8220 + }, + { + "epoch": 1.286944270507201, + "grad_norm": 5.790009498596191, + "learning_rate": 3.7202671880091237e-05, + "loss": 0.9092, + "step": 8221 + }, + { + "epoch": 1.2871008140262994, + "grad_norm": 8.302469253540039, + "learning_rate": 3.719452590420333e-05, + "loss": 0.5951, + "step": 8222 + }, + { + "epoch": 1.2872573575453976, + "grad_norm": 2.3686819076538086, + "learning_rate": 3.718637992831541e-05, + "loss": 0.573, + "step": 8223 + }, + { + "epoch": 1.2874139010644958, + "grad_norm": 3.5674939155578613, + "learning_rate": 3.717823395242751e-05, + "loss": 0.9936, + "step": 8224 + }, + { + "epoch": 1.2875704445835943, + "grad_norm": 6.967074394226074, + "learning_rate": 3.717008797653959e-05, + "loss": 1.3629, + "step": 8225 + }, + { + "epoch": 1.2877269881026925, + "grad_norm": 4.083107948303223, + "learning_rate": 3.7161942000651676e-05, + "loss": 0.9489, + "step": 8226 + }, + { + "epoch": 1.287883531621791, + "grad_norm": 4.960583686828613, + "learning_rate": 3.715379602476377e-05, + "loss": 1.1737, + "step": 8227 + }, + { + "epoch": 1.2880400751408891, + "grad_norm": 2.8449113368988037, + "learning_rate": 3.714565004887586e-05, + "loss": 0.868, + "step": 8228 + }, + { + "epoch": 1.2881966186599874, + "grad_norm": 3.9354259967803955, + "learning_rate": 
3.713750407298795e-05, + "loss": 0.6009, + "step": 8229 + }, + { + "epoch": 1.2883531621790858, + "grad_norm": 4.685842037200928, + "learning_rate": 3.712935809710004e-05, + "loss": 1.032, + "step": 8230 + }, + { + "epoch": 1.288509705698184, + "grad_norm": 4.600156307220459, + "learning_rate": 3.712121212121212e-05, + "loss": 1.454, + "step": 8231 + }, + { + "epoch": 1.2886662492172825, + "grad_norm": 4.006269454956055, + "learning_rate": 3.711306614532421e-05, + "loss": 0.851, + "step": 8232 + }, + { + "epoch": 1.2888227927363807, + "grad_norm": 2.447187662124634, + "learning_rate": 3.71049201694363e-05, + "loss": 0.755, + "step": 8233 + }, + { + "epoch": 1.288979336255479, + "grad_norm": 3.2001352310180664, + "learning_rate": 3.7096774193548386e-05, + "loss": 0.6123, + "step": 8234 + }, + { + "epoch": 1.2891358797745773, + "grad_norm": 1.6664212942123413, + "learning_rate": 3.708862821766048e-05, + "loss": 0.3115, + "step": 8235 + }, + { + "epoch": 1.2892924232936758, + "grad_norm": 2.4282877445220947, + "learning_rate": 3.708048224177257e-05, + "loss": 0.6626, + "step": 8236 + }, + { + "epoch": 1.289448966812774, + "grad_norm": 2.1909074783325195, + "learning_rate": 3.707233626588466e-05, + "loss": 0.2154, + "step": 8237 + }, + { + "epoch": 1.2896055103318722, + "grad_norm": 4.882100582122803, + "learning_rate": 3.706419028999674e-05, + "loss": 1.2685, + "step": 8238 + }, + { + "epoch": 1.2897620538509706, + "grad_norm": 0.4515455365180969, + "learning_rate": 3.705604431410883e-05, + "loss": 0.2052, + "step": 8239 + }, + { + "epoch": 1.2899185973700689, + "grad_norm": 0.5235328078269958, + "learning_rate": 3.704789833822092e-05, + "loss": 0.2218, + "step": 8240 + }, + { + "epoch": 1.2900751408891673, + "grad_norm": 0.7002595663070679, + "learning_rate": 3.7039752362333006e-05, + "loss": 0.2295, + "step": 8241 + }, + { + "epoch": 1.2902316844082655, + "grad_norm": 1.6836103200912476, + "learning_rate": 3.70316063864451e-05, + "loss": 0.2915, + "step": 8242 + }, 
+ { + "epoch": 1.2903882279273637, + "grad_norm": 0.5236932039260864, + "learning_rate": 3.702346041055719e-05, + "loss": 0.2403, + "step": 8243 + }, + { + "epoch": 1.2905447714464622, + "grad_norm": 0.8345181345939636, + "learning_rate": 3.701531443466927e-05, + "loss": 0.2549, + "step": 8244 + }, + { + "epoch": 1.2907013149655604, + "grad_norm": 0.8174896240234375, + "learning_rate": 3.700716845878137e-05, + "loss": 0.3171, + "step": 8245 + }, + { + "epoch": 1.2908578584846588, + "grad_norm": 1.3181581497192383, + "learning_rate": 3.699902248289345e-05, + "loss": 0.2895, + "step": 8246 + }, + { + "epoch": 1.291014402003757, + "grad_norm": 1.3533363342285156, + "learning_rate": 3.699087650700554e-05, + "loss": 0.304, + "step": 8247 + }, + { + "epoch": 1.2911709455228553, + "grad_norm": 1.009425163269043, + "learning_rate": 3.698273053111763e-05, + "loss": 0.3282, + "step": 8248 + }, + { + "epoch": 1.2913274890419537, + "grad_norm": 2.8146414756774902, + "learning_rate": 3.697458455522972e-05, + "loss": 0.3552, + "step": 8249 + }, + { + "epoch": 1.291484032561052, + "grad_norm": 1.3906856775283813, + "learning_rate": 3.696643857934181e-05, + "loss": 0.3682, + "step": 8250 + }, + { + "epoch": 1.2916405760801504, + "grad_norm": 1.3024194240570068, + "learning_rate": 3.69582926034539e-05, + "loss": 0.3016, + "step": 8251 + }, + { + "epoch": 1.2917971195992486, + "grad_norm": 0.8703611493110657, + "learning_rate": 3.695014662756598e-05, + "loss": 0.3827, + "step": 8252 + }, + { + "epoch": 1.2919536631183468, + "grad_norm": 1.0678985118865967, + "learning_rate": 3.694200065167807e-05, + "loss": 0.3407, + "step": 8253 + }, + { + "epoch": 1.2921102066374452, + "grad_norm": 1.273707389831543, + "learning_rate": 3.693385467579016e-05, + "loss": 0.4698, + "step": 8254 + }, + { + "epoch": 1.2922667501565435, + "grad_norm": 0.9281603097915649, + "learning_rate": 3.6925708699902253e-05, + "loss": 0.3072, + "step": 8255 + }, + { + "epoch": 1.292423293675642, + "grad_norm": 
0.7918943762779236, + "learning_rate": 3.691756272401434e-05, + "loss": 0.2136, + "step": 8256 + }, + { + "epoch": 1.2925798371947401, + "grad_norm": 1.6063213348388672, + "learning_rate": 3.690941674812643e-05, + "loss": 0.599, + "step": 8257 + }, + { + "epoch": 1.2927363807138383, + "grad_norm": 5.919332027435303, + "learning_rate": 3.690127077223852e-05, + "loss": 1.3107, + "step": 8258 + }, + { + "epoch": 1.2928929242329368, + "grad_norm": 1.694652795791626, + "learning_rate": 3.68931247963506e-05, + "loss": 0.5979, + "step": 8259 + }, + { + "epoch": 1.293049467752035, + "grad_norm": 1.2781790494918823, + "learning_rate": 3.688497882046269e-05, + "loss": 0.3322, + "step": 8260 + }, + { + "epoch": 1.2932060112711334, + "grad_norm": 2.7257227897644043, + "learning_rate": 3.687683284457478e-05, + "loss": 0.6038, + "step": 8261 + }, + { + "epoch": 1.2933625547902317, + "grad_norm": 1.4279322624206543, + "learning_rate": 3.686868686868687e-05, + "loss": 0.3884, + "step": 8262 + }, + { + "epoch": 1.2935190983093299, + "grad_norm": 1.4515399932861328, + "learning_rate": 3.6860540892798964e-05, + "loss": 0.4486, + "step": 8263 + }, + { + "epoch": 1.2936756418284283, + "grad_norm": 1.0842549800872803, + "learning_rate": 3.685239491691105e-05, + "loss": 0.3361, + "step": 8264 + }, + { + "epoch": 1.2938321853475265, + "grad_norm": 2.93871808052063, + "learning_rate": 3.684424894102314e-05, + "loss": 0.5181, + "step": 8265 + }, + { + "epoch": 1.293988728866625, + "grad_norm": 2.3792502880096436, + "learning_rate": 3.683610296513523e-05, + "loss": 0.5005, + "step": 8266 + }, + { + "epoch": 1.2941452723857232, + "grad_norm": 3.3524680137634277, + "learning_rate": 3.682795698924731e-05, + "loss": 0.7756, + "step": 8267 + }, + { + "epoch": 1.2943018159048214, + "grad_norm": 2.0374503135681152, + "learning_rate": 3.68198110133594e-05, + "loss": 0.6269, + "step": 8268 + }, + { + "epoch": 1.2944583594239198, + "grad_norm": 2.9174437522888184, + "learning_rate": 
3.6811665037471494e-05, + "loss": 0.8427, + "step": 8269 + }, + { + "epoch": 1.2946149029430183, + "grad_norm": 3.9024789333343506, + "learning_rate": 3.680351906158358e-05, + "loss": 0.6271, + "step": 8270 + }, + { + "epoch": 1.2947714464621165, + "grad_norm": 3.876448154449463, + "learning_rate": 3.679537308569567e-05, + "loss": 0.909, + "step": 8271 + }, + { + "epoch": 1.2949279899812147, + "grad_norm": 3.1957345008850098, + "learning_rate": 3.678722710980776e-05, + "loss": 1.153, + "step": 8272 + }, + { + "epoch": 1.2950845335003132, + "grad_norm": 1.605472445487976, + "learning_rate": 3.677908113391985e-05, + "loss": 0.489, + "step": 8273 + }, + { + "epoch": 1.2952410770194114, + "grad_norm": 3.715114116668701, + "learning_rate": 3.677093515803193e-05, + "loss": 0.7017, + "step": 8274 + }, + { + "epoch": 1.2953976205385098, + "grad_norm": 3.7314491271972656, + "learning_rate": 3.676278918214402e-05, + "loss": 0.8177, + "step": 8275 + }, + { + "epoch": 1.295554164057608, + "grad_norm": 3.1414425373077393, + "learning_rate": 3.6754643206256114e-05, + "loss": 0.7991, + "step": 8276 + }, + { + "epoch": 1.2957107075767063, + "grad_norm": 2.3962490558624268, + "learning_rate": 3.67464972303682e-05, + "loss": 0.8161, + "step": 8277 + }, + { + "epoch": 1.2958672510958047, + "grad_norm": 2.802091121673584, + "learning_rate": 3.673835125448029e-05, + "loss": 1.0028, + "step": 8278 + }, + { + "epoch": 1.296023794614903, + "grad_norm": 3.081627130508423, + "learning_rate": 3.673020527859238e-05, + "loss": 0.9543, + "step": 8279 + }, + { + "epoch": 1.2961803381340014, + "grad_norm": 3.0208446979522705, + "learning_rate": 3.672205930270446e-05, + "loss": 1.2325, + "step": 8280 + }, + { + "epoch": 1.2963368816530996, + "grad_norm": 3.770573377609253, + "learning_rate": 3.671391332681656e-05, + "loss": 1.3839, + "step": 8281 + }, + { + "epoch": 1.2964934251721978, + "grad_norm": 3.4973537921905518, + "learning_rate": 3.670576735092864e-05, + "loss": 1.4286, + "step": 8282 + 
}, + { + "epoch": 1.2966499686912962, + "grad_norm": 3.290574073791504, + "learning_rate": 3.669762137504073e-05, + "loss": 0.6731, + "step": 8283 + }, + { + "epoch": 1.2968065122103944, + "grad_norm": 4.016219139099121, + "learning_rate": 3.6689475399152824e-05, + "loss": 0.5353, + "step": 8284 + }, + { + "epoch": 1.2969630557294929, + "grad_norm": 1.9722002744674683, + "learning_rate": 3.668132942326491e-05, + "loss": 0.6531, + "step": 8285 + }, + { + "epoch": 1.297119599248591, + "grad_norm": 2.286285638809204, + "learning_rate": 3.6673183447377e-05, + "loss": 0.7763, + "step": 8286 + }, + { + "epoch": 1.2972761427676893, + "grad_norm": 5.073317050933838, + "learning_rate": 3.666503747148909e-05, + "loss": 0.7276, + "step": 8287 + }, + { + "epoch": 1.2974326862867878, + "grad_norm": 5.559406757354736, + "learning_rate": 3.665689149560117e-05, + "loss": 0.8097, + "step": 8288 + }, + { + "epoch": 1.297589229805886, + "grad_norm": 0.690601110458374, + "learning_rate": 3.6648745519713264e-05, + "loss": 0.1874, + "step": 8289 + }, + { + "epoch": 1.2977457733249844, + "grad_norm": 0.7728970050811768, + "learning_rate": 3.6640599543825354e-05, + "loss": 0.1747, + "step": 8290 + }, + { + "epoch": 1.2979023168440826, + "grad_norm": 0.8327479362487793, + "learning_rate": 3.6632453567937445e-05, + "loss": 0.2392, + "step": 8291 + }, + { + "epoch": 1.2980588603631809, + "grad_norm": 0.9761979579925537, + "learning_rate": 3.662430759204953e-05, + "loss": 0.2458, + "step": 8292 + }, + { + "epoch": 1.2982154038822793, + "grad_norm": 0.6985478401184082, + "learning_rate": 3.661616161616162e-05, + "loss": 0.1842, + "step": 8293 + }, + { + "epoch": 1.2983719474013775, + "grad_norm": 3.0407776832580566, + "learning_rate": 3.660801564027371e-05, + "loss": 0.4203, + "step": 8294 + }, + { + "epoch": 1.298528490920476, + "grad_norm": 1.0183478593826294, + "learning_rate": 3.659986966438579e-05, + "loss": 0.1774, + "step": 8295 + }, + { + "epoch": 1.2986850344395742, + "grad_norm": 
0.9781076908111572, + "learning_rate": 3.6591723688497884e-05, + "loss": 0.3467, + "step": 8296 + }, + { + "epoch": 1.2988415779586724, + "grad_norm": 1.1406937837600708, + "learning_rate": 3.6583577712609974e-05, + "loss": 0.3027, + "step": 8297 + }, + { + "epoch": 1.2989981214777708, + "grad_norm": 0.9285098314285278, + "learning_rate": 3.657543173672206e-05, + "loss": 0.3179, + "step": 8298 + }, + { + "epoch": 1.2991546649968693, + "grad_norm": 1.0037516355514526, + "learning_rate": 3.6567285760834155e-05, + "loss": 0.3096, + "step": 8299 + }, + { + "epoch": 1.2993112085159675, + "grad_norm": 1.3051728010177612, + "learning_rate": 3.655913978494624e-05, + "loss": 0.1959, + "step": 8300 + }, + { + "epoch": 1.2994677520350657, + "grad_norm": 0.9687951803207397, + "learning_rate": 3.655099380905832e-05, + "loss": 0.2333, + "step": 8301 + }, + { + "epoch": 1.299624295554164, + "grad_norm": 1.3615139722824097, + "learning_rate": 3.654284783317042e-05, + "loss": 0.2836, + "step": 8302 + }, + { + "epoch": 1.2997808390732624, + "grad_norm": 1.1166033744812012, + "learning_rate": 3.6534701857282504e-05, + "loss": 0.3403, + "step": 8303 + }, + { + "epoch": 1.2999373825923608, + "grad_norm": 1.7049620151519775, + "learning_rate": 3.6526555881394594e-05, + "loss": 0.2912, + "step": 8304 + }, + { + "epoch": 1.300093926111459, + "grad_norm": 0.8449323177337646, + "learning_rate": 3.6518409905506685e-05, + "loss": 0.1676, + "step": 8305 + }, + { + "epoch": 1.3002504696305572, + "grad_norm": 1.9239192008972168, + "learning_rate": 3.651026392961877e-05, + "loss": 0.5, + "step": 8306 + }, + { + "epoch": 1.3004070131496557, + "grad_norm": 3.2217142581939697, + "learning_rate": 3.650211795373086e-05, + "loss": 0.7275, + "step": 8307 + }, + { + "epoch": 1.300563556668754, + "grad_norm": 6.949481964111328, + "learning_rate": 3.649397197784295e-05, + "loss": 0.76, + "step": 8308 + }, + { + "epoch": 1.3007201001878523, + "grad_norm": 2.570607900619507, + "learning_rate": 
3.648582600195504e-05, + "loss": 0.454, + "step": 8309 + }, + { + "epoch": 1.3008766437069506, + "grad_norm": 2.5137858390808105, + "learning_rate": 3.6477680026067124e-05, + "loss": 0.5013, + "step": 8310 + }, + { + "epoch": 1.3010331872260488, + "grad_norm": 2.2659666538238525, + "learning_rate": 3.6469534050179214e-05, + "loss": 0.3908, + "step": 8311 + }, + { + "epoch": 1.3011897307451472, + "grad_norm": 5.997730255126953, + "learning_rate": 3.6461388074291305e-05, + "loss": 0.655, + "step": 8312 + }, + { + "epoch": 1.3013462742642454, + "grad_norm": 1.2290068864822388, + "learning_rate": 3.645324209840339e-05, + "loss": 0.2809, + "step": 8313 + }, + { + "epoch": 1.3015028177833439, + "grad_norm": 2.427107810974121, + "learning_rate": 3.644509612251548e-05, + "loss": 0.7139, + "step": 8314 + }, + { + "epoch": 1.301659361302442, + "grad_norm": 6.717464447021484, + "learning_rate": 3.643695014662757e-05, + "loss": 1.1795, + "step": 8315 + }, + { + "epoch": 1.3018159048215403, + "grad_norm": 1.5553152561187744, + "learning_rate": 3.6428804170739653e-05, + "loss": 0.5413, + "step": 8316 + }, + { + "epoch": 1.3019724483406387, + "grad_norm": 1.7322893142700195, + "learning_rate": 3.6420658194851744e-05, + "loss": 0.2925, + "step": 8317 + }, + { + "epoch": 1.302128991859737, + "grad_norm": 2.3240489959716797, + "learning_rate": 3.6412512218963835e-05, + "loss": 0.4585, + "step": 8318 + }, + { + "epoch": 1.3022855353788354, + "grad_norm": 1.9833545684814453, + "learning_rate": 3.640436624307592e-05, + "loss": 0.2468, + "step": 8319 + }, + { + "epoch": 1.3024420788979336, + "grad_norm": 1.8733137845993042, + "learning_rate": 3.639622026718801e-05, + "loss": 0.4472, + "step": 8320 + }, + { + "epoch": 1.3025986224170318, + "grad_norm": 6.76923131942749, + "learning_rate": 3.63880742913001e-05, + "loss": 0.6638, + "step": 8321 + }, + { + "epoch": 1.3027551659361303, + "grad_norm": 3.4282937049865723, + "learning_rate": 3.637992831541219e-05, + "loss": 0.9837, + "step": 
8322 + }, + { + "epoch": 1.3029117094552285, + "grad_norm": 1.7036755084991455, + "learning_rate": 3.6371782339524274e-05, + "loss": 0.2114, + "step": 8323 + }, + { + "epoch": 1.303068252974327, + "grad_norm": 2.814706325531006, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.6776, + "step": 8324 + }, + { + "epoch": 1.3032247964934252, + "grad_norm": 4.191710472106934, + "learning_rate": 3.6355490387748455e-05, + "loss": 0.7539, + "step": 8325 + }, + { + "epoch": 1.3033813400125234, + "grad_norm": 2.0552496910095215, + "learning_rate": 3.634734441186054e-05, + "loss": 1.089, + "step": 8326 + }, + { + "epoch": 1.3035378835316218, + "grad_norm": 3.4165596961975098, + "learning_rate": 3.633919843597263e-05, + "loss": 1.3885, + "step": 8327 + }, + { + "epoch": 1.30369442705072, + "grad_norm": 4.621950149536133, + "learning_rate": 3.633105246008472e-05, + "loss": 1.0737, + "step": 8328 + }, + { + "epoch": 1.3038509705698185, + "grad_norm": 5.255007266998291, + "learning_rate": 3.63229064841968e-05, + "loss": 0.8854, + "step": 8329 + }, + { + "epoch": 1.3040075140889167, + "grad_norm": 4.060128688812256, + "learning_rate": 3.63147605083089e-05, + "loss": 0.8646, + "step": 8330 + }, + { + "epoch": 1.304164057608015, + "grad_norm": 4.876949787139893, + "learning_rate": 3.6306614532420984e-05, + "loss": 1.8788, + "step": 8331 + }, + { + "epoch": 1.3043206011271133, + "grad_norm": 5.254243850708008, + "learning_rate": 3.6298468556533075e-05, + "loss": 1.8794, + "step": 8332 + }, + { + "epoch": 1.3044771446462118, + "grad_norm": 3.597507953643799, + "learning_rate": 3.6290322580645165e-05, + "loss": 1.2106, + "step": 8333 + }, + { + "epoch": 1.30463368816531, + "grad_norm": 5.750164985656738, + "learning_rate": 3.628217660475725e-05, + "loss": 0.6881, + "step": 8334 + }, + { + "epoch": 1.3047902316844082, + "grad_norm": 6.0576276779174805, + "learning_rate": 3.627403062886934e-05, + "loss": 1.007, + "step": 8335 + }, + { + "epoch": 1.3049467752035064, + "grad_norm": 
4.12968111038208, + "learning_rate": 3.626588465298143e-05, + "loss": 0.6473, + "step": 8336 + }, + { + "epoch": 1.3051033187226049, + "grad_norm": 5.8092732429504395, + "learning_rate": 3.6257738677093514e-05, + "loss": 0.658, + "step": 8337 + }, + { + "epoch": 1.3052598622417033, + "grad_norm": 4.17836856842041, + "learning_rate": 3.6249592701205604e-05, + "loss": 1.1471, + "step": 8338 + }, + { + "epoch": 1.3054164057608015, + "grad_norm": 0.7234653830528259, + "learning_rate": 3.6241446725317695e-05, + "loss": 0.2447, + "step": 8339 + }, + { + "epoch": 1.3055729492798998, + "grad_norm": 0.5241149067878723, + "learning_rate": 3.6233300749429785e-05, + "loss": 0.238, + "step": 8340 + }, + { + "epoch": 1.3057294927989982, + "grad_norm": 0.5405260324478149, + "learning_rate": 3.622515477354187e-05, + "loss": 0.2117, + "step": 8341 + }, + { + "epoch": 1.3058860363180964, + "grad_norm": 0.563108503818512, + "learning_rate": 3.621700879765396e-05, + "loss": 0.2154, + "step": 8342 + }, + { + "epoch": 1.3060425798371949, + "grad_norm": 1.9046828746795654, + "learning_rate": 3.620886282176605e-05, + "loss": 0.2657, + "step": 8343 + }, + { + "epoch": 1.306199123356293, + "grad_norm": 0.6183670163154602, + "learning_rate": 3.6200716845878134e-05, + "loss": 0.1925, + "step": 8344 + }, + { + "epoch": 1.3063556668753913, + "grad_norm": 0.4338443875312805, + "learning_rate": 3.6192570869990225e-05, + "loss": 0.1671, + "step": 8345 + }, + { + "epoch": 1.3065122103944897, + "grad_norm": 0.5254813432693481, + "learning_rate": 3.6184424894102315e-05, + "loss": 0.2175, + "step": 8346 + }, + { + "epoch": 1.306668753913588, + "grad_norm": 0.7840124368667603, + "learning_rate": 3.61762789182144e-05, + "loss": 0.1939, + "step": 8347 + }, + { + "epoch": 1.3068252974326864, + "grad_norm": 0.9246293902397156, + "learning_rate": 3.6168132942326496e-05, + "loss": 0.263, + "step": 8348 + }, + { + "epoch": 1.3069818409517846, + "grad_norm": 0.6950631141662598, + "learning_rate": 
3.615998696643858e-05, + "loss": 0.2443, + "step": 8349 + }, + { + "epoch": 1.3071383844708828, + "grad_norm": 2.297773838043213, + "learning_rate": 3.615184099055067e-05, + "loss": 0.4801, + "step": 8350 + }, + { + "epoch": 1.3072949279899813, + "grad_norm": 1.178663969039917, + "learning_rate": 3.614369501466276e-05, + "loss": 0.2541, + "step": 8351 + }, + { + "epoch": 1.3074514715090795, + "grad_norm": 1.39583420753479, + "learning_rate": 3.6135549038774845e-05, + "loss": 0.3642, + "step": 8352 + }, + { + "epoch": 1.307608015028178, + "grad_norm": 0.862809956073761, + "learning_rate": 3.6127403062886935e-05, + "loss": 0.2576, + "step": 8353 + }, + { + "epoch": 1.3077645585472761, + "grad_norm": 1.075594425201416, + "learning_rate": 3.6119257086999026e-05, + "loss": 0.293, + "step": 8354 + }, + { + "epoch": 1.3079211020663744, + "grad_norm": 1.4112396240234375, + "learning_rate": 3.611111111111111e-05, + "loss": 0.4001, + "step": 8355 + }, + { + "epoch": 1.3080776455854728, + "grad_norm": 1.2605395317077637, + "learning_rate": 3.61029651352232e-05, + "loss": 0.4023, + "step": 8356 + }, + { + "epoch": 1.308234189104571, + "grad_norm": 1.3537571430206299, + "learning_rate": 3.609481915933529e-05, + "loss": 0.2492, + "step": 8357 + }, + { + "epoch": 1.3083907326236695, + "grad_norm": 1.4173976182937622, + "learning_rate": 3.608667318344738e-05, + "loss": 0.7843, + "step": 8358 + }, + { + "epoch": 1.3085472761427677, + "grad_norm": 1.401224970817566, + "learning_rate": 3.6078527207559465e-05, + "loss": 0.2829, + "step": 8359 + }, + { + "epoch": 1.3087038196618659, + "grad_norm": 2.358093023300171, + "learning_rate": 3.6070381231671555e-05, + "loss": 0.7072, + "step": 8360 + }, + { + "epoch": 1.3088603631809643, + "grad_norm": 1.7753691673278809, + "learning_rate": 3.6062235255783646e-05, + "loss": 0.6047, + "step": 8361 + }, + { + "epoch": 1.3090169067000625, + "grad_norm": 1.5455163717269897, + "learning_rate": 3.605408927989573e-05, + "loss": 0.5031, + "step": 8362 
+ }, + { + "epoch": 1.309173450219161, + "grad_norm": 0.8948125839233398, + "learning_rate": 3.604594330400782e-05, + "loss": 0.3652, + "step": 8363 + }, + { + "epoch": 1.3093299937382592, + "grad_norm": 1.5980559587478638, + "learning_rate": 3.603779732811991e-05, + "loss": 0.6351, + "step": 8364 + }, + { + "epoch": 1.3094865372573574, + "grad_norm": 1.5320773124694824, + "learning_rate": 3.6029651352231994e-05, + "loss": 0.572, + "step": 8365 + }, + { + "epoch": 1.3096430807764559, + "grad_norm": 2.1251659393310547, + "learning_rate": 3.602150537634409e-05, + "loss": 0.7786, + "step": 8366 + }, + { + "epoch": 1.3097996242955543, + "grad_norm": 2.8889222145080566, + "learning_rate": 3.6013359400456175e-05, + "loss": 0.6901, + "step": 8367 + }, + { + "epoch": 1.3099561678146525, + "grad_norm": 3.3981573581695557, + "learning_rate": 3.6005213424568266e-05, + "loss": 0.6688, + "step": 8368 + }, + { + "epoch": 1.3101127113337507, + "grad_norm": 2.239867687225342, + "learning_rate": 3.5997067448680356e-05, + "loss": 0.4883, + "step": 8369 + }, + { + "epoch": 1.3102692548528492, + "grad_norm": 1.277032732963562, + "learning_rate": 3.598892147279244e-05, + "loss": 0.4669, + "step": 8370 + }, + { + "epoch": 1.3104257983719474, + "grad_norm": 2.768907308578491, + "learning_rate": 3.598077549690453e-05, + "loss": 0.6716, + "step": 8371 + }, + { + "epoch": 1.3105823418910458, + "grad_norm": 3.049144983291626, + "learning_rate": 3.597262952101662e-05, + "loss": 0.7628, + "step": 8372 + }, + { + "epoch": 1.310738885410144, + "grad_norm": 2.80165433883667, + "learning_rate": 3.5964483545128705e-05, + "loss": 0.6372, + "step": 8373 + }, + { + "epoch": 1.3108954289292423, + "grad_norm": 1.9509645700454712, + "learning_rate": 3.5956337569240796e-05, + "loss": 1.0285, + "step": 8374 + }, + { + "epoch": 1.3110519724483407, + "grad_norm": 2.356203079223633, + "learning_rate": 3.5948191593352886e-05, + "loss": 0.8344, + "step": 8375 + }, + { + "epoch": 1.311208515967439, + 
"grad_norm": 1.8994863033294678, + "learning_rate": 3.5940045617464977e-05, + "loss": 0.5002, + "step": 8376 + }, + { + "epoch": 1.3113650594865374, + "grad_norm": 3.437910795211792, + "learning_rate": 3.593189964157706e-05, + "loss": 0.9137, + "step": 8377 + }, + { + "epoch": 1.3115216030056356, + "grad_norm": 3.6035869121551514, + "learning_rate": 3.592375366568915e-05, + "loss": 1.2646, + "step": 8378 + }, + { + "epoch": 1.3116781465247338, + "grad_norm": 2.5926730632781982, + "learning_rate": 3.591560768980124e-05, + "loss": 0.9121, + "step": 8379 + }, + { + "epoch": 1.3118346900438322, + "grad_norm": 4.081279754638672, + "learning_rate": 3.5907461713913325e-05, + "loss": 0.4909, + "step": 8380 + }, + { + "epoch": 1.3119912335629305, + "grad_norm": 2.3783037662506104, + "learning_rate": 3.5899315738025416e-05, + "loss": 0.5734, + "step": 8381 + }, + { + "epoch": 1.312147777082029, + "grad_norm": 4.253009796142578, + "learning_rate": 3.5891169762137506e-05, + "loss": 1.7093, + "step": 8382 + }, + { + "epoch": 1.3123043206011271, + "grad_norm": 2.8035929203033447, + "learning_rate": 3.588302378624959e-05, + "loss": 1.0413, + "step": 8383 + }, + { + "epoch": 1.3124608641202253, + "grad_norm": 8.49101734161377, + "learning_rate": 3.587487781036169e-05, + "loss": 0.341, + "step": 8384 + }, + { + "epoch": 1.3126174076393238, + "grad_norm": 1.6296868324279785, + "learning_rate": 3.586673183447377e-05, + "loss": 0.4518, + "step": 8385 + }, + { + "epoch": 1.312773951158422, + "grad_norm": 2.9873158931732178, + "learning_rate": 3.5858585858585855e-05, + "loss": 0.5926, + "step": 8386 + }, + { + "epoch": 1.3129304946775204, + "grad_norm": 2.2243189811706543, + "learning_rate": 3.585043988269795e-05, + "loss": 0.7907, + "step": 8387 + }, + { + "epoch": 1.3130870381966186, + "grad_norm": 2.82812237739563, + "learning_rate": 3.5842293906810036e-05, + "loss": 0.909, + "step": 8388 + }, + { + "epoch": 1.3132435817157169, + "grad_norm": 0.4810287058353424, + "learning_rate": 
3.5834147930922126e-05, + "loss": 0.1823, + "step": 8389 + }, + { + "epoch": 1.3134001252348153, + "grad_norm": 0.5509448051452637, + "learning_rate": 3.582600195503422e-05, + "loss": 0.1761, + "step": 8390 + }, + { + "epoch": 1.3135566687539135, + "grad_norm": 0.6401727795600891, + "learning_rate": 3.58178559791463e-05, + "loss": 0.1729, + "step": 8391 + }, + { + "epoch": 1.313713212273012, + "grad_norm": 0.599826991558075, + "learning_rate": 3.580971000325839e-05, + "loss": 0.1946, + "step": 8392 + }, + { + "epoch": 1.3138697557921102, + "grad_norm": 0.6596594452857971, + "learning_rate": 3.580156402737048e-05, + "loss": 0.2864, + "step": 8393 + }, + { + "epoch": 1.3140262993112084, + "grad_norm": 0.7653840780258179, + "learning_rate": 3.579341805148257e-05, + "loss": 0.1859, + "step": 8394 + }, + { + "epoch": 1.3141828428303068, + "grad_norm": 1.2580124139785767, + "learning_rate": 3.5785272075594656e-05, + "loss": 0.2102, + "step": 8395 + }, + { + "epoch": 1.314339386349405, + "grad_norm": 0.7925068736076355, + "learning_rate": 3.5777126099706746e-05, + "loss": 0.2369, + "step": 8396 + }, + { + "epoch": 1.3144959298685035, + "grad_norm": 0.7634553909301758, + "learning_rate": 3.576898012381884e-05, + "loss": 0.2639, + "step": 8397 + }, + { + "epoch": 1.3146524733876017, + "grad_norm": 0.5611194968223572, + "learning_rate": 3.576083414793092e-05, + "loss": 0.2451, + "step": 8398 + }, + { + "epoch": 1.3148090169067, + "grad_norm": 1.5226852893829346, + "learning_rate": 3.575268817204301e-05, + "loss": 0.2762, + "step": 8399 + }, + { + "epoch": 1.3149655604257984, + "grad_norm": 0.5722342133522034, + "learning_rate": 3.57445421961551e-05, + "loss": 0.1616, + "step": 8400 + }, + { + "epoch": 1.3151221039448968, + "grad_norm": 1.0409742593765259, + "learning_rate": 3.5736396220267186e-05, + "loss": 0.3834, + "step": 8401 + }, + { + "epoch": 1.315278647463995, + "grad_norm": 2.582254648208618, + "learning_rate": 3.572825024437928e-05, + "loss": 0.371, + "step": 8402 
+ }, + { + "epoch": 1.3154351909830932, + "grad_norm": 1.3226311206817627, + "learning_rate": 3.5720104268491367e-05, + "loss": 0.3956, + "step": 8403 + }, + { + "epoch": 1.3155917345021917, + "grad_norm": 1.1846390962600708, + "learning_rate": 3.571195829260345e-05, + "loss": 0.362, + "step": 8404 + }, + { + "epoch": 1.31574827802129, + "grad_norm": 1.0749825239181519, + "learning_rate": 3.570381231671555e-05, + "loss": 0.338, + "step": 8405 + }, + { + "epoch": 1.3159048215403883, + "grad_norm": 1.5820978879928589, + "learning_rate": 3.569566634082763e-05, + "loss": 0.2532, + "step": 8406 + }, + { + "epoch": 1.3160613650594866, + "grad_norm": 1.6913187503814697, + "learning_rate": 3.568752036493972e-05, + "loss": 0.3394, + "step": 8407 + }, + { + "epoch": 1.3162179085785848, + "grad_norm": 1.282674789428711, + "learning_rate": 3.567937438905181e-05, + "loss": 0.2667, + "step": 8408 + }, + { + "epoch": 1.3163744520976832, + "grad_norm": 2.2425525188446045, + "learning_rate": 3.5671228413163896e-05, + "loss": 0.3706, + "step": 8409 + }, + { + "epoch": 1.3165309956167814, + "grad_norm": 1.8661149740219116, + "learning_rate": 3.566308243727599e-05, + "loss": 0.6167, + "step": 8410 + }, + { + "epoch": 1.3166875391358799, + "grad_norm": 0.5857168436050415, + "learning_rate": 3.565493646138808e-05, + "loss": 0.2647, + "step": 8411 + }, + { + "epoch": 1.316844082654978, + "grad_norm": 2.418144941329956, + "learning_rate": 3.564679048550017e-05, + "loss": 0.5876, + "step": 8412 + }, + { + "epoch": 1.3170006261740763, + "grad_norm": 1.706059217453003, + "learning_rate": 3.563864450961225e-05, + "loss": 0.4116, + "step": 8413 + }, + { + "epoch": 1.3171571696931748, + "grad_norm": 2.618663787841797, + "learning_rate": 3.563049853372434e-05, + "loss": 0.4871, + "step": 8414 + }, + { + "epoch": 1.317313713212273, + "grad_norm": 3.595963954925537, + "learning_rate": 3.562235255783643e-05, + "loss": 0.4992, + "step": 8415 + }, + { + "epoch": 1.3174702567313714, + "grad_norm": 
1.2214974164962769, + "learning_rate": 3.5614206581948516e-05, + "loss": 0.2923, + "step": 8416 + }, + { + "epoch": 1.3176268002504696, + "grad_norm": 1.1802163124084473, + "learning_rate": 3.560606060606061e-05, + "loss": 0.301, + "step": 8417 + }, + { + "epoch": 1.3177833437695678, + "grad_norm": 3.522930145263672, + "learning_rate": 3.55979146301727e-05, + "loss": 0.3765, + "step": 8418 + }, + { + "epoch": 1.3179398872886663, + "grad_norm": 5.202719688415527, + "learning_rate": 3.558976865428478e-05, + "loss": 0.5701, + "step": 8419 + }, + { + "epoch": 1.3180964308077645, + "grad_norm": 3.701211929321289, + "learning_rate": 3.558162267839688e-05, + "loss": 0.6596, + "step": 8420 + }, + { + "epoch": 1.318252974326863, + "grad_norm": 7.267334461212158, + "learning_rate": 3.557347670250896e-05, + "loss": 0.6066, + "step": 8421 + }, + { + "epoch": 1.3184095178459612, + "grad_norm": 2.4460484981536865, + "learning_rate": 3.5565330726621046e-05, + "loss": 0.9534, + "step": 8422 + }, + { + "epoch": 1.3185660613650594, + "grad_norm": 1.8347128629684448, + "learning_rate": 3.555718475073314e-05, + "loss": 1.0973, + "step": 8423 + }, + { + "epoch": 1.3187226048841578, + "grad_norm": 4.090438365936279, + "learning_rate": 3.554903877484523e-05, + "loss": 1.2048, + "step": 8424 + }, + { + "epoch": 1.318879148403256, + "grad_norm": 2.2385342121124268, + "learning_rate": 3.554089279895732e-05, + "loss": 1.1388, + "step": 8425 + }, + { + "epoch": 1.3190356919223545, + "grad_norm": 2.5285515785217285, + "learning_rate": 3.553274682306941e-05, + "loss": 0.9163, + "step": 8426 + }, + { + "epoch": 1.3191922354414527, + "grad_norm": 2.1527762413024902, + "learning_rate": 3.552460084718149e-05, + "loss": 0.3323, + "step": 8427 + }, + { + "epoch": 1.319348778960551, + "grad_norm": 3.7166354656219482, + "learning_rate": 3.551645487129358e-05, + "loss": 1.0827, + "step": 8428 + }, + { + "epoch": 1.3195053224796494, + "grad_norm": 3.070312261581421, + "learning_rate": 
3.550830889540567e-05, + "loss": 0.9512, + "step": 8429 + }, + { + "epoch": 1.3196618659987476, + "grad_norm": 3.5065159797668457, + "learning_rate": 3.5500162919517757e-05, + "loss": 0.9, + "step": 8430 + }, + { + "epoch": 1.319818409517846, + "grad_norm": 6.754910469055176, + "learning_rate": 3.549201694362985e-05, + "loss": 1.4416, + "step": 8431 + }, + { + "epoch": 1.3199749530369442, + "grad_norm": 5.400577545166016, + "learning_rate": 3.548387096774194e-05, + "loss": 0.9089, + "step": 8432 + }, + { + "epoch": 1.3201314965560424, + "grad_norm": 2.2845847606658936, + "learning_rate": 3.547572499185403e-05, + "loss": 1.1423, + "step": 8433 + }, + { + "epoch": 1.320288040075141, + "grad_norm": 2.015960693359375, + "learning_rate": 3.546757901596611e-05, + "loss": 0.4891, + "step": 8434 + }, + { + "epoch": 1.3204445835942393, + "grad_norm": 2.4525668621063232, + "learning_rate": 3.54594330400782e-05, + "loss": 0.5911, + "step": 8435 + }, + { + "epoch": 1.3206011271133375, + "grad_norm": 3.622922420501709, + "learning_rate": 3.545128706419029e-05, + "loss": 0.9947, + "step": 8436 + }, + { + "epoch": 1.3207576706324358, + "grad_norm": 3.7278690338134766, + "learning_rate": 3.544314108830238e-05, + "loss": 1.2979, + "step": 8437 + }, + { + "epoch": 1.3209142141515342, + "grad_norm": 1.5201191902160645, + "learning_rate": 3.5434995112414474e-05, + "loss": 0.9855, + "step": 8438 + }, + { + "epoch": 1.3210707576706324, + "grad_norm": 0.5269079208374023, + "learning_rate": 3.542684913652656e-05, + "loss": 0.1866, + "step": 8439 + }, + { + "epoch": 1.3212273011897309, + "grad_norm": 0.46003150939941406, + "learning_rate": 3.541870316063864e-05, + "loss": 0.1961, + "step": 8440 + }, + { + "epoch": 1.321383844708829, + "grad_norm": 0.594732403755188, + "learning_rate": 3.541055718475074e-05, + "loss": 0.2986, + "step": 8441 + }, + { + "epoch": 1.3215403882279273, + "grad_norm": 0.8863162398338318, + "learning_rate": 3.540241120886282e-05, + "loss": 0.2173, + "step": 8442 + 
}, + { + "epoch": 1.3216969317470257, + "grad_norm": 0.79352867603302, + "learning_rate": 3.539426523297491e-05, + "loss": 0.2195, + "step": 8443 + }, + { + "epoch": 1.321853475266124, + "grad_norm": 0.46303078532218933, + "learning_rate": 3.5386119257087004e-05, + "loss": 0.1963, + "step": 8444 + }, + { + "epoch": 1.3220100187852224, + "grad_norm": 0.6782279014587402, + "learning_rate": 3.537797328119909e-05, + "loss": 0.2018, + "step": 8445 + }, + { + "epoch": 1.3221665623043206, + "grad_norm": 1.5342501401901245, + "learning_rate": 3.536982730531118e-05, + "loss": 0.2402, + "step": 8446 + }, + { + "epoch": 1.3223231058234188, + "grad_norm": 0.6222476959228516, + "learning_rate": 3.536168132942327e-05, + "loss": 0.2973, + "step": 8447 + }, + { + "epoch": 1.3224796493425173, + "grad_norm": 0.6118439435958862, + "learning_rate": 3.535353535353535e-05, + "loss": 0.1614, + "step": 8448 + }, + { + "epoch": 1.3226361928616155, + "grad_norm": 1.543129324913025, + "learning_rate": 3.534538937764744e-05, + "loss": 0.2414, + "step": 8449 + }, + { + "epoch": 1.322792736380714, + "grad_norm": 0.9899714589118958, + "learning_rate": 3.533724340175953e-05, + "loss": 0.4405, + "step": 8450 + }, + { + "epoch": 1.3229492798998121, + "grad_norm": 2.5661861896514893, + "learning_rate": 3.5329097425871624e-05, + "loss": 0.5399, + "step": 8451 + }, + { + "epoch": 1.3231058234189104, + "grad_norm": 1.149288296699524, + "learning_rate": 3.532095144998371e-05, + "loss": 0.354, + "step": 8452 + }, + { + "epoch": 1.3232623669380088, + "grad_norm": 2.7846317291259766, + "learning_rate": 3.53128054740958e-05, + "loss": 0.4693, + "step": 8453 + }, + { + "epoch": 1.323418910457107, + "grad_norm": 1.3100318908691406, + "learning_rate": 3.530465949820789e-05, + "loss": 0.3524, + "step": 8454 + }, + { + "epoch": 1.3235754539762055, + "grad_norm": 1.4325578212738037, + "learning_rate": 3.529651352231997e-05, + "loss": 0.4864, + "step": 8455 + }, + { + "epoch": 1.3237319974953037, + "grad_norm": 
2.0184271335601807, + "learning_rate": 3.528836754643207e-05, + "loss": 0.4303, + "step": 8456 + }, + { + "epoch": 1.323888541014402, + "grad_norm": 0.6871387958526611, + "learning_rate": 3.528022157054415e-05, + "loss": 0.2375, + "step": 8457 + }, + { + "epoch": 1.3240450845335003, + "grad_norm": 1.9836632013320923, + "learning_rate": 3.527207559465624e-05, + "loss": 0.5903, + "step": 8458 + }, + { + "epoch": 1.3242016280525986, + "grad_norm": 1.4640228748321533, + "learning_rate": 3.5263929618768334e-05, + "loss": 0.2692, + "step": 8459 + }, + { + "epoch": 1.324358171571697, + "grad_norm": 1.6339555978775024, + "learning_rate": 3.525578364288042e-05, + "loss": 0.1927, + "step": 8460 + }, + { + "epoch": 1.3245147150907952, + "grad_norm": 1.4160233736038208, + "learning_rate": 3.524763766699251e-05, + "loss": 0.3086, + "step": 8461 + }, + { + "epoch": 1.3246712586098934, + "grad_norm": 1.5622040033340454, + "learning_rate": 3.52394916911046e-05, + "loss": 0.3897, + "step": 8462 + }, + { + "epoch": 1.3248278021289919, + "grad_norm": 0.9581974148750305, + "learning_rate": 3.523134571521668e-05, + "loss": 0.3058, + "step": 8463 + }, + { + "epoch": 1.32498434564809, + "grad_norm": 2.091980457305908, + "learning_rate": 3.5223199739328773e-05, + "loss": 0.4025, + "step": 8464 + }, + { + "epoch": 1.3251408891671885, + "grad_norm": 1.6223695278167725, + "learning_rate": 3.5215053763440864e-05, + "loss": 0.4057, + "step": 8465 + }, + { + "epoch": 1.3252974326862867, + "grad_norm": 3.624971866607666, + "learning_rate": 3.520690778755295e-05, + "loss": 0.712, + "step": 8466 + }, + { + "epoch": 1.325453976205385, + "grad_norm": 3.524052381515503, + "learning_rate": 3.519876181166504e-05, + "loss": 0.7493, + "step": 8467 + }, + { + "epoch": 1.3256105197244834, + "grad_norm": 2.4064857959747314, + "learning_rate": 3.519061583577713e-05, + "loss": 0.8866, + "step": 8468 + }, + { + "epoch": 1.3257670632435818, + "grad_norm": 2.0751688480377197, + "learning_rate": 
3.518246985988922e-05, + "loss": 0.5488, + "step": 8469 + }, + { + "epoch": 1.32592360676268, + "grad_norm": 6.076294422149658, + "learning_rate": 3.51743238840013e-05, + "loss": 0.941, + "step": 8470 + }, + { + "epoch": 1.3260801502817783, + "grad_norm": 5.948340892791748, + "learning_rate": 3.5166177908113394e-05, + "loss": 0.8527, + "step": 8471 + }, + { + "epoch": 1.3262366938008767, + "grad_norm": 3.086989402770996, + "learning_rate": 3.5158031932225484e-05, + "loss": 1.0439, + "step": 8472 + }, + { + "epoch": 1.326393237319975, + "grad_norm": 3.229024887084961, + "learning_rate": 3.514988595633757e-05, + "loss": 1.1843, + "step": 8473 + }, + { + "epoch": 1.3265497808390734, + "grad_norm": 4.449682712554932, + "learning_rate": 3.514173998044966e-05, + "loss": 0.9801, + "step": 8474 + }, + { + "epoch": 1.3267063243581716, + "grad_norm": 4.552778244018555, + "learning_rate": 3.513359400456175e-05, + "loss": 1.0438, + "step": 8475 + }, + { + "epoch": 1.3268628678772698, + "grad_norm": 2.86601185798645, + "learning_rate": 3.512544802867383e-05, + "loss": 1.1076, + "step": 8476 + }, + { + "epoch": 1.3270194113963683, + "grad_norm": 3.568070888519287, + "learning_rate": 3.511730205278593e-05, + "loss": 1.2333, + "step": 8477 + }, + { + "epoch": 1.3271759549154665, + "grad_norm": 2.7777373790740967, + "learning_rate": 3.5109156076898014e-05, + "loss": 0.9403, + "step": 8478 + }, + { + "epoch": 1.327332498434565, + "grad_norm": 4.398962497711182, + "learning_rate": 3.5101010101010104e-05, + "loss": 1.1793, + "step": 8479 + }, + { + "epoch": 1.3274890419536631, + "grad_norm": 3.417990207672119, + "learning_rate": 3.5092864125122195e-05, + "loss": 1.4676, + "step": 8480 + }, + { + "epoch": 1.3276455854727613, + "grad_norm": 5.640031814575195, + "learning_rate": 3.508471814923428e-05, + "loss": 1.1197, + "step": 8481 + }, + { + "epoch": 1.3278021289918598, + "grad_norm": 3.421017646789551, + "learning_rate": 3.507657217334637e-05, + "loss": 0.9751, + "step": 8482 + }, + 
{ + "epoch": 1.327958672510958, + "grad_norm": 3.373124122619629, + "learning_rate": 3.506842619745846e-05, + "loss": 1.0111, + "step": 8483 + }, + { + "epoch": 1.3281152160300564, + "grad_norm": 2.8926467895507812, + "learning_rate": 3.506028022157054e-05, + "loss": 1.0105, + "step": 8484 + }, + { + "epoch": 1.3282717595491547, + "grad_norm": 2.588656187057495, + "learning_rate": 3.5052134245682634e-05, + "loss": 0.3026, + "step": 8485 + }, + { + "epoch": 1.3284283030682529, + "grad_norm": 4.531556129455566, + "learning_rate": 3.5043988269794724e-05, + "loss": 0.9989, + "step": 8486 + }, + { + "epoch": 1.3285848465873513, + "grad_norm": 4.955917835235596, + "learning_rate": 3.5035842293906815e-05, + "loss": 1.191, + "step": 8487 + }, + { + "epoch": 1.3287413901064495, + "grad_norm": 2.2187604904174805, + "learning_rate": 3.50276963180189e-05, + "loss": 0.8362, + "step": 8488 + }, + { + "epoch": 1.328897933625548, + "grad_norm": 0.4675837755203247, + "learning_rate": 3.501955034213099e-05, + "loss": 0.1901, + "step": 8489 + }, + { + "epoch": 1.3290544771446462, + "grad_norm": 0.7385925650596619, + "learning_rate": 3.501140436624308e-05, + "loss": 0.3493, + "step": 8490 + }, + { + "epoch": 1.3292110206637444, + "grad_norm": 0.7834662199020386, + "learning_rate": 3.5003258390355163e-05, + "loss": 0.2153, + "step": 8491 + }, + { + "epoch": 1.3293675641828429, + "grad_norm": 0.6386341452598572, + "learning_rate": 3.4995112414467254e-05, + "loss": 0.2086, + "step": 8492 + }, + { + "epoch": 1.329524107701941, + "grad_norm": 0.616392195224762, + "learning_rate": 3.4986966438579344e-05, + "loss": 0.1958, + "step": 8493 + }, + { + "epoch": 1.3296806512210395, + "grad_norm": 0.587738573551178, + "learning_rate": 3.497882046269143e-05, + "loss": 0.198, + "step": 8494 + }, + { + "epoch": 1.3298371947401377, + "grad_norm": 0.647150993347168, + "learning_rate": 3.4970674486803526e-05, + "loss": 0.1112, + "step": 8495 + }, + { + "epoch": 1.329993738259236, + "grad_norm": 
0.7944263815879822, + "learning_rate": 3.496252851091561e-05, + "loss": 0.2793, + "step": 8496 + }, + { + "epoch": 1.3301502817783344, + "grad_norm": 0.6324779987335205, + "learning_rate": 3.49543825350277e-05, + "loss": 0.2692, + "step": 8497 + }, + { + "epoch": 1.3303068252974326, + "grad_norm": 1.7483203411102295, + "learning_rate": 3.494623655913979e-05, + "loss": 0.2286, + "step": 8498 + }, + { + "epoch": 1.330463368816531, + "grad_norm": 0.9984942674636841, + "learning_rate": 3.4938090583251874e-05, + "loss": 0.1682, + "step": 8499 + }, + { + "epoch": 1.3306199123356293, + "grad_norm": 1.1153671741485596, + "learning_rate": 3.4929944607363965e-05, + "loss": 0.3368, + "step": 8500 + }, + { + "epoch": 1.3307764558547275, + "grad_norm": 1.9307458400726318, + "learning_rate": 3.4921798631476055e-05, + "loss": 0.2864, + "step": 8501 + }, + { + "epoch": 1.330932999373826, + "grad_norm": 1.6151777505874634, + "learning_rate": 3.491365265558814e-05, + "loss": 0.5573, + "step": 8502 + }, + { + "epoch": 1.3310895428929244, + "grad_norm": 1.1734970808029175, + "learning_rate": 3.490550667970023e-05, + "loss": 0.5107, + "step": 8503 + }, + { + "epoch": 1.3312460864120226, + "grad_norm": 1.2298996448516846, + "learning_rate": 3.489736070381232e-05, + "loss": 0.1869, + "step": 8504 + }, + { + "epoch": 1.3314026299311208, + "grad_norm": 0.8931962847709656, + "learning_rate": 3.488921472792441e-05, + "loss": 0.3101, + "step": 8505 + }, + { + "epoch": 1.3315591734502192, + "grad_norm": 2.5522069931030273, + "learning_rate": 3.4881068752036494e-05, + "loss": 0.3949, + "step": 8506 + }, + { + "epoch": 1.3317157169693175, + "grad_norm": 1.3891721963882446, + "learning_rate": 3.4872922776148585e-05, + "loss": 0.2528, + "step": 8507 + }, + { + "epoch": 1.331872260488416, + "grad_norm": 2.153671979904175, + "learning_rate": 3.4864776800260675e-05, + "loss": 0.271, + "step": 8508 + }, + { + "epoch": 1.3320288040075141, + "grad_norm": 2.180036783218384, + "learning_rate": 
3.485663082437276e-05, + "loss": 0.4888, + "step": 8509 + }, + { + "epoch": 1.3321853475266123, + "grad_norm": 1.2948036193847656, + "learning_rate": 3.484848484848485e-05, + "loss": 0.3685, + "step": 8510 + }, + { + "epoch": 1.3323418910457108, + "grad_norm": 3.0337564945220947, + "learning_rate": 3.484033887259694e-05, + "loss": 0.7211, + "step": 8511 + }, + { + "epoch": 1.332498434564809, + "grad_norm": 2.1143248081207275, + "learning_rate": 3.4832192896709024e-05, + "loss": 0.6653, + "step": 8512 + }, + { + "epoch": 1.3326549780839074, + "grad_norm": 3.841062068939209, + "learning_rate": 3.482404692082112e-05, + "loss": 0.7232, + "step": 8513 + }, + { + "epoch": 1.3328115216030056, + "grad_norm": 2.247379779815674, + "learning_rate": 3.4815900944933205e-05, + "loss": 0.5673, + "step": 8514 + }, + { + "epoch": 1.3329680651221039, + "grad_norm": 1.3758397102355957, + "learning_rate": 3.480775496904529e-05, + "loss": 0.4978, + "step": 8515 + }, + { + "epoch": 1.3331246086412023, + "grad_norm": 1.6916828155517578, + "learning_rate": 3.4799608993157386e-05, + "loss": 0.7545, + "step": 8516 + }, + { + "epoch": 1.3332811521603005, + "grad_norm": 3.160736560821533, + "learning_rate": 3.479146301726947e-05, + "loss": 0.601, + "step": 8517 + }, + { + "epoch": 1.333437695679399, + "grad_norm": 2.1640846729278564, + "learning_rate": 3.478331704138156e-05, + "loss": 0.8378, + "step": 8518 + }, + { + "epoch": 1.3335942391984972, + "grad_norm": 3.0255305767059326, + "learning_rate": 3.477517106549365e-05, + "loss": 0.8595, + "step": 8519 + }, + { + "epoch": 1.3337507827175954, + "grad_norm": 3.239607810974121, + "learning_rate": 3.4767025089605734e-05, + "loss": 0.5336, + "step": 8520 + }, + { + "epoch": 1.3339073262366938, + "grad_norm": 1.4567298889160156, + "learning_rate": 3.4758879113717825e-05, + "loss": 0.4192, + "step": 8521 + }, + { + "epoch": 1.334063869755792, + "grad_norm": 3.313661575317383, + "learning_rate": 3.4750733137829916e-05, + "loss": 0.853, + "step": 
8522 + }, + { + "epoch": 1.3342204132748905, + "grad_norm": 1.6384788751602173, + "learning_rate": 3.4742587161942006e-05, + "loss": 0.6594, + "step": 8523 + }, + { + "epoch": 1.3343769567939887, + "grad_norm": 1.4710774421691895, + "learning_rate": 3.473444118605409e-05, + "loss": 0.6279, + "step": 8524 + }, + { + "epoch": 1.334533500313087, + "grad_norm": 3.4232258796691895, + "learning_rate": 3.472629521016618e-05, + "loss": 0.7204, + "step": 8525 + }, + { + "epoch": 1.3346900438321854, + "grad_norm": 3.2995142936706543, + "learning_rate": 3.471814923427827e-05, + "loss": 0.762, + "step": 8526 + }, + { + "epoch": 1.3348465873512836, + "grad_norm": 2.6816959381103516, + "learning_rate": 3.4710003258390355e-05, + "loss": 1.0935, + "step": 8527 + }, + { + "epoch": 1.335003130870382, + "grad_norm": 3.2564897537231445, + "learning_rate": 3.4701857282502445e-05, + "loss": 1.4617, + "step": 8528 + }, + { + "epoch": 1.3351596743894802, + "grad_norm": 3.0503525733947754, + "learning_rate": 3.4693711306614536e-05, + "loss": 1.1169, + "step": 8529 + }, + { + "epoch": 1.3353162179085785, + "grad_norm": 5.497310638427734, + "learning_rate": 3.468556533072662e-05, + "loss": 1.4125, + "step": 8530 + }, + { + "epoch": 1.335472761427677, + "grad_norm": 2.423121213912964, + "learning_rate": 3.467741935483872e-05, + "loss": 0.7867, + "step": 8531 + }, + { + "epoch": 1.3356293049467753, + "grad_norm": 2.8122169971466064, + "learning_rate": 3.46692733789508e-05, + "loss": 0.6025, + "step": 8532 + }, + { + "epoch": 1.3357858484658736, + "grad_norm": 5.084628105163574, + "learning_rate": 3.4661127403062884e-05, + "loss": 0.6424, + "step": 8533 + }, + { + "epoch": 1.3359423919849718, + "grad_norm": 3.429636240005493, + "learning_rate": 3.465298142717498e-05, + "loss": 0.6806, + "step": 8534 + }, + { + "epoch": 1.33609893550407, + "grad_norm": 2.7919554710388184, + "learning_rate": 3.4644835451287065e-05, + "loss": 0.4697, + "step": 8535 + }, + { + "epoch": 1.3362554790231684, + 
"grad_norm": 2.650437116622925, + "learning_rate": 3.4636689475399156e-05, + "loss": 0.8018, + "step": 8536 + }, + { + "epoch": 1.3364120225422669, + "grad_norm": 2.3600454330444336, + "learning_rate": 3.4628543499511246e-05, + "loss": 0.6631, + "step": 8537 + }, + { + "epoch": 1.336568566061365, + "grad_norm": 3.590238094329834, + "learning_rate": 3.462039752362333e-05, + "loss": 0.8444, + "step": 8538 + }, + { + "epoch": 1.3367251095804633, + "grad_norm": 0.39657554030418396, + "learning_rate": 3.461225154773542e-05, + "loss": 0.2215, + "step": 8539 + }, + { + "epoch": 1.3368816530995618, + "grad_norm": 0.763107419013977, + "learning_rate": 3.460410557184751e-05, + "loss": 0.2254, + "step": 8540 + }, + { + "epoch": 1.33703819661866, + "grad_norm": 0.5544085502624512, + "learning_rate": 3.45959595959596e-05, + "loss": 0.2396, + "step": 8541 + }, + { + "epoch": 1.3371947401377584, + "grad_norm": 0.7665224671363831, + "learning_rate": 3.4587813620071685e-05, + "loss": 0.2894, + "step": 8542 + }, + { + "epoch": 1.3373512836568566, + "grad_norm": 1.5328655242919922, + "learning_rate": 3.4579667644183776e-05, + "loss": 0.2546, + "step": 8543 + }, + { + "epoch": 1.3375078271759548, + "grad_norm": 0.769133448600769, + "learning_rate": 3.4571521668295866e-05, + "loss": 0.2795, + "step": 8544 + }, + { + "epoch": 1.3376643706950533, + "grad_norm": 0.5221086144447327, + "learning_rate": 3.456337569240795e-05, + "loss": 0.169, + "step": 8545 + }, + { + "epoch": 1.3378209142141515, + "grad_norm": 0.7057730555534363, + "learning_rate": 3.455522971652004e-05, + "loss": 0.2675, + "step": 8546 + }, + { + "epoch": 1.33797745773325, + "grad_norm": 0.8677318096160889, + "learning_rate": 3.454708374063213e-05, + "loss": 0.231, + "step": 8547 + }, + { + "epoch": 1.3381340012523482, + "grad_norm": 0.6023112535476685, + "learning_rate": 3.4538937764744215e-05, + "loss": 0.2404, + "step": 8548 + }, + { + "epoch": 1.3382905447714464, + "grad_norm": 1.0225276947021484, + "learning_rate": 
3.453079178885631e-05, + "loss": 0.2619, + "step": 8549 + }, + { + "epoch": 1.3384470882905448, + "grad_norm": 1.7177293300628662, + "learning_rate": 3.4522645812968396e-05, + "loss": 0.3168, + "step": 8550 + }, + { + "epoch": 1.338603631809643, + "grad_norm": 0.9468241930007935, + "learning_rate": 3.451449983708048e-05, + "loss": 0.2543, + "step": 8551 + }, + { + "epoch": 1.3387601753287415, + "grad_norm": 1.316314697265625, + "learning_rate": 3.450635386119258e-05, + "loss": 0.2531, + "step": 8552 + }, + { + "epoch": 1.3389167188478397, + "grad_norm": 1.0557522773742676, + "learning_rate": 3.449820788530466e-05, + "loss": 0.2705, + "step": 8553 + }, + { + "epoch": 1.339073262366938, + "grad_norm": 2.2281219959259033, + "learning_rate": 3.449006190941675e-05, + "loss": 0.4512, + "step": 8554 + }, + { + "epoch": 1.3392298058860364, + "grad_norm": 1.4612858295440674, + "learning_rate": 3.448191593352884e-05, + "loss": 0.497, + "step": 8555 + }, + { + "epoch": 1.3393863494051346, + "grad_norm": 3.740870952606201, + "learning_rate": 3.4473769957640926e-05, + "loss": 0.4001, + "step": 8556 + }, + { + "epoch": 1.339542892924233, + "grad_norm": 1.4971380233764648, + "learning_rate": 3.4465623981753016e-05, + "loss": 0.4318, + "step": 8557 + }, + { + "epoch": 1.3396994364433312, + "grad_norm": 1.2865182161331177, + "learning_rate": 3.445747800586511e-05, + "loss": 0.2715, + "step": 8558 + }, + { + "epoch": 1.3398559799624294, + "grad_norm": 1.4822709560394287, + "learning_rate": 3.444933202997719e-05, + "loss": 0.3629, + "step": 8559 + }, + { + "epoch": 1.3400125234815279, + "grad_norm": 1.571848750114441, + "learning_rate": 3.444118605408928e-05, + "loss": 0.2083, + "step": 8560 + }, + { + "epoch": 1.340169067000626, + "grad_norm": 1.4497569799423218, + "learning_rate": 3.443304007820137e-05, + "loss": 0.3189, + "step": 8561 + }, + { + "epoch": 1.3403256105197245, + "grad_norm": 1.8920315504074097, + "learning_rate": 3.442489410231346e-05, + "loss": 0.6558, + "step": 
8562 + }, + { + "epoch": 1.3404821540388228, + "grad_norm": 9.250653266906738, + "learning_rate": 3.4416748126425546e-05, + "loss": 0.8756, + "step": 8563 + }, + { + "epoch": 1.340638697557921, + "grad_norm": 1.924851655960083, + "learning_rate": 3.4408602150537636e-05, + "loss": 0.4506, + "step": 8564 + }, + { + "epoch": 1.3407952410770194, + "grad_norm": 1.8420242071151733, + "learning_rate": 3.440045617464973e-05, + "loss": 0.3997, + "step": 8565 + }, + { + "epoch": 1.3409517845961179, + "grad_norm": 5.671468257904053, + "learning_rate": 3.439231019876181e-05, + "loss": 0.5846, + "step": 8566 + }, + { + "epoch": 1.341108328115216, + "grad_norm": 2.170936346054077, + "learning_rate": 3.438416422287391e-05, + "loss": 0.6455, + "step": 8567 + }, + { + "epoch": 1.3412648716343143, + "grad_norm": 2.267292022705078, + "learning_rate": 3.437601824698599e-05, + "loss": 0.7309, + "step": 8568 + }, + { + "epoch": 1.3414214151534125, + "grad_norm": 2.76926326751709, + "learning_rate": 3.4367872271098075e-05, + "loss": 0.8268, + "step": 8569 + }, + { + "epoch": 1.341577958672511, + "grad_norm": 1.5377984046936035, + "learning_rate": 3.4359726295210166e-05, + "loss": 0.5349, + "step": 8570 + }, + { + "epoch": 1.3417345021916094, + "grad_norm": 4.835992813110352, + "learning_rate": 3.4351580319322256e-05, + "loss": 0.7763, + "step": 8571 + }, + { + "epoch": 1.3418910457107076, + "grad_norm": 2.5367648601531982, + "learning_rate": 3.434343434343435e-05, + "loss": 0.8993, + "step": 8572 + }, + { + "epoch": 1.3420475892298058, + "grad_norm": 2.5833725929260254, + "learning_rate": 3.433528836754643e-05, + "loss": 0.4572, + "step": 8573 + }, + { + "epoch": 1.3422041327489043, + "grad_norm": 2.5807135105133057, + "learning_rate": 3.432714239165852e-05, + "loss": 0.8781, + "step": 8574 + }, + { + "epoch": 1.3423606762680025, + "grad_norm": 3.8804471492767334, + "learning_rate": 3.431899641577061e-05, + "loss": 0.952, + "step": 8575 + }, + { + "epoch": 1.342517219787101, + 
"grad_norm": 3.4788148403167725, + "learning_rate": 3.4310850439882695e-05, + "loss": 0.7882, + "step": 8576 + }, + { + "epoch": 1.3426737633061991, + "grad_norm": 2.7166504859924316, + "learning_rate": 3.4302704463994786e-05, + "loss": 0.7976, + "step": 8577 + }, + { + "epoch": 1.3428303068252974, + "grad_norm": 9.429838180541992, + "learning_rate": 3.4294558488106877e-05, + "loss": 1.197, + "step": 8578 + }, + { + "epoch": 1.3429868503443958, + "grad_norm": 4.496863842010498, + "learning_rate": 3.428641251221896e-05, + "loss": 1.3423, + "step": 8579 + }, + { + "epoch": 1.343143393863494, + "grad_norm": 4.382953643798828, + "learning_rate": 3.427826653633106e-05, + "loss": 1.0895, + "step": 8580 + }, + { + "epoch": 1.3432999373825925, + "grad_norm": 4.08974027633667, + "learning_rate": 3.427012056044314e-05, + "loss": 0.4387, + "step": 8581 + }, + { + "epoch": 1.3434564809016907, + "grad_norm": 4.366562366485596, + "learning_rate": 3.426197458455523e-05, + "loss": 0.5868, + "step": 8582 + }, + { + "epoch": 1.343613024420789, + "grad_norm": 3.017238140106201, + "learning_rate": 3.425382860866732e-05, + "loss": 0.7396, + "step": 8583 + }, + { + "epoch": 1.3437695679398873, + "grad_norm": 2.6422348022460938, + "learning_rate": 3.4245682632779406e-05, + "loss": 0.8973, + "step": 8584 + }, + { + "epoch": 1.3439261114589856, + "grad_norm": 3.0161664485931396, + "learning_rate": 3.42375366568915e-05, + "loss": 0.6174, + "step": 8585 + }, + { + "epoch": 1.344082654978084, + "grad_norm": 4.018739223480225, + "learning_rate": 3.422939068100359e-05, + "loss": 1.2241, + "step": 8586 + }, + { + "epoch": 1.3442391984971822, + "grad_norm": 3.061509847640991, + "learning_rate": 3.422124470511567e-05, + "loss": 1.0312, + "step": 8587 + }, + { + "epoch": 1.3443957420162804, + "grad_norm": 3.57681941986084, + "learning_rate": 3.421309872922776e-05, + "loss": 0.6294, + "step": 8588 + }, + { + "epoch": 1.3445522855353789, + "grad_norm": 0.325825572013855, + "learning_rate": 
3.420495275333985e-05, + "loss": 0.1446, + "step": 8589 + }, + { + "epoch": 1.344708829054477, + "grad_norm": 0.5533137917518616, + "learning_rate": 3.419680677745194e-05, + "loss": 0.1898, + "step": 8590 + }, + { + "epoch": 1.3448653725735755, + "grad_norm": 0.6314570903778076, + "learning_rate": 3.4188660801564026e-05, + "loss": 0.283, + "step": 8591 + }, + { + "epoch": 1.3450219160926737, + "grad_norm": 0.47087395191192627, + "learning_rate": 3.418051482567612e-05, + "loss": 0.1789, + "step": 8592 + }, + { + "epoch": 1.345178459611772, + "grad_norm": 0.7400456666946411, + "learning_rate": 3.417236884978821e-05, + "loss": 0.2782, + "step": 8593 + }, + { + "epoch": 1.3453350031308704, + "grad_norm": 1.1663942337036133, + "learning_rate": 3.416422287390029e-05, + "loss": 0.3091, + "step": 8594 + }, + { + "epoch": 1.3454915466499686, + "grad_norm": 0.5840210914611816, + "learning_rate": 3.415607689801238e-05, + "loss": 0.2312, + "step": 8595 + }, + { + "epoch": 1.345648090169067, + "grad_norm": 0.5137709379196167, + "learning_rate": 3.414793092212447e-05, + "loss": 0.1368, + "step": 8596 + }, + { + "epoch": 1.3458046336881653, + "grad_norm": 2.095010757446289, + "learning_rate": 3.4139784946236556e-05, + "loss": 0.3108, + "step": 8597 + }, + { + "epoch": 1.3459611772072635, + "grad_norm": 1.138384461402893, + "learning_rate": 3.413163897034865e-05, + "loss": 0.3391, + "step": 8598 + }, + { + "epoch": 1.346117720726362, + "grad_norm": 1.040691614151001, + "learning_rate": 3.412349299446074e-05, + "loss": 0.3547, + "step": 8599 + }, + { + "epoch": 1.3462742642454604, + "grad_norm": 1.000227928161621, + "learning_rate": 3.411534701857283e-05, + "loss": 0.4161, + "step": 8600 + }, + { + "epoch": 1.3464308077645586, + "grad_norm": 1.921265959739685, + "learning_rate": 3.410720104268492e-05, + "loss": 0.4868, + "step": 8601 + }, + { + "epoch": 1.3465873512836568, + "grad_norm": 1.087766408920288, + "learning_rate": 3.4099055066797e-05, + "loss": 0.3303, + "step": 8602 + 
}, + { + "epoch": 1.3467438948027552, + "grad_norm": 0.8267583250999451, + "learning_rate": 3.409090909090909e-05, + "loss": 0.2274, + "step": 8603 + }, + { + "epoch": 1.3469004383218535, + "grad_norm": 1.2344897985458374, + "learning_rate": 3.408276311502118e-05, + "loss": 0.2991, + "step": 8604 + }, + { + "epoch": 1.347056981840952, + "grad_norm": 1.3016506433486938, + "learning_rate": 3.4074617139133267e-05, + "loss": 0.3577, + "step": 8605 + }, + { + "epoch": 1.3472135253600501, + "grad_norm": 1.235203504562378, + "learning_rate": 3.406647116324536e-05, + "loss": 0.4933, + "step": 8606 + }, + { + "epoch": 1.3473700688791483, + "grad_norm": 1.5250542163848877, + "learning_rate": 3.405832518735745e-05, + "loss": 0.4702, + "step": 8607 + }, + { + "epoch": 1.3475266123982468, + "grad_norm": 1.589677333831787, + "learning_rate": 3.405017921146954e-05, + "loss": 0.4584, + "step": 8608 + }, + { + "epoch": 1.347683155917345, + "grad_norm": 1.7251396179199219, + "learning_rate": 3.404203323558162e-05, + "loss": 0.577, + "step": 8609 + }, + { + "epoch": 1.3478396994364434, + "grad_norm": 1.8857855796813965, + "learning_rate": 3.403388725969371e-05, + "loss": 0.5397, + "step": 8610 + }, + { + "epoch": 1.3479962429555417, + "grad_norm": 2.5450491905212402, + "learning_rate": 3.40257412838058e-05, + "loss": 0.3512, + "step": 8611 + }, + { + "epoch": 1.3481527864746399, + "grad_norm": 3.1982269287109375, + "learning_rate": 3.401759530791789e-05, + "loss": 0.4364, + "step": 8612 + }, + { + "epoch": 1.3483093299937383, + "grad_norm": 2.0947189331054688, + "learning_rate": 3.400944933202998e-05, + "loss": 0.7424, + "step": 8613 + }, + { + "epoch": 1.3484658735128365, + "grad_norm": 2.3970444202423096, + "learning_rate": 3.400130335614207e-05, + "loss": 0.8898, + "step": 8614 + }, + { + "epoch": 1.348622417031935, + "grad_norm": 1.7338210344314575, + "learning_rate": 3.399315738025415e-05, + "loss": 0.451, + "step": 8615 + }, + { + "epoch": 1.3487789605510332, + "grad_norm": 
1.9922966957092285, + "learning_rate": 3.398501140436625e-05, + "loss": 0.6149, + "step": 8616 + }, + { + "epoch": 1.3489355040701314, + "grad_norm": 2.002521514892578, + "learning_rate": 3.397686542847833e-05, + "loss": 0.8048, + "step": 8617 + }, + { + "epoch": 1.3490920475892298, + "grad_norm": 2.610513687133789, + "learning_rate": 3.3968719452590416e-05, + "loss": 0.7521, + "step": 8618 + }, + { + "epoch": 1.349248591108328, + "grad_norm": 1.5836657285690308, + "learning_rate": 3.3960573476702514e-05, + "loss": 0.5364, + "step": 8619 + }, + { + "epoch": 1.3494051346274265, + "grad_norm": 1.442020297050476, + "learning_rate": 3.39524275008146e-05, + "loss": 0.6903, + "step": 8620 + }, + { + "epoch": 1.3495616781465247, + "grad_norm": 3.379625082015991, + "learning_rate": 3.394428152492669e-05, + "loss": 0.8931, + "step": 8621 + }, + { + "epoch": 1.349718221665623, + "grad_norm": 1.7931550741195679, + "learning_rate": 3.393613554903878e-05, + "loss": 0.6814, + "step": 8622 + }, + { + "epoch": 1.3498747651847214, + "grad_norm": 4.374965190887451, + "learning_rate": 3.392798957315086e-05, + "loss": 1.1726, + "step": 8623 + }, + { + "epoch": 1.3500313087038196, + "grad_norm": 2.682018756866455, + "learning_rate": 3.391984359726295e-05, + "loss": 0.4243, + "step": 8624 + }, + { + "epoch": 1.350187852222918, + "grad_norm": 1.7653383016586304, + "learning_rate": 3.391169762137504e-05, + "loss": 0.8055, + "step": 8625 + }, + { + "epoch": 1.3503443957420163, + "grad_norm": 2.907439947128296, + "learning_rate": 3.3903551645487134e-05, + "loss": 1.1565, + "step": 8626 + }, + { + "epoch": 1.3505009392611145, + "grad_norm": 2.5115749835968018, + "learning_rate": 3.389540566959922e-05, + "loss": 1.0624, + "step": 8627 + }, + { + "epoch": 1.350657482780213, + "grad_norm": 3.5605645179748535, + "learning_rate": 3.388725969371131e-05, + "loss": 1.0108, + "step": 8628 + }, + { + "epoch": 1.3508140262993111, + "grad_norm": 2.9082274436950684, + "learning_rate": 
3.38791137178234e-05, + "loss": 0.9243, + "step": 8629 + }, + { + "epoch": 1.3509705698184096, + "grad_norm": 4.4317851066589355, + "learning_rate": 3.387096774193548e-05, + "loss": 0.9188, + "step": 8630 + }, + { + "epoch": 1.3511271133375078, + "grad_norm": 2.10827374458313, + "learning_rate": 3.386282176604757e-05, + "loss": 1.0148, + "step": 8631 + }, + { + "epoch": 1.351283656856606, + "grad_norm": 0.9828125238418579, + "learning_rate": 3.385467579015966e-05, + "loss": 0.3446, + "step": 8632 + }, + { + "epoch": 1.3514402003757044, + "grad_norm": 8.85916805267334, + "learning_rate": 3.384652981427175e-05, + "loss": 1.0031, + "step": 8633 + }, + { + "epoch": 1.3515967438948029, + "grad_norm": 2.982647180557251, + "learning_rate": 3.3838383838383844e-05, + "loss": 1.2217, + "step": 8634 + }, + { + "epoch": 1.351753287413901, + "grad_norm": 3.838351249694824, + "learning_rate": 3.383023786249593e-05, + "loss": 0.7627, + "step": 8635 + }, + { + "epoch": 1.3519098309329993, + "grad_norm": 2.284728765487671, + "learning_rate": 3.382209188660801e-05, + "loss": 1.1229, + "step": 8636 + }, + { + "epoch": 1.3520663744520978, + "grad_norm": 4.109117031097412, + "learning_rate": 3.381394591072011e-05, + "loss": 0.7239, + "step": 8637 + }, + { + "epoch": 1.352222917971196, + "grad_norm": 7.208752155303955, + "learning_rate": 3.380579993483219e-05, + "loss": 0.9481, + "step": 8638 + }, + { + "epoch": 1.3523794614902944, + "grad_norm": 0.3924144506454468, + "learning_rate": 3.3797653958944283e-05, + "loss": 0.1794, + "step": 8639 + }, + { + "epoch": 1.3525360050093926, + "grad_norm": 0.39488011598587036, + "learning_rate": 3.3789507983056374e-05, + "loss": 0.2121, + "step": 8640 + }, + { + "epoch": 1.3526925485284909, + "grad_norm": 0.8870707154273987, + "learning_rate": 3.378136200716846e-05, + "loss": 0.2634, + "step": 8641 + }, + { + "epoch": 1.3528490920475893, + "grad_norm": 0.5173534154891968, + "learning_rate": 3.377321603128055e-05, + "loss": 0.2717, + "step": 8642 + 
}, + { + "epoch": 1.3530056355666875, + "grad_norm": 0.5873249769210815, + "learning_rate": 3.376507005539264e-05, + "loss": 0.145, + "step": 8643 + }, + { + "epoch": 1.353162179085786, + "grad_norm": 0.9445502161979675, + "learning_rate": 3.375692407950473e-05, + "loss": 0.3073, + "step": 8644 + }, + { + "epoch": 1.3533187226048842, + "grad_norm": 0.8846638202667236, + "learning_rate": 3.374877810361681e-05, + "loss": 0.2446, + "step": 8645 + }, + { + "epoch": 1.3534752661239824, + "grad_norm": 0.4679293930530548, + "learning_rate": 3.3740632127728904e-05, + "loss": 0.1997, + "step": 8646 + }, + { + "epoch": 1.3536318096430808, + "grad_norm": 1.0892565250396729, + "learning_rate": 3.3732486151840994e-05, + "loss": 0.3263, + "step": 8647 + }, + { + "epoch": 1.353788353162179, + "grad_norm": 1.1345254182815552, + "learning_rate": 3.372434017595308e-05, + "loss": 0.2805, + "step": 8648 + }, + { + "epoch": 1.3539448966812775, + "grad_norm": 0.5193237662315369, + "learning_rate": 3.371619420006517e-05, + "loss": 0.1829, + "step": 8649 + }, + { + "epoch": 1.3541014402003757, + "grad_norm": 0.6953559517860413, + "learning_rate": 3.370804822417726e-05, + "loss": 0.2817, + "step": 8650 + }, + { + "epoch": 1.354257983719474, + "grad_norm": 0.5030224919319153, + "learning_rate": 3.369990224828934e-05, + "loss": 0.1485, + "step": 8651 + }, + { + "epoch": 1.3544145272385724, + "grad_norm": 2.1322057247161865, + "learning_rate": 3.369175627240144e-05, + "loss": 0.3426, + "step": 8652 + }, + { + "epoch": 1.3545710707576706, + "grad_norm": 1.634839653968811, + "learning_rate": 3.3683610296513524e-05, + "loss": 0.2693, + "step": 8653 + }, + { + "epoch": 1.354727614276769, + "grad_norm": 1.6242371797561646, + "learning_rate": 3.367546432062561e-05, + "loss": 0.2753, + "step": 8654 + }, + { + "epoch": 1.3548841577958672, + "grad_norm": 1.62703537940979, + "learning_rate": 3.3667318344737705e-05, + "loss": 0.4941, + "step": 8655 + }, + { + "epoch": 1.3550407013149655, + "grad_norm": 
1.8387484550476074, + "learning_rate": 3.365917236884979e-05, + "loss": 0.5073, + "step": 8656 + }, + { + "epoch": 1.355197244834064, + "grad_norm": 2.7662055492401123, + "learning_rate": 3.365102639296188e-05, + "loss": 0.3399, + "step": 8657 + }, + { + "epoch": 1.3553537883531621, + "grad_norm": 1.5255297422409058, + "learning_rate": 3.364288041707397e-05, + "loss": 0.2577, + "step": 8658 + }, + { + "epoch": 1.3555103318722606, + "grad_norm": 1.867443323135376, + "learning_rate": 3.363473444118605e-05, + "loss": 0.4308, + "step": 8659 + }, + { + "epoch": 1.3556668753913588, + "grad_norm": 4.475526332855225, + "learning_rate": 3.3626588465298144e-05, + "loss": 0.4155, + "step": 8660 + }, + { + "epoch": 1.355823418910457, + "grad_norm": 2.0113000869750977, + "learning_rate": 3.3618442489410234e-05, + "loss": 0.7718, + "step": 8661 + }, + { + "epoch": 1.3559799624295554, + "grad_norm": 1.8548848628997803, + "learning_rate": 3.361029651352232e-05, + "loss": 0.7366, + "step": 8662 + }, + { + "epoch": 1.3561365059486536, + "grad_norm": 1.183732271194458, + "learning_rate": 3.360215053763441e-05, + "loss": 0.335, + "step": 8663 + }, + { + "epoch": 1.356293049467752, + "grad_norm": 1.9238051176071167, + "learning_rate": 3.35940045617465e-05, + "loss": 0.3768, + "step": 8664 + }, + { + "epoch": 1.3564495929868503, + "grad_norm": 3.2654316425323486, + "learning_rate": 3.358585858585859e-05, + "loss": 0.4213, + "step": 8665 + }, + { + "epoch": 1.3566061365059485, + "grad_norm": 3.4783692359924316, + "learning_rate": 3.357771260997067e-05, + "loss": 0.703, + "step": 8666 + }, + { + "epoch": 1.356762680025047, + "grad_norm": 1.9044610261917114, + "learning_rate": 3.3569566634082764e-05, + "loss": 0.4557, + "step": 8667 + }, + { + "epoch": 1.3569192235441454, + "grad_norm": 1.6792100667953491, + "learning_rate": 3.3561420658194854e-05, + "loss": 0.4943, + "step": 8668 + }, + { + "epoch": 1.3570757670632436, + "grad_norm": 2.897214889526367, + "learning_rate": 
3.355327468230694e-05, + "loss": 0.9895, + "step": 8669 + }, + { + "epoch": 1.3572323105823418, + "grad_norm": 2.064365863800049, + "learning_rate": 3.3545128706419035e-05, + "loss": 0.4354, + "step": 8670 + }, + { + "epoch": 1.3573888541014403, + "grad_norm": 3.454066276550293, + "learning_rate": 3.353698273053112e-05, + "loss": 0.4273, + "step": 8671 + }, + { + "epoch": 1.3575453976205385, + "grad_norm": 3.4278404712677, + "learning_rate": 3.35288367546432e-05, + "loss": 0.9922, + "step": 8672 + }, + { + "epoch": 1.357701941139637, + "grad_norm": 2.46130633354187, + "learning_rate": 3.35206907787553e-05, + "loss": 0.8517, + "step": 8673 + }, + { + "epoch": 1.3578584846587352, + "grad_norm": 4.8607177734375, + "learning_rate": 3.3512544802867384e-05, + "loss": 0.6997, + "step": 8674 + }, + { + "epoch": 1.3580150281778334, + "grad_norm": 1.7738631963729858, + "learning_rate": 3.3504398826979475e-05, + "loss": 0.5937, + "step": 8675 + }, + { + "epoch": 1.3581715716969318, + "grad_norm": 4.0335283279418945, + "learning_rate": 3.3496252851091565e-05, + "loss": 1.676, + "step": 8676 + }, + { + "epoch": 1.35832811521603, + "grad_norm": 2.4844274520874023, + "learning_rate": 3.348810687520365e-05, + "loss": 0.982, + "step": 8677 + }, + { + "epoch": 1.3584846587351285, + "grad_norm": 11.213478088378906, + "learning_rate": 3.347996089931574e-05, + "loss": 0.8594, + "step": 8678 + }, + { + "epoch": 1.3586412022542267, + "grad_norm": 3.2366931438446045, + "learning_rate": 3.347181492342783e-05, + "loss": 1.344, + "step": 8679 + }, + { + "epoch": 1.358797745773325, + "grad_norm": 2.3598546981811523, + "learning_rate": 3.3463668947539914e-05, + "loss": 0.4873, + "step": 8680 + }, + { + "epoch": 1.3589542892924233, + "grad_norm": 1.90955650806427, + "learning_rate": 3.3455522971652004e-05, + "loss": 0.5695, + "step": 8681 + }, + { + "epoch": 1.3591108328115216, + "grad_norm": 3.6185710430145264, + "learning_rate": 3.3447376995764095e-05, + "loss": 1.2343, + "step": 8682 + }, + 
{ + "epoch": 1.35926737633062, + "grad_norm": 3.21639347076416, + "learning_rate": 3.3439231019876185e-05, + "loss": 0.7369, + "step": 8683 + }, + { + "epoch": 1.3594239198497182, + "grad_norm": 2.5916833877563477, + "learning_rate": 3.343108504398827e-05, + "loss": 0.4915, + "step": 8684 + }, + { + "epoch": 1.3595804633688164, + "grad_norm": 1.5892181396484375, + "learning_rate": 3.342293906810036e-05, + "loss": 0.383, + "step": 8685 + }, + { + "epoch": 1.3597370068879149, + "grad_norm": 2.9293105602264404, + "learning_rate": 3.341479309221245e-05, + "loss": 0.452, + "step": 8686 + }, + { + "epoch": 1.359893550407013, + "grad_norm": 5.679934024810791, + "learning_rate": 3.3406647116324534e-05, + "loss": 0.7375, + "step": 8687 + }, + { + "epoch": 1.3600500939261115, + "grad_norm": 3.848245620727539, + "learning_rate": 3.339850114043663e-05, + "loss": 1.5528, + "step": 8688 + }, + { + "epoch": 1.3602066374452098, + "grad_norm": 0.6105460524559021, + "learning_rate": 3.3390355164548715e-05, + "loss": 0.2915, + "step": 8689 + }, + { + "epoch": 1.360363180964308, + "grad_norm": 0.4206850528717041, + "learning_rate": 3.33822091886608e-05, + "loss": 0.1576, + "step": 8690 + }, + { + "epoch": 1.3605197244834064, + "grad_norm": 0.765316367149353, + "learning_rate": 3.3374063212772896e-05, + "loss": 0.2091, + "step": 8691 + }, + { + "epoch": 1.3606762680025046, + "grad_norm": 0.8302891254425049, + "learning_rate": 3.336591723688498e-05, + "loss": 0.2276, + "step": 8692 + }, + { + "epoch": 1.360832811521603, + "grad_norm": 0.5722601413726807, + "learning_rate": 3.335777126099707e-05, + "loss": 0.2018, + "step": 8693 + }, + { + "epoch": 1.3609893550407013, + "grad_norm": 0.7374024987220764, + "learning_rate": 3.334962528510916e-05, + "loss": 0.2259, + "step": 8694 + }, + { + "epoch": 1.3611458985597995, + "grad_norm": 0.8772002458572388, + "learning_rate": 3.3341479309221244e-05, + "loss": 0.2831, + "step": 8695 + }, + { + "epoch": 1.361302442078898, + "grad_norm": 
0.7088584899902344, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.2635, + "step": 8696 + }, + { + "epoch": 1.3614589855979962, + "grad_norm": 0.86966872215271, + "learning_rate": 3.3325187357445425e-05, + "loss": 0.2203, + "step": 8697 + }, + { + "epoch": 1.3616155291170946, + "grad_norm": 0.752950131893158, + "learning_rate": 3.331704138155751e-05, + "loss": 0.2564, + "step": 8698 + }, + { + "epoch": 1.3617720726361928, + "grad_norm": 0.8176794648170471, + "learning_rate": 3.33088954056696e-05, + "loss": 0.1755, + "step": 8699 + }, + { + "epoch": 1.361928616155291, + "grad_norm": 1.2790545225143433, + "learning_rate": 3.330074942978169e-05, + "loss": 0.3758, + "step": 8700 + }, + { + "epoch": 1.3620851596743895, + "grad_norm": 0.9615888595581055, + "learning_rate": 3.329260345389378e-05, + "loss": 0.2695, + "step": 8701 + }, + { + "epoch": 1.362241703193488, + "grad_norm": 1.4245201349258423, + "learning_rate": 3.3284457478005865e-05, + "loss": 0.5227, + "step": 8702 + }, + { + "epoch": 1.3623982467125861, + "grad_norm": 1.006946086883545, + "learning_rate": 3.3276311502117955e-05, + "loss": 0.3093, + "step": 8703 + }, + { + "epoch": 1.3625547902316844, + "grad_norm": 1.1057575941085815, + "learning_rate": 3.3268165526230046e-05, + "loss": 0.3057, + "step": 8704 + }, + { + "epoch": 1.3627113337507828, + "grad_norm": 1.3768692016601562, + "learning_rate": 3.326001955034213e-05, + "loss": 0.529, + "step": 8705 + }, + { + "epoch": 1.362867877269881, + "grad_norm": 1.481512427330017, + "learning_rate": 3.325187357445422e-05, + "loss": 0.352, + "step": 8706 + }, + { + "epoch": 1.3630244207889795, + "grad_norm": 1.4301629066467285, + "learning_rate": 3.324372759856631e-05, + "loss": 0.3434, + "step": 8707 + }, + { + "epoch": 1.3631809643080777, + "grad_norm": 2.3747429847717285, + "learning_rate": 3.3235581622678394e-05, + "loss": 0.5316, + "step": 8708 + }, + { + "epoch": 1.3633375078271759, + "grad_norm": 3.568209171295166, + "learning_rate": 
3.322743564679049e-05, + "loss": 0.5077, + "step": 8709 + }, + { + "epoch": 1.3634940513462743, + "grad_norm": 1.7856477499008179, + "learning_rate": 3.3219289670902575e-05, + "loss": 0.7436, + "step": 8710 + }, + { + "epoch": 1.3636505948653725, + "grad_norm": 1.9535614252090454, + "learning_rate": 3.3211143695014666e-05, + "loss": 0.3766, + "step": 8711 + }, + { + "epoch": 1.363807138384471, + "grad_norm": 1.6923868656158447, + "learning_rate": 3.3202997719126756e-05, + "loss": 0.3597, + "step": 8712 + }, + { + "epoch": 1.3639636819035692, + "grad_norm": 1.4332658052444458, + "learning_rate": 3.319485174323884e-05, + "loss": 0.2678, + "step": 8713 + }, + { + "epoch": 1.3641202254226674, + "grad_norm": 1.1550507545471191, + "learning_rate": 3.318670576735093e-05, + "loss": 0.343, + "step": 8714 + }, + { + "epoch": 1.3642767689417659, + "grad_norm": 3.1253702640533447, + "learning_rate": 3.317855979146302e-05, + "loss": 0.8699, + "step": 8715 + }, + { + "epoch": 1.364433312460864, + "grad_norm": 2.140629768371582, + "learning_rate": 3.3170413815575105e-05, + "loss": 0.9275, + "step": 8716 + }, + { + "epoch": 1.3645898559799625, + "grad_norm": 2.945847272872925, + "learning_rate": 3.3162267839687195e-05, + "loss": 1.0095, + "step": 8717 + }, + { + "epoch": 1.3647463994990607, + "grad_norm": 1.7551683187484741, + "learning_rate": 3.3154121863799286e-05, + "loss": 0.549, + "step": 8718 + }, + { + "epoch": 1.364902943018159, + "grad_norm": 2.8647751808166504, + "learning_rate": 3.3145975887911376e-05, + "loss": 0.8785, + "step": 8719 + }, + { + "epoch": 1.3650594865372574, + "grad_norm": 2.276350736618042, + "learning_rate": 3.313782991202346e-05, + "loss": 0.8001, + "step": 8720 + }, + { + "epoch": 1.3652160300563556, + "grad_norm": 2.1110851764678955, + "learning_rate": 3.312968393613555e-05, + "loss": 0.7234, + "step": 8721 + }, + { + "epoch": 1.365372573575454, + "grad_norm": 2.395008087158203, + "learning_rate": 3.312153796024764e-05, + "loss": 0.5369, + "step": 
8722 + }, + { + "epoch": 1.3655291170945523, + "grad_norm": 2.074681043624878, + "learning_rate": 3.3113391984359725e-05, + "loss": 0.6217, + "step": 8723 + }, + { + "epoch": 1.3656856606136505, + "grad_norm": 3.553521156311035, + "learning_rate": 3.3105246008471815e-05, + "loss": 0.6814, + "step": 8724 + }, + { + "epoch": 1.365842204132749, + "grad_norm": 2.5744292736053467, + "learning_rate": 3.3097100032583906e-05, + "loss": 0.9786, + "step": 8725 + }, + { + "epoch": 1.3659987476518471, + "grad_norm": 2.846867084503174, + "learning_rate": 3.308895405669599e-05, + "loss": 0.8817, + "step": 8726 + }, + { + "epoch": 1.3661552911709456, + "grad_norm": 3.7106590270996094, + "learning_rate": 3.308080808080809e-05, + "loss": 0.8751, + "step": 8727 + }, + { + "epoch": 1.3663118346900438, + "grad_norm": 3.729397773742676, + "learning_rate": 3.307266210492017e-05, + "loss": 0.6096, + "step": 8728 + }, + { + "epoch": 1.366468378209142, + "grad_norm": 2.583374500274658, + "learning_rate": 3.306451612903226e-05, + "loss": 0.5109, + "step": 8729 + }, + { + "epoch": 1.3666249217282405, + "grad_norm": 3.226750135421753, + "learning_rate": 3.305637015314435e-05, + "loss": 0.7037, + "step": 8730 + }, + { + "epoch": 1.3667814652473387, + "grad_norm": 2.903658866882324, + "learning_rate": 3.3048224177256436e-05, + "loss": 0.8513, + "step": 8731 + }, + { + "epoch": 1.3669380087664371, + "grad_norm": 4.016299724578857, + "learning_rate": 3.3040078201368526e-05, + "loss": 1.0135, + "step": 8732 + }, + { + "epoch": 1.3670945522855353, + "grad_norm": 2.618595600128174, + "learning_rate": 3.303193222548062e-05, + "loss": 0.5903, + "step": 8733 + }, + { + "epoch": 1.3672510958046336, + "grad_norm": 5.0938520431518555, + "learning_rate": 3.30237862495927e-05, + "loss": 1.1184, + "step": 8734 + }, + { + "epoch": 1.367407639323732, + "grad_norm": 2.8265137672424316, + "learning_rate": 3.301564027370479e-05, + "loss": 0.5446, + "step": 8735 + }, + { + "epoch": 1.3675641828428304, + 
"grad_norm": 3.5896215438842773, + "learning_rate": 3.300749429781688e-05, + "loss": 0.5942, + "step": 8736 + }, + { + "epoch": 1.3677207263619287, + "grad_norm": 2.3241660594940186, + "learning_rate": 3.299934832192897e-05, + "loss": 0.7206, + "step": 8737 + }, + { + "epoch": 1.3678772698810269, + "grad_norm": 2.814577102661133, + "learning_rate": 3.2991202346041056e-05, + "loss": 0.479, + "step": 8738 + }, + { + "epoch": 1.3680338134001253, + "grad_norm": 0.6443060636520386, + "learning_rate": 3.2983056370153146e-05, + "loss": 0.2697, + "step": 8739 + }, + { + "epoch": 1.3681903569192235, + "grad_norm": 0.6231942772865295, + "learning_rate": 3.297491039426524e-05, + "loss": 0.1957, + "step": 8740 + }, + { + "epoch": 1.368346900438322, + "grad_norm": 0.6939897537231445, + "learning_rate": 3.296676441837732e-05, + "loss": 0.2273, + "step": 8741 + }, + { + "epoch": 1.3685034439574202, + "grad_norm": 0.4288860559463501, + "learning_rate": 3.295861844248941e-05, + "loss": 0.1175, + "step": 8742 + }, + { + "epoch": 1.3686599874765184, + "grad_norm": 0.43955516815185547, + "learning_rate": 3.29504724666015e-05, + "loss": 0.1765, + "step": 8743 + }, + { + "epoch": 1.3688165309956168, + "grad_norm": 1.5631275177001953, + "learning_rate": 3.2942326490713585e-05, + "loss": 0.4815, + "step": 8744 + }, + { + "epoch": 1.368973074514715, + "grad_norm": 0.7973602414131165, + "learning_rate": 3.293418051482568e-05, + "loss": 0.2689, + "step": 8745 + }, + { + "epoch": 1.3691296180338135, + "grad_norm": 0.9242101311683655, + "learning_rate": 3.2926034538937766e-05, + "loss": 0.1829, + "step": 8746 + }, + { + "epoch": 1.3692861615529117, + "grad_norm": 0.7121622562408447, + "learning_rate": 3.291788856304985e-05, + "loss": 0.2349, + "step": 8747 + }, + { + "epoch": 1.36944270507201, + "grad_norm": 1.1493428945541382, + "learning_rate": 3.290974258716195e-05, + "loss": 0.2452, + "step": 8748 + }, + { + "epoch": 1.3695992485911084, + "grad_norm": 1.0126712322235107, + "learning_rate": 
3.290159661127403e-05, + "loss": 0.2683, + "step": 8749 + }, + { + "epoch": 1.3697557921102066, + "grad_norm": 0.7771222591400146, + "learning_rate": 3.289345063538612e-05, + "loss": 0.2387, + "step": 8750 + }, + { + "epoch": 1.369912335629305, + "grad_norm": 3.1991240978240967, + "learning_rate": 3.288530465949821e-05, + "loss": 0.6123, + "step": 8751 + }, + { + "epoch": 1.3700688791484033, + "grad_norm": 2.1771955490112305, + "learning_rate": 3.2877158683610296e-05, + "loss": 0.3831, + "step": 8752 + }, + { + "epoch": 1.3702254226675015, + "grad_norm": 1.5873305797576904, + "learning_rate": 3.2869012707722386e-05, + "loss": 0.2836, + "step": 8753 + }, + { + "epoch": 1.3703819661866, + "grad_norm": 0.8145593404769897, + "learning_rate": 3.286086673183448e-05, + "loss": 0.0979, + "step": 8754 + }, + { + "epoch": 1.3705385097056981, + "grad_norm": 0.9811624884605408, + "learning_rate": 3.285272075594657e-05, + "loss": 0.3465, + "step": 8755 + }, + { + "epoch": 1.3706950532247966, + "grad_norm": 0.8368296027183533, + "learning_rate": 3.284457478005865e-05, + "loss": 0.2401, + "step": 8756 + }, + { + "epoch": 1.3708515967438948, + "grad_norm": 2.658220052719116, + "learning_rate": 3.283642880417074e-05, + "loss": 0.6861, + "step": 8757 + }, + { + "epoch": 1.371008140262993, + "grad_norm": 0.8606971502304077, + "learning_rate": 3.282828282828283e-05, + "loss": 0.2843, + "step": 8758 + }, + { + "epoch": 1.3711646837820914, + "grad_norm": 2.194949150085449, + "learning_rate": 3.2820136852394916e-05, + "loss": 0.5248, + "step": 8759 + }, + { + "epoch": 1.3713212273011897, + "grad_norm": 1.9347641468048096, + "learning_rate": 3.2811990876507007e-05, + "loss": 0.4807, + "step": 8760 + }, + { + "epoch": 1.371477770820288, + "grad_norm": 1.2829827070236206, + "learning_rate": 3.28038449006191e-05, + "loss": 0.4984, + "step": 8761 + }, + { + "epoch": 1.3716343143393863, + "grad_norm": 2.407233238220215, + "learning_rate": 3.279569892473118e-05, + "loss": 0.6408, + "step": 8762 
+ }, + { + "epoch": 1.3717908578584845, + "grad_norm": 4.0898542404174805, + "learning_rate": 3.278755294884328e-05, + "loss": 0.478, + "step": 8763 + }, + { + "epoch": 1.371947401377583, + "grad_norm": 2.224686861038208, + "learning_rate": 3.277940697295536e-05, + "loss": 0.6087, + "step": 8764 + }, + { + "epoch": 1.3721039448966814, + "grad_norm": 1.7987958192825317, + "learning_rate": 3.2771260997067446e-05, + "loss": 0.3862, + "step": 8765 + }, + { + "epoch": 1.3722604884157796, + "grad_norm": 2.536289691925049, + "learning_rate": 3.276311502117954e-05, + "loss": 0.578, + "step": 8766 + }, + { + "epoch": 1.3724170319348779, + "grad_norm": 3.2014994621276855, + "learning_rate": 3.275496904529163e-05, + "loss": 0.7451, + "step": 8767 + }, + { + "epoch": 1.372573575453976, + "grad_norm": 1.19336998462677, + "learning_rate": 3.274682306940372e-05, + "loss": 0.435, + "step": 8768 + }, + { + "epoch": 1.3727301189730745, + "grad_norm": 1.9891248941421509, + "learning_rate": 3.273867709351581e-05, + "loss": 0.763, + "step": 8769 + }, + { + "epoch": 1.372886662492173, + "grad_norm": 4.113495349884033, + "learning_rate": 3.273053111762789e-05, + "loss": 0.9268, + "step": 8770 + }, + { + "epoch": 1.3730432060112712, + "grad_norm": 1.9712504148483276, + "learning_rate": 3.272238514173998e-05, + "loss": 0.3758, + "step": 8771 + }, + { + "epoch": 1.3731997495303694, + "grad_norm": 2.5075013637542725, + "learning_rate": 3.271423916585207e-05, + "loss": 0.7708, + "step": 8772 + }, + { + "epoch": 1.3733562930494678, + "grad_norm": 7.086827754974365, + "learning_rate": 3.270609318996416e-05, + "loss": 0.6801, + "step": 8773 + }, + { + "epoch": 1.373512836568566, + "grad_norm": 3.909684181213379, + "learning_rate": 3.269794721407625e-05, + "loss": 1.2851, + "step": 8774 + }, + { + "epoch": 1.3736693800876645, + "grad_norm": 2.701270818710327, + "learning_rate": 3.268980123818834e-05, + "loss": 0.9468, + "step": 8775 + }, + { + "epoch": 1.3738259236067627, + "grad_norm": 
3.773635149002075, + "learning_rate": 3.268165526230043e-05, + "loss": 1.6486, + "step": 8776 + }, + { + "epoch": 1.373982467125861, + "grad_norm": 4.2368364334106445, + "learning_rate": 3.267350928641251e-05, + "loss": 1.4236, + "step": 8777 + }, + { + "epoch": 1.3741390106449594, + "grad_norm": 3.813992738723755, + "learning_rate": 3.26653633105246e-05, + "loss": 0.9995, + "step": 8778 + }, + { + "epoch": 1.3742955541640576, + "grad_norm": 4.643187999725342, + "learning_rate": 3.265721733463669e-05, + "loss": 0.858, + "step": 8779 + }, + { + "epoch": 1.374452097683156, + "grad_norm": 2.561020612716675, + "learning_rate": 3.2649071358748776e-05, + "loss": 1.211, + "step": 8780 + }, + { + "epoch": 1.3746086412022542, + "grad_norm": 2.527142286300659, + "learning_rate": 3.2640925382860874e-05, + "loss": 0.9763, + "step": 8781 + }, + { + "epoch": 1.3747651847213525, + "grad_norm": 1.852537751197815, + "learning_rate": 3.263277940697296e-05, + "loss": 0.8048, + "step": 8782 + }, + { + "epoch": 1.374921728240451, + "grad_norm": 5.436705112457275, + "learning_rate": 3.262463343108504e-05, + "loss": 1.0342, + "step": 8783 + }, + { + "epoch": 1.375078271759549, + "grad_norm": 8.510066032409668, + "learning_rate": 3.261648745519714e-05, + "loss": 0.6935, + "step": 8784 + }, + { + "epoch": 1.3752348152786475, + "grad_norm": 7.176856994628906, + "learning_rate": 3.260834147930922e-05, + "loss": 0.3456, + "step": 8785 + }, + { + "epoch": 1.3753913587977458, + "grad_norm": 3.3210079669952393, + "learning_rate": 3.260019550342131e-05, + "loss": 0.5457, + "step": 8786 + }, + { + "epoch": 1.375547902316844, + "grad_norm": 7.063208103179932, + "learning_rate": 3.25920495275334e-05, + "loss": 1.2039, + "step": 8787 + }, + { + "epoch": 1.3757044458359424, + "grad_norm": 2.9852280616760254, + "learning_rate": 3.258390355164549e-05, + "loss": 0.8846, + "step": 8788 + }, + { + "epoch": 1.3758609893550406, + "grad_norm": 0.5511553883552551, + "learning_rate": 3.257575757575758e-05, + 
"loss": 0.2549, + "step": 8789 + }, + { + "epoch": 1.376017532874139, + "grad_norm": 0.8858806490898132, + "learning_rate": 3.256761159986967e-05, + "loss": 0.2842, + "step": 8790 + }, + { + "epoch": 1.3761740763932373, + "grad_norm": 1.6131675243377686, + "learning_rate": 3.255946562398175e-05, + "loss": 0.2382, + "step": 8791 + }, + { + "epoch": 1.3763306199123355, + "grad_norm": 0.7691659331321716, + "learning_rate": 3.255131964809384e-05, + "loss": 0.2127, + "step": 8792 + }, + { + "epoch": 1.376487163431434, + "grad_norm": 0.46108391880989075, + "learning_rate": 3.254317367220593e-05, + "loss": 0.26, + "step": 8793 + }, + { + "epoch": 1.3766437069505322, + "grad_norm": 1.0443722009658813, + "learning_rate": 3.2535027696318023e-05, + "loss": 0.2401, + "step": 8794 + }, + { + "epoch": 1.3768002504696306, + "grad_norm": 0.5902356505393982, + "learning_rate": 3.252688172043011e-05, + "loss": 0.2201, + "step": 8795 + }, + { + "epoch": 1.3769567939887288, + "grad_norm": 0.6830036044120789, + "learning_rate": 3.25187357445422e-05, + "loss": 0.2307, + "step": 8796 + }, + { + "epoch": 1.377113337507827, + "grad_norm": 5.095394611358643, + "learning_rate": 3.251058976865429e-05, + "loss": 0.8972, + "step": 8797 + }, + { + "epoch": 1.3772698810269255, + "grad_norm": 0.9916277527809143, + "learning_rate": 3.250244379276637e-05, + "loss": 0.2761, + "step": 8798 + }, + { + "epoch": 1.377426424546024, + "grad_norm": 2.4364166259765625, + "learning_rate": 3.249429781687847e-05, + "loss": 0.2574, + "step": 8799 + }, + { + "epoch": 1.3775829680651221, + "grad_norm": 0.8580630421638489, + "learning_rate": 3.248615184099055e-05, + "loss": 0.3598, + "step": 8800 + }, + { + "epoch": 1.3777395115842204, + "grad_norm": 0.6923543810844421, + "learning_rate": 3.247800586510264e-05, + "loss": 0.2109, + "step": 8801 + }, + { + "epoch": 1.3778960551033186, + "grad_norm": 2.98860764503479, + "learning_rate": 3.2469859889214734e-05, + "loss": 0.4379, + "step": 8802 + }, + { + "epoch": 
1.378052598622417, + "grad_norm": 1.690706491470337, + "learning_rate": 3.246171391332682e-05, + "loss": 0.4947, + "step": 8803 + }, + { + "epoch": 1.3782091421415155, + "grad_norm": 1.2420005798339844, + "learning_rate": 3.245356793743891e-05, + "loss": 0.2497, + "step": 8804 + }, + { + "epoch": 1.3783656856606137, + "grad_norm": 1.8836838006973267, + "learning_rate": 3.2445421961551e-05, + "loss": 0.3203, + "step": 8805 + }, + { + "epoch": 1.378522229179712, + "grad_norm": 4.5526347160339355, + "learning_rate": 3.243727598566308e-05, + "loss": 0.3373, + "step": 8806 + }, + { + "epoch": 1.3786787726988103, + "grad_norm": 2.5194904804229736, + "learning_rate": 3.242913000977517e-05, + "loss": 0.3794, + "step": 8807 + }, + { + "epoch": 1.3788353162179086, + "grad_norm": 1.9055770635604858, + "learning_rate": 3.2420984033887264e-05, + "loss": 0.4996, + "step": 8808 + }, + { + "epoch": 1.378991859737007, + "grad_norm": 1.552714467048645, + "learning_rate": 3.241283805799935e-05, + "loss": 0.2317, + "step": 8809 + }, + { + "epoch": 1.3791484032561052, + "grad_norm": 1.761737585067749, + "learning_rate": 3.240469208211144e-05, + "loss": 0.5531, + "step": 8810 + }, + { + "epoch": 1.3793049467752034, + "grad_norm": 1.4126627445220947, + "learning_rate": 3.239654610622353e-05, + "loss": 0.3864, + "step": 8811 + }, + { + "epoch": 1.3794614902943019, + "grad_norm": 1.3071115016937256, + "learning_rate": 3.238840013033562e-05, + "loss": 0.3513, + "step": 8812 + }, + { + "epoch": 1.3796180338134, + "grad_norm": 2.6153762340545654, + "learning_rate": 3.23802541544477e-05, + "loss": 0.5544, + "step": 8813 + }, + { + "epoch": 1.3797745773324985, + "grad_norm": 3.192902088165283, + "learning_rate": 3.237210817855979e-05, + "loss": 0.4945, + "step": 8814 + }, + { + "epoch": 1.3799311208515967, + "grad_norm": 2.144998073577881, + "learning_rate": 3.2363962202671884e-05, + "loss": 0.8272, + "step": 8815 + }, + { + "epoch": 1.380087664370695, + "grad_norm": 1.6888270378112793, + 
"learning_rate": 3.235581622678397e-05, + "loss": 0.5296, + "step": 8816 + }, + { + "epoch": 1.3802442078897934, + "grad_norm": 2.7774949073791504, + "learning_rate": 3.2347670250896065e-05, + "loss": 0.9383, + "step": 8817 + }, + { + "epoch": 1.3804007514088916, + "grad_norm": 4.553781509399414, + "learning_rate": 3.233952427500815e-05, + "loss": 0.9124, + "step": 8818 + }, + { + "epoch": 1.38055729492799, + "grad_norm": 2.63759183883667, + "learning_rate": 3.233137829912023e-05, + "loss": 0.4461, + "step": 8819 + }, + { + "epoch": 1.3807138384470883, + "grad_norm": 3.193955183029175, + "learning_rate": 3.232323232323233e-05, + "loss": 0.7467, + "step": 8820 + }, + { + "epoch": 1.3808703819661865, + "grad_norm": 2.5758326053619385, + "learning_rate": 3.2315086347344413e-05, + "loss": 0.7467, + "step": 8821 + }, + { + "epoch": 1.381026925485285, + "grad_norm": 1.6774457693099976, + "learning_rate": 3.2306940371456504e-05, + "loss": 0.471, + "step": 8822 + }, + { + "epoch": 1.3811834690043832, + "grad_norm": 2.729668378829956, + "learning_rate": 3.2298794395568595e-05, + "loss": 1.0805, + "step": 8823 + }, + { + "epoch": 1.3813400125234816, + "grad_norm": 4.014828681945801, + "learning_rate": 3.229064841968068e-05, + "loss": 0.5675, + "step": 8824 + }, + { + "epoch": 1.3814965560425798, + "grad_norm": 2.8439903259277344, + "learning_rate": 3.228250244379277e-05, + "loss": 0.6368, + "step": 8825 + }, + { + "epoch": 1.381653099561678, + "grad_norm": 5.804491996765137, + "learning_rate": 3.227435646790485e-05, + "loss": 1.2582, + "step": 8826 + }, + { + "epoch": 1.3818096430807765, + "grad_norm": 2.559844493865967, + "learning_rate": 3.226621049201694e-05, + "loss": 1.0252, + "step": 8827 + }, + { + "epoch": 1.3819661865998747, + "grad_norm": 2.8622794151306152, + "learning_rate": 3.2258064516129034e-05, + "loss": 1.1629, + "step": 8828 + }, + { + "epoch": 1.3821227301189731, + "grad_norm": 3.920487880706787, + "learning_rate": 3.224991854024112e-05, + "loss": 1.0392, 
+ "step": 8829 + }, + { + "epoch": 1.3822792736380713, + "grad_norm": 3.699842691421509, + "learning_rate": 3.2241772564353215e-05, + "loss": 0.9992, + "step": 8830 + }, + { + "epoch": 1.3824358171571696, + "grad_norm": 2.0876338481903076, + "learning_rate": 3.22336265884653e-05, + "loss": 0.5971, + "step": 8831 + }, + { + "epoch": 1.382592360676268, + "grad_norm": 2.200843572616577, + "learning_rate": 3.222548061257739e-05, + "loss": 1.1762, + "step": 8832 + }, + { + "epoch": 1.3827489041953664, + "grad_norm": 2.0027990341186523, + "learning_rate": 3.221733463668948e-05, + "loss": 0.9747, + "step": 8833 + }, + { + "epoch": 1.3829054477144647, + "grad_norm": 2.8207273483276367, + "learning_rate": 3.220918866080156e-05, + "loss": 0.6452, + "step": 8834 + }, + { + "epoch": 1.3830619912335629, + "grad_norm": 3.846458911895752, + "learning_rate": 3.2201042684913654e-05, + "loss": 0.9308, + "step": 8835 + }, + { + "epoch": 1.3832185347526613, + "grad_norm": 3.0922794342041016, + "learning_rate": 3.2192896709025744e-05, + "loss": 1.3177, + "step": 8836 + }, + { + "epoch": 1.3833750782717595, + "grad_norm": 3.672003984451294, + "learning_rate": 3.218475073313783e-05, + "loss": 0.9361, + "step": 8837 + }, + { + "epoch": 1.383531621790858, + "grad_norm": 3.2357187271118164, + "learning_rate": 3.217660475724992e-05, + "loss": 0.9095, + "step": 8838 + }, + { + "epoch": 1.3836881653099562, + "grad_norm": 0.4880688786506653, + "learning_rate": 3.216845878136201e-05, + "loss": 0.1885, + "step": 8839 + }, + { + "epoch": 1.3838447088290544, + "grad_norm": 0.5602995753288269, + "learning_rate": 3.21603128054741e-05, + "loss": 0.2856, + "step": 8840 + }, + { + "epoch": 1.3840012523481529, + "grad_norm": 0.5403319597244263, + "learning_rate": 3.215216682958618e-05, + "loss": 0.1654, + "step": 8841 + }, + { + "epoch": 1.384157795867251, + "grad_norm": 0.5817023515701294, + "learning_rate": 3.2144020853698274e-05, + "loss": 0.2079, + "step": 8842 + }, + { + "epoch": 1.3843143393863495, 
+ "grad_norm": 0.5526574850082397, + "learning_rate": 3.2135874877810364e-05, + "loss": 0.2527, + "step": 8843 + }, + { + "epoch": 1.3844708829054477, + "grad_norm": 0.6120612025260925, + "learning_rate": 3.212772890192245e-05, + "loss": 0.1983, + "step": 8844 + }, + { + "epoch": 1.384627426424546, + "grad_norm": 0.48940086364746094, + "learning_rate": 3.211958292603454e-05, + "loss": 0.211, + "step": 8845 + }, + { + "epoch": 1.3847839699436444, + "grad_norm": 0.6316811442375183, + "learning_rate": 3.211143695014663e-05, + "loss": 0.258, + "step": 8846 + }, + { + "epoch": 1.3849405134627426, + "grad_norm": 0.6793708801269531, + "learning_rate": 3.210329097425871e-05, + "loss": 0.2557, + "step": 8847 + }, + { + "epoch": 1.385097056981841, + "grad_norm": 0.5452753305435181, + "learning_rate": 3.209514499837081e-05, + "loss": 0.1926, + "step": 8848 + }, + { + "epoch": 1.3852536005009393, + "grad_norm": 0.709191620349884, + "learning_rate": 3.2086999022482894e-05, + "loss": 0.2246, + "step": 8849 + }, + { + "epoch": 1.3854101440200375, + "grad_norm": 0.5636671781539917, + "learning_rate": 3.207885304659498e-05, + "loss": 0.2141, + "step": 8850 + }, + { + "epoch": 1.385566687539136, + "grad_norm": 1.9398207664489746, + "learning_rate": 3.2070707070707075e-05, + "loss": 0.6277, + "step": 8851 + }, + { + "epoch": 1.3857232310582341, + "grad_norm": 0.9980046153068542, + "learning_rate": 3.206256109481916e-05, + "loss": 0.2153, + "step": 8852 + }, + { + "epoch": 1.3858797745773326, + "grad_norm": 0.7794275283813477, + "learning_rate": 3.205441511893125e-05, + "loss": 0.3022, + "step": 8853 + }, + { + "epoch": 1.3860363180964308, + "grad_norm": 0.8307777047157288, + "learning_rate": 3.204626914304334e-05, + "loss": 0.3525, + "step": 8854 + }, + { + "epoch": 1.386192861615529, + "grad_norm": 1.0878459215164185, + "learning_rate": 3.2038123167155424e-05, + "loss": 0.411, + "step": 8855 + }, + { + "epoch": 1.3863494051346275, + "grad_norm": 1.6932963132858276, + 
"learning_rate": 3.2029977191267514e-05, + "loss": 0.539, + "step": 8856 + }, + { + "epoch": 1.3865059486537257, + "grad_norm": 2.346792459487915, + "learning_rate": 3.2021831215379605e-05, + "loss": 0.3631, + "step": 8857 + }, + { + "epoch": 1.3866624921728241, + "grad_norm": 1.5973185300827026, + "learning_rate": 3.2013685239491695e-05, + "loss": 0.3625, + "step": 8858 + }, + { + "epoch": 1.3868190356919223, + "grad_norm": 3.6287529468536377, + "learning_rate": 3.200553926360378e-05, + "loss": 0.7301, + "step": 8859 + }, + { + "epoch": 1.3869755792110205, + "grad_norm": 2.0105111598968506, + "learning_rate": 3.199739328771587e-05, + "loss": 0.6007, + "step": 8860 + }, + { + "epoch": 1.387132122730119, + "grad_norm": 2.512051582336426, + "learning_rate": 3.198924731182796e-05, + "loss": 0.7777, + "step": 8861 + }, + { + "epoch": 1.3872886662492172, + "grad_norm": 2.521836280822754, + "learning_rate": 3.1981101335940044e-05, + "loss": 0.3868, + "step": 8862 + }, + { + "epoch": 1.3874452097683156, + "grad_norm": 1.9244658946990967, + "learning_rate": 3.1972955360052134e-05, + "loss": 0.4815, + "step": 8863 + }, + { + "epoch": 1.3876017532874139, + "grad_norm": 2.39119291305542, + "learning_rate": 3.1964809384164225e-05, + "loss": 0.543, + "step": 8864 + }, + { + "epoch": 1.387758296806512, + "grad_norm": 2.3876755237579346, + "learning_rate": 3.195666340827631e-05, + "loss": 0.5832, + "step": 8865 + }, + { + "epoch": 1.3879148403256105, + "grad_norm": 2.3310341835021973, + "learning_rate": 3.1948517432388406e-05, + "loss": 0.6376, + "step": 8866 + }, + { + "epoch": 1.388071383844709, + "grad_norm": 2.3339715003967285, + "learning_rate": 3.194037145650049e-05, + "loss": 0.4478, + "step": 8867 + }, + { + "epoch": 1.3882279273638072, + "grad_norm": 2.328845500946045, + "learning_rate": 3.193222548061257e-05, + "loss": 0.5659, + "step": 8868 + }, + { + "epoch": 1.3883844708829054, + "grad_norm": 2.7281086444854736, + "learning_rate": 3.192407950472467e-05, + "loss": 
0.8523, + "step": 8869 + }, + { + "epoch": 1.3885410144020038, + "grad_norm": 2.6798267364501953, + "learning_rate": 3.1915933528836754e-05, + "loss": 0.8053, + "step": 8870 + }, + { + "epoch": 1.388697557921102, + "grad_norm": 2.3362667560577393, + "learning_rate": 3.1907787552948845e-05, + "loss": 0.9035, + "step": 8871 + }, + { + "epoch": 1.3888541014402005, + "grad_norm": 3.534924030303955, + "learning_rate": 3.1899641577060935e-05, + "loss": 0.4752, + "step": 8872 + }, + { + "epoch": 1.3890106449592987, + "grad_norm": 2.5268023014068604, + "learning_rate": 3.189149560117302e-05, + "loss": 0.9192, + "step": 8873 + }, + { + "epoch": 1.389167188478397, + "grad_norm": 3.9577150344848633, + "learning_rate": 3.188334962528511e-05, + "loss": 0.9267, + "step": 8874 + }, + { + "epoch": 1.3893237319974954, + "grad_norm": 2.430797815322876, + "learning_rate": 3.18752036493972e-05, + "loss": 0.8008, + "step": 8875 + }, + { + "epoch": 1.3894802755165936, + "grad_norm": 2.842752695083618, + "learning_rate": 3.186705767350929e-05, + "loss": 0.9189, + "step": 8876 + }, + { + "epoch": 1.389636819035692, + "grad_norm": 3.716202974319458, + "learning_rate": 3.1858911697621374e-05, + "loss": 0.9084, + "step": 8877 + }, + { + "epoch": 1.3897933625547902, + "grad_norm": 2.5382421016693115, + "learning_rate": 3.1850765721733465e-05, + "loss": 0.7452, + "step": 8878 + }, + { + "epoch": 1.3899499060738885, + "grad_norm": 2.0694613456726074, + "learning_rate": 3.1842619745845556e-05, + "loss": 0.5543, + "step": 8879 + }, + { + "epoch": 1.390106449592987, + "grad_norm": 5.059754371643066, + "learning_rate": 3.183447376995764e-05, + "loss": 1.0176, + "step": 8880 + }, + { + "epoch": 1.3902629931120851, + "grad_norm": 2.006254196166992, + "learning_rate": 3.182632779406973e-05, + "loss": 0.8136, + "step": 8881 + }, + { + "epoch": 1.3904195366311836, + "grad_norm": 2.6896843910217285, + "learning_rate": 3.181818181818182e-05, + "loss": 0.505, + "step": 8882 + }, + { + "epoch": 
1.3905760801502818, + "grad_norm": 2.859174966812134, + "learning_rate": 3.1810035842293904e-05, + "loss": 1.4052, + "step": 8883 + }, + { + "epoch": 1.39073262366938, + "grad_norm": 1.1606605052947998, + "learning_rate": 3.1801889866406e-05, + "loss": 0.5382, + "step": 8884 + }, + { + "epoch": 1.3908891671884784, + "grad_norm": 1.3831660747528076, + "learning_rate": 3.1793743890518085e-05, + "loss": 0.7021, + "step": 8885 + }, + { + "epoch": 1.3910457107075767, + "grad_norm": 4.616837501525879, + "learning_rate": 3.178559791463017e-05, + "loss": 0.7604, + "step": 8886 + }, + { + "epoch": 1.391202254226675, + "grad_norm": 3.5814239978790283, + "learning_rate": 3.1777451938742266e-05, + "loss": 1.1252, + "step": 8887 + }, + { + "epoch": 1.3913587977457733, + "grad_norm": 3.9987952709198, + "learning_rate": 3.176930596285435e-05, + "loss": 1.259, + "step": 8888 + }, + { + "epoch": 1.3915153412648715, + "grad_norm": 0.5100349187850952, + "learning_rate": 3.176115998696644e-05, + "loss": 0.1959, + "step": 8889 + }, + { + "epoch": 1.39167188478397, + "grad_norm": 0.5407730937004089, + "learning_rate": 3.175301401107853e-05, + "loss": 0.2365, + "step": 8890 + }, + { + "epoch": 1.3918284283030682, + "grad_norm": 0.41387295722961426, + "learning_rate": 3.1744868035190615e-05, + "loss": 0.1753, + "step": 8891 + }, + { + "epoch": 1.3919849718221666, + "grad_norm": 0.5393170714378357, + "learning_rate": 3.1736722059302705e-05, + "loss": 0.1785, + "step": 8892 + }, + { + "epoch": 1.3921415153412648, + "grad_norm": 0.8031520247459412, + "learning_rate": 3.1728576083414796e-05, + "loss": 0.1812, + "step": 8893 + }, + { + "epoch": 1.392298058860363, + "grad_norm": 1.3911019563674927, + "learning_rate": 3.172043010752688e-05, + "loss": 0.6116, + "step": 8894 + }, + { + "epoch": 1.3924546023794615, + "grad_norm": 1.0987216234207153, + "learning_rate": 3.171228413163897e-05, + "loss": 0.3138, + "step": 8895 + }, + { + "epoch": 1.3926111458985597, + "grad_norm": 0.6703523397445679, + 
"learning_rate": 3.170413815575106e-05, + "loss": 0.1702, + "step": 8896 + }, + { + "epoch": 1.3927676894176582, + "grad_norm": 0.7394130825996399, + "learning_rate": 3.169599217986315e-05, + "loss": 0.1827, + "step": 8897 + }, + { + "epoch": 1.3929242329367564, + "grad_norm": 1.6977427005767822, + "learning_rate": 3.1687846203975235e-05, + "loss": 0.2692, + "step": 8898 + }, + { + "epoch": 1.3930807764558546, + "grad_norm": 0.8804395198822021, + "learning_rate": 3.1679700228087325e-05, + "loss": 0.4209, + "step": 8899 + }, + { + "epoch": 1.393237319974953, + "grad_norm": 0.9951710104942322, + "learning_rate": 3.1671554252199416e-05, + "loss": 0.2832, + "step": 8900 + }, + { + "epoch": 1.3933938634940515, + "grad_norm": 1.7179399728775024, + "learning_rate": 3.16634082763115e-05, + "loss": 0.4752, + "step": 8901 + }, + { + "epoch": 1.3935504070131497, + "grad_norm": 0.8287004232406616, + "learning_rate": 3.16552623004236e-05, + "loss": 0.2695, + "step": 8902 + }, + { + "epoch": 1.393706950532248, + "grad_norm": 2.0744431018829346, + "learning_rate": 3.164711632453568e-05, + "loss": 0.4327, + "step": 8903 + }, + { + "epoch": 1.3938634940513464, + "grad_norm": 1.1916314363479614, + "learning_rate": 3.1638970348647764e-05, + "loss": 0.3617, + "step": 8904 + }, + { + "epoch": 1.3940200375704446, + "grad_norm": 1.8511568307876587, + "learning_rate": 3.163082437275986e-05, + "loss": 0.3906, + "step": 8905 + }, + { + "epoch": 1.394176581089543, + "grad_norm": 1.6102995872497559, + "learning_rate": 3.1622678396871946e-05, + "loss": 0.4667, + "step": 8906 + }, + { + "epoch": 1.3943331246086412, + "grad_norm": 1.6488795280456543, + "learning_rate": 3.1614532420984036e-05, + "loss": 0.3468, + "step": 8907 + }, + { + "epoch": 1.3944896681277394, + "grad_norm": 1.6073062419891357, + "learning_rate": 3.1606386445096127e-05, + "loss": 0.2707, + "step": 8908 + }, + { + "epoch": 1.3946462116468379, + "grad_norm": 1.0660237073898315, + "learning_rate": 3.159824046920821e-05, + 
"loss": 0.3007, + "step": 8909 + }, + { + "epoch": 1.394802755165936, + "grad_norm": 1.5157692432403564, + "learning_rate": 3.15900944933203e-05, + "loss": 0.501, + "step": 8910 + }, + { + "epoch": 1.3949592986850345, + "grad_norm": 2.045330762863159, + "learning_rate": 3.158194851743239e-05, + "loss": 0.3978, + "step": 8911 + }, + { + "epoch": 1.3951158422041328, + "grad_norm": 1.5229228734970093, + "learning_rate": 3.1573802541544475e-05, + "loss": 0.4843, + "step": 8912 + }, + { + "epoch": 1.395272385723231, + "grad_norm": 2.1558008193969727, + "learning_rate": 3.1565656565656566e-05, + "loss": 0.4669, + "step": 8913 + }, + { + "epoch": 1.3954289292423294, + "grad_norm": 2.131721258163452, + "learning_rate": 3.1557510589768656e-05, + "loss": 0.3322, + "step": 8914 + }, + { + "epoch": 1.3955854727614276, + "grad_norm": 2.6215240955352783, + "learning_rate": 3.154936461388075e-05, + "loss": 0.529, + "step": 8915 + }, + { + "epoch": 1.395742016280526, + "grad_norm": 6.051763534545898, + "learning_rate": 3.154121863799283e-05, + "loss": 0.6159, + "step": 8916 + }, + { + "epoch": 1.3958985597996243, + "grad_norm": 2.2313530445098877, + "learning_rate": 3.153307266210492e-05, + "loss": 0.6545, + "step": 8917 + }, + { + "epoch": 1.3960551033187225, + "grad_norm": 2.253613233566284, + "learning_rate": 3.152492668621701e-05, + "loss": 0.573, + "step": 8918 + }, + { + "epoch": 1.396211646837821, + "grad_norm": 3.6899826526641846, + "learning_rate": 3.1516780710329095e-05, + "loss": 0.7014, + "step": 8919 + }, + { + "epoch": 1.3963681903569192, + "grad_norm": 3.693246841430664, + "learning_rate": 3.150863473444119e-05, + "loss": 0.8344, + "step": 8920 + }, + { + "epoch": 1.3965247338760176, + "grad_norm": 2.3286309242248535, + "learning_rate": 3.1500488758553276e-05, + "loss": 0.6355, + "step": 8921 + }, + { + "epoch": 1.3966812773951158, + "grad_norm": 4.884693145751953, + "learning_rate": 3.149234278266536e-05, + "loss": 0.7413, + "step": 8922 + }, + { + "epoch": 
1.396837820914214, + "grad_norm": 3.215996503829956, + "learning_rate": 3.148419680677746e-05, + "loss": 0.6267, + "step": 8923 + }, + { + "epoch": 1.3969943644333125, + "grad_norm": 2.949984312057495, + "learning_rate": 3.147605083088954e-05, + "loss": 0.6532, + "step": 8924 + }, + { + "epoch": 1.3971509079524107, + "grad_norm": 4.167171478271484, + "learning_rate": 3.146790485500163e-05, + "loss": 0.6143, + "step": 8925 + }, + { + "epoch": 1.3973074514715091, + "grad_norm": 4.170174598693848, + "learning_rate": 3.145975887911372e-05, + "loss": 1.0625, + "step": 8926 + }, + { + "epoch": 1.3974639949906074, + "grad_norm": 6.1511688232421875, + "learning_rate": 3.1451612903225806e-05, + "loss": 1.2821, + "step": 8927 + }, + { + "epoch": 1.3976205385097056, + "grad_norm": 6.221730709075928, + "learning_rate": 3.1443466927337896e-05, + "loss": 1.4286, + "step": 8928 + }, + { + "epoch": 1.397777082028804, + "grad_norm": 3.3348803520202637, + "learning_rate": 3.143532095144999e-05, + "loss": 0.8723, + "step": 8929 + }, + { + "epoch": 1.3979336255479022, + "grad_norm": 4.256657123565674, + "learning_rate": 3.142717497556207e-05, + "loss": 0.751, + "step": 8930 + }, + { + "epoch": 1.3980901690670007, + "grad_norm": 2.399866819381714, + "learning_rate": 3.141902899967416e-05, + "loss": 1.224, + "step": 8931 + }, + { + "epoch": 1.398246712586099, + "grad_norm": 4.362685203552246, + "learning_rate": 3.141088302378625e-05, + "loss": 0.7659, + "step": 8932 + }, + { + "epoch": 1.3984032561051971, + "grad_norm": 9.0147123336792, + "learning_rate": 3.140273704789834e-05, + "loss": 1.2696, + "step": 8933 + }, + { + "epoch": 1.3985597996242956, + "grad_norm": 1.5164681673049927, + "learning_rate": 3.1394591072010426e-05, + "loss": 0.421, + "step": 8934 + }, + { + "epoch": 1.398716343143394, + "grad_norm": 5.107836723327637, + "learning_rate": 3.1386445096122517e-05, + "loss": 0.3424, + "step": 8935 + }, + { + "epoch": 1.3988728866624922, + "grad_norm": 2.3844504356384277, + 
"learning_rate": 3.137829912023461e-05, + "loss": 0.6773, + "step": 8936 + }, + { + "epoch": 1.3990294301815904, + "grad_norm": 3.786966323852539, + "learning_rate": 3.137015314434669e-05, + "loss": 1.1173, + "step": 8937 + }, + { + "epoch": 1.3991859737006889, + "grad_norm": 2.3460283279418945, + "learning_rate": 3.136200716845878e-05, + "loss": 1.3835, + "step": 8938 + }, + { + "epoch": 1.399342517219787, + "grad_norm": 0.7692614793777466, + "learning_rate": 3.135386119257087e-05, + "loss": 0.2789, + "step": 8939 + }, + { + "epoch": 1.3994990607388855, + "grad_norm": 0.696351170539856, + "learning_rate": 3.1345715216682956e-05, + "loss": 0.2081, + "step": 8940 + }, + { + "epoch": 1.3996556042579837, + "grad_norm": 0.5057497620582581, + "learning_rate": 3.133756924079505e-05, + "loss": 0.2109, + "step": 8941 + }, + { + "epoch": 1.399812147777082, + "grad_norm": 1.0874603986740112, + "learning_rate": 3.132942326490714e-05, + "loss": 0.2379, + "step": 8942 + }, + { + "epoch": 1.3999686912961804, + "grad_norm": 0.6960557699203491, + "learning_rate": 3.132127728901923e-05, + "loss": 0.2348, + "step": 8943 + }, + { + "epoch": 1.4001252348152786, + "grad_norm": 1.4531316757202148, + "learning_rate": 3.131313131313132e-05, + "loss": 0.3017, + "step": 8944 + }, + { + "epoch": 1.400281778334377, + "grad_norm": 0.6027225852012634, + "learning_rate": 3.13049853372434e-05, + "loss": 0.1988, + "step": 8945 + }, + { + "epoch": 1.4004383218534753, + "grad_norm": 1.1742315292358398, + "learning_rate": 3.129683936135549e-05, + "loss": 0.271, + "step": 8946 + }, + { + "epoch": 1.4005948653725735, + "grad_norm": 1.0689533948898315, + "learning_rate": 3.128869338546758e-05, + "loss": 0.2823, + "step": 8947 + }, + { + "epoch": 1.400751408891672, + "grad_norm": 0.8868377804756165, + "learning_rate": 3.1280547409579666e-05, + "loss": 0.2189, + "step": 8948 + }, + { + "epoch": 1.4009079524107702, + "grad_norm": 1.4185199737548828, + "learning_rate": 3.127240143369176e-05, + "loss": 
0.2753, + "step": 8949 + }, + { + "epoch": 1.4010644959298686, + "grad_norm": 1.094377040863037, + "learning_rate": 3.126425545780385e-05, + "loss": 0.3197, + "step": 8950 + }, + { + "epoch": 1.4012210394489668, + "grad_norm": 1.5287457704544067, + "learning_rate": 3.125610948191594e-05, + "loss": 0.3899, + "step": 8951 + }, + { + "epoch": 1.401377582968065, + "grad_norm": 0.733588457107544, + "learning_rate": 3.124796350602802e-05, + "loss": 0.2016, + "step": 8952 + }, + { + "epoch": 1.4015341264871635, + "grad_norm": 1.331459879875183, + "learning_rate": 3.123981753014011e-05, + "loss": 0.2432, + "step": 8953 + }, + { + "epoch": 1.4016906700062617, + "grad_norm": 2.5502333641052246, + "learning_rate": 3.12316715542522e-05, + "loss": 0.5029, + "step": 8954 + }, + { + "epoch": 1.4018472135253601, + "grad_norm": 2.688525676727295, + "learning_rate": 3.1223525578364286e-05, + "loss": 1.0576, + "step": 8955 + }, + { + "epoch": 1.4020037570444583, + "grad_norm": 4.307522773742676, + "learning_rate": 3.121537960247638e-05, + "loss": 0.5837, + "step": 8956 + }, + { + "epoch": 1.4021603005635566, + "grad_norm": 5.109297752380371, + "learning_rate": 3.120723362658847e-05, + "loss": 0.5771, + "step": 8957 + }, + { + "epoch": 1.402316844082655, + "grad_norm": 1.1165077686309814, + "learning_rate": 3.119908765070055e-05, + "loss": 0.3171, + "step": 8958 + }, + { + "epoch": 1.4024733876017532, + "grad_norm": 2.3640666007995605, + "learning_rate": 3.119094167481265e-05, + "loss": 0.6225, + "step": 8959 + }, + { + "epoch": 1.4026299311208517, + "grad_norm": 3.679969310760498, + "learning_rate": 3.118279569892473e-05, + "loss": 0.5093, + "step": 8960 + }, + { + "epoch": 1.4027864746399499, + "grad_norm": 4.957012176513672, + "learning_rate": 3.117464972303682e-05, + "loss": 1.1513, + "step": 8961 + }, + { + "epoch": 1.402943018159048, + "grad_norm": 2.3617074489593506, + "learning_rate": 3.116650374714891e-05, + "loss": 0.603, + "step": 8962 + }, + { + "epoch": 
1.4030995616781465, + "grad_norm": 2.2772409915924072, + "learning_rate": 3.1158357771261e-05, + "loss": 0.8247, + "step": 8963 + }, + { + "epoch": 1.4032561051972448, + "grad_norm": 1.7749269008636475, + "learning_rate": 3.115021179537309e-05, + "loss": 0.6589, + "step": 8964 + }, + { + "epoch": 1.4034126487163432, + "grad_norm": 1.642035961151123, + "learning_rate": 3.114206581948518e-05, + "loss": 0.2885, + "step": 8965 + }, + { + "epoch": 1.4035691922354414, + "grad_norm": 1.614306926727295, + "learning_rate": 3.113391984359726e-05, + "loss": 0.7919, + "step": 8966 + }, + { + "epoch": 1.4037257357545396, + "grad_norm": 1.9646732807159424, + "learning_rate": 3.112577386770935e-05, + "loss": 0.6213, + "step": 8967 + }, + { + "epoch": 1.403882279273638, + "grad_norm": 1.5945091247558594, + "learning_rate": 3.111762789182144e-05, + "loss": 0.4714, + "step": 8968 + }, + { + "epoch": 1.4040388227927365, + "grad_norm": 1.4441488981246948, + "learning_rate": 3.1109481915933533e-05, + "loss": 0.2991, + "step": 8969 + }, + { + "epoch": 1.4041953663118347, + "grad_norm": 1.244579553604126, + "learning_rate": 3.110133594004562e-05, + "loss": 0.2719, + "step": 8970 + }, + { + "epoch": 1.404351909830933, + "grad_norm": 2.1998507976531982, + "learning_rate": 3.109318996415771e-05, + "loss": 0.7738, + "step": 8971 + }, + { + "epoch": 1.4045084533500314, + "grad_norm": 2.833699941635132, + "learning_rate": 3.10850439882698e-05, + "loss": 0.4758, + "step": 8972 + }, + { + "epoch": 1.4046649968691296, + "grad_norm": 2.6063220500946045, + "learning_rate": 3.107689801238188e-05, + "loss": 0.8675, + "step": 8973 + }, + { + "epoch": 1.404821540388228, + "grad_norm": 2.490124225616455, + "learning_rate": 3.106875203649397e-05, + "loss": 0.8539, + "step": 8974 + }, + { + "epoch": 1.4049780839073263, + "grad_norm": 4.0373640060424805, + "learning_rate": 3.106060606060606e-05, + "loss": 0.7727, + "step": 8975 + }, + { + "epoch": 1.4051346274264245, + "grad_norm": 5.780215263366699, + 
"learning_rate": 3.105246008471815e-05, + "loss": 1.2966, + "step": 8976 + }, + { + "epoch": 1.405291170945523, + "grad_norm": 2.7906606197357178, + "learning_rate": 3.1044314108830244e-05, + "loss": 1.2056, + "step": 8977 + }, + { + "epoch": 1.4054477144646211, + "grad_norm": 2.0503833293914795, + "learning_rate": 3.103616813294233e-05, + "loss": 0.8603, + "step": 8978 + }, + { + "epoch": 1.4056042579837196, + "grad_norm": 5.000894069671631, + "learning_rate": 3.102802215705441e-05, + "loss": 1.5653, + "step": 8979 + }, + { + "epoch": 1.4057608015028178, + "grad_norm": 2.071892023086548, + "learning_rate": 3.101987618116651e-05, + "loss": 0.6505, + "step": 8980 + }, + { + "epoch": 1.405917345021916, + "grad_norm": 2.4467055797576904, + "learning_rate": 3.101173020527859e-05, + "loss": 1.5097, + "step": 8981 + }, + { + "epoch": 1.4060738885410144, + "grad_norm": 4.3751959800720215, + "learning_rate": 3.100358422939068e-05, + "loss": 1.259, + "step": 8982 + }, + { + "epoch": 1.4062304320601127, + "grad_norm": 2.2510879039764404, + "learning_rate": 3.0995438253502774e-05, + "loss": 1.1119, + "step": 8983 + }, + { + "epoch": 1.406386975579211, + "grad_norm": 2.851996660232544, + "learning_rate": 3.098729227761486e-05, + "loss": 0.7036, + "step": 8984 + }, + { + "epoch": 1.4065435190983093, + "grad_norm": 1.2599716186523438, + "learning_rate": 3.097914630172695e-05, + "loss": 0.1924, + "step": 8985 + }, + { + "epoch": 1.4067000626174075, + "grad_norm": 1.8087093830108643, + "learning_rate": 3.097100032583904e-05, + "loss": 0.3797, + "step": 8986 + }, + { + "epoch": 1.406856606136506, + "grad_norm": 2.4047012329101562, + "learning_rate": 3.096285434995113e-05, + "loss": 0.7205, + "step": 8987 + }, + { + "epoch": 1.4070131496556042, + "grad_norm": 2.2337403297424316, + "learning_rate": 3.095470837406321e-05, + "loss": 0.4221, + "step": 8988 + }, + { + "epoch": 1.4071696931747026, + "grad_norm": 0.527746856212616, + "learning_rate": 3.09465623981753e-05, + "loss": 0.2202, 
+ "step": 8989 + }, + { + "epoch": 1.4073262366938009, + "grad_norm": 0.5715660452842712, + "learning_rate": 3.0938416422287394e-05, + "loss": 0.2476, + "step": 8990 + }, + { + "epoch": 1.407482780212899, + "grad_norm": 0.5027009844779968, + "learning_rate": 3.093027044639948e-05, + "loss": 0.2154, + "step": 8991 + }, + { + "epoch": 1.4076393237319975, + "grad_norm": 1.2513927221298218, + "learning_rate": 3.092212447051157e-05, + "loss": 0.3245, + "step": 8992 + }, + { + "epoch": 1.4077958672510957, + "grad_norm": 0.6560338139533997, + "learning_rate": 3.091397849462366e-05, + "loss": 0.2688, + "step": 8993 + }, + { + "epoch": 1.4079524107701942, + "grad_norm": 1.456520915031433, + "learning_rate": 3.090583251873574e-05, + "loss": 0.3726, + "step": 8994 + }, + { + "epoch": 1.4081089542892924, + "grad_norm": 0.8152646422386169, + "learning_rate": 3.089768654284784e-05, + "loss": 0.2702, + "step": 8995 + }, + { + "epoch": 1.4082654978083906, + "grad_norm": 0.8101275563240051, + "learning_rate": 3.0889540566959923e-05, + "loss": 0.2936, + "step": 8996 + }, + { + "epoch": 1.408422041327489, + "grad_norm": 1.224259614944458, + "learning_rate": 3.088139459107201e-05, + "loss": 0.2653, + "step": 8997 + }, + { + "epoch": 1.4085785848465875, + "grad_norm": 0.6847306489944458, + "learning_rate": 3.0873248615184104e-05, + "loss": 0.2385, + "step": 8998 + }, + { + "epoch": 1.4087351283656857, + "grad_norm": 0.9370330572128296, + "learning_rate": 3.086510263929619e-05, + "loss": 0.2277, + "step": 8999 + }, + { + "epoch": 1.408891671884784, + "grad_norm": 0.8652722239494324, + "learning_rate": 3.085695666340828e-05, + "loss": 0.2713, + "step": 9000 + }, + { + "epoch": 1.408891671884784, + "eval_loss": 0.4808903932571411, + "eval_runtime": 203.3122, + "eval_samples_per_second": 60.906, + "eval_steps_per_second": 3.807, + "eval_wer": 0.30645812525235355, + "step": 9000 + }, + { + "epoch": 1.4090482154038821, + "grad_norm": 1.5482051372528076, + "learning_rate": 
3.084881068752037e-05, + "loss": 0.2616, + "step": 9001 + }, + { + "epoch": 1.4092047589229806, + "grad_norm": 1.5386277437210083, + "learning_rate": 3.084066471163245e-05, + "loss": 0.5191, + "step": 9002 + }, + { + "epoch": 1.409361302442079, + "grad_norm": 1.0946321487426758, + "learning_rate": 3.0832518735744544e-05, + "loss": 0.22, + "step": 9003 + }, + { + "epoch": 1.4095178459611772, + "grad_norm": 1.2785476446151733, + "learning_rate": 3.0824372759856634e-05, + "loss": 0.4555, + "step": 9004 + }, + { + "epoch": 1.4096743894802755, + "grad_norm": 0.9731870293617249, + "learning_rate": 3.0816226783968725e-05, + "loss": 0.3483, + "step": 9005 + }, + { + "epoch": 1.409830932999374, + "grad_norm": 2.129300832748413, + "learning_rate": 3.080808080808081e-05, + "loss": 0.4928, + "step": 9006 + }, + { + "epoch": 1.4099874765184721, + "grad_norm": 1.09791898727417, + "learning_rate": 3.07999348321929e-05, + "loss": 0.4208, + "step": 9007 + }, + { + "epoch": 1.4101440200375706, + "grad_norm": 1.248123049736023, + "learning_rate": 3.079178885630499e-05, + "loss": 0.3159, + "step": 9008 + }, + { + "epoch": 1.4103005635566688, + "grad_norm": 1.874197244644165, + "learning_rate": 3.078364288041707e-05, + "loss": 0.4795, + "step": 9009 + }, + { + "epoch": 1.410457107075767, + "grad_norm": 1.6105743646621704, + "learning_rate": 3.0775496904529164e-05, + "loss": 0.4415, + "step": 9010 + }, + { + "epoch": 1.4106136505948654, + "grad_norm": 2.211848735809326, + "learning_rate": 3.0767350928641254e-05, + "loss": 0.4729, + "step": 9011 + }, + { + "epoch": 1.4107701941139636, + "grad_norm": 1.4950428009033203, + "learning_rate": 3.075920495275334e-05, + "loss": 0.4253, + "step": 9012 + }, + { + "epoch": 1.410926737633062, + "grad_norm": 1.7181389331817627, + "learning_rate": 3.0751058976865435e-05, + "loss": 0.2175, + "step": 9013 + }, + { + "epoch": 1.4110832811521603, + "grad_norm": 2.2796618938446045, + "learning_rate": 3.074291300097752e-05, + "loss": 0.7232, + "step": 9014 
+ }, + { + "epoch": 1.4112398246712585, + "grad_norm": 3.1354904174804688, + "learning_rate": 3.07347670250896e-05, + "loss": 0.7223, + "step": 9015 + }, + { + "epoch": 1.411396368190357, + "grad_norm": 1.6996370553970337, + "learning_rate": 3.07266210492017e-05, + "loss": 0.6496, + "step": 9016 + }, + { + "epoch": 1.4115529117094552, + "grad_norm": 2.103088617324829, + "learning_rate": 3.0718475073313784e-05, + "loss": 0.3178, + "step": 9017 + }, + { + "epoch": 1.4117094552285536, + "grad_norm": 4.790323734283447, + "learning_rate": 3.0710329097425874e-05, + "loss": 1.1483, + "step": 9018 + }, + { + "epoch": 1.4118659987476518, + "grad_norm": 2.648101329803467, + "learning_rate": 3.0702183121537965e-05, + "loss": 0.4541, + "step": 9019 + }, + { + "epoch": 1.41202254226675, + "grad_norm": 2.3575282096862793, + "learning_rate": 3.069403714565005e-05, + "loss": 0.4837, + "step": 9020 + }, + { + "epoch": 1.4121790857858485, + "grad_norm": 1.682194471359253, + "learning_rate": 3.068589116976214e-05, + "loss": 0.638, + "step": 9021 + }, + { + "epoch": 1.4123356293049467, + "grad_norm": 6.3118367195129395, + "learning_rate": 3.067774519387423e-05, + "loss": 1.1992, + "step": 9022 + }, + { + "epoch": 1.4124921728240452, + "grad_norm": 3.296771287918091, + "learning_rate": 3.066959921798631e-05, + "loss": 0.767, + "step": 9023 + }, + { + "epoch": 1.4126487163431434, + "grad_norm": 3.878030300140381, + "learning_rate": 3.0661453242098404e-05, + "loss": 1.1898, + "step": 9024 + }, + { + "epoch": 1.4128052598622416, + "grad_norm": 4.581468105316162, + "learning_rate": 3.0653307266210494e-05, + "loss": 1.0822, + "step": 9025 + }, + { + "epoch": 1.41296180338134, + "grad_norm": 5.137700080871582, + "learning_rate": 3.0645161290322585e-05, + "loss": 0.8629, + "step": 9026 + }, + { + "epoch": 1.4131183469004382, + "grad_norm": 4.86765193939209, + "learning_rate": 3.063701531443467e-05, + "loss": 1.0459, + "step": 9027 + }, + { + "epoch": 1.4132748904195367, + "grad_norm": 
4.254866123199463, + "learning_rate": 3.062886933854676e-05, + "loss": 0.8728, + "step": 9028 + }, + { + "epoch": 1.413431433938635, + "grad_norm": 2.5001885890960693, + "learning_rate": 3.062072336265885e-05, + "loss": 0.9167, + "step": 9029 + }, + { + "epoch": 1.4135879774577331, + "grad_norm": 6.236412048339844, + "learning_rate": 3.0612577386770934e-05, + "loss": 1.0733, + "step": 9030 + }, + { + "epoch": 1.4137445209768316, + "grad_norm": 4.3497443199157715, + "learning_rate": 3.060443141088303e-05, + "loss": 1.2791, + "step": 9031 + }, + { + "epoch": 1.41390106449593, + "grad_norm": 2.581913948059082, + "learning_rate": 3.0596285434995115e-05, + "loss": 0.8993, + "step": 9032 + }, + { + "epoch": 1.4140576080150282, + "grad_norm": 7.192967891693115, + "learning_rate": 3.05881394591072e-05, + "loss": 1.1322, + "step": 9033 + }, + { + "epoch": 1.4142141515341264, + "grad_norm": 1.6219629049301147, + "learning_rate": 3.0579993483219296e-05, + "loss": 0.4588, + "step": 9034 + }, + { + "epoch": 1.4143706950532247, + "grad_norm": 5.249449729919434, + "learning_rate": 3.057184750733138e-05, + "loss": 1.288, + "step": 9035 + }, + { + "epoch": 1.414527238572323, + "grad_norm": 5.499079704284668, + "learning_rate": 3.056370153144347e-05, + "loss": 0.7872, + "step": 9036 + }, + { + "epoch": 1.4146837820914215, + "grad_norm": 5.29900598526001, + "learning_rate": 3.055555555555556e-05, + "loss": 1.2769, + "step": 9037 + }, + { + "epoch": 1.4148403256105198, + "grad_norm": 3.1568527221679688, + "learning_rate": 3.0547409579667644e-05, + "loss": 1.1372, + "step": 9038 + }, + { + "epoch": 1.414996869129618, + "grad_norm": 0.4554528295993805, + "learning_rate": 3.0539263603779735e-05, + "loss": 0.1811, + "step": 9039 + }, + { + "epoch": 1.4151534126487164, + "grad_norm": 0.712009072303772, + "learning_rate": 3.0531117627891825e-05, + "loss": 0.1812, + "step": 9040 + }, + { + "epoch": 1.4153099561678146, + "grad_norm": 1.2593419551849365, + "learning_rate": 
3.052297165200391e-05, + "loss": 0.2383, + "step": 9041 + }, + { + "epoch": 1.415466499686913, + "grad_norm": 0.4085516035556793, + "learning_rate": 3.0514825676116e-05, + "loss": 0.2026, + "step": 9042 + }, + { + "epoch": 1.4156230432060113, + "grad_norm": 0.5390114784240723, + "learning_rate": 3.050667970022809e-05, + "loss": 0.2025, + "step": 9043 + }, + { + "epoch": 1.4157795867251095, + "grad_norm": 0.40586188435554504, + "learning_rate": 3.0498533724340177e-05, + "loss": 0.1872, + "step": 9044 + }, + { + "epoch": 1.415936130244208, + "grad_norm": 0.7489004135131836, + "learning_rate": 3.0490387748452264e-05, + "loss": 0.3079, + "step": 9045 + }, + { + "epoch": 1.4160926737633062, + "grad_norm": 0.8743011951446533, + "learning_rate": 3.0482241772564358e-05, + "loss": 0.2453, + "step": 9046 + }, + { + "epoch": 1.4162492172824046, + "grad_norm": 0.7281467914581299, + "learning_rate": 3.0474095796676445e-05, + "loss": 0.2101, + "step": 9047 + }, + { + "epoch": 1.4164057608015028, + "grad_norm": 0.8589158654212952, + "learning_rate": 3.046594982078853e-05, + "loss": 0.2746, + "step": 9048 + }, + { + "epoch": 1.416562304320601, + "grad_norm": 1.3603335618972778, + "learning_rate": 3.0457803844900623e-05, + "loss": 0.1357, + "step": 9049 + }, + { + "epoch": 1.4167188478396995, + "grad_norm": 1.0831230878829956, + "learning_rate": 3.044965786901271e-05, + "loss": 0.1631, + "step": 9050 + }, + { + "epoch": 1.4168753913587977, + "grad_norm": 0.8930145502090454, + "learning_rate": 3.0441511893124797e-05, + "loss": 0.216, + "step": 9051 + }, + { + "epoch": 1.4170319348778961, + "grad_norm": 1.2302227020263672, + "learning_rate": 3.0433365917236888e-05, + "loss": 0.449, + "step": 9052 + }, + { + "epoch": 1.4171884783969944, + "grad_norm": 1.6655604839324951, + "learning_rate": 3.0425219941348975e-05, + "loss": 0.5576, + "step": 9053 + }, + { + "epoch": 1.4173450219160926, + "grad_norm": 0.9307979941368103, + "learning_rate": 3.0417073965461062e-05, + "loss": 0.3864, + 
"step": 9054 + }, + { + "epoch": 1.417501565435191, + "grad_norm": 2.022871255874634, + "learning_rate": 3.0408927989573156e-05, + "loss": 0.53, + "step": 9055 + }, + { + "epoch": 1.4176581089542892, + "grad_norm": 1.069441556930542, + "learning_rate": 3.0400782013685243e-05, + "loss": 0.198, + "step": 9056 + }, + { + "epoch": 1.4178146524733877, + "grad_norm": 1.2228885889053345, + "learning_rate": 3.0392636037797327e-05, + "loss": 0.2889, + "step": 9057 + }, + { + "epoch": 1.4179711959924859, + "grad_norm": 2.6495628356933594, + "learning_rate": 3.038449006190942e-05, + "loss": 0.48, + "step": 9058 + }, + { + "epoch": 1.418127739511584, + "grad_norm": 1.635676622390747, + "learning_rate": 3.0376344086021508e-05, + "loss": 0.2944, + "step": 9059 + }, + { + "epoch": 1.4182842830306825, + "grad_norm": 3.092078685760498, + "learning_rate": 3.0368198110133595e-05, + "loss": 0.3742, + "step": 9060 + }, + { + "epoch": 1.4184408265497808, + "grad_norm": 1.1708624362945557, + "learning_rate": 3.0360052134245686e-05, + "loss": 0.3558, + "step": 9061 + }, + { + "epoch": 1.4185973700688792, + "grad_norm": 1.4633182287216187, + "learning_rate": 3.0351906158357773e-05, + "loss": 0.4758, + "step": 9062 + }, + { + "epoch": 1.4187539135879774, + "grad_norm": 2.0803744792938232, + "learning_rate": 3.034376018246986e-05, + "loss": 0.5435, + "step": 9063 + }, + { + "epoch": 1.4189104571070756, + "grad_norm": 1.491407036781311, + "learning_rate": 3.0335614206581954e-05, + "loss": 0.3708, + "step": 9064 + }, + { + "epoch": 1.419067000626174, + "grad_norm": 2.870185375213623, + "learning_rate": 3.032746823069404e-05, + "loss": 0.6666, + "step": 9065 + }, + { + "epoch": 1.4192235441452725, + "grad_norm": 1.5418882369995117, + "learning_rate": 3.0319322254806125e-05, + "loss": 0.4102, + "step": 9066 + }, + { + "epoch": 1.4193800876643707, + "grad_norm": 2.8943469524383545, + "learning_rate": 3.031117627891822e-05, + "loss": 0.7711, + "step": 9067 + }, + { + "epoch": 1.419536631183469, + 
"grad_norm": 1.8805248737335205, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.4828, + "step": 9068 + }, + { + "epoch": 1.4196931747025674, + "grad_norm": 2.670583724975586, + "learning_rate": 3.0294884327142393e-05, + "loss": 0.8719, + "step": 9069 + }, + { + "epoch": 1.4198497182216656, + "grad_norm": 3.372076988220215, + "learning_rate": 3.0286738351254483e-05, + "loss": 1.1156, + "step": 9070 + }, + { + "epoch": 1.420006261740764, + "grad_norm": 3.2165441513061523, + "learning_rate": 3.027859237536657e-05, + "loss": 0.6949, + "step": 9071 + }, + { + "epoch": 1.4201628052598623, + "grad_norm": 2.7363505363464355, + "learning_rate": 3.0270446399478658e-05, + "loss": 0.6279, + "step": 9072 + }, + { + "epoch": 1.4203193487789605, + "grad_norm": 4.877222061157227, + "learning_rate": 3.026230042359075e-05, + "loss": 0.6296, + "step": 9073 + }, + { + "epoch": 1.420475892298059, + "grad_norm": 2.801483392715454, + "learning_rate": 3.025415444770284e-05, + "loss": 0.5959, + "step": 9074 + }, + { + "epoch": 1.4206324358171571, + "grad_norm": 2.3421237468719482, + "learning_rate": 3.0246008471814922e-05, + "loss": 0.6689, + "step": 9075 + }, + { + "epoch": 1.4207889793362556, + "grad_norm": 2.3915936946868896, + "learning_rate": 3.0237862495927016e-05, + "loss": 0.7549, + "step": 9076 + }, + { + "epoch": 1.4209455228553538, + "grad_norm": 4.086336135864258, + "learning_rate": 3.0229716520039104e-05, + "loss": 0.5153, + "step": 9077 + }, + { + "epoch": 1.421102066374452, + "grad_norm": 2.5420982837677, + "learning_rate": 3.022157054415119e-05, + "loss": 0.9628, + "step": 9078 + }, + { + "epoch": 1.4212586098935505, + "grad_norm": 4.087306976318359, + "learning_rate": 3.021342456826328e-05, + "loss": 1.025, + "step": 9079 + }, + { + "epoch": 1.4214151534126487, + "grad_norm": 2.9391086101531982, + "learning_rate": 3.0205278592375368e-05, + "loss": 1.0645, + "step": 9080 + }, + { + "epoch": 1.4215716969317471, + "grad_norm": 2.6404354572296143, + "learning_rate": 
3.0197132616487455e-05, + "loss": 0.7829, + "step": 9081 + }, + { + "epoch": 1.4217282404508453, + "grad_norm": 2.3348071575164795, + "learning_rate": 3.0188986640599543e-05, + "loss": 1.0794, + "step": 9082 + }, + { + "epoch": 1.4218847839699436, + "grad_norm": 2.7366585731506348, + "learning_rate": 3.0180840664711636e-05, + "loss": 1.5445, + "step": 9083 + }, + { + "epoch": 1.422041327489042, + "grad_norm": 1.505448579788208, + "learning_rate": 3.017269468882372e-05, + "loss": 0.3957, + "step": 9084 + }, + { + "epoch": 1.4221978710081402, + "grad_norm": 2.58994197845459, + "learning_rate": 3.0164548712935807e-05, + "loss": 0.6675, + "step": 9085 + }, + { + "epoch": 1.4223544145272387, + "grad_norm": 1.6133825778961182, + "learning_rate": 3.01564027370479e-05, + "loss": 0.4711, + "step": 9086 + }, + { + "epoch": 1.4225109580463369, + "grad_norm": 2.328418016433716, + "learning_rate": 3.014825676115999e-05, + "loss": 0.4129, + "step": 9087 + }, + { + "epoch": 1.422667501565435, + "grad_norm": 1.7624003887176514, + "learning_rate": 3.0140110785272076e-05, + "loss": 0.8112, + "step": 9088 + }, + { + "epoch": 1.4228240450845335, + "grad_norm": 0.4680425822734833, + "learning_rate": 3.0131964809384166e-05, + "loss": 0.2165, + "step": 9089 + }, + { + "epoch": 1.4229805886036317, + "grad_norm": 0.5908309817314148, + "learning_rate": 3.0123818833496253e-05, + "loss": 0.2339, + "step": 9090 + }, + { + "epoch": 1.4231371321227302, + "grad_norm": 0.5292312502861023, + "learning_rate": 3.011567285760834e-05, + "loss": 0.2774, + "step": 9091 + }, + { + "epoch": 1.4232936756418284, + "grad_norm": 0.4805545210838318, + "learning_rate": 3.010752688172043e-05, + "loss": 0.2403, + "step": 9092 + }, + { + "epoch": 1.4234502191609266, + "grad_norm": 1.0807852745056152, + "learning_rate": 3.0099380905832518e-05, + "loss": 0.3111, + "step": 9093 + }, + { + "epoch": 1.423606762680025, + "grad_norm": 2.3920114040374756, + "learning_rate": 3.0091234929944605e-05, + "loss": 0.3481, + 
"step": 9094 + }, + { + "epoch": 1.4237633061991233, + "grad_norm": 0.9107086658477783, + "learning_rate": 3.00830889540567e-05, + "loss": 0.2408, + "step": 9095 + }, + { + "epoch": 1.4239198497182217, + "grad_norm": 1.0763477087020874, + "learning_rate": 3.0074942978168786e-05, + "loss": 0.2649, + "step": 9096 + }, + { + "epoch": 1.42407639323732, + "grad_norm": 0.7184234261512756, + "learning_rate": 3.0066797002280873e-05, + "loss": 0.2415, + "step": 9097 + }, + { + "epoch": 1.4242329367564182, + "grad_norm": 3.1756181716918945, + "learning_rate": 3.0058651026392964e-05, + "loss": 0.4312, + "step": 9098 + }, + { + "epoch": 1.4243894802755166, + "grad_norm": 0.4726775288581848, + "learning_rate": 3.005050505050505e-05, + "loss": 0.1614, + "step": 9099 + }, + { + "epoch": 1.424546023794615, + "grad_norm": 0.5586493611335754, + "learning_rate": 3.0042359074617138e-05, + "loss": 0.2066, + "step": 9100 + }, + { + "epoch": 1.4247025673137133, + "grad_norm": 2.3587839603424072, + "learning_rate": 3.003421309872923e-05, + "loss": 0.3715, + "step": 9101 + }, + { + "epoch": 1.4248591108328115, + "grad_norm": 1.3185144662857056, + "learning_rate": 3.0026067122841316e-05, + "loss": 0.3625, + "step": 9102 + }, + { + "epoch": 1.42501565435191, + "grad_norm": 1.019474983215332, + "learning_rate": 3.0017921146953403e-05, + "loss": 0.2697, + "step": 9103 + }, + { + "epoch": 1.4251721978710081, + "grad_norm": 1.4281975030899048, + "learning_rate": 3.0009775171065497e-05, + "loss": 0.3667, + "step": 9104 + }, + { + "epoch": 1.4253287413901066, + "grad_norm": 1.5011413097381592, + "learning_rate": 3.0001629195177584e-05, + "loss": 0.35, + "step": 9105 + }, + { + "epoch": 1.4254852849092048, + "grad_norm": 1.168835997581482, + "learning_rate": 2.999348321928967e-05, + "loss": 0.4655, + "step": 9106 + }, + { + "epoch": 1.425641828428303, + "grad_norm": 1.7899309396743774, + "learning_rate": 2.998533724340176e-05, + "loss": 0.4262, + "step": 9107 + }, + { + "epoch": 1.4257983719474014, 
+ "grad_norm": 3.021923780441284, + "learning_rate": 2.997719126751385e-05, + "loss": 0.5963, + "step": 9108 + }, + { + "epoch": 1.4259549154664997, + "grad_norm": 1.4983525276184082, + "learning_rate": 2.9969045291625936e-05, + "loss": 0.5479, + "step": 9109 + }, + { + "epoch": 1.426111458985598, + "grad_norm": 0.9858981966972351, + "learning_rate": 2.9960899315738026e-05, + "loss": 0.328, + "step": 9110 + }, + { + "epoch": 1.4262680025046963, + "grad_norm": 3.7139532566070557, + "learning_rate": 2.9952753339850114e-05, + "loss": 0.7471, + "step": 9111 + }, + { + "epoch": 1.4264245460237945, + "grad_norm": 1.7445214986801147, + "learning_rate": 2.99446073639622e-05, + "loss": 0.711, + "step": 9112 + }, + { + "epoch": 1.426581089542893, + "grad_norm": 1.431089162826538, + "learning_rate": 2.9936461388074295e-05, + "loss": 0.5573, + "step": 9113 + }, + { + "epoch": 1.4267376330619912, + "grad_norm": 1.9931241273880005, + "learning_rate": 2.9928315412186382e-05, + "loss": 0.376, + "step": 9114 + }, + { + "epoch": 1.4268941765810896, + "grad_norm": 2.0402958393096924, + "learning_rate": 2.992016943629847e-05, + "loss": 0.5209, + "step": 9115 + }, + { + "epoch": 1.4270507201001879, + "grad_norm": 2.283554792404175, + "learning_rate": 2.991202346041056e-05, + "loss": 0.2765, + "step": 9116 + }, + { + "epoch": 1.427207263619286, + "grad_norm": 2.4228768348693848, + "learning_rate": 2.9903877484522647e-05, + "loss": 0.5537, + "step": 9117 + }, + { + "epoch": 1.4273638071383845, + "grad_norm": 2.3207266330718994, + "learning_rate": 2.9895731508634734e-05, + "loss": 0.5248, + "step": 9118 + }, + { + "epoch": 1.4275203506574827, + "grad_norm": 4.636921405792236, + "learning_rate": 2.9887585532746824e-05, + "loss": 1.1306, + "step": 9119 + }, + { + "epoch": 1.4276768941765812, + "grad_norm": 1.9744071960449219, + "learning_rate": 2.987943955685891e-05, + "loss": 0.4436, + "step": 9120 + }, + { + "epoch": 1.4278334376956794, + "grad_norm": 3.5793533325195312, + 
"learning_rate": 2.9871293580971e-05, + "loss": 0.7683, + "step": 9121 + }, + { + "epoch": 1.4279899812147776, + "grad_norm": 3.639662981033325, + "learning_rate": 2.9863147605083092e-05, + "loss": 0.83, + "step": 9122 + }, + { + "epoch": 1.428146524733876, + "grad_norm": 2.5103588104248047, + "learning_rate": 2.985500162919518e-05, + "loss": 0.8313, + "step": 9123 + }, + { + "epoch": 1.4283030682529743, + "grad_norm": 4.218875885009766, + "learning_rate": 2.9846855653307267e-05, + "loss": 0.9351, + "step": 9124 + }, + { + "epoch": 1.4284596117720727, + "grad_norm": 2.396580696105957, + "learning_rate": 2.9838709677419357e-05, + "loss": 0.7792, + "step": 9125 + }, + { + "epoch": 1.428616155291171, + "grad_norm": 4.627772331237793, + "learning_rate": 2.9830563701531444e-05, + "loss": 1.0618, + "step": 9126 + }, + { + "epoch": 1.4287726988102691, + "grad_norm": 5.774690628051758, + "learning_rate": 2.982241772564353e-05, + "loss": 0.8566, + "step": 9127 + }, + { + "epoch": 1.4289292423293676, + "grad_norm": 2.34684157371521, + "learning_rate": 2.9814271749755622e-05, + "loss": 1.084, + "step": 9128 + }, + { + "epoch": 1.4290857858484658, + "grad_norm": 2.565448045730591, + "learning_rate": 2.980612577386771e-05, + "loss": 0.8949, + "step": 9129 + }, + { + "epoch": 1.4292423293675642, + "grad_norm": 3.6093413829803467, + "learning_rate": 2.9797979797979796e-05, + "loss": 1.2698, + "step": 9130 + }, + { + "epoch": 1.4293988728866625, + "grad_norm": 2.7198972702026367, + "learning_rate": 2.978983382209189e-05, + "loss": 0.7143, + "step": 9131 + }, + { + "epoch": 1.4295554164057607, + "grad_norm": 4.30855131149292, + "learning_rate": 2.9781687846203977e-05, + "loss": 1.5093, + "step": 9132 + }, + { + "epoch": 1.429711959924859, + "grad_norm": 3.3831164836883545, + "learning_rate": 2.9773541870316065e-05, + "loss": 1.0066, + "step": 9133 + }, + { + "epoch": 1.4298685034439576, + "grad_norm": 1.08523690700531, + "learning_rate": 2.9765395894428155e-05, + "loss": 0.2964, + 
"step": 9134 + }, + { + "epoch": 1.4300250469630558, + "grad_norm": 4.321139335632324, + "learning_rate": 2.9757249918540242e-05, + "loss": 0.7967, + "step": 9135 + }, + { + "epoch": 1.430181590482154, + "grad_norm": 2.1940715312957764, + "learning_rate": 2.974910394265233e-05, + "loss": 0.4076, + "step": 9136 + }, + { + "epoch": 1.4303381340012524, + "grad_norm": 2.9125678539276123, + "learning_rate": 2.974095796676442e-05, + "loss": 0.9213, + "step": 9137 + }, + { + "epoch": 1.4304946775203506, + "grad_norm": 1.8422141075134277, + "learning_rate": 2.9732811990876507e-05, + "loss": 0.9977, + "step": 9138 + }, + { + "epoch": 1.430651221039449, + "grad_norm": 0.5762105584144592, + "learning_rate": 2.9724666014988594e-05, + "loss": 0.1673, + "step": 9139 + }, + { + "epoch": 1.4308077645585473, + "grad_norm": 1.3055166006088257, + "learning_rate": 2.9716520039100688e-05, + "loss": 0.3051, + "step": 9140 + }, + { + "epoch": 1.4309643080776455, + "grad_norm": 0.7812267541885376, + "learning_rate": 2.9708374063212775e-05, + "loss": 0.2346, + "step": 9141 + }, + { + "epoch": 1.431120851596744, + "grad_norm": 0.7346001267433167, + "learning_rate": 2.9700228087324862e-05, + "loss": 0.2268, + "step": 9142 + }, + { + "epoch": 1.4312773951158422, + "grad_norm": 0.3883357346057892, + "learning_rate": 2.9692082111436953e-05, + "loss": 0.2046, + "step": 9143 + }, + { + "epoch": 1.4314339386349406, + "grad_norm": 1.3369841575622559, + "learning_rate": 2.968393613554904e-05, + "loss": 0.4618, + "step": 9144 + }, + { + "epoch": 1.4315904821540388, + "grad_norm": 0.8958072662353516, + "learning_rate": 2.9675790159661127e-05, + "loss": 0.3071, + "step": 9145 + }, + { + "epoch": 1.431747025673137, + "grad_norm": 0.7952312231063843, + "learning_rate": 2.9667644183773218e-05, + "loss": 0.2347, + "step": 9146 + }, + { + "epoch": 1.4319035691922355, + "grad_norm": 0.6530971527099609, + "learning_rate": 2.9659498207885305e-05, + "loss": 0.2039, + "step": 9147 + }, + { + "epoch": 
1.4320601127113337, + "grad_norm": 1.779439091682434, + "learning_rate": 2.9651352231997392e-05, + "loss": 0.4274, + "step": 9148 + }, + { + "epoch": 1.4322166562304322, + "grad_norm": 1.7894961833953857, + "learning_rate": 2.9643206256109486e-05, + "loss": 0.1716, + "step": 9149 + }, + { + "epoch": 1.4323731997495304, + "grad_norm": 0.9964228868484497, + "learning_rate": 2.9635060280221573e-05, + "loss": 0.2807, + "step": 9150 + }, + { + "epoch": 1.4325297432686286, + "grad_norm": 1.6337634325027466, + "learning_rate": 2.9626914304333657e-05, + "loss": 0.5161, + "step": 9151 + }, + { + "epoch": 1.432686286787727, + "grad_norm": 1.0451078414916992, + "learning_rate": 2.961876832844575e-05, + "loss": 0.3897, + "step": 9152 + }, + { + "epoch": 1.4328428303068252, + "grad_norm": 2.4196879863739014, + "learning_rate": 2.9610622352557838e-05, + "loss": 0.4885, + "step": 9153 + }, + { + "epoch": 1.4329993738259237, + "grad_norm": 1.1097357273101807, + "learning_rate": 2.9602476376669925e-05, + "loss": 0.2759, + "step": 9154 + }, + { + "epoch": 1.433155917345022, + "grad_norm": 0.7595136165618896, + "learning_rate": 2.9594330400782015e-05, + "loss": 0.1708, + "step": 9155 + }, + { + "epoch": 1.4333124608641201, + "grad_norm": 0.9571323990821838, + "learning_rate": 2.9586184424894103e-05, + "loss": 0.2468, + "step": 9156 + }, + { + "epoch": 1.4334690043832186, + "grad_norm": 0.9999768137931824, + "learning_rate": 2.957803844900619e-05, + "loss": 0.3251, + "step": 9157 + }, + { + "epoch": 1.4336255479023168, + "grad_norm": 1.632944941520691, + "learning_rate": 2.9569892473118284e-05, + "loss": 0.4334, + "step": 9158 + }, + { + "epoch": 1.4337820914214152, + "grad_norm": 2.0051565170288086, + "learning_rate": 2.956174649723037e-05, + "loss": 0.3573, + "step": 9159 + }, + { + "epoch": 1.4339386349405134, + "grad_norm": 2.320676326751709, + "learning_rate": 2.9553600521342454e-05, + "loss": 0.5018, + "step": 9160 + }, + { + "epoch": 1.4340951784596117, + "grad_norm": 
0.6792388558387756, + "learning_rate": 2.954545454545455e-05, + "loss": 0.2178, + "step": 9161 + }, + { + "epoch": 1.43425172197871, + "grad_norm": 2.4542124271392822, + "learning_rate": 2.9537308569566636e-05, + "loss": 0.6306, + "step": 9162 + }, + { + "epoch": 1.4344082654978083, + "grad_norm": 0.9374074339866638, + "learning_rate": 2.9529162593678723e-05, + "loss": 0.2962, + "step": 9163 + }, + { + "epoch": 1.4345648090169068, + "grad_norm": 1.982703685760498, + "learning_rate": 2.9521016617790813e-05, + "loss": 0.3849, + "step": 9164 + }, + { + "epoch": 1.434721352536005, + "grad_norm": 3.033521890640259, + "learning_rate": 2.95128706419029e-05, + "loss": 0.4094, + "step": 9165 + }, + { + "epoch": 1.4348778960551032, + "grad_norm": 4.457775592803955, + "learning_rate": 2.9504724666014987e-05, + "loss": 0.6636, + "step": 9166 + }, + { + "epoch": 1.4350344395742016, + "grad_norm": 3.0642588138580322, + "learning_rate": 2.949657869012708e-05, + "loss": 0.7802, + "step": 9167 + }, + { + "epoch": 1.4351909830933, + "grad_norm": 2.540259599685669, + "learning_rate": 2.948843271423917e-05, + "loss": 0.9476, + "step": 9168 + }, + { + "epoch": 1.4353475266123983, + "grad_norm": 1.9001942873001099, + "learning_rate": 2.9480286738351252e-05, + "loss": 0.3654, + "step": 9169 + }, + { + "epoch": 1.4355040701314965, + "grad_norm": 2.8572235107421875, + "learning_rate": 2.9472140762463346e-05, + "loss": 0.4515, + "step": 9170 + }, + { + "epoch": 1.435660613650595, + "grad_norm": 4.509949684143066, + "learning_rate": 2.9463994786575433e-05, + "loss": 0.8566, + "step": 9171 + }, + { + "epoch": 1.4358171571696932, + "grad_norm": 1.5681129693984985, + "learning_rate": 2.945584881068752e-05, + "loss": 0.5666, + "step": 9172 + }, + { + "epoch": 1.4359737006887916, + "grad_norm": 6.305080890655518, + "learning_rate": 2.944770283479961e-05, + "loss": 1.6256, + "step": 9173 + }, + { + "epoch": 1.4361302442078898, + "grad_norm": 5.202556133270264, + "learning_rate": 
2.9439556858911698e-05, + "loss": 0.749, + "step": 9174 + }, + { + "epoch": 1.436286787726988, + "grad_norm": 2.470564603805542, + "learning_rate": 2.9431410883023785e-05, + "loss": 0.8937, + "step": 9175 + }, + { + "epoch": 1.4364433312460865, + "grad_norm": 2.4593448638916016, + "learning_rate": 2.942326490713588e-05, + "loss": 0.4369, + "step": 9176 + }, + { + "epoch": 1.4365998747651847, + "grad_norm": 2.526078224182129, + "learning_rate": 2.9415118931247966e-05, + "loss": 0.4713, + "step": 9177 + }, + { + "epoch": 1.4367564182842831, + "grad_norm": 4.2002644538879395, + "learning_rate": 2.940697295536005e-05, + "loss": 1.1623, + "step": 9178 + }, + { + "epoch": 1.4369129618033814, + "grad_norm": 6.166375160217285, + "learning_rate": 2.9398826979472144e-05, + "loss": 1.1074, + "step": 9179 + }, + { + "epoch": 1.4370695053224796, + "grad_norm": 3.859870195388794, + "learning_rate": 2.939068100358423e-05, + "loss": 0.6767, + "step": 9180 + }, + { + "epoch": 1.437226048841578, + "grad_norm": 2.448218822479248, + "learning_rate": 2.9382535027696318e-05, + "loss": 1.1532, + "step": 9181 + }, + { + "epoch": 1.4373825923606762, + "grad_norm": 3.642277717590332, + "learning_rate": 2.937438905180841e-05, + "loss": 0.883, + "step": 9182 + }, + { + "epoch": 1.4375391358797747, + "grad_norm": 7.94639253616333, + "learning_rate": 2.9366243075920496e-05, + "loss": 0.8742, + "step": 9183 + }, + { + "epoch": 1.4376956793988729, + "grad_norm": 4.167221546173096, + "learning_rate": 2.9358097100032583e-05, + "loss": 0.3545, + "step": 9184 + }, + { + "epoch": 1.437852222917971, + "grad_norm": 2.1714513301849365, + "learning_rate": 2.9349951124144677e-05, + "loss": 0.5516, + "step": 9185 + }, + { + "epoch": 1.4380087664370695, + "grad_norm": 3.682559013366699, + "learning_rate": 2.934180514825676e-05, + "loss": 0.5059, + "step": 9186 + }, + { + "epoch": 1.4381653099561678, + "grad_norm": 2.3684046268463135, + "learning_rate": 2.9333659172368848e-05, + "loss": 0.3442, + "step": 9187 
+ }, + { + "epoch": 1.4383218534752662, + "grad_norm": 3.2412192821502686, + "learning_rate": 2.9325513196480942e-05, + "loss": 0.8471, + "step": 9188 + }, + { + "epoch": 1.4384783969943644, + "grad_norm": 0.9847676157951355, + "learning_rate": 2.931736722059303e-05, + "loss": 0.1993, + "step": 9189 + }, + { + "epoch": 1.4386349405134626, + "grad_norm": 0.8860464096069336, + "learning_rate": 2.9309221244705116e-05, + "loss": 0.2792, + "step": 9190 + }, + { + "epoch": 1.438791484032561, + "grad_norm": 1.2870557308197021, + "learning_rate": 2.9301075268817207e-05, + "loss": 0.4781, + "step": 9191 + }, + { + "epoch": 1.4389480275516593, + "grad_norm": 0.6193233132362366, + "learning_rate": 2.9292929292929294e-05, + "loss": 0.265, + "step": 9192 + }, + { + "epoch": 1.4391045710707577, + "grad_norm": 0.7679431438446045, + "learning_rate": 2.928478331704138e-05, + "loss": 0.3028, + "step": 9193 + }, + { + "epoch": 1.439261114589856, + "grad_norm": 0.5783966779708862, + "learning_rate": 2.9276637341153475e-05, + "loss": 0.1668, + "step": 9194 + }, + { + "epoch": 1.4394176581089542, + "grad_norm": 0.6841817498207092, + "learning_rate": 2.926849136526556e-05, + "loss": 0.2329, + "step": 9195 + }, + { + "epoch": 1.4395742016280526, + "grad_norm": 1.0043625831604004, + "learning_rate": 2.9260345389377646e-05, + "loss": 0.2703, + "step": 9196 + }, + { + "epoch": 1.4397307451471508, + "grad_norm": 0.7629160284996033, + "learning_rate": 2.925219941348974e-05, + "loss": 0.1345, + "step": 9197 + }, + { + "epoch": 1.4398872886662493, + "grad_norm": 0.6363821029663086, + "learning_rate": 2.9244053437601827e-05, + "loss": 0.2231, + "step": 9198 + }, + { + "epoch": 1.4400438321853475, + "grad_norm": 0.7706338167190552, + "learning_rate": 2.9235907461713914e-05, + "loss": 0.3534, + "step": 9199 + }, + { + "epoch": 1.4402003757044457, + "grad_norm": 1.163434386253357, + "learning_rate": 2.9227761485826004e-05, + "loss": 0.2727, + "step": 9200 + }, + { + "epoch": 1.4403569192235441, + 
"grad_norm": 1.0893352031707764, + "learning_rate": 2.921961550993809e-05, + "loss": 0.3908, + "step": 9201 + }, + { + "epoch": 1.4405134627426426, + "grad_norm": 1.4719706773757935, + "learning_rate": 2.921146953405018e-05, + "loss": 0.3717, + "step": 9202 + }, + { + "epoch": 1.4406700062617408, + "grad_norm": 1.9402117729187012, + "learning_rate": 2.9203323558162273e-05, + "loss": 0.2779, + "step": 9203 + }, + { + "epoch": 1.440826549780839, + "grad_norm": 2.5257246494293213, + "learning_rate": 2.9195177582274356e-05, + "loss": 0.3145, + "step": 9204 + }, + { + "epoch": 1.4409830932999375, + "grad_norm": 2.3094327449798584, + "learning_rate": 2.9187031606386443e-05, + "loss": 0.4953, + "step": 9205 + }, + { + "epoch": 1.4411396368190357, + "grad_norm": 2.4022865295410156, + "learning_rate": 2.9178885630498537e-05, + "loss": 0.5402, + "step": 9206 + }, + { + "epoch": 1.4412961803381341, + "grad_norm": 1.5242502689361572, + "learning_rate": 2.9170739654610624e-05, + "loss": 0.496, + "step": 9207 + }, + { + "epoch": 1.4414527238572323, + "grad_norm": 1.4858949184417725, + "learning_rate": 2.916259367872271e-05, + "loss": 0.3253, + "step": 9208 + }, + { + "epoch": 1.4416092673763305, + "grad_norm": 1.9870860576629639, + "learning_rate": 2.9154447702834802e-05, + "loss": 0.4716, + "step": 9209 + }, + { + "epoch": 1.441765810895429, + "grad_norm": 5.004307270050049, + "learning_rate": 2.914630172694689e-05, + "loss": 0.6329, + "step": 9210 + }, + { + "epoch": 1.4419223544145272, + "grad_norm": 2.7749929428100586, + "learning_rate": 2.9138155751058976e-05, + "loss": 0.5109, + "step": 9211 + }, + { + "epoch": 1.4420788979336256, + "grad_norm": 2.263678550720215, + "learning_rate": 2.913000977517107e-05, + "loss": 0.7389, + "step": 9212 + }, + { + "epoch": 1.4422354414527239, + "grad_norm": 1.8763084411621094, + "learning_rate": 2.9121863799283154e-05, + "loss": 0.4236, + "step": 9213 + }, + { + "epoch": 1.442391984971822, + "grad_norm": 2.8621537685394287, + 
"learning_rate": 2.911371782339524e-05, + "loss": 0.9667, + "step": 9214 + }, + { + "epoch": 1.4425485284909205, + "grad_norm": 3.7442240715026855, + "learning_rate": 2.9105571847507335e-05, + "loss": 0.6475, + "step": 9215 + }, + { + "epoch": 1.4427050720100187, + "grad_norm": 6.475198745727539, + "learning_rate": 2.9097425871619422e-05, + "loss": 0.4768, + "step": 9216 + }, + { + "epoch": 1.4428616155291172, + "grad_norm": 3.5459818840026855, + "learning_rate": 2.908927989573151e-05, + "loss": 0.8545, + "step": 9217 + }, + { + "epoch": 1.4430181590482154, + "grad_norm": 2.933454990386963, + "learning_rate": 2.90811339198436e-05, + "loss": 0.6178, + "step": 9218 + }, + { + "epoch": 1.4431747025673136, + "grad_norm": 2.678849935531616, + "learning_rate": 2.9072987943955687e-05, + "loss": 1.2734, + "step": 9219 + }, + { + "epoch": 1.443331246086412, + "grad_norm": 1.3151048421859741, + "learning_rate": 2.9064841968067774e-05, + "loss": 0.3808, + "step": 9220 + }, + { + "epoch": 1.4434877896055103, + "grad_norm": 3.1946184635162354, + "learning_rate": 2.9056695992179868e-05, + "loss": 0.8722, + "step": 9221 + }, + { + "epoch": 1.4436443331246087, + "grad_norm": 3.4977223873138428, + "learning_rate": 2.9048550016291952e-05, + "loss": 0.9594, + "step": 9222 + }, + { + "epoch": 1.443800876643707, + "grad_norm": 3.2593531608581543, + "learning_rate": 2.904040404040404e-05, + "loss": 0.888, + "step": 9223 + }, + { + "epoch": 1.4439574201628051, + "grad_norm": 1.1953721046447754, + "learning_rate": 2.9032258064516133e-05, + "loss": 0.3049, + "step": 9224 + }, + { + "epoch": 1.4441139636819036, + "grad_norm": 2.402891159057617, + "learning_rate": 2.902411208862822e-05, + "loss": 0.7032, + "step": 9225 + }, + { + "epoch": 1.4442705072010018, + "grad_norm": 4.19395637512207, + "learning_rate": 2.9015966112740307e-05, + "loss": 1.0967, + "step": 9226 + }, + { + "epoch": 1.4444270507201002, + "grad_norm": 2.07891845703125, + "learning_rate": 2.9007820136852398e-05, + "loss": 
0.836, + "step": 9227 + }, + { + "epoch": 1.4445835942391985, + "grad_norm": 4.135005474090576, + "learning_rate": 2.8999674160964485e-05, + "loss": 0.8715, + "step": 9228 + }, + { + "epoch": 1.4447401377582967, + "grad_norm": 4.887277603149414, + "learning_rate": 2.8991528185076572e-05, + "loss": 1.052, + "step": 9229 + }, + { + "epoch": 1.4448966812773951, + "grad_norm": 3.5871307849884033, + "learning_rate": 2.8983382209188663e-05, + "loss": 1.7876, + "step": 9230 + }, + { + "epoch": 1.4450532247964936, + "grad_norm": 1.9294942617416382, + "learning_rate": 2.897523623330075e-05, + "loss": 0.7967, + "step": 9231 + }, + { + "epoch": 1.4452097683155918, + "grad_norm": 3.041128158569336, + "learning_rate": 2.8967090257412837e-05, + "loss": 1.0597, + "step": 9232 + }, + { + "epoch": 1.44536631183469, + "grad_norm": 3.8247621059417725, + "learning_rate": 2.895894428152493e-05, + "loss": 1.715, + "step": 9233 + }, + { + "epoch": 1.4455228553537882, + "grad_norm": 3.7648873329162598, + "learning_rate": 2.8950798305637018e-05, + "loss": 0.36, + "step": 9234 + }, + { + "epoch": 1.4456793988728867, + "grad_norm": 2.371906042098999, + "learning_rate": 2.8942652329749105e-05, + "loss": 1.1175, + "step": 9235 + }, + { + "epoch": 1.445835942391985, + "grad_norm": 2.6067895889282227, + "learning_rate": 2.8934506353861196e-05, + "loss": 0.6438, + "step": 9236 + }, + { + "epoch": 1.4459924859110833, + "grad_norm": 3.66300630569458, + "learning_rate": 2.8926360377973283e-05, + "loss": 0.9973, + "step": 9237 + }, + { + "epoch": 1.4461490294301815, + "grad_norm": 3.392850399017334, + "learning_rate": 2.891821440208537e-05, + "loss": 0.8937, + "step": 9238 + }, + { + "epoch": 1.44630557294928, + "grad_norm": 0.5588842034339905, + "learning_rate": 2.891006842619746e-05, + "loss": 0.2487, + "step": 9239 + }, + { + "epoch": 1.4464621164683782, + "grad_norm": 0.46355772018432617, + "learning_rate": 2.8901922450309547e-05, + "loss": 0.1797, + "step": 9240 + }, + { + "epoch": 
1.4466186599874766, + "grad_norm": 2.34989333152771, + "learning_rate": 2.8893776474421635e-05, + "loss": 0.4642, + "step": 9241 + }, + { + "epoch": 1.4467752035065748, + "grad_norm": 0.7578288316726685, + "learning_rate": 2.888563049853373e-05, + "loss": 0.1949, + "step": 9242 + }, + { + "epoch": 1.446931747025673, + "grad_norm": 0.6748387813568115, + "learning_rate": 2.8877484522645816e-05, + "loss": 0.2167, + "step": 9243 + }, + { + "epoch": 1.4470882905447715, + "grad_norm": 0.6108400225639343, + "learning_rate": 2.8869338546757903e-05, + "loss": 0.2024, + "step": 9244 + }, + { + "epoch": 1.4472448340638697, + "grad_norm": 0.6118058562278748, + "learning_rate": 2.8861192570869993e-05, + "loss": 0.1466, + "step": 9245 + }, + { + "epoch": 1.4474013775829682, + "grad_norm": 0.9493047595024109, + "learning_rate": 2.885304659498208e-05, + "loss": 0.1876, + "step": 9246 + }, + { + "epoch": 1.4475579211020664, + "grad_norm": 0.8361440896987915, + "learning_rate": 2.8844900619094168e-05, + "loss": 0.2158, + "step": 9247 + }, + { + "epoch": 1.4477144646211646, + "grad_norm": 1.0443620681762695, + "learning_rate": 2.8836754643206258e-05, + "loss": 0.3271, + "step": 9248 + }, + { + "epoch": 1.447871008140263, + "grad_norm": 0.7602059841156006, + "learning_rate": 2.8828608667318345e-05, + "loss": 0.2629, + "step": 9249 + }, + { + "epoch": 1.4480275516593613, + "grad_norm": 1.2922664880752563, + "learning_rate": 2.8820462691430432e-05, + "loss": 0.2038, + "step": 9250 + }, + { + "epoch": 1.4481840951784597, + "grad_norm": 0.9812444448471069, + "learning_rate": 2.8812316715542526e-05, + "loss": 0.3135, + "step": 9251 + }, + { + "epoch": 1.448340638697558, + "grad_norm": 2.005506753921509, + "learning_rate": 2.8804170739654613e-05, + "loss": 0.5483, + "step": 9252 + }, + { + "epoch": 1.4484971822166561, + "grad_norm": 2.3777196407318115, + "learning_rate": 2.87960247637667e-05, + "loss": 0.4231, + "step": 9253 + }, + { + "epoch": 1.4486537257357546, + "grad_norm": 
1.6938245296478271, + "learning_rate": 2.878787878787879e-05, + "loss": 0.3348, + "step": 9254 + }, + { + "epoch": 1.4488102692548528, + "grad_norm": 0.9166391491889954, + "learning_rate": 2.8779732811990878e-05, + "loss": 0.2812, + "step": 9255 + }, + { + "epoch": 1.4489668127739512, + "grad_norm": 4.673968315124512, + "learning_rate": 2.8771586836102965e-05, + "loss": 0.3419, + "step": 9256 + }, + { + "epoch": 1.4491233562930494, + "grad_norm": 2.5977237224578857, + "learning_rate": 2.8763440860215056e-05, + "loss": 0.33, + "step": 9257 + }, + { + "epoch": 1.4492798998121477, + "grad_norm": 1.6243988275527954, + "learning_rate": 2.8755294884327143e-05, + "loss": 0.5254, + "step": 9258 + }, + { + "epoch": 1.449436443331246, + "grad_norm": 1.4846608638763428, + "learning_rate": 2.874714890843923e-05, + "loss": 0.3154, + "step": 9259 + }, + { + "epoch": 1.4495929868503443, + "grad_norm": 1.820762038230896, + "learning_rate": 2.8739002932551324e-05, + "loss": 0.4353, + "step": 9260 + }, + { + "epoch": 1.4497495303694428, + "grad_norm": 2.874633550643921, + "learning_rate": 2.873085695666341e-05, + "loss": 0.563, + "step": 9261 + }, + { + "epoch": 1.449906073888541, + "grad_norm": 1.2919212579727173, + "learning_rate": 2.87227109807755e-05, + "loss": 0.3199, + "step": 9262 + }, + { + "epoch": 1.4500626174076392, + "grad_norm": 1.1790359020233154, + "learning_rate": 2.871456500488759e-05, + "loss": 0.3714, + "step": 9263 + }, + { + "epoch": 1.4502191609267376, + "grad_norm": 5.12162971496582, + "learning_rate": 2.8706419028999676e-05, + "loss": 0.5627, + "step": 9264 + }, + { + "epoch": 1.450375704445836, + "grad_norm": 1.53943932056427, + "learning_rate": 2.8698273053111763e-05, + "loss": 0.5378, + "step": 9265 + }, + { + "epoch": 1.4505322479649343, + "grad_norm": 2.343928098678589, + "learning_rate": 2.8690127077223854e-05, + "loss": 0.5982, + "step": 9266 + }, + { + "epoch": 1.4506887914840325, + "grad_norm": 2.7053604125976562, + "learning_rate": 
2.868198110133594e-05, + "loss": 0.5334, + "step": 9267 + }, + { + "epoch": 1.4508453350031307, + "grad_norm": 5.6361894607543945, + "learning_rate": 2.8673835125448028e-05, + "loss": 0.6699, + "step": 9268 + }, + { + "epoch": 1.4510018785222292, + "grad_norm": 2.661451816558838, + "learning_rate": 2.8665689149560122e-05, + "loss": 0.5453, + "step": 9269 + }, + { + "epoch": 1.4511584220413276, + "grad_norm": 2.6731269359588623, + "learning_rate": 2.865754317367221e-05, + "loss": 0.8588, + "step": 9270 + }, + { + "epoch": 1.4513149655604258, + "grad_norm": 2.244652032852173, + "learning_rate": 2.8649397197784296e-05, + "loss": 0.7526, + "step": 9271 + }, + { + "epoch": 1.451471509079524, + "grad_norm": 3.198621988296509, + "learning_rate": 2.8641251221896387e-05, + "loss": 0.7247, + "step": 9272 + }, + { + "epoch": 1.4516280525986225, + "grad_norm": 3.856882333755493, + "learning_rate": 2.8633105246008474e-05, + "loss": 0.725, + "step": 9273 + }, + { + "epoch": 1.4517845961177207, + "grad_norm": 3.636338949203491, + "learning_rate": 2.862495927012056e-05, + "loss": 0.648, + "step": 9274 + }, + { + "epoch": 1.4519411396368191, + "grad_norm": 5.877135276794434, + "learning_rate": 2.861681329423265e-05, + "loss": 1.0851, + "step": 9275 + }, + { + "epoch": 1.4520976831559174, + "grad_norm": 6.040339469909668, + "learning_rate": 2.860866731834474e-05, + "loss": 1.3656, + "step": 9276 + }, + { + "epoch": 1.4522542266750156, + "grad_norm": 3.3278396129608154, + "learning_rate": 2.8600521342456826e-05, + "loss": 0.7247, + "step": 9277 + }, + { + "epoch": 1.452410770194114, + "grad_norm": 3.9508719444274902, + "learning_rate": 2.859237536656892e-05, + "loss": 0.8343, + "step": 9278 + }, + { + "epoch": 1.4525673137132122, + "grad_norm": 4.198532581329346, + "learning_rate": 2.8584229390681007e-05, + "loss": 1.3364, + "step": 9279 + }, + { + "epoch": 1.4527238572323107, + "grad_norm": 5.1601338386535645, + "learning_rate": 2.8576083414793094e-05, + "loss": 0.6628, + "step": 
9280 + }, + { + "epoch": 1.452880400751409, + "grad_norm": 3.2475759983062744, + "learning_rate": 2.8567937438905184e-05, + "loss": 0.951, + "step": 9281 + }, + { + "epoch": 1.4530369442705071, + "grad_norm": 2.430426597595215, + "learning_rate": 2.855979146301727e-05, + "loss": 1.3646, + "step": 9282 + }, + { + "epoch": 1.4531934877896056, + "grad_norm": 3.3000223636627197, + "learning_rate": 2.855164548712936e-05, + "loss": 1.4157, + "step": 9283 + }, + { + "epoch": 1.4533500313087038, + "grad_norm": 3.0357372760772705, + "learning_rate": 2.854349951124145e-05, + "loss": 0.4856, + "step": 9284 + }, + { + "epoch": 1.4535065748278022, + "grad_norm": 4.850840091705322, + "learning_rate": 2.8535353535353536e-05, + "loss": 0.9732, + "step": 9285 + }, + { + "epoch": 1.4536631183469004, + "grad_norm": 2.2087607383728027, + "learning_rate": 2.8527207559465624e-05, + "loss": 0.5205, + "step": 9286 + }, + { + "epoch": 1.4538196618659986, + "grad_norm": 2.2640066146850586, + "learning_rate": 2.8519061583577717e-05, + "loss": 0.581, + "step": 9287 + }, + { + "epoch": 1.453976205385097, + "grad_norm": 4.780018329620361, + "learning_rate": 2.8510915607689805e-05, + "loss": 1.2142, + "step": 9288 + }, + { + "epoch": 1.4541327489041953, + "grad_norm": 0.3719369173049927, + "learning_rate": 2.850276963180189e-05, + "loss": 0.1484, + "step": 9289 + }, + { + "epoch": 1.4542892924232937, + "grad_norm": 0.3909616470336914, + "learning_rate": 2.8494623655913982e-05, + "loss": 0.1715, + "step": 9290 + }, + { + "epoch": 1.454445835942392, + "grad_norm": 0.6053410768508911, + "learning_rate": 2.848647768002607e-05, + "loss": 0.2626, + "step": 9291 + }, + { + "epoch": 1.4546023794614902, + "grad_norm": 0.5825343728065491, + "learning_rate": 2.8478331704138157e-05, + "loss": 0.1959, + "step": 9292 + }, + { + "epoch": 1.4547589229805886, + "grad_norm": 0.8655470013618469, + "learning_rate": 2.8470185728250247e-05, + "loss": 0.2153, + "step": 9293 + }, + { + "epoch": 1.4549154664996868, + 
"grad_norm": 0.5999854207038879, + "learning_rate": 2.8462039752362334e-05, + "loss": 0.212, + "step": 9294 + }, + { + "epoch": 1.4550720100187853, + "grad_norm": 0.9253398180007935, + "learning_rate": 2.845389377647442e-05, + "loss": 0.129, + "step": 9295 + }, + { + "epoch": 1.4552285535378835, + "grad_norm": 0.8174847960472107, + "learning_rate": 2.8445747800586515e-05, + "loss": 0.2217, + "step": 9296 + }, + { + "epoch": 1.4553850970569817, + "grad_norm": 0.9726320505142212, + "learning_rate": 2.8437601824698602e-05, + "loss": 0.2527, + "step": 9297 + }, + { + "epoch": 1.4555416405760802, + "grad_norm": 0.9029231667518616, + "learning_rate": 2.8429455848810686e-05, + "loss": 0.2312, + "step": 9298 + }, + { + "epoch": 1.4556981840951786, + "grad_norm": 0.7350387573242188, + "learning_rate": 2.842130987292278e-05, + "loss": 0.1799, + "step": 9299 + }, + { + "epoch": 1.4558547276142768, + "grad_norm": 0.7414048910140991, + "learning_rate": 2.8413163897034867e-05, + "loss": 0.2049, + "step": 9300 + }, + { + "epoch": 1.456011271133375, + "grad_norm": 1.3471629619598389, + "learning_rate": 2.8405017921146954e-05, + "loss": 0.4224, + "step": 9301 + }, + { + "epoch": 1.4561678146524735, + "grad_norm": 1.3038084506988525, + "learning_rate": 2.8396871945259045e-05, + "loss": 0.2181, + "step": 9302 + }, + { + "epoch": 1.4563243581715717, + "grad_norm": 1.1290844678878784, + "learning_rate": 2.8388725969371132e-05, + "loss": 0.3382, + "step": 9303 + }, + { + "epoch": 1.4564809016906701, + "grad_norm": 1.3281996250152588, + "learning_rate": 2.838057999348322e-05, + "loss": 0.2947, + "step": 9304 + }, + { + "epoch": 1.4566374452097683, + "grad_norm": 2.137291193008423, + "learning_rate": 2.8372434017595313e-05, + "loss": 0.5905, + "step": 9305 + }, + { + "epoch": 1.4567939887288666, + "grad_norm": 0.9109092950820923, + "learning_rate": 2.83642880417074e-05, + "loss": 0.1704, + "step": 9306 + }, + { + "epoch": 1.456950532247965, + "grad_norm": 1.2739495038986206, + 
"learning_rate": 2.8356142065819484e-05, + "loss": 0.4393, + "step": 9307 + }, + { + "epoch": 1.4571070757670632, + "grad_norm": 2.31005597114563, + "learning_rate": 2.8347996089931578e-05, + "loss": 0.5686, + "step": 9308 + }, + { + "epoch": 1.4572636192861617, + "grad_norm": 1.236411213874817, + "learning_rate": 2.8339850114043665e-05, + "loss": 0.4153, + "step": 9309 + }, + { + "epoch": 1.4574201628052599, + "grad_norm": 2.265061855316162, + "learning_rate": 2.8331704138155752e-05, + "loss": 0.7149, + "step": 9310 + }, + { + "epoch": 1.457576706324358, + "grad_norm": 2.5484323501586914, + "learning_rate": 2.8323558162267843e-05, + "loss": 0.4994, + "step": 9311 + }, + { + "epoch": 1.4577332498434565, + "grad_norm": 3.536334753036499, + "learning_rate": 2.831541218637993e-05, + "loss": 0.6078, + "step": 9312 + }, + { + "epoch": 1.4578897933625548, + "grad_norm": 3.5708975791931152, + "learning_rate": 2.8307266210492017e-05, + "loss": 0.9152, + "step": 9313 + }, + { + "epoch": 1.4580463368816532, + "grad_norm": 1.6033700704574585, + "learning_rate": 2.829912023460411e-05, + "loss": 0.4537, + "step": 9314 + }, + { + "epoch": 1.4582028804007514, + "grad_norm": 2.6832683086395264, + "learning_rate": 2.8290974258716198e-05, + "loss": 0.3805, + "step": 9315 + }, + { + "epoch": 1.4583594239198496, + "grad_norm": 2.4790542125701904, + "learning_rate": 2.8282828282828282e-05, + "loss": 0.782, + "step": 9316 + }, + { + "epoch": 1.458515967438948, + "grad_norm": 2.280626058578491, + "learning_rate": 2.8274682306940376e-05, + "loss": 0.9516, + "step": 9317 + }, + { + "epoch": 1.4586725109580463, + "grad_norm": 2.699793815612793, + "learning_rate": 2.8266536331052463e-05, + "loss": 0.6725, + "step": 9318 + }, + { + "epoch": 1.4588290544771447, + "grad_norm": 1.6624287366867065, + "learning_rate": 2.825839035516455e-05, + "loss": 0.6612, + "step": 9319 + }, + { + "epoch": 1.458985597996243, + "grad_norm": 3.1686909198760986, + "learning_rate": 2.825024437927664e-05, + "loss": 
1.0206, + "step": 9320 + }, + { + "epoch": 1.4591421415153412, + "grad_norm": 3.8399693965911865, + "learning_rate": 2.8242098403388728e-05, + "loss": 1.2644, + "step": 9321 + }, + { + "epoch": 1.4592986850344396, + "grad_norm": 3.6813082695007324, + "learning_rate": 2.8233952427500815e-05, + "loss": 1.1446, + "step": 9322 + }, + { + "epoch": 1.4594552285535378, + "grad_norm": 4.7325053215026855, + "learning_rate": 2.822580645161291e-05, + "loss": 0.804, + "step": 9323 + }, + { + "epoch": 1.4596117720726363, + "grad_norm": 1.4476512670516968, + "learning_rate": 2.8217660475724992e-05, + "loss": 0.5113, + "step": 9324 + }, + { + "epoch": 1.4597683155917345, + "grad_norm": 2.824132204055786, + "learning_rate": 2.820951449983708e-05, + "loss": 0.4445, + "step": 9325 + }, + { + "epoch": 1.4599248591108327, + "grad_norm": 4.909213542938232, + "learning_rate": 2.8201368523949173e-05, + "loss": 1.4418, + "step": 9326 + }, + { + "epoch": 1.4600814026299311, + "grad_norm": 15.3295316696167, + "learning_rate": 2.819322254806126e-05, + "loss": 1.1783, + "step": 9327 + }, + { + "epoch": 1.4602379461490294, + "grad_norm": 4.676061153411865, + "learning_rate": 2.8185076572173348e-05, + "loss": 1.0219, + "step": 9328 + }, + { + "epoch": 1.4603944896681278, + "grad_norm": 2.019185781478882, + "learning_rate": 2.8176930596285438e-05, + "loss": 0.6977, + "step": 9329 + }, + { + "epoch": 1.460551033187226, + "grad_norm": 7.921292304992676, + "learning_rate": 2.8168784620397525e-05, + "loss": 1.2694, + "step": 9330 + }, + { + "epoch": 1.4607075767063242, + "grad_norm": 3.5613150596618652, + "learning_rate": 2.8160638644509612e-05, + "loss": 1.014, + "step": 9331 + }, + { + "epoch": 1.4608641202254227, + "grad_norm": 6.102621078491211, + "learning_rate": 2.8152492668621706e-05, + "loss": 1.1763, + "step": 9332 + }, + { + "epoch": 1.461020663744521, + "grad_norm": 3.861571788787842, + "learning_rate": 2.814434669273379e-05, + "loss": 1.2769, + "step": 9333 + }, + { + "epoch": 
1.4611772072636193, + "grad_norm": 1.8250223398208618, + "learning_rate": 2.8136200716845877e-05, + "loss": 0.2875, + "step": 9334 + }, + { + "epoch": 1.4613337507827175, + "grad_norm": 4.746304512023926, + "learning_rate": 2.812805474095797e-05, + "loss": 0.8609, + "step": 9335 + }, + { + "epoch": 1.461490294301816, + "grad_norm": 1.728627324104309, + "learning_rate": 2.811990876507006e-05, + "loss": 0.7292, + "step": 9336 + }, + { + "epoch": 1.4616468378209142, + "grad_norm": 2.1702768802642822, + "learning_rate": 2.8111762789182145e-05, + "loss": 0.3607, + "step": 9337 + }, + { + "epoch": 1.4618033813400126, + "grad_norm": 1.9116103649139404, + "learning_rate": 2.8103616813294233e-05, + "loss": 0.5785, + "step": 9338 + }, + { + "epoch": 1.4619599248591109, + "grad_norm": 0.7361023426055908, + "learning_rate": 2.8095470837406323e-05, + "loss": 0.2805, + "step": 9339 + }, + { + "epoch": 1.462116468378209, + "grad_norm": 0.5739222168922424, + "learning_rate": 2.808732486151841e-05, + "loss": 0.2382, + "step": 9340 + }, + { + "epoch": 1.4622730118973075, + "grad_norm": 0.5059024095535278, + "learning_rate": 2.8079178885630497e-05, + "loss": 0.1347, + "step": 9341 + }, + { + "epoch": 1.4624295554164057, + "grad_norm": 0.747679591178894, + "learning_rate": 2.8071032909742588e-05, + "loss": 0.2624, + "step": 9342 + }, + { + "epoch": 1.4625860989355042, + "grad_norm": 0.8865615725517273, + "learning_rate": 2.8062886933854675e-05, + "loss": 0.2109, + "step": 9343 + }, + { + "epoch": 1.4627426424546024, + "grad_norm": 0.5296130776405334, + "learning_rate": 2.8054740957966762e-05, + "loss": 0.1269, + "step": 9344 + }, + { + "epoch": 1.4628991859737006, + "grad_norm": 0.8047333359718323, + "learning_rate": 2.8046594982078856e-05, + "loss": 0.25, + "step": 9345 + }, + { + "epoch": 1.463055729492799, + "grad_norm": 0.7867716550827026, + "learning_rate": 2.8038449006190943e-05, + "loss": 0.2267, + "step": 9346 + }, + { + "epoch": 1.4632122730118973, + "grad_norm": 
1.6518833637237549, + "learning_rate": 2.803030303030303e-05, + "loss": 0.3189, + "step": 9347 + }, + { + "epoch": 1.4633688165309957, + "grad_norm": 0.91560298204422, + "learning_rate": 2.802215705441512e-05, + "loss": 0.1925, + "step": 9348 + }, + { + "epoch": 1.463525360050094, + "grad_norm": 1.5580296516418457, + "learning_rate": 2.8014011078527208e-05, + "loss": 0.3796, + "step": 9349 + }, + { + "epoch": 1.4636819035691921, + "grad_norm": 2.773740291595459, + "learning_rate": 2.8005865102639295e-05, + "loss": 0.6969, + "step": 9350 + }, + { + "epoch": 1.4638384470882906, + "grad_norm": 0.5665886402130127, + "learning_rate": 2.7997719126751386e-05, + "loss": 0.1767, + "step": 9351 + }, + { + "epoch": 1.4639949906073888, + "grad_norm": 1.1627330780029297, + "learning_rate": 2.7989573150863473e-05, + "loss": 0.2462, + "step": 9352 + }, + { + "epoch": 1.4641515341264872, + "grad_norm": 1.121222734451294, + "learning_rate": 2.798142717497556e-05, + "loss": 0.274, + "step": 9353 + }, + { + "epoch": 1.4643080776455855, + "grad_norm": 2.1979119777679443, + "learning_rate": 2.7973281199087654e-05, + "loss": 0.3634, + "step": 9354 + }, + { + "epoch": 1.4644646211646837, + "grad_norm": 1.051969051361084, + "learning_rate": 2.796513522319974e-05, + "loss": 0.4275, + "step": 9355 + }, + { + "epoch": 1.4646211646837821, + "grad_norm": 1.1546826362609863, + "learning_rate": 2.7956989247311828e-05, + "loss": 0.392, + "step": 9356 + }, + { + "epoch": 1.4647777082028803, + "grad_norm": 2.6550395488739014, + "learning_rate": 2.794884327142392e-05, + "loss": 0.5094, + "step": 9357 + }, + { + "epoch": 1.4649342517219788, + "grad_norm": 1.1206350326538086, + "learning_rate": 2.7940697295536006e-05, + "loss": 0.3995, + "step": 9358 + }, + { + "epoch": 1.465090795241077, + "grad_norm": 1.340263843536377, + "learning_rate": 2.7932551319648093e-05, + "loss": 0.4219, + "step": 9359 + }, + { + "epoch": 1.4652473387601752, + "grad_norm": 1.0141466856002808, + "learning_rate": 
2.7924405343760184e-05, + "loss": 0.3819, + "step": 9360 + }, + { + "epoch": 1.4654038822792737, + "grad_norm": 3.6947035789489746, + "learning_rate": 2.791625936787227e-05, + "loss": 1.0038, + "step": 9361 + }, + { + "epoch": 1.4655604257983719, + "grad_norm": 1.6340135335922241, + "learning_rate": 2.7908113391984358e-05, + "loss": 0.2028, + "step": 9362 + }, + { + "epoch": 1.4657169693174703, + "grad_norm": 1.7632452249526978, + "learning_rate": 2.7899967416096452e-05, + "loss": 0.5075, + "step": 9363 + }, + { + "epoch": 1.4658735128365685, + "grad_norm": 1.1052398681640625, + "learning_rate": 2.789182144020854e-05, + "loss": 0.3401, + "step": 9364 + }, + { + "epoch": 1.4660300563556667, + "grad_norm": 1.8252686262130737, + "learning_rate": 2.7883675464320626e-05, + "loss": 0.6688, + "step": 9365 + }, + { + "epoch": 1.4661865998747652, + "grad_norm": 2.6698529720306396, + "learning_rate": 2.7875529488432717e-05, + "loss": 0.6579, + "step": 9366 + }, + { + "epoch": 1.4663431433938636, + "grad_norm": 1.6105272769927979, + "learning_rate": 2.7867383512544804e-05, + "loss": 0.4344, + "step": 9367 + }, + { + "epoch": 1.4664996869129618, + "grad_norm": 1.2804337739944458, + "learning_rate": 2.785923753665689e-05, + "loss": 0.2659, + "step": 9368 + }, + { + "epoch": 1.46665623043206, + "grad_norm": 2.4912776947021484, + "learning_rate": 2.785109156076898e-05, + "loss": 0.5038, + "step": 9369 + }, + { + "epoch": 1.4668127739511585, + "grad_norm": 5.057380676269531, + "learning_rate": 2.784294558488107e-05, + "loss": 0.645, + "step": 9370 + }, + { + "epoch": 1.4669693174702567, + "grad_norm": 4.849185466766357, + "learning_rate": 2.7834799608993156e-05, + "loss": 1.1154, + "step": 9371 + }, + { + "epoch": 1.4671258609893552, + "grad_norm": 1.9737064838409424, + "learning_rate": 2.782665363310525e-05, + "loss": 0.4224, + "step": 9372 + }, + { + "epoch": 1.4672824045084534, + "grad_norm": 3.2065932750701904, + "learning_rate": 2.7818507657217337e-05, + "loss": 0.4689, + 
"step": 9373 + }, + { + "epoch": 1.4674389480275516, + "grad_norm": 2.3560051918029785, + "learning_rate": 2.7810361681329424e-05, + "loss": 1.0723, + "step": 9374 + }, + { + "epoch": 1.46759549154665, + "grad_norm": 3.9507548809051514, + "learning_rate": 2.7802215705441514e-05, + "loss": 0.5514, + "step": 9375 + }, + { + "epoch": 1.4677520350657483, + "grad_norm": 5.901495933532715, + "learning_rate": 2.77940697295536e-05, + "loss": 0.7752, + "step": 9376 + }, + { + "epoch": 1.4679085785848467, + "grad_norm": 4.21453332901001, + "learning_rate": 2.778592375366569e-05, + "loss": 1.2227, + "step": 9377 + }, + { + "epoch": 1.468065122103945, + "grad_norm": 2.7521204948425293, + "learning_rate": 2.777777777777778e-05, + "loss": 0.8373, + "step": 9378 + }, + { + "epoch": 1.4682216656230431, + "grad_norm": 2.9706404209136963, + "learning_rate": 2.7769631801889866e-05, + "loss": 1.0832, + "step": 9379 + }, + { + "epoch": 1.4683782091421416, + "grad_norm": 4.366386890411377, + "learning_rate": 2.7761485826001953e-05, + "loss": 1.422, + "step": 9380 + }, + { + "epoch": 1.4685347526612398, + "grad_norm": 2.6654887199401855, + "learning_rate": 2.7753339850114047e-05, + "loss": 1.1319, + "step": 9381 + }, + { + "epoch": 1.4686912961803382, + "grad_norm": 3.60880708694458, + "learning_rate": 2.7745193874226134e-05, + "loss": 1.2519, + "step": 9382 + }, + { + "epoch": 1.4688478396994364, + "grad_norm": 4.929933071136475, + "learning_rate": 2.7737047898338218e-05, + "loss": 1.3009, + "step": 9383 + }, + { + "epoch": 1.4690043832185347, + "grad_norm": 7.905126094818115, + "learning_rate": 2.7728901922450312e-05, + "loss": 1.2865, + "step": 9384 + }, + { + "epoch": 1.469160926737633, + "grad_norm": 5.627189636230469, + "learning_rate": 2.77207559465624e-05, + "loss": 1.4233, + "step": 9385 + }, + { + "epoch": 1.4693174702567313, + "grad_norm": 7.087461471557617, + "learning_rate": 2.7712609970674486e-05, + "loss": 0.5267, + "step": 9386 + }, + { + "epoch": 1.4694740137758298, + 
"grad_norm": 3.8283510208129883, + "learning_rate": 2.7704463994786577e-05, + "loss": 0.8661, + "step": 9387 + }, + { + "epoch": 1.469630557294928, + "grad_norm": 2.0659871101379395, + "learning_rate": 2.7696318018898664e-05, + "loss": 0.7158, + "step": 9388 + }, + { + "epoch": 1.4697871008140262, + "grad_norm": 0.5192438960075378, + "learning_rate": 2.768817204301075e-05, + "loss": 0.2065, + "step": 9389 + }, + { + "epoch": 1.4699436443331246, + "grad_norm": 0.444163978099823, + "learning_rate": 2.7680026067122845e-05, + "loss": 0.1438, + "step": 9390 + }, + { + "epoch": 1.4701001878522229, + "grad_norm": 0.9680379629135132, + "learning_rate": 2.7671880091234932e-05, + "loss": 0.2394, + "step": 9391 + }, + { + "epoch": 1.4702567313713213, + "grad_norm": 0.47854068875312805, + "learning_rate": 2.7663734115347016e-05, + "loss": 0.2311, + "step": 9392 + }, + { + "epoch": 1.4704132748904195, + "grad_norm": 0.6199548244476318, + "learning_rate": 2.765558813945911e-05, + "loss": 0.1744, + "step": 9393 + }, + { + "epoch": 1.4705698184095177, + "grad_norm": 0.7861536145210266, + "learning_rate": 2.7647442163571197e-05, + "loss": 0.2143, + "step": 9394 + }, + { + "epoch": 1.4707263619286162, + "grad_norm": 0.6795837879180908, + "learning_rate": 2.7639296187683284e-05, + "loss": 0.2366, + "step": 9395 + }, + { + "epoch": 1.4708829054477144, + "grad_norm": 0.6804821491241455, + "learning_rate": 2.7631150211795375e-05, + "loss": 0.2804, + "step": 9396 + }, + { + "epoch": 1.4710394489668128, + "grad_norm": 0.70804363489151, + "learning_rate": 2.7623004235907462e-05, + "loss": 0.1886, + "step": 9397 + }, + { + "epoch": 1.471195992485911, + "grad_norm": 0.661156415939331, + "learning_rate": 2.761485826001955e-05, + "loss": 0.1564, + "step": 9398 + }, + { + "epoch": 1.4713525360050093, + "grad_norm": 0.9630101919174194, + "learning_rate": 2.7606712284131643e-05, + "loss": 0.3526, + "step": 9399 + }, + { + "epoch": 1.4715090795241077, + "grad_norm": 1.8995405435562134, + 
"learning_rate": 2.759856630824373e-05, + "loss": 0.3132, + "step": 9400 + }, + { + "epoch": 1.4716656230432061, + "grad_norm": 1.023144245147705, + "learning_rate": 2.7590420332355814e-05, + "loss": 0.2568, + "step": 9401 + }, + { + "epoch": 1.4718221665623044, + "grad_norm": 0.9447213411331177, + "learning_rate": 2.7582274356467908e-05, + "loss": 0.2513, + "step": 9402 + }, + { + "epoch": 1.4719787100814026, + "grad_norm": 0.5267271995544434, + "learning_rate": 2.7574128380579995e-05, + "loss": 0.1971, + "step": 9403 + }, + { + "epoch": 1.472135253600501, + "grad_norm": 1.3876922130584717, + "learning_rate": 2.7565982404692082e-05, + "loss": 0.302, + "step": 9404 + }, + { + "epoch": 1.4722917971195992, + "grad_norm": 2.062208890914917, + "learning_rate": 2.7557836428804172e-05, + "loss": 0.3738, + "step": 9405 + }, + { + "epoch": 1.4724483406386977, + "grad_norm": 2.1170716285705566, + "learning_rate": 2.754969045291626e-05, + "loss": 0.3569, + "step": 9406 + }, + { + "epoch": 1.472604884157796, + "grad_norm": 2.2100372314453125, + "learning_rate": 2.7541544477028347e-05, + "loss": 0.386, + "step": 9407 + }, + { + "epoch": 1.472761427676894, + "grad_norm": 0.9877368807792664, + "learning_rate": 2.753339850114044e-05, + "loss": 0.3898, + "step": 9408 + }, + { + "epoch": 1.4729179711959925, + "grad_norm": 4.498376846313477, + "learning_rate": 2.7525252525252528e-05, + "loss": 0.3497, + "step": 9409 + }, + { + "epoch": 1.4730745147150908, + "grad_norm": 2.003938674926758, + "learning_rate": 2.751710654936461e-05, + "loss": 0.4519, + "step": 9410 + }, + { + "epoch": 1.4732310582341892, + "grad_norm": 1.667968988418579, + "learning_rate": 2.7508960573476705e-05, + "loss": 0.5322, + "step": 9411 + }, + { + "epoch": 1.4733876017532874, + "grad_norm": 1.1348645687103271, + "learning_rate": 2.7500814597588793e-05, + "loss": 0.3914, + "step": 9412 + }, + { + "epoch": 1.4735441452723856, + "grad_norm": 1.1474837064743042, + "learning_rate": 2.749266862170088e-05, + "loss": 
0.2471, + "step": 9413 + }, + { + "epoch": 1.473700688791484, + "grad_norm": 2.6282296180725098, + "learning_rate": 2.748452264581297e-05, + "loss": 0.6699, + "step": 9414 + }, + { + "epoch": 1.4738572323105823, + "grad_norm": 1.0179904699325562, + "learning_rate": 2.7476376669925057e-05, + "loss": 0.267, + "step": 9415 + }, + { + "epoch": 1.4740137758296807, + "grad_norm": 4.816948890686035, + "learning_rate": 2.7468230694037145e-05, + "loss": 0.9155, + "step": 9416 + }, + { + "epoch": 1.474170319348779, + "grad_norm": 1.9450587034225464, + "learning_rate": 2.746008471814924e-05, + "loss": 0.3857, + "step": 9417 + }, + { + "epoch": 1.4743268628678772, + "grad_norm": 2.3048555850982666, + "learning_rate": 2.7451938742261322e-05, + "loss": 0.5302, + "step": 9418 + }, + { + "epoch": 1.4744834063869756, + "grad_norm": 1.8395758867263794, + "learning_rate": 2.744379276637341e-05, + "loss": 0.3393, + "step": 9419 + }, + { + "epoch": 1.4746399499060738, + "grad_norm": 2.128751754760742, + "learning_rate": 2.7435646790485503e-05, + "loss": 0.4029, + "step": 9420 + }, + { + "epoch": 1.4747964934251723, + "grad_norm": 3.4866116046905518, + "learning_rate": 2.742750081459759e-05, + "loss": 0.7322, + "step": 9421 + }, + { + "epoch": 1.4749530369442705, + "grad_norm": 4.700732231140137, + "learning_rate": 2.7419354838709678e-05, + "loss": 0.7141, + "step": 9422 + }, + { + "epoch": 1.4751095804633687, + "grad_norm": 4.007346153259277, + "learning_rate": 2.7411208862821768e-05, + "loss": 1.245, + "step": 9423 + }, + { + "epoch": 1.4752661239824671, + "grad_norm": 3.8427751064300537, + "learning_rate": 2.7403062886933855e-05, + "loss": 0.8115, + "step": 9424 + }, + { + "epoch": 1.4754226675015654, + "grad_norm": 3.067357063293457, + "learning_rate": 2.7394916911045942e-05, + "loss": 1.0921, + "step": 9425 + }, + { + "epoch": 1.4755792110206638, + "grad_norm": 3.4110159873962402, + "learning_rate": 2.7386770935158036e-05, + "loss": 1.0096, + "step": 9426 + }, + { + "epoch": 
1.475735754539762, + "grad_norm": 3.405402898788452, + "learning_rate": 2.737862495927012e-05, + "loss": 0.637, + "step": 9427 + }, + { + "epoch": 1.4758922980588602, + "grad_norm": 5.433472633361816, + "learning_rate": 2.7370478983382207e-05, + "loss": 0.9996, + "step": 9428 + }, + { + "epoch": 1.4760488415779587, + "grad_norm": 3.0812580585479736, + "learning_rate": 2.73623330074943e-05, + "loss": 0.8255, + "step": 9429 + }, + { + "epoch": 1.476205385097057, + "grad_norm": 3.6244962215423584, + "learning_rate": 2.7354187031606388e-05, + "loss": 1.4698, + "step": 9430 + }, + { + "epoch": 1.4763619286161553, + "grad_norm": 5.561184883117676, + "learning_rate": 2.7346041055718475e-05, + "loss": 1.2026, + "step": 9431 + }, + { + "epoch": 1.4765184721352536, + "grad_norm": 2.8922553062438965, + "learning_rate": 2.7337895079830566e-05, + "loss": 1.3229, + "step": 9432 + }, + { + "epoch": 1.4766750156543518, + "grad_norm": 2.1225316524505615, + "learning_rate": 2.7329749103942653e-05, + "loss": 0.6996, + "step": 9433 + }, + { + "epoch": 1.4768315591734502, + "grad_norm": 1.970570683479309, + "learning_rate": 2.732160312805474e-05, + "loss": 0.4467, + "step": 9434 + }, + { + "epoch": 1.4769881026925487, + "grad_norm": 1.4223753213882446, + "learning_rate": 2.7313457152166834e-05, + "loss": 0.3867, + "step": 9435 + }, + { + "epoch": 1.4771446462116469, + "grad_norm": 4.59428071975708, + "learning_rate": 2.7305311176278918e-05, + "loss": 0.8293, + "step": 9436 + }, + { + "epoch": 1.477301189730745, + "grad_norm": 3.2438759803771973, + "learning_rate": 2.7297165200391005e-05, + "loss": 1.3508, + "step": 9437 + }, + { + "epoch": 1.4774577332498435, + "grad_norm": 2.398716449737549, + "learning_rate": 2.72890192245031e-05, + "loss": 1.0659, + "step": 9438 + }, + { + "epoch": 1.4776142767689417, + "grad_norm": 0.5085003972053528, + "learning_rate": 2.7280873248615186e-05, + "loss": 0.2067, + "step": 9439 + }, + { + "epoch": 1.4777708202880402, + "grad_norm": 
0.8732066750526428, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.3474, + "step": 9440 + }, + { + "epoch": 1.4779273638071384, + "grad_norm": 0.4948869049549103, + "learning_rate": 2.7264581296839364e-05, + "loss": 0.1375, + "step": 9441 + }, + { + "epoch": 1.4780839073262366, + "grad_norm": 0.5861649513244629, + "learning_rate": 2.725643532095145e-05, + "loss": 0.2926, + "step": 9442 + }, + { + "epoch": 1.478240450845335, + "grad_norm": 0.4491073191165924, + "learning_rate": 2.7248289345063538e-05, + "loss": 0.1331, + "step": 9443 + }, + { + "epoch": 1.4783969943644333, + "grad_norm": 0.7141631245613098, + "learning_rate": 2.7240143369175632e-05, + "loss": 0.2706, + "step": 9444 + }, + { + "epoch": 1.4785535378835317, + "grad_norm": 0.7635671496391296, + "learning_rate": 2.7231997393287716e-05, + "loss": 0.2721, + "step": 9445 + }, + { + "epoch": 1.47871008140263, + "grad_norm": 1.9829148054122925, + "learning_rate": 2.7223851417399803e-05, + "loss": 0.2993, + "step": 9446 + }, + { + "epoch": 1.4788666249217282, + "grad_norm": 1.312972068786621, + "learning_rate": 2.7215705441511897e-05, + "loss": 0.365, + "step": 9447 + }, + { + "epoch": 1.4790231684408266, + "grad_norm": 0.7782942652702332, + "learning_rate": 2.7207559465623984e-05, + "loss": 0.2542, + "step": 9448 + }, + { + "epoch": 1.4791797119599248, + "grad_norm": 0.9792256355285645, + "learning_rate": 2.719941348973607e-05, + "loss": 0.1769, + "step": 9449 + }, + { + "epoch": 1.4793362554790233, + "grad_norm": 1.145917296409607, + "learning_rate": 2.719126751384816e-05, + "loss": 0.3633, + "step": 9450 + }, + { + "epoch": 1.4794927989981215, + "grad_norm": 1.606670618057251, + "learning_rate": 2.718312153796025e-05, + "loss": 0.2996, + "step": 9451 + }, + { + "epoch": 1.4796493425172197, + "grad_norm": 1.0231378078460693, + "learning_rate": 2.7174975562072336e-05, + "loss": 0.1924, + "step": 9452 + }, + { + "epoch": 1.4798058860363181, + "grad_norm": 1.3505796194076538, + "learning_rate": 
2.716682958618443e-05, + "loss": 0.2943, + "step": 9453 + }, + { + "epoch": 1.4799624295554163, + "grad_norm": 2.0016889572143555, + "learning_rate": 2.7158683610296513e-05, + "loss": 0.4609, + "step": 9454 + }, + { + "epoch": 1.4801189730745148, + "grad_norm": 1.5164896249771118, + "learning_rate": 2.71505376344086e-05, + "loss": 0.3624, + "step": 9455 + }, + { + "epoch": 1.480275516593613, + "grad_norm": 2.082320213317871, + "learning_rate": 2.7142391658520694e-05, + "loss": 0.405, + "step": 9456 + }, + { + "epoch": 1.4804320601127112, + "grad_norm": 2.7795627117156982, + "learning_rate": 2.713424568263278e-05, + "loss": 0.4326, + "step": 9457 + }, + { + "epoch": 1.4805886036318097, + "grad_norm": 1.2738381624221802, + "learning_rate": 2.712609970674487e-05, + "loss": 0.4043, + "step": 9458 + }, + { + "epoch": 1.4807451471509079, + "grad_norm": 0.6970977783203125, + "learning_rate": 2.711795373085696e-05, + "loss": 0.237, + "step": 9459 + }, + { + "epoch": 1.4809016906700063, + "grad_norm": 2.6232333183288574, + "learning_rate": 2.7109807754969046e-05, + "loss": 0.5199, + "step": 9460 + }, + { + "epoch": 1.4810582341891045, + "grad_norm": 2.5272157192230225, + "learning_rate": 2.7101661779081133e-05, + "loss": 0.4529, + "step": 9461 + }, + { + "epoch": 1.4812147777082028, + "grad_norm": 3.4523890018463135, + "learning_rate": 2.7093515803193224e-05, + "loss": 0.578, + "step": 9462 + }, + { + "epoch": 1.4813713212273012, + "grad_norm": 2.079491376876831, + "learning_rate": 2.708536982730531e-05, + "loss": 0.3688, + "step": 9463 + }, + { + "epoch": 1.4815278647463996, + "grad_norm": 1.9959230422973633, + "learning_rate": 2.7077223851417398e-05, + "loss": 0.3797, + "step": 9464 + }, + { + "epoch": 1.4816844082654979, + "grad_norm": 1.096095085144043, + "learning_rate": 2.7069077875529492e-05, + "loss": 0.2504, + "step": 9465 + }, + { + "epoch": 1.481840951784596, + "grad_norm": 2.1664347648620605, + "learning_rate": 2.706093189964158e-05, + "loss": 0.5887, + "step": 
9466 + }, + { + "epoch": 1.4819974953036943, + "grad_norm": 2.6615357398986816, + "learning_rate": 2.7052785923753666e-05, + "loss": 0.7113, + "step": 9467 + }, + { + "epoch": 1.4821540388227927, + "grad_norm": 1.9165362119674683, + "learning_rate": 2.7044639947865757e-05, + "loss": 0.6084, + "step": 9468 + }, + { + "epoch": 1.4823105823418912, + "grad_norm": 2.2468016147613525, + "learning_rate": 2.7036493971977844e-05, + "loss": 0.3956, + "step": 9469 + }, + { + "epoch": 1.4824671258609894, + "grad_norm": 2.4495458602905273, + "learning_rate": 2.702834799608993e-05, + "loss": 0.5075, + "step": 9470 + }, + { + "epoch": 1.4826236693800876, + "grad_norm": 6.15167760848999, + "learning_rate": 2.7020202020202022e-05, + "loss": 1.4881, + "step": 9471 + }, + { + "epoch": 1.482780212899186, + "grad_norm": 5.293679237365723, + "learning_rate": 2.701205604431411e-05, + "loss": 1.0951, + "step": 9472 + }, + { + "epoch": 1.4829367564182843, + "grad_norm": 3.6461410522460938, + "learning_rate": 2.7003910068426196e-05, + "loss": 0.6566, + "step": 9473 + }, + { + "epoch": 1.4830932999373827, + "grad_norm": 2.5759811401367188, + "learning_rate": 2.699576409253829e-05, + "loss": 0.5236, + "step": 9474 + }, + { + "epoch": 1.483249843456481, + "grad_norm": 2.1124682426452637, + "learning_rate": 2.6987618116650377e-05, + "loss": 0.9068, + "step": 9475 + }, + { + "epoch": 1.4834063869755791, + "grad_norm": 3.230703115463257, + "learning_rate": 2.6979472140762464e-05, + "loss": 0.9229, + "step": 9476 + }, + { + "epoch": 1.4835629304946776, + "grad_norm": 2.4464826583862305, + "learning_rate": 2.6971326164874555e-05, + "loss": 0.7975, + "step": 9477 + }, + { + "epoch": 1.4837194740137758, + "grad_norm": 2.896838426589966, + "learning_rate": 2.6963180188986642e-05, + "loss": 0.5766, + "step": 9478 + }, + { + "epoch": 1.4838760175328742, + "grad_norm": 3.8171513080596924, + "learning_rate": 2.695503421309873e-05, + "loss": 1.0532, + "step": 9479 + }, + { + "epoch": 1.4840325610519725, + 
"grad_norm": 5.271205902099609, + "learning_rate": 2.694688823721082e-05, + "loss": 1.1969, + "step": 9480 + }, + { + "epoch": 1.4841891045710707, + "grad_norm": 2.454367160797119, + "learning_rate": 2.6938742261322907e-05, + "loss": 0.8167, + "step": 9481 + }, + { + "epoch": 1.4843456480901691, + "grad_norm": 1.2994531393051147, + "learning_rate": 2.6930596285434994e-05, + "loss": 0.6915, + "step": 9482 + }, + { + "epoch": 1.4845021916092673, + "grad_norm": 3.3088676929473877, + "learning_rate": 2.6922450309547088e-05, + "loss": 1.2122, + "step": 9483 + }, + { + "epoch": 1.4846587351283658, + "grad_norm": 3.678455352783203, + "learning_rate": 2.6914304333659175e-05, + "loss": 1.1183, + "step": 9484 + }, + { + "epoch": 1.484815278647464, + "grad_norm": 1.8670254945755005, + "learning_rate": 2.6906158357771262e-05, + "loss": 0.3917, + "step": 9485 + }, + { + "epoch": 1.4849718221665622, + "grad_norm": 2.1209065914154053, + "learning_rate": 2.6898012381883353e-05, + "loss": 0.7932, + "step": 9486 + }, + { + "epoch": 1.4851283656856606, + "grad_norm": 5.596905708312988, + "learning_rate": 2.688986640599544e-05, + "loss": 1.2282, + "step": 9487 + }, + { + "epoch": 1.4852849092047589, + "grad_norm": 2.4594786167144775, + "learning_rate": 2.6881720430107527e-05, + "loss": 0.8021, + "step": 9488 + }, + { + "epoch": 1.4854414527238573, + "grad_norm": 0.8173612356185913, + "learning_rate": 2.6873574454219617e-05, + "loss": 0.1842, + "step": 9489 + }, + { + "epoch": 1.4855979962429555, + "grad_norm": 0.5167773365974426, + "learning_rate": 2.6865428478331705e-05, + "loss": 0.1805, + "step": 9490 + }, + { + "epoch": 1.4857545397620537, + "grad_norm": 0.6175295114517212, + "learning_rate": 2.685728250244379e-05, + "loss": 0.1846, + "step": 9491 + }, + { + "epoch": 1.4859110832811522, + "grad_norm": 0.532281756401062, + "learning_rate": 2.6849136526555886e-05, + "loss": 0.245, + "step": 9492 + }, + { + "epoch": 1.4860676268002504, + "grad_norm": 0.7628893256187439, + 
"learning_rate": 2.6840990550667973e-05, + "loss": 0.3589, + "step": 9493 + }, + { + "epoch": 1.4862241703193488, + "grad_norm": 0.8323299884796143, + "learning_rate": 2.683284457478006e-05, + "loss": 0.1817, + "step": 9494 + }, + { + "epoch": 1.486380713838447, + "grad_norm": 0.9285368323326111, + "learning_rate": 2.682469859889215e-05, + "loss": 0.2477, + "step": 9495 + }, + { + "epoch": 1.4865372573575453, + "grad_norm": 3.5654895305633545, + "learning_rate": 2.6816552623004238e-05, + "loss": 0.5757, + "step": 9496 + }, + { + "epoch": 1.4866938008766437, + "grad_norm": 0.9033946990966797, + "learning_rate": 2.6808406647116325e-05, + "loss": 0.2014, + "step": 9497 + }, + { + "epoch": 1.4868503443957422, + "grad_norm": 1.3646537065505981, + "learning_rate": 2.6800260671228415e-05, + "loss": 0.3646, + "step": 9498 + }, + { + "epoch": 1.4870068879148404, + "grad_norm": 0.7605290412902832, + "learning_rate": 2.6792114695340502e-05, + "loss": 0.2333, + "step": 9499 + }, + { + "epoch": 1.4871634314339386, + "grad_norm": 0.43714869022369385, + "learning_rate": 2.678396871945259e-05, + "loss": 0.1196, + "step": 9500 + }, + { + "epoch": 1.487319974953037, + "grad_norm": 1.3263784646987915, + "learning_rate": 2.6775822743564683e-05, + "loss": 0.3668, + "step": 9501 + }, + { + "epoch": 1.4874765184721352, + "grad_norm": 0.7984063029289246, + "learning_rate": 2.676767676767677e-05, + "loss": 0.2966, + "step": 9502 + }, + { + "epoch": 1.4876330619912337, + "grad_norm": NaN, + "learning_rate": 2.676767676767677e-05, + "loss": 0.0, + "step": 9503 + }, + { + "epoch": 1.487789605510332, + "grad_norm": 1.2449305057525635, + "learning_rate": 2.6759530791788858e-05, + "loss": 0.3447, + "step": 9504 + }, + { + "epoch": 1.4879461490294301, + "grad_norm": 1.9447460174560547, + "learning_rate": 2.6751384815900948e-05, + "loss": 0.3039, + "step": 9505 + }, + { + "epoch": 1.4881026925485286, + "grad_norm": 0.7585000991821289, + "learning_rate": 2.6743238840013035e-05, + "loss": 0.3267, + 
"step": 9506 + }, + { + "epoch": 1.4882592360676268, + "grad_norm": 1.4662150144577026, + "learning_rate": 2.6735092864125122e-05, + "loss": 0.4271, + "step": 9507 + }, + { + "epoch": 1.4884157795867252, + "grad_norm": 4.477964401245117, + "learning_rate": 2.6726946888237213e-05, + "loss": 0.7277, + "step": 9508 + }, + { + "epoch": 1.4885723231058234, + "grad_norm": 1.2841525077819824, + "learning_rate": 2.67188009123493e-05, + "loss": 0.4983, + "step": 9509 + }, + { + "epoch": 1.4887288666249217, + "grad_norm": 1.5324639081954956, + "learning_rate": 2.6710654936461387e-05, + "loss": 0.332, + "step": 9510 + }, + { + "epoch": 1.48888541014402, + "grad_norm": 1.7775115966796875, + "learning_rate": 2.670250896057348e-05, + "loss": 0.3908, + "step": 9511 + }, + { + "epoch": 1.4890419536631183, + "grad_norm": 2.0675837993621826, + "learning_rate": 2.6694362984685568e-05, + "loss": 0.378, + "step": 9512 + }, + { + "epoch": 1.4891984971822168, + "grad_norm": 2.4881365299224854, + "learning_rate": 2.6686217008797655e-05, + "loss": 0.6215, + "step": 9513 + }, + { + "epoch": 1.489355040701315, + "grad_norm": 3.6641693115234375, + "learning_rate": 2.6678071032909746e-05, + "loss": 0.5242, + "step": 9514 + }, + { + "epoch": 1.4895115842204132, + "grad_norm": 3.774697780609131, + "learning_rate": 2.6669925057021833e-05, + "loss": 0.7413, + "step": 9515 + }, + { + "epoch": 1.4896681277395116, + "grad_norm": 2.714733839035034, + "learning_rate": 2.666177908113392e-05, + "loss": 0.6626, + "step": 9516 + }, + { + "epoch": 1.4898246712586098, + "grad_norm": 1.4085626602172852, + "learning_rate": 2.665363310524601e-05, + "loss": 0.4258, + "step": 9517 + }, + { + "epoch": 1.4899812147777083, + "grad_norm": 1.9647160768508911, + "learning_rate": 2.6645487129358098e-05, + "loss": 0.6017, + "step": 9518 + }, + { + "epoch": 1.4901377582968065, + "grad_norm": 1.9175912141799927, + "learning_rate": 2.6637341153470185e-05, + "loss": 0.6013, + "step": 9519 + }, + { + "epoch": 
1.4902943018159047, + "grad_norm": 2.4013755321502686, + "learning_rate": 2.662919517758228e-05, + "loss": 1.0333, + "step": 9520 + }, + { + "epoch": 1.4904508453350032, + "grad_norm": 1.9563109874725342, + "learning_rate": 2.6621049201694366e-05, + "loss": 0.8674, + "step": 9521 + }, + { + "epoch": 1.4906073888541014, + "grad_norm": 4.092169761657715, + "learning_rate": 2.661290322580645e-05, + "loss": 0.9021, + "step": 9522 + }, + { + "epoch": 1.4907639323731998, + "grad_norm": 1.9173184633255005, + "learning_rate": 2.6604757249918544e-05, + "loss": 0.5917, + "step": 9523 + }, + { + "epoch": 1.490920475892298, + "grad_norm": 2.354511260986328, + "learning_rate": 2.659661127403063e-05, + "loss": 0.4892, + "step": 9524 + }, + { + "epoch": 1.4910770194113963, + "grad_norm": 5.38683557510376, + "learning_rate": 2.6588465298142718e-05, + "loss": 1.1524, + "step": 9525 + }, + { + "epoch": 1.4912335629304947, + "grad_norm": 4.997884273529053, + "learning_rate": 2.658031932225481e-05, + "loss": 0.4324, + "step": 9526 + }, + { + "epoch": 1.491390106449593, + "grad_norm": 2.5405800342559814, + "learning_rate": 2.6572173346366896e-05, + "loss": 0.8708, + "step": 9527 + }, + { + "epoch": 1.4915466499686914, + "grad_norm": 2.698965549468994, + "learning_rate": 2.6564027370478983e-05, + "loss": 0.8821, + "step": 9528 + }, + { + "epoch": 1.4917031934877896, + "grad_norm": 3.1761722564697266, + "learning_rate": 2.6555881394591077e-05, + "loss": 0.8475, + "step": 9529 + }, + { + "epoch": 1.4918597370068878, + "grad_norm": 7.530089378356934, + "learning_rate": 2.6547735418703164e-05, + "loss": 0.6129, + "step": 9530 + }, + { + "epoch": 1.4920162805259862, + "grad_norm": 3.4164578914642334, + "learning_rate": 2.6539589442815248e-05, + "loss": 1.1473, + "step": 9531 + }, + { + "epoch": 1.4921728240450847, + "grad_norm": 5.093358516693115, + "learning_rate": 2.653144346692734e-05, + "loss": 1.4402, + "step": 9532 + }, + { + "epoch": 1.4923293675641829, + "grad_norm": 
4.287496566772461, + "learning_rate": 2.652329749103943e-05, + "loss": 0.9821, + "step": 9533 + }, + { + "epoch": 1.492485911083281, + "grad_norm": 2.219910144805908, + "learning_rate": 2.6515151515151516e-05, + "loss": 1.0489, + "step": 9534 + }, + { + "epoch": 1.4926424546023795, + "grad_norm": 2.1449756622314453, + "learning_rate": 2.6507005539263606e-05, + "loss": 0.3643, + "step": 9535 + }, + { + "epoch": 1.4927989981214778, + "grad_norm": 2.8611257076263428, + "learning_rate": 2.6498859563375693e-05, + "loss": 0.6826, + "step": 9536 + }, + { + "epoch": 1.4929555416405762, + "grad_norm": 2.6975793838500977, + "learning_rate": 2.649071358748778e-05, + "loss": 0.5574, + "step": 9537 + }, + { + "epoch": 1.4931120851596744, + "grad_norm": 2.085230588912964, + "learning_rate": 2.6482567611599875e-05, + "loss": 0.3002, + "step": 9538 + }, + { + "epoch": 1.4932686286787726, + "grad_norm": 0.5471194982528687, + "learning_rate": 2.647442163571196e-05, + "loss": 0.2076, + "step": 9539 + }, + { + "epoch": 1.493425172197871, + "grad_norm": 0.44799843430519104, + "learning_rate": 2.6466275659824045e-05, + "loss": 0.1991, + "step": 9540 + }, + { + "epoch": 1.4935817157169693, + "grad_norm": 0.6468820571899414, + "learning_rate": 2.645812968393614e-05, + "loss": 0.2305, + "step": 9541 + }, + { + "epoch": 1.4937382592360677, + "grad_norm": 0.6387411952018738, + "learning_rate": 2.6449983708048226e-05, + "loss": 0.2895, + "step": 9542 + }, + { + "epoch": 1.493894802755166, + "grad_norm": 0.7281488180160522, + "learning_rate": 2.6441837732160314e-05, + "loss": 0.2303, + "step": 9543 + }, + { + "epoch": 1.4940513462742642, + "grad_norm": 0.43148812651634216, + "learning_rate": 2.6433691756272404e-05, + "loss": 0.1931, + "step": 9544 + }, + { + "epoch": 1.4942078897933626, + "grad_norm": 0.9047631025314331, + "learning_rate": 2.642554578038449e-05, + "loss": 0.1988, + "step": 9545 + }, + { + "epoch": 1.4943644333124608, + "grad_norm": 1.3888280391693115, + "learning_rate": 
2.641739980449658e-05, + "loss": 0.2448, + "step": 9546 + }, + { + "epoch": 1.4945209768315593, + "grad_norm": 1.038866639137268, + "learning_rate": 2.6409253828608672e-05, + "loss": 0.2423, + "step": 9547 + }, + { + "epoch": 1.4946775203506575, + "grad_norm": 0.660569965839386, + "learning_rate": 2.640110785272076e-05, + "loss": 0.4034, + "step": 9548 + }, + { + "epoch": 1.4948340638697557, + "grad_norm": 0.9785370230674744, + "learning_rate": 2.6392961876832843e-05, + "loss": 0.2583, + "step": 9549 + }, + { + "epoch": 1.4949906073888541, + "grad_norm": 0.8251994848251343, + "learning_rate": 2.6384815900944937e-05, + "loss": 0.3938, + "step": 9550 + }, + { + "epoch": 1.4951471509079524, + "grad_norm": 1.2075448036193848, + "learning_rate": 2.6376669925057024e-05, + "loss": 0.3558, + "step": 9551 + }, + { + "epoch": 1.4953036944270508, + "grad_norm": 2.126147747039795, + "learning_rate": 2.636852394916911e-05, + "loss": 0.5252, + "step": 9552 + }, + { + "epoch": 1.495460237946149, + "grad_norm": 0.8328792452812195, + "learning_rate": 2.6360377973281202e-05, + "loss": 0.2821, + "step": 9553 + }, + { + "epoch": 1.4956167814652472, + "grad_norm": 0.8158516883850098, + "learning_rate": 2.635223199739329e-05, + "loss": 0.2737, + "step": 9554 + }, + { + "epoch": 1.4957733249843457, + "grad_norm": 1.3494904041290283, + "learning_rate": 2.6344086021505376e-05, + "loss": 0.3657, + "step": 9555 + }, + { + "epoch": 1.495929868503444, + "grad_norm": 1.1280261278152466, + "learning_rate": 2.633594004561747e-05, + "loss": 0.3071, + "step": 9556 + }, + { + "epoch": 1.4960864120225423, + "grad_norm": 1.6910948753356934, + "learning_rate": 2.6327794069729554e-05, + "loss": 0.4042, + "step": 9557 + }, + { + "epoch": 1.4962429555416406, + "grad_norm": 0.9667158126831055, + "learning_rate": 2.631964809384164e-05, + "loss": 0.2353, + "step": 9558 + }, + { + "epoch": 1.4963994990607388, + "grad_norm": 1.7652267217636108, + "learning_rate": 2.6311502117953735e-05, + "loss": 0.4257, + 
"step": 9559 + }, + { + "epoch": 1.4965560425798372, + "grad_norm": 2.2069504261016846, + "learning_rate": 2.6303356142065822e-05, + "loss": 0.4953, + "step": 9560 + }, + { + "epoch": 1.4967125860989354, + "grad_norm": 2.003131628036499, + "learning_rate": 2.629521016617791e-05, + "loss": 0.3979, + "step": 9561 + }, + { + "epoch": 1.4968691296180339, + "grad_norm": 1.7784473896026611, + "learning_rate": 2.628706419029e-05, + "loss": 0.3312, + "step": 9562 + }, + { + "epoch": 1.497025673137132, + "grad_norm": 2.901759624481201, + "learning_rate": 2.6278918214402087e-05, + "loss": 0.4882, + "step": 9563 + }, + { + "epoch": 1.4971822166562303, + "grad_norm": 1.7349281311035156, + "learning_rate": 2.6270772238514174e-05, + "loss": 0.587, + "step": 9564 + }, + { + "epoch": 1.4973387601753287, + "grad_norm": 3.0124027729034424, + "learning_rate": 2.6262626262626268e-05, + "loss": 0.8148, + "step": 9565 + }, + { + "epoch": 1.4974953036944272, + "grad_norm": 2.0093753337860107, + "learning_rate": 2.625448028673835e-05, + "loss": 0.6261, + "step": 9566 + }, + { + "epoch": 1.4976518472135254, + "grad_norm": 2.0126044750213623, + "learning_rate": 2.624633431085044e-05, + "loss": 0.6629, + "step": 9567 + }, + { + "epoch": 1.4978083907326236, + "grad_norm": 1.97677481174469, + "learning_rate": 2.6238188334962533e-05, + "loss": 0.5267, + "step": 9568 + }, + { + "epoch": 1.497964934251722, + "grad_norm": 2.8586912155151367, + "learning_rate": 2.623004235907462e-05, + "loss": 0.5585, + "step": 9569 + }, + { + "epoch": 1.4981214777708203, + "grad_norm": 4.593770503997803, + "learning_rate": 2.6221896383186707e-05, + "loss": 0.5699, + "step": 9570 + }, + { + "epoch": 1.4982780212899187, + "grad_norm": 3.505887031555176, + "learning_rate": 2.6213750407298797e-05, + "loss": 0.7248, + "step": 9571 + }, + { + "epoch": 1.498434564809017, + "grad_norm": 4.637121200561523, + "learning_rate": 2.6205604431410885e-05, + "loss": 1.0167, + "step": 9572 + }, + { + "epoch": 1.4985911083281152, + 
"grad_norm": 5.4799394607543945, + "learning_rate": 2.6197458455522972e-05, + "loss": 0.8681, + "step": 9573 + }, + { + "epoch": 1.4987476518472136, + "grad_norm": 2.8709096908569336, + "learning_rate": 2.6189312479635066e-05, + "loss": 0.2693, + "step": 9574 + }, + { + "epoch": 1.4989041953663118, + "grad_norm": 3.076777935028076, + "learning_rate": 2.618116650374715e-05, + "loss": 0.8972, + "step": 9575 + }, + { + "epoch": 1.4990607388854102, + "grad_norm": 2.248126268386841, + "learning_rate": 2.6173020527859237e-05, + "loss": 0.9882, + "step": 9576 + }, + { + "epoch": 1.4992172824045085, + "grad_norm": 4.022992134094238, + "learning_rate": 2.616487455197133e-05, + "loss": 1.4033, + "step": 9577 + }, + { + "epoch": 1.4993738259236067, + "grad_norm": 3.6171231269836426, + "learning_rate": 2.6156728576083418e-05, + "loss": 1.3506, + "step": 9578 + }, + { + "epoch": 1.4995303694427051, + "grad_norm": 3.458073377609253, + "learning_rate": 2.6148582600195505e-05, + "loss": 0.9244, + "step": 9579 + }, + { + "epoch": 1.4996869129618033, + "grad_norm": 2.1450693607330322, + "learning_rate": 2.6140436624307595e-05, + "loss": 0.3959, + "step": 9580 + }, + { + "epoch": 1.4998434564809018, + "grad_norm": 3.092402935028076, + "learning_rate": 2.6132290648419682e-05, + "loss": 1.4205, + "step": 9581 + }, + { + "epoch": 1.5, + "grad_norm": 3.2756574153900146, + "learning_rate": 2.612414467253177e-05, + "loss": 1.2664, + "step": 9582 + }, + { + "epoch": 1.5001565435190982, + "grad_norm": 3.6030821800231934, + "learning_rate": 2.6115998696643863e-05, + "loss": 0.5364, + "step": 9583 + }, + { + "epoch": 1.5003130870381967, + "grad_norm": 2.4277257919311523, + "learning_rate": 2.6107852720755947e-05, + "loss": 0.5744, + "step": 9584 + }, + { + "epoch": 1.5004696305572949, + "grad_norm": 3.097407102584839, + "learning_rate": 2.6099706744868034e-05, + "loss": 0.4744, + "step": 9585 + }, + { + "epoch": 1.5006261740763933, + "grad_norm": 5.261967658996582, + "learning_rate": 
2.6091560768980128e-05, + "loss": 1.3322, + "step": 9586 + }, + { + "epoch": 1.5007827175954915, + "grad_norm": 2.721452236175537, + "learning_rate": 2.6083414793092215e-05, + "loss": 0.8381, + "step": 9587 + }, + { + "epoch": 1.5009392611145898, + "grad_norm": 3.6398587226867676, + "learning_rate": 2.6075268817204303e-05, + "loss": 1.8543, + "step": 9588 + }, + { + "epoch": 1.5010958046336882, + "grad_norm": 1.8342961072921753, + "learning_rate": 2.6067122841316393e-05, + "loss": 0.244, + "step": 9589 + }, + { + "epoch": 1.5012523481527866, + "grad_norm": 0.6380369663238525, + "learning_rate": 2.605897686542848e-05, + "loss": 0.249, + "step": 9590 + }, + { + "epoch": 1.5014088916718848, + "grad_norm": 0.6356605887413025, + "learning_rate": 2.6050830889540567e-05, + "loss": 0.2513, + "step": 9591 + }, + { + "epoch": 1.501565435190983, + "grad_norm": 0.5172343850135803, + "learning_rate": 2.604268491365266e-05, + "loss": 0.124, + "step": 9592 + }, + { + "epoch": 1.5017219787100813, + "grad_norm": 0.5901498794555664, + "learning_rate": 2.6034538937764745e-05, + "loss": 0.2051, + "step": 9593 + }, + { + "epoch": 1.5018785222291797, + "grad_norm": 0.8294780850410461, + "learning_rate": 2.6026392961876832e-05, + "loss": 0.2154, + "step": 9594 + }, + { + "epoch": 1.5020350657482782, + "grad_norm": 1.2887063026428223, + "learning_rate": 2.601824698598892e-05, + "loss": 0.2648, + "step": 9595 + }, + { + "epoch": 1.5021916092673764, + "grad_norm": 0.5860312581062317, + "learning_rate": 2.6010101010101013e-05, + "loss": 0.1853, + "step": 9596 + }, + { + "epoch": 1.5023481527864746, + "grad_norm": 0.634138286113739, + "learning_rate": 2.60019550342131e-05, + "loss": 0.1951, + "step": 9597 + }, + { + "epoch": 1.5025046963055728, + "grad_norm": 1.1432788372039795, + "learning_rate": 2.5993809058325187e-05, + "loss": 0.2765, + "step": 9598 + }, + { + "epoch": 1.5026612398246713, + "grad_norm": 1.1583616733551025, + "learning_rate": 2.5985663082437278e-05, + "loss": 0.2728, + 
"step": 9599 + }, + { + "epoch": 1.5028177833437697, + "grad_norm": 1.5633398294448853, + "learning_rate": 2.5977517106549365e-05, + "loss": 0.3231, + "step": 9600 + }, + { + "epoch": 1.502974326862868, + "grad_norm": 1.1054425239562988, + "learning_rate": 2.5969371130661452e-05, + "loss": 0.2296, + "step": 9601 + }, + { + "epoch": 1.5031308703819661, + "grad_norm": 1.4156244993209839, + "learning_rate": 2.5961225154773543e-05, + "loss": 0.4161, + "step": 9602 + }, + { + "epoch": 1.5032874139010644, + "grad_norm": 1.2571558952331543, + "learning_rate": 2.595307917888563e-05, + "loss": 0.4556, + "step": 9603 + }, + { + "epoch": 1.5034439574201628, + "grad_norm": 1.1400467157363892, + "learning_rate": 2.5944933202997717e-05, + "loss": 0.3522, + "step": 9604 + }, + { + "epoch": 1.5036005009392612, + "grad_norm": 0.8139132857322693, + "learning_rate": 2.593678722710981e-05, + "loss": 0.2117, + "step": 9605 + }, + { + "epoch": 1.5037570444583594, + "grad_norm": 1.8487614393234253, + "learning_rate": 2.5928641251221898e-05, + "loss": 0.4328, + "step": 9606 + }, + { + "epoch": 1.5039135879774577, + "grad_norm": 2.1566202640533447, + "learning_rate": 2.5920495275333985e-05, + "loss": 0.5913, + "step": 9607 + }, + { + "epoch": 1.5040701314965559, + "grad_norm": 1.4446022510528564, + "learning_rate": 2.5912349299446076e-05, + "loss": 0.4214, + "step": 9608 + }, + { + "epoch": 1.5042266750156543, + "grad_norm": 0.8992534875869751, + "learning_rate": 2.5904203323558163e-05, + "loss": 0.342, + "step": 9609 + }, + { + "epoch": 1.5043832185347528, + "grad_norm": 1.1623833179473877, + "learning_rate": 2.589605734767025e-05, + "loss": 0.3942, + "step": 9610 + }, + { + "epoch": 1.504539762053851, + "grad_norm": 1.6539268493652344, + "learning_rate": 2.588791137178234e-05, + "loss": 0.5341, + "step": 9611 + }, + { + "epoch": 1.5046963055729492, + "grad_norm": 3.3836541175842285, + "learning_rate": 2.5879765395894428e-05, + "loss": 0.5943, + "step": 9612 + }, + { + "epoch": 
1.5048528490920476, + "grad_norm": 2.3589909076690674, + "learning_rate": 2.5871619420006515e-05, + "loss": 0.4355, + "step": 9613 + }, + { + "epoch": 1.5050093926111459, + "grad_norm": 1.6888536214828491, + "learning_rate": 2.586347344411861e-05, + "loss": 0.5684, + "step": 9614 + }, + { + "epoch": 1.5051659361302443, + "grad_norm": 1.7131670713424683, + "learning_rate": 2.5855327468230696e-05, + "loss": 0.4131, + "step": 9615 + }, + { + "epoch": 1.5053224796493425, + "grad_norm": 1.801002025604248, + "learning_rate": 2.584718149234278e-05, + "loss": 0.4, + "step": 9616 + }, + { + "epoch": 1.5054790231684407, + "grad_norm": 2.1297812461853027, + "learning_rate": 2.5839035516454874e-05, + "loss": 0.3895, + "step": 9617 + }, + { + "epoch": 1.5056355666875392, + "grad_norm": 2.0699119567871094, + "learning_rate": 2.583088954056696e-05, + "loss": 0.6507, + "step": 9618 + }, + { + "epoch": 1.5057921102066374, + "grad_norm": 2.236872911453247, + "learning_rate": 2.5822743564679048e-05, + "loss": 0.2709, + "step": 9619 + }, + { + "epoch": 1.5059486537257358, + "grad_norm": 3.408116340637207, + "learning_rate": 2.581459758879114e-05, + "loss": 1.0693, + "step": 9620 + }, + { + "epoch": 1.506105197244834, + "grad_norm": 5.5854291915893555, + "learning_rate": 2.5806451612903226e-05, + "loss": 0.6863, + "step": 9621 + }, + { + "epoch": 1.5062617407639323, + "grad_norm": 4.314059257507324, + "learning_rate": 2.5798305637015313e-05, + "loss": 0.7063, + "step": 9622 + }, + { + "epoch": 1.5064182842830307, + "grad_norm": 3.5036556720733643, + "learning_rate": 2.5790159661127407e-05, + "loss": 0.9551, + "step": 9623 + }, + { + "epoch": 1.5065748278021291, + "grad_norm": 5.416233062744141, + "learning_rate": 2.5782013685239494e-05, + "loss": 1.0891, + "step": 9624 + }, + { + "epoch": 1.5067313713212274, + "grad_norm": 3.431368350982666, + "learning_rate": 2.5773867709351577e-05, + "loss": 0.9881, + "step": 9625 + }, + { + "epoch": 1.5068879148403256, + "grad_norm": 
2.9430055618286133, + "learning_rate": 2.576572173346367e-05, + "loss": 1.528, + "step": 9626 + }, + { + "epoch": 1.5070444583594238, + "grad_norm": 2.908641815185547, + "learning_rate": 2.575757575757576e-05, + "loss": 0.4115, + "step": 9627 + }, + { + "epoch": 1.5072010018785222, + "grad_norm": 3.0178334712982178, + "learning_rate": 2.5749429781687846e-05, + "loss": 1.1971, + "step": 9628 + }, + { + "epoch": 1.5073575453976207, + "grad_norm": 3.073467969894409, + "learning_rate": 2.5741283805799936e-05, + "loss": 1.2062, + "step": 9629 + }, + { + "epoch": 1.507514088916719, + "grad_norm": 2.085393190383911, + "learning_rate": 2.5733137829912023e-05, + "loss": 0.6303, + "step": 9630 + }, + { + "epoch": 1.5076706324358171, + "grad_norm": 3.482193946838379, + "learning_rate": 2.572499185402411e-05, + "loss": 1.2933, + "step": 9631 + }, + { + "epoch": 1.5078271759549153, + "grad_norm": 4.107168197631836, + "learning_rate": 2.5716845878136204e-05, + "loss": 0.9358, + "step": 9632 + }, + { + "epoch": 1.5079837194740138, + "grad_norm": 1.5427114963531494, + "learning_rate": 2.570869990224829e-05, + "loss": 0.6145, + "step": 9633 + }, + { + "epoch": 1.5081402629931122, + "grad_norm": 2.167677402496338, + "learning_rate": 2.5700553926360375e-05, + "loss": 0.5657, + "step": 9634 + }, + { + "epoch": 1.5082968065122104, + "grad_norm": 3.3868002891540527, + "learning_rate": 2.569240795047247e-05, + "loss": 0.831, + "step": 9635 + }, + { + "epoch": 1.5084533500313086, + "grad_norm": 2.888097047805786, + "learning_rate": 2.5684261974584556e-05, + "loss": 0.7297, + "step": 9636 + }, + { + "epoch": 1.5086098935504069, + "grad_norm": 2.0489985942840576, + "learning_rate": 2.5676115998696643e-05, + "loss": 0.5339, + "step": 9637 + }, + { + "epoch": 1.5087664370695053, + "grad_norm": 2.0418801307678223, + "learning_rate": 2.5667970022808734e-05, + "loss": 0.6327, + "step": 9638 + }, + { + "epoch": 1.5089229805886037, + "grad_norm": 0.6422616839408875, + "learning_rate": 
2.565982404692082e-05, + "loss": 0.2347, + "step": 9639 + }, + { + "epoch": 1.509079524107702, + "grad_norm": 0.5398318767547607, + "learning_rate": 2.5651678071032908e-05, + "loss": 0.233, + "step": 9640 + }, + { + "epoch": 1.5092360676268002, + "grad_norm": 0.499836266040802, + "learning_rate": 2.5643532095145002e-05, + "loss": 0.2228, + "step": 9641 + }, + { + "epoch": 1.5093926111458984, + "grad_norm": 0.4023159444332123, + "learning_rate": 2.563538611925709e-05, + "loss": 0.1483, + "step": 9642 + }, + { + "epoch": 1.5095491546649968, + "grad_norm": 0.6737760901451111, + "learning_rate": 2.5627240143369173e-05, + "loss": 0.166, + "step": 9643 + }, + { + "epoch": 1.5097056981840953, + "grad_norm": 0.5744112133979797, + "learning_rate": 2.5619094167481267e-05, + "loss": 0.2559, + "step": 9644 + }, + { + "epoch": 1.5098622417031935, + "grad_norm": 0.6070721745491028, + "learning_rate": 2.5610948191593354e-05, + "loss": 0.2663, + "step": 9645 + }, + { + "epoch": 1.5100187852222917, + "grad_norm": 0.6761013865470886, + "learning_rate": 2.560280221570544e-05, + "loss": 0.2609, + "step": 9646 + }, + { + "epoch": 1.5101753287413902, + "grad_norm": 0.8536835312843323, + "learning_rate": 2.5594656239817532e-05, + "loss": 0.4019, + "step": 9647 + }, + { + "epoch": 1.5103318722604884, + "grad_norm": 1.5906157493591309, + "learning_rate": 2.558651026392962e-05, + "loss": 0.3005, + "step": 9648 + }, + { + "epoch": 1.5104884157795868, + "grad_norm": 0.8467947244644165, + "learning_rate": 2.5578364288041706e-05, + "loss": 0.3488, + "step": 9649 + }, + { + "epoch": 1.510644959298685, + "grad_norm": 0.6715324521064758, + "learning_rate": 2.55702183121538e-05, + "loss": 0.1976, + "step": 9650 + }, + { + "epoch": 1.5108015028177832, + "grad_norm": 0.947036623954773, + "learning_rate": 2.5562072336265887e-05, + "loss": 0.3173, + "step": 9651 + }, + { + "epoch": 1.5109580463368817, + "grad_norm": 1.2199866771697998, + "learning_rate": 2.555392636037797e-05, + "loss": 0.3812, + 
"step": 9652 + }, + { + "epoch": 1.5111145898559801, + "grad_norm": 1.0950279235839844, + "learning_rate": 2.5545780384490065e-05, + "loss": 0.4747, + "step": 9653 + }, + { + "epoch": 1.5112711333750783, + "grad_norm": 2.057777166366577, + "learning_rate": 2.5537634408602152e-05, + "loss": 0.3816, + "step": 9654 + }, + { + "epoch": 1.5114276768941766, + "grad_norm": 1.4696046113967896, + "learning_rate": 2.552948843271424e-05, + "loss": 0.4657, + "step": 9655 + }, + { + "epoch": 1.5115842204132748, + "grad_norm": 1.0610140562057495, + "learning_rate": 2.552134245682633e-05, + "loss": 0.4178, + "step": 9656 + }, + { + "epoch": 1.5117407639323732, + "grad_norm": 1.0931453704833984, + "learning_rate": 2.5513196480938417e-05, + "loss": 0.23, + "step": 9657 + }, + { + "epoch": 1.5118973074514717, + "grad_norm": 0.975875198841095, + "learning_rate": 2.5505050505050504e-05, + "loss": 0.3265, + "step": 9658 + }, + { + "epoch": 1.5120538509705699, + "grad_norm": 3.0775396823883057, + "learning_rate": 2.5496904529162598e-05, + "loss": 0.3837, + "step": 9659 + }, + { + "epoch": 1.512210394489668, + "grad_norm": 2.1198456287384033, + "learning_rate": 2.548875855327468e-05, + "loss": 0.5, + "step": 9660 + }, + { + "epoch": 1.5123669380087663, + "grad_norm": 2.4716999530792236, + "learning_rate": 2.548061257738677e-05, + "loss": 0.4841, + "step": 9661 + }, + { + "epoch": 1.5125234815278648, + "grad_norm": 3.426034688949585, + "learning_rate": 2.5472466601498863e-05, + "loss": 0.4609, + "step": 9662 + }, + { + "epoch": 1.5126800250469632, + "grad_norm": 4.156007289886475, + "learning_rate": 2.546432062561095e-05, + "loss": 0.9834, + "step": 9663 + }, + { + "epoch": 1.5128365685660614, + "grad_norm": 2.9961304664611816, + "learning_rate": 2.5456174649723037e-05, + "loss": 0.7588, + "step": 9664 + }, + { + "epoch": 1.5129931120851596, + "grad_norm": 1.6190215349197388, + "learning_rate": 2.5448028673835127e-05, + "loss": 0.6024, + "step": 9665 + }, + { + "epoch": 
1.5131496556042578, + "grad_norm": 2.636300563812256, + "learning_rate": 2.5439882697947214e-05, + "loss": 0.5152, + "step": 9666 + }, + { + "epoch": 1.5133061991233563, + "grad_norm": 1.9882903099060059, + "learning_rate": 2.54317367220593e-05, + "loss": 0.8403, + "step": 9667 + }, + { + "epoch": 1.5134627426424547, + "grad_norm": 2.1909849643707275, + "learning_rate": 2.5423590746171396e-05, + "loss": 0.7247, + "step": 9668 + }, + { + "epoch": 1.513619286161553, + "grad_norm": 2.4098691940307617, + "learning_rate": 2.541544477028348e-05, + "loss": 0.5747, + "step": 9669 + }, + { + "epoch": 1.5137758296806512, + "grad_norm": 3.890497922897339, + "learning_rate": 2.5407298794395566e-05, + "loss": 0.4985, + "step": 9670 + }, + { + "epoch": 1.5139323731997494, + "grad_norm": 2.500171184539795, + "learning_rate": 2.539915281850766e-05, + "loss": 0.532, + "step": 9671 + }, + { + "epoch": 1.5140889167188478, + "grad_norm": 3.5970990657806396, + "learning_rate": 2.5391006842619747e-05, + "loss": 0.7352, + "step": 9672 + }, + { + "epoch": 1.5142454602379463, + "grad_norm": 2.4700024127960205, + "learning_rate": 2.5382860866731835e-05, + "loss": 0.4897, + "step": 9673 + }, + { + "epoch": 1.5144020037570445, + "grad_norm": 5.7217817306518555, + "learning_rate": 2.5374714890843925e-05, + "loss": 1.0817, + "step": 9674 + }, + { + "epoch": 1.5145585472761427, + "grad_norm": 3.4545400142669678, + "learning_rate": 2.5366568914956012e-05, + "loss": 1.1898, + "step": 9675 + }, + { + "epoch": 1.514715090795241, + "grad_norm": 3.5168509483337402, + "learning_rate": 2.53584229390681e-05, + "loss": 0.7133, + "step": 9676 + }, + { + "epoch": 1.5148716343143394, + "grad_norm": 3.8286666870117188, + "learning_rate": 2.5350276963180193e-05, + "loss": 0.6728, + "step": 9677 + }, + { + "epoch": 1.5150281778334378, + "grad_norm": 4.160821914672852, + "learning_rate": 2.5342130987292277e-05, + "loss": 1.0711, + "step": 9678 + }, + { + "epoch": 1.515184721352536, + "grad_norm": 
4.709293842315674, + "learning_rate": 2.5333985011404364e-05, + "loss": 1.2195, + "step": 9679 + }, + { + "epoch": 1.5153412648716342, + "grad_norm": 2.6233389377593994, + "learning_rate": 2.5325839035516458e-05, + "loss": 0.6838, + "step": 9680 + }, + { + "epoch": 1.5154978083907327, + "grad_norm": 4.976420879364014, + "learning_rate": 2.5317693059628545e-05, + "loss": 1.3726, + "step": 9681 + }, + { + "epoch": 1.5156543519098309, + "grad_norm": 5.006450653076172, + "learning_rate": 2.5309547083740632e-05, + "loss": 0.902, + "step": 9682 + }, + { + "epoch": 1.5158108954289293, + "grad_norm": 3.309006690979004, + "learning_rate": 2.5301401107852723e-05, + "loss": 1.7587, + "step": 9683 + }, + { + "epoch": 1.5159674389480275, + "grad_norm": 3.1780662536621094, + "learning_rate": 2.529325513196481e-05, + "loss": 1.0629, + "step": 9684 + }, + { + "epoch": 1.5161239824671258, + "grad_norm": 4.424006938934326, + "learning_rate": 2.5285109156076897e-05, + "loss": 0.4498, + "step": 9685 + }, + { + "epoch": 1.5162805259862242, + "grad_norm": 9.340277671813965, + "learning_rate": 2.527696318018899e-05, + "loss": 0.7871, + "step": 9686 + }, + { + "epoch": 1.5164370695053226, + "grad_norm": 3.9194276332855225, + "learning_rate": 2.5268817204301075e-05, + "loss": 0.655, + "step": 9687 + }, + { + "epoch": 1.5165936130244209, + "grad_norm": 2.9457154273986816, + "learning_rate": 2.5260671228413162e-05, + "loss": 0.7517, + "step": 9688 + }, + { + "epoch": 1.516750156543519, + "grad_norm": 0.501905620098114, + "learning_rate": 2.5252525252525256e-05, + "loss": 0.2448, + "step": 9689 + }, + { + "epoch": 1.5169067000626173, + "grad_norm": 0.7116760015487671, + "learning_rate": 2.5244379276637343e-05, + "loss": 0.1912, + "step": 9690 + }, + { + "epoch": 1.5170632435817157, + "grad_norm": 0.4621204435825348, + "learning_rate": 2.523623330074943e-05, + "loss": 0.1733, + "step": 9691 + }, + { + "epoch": 1.5172197871008142, + "grad_norm": 0.9050464034080505, + "learning_rate": 
2.522808732486152e-05, + "loss": 0.3635, + "step": 9692 + }, + { + "epoch": 1.5173763306199124, + "grad_norm": 0.6342960596084595, + "learning_rate": 2.5219941348973608e-05, + "loss": 0.1898, + "step": 9693 + }, + { + "epoch": 1.5175328741390106, + "grad_norm": 0.9422409534454346, + "learning_rate": 2.5211795373085695e-05, + "loss": 0.1667, + "step": 9694 + }, + { + "epoch": 1.5176894176581088, + "grad_norm": 0.9081127047538757, + "learning_rate": 2.5203649397197785e-05, + "loss": 0.2441, + "step": 9695 + }, + { + "epoch": 1.5178459611772073, + "grad_norm": 0.6492360830307007, + "learning_rate": 2.5195503421309873e-05, + "loss": 0.277, + "step": 9696 + }, + { + "epoch": 1.5180025046963057, + "grad_norm": 6.221653461456299, + "learning_rate": 2.518735744542196e-05, + "loss": 0.312, + "step": 9697 + }, + { + "epoch": 1.518159048215404, + "grad_norm": 1.2717584371566772, + "learning_rate": 2.5179211469534054e-05, + "loss": 0.415, + "step": 9698 + }, + { + "epoch": 1.5183155917345021, + "grad_norm": 0.6162561178207397, + "learning_rate": 2.517106549364614e-05, + "loss": 0.2104, + "step": 9699 + }, + { + "epoch": 1.5184721352536004, + "grad_norm": 1.1228376626968384, + "learning_rate": 2.5162919517758228e-05, + "loss": 0.2885, + "step": 9700 + }, + { + "epoch": 1.5186286787726988, + "grad_norm": 1.0652949810028076, + "learning_rate": 2.515477354187032e-05, + "loss": 0.3866, + "step": 9701 + }, + { + "epoch": 1.5187852222917972, + "grad_norm": 0.9229676127433777, + "learning_rate": 2.5146627565982406e-05, + "loss": 0.2169, + "step": 9702 + }, + { + "epoch": 1.5189417658108955, + "grad_norm": 1.7707406282424927, + "learning_rate": 2.5138481590094493e-05, + "loss": 0.2559, + "step": 9703 + }, + { + "epoch": 1.5190983093299937, + "grad_norm": 0.8050142526626587, + "learning_rate": 2.5130335614206583e-05, + "loss": 0.2739, + "step": 9704 + }, + { + "epoch": 1.519254852849092, + "grad_norm": 2.1986007690429688, + "learning_rate": 2.512218963831867e-05, + "loss": 0.2606, + 
"step": 9705 + }, + { + "epoch": 1.5194113963681903, + "grad_norm": 1.3240673542022705, + "learning_rate": 2.5114043662430758e-05, + "loss": 0.2986, + "step": 9706 + }, + { + "epoch": 1.5195679398872888, + "grad_norm": 1.2714520692825317, + "learning_rate": 2.510589768654285e-05, + "loss": 0.4601, + "step": 9707 + }, + { + "epoch": 1.519724483406387, + "grad_norm": 2.3956596851348877, + "learning_rate": 2.509775171065494e-05, + "loss": 0.485, + "step": 9708 + }, + { + "epoch": 1.5198810269254852, + "grad_norm": 1.7654016017913818, + "learning_rate": 2.5089605734767026e-05, + "loss": 0.383, + "step": 9709 + }, + { + "epoch": 1.5200375704445834, + "grad_norm": 1.704193353652954, + "learning_rate": 2.5081459758879116e-05, + "loss": 0.4857, + "step": 9710 + }, + { + "epoch": 1.5201941139636819, + "grad_norm": 2.721149206161499, + "learning_rate": 2.5073313782991203e-05, + "loss": 0.6581, + "step": 9711 + }, + { + "epoch": 1.5203506574827803, + "grad_norm": 1.609613299369812, + "learning_rate": 2.506516780710329e-05, + "loss": 0.4163, + "step": 9712 + }, + { + "epoch": 1.5205072010018785, + "grad_norm": 1.8533555269241333, + "learning_rate": 2.505702183121538e-05, + "loss": 0.5969, + "step": 9713 + }, + { + "epoch": 1.5206637445209767, + "grad_norm": 2.950644016265869, + "learning_rate": 2.5048875855327468e-05, + "loss": 0.5492, + "step": 9714 + }, + { + "epoch": 1.5208202880400752, + "grad_norm": 3.1184802055358887, + "learning_rate": 2.5040729879439555e-05, + "loss": 0.4863, + "step": 9715 + }, + { + "epoch": 1.5209768315591734, + "grad_norm": 2.473741054534912, + "learning_rate": 2.503258390355165e-05, + "loss": 0.5926, + "step": 9716 + }, + { + "epoch": 1.5211333750782718, + "grad_norm": 2.0763349533081055, + "learning_rate": 2.5024437927663736e-05, + "loss": 0.4989, + "step": 9717 + }, + { + "epoch": 1.52128991859737, + "grad_norm": 5.198651313781738, + "learning_rate": 2.5016291951775824e-05, + "loss": 0.8227, + "step": 9718 + }, + { + "epoch": 1.5214464621164683, 
+ "grad_norm": 9.21263313293457, + "learning_rate": 2.5008145975887914e-05, + "loss": 0.7132, + "step": 9719 + }, + { + "epoch": 1.5216030056355667, + "grad_norm": 3.380812168121338, + "learning_rate": 2.5e-05, + "loss": 0.8681, + "step": 9720 + }, + { + "epoch": 1.5217595491546652, + "grad_norm": 13.304729461669922, + "learning_rate": 2.4991854024112092e-05, + "loss": 1.3878, + "step": 9721 + }, + { + "epoch": 1.5219160926737634, + "grad_norm": 6.864569187164307, + "learning_rate": 2.498370804822418e-05, + "loss": 0.7735, + "step": 9722 + }, + { + "epoch": 1.5220726361928616, + "grad_norm": 2.914254903793335, + "learning_rate": 2.4975562072336266e-05, + "loss": 0.6292, + "step": 9723 + }, + { + "epoch": 1.5222291797119598, + "grad_norm": 11.456281661987305, + "learning_rate": 2.4967416096448357e-05, + "loss": 0.9687, + "step": 9724 + }, + { + "epoch": 1.5223857232310583, + "grad_norm": 2.8840293884277344, + "learning_rate": 2.4959270120560444e-05, + "loss": 0.8283, + "step": 9725 + }, + { + "epoch": 1.5225422667501567, + "grad_norm": 2.6319406032562256, + "learning_rate": 2.4951124144672534e-05, + "loss": 0.8112, + "step": 9726 + }, + { + "epoch": 1.522698810269255, + "grad_norm": 2.625075101852417, + "learning_rate": 2.494297816878462e-05, + "loss": 0.8554, + "step": 9727 + }, + { + "epoch": 1.5228553537883531, + "grad_norm": 4.039719581604004, + "learning_rate": 2.493483219289671e-05, + "loss": 1.2555, + "step": 9728 + }, + { + "epoch": 1.5230118973074513, + "grad_norm": 3.0091447830200195, + "learning_rate": 2.49266862170088e-05, + "loss": 0.9961, + "step": 9729 + }, + { + "epoch": 1.5231684408265498, + "grad_norm": 5.673411846160889, + "learning_rate": 2.491854024112089e-05, + "loss": 1.5284, + "step": 9730 + }, + { + "epoch": 1.5233249843456482, + "grad_norm": 3.6839756965637207, + "learning_rate": 2.4910394265232977e-05, + "loss": 1.0632, + "step": 9731 + }, + { + "epoch": 1.5234815278647464, + "grad_norm": 2.587073564529419, + "learning_rate": 
2.4902248289345064e-05, + "loss": 0.9816, + "step": 9732 + }, + { + "epoch": 1.5236380713838447, + "grad_norm": 3.492713689804077, + "learning_rate": 2.4894102313457154e-05, + "loss": 0.6654, + "step": 9733 + }, + { + "epoch": 1.5237946149029429, + "grad_norm": 3.981224536895752, + "learning_rate": 2.488595633756924e-05, + "loss": 0.9969, + "step": 9734 + }, + { + "epoch": 1.5239511584220413, + "grad_norm": 1.464372158050537, + "learning_rate": 2.4877810361681332e-05, + "loss": 0.7017, + "step": 9735 + }, + { + "epoch": 1.5241077019411398, + "grad_norm": 3.22999906539917, + "learning_rate": 2.486966438579342e-05, + "loss": 0.7156, + "step": 9736 + }, + { + "epoch": 1.524264245460238, + "grad_norm": 3.839923143386841, + "learning_rate": 2.4861518409905506e-05, + "loss": 0.5332, + "step": 9737 + }, + { + "epoch": 1.5244207889793362, + "grad_norm": 4.80755615234375, + "learning_rate": 2.4853372434017597e-05, + "loss": 0.7031, + "step": 9738 + }, + { + "epoch": 1.5245773324984344, + "grad_norm": 0.5337120890617371, + "learning_rate": 2.4845226458129687e-05, + "loss": 0.1849, + "step": 9739 + }, + { + "epoch": 1.5247338760175329, + "grad_norm": 0.52425217628479, + "learning_rate": 2.4837080482241774e-05, + "loss": 0.188, + "step": 9740 + }, + { + "epoch": 1.5248904195366313, + "grad_norm": 0.3951246738433838, + "learning_rate": 2.482893450635386e-05, + "loss": 0.1775, + "step": 9741 + }, + { + "epoch": 1.5250469630557295, + "grad_norm": 0.497856467962265, + "learning_rate": 2.4820788530465952e-05, + "loss": 0.2184, + "step": 9742 + }, + { + "epoch": 1.5252035065748277, + "grad_norm": 1.0039336681365967, + "learning_rate": 2.481264255457804e-05, + "loss": 0.2367, + "step": 9743 + }, + { + "epoch": 1.525360050093926, + "grad_norm": 0.6684547066688538, + "learning_rate": 2.480449657869013e-05, + "loss": 0.2797, + "step": 9744 + }, + { + "epoch": 1.5255165936130244, + "grad_norm": 1.2803349494934082, + "learning_rate": 2.4796350602802217e-05, + "loss": 0.3069, + "step": 
9745 + }, + { + "epoch": 1.5256731371321228, + "grad_norm": 1.769838809967041, + "learning_rate": 2.4788204626914304e-05, + "loss": 0.4608, + "step": 9746 + }, + { + "epoch": 1.525829680651221, + "grad_norm": 2.9705052375793457, + "learning_rate": 2.4780058651026395e-05, + "loss": 0.2383, + "step": 9747 + }, + { + "epoch": 1.5259862241703193, + "grad_norm": 0.7684370875358582, + "learning_rate": 2.4771912675138485e-05, + "loss": 0.1256, + "step": 9748 + }, + { + "epoch": 1.5261427676894177, + "grad_norm": 0.9183100461959839, + "learning_rate": 2.4763766699250572e-05, + "loss": 0.2422, + "step": 9749 + }, + { + "epoch": 1.526299311208516, + "grad_norm": 1.1035720109939575, + "learning_rate": 2.475562072336266e-05, + "loss": 0.3725, + "step": 9750 + }, + { + "epoch": 1.5264558547276144, + "grad_norm": 1.7718714475631714, + "learning_rate": 2.474747474747475e-05, + "loss": 0.4257, + "step": 9751 + }, + { + "epoch": 1.5266123982467126, + "grad_norm": 0.9140228033065796, + "learning_rate": 2.4739328771586837e-05, + "loss": 0.3614, + "step": 9752 + }, + { + "epoch": 1.5267689417658108, + "grad_norm": 1.5046706199645996, + "learning_rate": 2.4731182795698928e-05, + "loss": 0.3863, + "step": 9753 + }, + { + "epoch": 1.5269254852849092, + "grad_norm": 1.8662611246109009, + "learning_rate": 2.4723036819811015e-05, + "loss": 0.5777, + "step": 9754 + }, + { + "epoch": 1.5270820288040077, + "grad_norm": 4.0693745613098145, + "learning_rate": 2.4714890843923102e-05, + "loss": 0.4204, + "step": 9755 + }, + { + "epoch": 1.527238572323106, + "grad_norm": 1.3963104486465454, + "learning_rate": 2.4706744868035192e-05, + "loss": 0.3927, + "step": 9756 + }, + { + "epoch": 1.527395115842204, + "grad_norm": 1.5740095376968384, + "learning_rate": 2.4698598892147283e-05, + "loss": 0.4969, + "step": 9757 + }, + { + "epoch": 1.5275516593613023, + "grad_norm": 1.1160489320755005, + "learning_rate": 2.469045291625937e-05, + "loss": 0.2777, + "step": 9758 + }, + { + "epoch": 1.5277082028804008, 
+ "grad_norm": 1.9066733121871948, + "learning_rate": 2.4682306940371457e-05, + "loss": 0.5496, + "step": 9759 + }, + { + "epoch": 1.5278647463994992, + "grad_norm": 4.845818996429443, + "learning_rate": 2.4674160964483548e-05, + "loss": 0.6643, + "step": 9760 + }, + { + "epoch": 1.5280212899185974, + "grad_norm": 1.4926012754440308, + "learning_rate": 2.4666014988595635e-05, + "loss": 0.2596, + "step": 9761 + }, + { + "epoch": 1.5281778334376956, + "grad_norm": 2.8589468002319336, + "learning_rate": 2.4657869012707725e-05, + "loss": 0.7794, + "step": 9762 + }, + { + "epoch": 1.5283343769567939, + "grad_norm": 2.4323747158050537, + "learning_rate": 2.4649723036819812e-05, + "loss": 0.6374, + "step": 9763 + }, + { + "epoch": 1.5284909204758923, + "grad_norm": 2.731858730316162, + "learning_rate": 2.46415770609319e-05, + "loss": 0.4849, + "step": 9764 + }, + { + "epoch": 1.5286474639949907, + "grad_norm": 2.079498529434204, + "learning_rate": 2.463343108504399e-05, + "loss": 0.5899, + "step": 9765 + }, + { + "epoch": 1.528804007514089, + "grad_norm": 3.2891364097595215, + "learning_rate": 2.462528510915608e-05, + "loss": 0.8825, + "step": 9766 + }, + { + "epoch": 1.5289605510331872, + "grad_norm": 6.84429931640625, + "learning_rate": 2.4617139133268168e-05, + "loss": 1.17, + "step": 9767 + }, + { + "epoch": 1.5291170945522854, + "grad_norm": 3.3965518474578857, + "learning_rate": 2.4608993157380255e-05, + "loss": 0.4404, + "step": 9768 + }, + { + "epoch": 1.5292736380713838, + "grad_norm": 2.295670986175537, + "learning_rate": 2.4600847181492345e-05, + "loss": 0.6247, + "step": 9769 + }, + { + "epoch": 1.5294301815904823, + "grad_norm": 2.6868932247161865, + "learning_rate": 2.4592701205604433e-05, + "loss": 0.8283, + "step": 9770 + }, + { + "epoch": 1.5295867251095805, + "grad_norm": 2.342942953109741, + "learning_rate": 2.4584555229716523e-05, + "loss": 0.5322, + "step": 9771 + }, + { + "epoch": 1.5297432686286787, + "grad_norm": 3.583503007888794, + 
"learning_rate": 2.457640925382861e-05, + "loss": 0.7185, + "step": 9772 + }, + { + "epoch": 1.529899812147777, + "grad_norm": 2.676663398742676, + "learning_rate": 2.4568263277940697e-05, + "loss": 0.8309, + "step": 9773 + }, + { + "epoch": 1.5300563556668754, + "grad_norm": 2.0790555477142334, + "learning_rate": 2.4560117302052788e-05, + "loss": 0.3359, + "step": 9774 + }, + { + "epoch": 1.5302128991859738, + "grad_norm": 3.501311779022217, + "learning_rate": 2.455197132616488e-05, + "loss": 1.2159, + "step": 9775 + }, + { + "epoch": 1.530369442705072, + "grad_norm": 2.548949718475342, + "learning_rate": 2.4543825350276962e-05, + "loss": 0.9086, + "step": 9776 + }, + { + "epoch": 1.5305259862241702, + "grad_norm": 3.355860471725464, + "learning_rate": 2.4535679374389053e-05, + "loss": 1.0175, + "step": 9777 + }, + { + "epoch": 1.5306825297432687, + "grad_norm": 4.0084710121154785, + "learning_rate": 2.4527533398501143e-05, + "loss": 1.3137, + "step": 9778 + }, + { + "epoch": 1.530839073262367, + "grad_norm": 3.916973829269409, + "learning_rate": 2.451938742261323e-05, + "loss": 0.8767, + "step": 9779 + }, + { + "epoch": 1.5309956167814653, + "grad_norm": 7.185335159301758, + "learning_rate": 2.451124144672532e-05, + "loss": 1.4429, + "step": 9780 + }, + { + "epoch": 1.5311521603005636, + "grad_norm": 3.5824618339538574, + "learning_rate": 2.4503095470837408e-05, + "loss": 0.9839, + "step": 9781 + }, + { + "epoch": 1.5313087038196618, + "grad_norm": 4.431051731109619, + "learning_rate": 2.4494949494949495e-05, + "loss": 1.8262, + "step": 9782 + }, + { + "epoch": 1.5314652473387602, + "grad_norm": 4.402218818664551, + "learning_rate": 2.4486803519061586e-05, + "loss": 0.685, + "step": 9783 + }, + { + "epoch": 1.5316217908578584, + "grad_norm": 1.280283808708191, + "learning_rate": 2.4478657543173673e-05, + "loss": 0.3443, + "step": 9784 + }, + { + "epoch": 1.5317783343769569, + "grad_norm": 2.327603816986084, + "learning_rate": 2.447051156728576e-05, + "loss": 
0.7927, + "step": 9785 + }, + { + "epoch": 1.531934877896055, + "grad_norm": 2.883913040161133, + "learning_rate": 2.446236559139785e-05, + "loss": 0.547, + "step": 9786 + }, + { + "epoch": 1.5320914214151533, + "grad_norm": 2.25495982170105, + "learning_rate": 2.4454219615509938e-05, + "loss": 0.7012, + "step": 9787 + }, + { + "epoch": 1.5322479649342517, + "grad_norm": 2.3975868225097656, + "learning_rate": 2.4446073639622028e-05, + "loss": 0.4134, + "step": 9788 + }, + { + "epoch": 1.5324045084533502, + "grad_norm": 0.4930766522884369, + "learning_rate": 2.4437927663734115e-05, + "loss": 0.2394, + "step": 9789 + }, + { + "epoch": 1.5325610519724484, + "grad_norm": 0.8989336490631104, + "learning_rate": 2.4429781687846202e-05, + "loss": 0.1751, + "step": 9790 + }, + { + "epoch": 1.5327175954915466, + "grad_norm": 1.1957112550735474, + "learning_rate": 2.4421635711958293e-05, + "loss": 0.3115, + "step": 9791 + }, + { + "epoch": 1.5328741390106448, + "grad_norm": 0.4295750856399536, + "learning_rate": 2.4413489736070384e-05, + "loss": 0.1989, + "step": 9792 + }, + { + "epoch": 1.5330306825297433, + "grad_norm": 0.40903469920158386, + "learning_rate": 2.440534376018247e-05, + "loss": 0.1456, + "step": 9793 + }, + { + "epoch": 1.5331872260488417, + "grad_norm": 0.7719000577926636, + "learning_rate": 2.4397197784294558e-05, + "loss": 0.2576, + "step": 9794 + }, + { + "epoch": 1.53334376956794, + "grad_norm": 1.234179139137268, + "learning_rate": 2.438905180840665e-05, + "loss": 0.184, + "step": 9795 + }, + { + "epoch": 1.5335003130870382, + "grad_norm": 1.0665225982666016, + "learning_rate": 2.4380905832518735e-05, + "loss": 0.2177, + "step": 9796 + }, + { + "epoch": 1.5336568566061364, + "grad_norm": 0.664923369884491, + "learning_rate": 2.4372759856630826e-05, + "loss": 0.2268, + "step": 9797 + }, + { + "epoch": 1.5338134001252348, + "grad_norm": 0.781663715839386, + "learning_rate": 2.4364613880742913e-05, + "loss": 0.227, + "step": 9798 + }, + { + "epoch": 
1.5339699436443333, + "grad_norm": 1.092073678970337, + "learning_rate": 2.4356467904855e-05, + "loss": 0.4392, + "step": 9799 + }, + { + "epoch": 1.5341264871634315, + "grad_norm": 0.6115849614143372, + "learning_rate": 2.434832192896709e-05, + "loss": 0.21, + "step": 9800 + }, + { + "epoch": 1.5342830306825297, + "grad_norm": 1.9208537340164185, + "learning_rate": 2.434017595307918e-05, + "loss": 0.4419, + "step": 9801 + }, + { + "epoch": 1.534439574201628, + "grad_norm": 1.5528136491775513, + "learning_rate": 2.433202997719127e-05, + "loss": 0.294, + "step": 9802 + }, + { + "epoch": 1.5345961177207263, + "grad_norm": 0.7973250150680542, + "learning_rate": 2.4323884001303356e-05, + "loss": 0.1595, + "step": 9803 + }, + { + "epoch": 1.5347526612398248, + "grad_norm": 1.4548386335372925, + "learning_rate": 2.4315738025415446e-05, + "loss": 0.1955, + "step": 9804 + }, + { + "epoch": 1.534909204758923, + "grad_norm": 1.2545663118362427, + "learning_rate": 2.4307592049527533e-05, + "loss": 0.3924, + "step": 9805 + }, + { + "epoch": 1.5350657482780212, + "grad_norm": 2.162946939468384, + "learning_rate": 2.4299446073639624e-05, + "loss": 0.2231, + "step": 9806 + }, + { + "epoch": 1.5352222917971194, + "grad_norm": 2.4246628284454346, + "learning_rate": 2.429130009775171e-05, + "loss": 0.3344, + "step": 9807 + }, + { + "epoch": 1.5353788353162179, + "grad_norm": 1.461885929107666, + "learning_rate": 2.4283154121863798e-05, + "loss": 0.4894, + "step": 9808 + }, + { + "epoch": 1.5355353788353163, + "grad_norm": 1.5469928979873657, + "learning_rate": 2.427500814597589e-05, + "loss": 0.4121, + "step": 9809 + }, + { + "epoch": 1.5356919223544145, + "grad_norm": 1.1098395586013794, + "learning_rate": 2.426686217008798e-05, + "loss": 0.3543, + "step": 9810 + }, + { + "epoch": 1.5358484658735128, + "grad_norm": 2.5992016792297363, + "learning_rate": 2.4258716194200066e-05, + "loss": 0.6546, + "step": 9811 + }, + { + "epoch": 1.5360050093926112, + "grad_norm": 4.049336910247803, 
+ "learning_rate": 2.4250570218312153e-05, + "loss": 0.4934, + "step": 9812 + }, + { + "epoch": 1.5361615529117094, + "grad_norm": 2.7644622325897217, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.5659, + "step": 9813 + }, + { + "epoch": 1.5363180964308079, + "grad_norm": 3.78848934173584, + "learning_rate": 2.423427826653633e-05, + "loss": 0.7214, + "step": 9814 + }, + { + "epoch": 1.536474639949906, + "grad_norm": 1.71207857131958, + "learning_rate": 2.422613229064842e-05, + "loss": 0.6018, + "step": 9815 + }, + { + "epoch": 1.5366311834690043, + "grad_norm": 3.1579442024230957, + "learning_rate": 2.421798631476051e-05, + "loss": 0.6268, + "step": 9816 + }, + { + "epoch": 1.5367877269881027, + "grad_norm": 2.4029157161712646, + "learning_rate": 2.4209840338872596e-05, + "loss": 0.4866, + "step": 9817 + }, + { + "epoch": 1.536944270507201, + "grad_norm": 2.011138439178467, + "learning_rate": 2.4201694362984686e-05, + "loss": 0.4068, + "step": 9818 + }, + { + "epoch": 1.5371008140262994, + "grad_norm": 5.196176052093506, + "learning_rate": 2.4193548387096777e-05, + "loss": 0.6784, + "step": 9819 + }, + { + "epoch": 1.5372573575453976, + "grad_norm": 4.582691192626953, + "learning_rate": 2.4185402411208864e-05, + "loss": 0.7025, + "step": 9820 + }, + { + "epoch": 1.5374139010644958, + "grad_norm": 2.518059253692627, + "learning_rate": 2.417725643532095e-05, + "loss": 1.2659, + "step": 9821 + }, + { + "epoch": 1.5375704445835943, + "grad_norm": 3.5704562664031982, + "learning_rate": 2.416911045943304e-05, + "loss": 1.1088, + "step": 9822 + }, + { + "epoch": 1.5377269881026927, + "grad_norm": 3.5118231773376465, + "learning_rate": 2.416096448354513e-05, + "loss": 0.7115, + "step": 9823 + }, + { + "epoch": 1.537883531621791, + "grad_norm": 5.194871425628662, + "learning_rate": 2.415281850765722e-05, + "loss": 0.6097, + "step": 9824 + }, + { + "epoch": 1.5380400751408891, + "grad_norm": 4.408677577972412, + "learning_rate": 2.4144672531769306e-05, + "loss": 
1.0906, + "step": 9825 + }, + { + "epoch": 1.5381966186599874, + "grad_norm": 2.772620916366577, + "learning_rate": 2.4136526555881394e-05, + "loss": 1.0466, + "step": 9826 + }, + { + "epoch": 1.5383531621790858, + "grad_norm": 3.693997859954834, + "learning_rate": 2.4128380579993484e-05, + "loss": 1.5499, + "step": 9827 + }, + { + "epoch": 1.5385097056981842, + "grad_norm": 2.347057580947876, + "learning_rate": 2.4120234604105575e-05, + "loss": 0.4773, + "step": 9828 + }, + { + "epoch": 1.5386662492172825, + "grad_norm": 1.9065396785736084, + "learning_rate": 2.4112088628217662e-05, + "loss": 0.5991, + "step": 9829 + }, + { + "epoch": 1.5388227927363807, + "grad_norm": 4.117300987243652, + "learning_rate": 2.410394265232975e-05, + "loss": 1.0916, + "step": 9830 + }, + { + "epoch": 1.538979336255479, + "grad_norm": 3.3792104721069336, + "learning_rate": 2.409579667644184e-05, + "loss": 1.1518, + "step": 9831 + }, + { + "epoch": 1.5391358797745773, + "grad_norm": 2.3964948654174805, + "learning_rate": 2.4087650700553927e-05, + "loss": 0.7905, + "step": 9832 + }, + { + "epoch": 1.5392924232936758, + "grad_norm": 4.1899189949035645, + "learning_rate": 2.4079504724666017e-05, + "loss": 1.1486, + "step": 9833 + }, + { + "epoch": 1.539448966812774, + "grad_norm": 1.9406132698059082, + "learning_rate": 2.4071358748778104e-05, + "loss": 0.2098, + "step": 9834 + }, + { + "epoch": 1.5396055103318722, + "grad_norm": 3.33667254447937, + "learning_rate": 2.406321277289019e-05, + "loss": 0.872, + "step": 9835 + }, + { + "epoch": 1.5397620538509704, + "grad_norm": 3.628021478652954, + "learning_rate": 2.4055066797002282e-05, + "loss": 1.3227, + "step": 9836 + }, + { + "epoch": 1.5399185973700689, + "grad_norm": 4.0646562576293945, + "learning_rate": 2.4046920821114372e-05, + "loss": 1.3163, + "step": 9837 + }, + { + "epoch": 1.5400751408891673, + "grad_norm": 2.7329697608947754, + "learning_rate": 2.403877484522646e-05, + "loss": 0.7568, + "step": 9838 + }, + { + "epoch": 
1.5402316844082655, + "grad_norm": 0.31973639130592346, + "learning_rate": 2.4030628869338547e-05, + "loss": 0.1817, + "step": 9839 + }, + { + "epoch": 1.5403882279273637, + "grad_norm": 0.49623650312423706, + "learning_rate": 2.4022482893450637e-05, + "loss": 0.174, + "step": 9840 + }, + { + "epoch": 1.540544771446462, + "grad_norm": 0.4573627710342407, + "learning_rate": 2.4014336917562724e-05, + "loss": 0.1367, + "step": 9841 + }, + { + "epoch": 1.5407013149655604, + "grad_norm": 0.6927856206893921, + "learning_rate": 2.4006190941674815e-05, + "loss": 0.1824, + "step": 9842 + }, + { + "epoch": 1.5408578584846588, + "grad_norm": 0.5725102424621582, + "learning_rate": 2.3998044965786902e-05, + "loss": 0.2149, + "step": 9843 + }, + { + "epoch": 1.541014402003757, + "grad_norm": 0.5977395176887512, + "learning_rate": 2.398989898989899e-05, + "loss": 0.2316, + "step": 9844 + }, + { + "epoch": 1.5411709455228553, + "grad_norm": 1.2062755823135376, + "learning_rate": 2.398175301401108e-05, + "loss": 0.2907, + "step": 9845 + }, + { + "epoch": 1.5413274890419537, + "grad_norm": 0.9903820753097534, + "learning_rate": 2.397360703812317e-05, + "loss": 0.2832, + "step": 9846 + }, + { + "epoch": 1.541484032561052, + "grad_norm": 1.2367730140686035, + "learning_rate": 2.3965461062235257e-05, + "loss": 0.2785, + "step": 9847 + }, + { + "epoch": 1.5416405760801504, + "grad_norm": 0.6876146197319031, + "learning_rate": 2.3957315086347345e-05, + "loss": 0.1481, + "step": 9848 + }, + { + "epoch": 1.5417971195992486, + "grad_norm": 0.8422080874443054, + "learning_rate": 2.3949169110459435e-05, + "loss": 0.2023, + "step": 9849 + }, + { + "epoch": 1.5419536631183468, + "grad_norm": 1.3894271850585938, + "learning_rate": 2.3941023134571522e-05, + "loss": 0.3074, + "step": 9850 + }, + { + "epoch": 1.5421102066374452, + "grad_norm": 1.113702416419983, + "learning_rate": 2.3932877158683613e-05, + "loss": 0.3104, + "step": 9851 + }, + { + "epoch": 1.5422667501565435, + "grad_norm": 
1.4769620895385742, + "learning_rate": 2.39247311827957e-05, + "loss": 0.3478, + "step": 9852 + }, + { + "epoch": 1.542423293675642, + "grad_norm": 1.3030887842178345, + "learning_rate": 2.3916585206907787e-05, + "loss": 0.5147, + "step": 9853 + }, + { + "epoch": 1.5425798371947401, + "grad_norm": 0.6365472674369812, + "learning_rate": 2.3908439231019878e-05, + "loss": 0.1961, + "step": 9854 + }, + { + "epoch": 1.5427363807138383, + "grad_norm": 1.3959211111068726, + "learning_rate": 2.3900293255131968e-05, + "loss": 0.4099, + "step": 9855 + }, + { + "epoch": 1.5428929242329368, + "grad_norm": 2.1118004322052, + "learning_rate": 2.3892147279244055e-05, + "loss": 0.6066, + "step": 9856 + }, + { + "epoch": 1.5430494677520352, + "grad_norm": 1.1651909351348877, + "learning_rate": 2.3884001303356142e-05, + "loss": 0.4401, + "step": 9857 + }, + { + "epoch": 1.5432060112711334, + "grad_norm": 1.2997528314590454, + "learning_rate": 2.3875855327468233e-05, + "loss": 0.4349, + "step": 9858 + }, + { + "epoch": 1.5433625547902317, + "grad_norm": 2.6824121475219727, + "learning_rate": 2.386770935158032e-05, + "loss": 0.4333, + "step": 9859 + }, + { + "epoch": 1.5435190983093299, + "grad_norm": 2.2237205505371094, + "learning_rate": 2.385956337569241e-05, + "loss": 0.6785, + "step": 9860 + }, + { + "epoch": 1.5436756418284283, + "grad_norm": 1.8722060918807983, + "learning_rate": 2.3851417399804498e-05, + "loss": 0.5185, + "step": 9861 + }, + { + "epoch": 1.5438321853475268, + "grad_norm": 0.9346224665641785, + "learning_rate": 2.3843271423916585e-05, + "loss": 0.3723, + "step": 9862 + }, + { + "epoch": 1.543988728866625, + "grad_norm": 2.745413064956665, + "learning_rate": 2.3835125448028675e-05, + "loss": 0.696, + "step": 9863 + }, + { + "epoch": 1.5441452723857232, + "grad_norm": 1.918119192123413, + "learning_rate": 2.3826979472140766e-05, + "loss": 0.4697, + "step": 9864 + }, + { + "epoch": 1.5443018159048214, + "grad_norm": 2.4792346954345703, + "learning_rate": 
2.3818833496252853e-05, + "loss": 0.6809, + "step": 9865 + }, + { + "epoch": 1.5444583594239198, + "grad_norm": 1.838979721069336, + "learning_rate": 2.381068752036494e-05, + "loss": 0.5236, + "step": 9866 + }, + { + "epoch": 1.5446149029430183, + "grad_norm": 1.5450053215026855, + "learning_rate": 2.380254154447703e-05, + "loss": 0.3494, + "step": 9867 + }, + { + "epoch": 1.5447714464621165, + "grad_norm": 2.5526158809661865, + "learning_rate": 2.3794395568589118e-05, + "loss": 0.6506, + "step": 9868 + }, + { + "epoch": 1.5449279899812147, + "grad_norm": 2.729252338409424, + "learning_rate": 2.3786249592701208e-05, + "loss": 0.6515, + "step": 9869 + }, + { + "epoch": 1.545084533500313, + "grad_norm": 3.2484097480773926, + "learning_rate": 2.3778103616813295e-05, + "loss": 0.7347, + "step": 9870 + }, + { + "epoch": 1.5452410770194114, + "grad_norm": 1.908268928527832, + "learning_rate": 2.3769957640925383e-05, + "loss": 0.5446, + "step": 9871 + }, + { + "epoch": 1.5453976205385098, + "grad_norm": 3.8127663135528564, + "learning_rate": 2.3761811665037473e-05, + "loss": 0.5999, + "step": 9872 + }, + { + "epoch": 1.545554164057608, + "grad_norm": 3.0954723358154297, + "learning_rate": 2.3753665689149564e-05, + "loss": 0.4459, + "step": 9873 + }, + { + "epoch": 1.5457107075767063, + "grad_norm": 1.9999969005584717, + "learning_rate": 2.374551971326165e-05, + "loss": 0.5722, + "step": 9874 + }, + { + "epoch": 1.5458672510958045, + "grad_norm": 2.761960029602051, + "learning_rate": 2.3737373737373738e-05, + "loss": 0.8499, + "step": 9875 + }, + { + "epoch": 1.546023794614903, + "grad_norm": 2.540869951248169, + "learning_rate": 2.372922776148583e-05, + "loss": 1.1027, + "step": 9876 + }, + { + "epoch": 1.5461803381340014, + "grad_norm": 3.8375866413116455, + "learning_rate": 2.3721081785597916e-05, + "loss": 1.1271, + "step": 9877 + }, + { + "epoch": 1.5463368816530996, + "grad_norm": 4.295181751251221, + "learning_rate": 2.3712935809710006e-05, + "loss": 0.9381, + 
"step": 9878 + }, + { + "epoch": 1.5464934251721978, + "grad_norm": 4.459695816040039, + "learning_rate": 2.3704789833822093e-05, + "loss": 0.9763, + "step": 9879 + }, + { + "epoch": 1.5466499686912962, + "grad_norm": 3.091482162475586, + "learning_rate": 2.369664385793418e-05, + "loss": 0.8941, + "step": 9880 + }, + { + "epoch": 1.5468065122103944, + "grad_norm": 2.6495602130889893, + "learning_rate": 2.368849788204627e-05, + "loss": 0.5458, + "step": 9881 + }, + { + "epoch": 1.5469630557294929, + "grad_norm": 2.3472275733947754, + "learning_rate": 2.368035190615836e-05, + "loss": 1.041, + "step": 9882 + }, + { + "epoch": 1.547119599248591, + "grad_norm": 3.1800882816314697, + "learning_rate": 2.367220593027045e-05, + "loss": 1.2572, + "step": 9883 + }, + { + "epoch": 1.5472761427676893, + "grad_norm": 2.02528715133667, + "learning_rate": 2.3664059954382536e-05, + "loss": 0.3931, + "step": 9884 + }, + { + "epoch": 1.5474326862867878, + "grad_norm": 2.0794012546539307, + "learning_rate": 2.3655913978494626e-05, + "loss": 0.7305, + "step": 9885 + }, + { + "epoch": 1.5475892298058862, + "grad_norm": 1.5974836349487305, + "learning_rate": 2.3647768002606713e-05, + "loss": 0.3859, + "step": 9886 + }, + { + "epoch": 1.5477457733249844, + "grad_norm": 5.602487087249756, + "learning_rate": 2.3639622026718804e-05, + "loss": 0.8413, + "step": 9887 + }, + { + "epoch": 1.5479023168440826, + "grad_norm": 2.291785478591919, + "learning_rate": 2.363147605083089e-05, + "loss": 0.8334, + "step": 9888 + }, + { + "epoch": 1.5480588603631809, + "grad_norm": 0.7016488909721375, + "learning_rate": 2.3623330074942978e-05, + "loss": 0.252, + "step": 9889 + }, + { + "epoch": 1.5482154038822793, + "grad_norm": 0.5047834515571594, + "learning_rate": 2.361518409905507e-05, + "loss": 0.2357, + "step": 9890 + }, + { + "epoch": 1.5483719474013777, + "grad_norm": 0.5415497422218323, + "learning_rate": 2.360703812316716e-05, + "loss": 0.2003, + "step": 9891 + }, + { + "epoch": 1.548528490920476, 
+ "grad_norm": 0.7412533164024353, + "learning_rate": 2.3598892147279243e-05, + "loss": 0.2214, + "step": 9892 + }, + { + "epoch": 1.5486850344395742, + "grad_norm": 0.6263313889503479, + "learning_rate": 2.3590746171391333e-05, + "loss": 0.2217, + "step": 9893 + }, + { + "epoch": 1.5488415779586724, + "grad_norm": 0.9283852577209473, + "learning_rate": 2.3582600195503424e-05, + "loss": 0.2902, + "step": 9894 + }, + { + "epoch": 1.5489981214777708, + "grad_norm": 0.5100945234298706, + "learning_rate": 2.357445421961551e-05, + "loss": 0.1543, + "step": 9895 + }, + { + "epoch": 1.5491546649968693, + "grad_norm": 1.7341874837875366, + "learning_rate": 2.35663082437276e-05, + "loss": 0.4304, + "step": 9896 + }, + { + "epoch": 1.5493112085159675, + "grad_norm": 0.8087712526321411, + "learning_rate": 2.355816226783969e-05, + "loss": 0.2625, + "step": 9897 + }, + { + "epoch": 1.5494677520350657, + "grad_norm": 1.253807783126831, + "learning_rate": 2.3550016291951776e-05, + "loss": 0.3149, + "step": 9898 + }, + { + "epoch": 1.549624295554164, + "grad_norm": 1.341329574584961, + "learning_rate": 2.3541870316063866e-05, + "loss": 0.1802, + "step": 9899 + }, + { + "epoch": 1.5497808390732624, + "grad_norm": 1.1341520547866821, + "learning_rate": 2.3533724340175957e-05, + "loss": 0.3121, + "step": 9900 + }, + { + "epoch": 1.5499373825923608, + "grad_norm": 0.9776425957679749, + "learning_rate": 2.352557836428804e-05, + "loss": 0.3944, + "step": 9901 + }, + { + "epoch": 1.550093926111459, + "grad_norm": 1.064538598060608, + "learning_rate": 2.351743238840013e-05, + "loss": 0.4395, + "step": 9902 + }, + { + "epoch": 1.5502504696305572, + "grad_norm": 1.1886507272720337, + "learning_rate": 2.3509286412512222e-05, + "loss": 0.4009, + "step": 9903 + }, + { + "epoch": 1.5504070131496555, + "grad_norm": 2.211325168609619, + "learning_rate": 2.350114043662431e-05, + "loss": 0.344, + "step": 9904 + }, + { + "epoch": 1.550563556668754, + "grad_norm": 1.698384404182434, + 
"learning_rate": 2.34929944607364e-05, + "loss": 0.3559, + "step": 9905 + }, + { + "epoch": 1.5507201001878523, + "grad_norm": 1.314292550086975, + "learning_rate": 2.3484848484848487e-05, + "loss": 0.3219, + "step": 9906 + }, + { + "epoch": 1.5508766437069506, + "grad_norm": 2.084984540939331, + "learning_rate": 2.3476702508960574e-05, + "loss": 0.6232, + "step": 9907 + }, + { + "epoch": 1.5510331872260488, + "grad_norm": 1.5475796461105347, + "learning_rate": 2.3468556533072664e-05, + "loss": 0.5172, + "step": 9908 + }, + { + "epoch": 1.551189730745147, + "grad_norm": 2.0848593711853027, + "learning_rate": 2.3460410557184755e-05, + "loss": 0.3752, + "step": 9909 + }, + { + "epoch": 1.5513462742642454, + "grad_norm": 1.3769322633743286, + "learning_rate": 2.345226458129684e-05, + "loss": 0.5307, + "step": 9910 + }, + { + "epoch": 1.5515028177833439, + "grad_norm": 1.66656494140625, + "learning_rate": 2.344411860540893e-05, + "loss": 0.457, + "step": 9911 + }, + { + "epoch": 1.551659361302442, + "grad_norm": 2.7680206298828125, + "learning_rate": 2.3435972629521016e-05, + "loss": 0.8333, + "step": 9912 + }, + { + "epoch": 1.5518159048215403, + "grad_norm": 2.541588544845581, + "learning_rate": 2.3427826653633107e-05, + "loss": 0.5014, + "step": 9913 + }, + { + "epoch": 1.5519724483406387, + "grad_norm": 3.760458469390869, + "learning_rate": 2.3419680677745194e-05, + "loss": 0.7543, + "step": 9914 + }, + { + "epoch": 1.552128991859737, + "grad_norm": 1.6424442529678345, + "learning_rate": 2.341153470185728e-05, + "loss": 0.249, + "step": 9915 + }, + { + "epoch": 1.5522855353788354, + "grad_norm": 3.0099105834960938, + "learning_rate": 2.340338872596937e-05, + "loss": 0.5419, + "step": 9916 + }, + { + "epoch": 1.5524420788979336, + "grad_norm": 1.3342924118041992, + "learning_rate": 2.3395242750081462e-05, + "loss": 0.4525, + "step": 9917 + }, + { + "epoch": 1.5525986224170318, + "grad_norm": 1.2753708362579346, + "learning_rate": 2.338709677419355e-05, + "loss": 
0.2813, + "step": 9918 + }, + { + "epoch": 1.5527551659361303, + "grad_norm": 3.1385338306427, + "learning_rate": 2.3378950798305636e-05, + "loss": 0.8291, + "step": 9919 + }, + { + "epoch": 1.5529117094552287, + "grad_norm": 2.9183027744293213, + "learning_rate": 2.3370804822417727e-05, + "loss": 0.5026, + "step": 9920 + }, + { + "epoch": 1.553068252974327, + "grad_norm": 5.281633377075195, + "learning_rate": 2.3362658846529814e-05, + "loss": 0.9177, + "step": 9921 + }, + { + "epoch": 1.5532247964934252, + "grad_norm": 2.9123730659484863, + "learning_rate": 2.3354512870641905e-05, + "loss": 0.5119, + "step": 9922 + }, + { + "epoch": 1.5533813400125234, + "grad_norm": 1.7772990465164185, + "learning_rate": 2.334636689475399e-05, + "loss": 0.7911, + "step": 9923 + }, + { + "epoch": 1.5535378835316218, + "grad_norm": 2.5788722038269043, + "learning_rate": 2.333822091886608e-05, + "loss": 0.7884, + "step": 9924 + }, + { + "epoch": 1.5536944270507203, + "grad_norm": 4.900419235229492, + "learning_rate": 2.333007494297817e-05, + "loss": 0.8128, + "step": 9925 + }, + { + "epoch": 1.5538509705698185, + "grad_norm": 3.0191988945007324, + "learning_rate": 2.332192896709026e-05, + "loss": 0.5439, + "step": 9926 + }, + { + "epoch": 1.5540075140889167, + "grad_norm": 4.587233543395996, + "learning_rate": 2.3313782991202347e-05, + "loss": 0.6515, + "step": 9927 + }, + { + "epoch": 1.554164057608015, + "grad_norm": 4.583433628082275, + "learning_rate": 2.3305637015314434e-05, + "loss": 0.9249, + "step": 9928 + }, + { + "epoch": 1.5543206011271133, + "grad_norm": 2.629729986190796, + "learning_rate": 2.3297491039426525e-05, + "loss": 0.8435, + "step": 9929 + }, + { + "epoch": 1.5544771446462118, + "grad_norm": 2.931708812713623, + "learning_rate": 2.3289345063538612e-05, + "loss": 0.9297, + "step": 9930 + }, + { + "epoch": 1.55463368816531, + "grad_norm": 3.738051652908325, + "learning_rate": 2.3281199087650702e-05, + "loss": 1.0978, + "step": 9931 + }, + { + "epoch": 
1.5547902316844082, + "grad_norm": 6.855445861816406, + "learning_rate": 2.327305311176279e-05, + "loss": 1.9534, + "step": 9932 + }, + { + "epoch": 1.5549467752035064, + "grad_norm": 1.933883547782898, + "learning_rate": 2.3264907135874877e-05, + "loss": 0.6982, + "step": 9933 + }, + { + "epoch": 1.5551033187226049, + "grad_norm": 3.22774338722229, + "learning_rate": 2.3256761159986967e-05, + "loss": 0.8795, + "step": 9934 + }, + { + "epoch": 1.5552598622417033, + "grad_norm": 2.982266426086426, + "learning_rate": 2.3248615184099058e-05, + "loss": 0.8044, + "step": 9935 + }, + { + "epoch": 1.5554164057608015, + "grad_norm": 3.09027361869812, + "learning_rate": 2.3240469208211145e-05, + "loss": 1.4425, + "step": 9936 + }, + { + "epoch": 1.5555729492798998, + "grad_norm": 1.8416551351547241, + "learning_rate": 2.3232323232323232e-05, + "loss": 0.7483, + "step": 9937 + }, + { + "epoch": 1.555729492798998, + "grad_norm": 2.8720691204071045, + "learning_rate": 2.3224177256435322e-05, + "loss": 0.9641, + "step": 9938 + }, + { + "epoch": 1.5558860363180964, + "grad_norm": 0.4775635004043579, + "learning_rate": 2.321603128054741e-05, + "loss": 0.1728, + "step": 9939 + }, + { + "epoch": 1.5560425798371949, + "grad_norm": 0.500299334526062, + "learning_rate": 2.32078853046595e-05, + "loss": 0.1541, + "step": 9940 + }, + { + "epoch": 1.556199123356293, + "grad_norm": 0.6266286373138428, + "learning_rate": 2.3199739328771587e-05, + "loss": 0.1825, + "step": 9941 + }, + { + "epoch": 1.5563556668753913, + "grad_norm": 0.4394839107990265, + "learning_rate": 2.3191593352883674e-05, + "loss": 0.2066, + "step": 9942 + }, + { + "epoch": 1.5565122103944895, + "grad_norm": 0.710074245929718, + "learning_rate": 2.3183447376995765e-05, + "loss": 0.1995, + "step": 9943 + }, + { + "epoch": 1.556668753913588, + "grad_norm": 0.8196855783462524, + "learning_rate": 2.3175301401107855e-05, + "loss": 0.1747, + "step": 9944 + }, + { + "epoch": 1.5568252974326864, + "grad_norm": 
0.8740781545639038, + "learning_rate": 2.3167155425219943e-05, + "loss": 0.2404, + "step": 9945 + }, + { + "epoch": 1.5569818409517846, + "grad_norm": 0.5745200514793396, + "learning_rate": 2.315900944933203e-05, + "loss": 0.2134, + "step": 9946 + }, + { + "epoch": 1.5571383844708828, + "grad_norm": 1.17318856716156, + "learning_rate": 2.315086347344412e-05, + "loss": 0.2623, + "step": 9947 + }, + { + "epoch": 1.5572949279899813, + "grad_norm": 1.595487117767334, + "learning_rate": 2.3142717497556207e-05, + "loss": 0.446, + "step": 9948 + }, + { + "epoch": 1.5574514715090795, + "grad_norm": 0.9424474835395813, + "learning_rate": 2.3134571521668298e-05, + "loss": 0.3045, + "step": 9949 + }, + { + "epoch": 1.557608015028178, + "grad_norm": 1.005547285079956, + "learning_rate": 2.3126425545780385e-05, + "loss": 0.243, + "step": 9950 + }, + { + "epoch": 1.5577645585472761, + "grad_norm": 0.7574334144592285, + "learning_rate": 2.3118279569892472e-05, + "loss": 0.2484, + "step": 9951 + }, + { + "epoch": 1.5579211020663744, + "grad_norm": 1.0445295572280884, + "learning_rate": 2.3110133594004563e-05, + "loss": 0.4004, + "step": 9952 + }, + { + "epoch": 1.5580776455854728, + "grad_norm": 0.7897704243659973, + "learning_rate": 2.3101987618116653e-05, + "loss": 0.1653, + "step": 9953 + }, + { + "epoch": 1.5582341891045712, + "grad_norm": 1.814918875694275, + "learning_rate": 2.309384164222874e-05, + "loss": 0.4157, + "step": 9954 + }, + { + "epoch": 1.5583907326236695, + "grad_norm": 1.1712642908096313, + "learning_rate": 2.3085695666340827e-05, + "loss": 0.4073, + "step": 9955 + }, + { + "epoch": 1.5585472761427677, + "grad_norm": 1.9007118940353394, + "learning_rate": 2.3077549690452918e-05, + "loss": 0.7152, + "step": 9956 + }, + { + "epoch": 1.5587038196618659, + "grad_norm": 1.951905369758606, + "learning_rate": 2.3069403714565005e-05, + "loss": 0.3342, + "step": 9957 + }, + { + "epoch": 1.5588603631809643, + "grad_norm": 1.1269463300704956, + "learning_rate": 
2.3061257738677096e-05, + "loss": 0.2669, + "step": 9958 + }, + { + "epoch": 1.5590169067000628, + "grad_norm": 1.6812591552734375, + "learning_rate": 2.3053111762789183e-05, + "loss": 0.5507, + "step": 9959 + }, + { + "epoch": 1.559173450219161, + "grad_norm": 1.7768874168395996, + "learning_rate": 2.304496578690127e-05, + "loss": 0.3836, + "step": 9960 + }, + { + "epoch": 1.5593299937382592, + "grad_norm": 2.9446301460266113, + "learning_rate": 2.303681981101336e-05, + "loss": 0.5997, + "step": 9961 + }, + { + "epoch": 1.5594865372573574, + "grad_norm": 1.0136429071426392, + "learning_rate": 2.302867383512545e-05, + "loss": 0.3024, + "step": 9962 + }, + { + "epoch": 1.5596430807764559, + "grad_norm": 2.119811773300171, + "learning_rate": 2.3020527859237538e-05, + "loss": 0.7207, + "step": 9963 + }, + { + "epoch": 1.5597996242955543, + "grad_norm": 4.464352607727051, + "learning_rate": 2.3012381883349625e-05, + "loss": 0.7277, + "step": 9964 + }, + { + "epoch": 1.5599561678146525, + "grad_norm": 1.8272819519042969, + "learning_rate": 2.3004235907461716e-05, + "loss": 0.6101, + "step": 9965 + }, + { + "epoch": 1.5601127113337507, + "grad_norm": 1.534544587135315, + "learning_rate": 2.2996089931573803e-05, + "loss": 0.3928, + "step": 9966 + }, + { + "epoch": 1.560269254852849, + "grad_norm": 3.70503568649292, + "learning_rate": 2.2987943955685893e-05, + "loss": 0.8888, + "step": 9967 + }, + { + "epoch": 1.5604257983719474, + "grad_norm": 1.4271700382232666, + "learning_rate": 2.297979797979798e-05, + "loss": 0.5989, + "step": 9968 + }, + { + "epoch": 1.5605823418910458, + "grad_norm": 2.380782127380371, + "learning_rate": 2.2971652003910068e-05, + "loss": 0.4789, + "step": 9969 + }, + { + "epoch": 1.560738885410144, + "grad_norm": 2.907968282699585, + "learning_rate": 2.2963506028022158e-05, + "loss": 0.632, + "step": 9970 + }, + { + "epoch": 1.5608954289292423, + "grad_norm": 2.2492058277130127, + "learning_rate": 2.295536005213425e-05, + "loss": 0.587, + "step": 
9971 + }, + { + "epoch": 1.5610519724483405, + "grad_norm": 2.3665571212768555, + "learning_rate": 2.2947214076246336e-05, + "loss": 0.7958, + "step": 9972 + }, + { + "epoch": 1.561208515967439, + "grad_norm": 2.5934932231903076, + "learning_rate": 2.2939068100358423e-05, + "loss": 0.9618, + "step": 9973 + }, + { + "epoch": 1.5613650594865374, + "grad_norm": 3.9065611362457275, + "learning_rate": 2.2930922124470514e-05, + "loss": 1.2115, + "step": 9974 + }, + { + "epoch": 1.5615216030056356, + "grad_norm": 4.305639743804932, + "learning_rate": 2.29227761485826e-05, + "loss": 1.2517, + "step": 9975 + }, + { + "epoch": 1.5616781465247338, + "grad_norm": 4.183220386505127, + "learning_rate": 2.291463017269469e-05, + "loss": 0.5628, + "step": 9976 + }, + { + "epoch": 1.561834690043832, + "grad_norm": 3.8796894550323486, + "learning_rate": 2.290648419680678e-05, + "loss": 0.8958, + "step": 9977 + }, + { + "epoch": 1.5619912335629305, + "grad_norm": 4.152509689331055, + "learning_rate": 2.2898338220918866e-05, + "loss": 0.7464, + "step": 9978 + }, + { + "epoch": 1.562147777082029, + "grad_norm": 4.288389205932617, + "learning_rate": 2.2890192245030956e-05, + "loss": 1.1891, + "step": 9979 + }, + { + "epoch": 1.5623043206011271, + "grad_norm": 3.1033058166503906, + "learning_rate": 2.2882046269143047e-05, + "loss": 1.0786, + "step": 9980 + }, + { + "epoch": 1.5624608641202253, + "grad_norm": 3.960963726043701, + "learning_rate": 2.2873900293255134e-05, + "loss": 0.6723, + "step": 9981 + }, + { + "epoch": 1.5626174076393238, + "grad_norm": 2.1994376182556152, + "learning_rate": 2.286575431736722e-05, + "loss": 0.7206, + "step": 9982 + }, + { + "epoch": 1.562773951158422, + "grad_norm": 3.3920867443084717, + "learning_rate": 2.285760834147931e-05, + "loss": 1.2384, + "step": 9983 + }, + { + "epoch": 1.5629304946775204, + "grad_norm": 2.4802215099334717, + "learning_rate": 2.28494623655914e-05, + "loss": 1.0801, + "step": 9984 + }, + { + "epoch": 1.5630870381966186, + 
"grad_norm": 1.5487043857574463, + "learning_rate": 2.284131638970349e-05, + "loss": 0.3751, + "step": 9985 + }, + { + "epoch": 1.5632435817157169, + "grad_norm": 2.2770631313323975, + "learning_rate": 2.2833170413815576e-05, + "loss": 0.5298, + "step": 9986 + }, + { + "epoch": 1.5634001252348153, + "grad_norm": 2.1810410022735596, + "learning_rate": 2.2825024437927663e-05, + "loss": 0.2882, + "step": 9987 + }, + { + "epoch": 1.5635566687539137, + "grad_norm": 4.2969584465026855, + "learning_rate": 2.2816878462039754e-05, + "loss": 0.7196, + "step": 9988 + }, + { + "epoch": 1.563713212273012, + "grad_norm": 0.4581182897090912, + "learning_rate": 2.2808732486151844e-05, + "loss": 0.1582, + "step": 9989 + }, + { + "epoch": 1.5638697557921102, + "grad_norm": 0.48975202441215515, + "learning_rate": 2.280058651026393e-05, + "loss": 0.1114, + "step": 9990 + }, + { + "epoch": 1.5640262993112084, + "grad_norm": 0.5889154076576233, + "learning_rate": 2.279244053437602e-05, + "loss": 0.2351, + "step": 9991 + }, + { + "epoch": 1.5641828428303068, + "grad_norm": 0.8353304862976074, + "learning_rate": 2.278429455848811e-05, + "loss": 0.2074, + "step": 9992 + }, + { + "epoch": 1.5643393863494053, + "grad_norm": 2.04494571685791, + "learning_rate": 2.2776148582600196e-05, + "loss": 0.3303, + "step": 9993 + }, + { + "epoch": 1.5644959298685035, + "grad_norm": 0.6768299341201782, + "learning_rate": 2.2768002606712287e-05, + "loss": 0.302, + "step": 9994 + }, + { + "epoch": 1.5646524733876017, + "grad_norm": 1.0507726669311523, + "learning_rate": 2.2759856630824374e-05, + "loss": 0.3517, + "step": 9995 + }, + { + "epoch": 1.5648090169067, + "grad_norm": 0.5583631992340088, + "learning_rate": 2.275171065493646e-05, + "loss": 0.223, + "step": 9996 + }, + { + "epoch": 1.5649655604257984, + "grad_norm": 1.0685746669769287, + "learning_rate": 2.274356467904855e-05, + "loss": 0.179, + "step": 9997 + }, + { + "epoch": 1.5651221039448968, + "grad_norm": 0.8346176147460938, + 
"learning_rate": 2.2735418703160642e-05, + "loss": 0.3243, + "step": 9998 + }, + { + "epoch": 1.565278647463995, + "grad_norm": 0.9658491015434265, + "learning_rate": 2.272727272727273e-05, + "loss": 0.1359, + "step": 9999 + }, + { + "epoch": 1.5654351909830932, + "grad_norm": 0.5991528034210205, + "learning_rate": 2.2719126751384816e-05, + "loss": 0.1388, + "step": 10000 + }, + { + "epoch": 1.5654351909830932, + "eval_loss": 0.4984326660633087, + "eval_runtime": 204.9732, + "eval_samples_per_second": 60.413, + "eval_steps_per_second": 3.776, + "eval_wer": 0.30608623584163885, + "step": 10000 + }, + { + "epoch": 1.5655917345021915, + "grad_norm": 1.9929157495498657, + "learning_rate": 2.2710980775496907e-05, + "loss": 0.3923, + "step": 10001 + }, + { + "epoch": 1.56574827802129, + "grad_norm": 1.7138148546218872, + "learning_rate": 2.2702834799608994e-05, + "loss": 0.1527, + "step": 10002 + }, + { + "epoch": 1.5659048215403883, + "grad_norm": 0.5842229127883911, + "learning_rate": 2.2694688823721085e-05, + "loss": 0.2171, + "step": 10003 + }, + { + "epoch": 1.5660613650594866, + "grad_norm": 1.1905561685562134, + "learning_rate": 2.2686542847833172e-05, + "loss": 0.2546, + "step": 10004 + }, + { + "epoch": 1.5662179085785848, + "grad_norm": 1.2873033285140991, + "learning_rate": 2.267839687194526e-05, + "loss": 0.2705, + "step": 10005 + }, + { + "epoch": 1.566374452097683, + "grad_norm": 2.281245708465576, + "learning_rate": 2.267025089605735e-05, + "loss": 0.5342, + "step": 10006 + }, + { + "epoch": 1.5665309956167814, + "grad_norm": 2.3149325847625732, + "learning_rate": 2.266210492016944e-05, + "loss": 0.3156, + "step": 10007 + }, + { + "epoch": 1.5666875391358799, + "grad_norm": 1.7536264657974243, + "learning_rate": 2.2653958944281524e-05, + "loss": 0.3828, + "step": 10008 + }, + { + "epoch": 1.566844082654978, + "grad_norm": 2.327113628387451, + "learning_rate": 2.2645812968393614e-05, + "loss": 0.4127, + "step": 10009 + }, + { + "epoch": 1.5670006261740763, 
+ "grad_norm": 1.6266465187072754, + "learning_rate": 2.2637666992505705e-05, + "loss": 0.3627, + "step": 10010 + }, + { + "epoch": 1.5671571696931748, + "grad_norm": 1.3536170721054077, + "learning_rate": 2.2629521016617792e-05, + "loss": 0.2804, + "step": 10011 + }, + { + "epoch": 1.567313713212273, + "grad_norm": 1.5961014032363892, + "learning_rate": 2.2621375040729882e-05, + "loss": 0.3354, + "step": 10012 + }, + { + "epoch": 1.5674702567313714, + "grad_norm": 4.83693265914917, + "learning_rate": 2.261322906484197e-05, + "loss": 0.6471, + "step": 10013 + }, + { + "epoch": 1.5676268002504696, + "grad_norm": 2.1743533611297607, + "learning_rate": 2.2605083088954057e-05, + "loss": 0.5988, + "step": 10014 + }, + { + "epoch": 1.5677833437695678, + "grad_norm": 1.969578504562378, + "learning_rate": 2.2596937113066147e-05, + "loss": 0.4342, + "step": 10015 + }, + { + "epoch": 1.5679398872886663, + "grad_norm": 2.1052892208099365, + "learning_rate": 2.2588791137178238e-05, + "loss": 0.5659, + "step": 10016 + }, + { + "epoch": 1.5680964308077645, + "grad_norm": 1.9152888059616089, + "learning_rate": 2.258064516129032e-05, + "loss": 0.637, + "step": 10017 + }, + { + "epoch": 1.568252974326863, + "grad_norm": 2.5344038009643555, + "learning_rate": 2.2572499185402412e-05, + "loss": 0.6897, + "step": 10018 + }, + { + "epoch": 1.5684095178459612, + "grad_norm": 3.619785785675049, + "learning_rate": 2.2564353209514503e-05, + "loss": 0.9818, + "step": 10019 + }, + { + "epoch": 1.5685660613650594, + "grad_norm": 2.203646659851074, + "learning_rate": 2.255620723362659e-05, + "loss": 0.5698, + "step": 10020 + }, + { + "epoch": 1.5687226048841578, + "grad_norm": 5.2918782234191895, + "learning_rate": 2.254806125773868e-05, + "loss": 1.1662, + "step": 10021 + }, + { + "epoch": 1.5688791484032563, + "grad_norm": 3.263131856918335, + "learning_rate": 2.2539915281850767e-05, + "loss": 0.5979, + "step": 10022 + }, + { + "epoch": 1.5690356919223545, + "grad_norm": 2.6613948345184326, + 
"learning_rate": 2.2531769305962854e-05, + "loss": 0.8394, + "step": 10023 + }, + { + "epoch": 1.5691922354414527, + "grad_norm": 1.3187425136566162, + "learning_rate": 2.2523623330074945e-05, + "loss": 0.4361, + "step": 10024 + }, + { + "epoch": 1.569348778960551, + "grad_norm": 4.988018035888672, + "learning_rate": 2.2515477354187036e-05, + "loss": 0.8902, + "step": 10025 + }, + { + "epoch": 1.5695053224796494, + "grad_norm": 3.115518093109131, + "learning_rate": 2.250733137829912e-05, + "loss": 0.9444, + "step": 10026 + }, + { + "epoch": 1.5696618659987478, + "grad_norm": 4.101736068725586, + "learning_rate": 2.249918540241121e-05, + "loss": 0.5797, + "step": 10027 + }, + { + "epoch": 1.569818409517846, + "grad_norm": 2.410694122314453, + "learning_rate": 2.24910394265233e-05, + "loss": 0.8799, + "step": 10028 + }, + { + "epoch": 1.5699749530369442, + "grad_norm": 5.649385452270508, + "learning_rate": 2.2482893450635387e-05, + "loss": 1.345, + "step": 10029 + }, + { + "epoch": 1.5701314965560424, + "grad_norm": 2.7277567386627197, + "learning_rate": 2.2474747474747475e-05, + "loss": 0.944, + "step": 10030 + }, + { + "epoch": 1.570288040075141, + "grad_norm": 7.223354339599609, + "learning_rate": 2.2466601498859565e-05, + "loss": 1.2866, + "step": 10031 + }, + { + "epoch": 1.5704445835942393, + "grad_norm": 5.387571334838867, + "learning_rate": 2.2458455522971652e-05, + "loss": 0.7829, + "step": 10032 + }, + { + "epoch": 1.5706011271133375, + "grad_norm": 5.861685276031494, + "learning_rate": 2.2450309547083743e-05, + "loss": 1.5174, + "step": 10033 + }, + { + "epoch": 1.5707576706324358, + "grad_norm": 2.27160382270813, + "learning_rate": 2.2442163571195833e-05, + "loss": 1.0956, + "step": 10034 + }, + { + "epoch": 1.570914214151534, + "grad_norm": 3.669724464416504, + "learning_rate": 2.2434017595307917e-05, + "loss": 1.1828, + "step": 10035 + }, + { + "epoch": 1.5710707576706324, + "grad_norm": 3.9557442665100098, + "learning_rate": 2.2425871619420008e-05, + 
"loss": 0.6727, + "step": 10036 + }, + { + "epoch": 1.5712273011897309, + "grad_norm": 1.863874077796936, + "learning_rate": 2.2417725643532098e-05, + "loss": 0.6425, + "step": 10037 + }, + { + "epoch": 1.571383844708829, + "grad_norm": 4.285005569458008, + "learning_rate": 2.2409579667644185e-05, + "loss": 1.2232, + "step": 10038 + }, + { + "epoch": 1.5715403882279273, + "grad_norm": 0.44174695014953613, + "learning_rate": 2.2401433691756272e-05, + "loss": 0.2003, + "step": 10039 + }, + { + "epoch": 1.5716969317470255, + "grad_norm": 0.559836208820343, + "learning_rate": 2.239328771586836e-05, + "loss": 0.1547, + "step": 10040 + }, + { + "epoch": 1.571853475266124, + "grad_norm": 0.4276370406150818, + "learning_rate": 2.238514173998045e-05, + "loss": 0.1508, + "step": 10041 + }, + { + "epoch": 1.5720100187852224, + "grad_norm": 0.44464749097824097, + "learning_rate": 2.237699576409254e-05, + "loss": 0.1322, + "step": 10042 + }, + { + "epoch": 1.5721665623043206, + "grad_norm": 0.45291396975517273, + "learning_rate": 2.2368849788204628e-05, + "loss": 0.1744, + "step": 10043 + }, + { + "epoch": 1.5723231058234188, + "grad_norm": 0.9347220659255981, + "learning_rate": 2.2360703812316715e-05, + "loss": 0.201, + "step": 10044 + }, + { + "epoch": 1.5724796493425173, + "grad_norm": 0.5801993012428284, + "learning_rate": 2.2352557836428805e-05, + "loss": 0.1923, + "step": 10045 + }, + { + "epoch": 1.5726361928616155, + "grad_norm": 0.5372093915939331, + "learning_rate": 2.2344411860540893e-05, + "loss": 0.2317, + "step": 10046 + }, + { + "epoch": 1.572792736380714, + "grad_norm": 0.5902731418609619, + "learning_rate": 2.2336265884652983e-05, + "loss": 0.2137, + "step": 10047 + }, + { + "epoch": 1.5729492798998121, + "grad_norm": 0.7949742674827576, + "learning_rate": 2.232811990876507e-05, + "loss": 0.2401, + "step": 10048 + }, + { + "epoch": 1.5731058234189104, + "grad_norm": 0.7495807409286499, + "learning_rate": 2.2319973932877157e-05, + "loss": 0.1828, + "step": 10049 
+ }, + { + "epoch": 1.5732623669380088, + "grad_norm": 1.0367367267608643, + "learning_rate": 2.2311827956989248e-05, + "loss": 0.2725, + "step": 10050 + }, + { + "epoch": 1.573418910457107, + "grad_norm": 1.0104857683181763, + "learning_rate": 2.230368198110134e-05, + "loss": 0.1844, + "step": 10051 + }, + { + "epoch": 1.5735754539762055, + "grad_norm": 0.8591790199279785, + "learning_rate": 2.2295536005213425e-05, + "loss": 0.2444, + "step": 10052 + }, + { + "epoch": 1.5737319974953037, + "grad_norm": 1.5208595991134644, + "learning_rate": 2.2287390029325513e-05, + "loss": 0.3393, + "step": 10053 + }, + { + "epoch": 1.573888541014402, + "grad_norm": 0.8836588263511658, + "learning_rate": 2.2279244053437603e-05, + "loss": 0.1931, + "step": 10054 + }, + { + "epoch": 1.5740450845335003, + "grad_norm": 1.8104723691940308, + "learning_rate": 2.227109807754969e-05, + "loss": 0.3256, + "step": 10055 + }, + { + "epoch": 1.5742016280525988, + "grad_norm": 1.891265630722046, + "learning_rate": 2.226295210166178e-05, + "loss": 0.446, + "step": 10056 + }, + { + "epoch": 1.574358171571697, + "grad_norm": 2.484590530395508, + "learning_rate": 2.2254806125773868e-05, + "loss": 0.453, + "step": 10057 + }, + { + "epoch": 1.5745147150907952, + "grad_norm": 1.9067918062210083, + "learning_rate": 2.2246660149885955e-05, + "loss": 0.4499, + "step": 10058 + }, + { + "epoch": 1.5746712586098934, + "grad_norm": 1.6850560903549194, + "learning_rate": 2.2238514173998046e-05, + "loss": 0.5447, + "step": 10059 + }, + { + "epoch": 1.5748278021289919, + "grad_norm": 1.6162102222442627, + "learning_rate": 2.2230368198110136e-05, + "loss": 0.3229, + "step": 10060 + }, + { + "epoch": 1.5749843456480903, + "grad_norm": 3.8435311317443848, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.37, + "step": 10061 + }, + { + "epoch": 1.5751408891671885, + "grad_norm": 1.3372102975845337, + "learning_rate": 2.221407624633431e-05, + "loss": 0.5637, + "step": 10062 + }, + { + "epoch": 
1.5752974326862867, + "grad_norm": 2.134716749191284, + "learning_rate": 2.22059302704464e-05, + "loss": 0.4717, + "step": 10063 + }, + { + "epoch": 1.575453976205385, + "grad_norm": 2.0496010780334473, + "learning_rate": 2.2197784294558488e-05, + "loss": 0.6656, + "step": 10064 + }, + { + "epoch": 1.5756105197244834, + "grad_norm": 2.2132251262664795, + "learning_rate": 2.218963831867058e-05, + "loss": 0.6368, + "step": 10065 + }, + { + "epoch": 1.5757670632435818, + "grad_norm": 1.77400541305542, + "learning_rate": 2.2181492342782666e-05, + "loss": 0.3351, + "step": 10066 + }, + { + "epoch": 1.57592360676268, + "grad_norm": 2.1106982231140137, + "learning_rate": 2.2173346366894753e-05, + "loss": 0.7342, + "step": 10067 + }, + { + "epoch": 1.5760801502817783, + "grad_norm": 2.2163093090057373, + "learning_rate": 2.2165200391006843e-05, + "loss": 0.6934, + "step": 10068 + }, + { + "epoch": 1.5762366938008765, + "grad_norm": 2.294734239578247, + "learning_rate": 2.2157054415118934e-05, + "loss": 0.5434, + "step": 10069 + }, + { + "epoch": 1.576393237319975, + "grad_norm": 2.6277103424072266, + "learning_rate": 2.214890843923102e-05, + "loss": 0.4759, + "step": 10070 + }, + { + "epoch": 1.5765497808390734, + "grad_norm": 1.7746760845184326, + "learning_rate": 2.2140762463343108e-05, + "loss": 0.3522, + "step": 10071 + }, + { + "epoch": 1.5767063243581716, + "grad_norm": 2.6829135417938232, + "learning_rate": 2.21326164874552e-05, + "loss": 0.6594, + "step": 10072 + }, + { + "epoch": 1.5768628678772698, + "grad_norm": 2.7537667751312256, + "learning_rate": 2.2124470511567286e-05, + "loss": 0.5582, + "step": 10073 + }, + { + "epoch": 1.577019411396368, + "grad_norm": 3.1607677936553955, + "learning_rate": 2.2116324535679376e-05, + "loss": 0.9852, + "step": 10074 + }, + { + "epoch": 1.5771759549154665, + "grad_norm": 4.130873203277588, + "learning_rate": 2.2108178559791464e-05, + "loss": 0.7714, + "step": 10075 + }, + { + "epoch": 1.577332498434565, + "grad_norm": 
5.207320690155029, + "learning_rate": 2.210003258390355e-05, + "loss": 1.1225, + "step": 10076 + }, + { + "epoch": 1.5774890419536631, + "grad_norm": 3.0928163528442383, + "learning_rate": 2.209188660801564e-05, + "loss": 1.2172, + "step": 10077 + }, + { + "epoch": 1.5776455854727613, + "grad_norm": 3.0915355682373047, + "learning_rate": 2.2083740632127732e-05, + "loss": 1.3678, + "step": 10078 + }, + { + "epoch": 1.5778021289918598, + "grad_norm": 5.351958274841309, + "learning_rate": 2.207559465623982e-05, + "loss": 0.7305, + "step": 10079 + }, + { + "epoch": 1.577958672510958, + "grad_norm": 3.046412706375122, + "learning_rate": 2.2067448680351906e-05, + "loss": 0.9141, + "step": 10080 + }, + { + "epoch": 1.5781152160300564, + "grad_norm": 4.52030611038208, + "learning_rate": 2.2059302704463997e-05, + "loss": 1.307, + "step": 10081 + }, + { + "epoch": 1.5782717595491547, + "grad_norm": 2.8154048919677734, + "learning_rate": 2.2051156728576084e-05, + "loss": 0.9505, + "step": 10082 + }, + { + "epoch": 1.5784283030682529, + "grad_norm": 2.6124799251556396, + "learning_rate": 2.2043010752688174e-05, + "loss": 0.4995, + "step": 10083 + }, + { + "epoch": 1.5785848465873513, + "grad_norm": 6.238215446472168, + "learning_rate": 2.203486477680026e-05, + "loss": 0.2749, + "step": 10084 + }, + { + "epoch": 1.5787413901064495, + "grad_norm": 7.219119071960449, + "learning_rate": 2.202671880091235e-05, + "loss": 0.7287, + "step": 10085 + }, + { + "epoch": 1.578897933625548, + "grad_norm": 2.70338773727417, + "learning_rate": 2.201857282502444e-05, + "loss": 0.6705, + "step": 10086 + }, + { + "epoch": 1.5790544771446462, + "grad_norm": 5.501718997955322, + "learning_rate": 2.201042684913653e-05, + "loss": 1.5146, + "step": 10087 + }, + { + "epoch": 1.5792110206637444, + "grad_norm": 6.763995170593262, + "learning_rate": 2.2002280873248617e-05, + "loss": 1.6567, + "step": 10088 + }, + { + "epoch": 1.5793675641828429, + "grad_norm": 0.41342025995254517, + "learning_rate": 
2.1994134897360704e-05, + "loss": 0.2049, + "step": 10089 + }, + { + "epoch": 1.5795241077019413, + "grad_norm": 0.5601213574409485, + "learning_rate": 2.1985988921472794e-05, + "loss": 0.3182, + "step": 10090 + }, + { + "epoch": 1.5796806512210395, + "grad_norm": 1.091718316078186, + "learning_rate": 2.197784294558488e-05, + "loss": 0.3462, + "step": 10091 + }, + { + "epoch": 1.5798371947401377, + "grad_norm": 0.5938206315040588, + "learning_rate": 2.1969696969696972e-05, + "loss": 0.1744, + "step": 10092 + }, + { + "epoch": 1.579993738259236, + "grad_norm": 0.9708141684532166, + "learning_rate": 2.196155099380906e-05, + "loss": 0.2129, + "step": 10093 + }, + { + "epoch": 1.5801502817783344, + "grad_norm": 0.9319791793823242, + "learning_rate": 2.1953405017921146e-05, + "loss": 0.2744, + "step": 10094 + }, + { + "epoch": 1.5803068252974328, + "grad_norm": 0.6032662987709045, + "learning_rate": 2.1945259042033237e-05, + "loss": 0.2375, + "step": 10095 + }, + { + "epoch": 1.580463368816531, + "grad_norm": 1.0670620203018188, + "learning_rate": 2.1937113066145327e-05, + "loss": 0.2133, + "step": 10096 + }, + { + "epoch": 1.5806199123356293, + "grad_norm": 0.6484942436218262, + "learning_rate": 2.1928967090257414e-05, + "loss": 0.1823, + "step": 10097 + }, + { + "epoch": 1.5807764558547275, + "grad_norm": 2.2411065101623535, + "learning_rate": 2.19208211143695e-05, + "loss": 0.3557, + "step": 10098 + }, + { + "epoch": 1.580932999373826, + "grad_norm": 0.8422627449035645, + "learning_rate": 2.1912675138481592e-05, + "loss": 0.249, + "step": 10099 + }, + { + "epoch": 1.5810895428929244, + "grad_norm": 1.2941734790802002, + "learning_rate": 2.190452916259368e-05, + "loss": 0.2958, + "step": 10100 + }, + { + "epoch": 1.5812460864120226, + "grad_norm": 0.574828028678894, + "learning_rate": 2.189638318670577e-05, + "loss": 0.1974, + "step": 10101 + }, + { + "epoch": 1.5814026299311208, + "grad_norm": 2.380363702774048, + "learning_rate": 2.1888237210817857e-05, + "loss": 
0.4013, + "step": 10102 + }, + { + "epoch": 1.581559173450219, + "grad_norm": 2.2322425842285156, + "learning_rate": 2.1880091234929944e-05, + "loss": 0.3664, + "step": 10103 + }, + { + "epoch": 1.5817157169693175, + "grad_norm": 1.3966444730758667, + "learning_rate": 2.1871945259042035e-05, + "loss": 0.3995, + "step": 10104 + }, + { + "epoch": 1.581872260488416, + "grad_norm": 2.103789806365967, + "learning_rate": 2.1863799283154125e-05, + "loss": 0.3957, + "step": 10105 + }, + { + "epoch": 1.5820288040075141, + "grad_norm": 1.2662473917007446, + "learning_rate": 2.1855653307266212e-05, + "loss": 0.5377, + "step": 10106 + }, + { + "epoch": 1.5821853475266123, + "grad_norm": 1.6789207458496094, + "learning_rate": 2.18475073313783e-05, + "loss": 0.3512, + "step": 10107 + }, + { + "epoch": 1.5823418910457105, + "grad_norm": 1.1164813041687012, + "learning_rate": 2.183936135549039e-05, + "loss": 0.4159, + "step": 10108 + }, + { + "epoch": 1.582498434564809, + "grad_norm": 1.40835440158844, + "learning_rate": 2.1831215379602477e-05, + "loss": 0.2769, + "step": 10109 + }, + { + "epoch": 1.5826549780839074, + "grad_norm": 6.686927318572998, + "learning_rate": 2.1823069403714568e-05, + "loss": 0.7434, + "step": 10110 + }, + { + "epoch": 1.5828115216030056, + "grad_norm": 2.2592833042144775, + "learning_rate": 2.1814923427826655e-05, + "loss": 0.695, + "step": 10111 + }, + { + "epoch": 1.5829680651221039, + "grad_norm": 1.5674444437026978, + "learning_rate": 2.1806777451938742e-05, + "loss": 0.5371, + "step": 10112 + }, + { + "epoch": 1.5831246086412023, + "grad_norm": 3.04132342338562, + "learning_rate": 2.1798631476050832e-05, + "loss": 0.4851, + "step": 10113 + }, + { + "epoch": 1.5832811521603005, + "grad_norm": 1.4575765132904053, + "learning_rate": 2.1790485500162923e-05, + "loss": 0.3947, + "step": 10114 + }, + { + "epoch": 1.583437695679399, + "grad_norm": 1.4195470809936523, + "learning_rate": 2.178233952427501e-05, + "loss": 0.284, + "step": 10115 + }, + { + 
"epoch": 1.5835942391984972, + "grad_norm": 1.9054851531982422, + "learning_rate": 2.1774193548387097e-05, + "loss": 0.5331, + "step": 10116 + }, + { + "epoch": 1.5837507827175954, + "grad_norm": 2.1673901081085205, + "learning_rate": 2.1766047572499188e-05, + "loss": 0.5396, + "step": 10117 + }, + { + "epoch": 1.5839073262366938, + "grad_norm": 2.834182024002075, + "learning_rate": 2.1757901596611275e-05, + "loss": 0.6717, + "step": 10118 + }, + { + "epoch": 1.5840638697557923, + "grad_norm": 1.7353304624557495, + "learning_rate": 2.1749755620723365e-05, + "loss": 0.4847, + "step": 10119 + }, + { + "epoch": 1.5842204132748905, + "grad_norm": 5.660650253295898, + "learning_rate": 2.1741609644835452e-05, + "loss": 0.8323, + "step": 10120 + }, + { + "epoch": 1.5843769567939887, + "grad_norm": 2.570338010787964, + "learning_rate": 2.173346366894754e-05, + "loss": 1.2162, + "step": 10121 + }, + { + "epoch": 1.584533500313087, + "grad_norm": 2.2253403663635254, + "learning_rate": 2.172531769305963e-05, + "loss": 0.4084, + "step": 10122 + }, + { + "epoch": 1.5846900438321854, + "grad_norm": 6.164251327514648, + "learning_rate": 2.171717171717172e-05, + "loss": 0.559, + "step": 10123 + }, + { + "epoch": 1.5848465873512838, + "grad_norm": 2.1731927394866943, + "learning_rate": 2.1709025741283804e-05, + "loss": 0.8736, + "step": 10124 + }, + { + "epoch": 1.585003130870382, + "grad_norm": 2.953941583633423, + "learning_rate": 2.1700879765395895e-05, + "loss": 0.2948, + "step": 10125 + }, + { + "epoch": 1.5851596743894802, + "grad_norm": 5.1901655197143555, + "learning_rate": 2.1692733789507985e-05, + "loss": 0.7972, + "step": 10126 + }, + { + "epoch": 1.5853162179085785, + "grad_norm": 7.556105136871338, + "learning_rate": 2.1684587813620073e-05, + "loss": 0.9987, + "step": 10127 + }, + { + "epoch": 1.585472761427677, + "grad_norm": 3.4633874893188477, + "learning_rate": 2.1676441837732163e-05, + "loss": 1.189, + "step": 10128 + }, + { + "epoch": 1.5856293049467753, + 
"grad_norm": 4.648806571960449, + "learning_rate": 2.166829586184425e-05, + "loss": 0.879, + "step": 10129 + }, + { + "epoch": 1.5857858484658736, + "grad_norm": 3.363091468811035, + "learning_rate": 2.1660149885956337e-05, + "loss": 1.4448, + "step": 10130 + }, + { + "epoch": 1.5859423919849718, + "grad_norm": 2.729926824569702, + "learning_rate": 2.1652003910068428e-05, + "loss": 1.0048, + "step": 10131 + }, + { + "epoch": 1.58609893550407, + "grad_norm": 7.208045482635498, + "learning_rate": 2.164385793418052e-05, + "loss": 1.0423, + "step": 10132 + }, + { + "epoch": 1.5862554790231684, + "grad_norm": 2.263331174850464, + "learning_rate": 2.1635711958292602e-05, + "loss": 1.1105, + "step": 10133 + }, + { + "epoch": 1.5864120225422669, + "grad_norm": 3.3908658027648926, + "learning_rate": 2.1627565982404693e-05, + "loss": 0.9955, + "step": 10134 + }, + { + "epoch": 1.586568566061365, + "grad_norm": 1.484663963317871, + "learning_rate": 2.1619420006516783e-05, + "loss": 0.4405, + "step": 10135 + }, + { + "epoch": 1.5867251095804633, + "grad_norm": 0.645127534866333, + "learning_rate": 2.161127403062887e-05, + "loss": 0.0976, + "step": 10136 + }, + { + "epoch": 1.5868816530995615, + "grad_norm": 2.235924243927002, + "learning_rate": 2.160312805474096e-05, + "loss": 0.641, + "step": 10137 + }, + { + "epoch": 1.58703819661866, + "grad_norm": 2.4064838886260986, + "learning_rate": 2.1594982078853048e-05, + "loss": 0.7212, + "step": 10138 + }, + { + "epoch": 1.5871947401377584, + "grad_norm": 0.5814518928527832, + "learning_rate": 2.1586836102965135e-05, + "loss": 0.25, + "step": 10139 + }, + { + "epoch": 1.5873512836568566, + "grad_norm": 0.5902174115180969, + "learning_rate": 2.1578690127077226e-05, + "loss": 0.2175, + "step": 10140 + }, + { + "epoch": 1.5875078271759548, + "grad_norm": 0.654135525226593, + "learning_rate": 2.1570544151189316e-05, + "loss": 0.2399, + "step": 10141 + }, + { + "epoch": 1.587664370695053, + "grad_norm": 0.8487042188644409, + 
"learning_rate": 2.15623981753014e-05, + "loss": 0.278, + "step": 10142 + }, + { + "epoch": 1.5878209142141515, + "grad_norm": 0.6521177291870117, + "learning_rate": 2.155425219941349e-05, + "loss": 0.276, + "step": 10143 + }, + { + "epoch": 1.58797745773325, + "grad_norm": 0.5911752581596375, + "learning_rate": 2.154610622352558e-05, + "loss": 0.1868, + "step": 10144 + }, + { + "epoch": 1.5881340012523482, + "grad_norm": 0.9250339269638062, + "learning_rate": 2.1537960247637668e-05, + "loss": 0.3246, + "step": 10145 + }, + { + "epoch": 1.5882905447714464, + "grad_norm": 1.0638575553894043, + "learning_rate": 2.1529814271749755e-05, + "loss": 0.374, + "step": 10146 + }, + { + "epoch": 1.5884470882905448, + "grad_norm": 0.9114158153533936, + "learning_rate": 2.1521668295861846e-05, + "loss": 0.3527, + "step": 10147 + }, + { + "epoch": 1.588603631809643, + "grad_norm": 1.3660855293273926, + "learning_rate": 2.1513522319973933e-05, + "loss": 0.4812, + "step": 10148 + }, + { + "epoch": 1.5887601753287415, + "grad_norm": 0.678665816783905, + "learning_rate": 2.1505376344086024e-05, + "loss": 0.1885, + "step": 10149 + }, + { + "epoch": 1.5889167188478397, + "grad_norm": 0.7789933085441589, + "learning_rate": 2.1497230368198114e-05, + "loss": 0.2622, + "step": 10150 + }, + { + "epoch": 1.589073262366938, + "grad_norm": 0.8975081443786621, + "learning_rate": 2.1489084392310198e-05, + "loss": 0.235, + "step": 10151 + }, + { + "epoch": 1.5892298058860364, + "grad_norm": 1.0126644372940063, + "learning_rate": 2.148093841642229e-05, + "loss": 0.26, + "step": 10152 + }, + { + "epoch": 1.5893863494051348, + "grad_norm": 6.004667282104492, + "learning_rate": 2.147279244053438e-05, + "loss": 0.4313, + "step": 10153 + }, + { + "epoch": 1.589542892924233, + "grad_norm": 2.92146897315979, + "learning_rate": 2.1464646464646466e-05, + "loss": 0.551, + "step": 10154 + }, + { + "epoch": 1.5896994364433312, + "grad_norm": 2.2435309886932373, + "learning_rate": 2.1456500488758553e-05, + 
"loss": 0.5852, + "step": 10155 + }, + { + "epoch": 1.5898559799624294, + "grad_norm": 1.6485599279403687, + "learning_rate": 2.1448354512870644e-05, + "loss": 0.2841, + "step": 10156 + }, + { + "epoch": 1.5900125234815279, + "grad_norm": 0.9750407934188843, + "learning_rate": 2.144020853698273e-05, + "loss": 0.3468, + "step": 10157 + }, + { + "epoch": 1.5901690670006263, + "grad_norm": 1.4097962379455566, + "learning_rate": 2.143206256109482e-05, + "loss": 0.3202, + "step": 10158 + }, + { + "epoch": 1.5903256105197245, + "grad_norm": 1.735344648361206, + "learning_rate": 2.142391658520691e-05, + "loss": 0.3944, + "step": 10159 + }, + { + "epoch": 1.5904821540388228, + "grad_norm": 1.3260771036148071, + "learning_rate": 2.1415770609318996e-05, + "loss": 0.48, + "step": 10160 + }, + { + "epoch": 1.590638697557921, + "grad_norm": 1.573287010192871, + "learning_rate": 2.1407624633431086e-05, + "loss": 0.2748, + "step": 10161 + }, + { + "epoch": 1.5907952410770194, + "grad_norm": 1.9471657276153564, + "learning_rate": 2.1399478657543177e-05, + "loss": 0.3952, + "step": 10162 + }, + { + "epoch": 1.5909517845961179, + "grad_norm": 2.1694037914276123, + "learning_rate": 2.1391332681655264e-05, + "loss": 0.6153, + "step": 10163 + }, + { + "epoch": 1.591108328115216, + "grad_norm": 1.7018488645553589, + "learning_rate": 2.138318670576735e-05, + "loss": 0.4322, + "step": 10164 + }, + { + "epoch": 1.5912648716343143, + "grad_norm": 3.2899885177612305, + "learning_rate": 2.137504072987944e-05, + "loss": 0.5838, + "step": 10165 + }, + { + "epoch": 1.5914214151534125, + "grad_norm": 2.7127275466918945, + "learning_rate": 2.136689475399153e-05, + "loss": 0.7874, + "step": 10166 + }, + { + "epoch": 1.591577958672511, + "grad_norm": 5.885900020599365, + "learning_rate": 2.135874877810362e-05, + "loss": 0.8773, + "step": 10167 + }, + { + "epoch": 1.5917345021916094, + "grad_norm": 1.9634215831756592, + "learning_rate": 2.1350602802215706e-05, + "loss": 0.5324, + "step": 10168 + }, + 
{ + "epoch": 1.5918910457107076, + "grad_norm": 2.522061824798584, + "learning_rate": 2.1342456826327793e-05, + "loss": 0.663, + "step": 10169 + }, + { + "epoch": 1.5920475892298058, + "grad_norm": 1.8484842777252197, + "learning_rate": 2.1334310850439884e-05, + "loss": 0.6193, + "step": 10170 + }, + { + "epoch": 1.592204132748904, + "grad_norm": 5.089610576629639, + "learning_rate": 2.132616487455197e-05, + "loss": 0.8059, + "step": 10171 + }, + { + "epoch": 1.5923606762680025, + "grad_norm": 3.33300518989563, + "learning_rate": 2.131801889866406e-05, + "loss": 1.0289, + "step": 10172 + }, + { + "epoch": 1.592517219787101, + "grad_norm": 2.222883462905884, + "learning_rate": 2.130987292277615e-05, + "loss": 0.6842, + "step": 10173 + }, + { + "epoch": 1.5926737633061991, + "grad_norm": 2.8210065364837646, + "learning_rate": 2.1301726946888236e-05, + "loss": 0.7211, + "step": 10174 + }, + { + "epoch": 1.5928303068252974, + "grad_norm": 2.0147745609283447, + "learning_rate": 2.1293580971000326e-05, + "loss": 0.7583, + "step": 10175 + }, + { + "epoch": 1.5929868503443956, + "grad_norm": 2.189894199371338, + "learning_rate": 2.1285434995112417e-05, + "loss": 0.587, + "step": 10176 + }, + { + "epoch": 1.593143393863494, + "grad_norm": 3.180469512939453, + "learning_rate": 2.1277289019224504e-05, + "loss": 0.5524, + "step": 10177 + }, + { + "epoch": 1.5932999373825925, + "grad_norm": 6.680810928344727, + "learning_rate": 2.126914304333659e-05, + "loss": 0.907, + "step": 10178 + }, + { + "epoch": 1.5934564809016907, + "grad_norm": 2.4174089431762695, + "learning_rate": 2.126099706744868e-05, + "loss": 1.0203, + "step": 10179 + }, + { + "epoch": 1.593613024420789, + "grad_norm": 8.417987823486328, + "learning_rate": 2.125285109156077e-05, + "loss": 1.0271, + "step": 10180 + }, + { + "epoch": 1.5937695679398873, + "grad_norm": 4.5896759033203125, + "learning_rate": 2.124470511567286e-05, + "loss": 0.7443, + "step": 10181 + }, + { + "epoch": 1.5939261114589856, + 
"grad_norm": 3.681248426437378, + "learning_rate": 2.1236559139784946e-05, + "loss": 0.9853, + "step": 10182 + }, + { + "epoch": 1.594082654978084, + "grad_norm": 2.811600685119629, + "learning_rate": 2.1228413163897034e-05, + "loss": 1.1006, + "step": 10183 + }, + { + "epoch": 1.5942391984971822, + "grad_norm": 2.357684373855591, + "learning_rate": 2.1220267188009124e-05, + "loss": 0.3062, + "step": 10184 + }, + { + "epoch": 1.5943957420162804, + "grad_norm": 2.5608015060424805, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.4624, + "step": 10185 + }, + { + "epoch": 1.5945522855353789, + "grad_norm": 6.207283020019531, + "learning_rate": 2.1203975236233302e-05, + "loss": 0.6082, + "step": 10186 + }, + { + "epoch": 1.5947088290544773, + "grad_norm": 2.735097646713257, + "learning_rate": 2.119582926034539e-05, + "loss": 0.5744, + "step": 10187 + }, + { + "epoch": 1.5948653725735755, + "grad_norm": 4.116175174713135, + "learning_rate": 2.118768328445748e-05, + "loss": 1.0979, + "step": 10188 + }, + { + "epoch": 1.5950219160926737, + "grad_norm": 0.6020925641059875, + "learning_rate": 2.1179537308569567e-05, + "loss": 0.2012, + "step": 10189 + }, + { + "epoch": 1.595178459611772, + "grad_norm": 0.4008946716785431, + "learning_rate": 2.1171391332681657e-05, + "loss": 0.1332, + "step": 10190 + }, + { + "epoch": 1.5953350031308704, + "grad_norm": 0.7731022834777832, + "learning_rate": 2.1163245356793744e-05, + "loss": 0.2648, + "step": 10191 + }, + { + "epoch": 1.5954915466499688, + "grad_norm": 0.9887546300888062, + "learning_rate": 2.115509938090583e-05, + "loss": 0.3141, + "step": 10192 + }, + { + "epoch": 1.595648090169067, + "grad_norm": 0.6013096570968628, + "learning_rate": 2.1146953405017922e-05, + "loss": 0.2206, + "step": 10193 + }, + { + "epoch": 1.5958046336881653, + "grad_norm": 0.5530204176902771, + "learning_rate": 2.1138807429130012e-05, + "loss": 0.2332, + "step": 10194 + }, + { + "epoch": 1.5959611772072635, + "grad_norm": 0.6343004107475281, + 
"learning_rate": 2.11306614532421e-05, + "loss": 0.2697, + "step": 10195 + }, + { + "epoch": 1.596117720726362, + "grad_norm": 0.8770650029182434, + "learning_rate": 2.1122515477354187e-05, + "loss": 0.2223, + "step": 10196 + }, + { + "epoch": 1.5962742642454604, + "grad_norm": 1.3516103029251099, + "learning_rate": 2.1114369501466277e-05, + "loss": 0.4374, + "step": 10197 + }, + { + "epoch": 1.5964308077645586, + "grad_norm": 0.8817640542984009, + "learning_rate": 2.1106223525578364e-05, + "loss": 0.2371, + "step": 10198 + }, + { + "epoch": 1.5965873512836568, + "grad_norm": 2.1199121475219727, + "learning_rate": 2.1098077549690455e-05, + "loss": 0.386, + "step": 10199 + }, + { + "epoch": 1.596743894802755, + "grad_norm": 2.1556339263916016, + "learning_rate": 2.1089931573802542e-05, + "loss": 0.4089, + "step": 10200 + }, + { + "epoch": 1.5969004383218535, + "grad_norm": 1.0028367042541504, + "learning_rate": 2.108178559791463e-05, + "loss": 0.2504, + "step": 10201 + }, + { + "epoch": 1.597056981840952, + "grad_norm": 1.8331717252731323, + "learning_rate": 2.107363962202672e-05, + "loss": 0.5124, + "step": 10202 + }, + { + "epoch": 1.5972135253600501, + "grad_norm": 1.4498990774154663, + "learning_rate": 2.106549364613881e-05, + "loss": 0.3791, + "step": 10203 + }, + { + "epoch": 1.5973700688791483, + "grad_norm": 1.2187668085098267, + "learning_rate": 2.1057347670250897e-05, + "loss": 0.326, + "step": 10204 + }, + { + "epoch": 1.5975266123982466, + "grad_norm": 1.280874490737915, + "learning_rate": 2.1049201694362985e-05, + "loss": 0.3773, + "step": 10205 + }, + { + "epoch": 1.597683155917345, + "grad_norm": 1.7650017738342285, + "learning_rate": 2.1041055718475075e-05, + "loss": 0.424, + "step": 10206 + }, + { + "epoch": 1.5978396994364434, + "grad_norm": 0.6418395042419434, + "learning_rate": 2.1032909742587162e-05, + "loss": 0.1593, + "step": 10207 + }, + { + "epoch": 1.5979962429555417, + "grad_norm": 2.4991915225982666, + "learning_rate": 
2.1024763766699253e-05, + "loss": 0.5989, + "step": 10208 + }, + { + "epoch": 1.5981527864746399, + "grad_norm": 6.967658996582031, + "learning_rate": 2.101661779081134e-05, + "loss": 0.5561, + "step": 10209 + }, + { + "epoch": 1.5983093299937383, + "grad_norm": 1.5519819259643555, + "learning_rate": 2.1008471814923427e-05, + "loss": 0.3638, + "step": 10210 + }, + { + "epoch": 1.5984658735128365, + "grad_norm": 2.5759992599487305, + "learning_rate": 2.1000325839035518e-05, + "loss": 0.539, + "step": 10211 + }, + { + "epoch": 1.598622417031935, + "grad_norm": 1.5714833736419678, + "learning_rate": 2.0992179863147608e-05, + "loss": 0.5415, + "step": 10212 + }, + { + "epoch": 1.5987789605510332, + "grad_norm": 1.860336184501648, + "learning_rate": 2.0984033887259695e-05, + "loss": 0.4067, + "step": 10213 + }, + { + "epoch": 1.5989355040701314, + "grad_norm": 2.5017271041870117, + "learning_rate": 2.0975887911371782e-05, + "loss": 0.776, + "step": 10214 + }, + { + "epoch": 1.5990920475892298, + "grad_norm": 2.47879695892334, + "learning_rate": 2.0967741935483873e-05, + "loss": 0.5257, + "step": 10215 + }, + { + "epoch": 1.599248591108328, + "grad_norm": 2.936952829360962, + "learning_rate": 2.095959595959596e-05, + "loss": 0.6224, + "step": 10216 + }, + { + "epoch": 1.5994051346274265, + "grad_norm": 3.536778211593628, + "learning_rate": 2.095144998370805e-05, + "loss": 0.8763, + "step": 10217 + }, + { + "epoch": 1.5995616781465247, + "grad_norm": 1.958005666732788, + "learning_rate": 2.0943304007820138e-05, + "loss": 0.6056, + "step": 10218 + }, + { + "epoch": 1.599718221665623, + "grad_norm": 4.751214981079102, + "learning_rate": 2.0935158031932225e-05, + "loss": 1.1865, + "step": 10219 + }, + { + "epoch": 1.5998747651847214, + "grad_norm": 1.8862663507461548, + "learning_rate": 2.0927012056044315e-05, + "loss": 0.825, + "step": 10220 + }, + { + "epoch": 1.6000313087038198, + "grad_norm": 2.871076822280884, + "learning_rate": 2.0918866080156406e-05, + "loss": 0.839, 
+ "step": 10221 + }, + { + "epoch": 1.600187852222918, + "grad_norm": 1.7996809482574463, + "learning_rate": 2.0910720104268493e-05, + "loss": 0.5563, + "step": 10222 + }, + { + "epoch": 1.6003443957420163, + "grad_norm": 8.449995040893555, + "learning_rate": 2.090257412838058e-05, + "loss": 1.0158, + "step": 10223 + }, + { + "epoch": 1.6005009392611145, + "grad_norm": 3.5177698135375977, + "learning_rate": 2.089442815249267e-05, + "loss": 0.9912, + "step": 10224 + }, + { + "epoch": 1.600657482780213, + "grad_norm": 2.9115688800811768, + "learning_rate": 2.0886282176604758e-05, + "loss": 0.7739, + "step": 10225 + }, + { + "epoch": 1.6008140262993114, + "grad_norm": 8.927573204040527, + "learning_rate": 2.0878136200716848e-05, + "loss": 1.0208, + "step": 10226 + }, + { + "epoch": 1.6009705698184096, + "grad_norm": 2.7009334564208984, + "learning_rate": 2.0869990224828935e-05, + "loss": 0.8623, + "step": 10227 + }, + { + "epoch": 1.6011271133375078, + "grad_norm": 3.4519436359405518, + "learning_rate": 2.0861844248941023e-05, + "loss": 1.164, + "step": 10228 + }, + { + "epoch": 1.601283656856606, + "grad_norm": 4.45281982421875, + "learning_rate": 2.0853698273053113e-05, + "loss": 0.8325, + "step": 10229 + }, + { + "epoch": 1.6014402003757044, + "grad_norm": 3.1451778411865234, + "learning_rate": 2.0845552297165204e-05, + "loss": 1.2937, + "step": 10230 + }, + { + "epoch": 1.6015967438948029, + "grad_norm": 2.4359099864959717, + "learning_rate": 2.083740632127729e-05, + "loss": 0.8111, + "step": 10231 + }, + { + "epoch": 1.601753287413901, + "grad_norm": 1.0259488821029663, + "learning_rate": 2.0829260345389378e-05, + "loss": 0.2718, + "step": 10232 + }, + { + "epoch": 1.6019098309329993, + "grad_norm": 3.8691201210021973, + "learning_rate": 2.082111436950147e-05, + "loss": 0.7882, + "step": 10233 + }, + { + "epoch": 1.6020663744520975, + "grad_norm": 2.2194314002990723, + "learning_rate": 2.0812968393613556e-05, + "loss": 0.7407, + "step": 10234 + }, + { + "epoch": 
1.602222917971196, + "grad_norm": 0.8423169255256653, + "learning_rate": 2.0804822417725646e-05, + "loss": 0.1837, + "step": 10235 + }, + { + "epoch": 1.6023794614902944, + "grad_norm": 2.184020519256592, + "learning_rate": 2.0796676441837733e-05, + "loss": 0.5625, + "step": 10236 + }, + { + "epoch": 1.6025360050093926, + "grad_norm": 1.7898670434951782, + "learning_rate": 2.078853046594982e-05, + "loss": 0.7617, + "step": 10237 + }, + { + "epoch": 1.6026925485284909, + "grad_norm": 1.7059695720672607, + "learning_rate": 2.078038449006191e-05, + "loss": 1.0986, + "step": 10238 + }, + { + "epoch": 1.602849092047589, + "grad_norm": 0.5156115889549255, + "learning_rate": 2.0772238514174e-05, + "loss": 0.2006, + "step": 10239 + }, + { + "epoch": 1.6030056355666875, + "grad_norm": 0.535067081451416, + "learning_rate": 2.0764092538286085e-05, + "loss": 0.2457, + "step": 10240 + }, + { + "epoch": 1.603162179085786, + "grad_norm": 0.3973061740398407, + "learning_rate": 2.0755946562398176e-05, + "loss": 0.1486, + "step": 10241 + }, + { + "epoch": 1.6033187226048842, + "grad_norm": 0.5349472165107727, + "learning_rate": 2.0747800586510266e-05, + "loss": 0.2829, + "step": 10242 + }, + { + "epoch": 1.6034752661239824, + "grad_norm": 0.5498707890510559, + "learning_rate": 2.0739654610622353e-05, + "loss": 0.1903, + "step": 10243 + }, + { + "epoch": 1.6036318096430808, + "grad_norm": 1.1939010620117188, + "learning_rate": 2.0731508634734444e-05, + "loss": 0.4002, + "step": 10244 + }, + { + "epoch": 1.603788353162179, + "grad_norm": 0.7345845103263855, + "learning_rate": 2.072336265884653e-05, + "loss": 0.3093, + "step": 10245 + }, + { + "epoch": 1.6039448966812775, + "grad_norm": 1.1648309230804443, + "learning_rate": 2.0715216682958618e-05, + "loss": 0.2013, + "step": 10246 + }, + { + "epoch": 1.6041014402003757, + "grad_norm": 0.5930057764053345, + "learning_rate": 2.070707070707071e-05, + "loss": 0.2229, + "step": 10247 + }, + { + "epoch": 1.604257983719474, + "grad_norm": 
1.085510015487671, + "learning_rate": 2.06989247311828e-05, + "loss": 0.3187, + "step": 10248 + }, + { + "epoch": 1.6044145272385724, + "grad_norm": 0.8410932421684265, + "learning_rate": 2.0690778755294883e-05, + "loss": 0.2245, + "step": 10249 + }, + { + "epoch": 1.6045710707576706, + "grad_norm": 1.254292607307434, + "learning_rate": 2.0682632779406973e-05, + "loss": 0.3265, + "step": 10250 + }, + { + "epoch": 1.604727614276769, + "grad_norm": 1.2341804504394531, + "learning_rate": 2.0674486803519064e-05, + "loss": 0.1214, + "step": 10251 + }, + { + "epoch": 1.6048841577958672, + "grad_norm": 1.3117117881774902, + "learning_rate": 2.066634082763115e-05, + "loss": 0.3736, + "step": 10252 + }, + { + "epoch": 1.6050407013149655, + "grad_norm": 1.2742987871170044, + "learning_rate": 2.065819485174324e-05, + "loss": 0.3818, + "step": 10253 + }, + { + "epoch": 1.605197244834064, + "grad_norm": 1.5768238306045532, + "learning_rate": 2.065004887585533e-05, + "loss": 0.3129, + "step": 10254 + }, + { + "epoch": 1.6053537883531623, + "grad_norm": 1.9089137315750122, + "learning_rate": 2.0641902899967416e-05, + "loss": 0.4926, + "step": 10255 + }, + { + "epoch": 1.6055103318722606, + "grad_norm": 1.4710845947265625, + "learning_rate": 2.0633756924079506e-05, + "loss": 0.4277, + "step": 10256 + }, + { + "epoch": 1.6056668753913588, + "grad_norm": 1.3503708839416504, + "learning_rate": 2.0625610948191597e-05, + "loss": 0.2587, + "step": 10257 + }, + { + "epoch": 1.605823418910457, + "grad_norm": 2.062999725341797, + "learning_rate": 2.061746497230368e-05, + "loss": 0.4071, + "step": 10258 + }, + { + "epoch": 1.6059799624295554, + "grad_norm": 1.4687066078186035, + "learning_rate": 2.060931899641577e-05, + "loss": 0.5958, + "step": 10259 + }, + { + "epoch": 1.6061365059486539, + "grad_norm": 1.842149257659912, + "learning_rate": 2.0601173020527862e-05, + "loss": 0.2959, + "step": 10260 + }, + { + "epoch": 1.606293049467752, + "grad_norm": 1.14681077003479, + "learning_rate": 
2.059302704463995e-05, + "loss": 0.2088, + "step": 10261 + }, + { + "epoch": 1.6064495929868503, + "grad_norm": 2.8755252361297607, + "learning_rate": 2.0584881068752036e-05, + "loss": 0.6675, + "step": 10262 + }, + { + "epoch": 1.6066061365059485, + "grad_norm": 1.557287335395813, + "learning_rate": 2.0576735092864127e-05, + "loss": 0.4688, + "step": 10263 + }, + { + "epoch": 1.606762680025047, + "grad_norm": 1.3130252361297607, + "learning_rate": 2.0568589116976214e-05, + "loss": 0.6894, + "step": 10264 + }, + { + "epoch": 1.6069192235441454, + "grad_norm": 1.6940077543258667, + "learning_rate": 2.0560443141088304e-05, + "loss": 0.2663, + "step": 10265 + }, + { + "epoch": 1.6070757670632436, + "grad_norm": 2.614046573638916, + "learning_rate": 2.0552297165200395e-05, + "loss": 0.3232, + "step": 10266 + }, + { + "epoch": 1.6072323105823418, + "grad_norm": 2.5123918056488037, + "learning_rate": 2.054415118931248e-05, + "loss": 0.5055, + "step": 10267 + }, + { + "epoch": 1.60738885410144, + "grad_norm": 2.0351808071136475, + "learning_rate": 2.053600521342457e-05, + "loss": 0.451, + "step": 10268 + }, + { + "epoch": 1.6075453976205385, + "grad_norm": 3.867814779281616, + "learning_rate": 2.052785923753666e-05, + "loss": 1.1049, + "step": 10269 + }, + { + "epoch": 1.607701941139637, + "grad_norm": 1.6765589714050293, + "learning_rate": 2.0519713261648747e-05, + "loss": 0.4399, + "step": 10270 + }, + { + "epoch": 1.6078584846587352, + "grad_norm": 4.719061374664307, + "learning_rate": 2.0511567285760834e-05, + "loss": 0.8729, + "step": 10271 + }, + { + "epoch": 1.6080150281778334, + "grad_norm": 2.0653295516967773, + "learning_rate": 2.0503421309872924e-05, + "loss": 0.7954, + "step": 10272 + }, + { + "epoch": 1.6081715716969316, + "grad_norm": 3.9201056957244873, + "learning_rate": 2.049527533398501e-05, + "loss": 0.6988, + "step": 10273 + }, + { + "epoch": 1.60832811521603, + "grad_norm": 3.4977173805236816, + "learning_rate": 2.0487129358097102e-05, + "loss": 
1.2416, + "step": 10274 + }, + { + "epoch": 1.6084846587351285, + "grad_norm": 5.886085510253906, + "learning_rate": 2.0478983382209193e-05, + "loss": 1.217, + "step": 10275 + }, + { + "epoch": 1.6086412022542267, + "grad_norm": 2.572796106338501, + "learning_rate": 2.0470837406321276e-05, + "loss": 0.8322, + "step": 10276 + }, + { + "epoch": 1.608797745773325, + "grad_norm": NaN, + "learning_rate": 2.0470837406321276e-05, + "loss": 0.0, + "step": 10277 + }, + { + "epoch": 1.6089542892924233, + "grad_norm": 3.4777302742004395, + "learning_rate": 2.0462691430433367e-05, + "loss": 0.9929, + "step": 10278 + }, + { + "epoch": 1.6091108328115216, + "grad_norm": 2.124823570251465, + "learning_rate": 2.0454545454545457e-05, + "loss": 0.6372, + "step": 10279 + }, + { + "epoch": 1.60926737633062, + "grad_norm": 2.8805737495422363, + "learning_rate": 2.0446399478657545e-05, + "loss": 1.2243, + "step": 10280 + }, + { + "epoch": 1.6094239198497182, + "grad_norm": 1.8434771299362183, + "learning_rate": 2.043825350276963e-05, + "loss": 0.8771, + "step": 10281 + }, + { + "epoch": 1.6095804633688164, + "grad_norm": 3.5126566886901855, + "learning_rate": 2.0430107526881722e-05, + "loss": 1.0626, + "step": 10282 + }, + { + "epoch": 1.6097370068879149, + "grad_norm": 4.863789081573486, + "learning_rate": 2.042196155099381e-05, + "loss": 1.1996, + "step": 10283 + }, + { + "epoch": 1.609893550407013, + "grad_norm": 2.7294840812683105, + "learning_rate": 2.04138155751059e-05, + "loss": 0.5921, + "step": 10284 + }, + { + "epoch": 1.6100500939261115, + "grad_norm": 2.275157928466797, + "learning_rate": 2.0405669599217987e-05, + "loss": 0.6672, + "step": 10285 + }, + { + "epoch": 1.6102066374452098, + "grad_norm": 1.5523526668548584, + "learning_rate": 2.0397523623330074e-05, + "loss": 0.7168, + "step": 10286 + }, + { + "epoch": 1.610363180964308, + "grad_norm": 5.165606498718262, + "learning_rate": 2.0389377647442165e-05, + "loss": 0.9596, + "step": 10287 + }, + { + "epoch": 
1.6105197244834064, + "grad_norm": 2.7745704650878906, + "learning_rate": 2.0381231671554255e-05, + "loss": 1.3679, + "step": 10288 + }, + { + "epoch": 1.6106762680025049, + "grad_norm": 0.511305034160614, + "learning_rate": 2.0373085695666342e-05, + "loss": 0.1551, + "step": 10289 + }, + { + "epoch": 1.610832811521603, + "grad_norm": 1.0990225076675415, + "learning_rate": 2.036493971977843e-05, + "loss": 0.2962, + "step": 10290 + }, + { + "epoch": 1.6109893550407013, + "grad_norm": 0.6349324584007263, + "learning_rate": 2.035679374389052e-05, + "loss": 0.1814, + "step": 10291 + }, + { + "epoch": 1.6111458985597995, + "grad_norm": 1.0268129110336304, + "learning_rate": 2.0348647768002607e-05, + "loss": 0.1137, + "step": 10292 + }, + { + "epoch": 1.611302442078898, + "grad_norm": 1.241778016090393, + "learning_rate": 2.0340501792114698e-05, + "loss": 0.2422, + "step": 10293 + }, + { + "epoch": 1.6114589855979964, + "grad_norm": 0.5685115456581116, + "learning_rate": 2.0332355816226785e-05, + "loss": 0.2207, + "step": 10294 + }, + { + "epoch": 1.6116155291170946, + "grad_norm": 0.5130642056465149, + "learning_rate": 2.0324209840338872e-05, + "loss": 0.2948, + "step": 10295 + }, + { + "epoch": 1.6117720726361928, + "grad_norm": 1.5458450317382812, + "learning_rate": 2.0316063864450962e-05, + "loss": 0.3155, + "step": 10296 + }, + { + "epoch": 1.611928616155291, + "grad_norm": 0.9877769351005554, + "learning_rate": 2.030791788856305e-05, + "loss": 0.1746, + "step": 10297 + }, + { + "epoch": 1.6120851596743895, + "grad_norm": 0.8671873807907104, + "learning_rate": 2.029977191267514e-05, + "loss": 0.3308, + "step": 10298 + }, + { + "epoch": 1.612241703193488, + "grad_norm": 0.628835141658783, + "learning_rate": 2.0291625936787227e-05, + "loss": 0.21, + "step": 10299 + }, + { + "epoch": 1.6123982467125861, + "grad_norm": 0.6392622590065002, + "learning_rate": 2.0283479960899314e-05, + "loss": 0.2321, + "step": 10300 + }, + { + "epoch": 1.6125547902316844, + "grad_norm": 
0.7944560050964355, + "learning_rate": 2.0275333985011405e-05, + "loss": 0.2427, + "step": 10301 + }, + { + "epoch": 1.6127113337507826, + "grad_norm": 1.097984790802002, + "learning_rate": 2.0267188009123495e-05, + "loss": 0.2376, + "step": 10302 + }, + { + "epoch": 1.612867877269881, + "grad_norm": 1.75529146194458, + "learning_rate": 2.0259042033235583e-05, + "loss": 0.3511, + "step": 10303 + }, + { + "epoch": 1.6130244207889795, + "grad_norm": 0.8020296096801758, + "learning_rate": 2.025089605734767e-05, + "loss": 0.3468, + "step": 10304 + }, + { + "epoch": 1.6131809643080777, + "grad_norm": 1.6465972661972046, + "learning_rate": 2.024275008145976e-05, + "loss": 0.3856, + "step": 10305 + }, + { + "epoch": 1.6133375078271759, + "grad_norm": 1.089805006980896, + "learning_rate": 2.0234604105571847e-05, + "loss": 0.4091, + "step": 10306 + }, + { + "epoch": 1.613494051346274, + "grad_norm": 3.378706455230713, + "learning_rate": 2.0226458129683938e-05, + "loss": 0.6655, + "step": 10307 + }, + { + "epoch": 1.6136505948653725, + "grad_norm": 1.7636624574661255, + "learning_rate": 2.0218312153796025e-05, + "loss": 0.5921, + "step": 10308 + }, + { + "epoch": 1.613807138384471, + "grad_norm": 1.4930670261383057, + "learning_rate": 2.0210166177908112e-05, + "loss": 0.248, + "step": 10309 + }, + { + "epoch": 1.6139636819035692, + "grad_norm": 1.678263783454895, + "learning_rate": 2.0202020202020203e-05, + "loss": 0.3892, + "step": 10310 + }, + { + "epoch": 1.6141202254226674, + "grad_norm": 1.4035494327545166, + "learning_rate": 2.0193874226132293e-05, + "loss": 0.3047, + "step": 10311 + }, + { + "epoch": 1.6142767689417659, + "grad_norm": 2.219306230545044, + "learning_rate": 2.018572825024438e-05, + "loss": 0.4214, + "step": 10312 + }, + { + "epoch": 1.614433312460864, + "grad_norm": 4.1408610343933105, + "learning_rate": 2.0177582274356467e-05, + "loss": 0.735, + "step": 10313 + }, + { + "epoch": 1.6145898559799625, + "grad_norm": 2.9284255504608154, + "learning_rate": 
2.0169436298468558e-05, + "loss": 0.4752, + "step": 10314 + }, + { + "epoch": 1.6147463994990607, + "grad_norm": 3.1653499603271484, + "learning_rate": 2.0161290322580645e-05, + "loss": 0.6797, + "step": 10315 + }, + { + "epoch": 1.614902943018159, + "grad_norm": 2.9362828731536865, + "learning_rate": 2.0153144346692736e-05, + "loss": 0.7839, + "step": 10316 + }, + { + "epoch": 1.6150594865372574, + "grad_norm": 1.671846628189087, + "learning_rate": 2.0144998370804823e-05, + "loss": 0.6196, + "step": 10317 + }, + { + "epoch": 1.6152160300563556, + "grad_norm": 1.831560730934143, + "learning_rate": 2.013685239491691e-05, + "loss": 0.537, + "step": 10318 + }, + { + "epoch": 1.615372573575454, + "grad_norm": 2.9861881732940674, + "learning_rate": 2.0128706419029e-05, + "loss": 0.5756, + "step": 10319 + }, + { + "epoch": 1.6155291170945523, + "grad_norm": 1.9889366626739502, + "learning_rate": 2.012056044314109e-05, + "loss": 0.389, + "step": 10320 + }, + { + "epoch": 1.6156856606136505, + "grad_norm": 4.4227728843688965, + "learning_rate": 2.0112414467253178e-05, + "loss": 1.1607, + "step": 10321 + }, + { + "epoch": 1.615842204132749, + "grad_norm": 1.7247295379638672, + "learning_rate": 2.0104268491365265e-05, + "loss": 0.8311, + "step": 10322 + }, + { + "epoch": 1.6159987476518474, + "grad_norm": 5.667601108551025, + "learning_rate": 2.0096122515477356e-05, + "loss": 0.6128, + "step": 10323 + }, + { + "epoch": 1.6161552911709456, + "grad_norm": 1.9812357425689697, + "learning_rate": 2.0087976539589443e-05, + "loss": 0.4907, + "step": 10324 + }, + { + "epoch": 1.6163118346900438, + "grad_norm": 1.9502283334732056, + "learning_rate": 2.0079830563701533e-05, + "loss": 0.7375, + "step": 10325 + }, + { + "epoch": 1.616468378209142, + "grad_norm": 6.490029811859131, + "learning_rate": 2.007168458781362e-05, + "loss": 0.6816, + "step": 10326 + }, + { + "epoch": 1.6166249217282405, + "grad_norm": 1.6848992109298706, + "learning_rate": 2.0063538611925708e-05, + "loss": 
0.8601, + "step": 10327 + }, + { + "epoch": 1.616781465247339, + "grad_norm": 4.153322696685791, + "learning_rate": 2.0055392636037798e-05, + "loss": 0.7744, + "step": 10328 + }, + { + "epoch": 1.6169380087664371, + "grad_norm": 3.933520793914795, + "learning_rate": 2.004724666014989e-05, + "loss": 1.4771, + "step": 10329 + }, + { + "epoch": 1.6170945522855353, + "grad_norm": 2.307128429412842, + "learning_rate": 2.0039100684261976e-05, + "loss": 0.5057, + "step": 10330 + }, + { + "epoch": 1.6172510958046336, + "grad_norm": 7.078334808349609, + "learning_rate": 2.0030954708374063e-05, + "loss": 1.2583, + "step": 10331 + }, + { + "epoch": 1.617407639323732, + "grad_norm": 2.2063095569610596, + "learning_rate": 2.0022808732486154e-05, + "loss": 0.7431, + "step": 10332 + }, + { + "epoch": 1.6175641828428304, + "grad_norm": 2.963918447494507, + "learning_rate": 2.001466275659824e-05, + "loss": 1.2592, + "step": 10333 + }, + { + "epoch": 1.6177207263619287, + "grad_norm": 4.155981540679932, + "learning_rate": 2.000651678071033e-05, + "loss": 1.4949, + "step": 10334 + }, + { + "epoch": 1.6178772698810269, + "grad_norm": 4.020863056182861, + "learning_rate": 1.999837080482242e-05, + "loss": 0.6439, + "step": 10335 + }, + { + "epoch": 1.618033813400125, + "grad_norm": 1.2379478216171265, + "learning_rate": 1.9990224828934506e-05, + "loss": 0.5863, + "step": 10336 + }, + { + "epoch": 1.6181903569192235, + "grad_norm": 3.510092258453369, + "learning_rate": 1.9982078853046596e-05, + "loss": 0.9444, + "step": 10337 + }, + { + "epoch": 1.618346900438322, + "grad_norm": 3.3037843704223633, + "learning_rate": 1.9973932877158687e-05, + "loss": 0.6526, + "step": 10338 + }, + { + "epoch": 1.6185034439574202, + "grad_norm": 0.6012192964553833, + "learning_rate": 1.9965786901270774e-05, + "loss": 0.1911, + "step": 10339 + }, + { + "epoch": 1.6186599874765184, + "grad_norm": 0.4750175476074219, + "learning_rate": 1.995764092538286e-05, + "loss": 0.1776, + "step": 10340 + }, + { + 
"epoch": 1.6188165309956166, + "grad_norm": 0.4904175400733948, + "learning_rate": 1.994949494949495e-05, + "loss": 0.2017, + "step": 10341 + }, + { + "epoch": 1.618973074514715, + "grad_norm": 0.33910077810287476, + "learning_rate": 1.994134897360704e-05, + "loss": 0.1215, + "step": 10342 + }, + { + "epoch": 1.6191296180338135, + "grad_norm": 1.0075701475143433, + "learning_rate": 1.993320299771913e-05, + "loss": 0.2316, + "step": 10343 + }, + { + "epoch": 1.6192861615529117, + "grad_norm": 0.6985920071601868, + "learning_rate": 1.9925057021831216e-05, + "loss": 0.2419, + "step": 10344 + }, + { + "epoch": 1.61944270507201, + "grad_norm": 0.7907557487487793, + "learning_rate": 1.9916911045943303e-05, + "loss": 0.1461, + "step": 10345 + }, + { + "epoch": 1.6195992485911084, + "grad_norm": 0.6733208298683167, + "learning_rate": 1.9908765070055394e-05, + "loss": 0.2025, + "step": 10346 + }, + { + "epoch": 1.6197557921102066, + "grad_norm": 0.7201072573661804, + "learning_rate": 1.9900619094167484e-05, + "loss": 0.1515, + "step": 10347 + }, + { + "epoch": 1.619912335629305, + "grad_norm": 1.185990333557129, + "learning_rate": 1.989247311827957e-05, + "loss": 0.3934, + "step": 10348 + }, + { + "epoch": 1.6200688791484033, + "grad_norm": 2.2285702228546143, + "learning_rate": 1.988432714239166e-05, + "loss": 0.6274, + "step": 10349 + }, + { + "epoch": 1.6202254226675015, + "grad_norm": 0.7934250235557556, + "learning_rate": 1.987618116650375e-05, + "loss": 0.2025, + "step": 10350 + }, + { + "epoch": 1.6203819661866, + "grad_norm": 1.1951051950454712, + "learning_rate": 1.9868035190615836e-05, + "loss": 0.4103, + "step": 10351 + }, + { + "epoch": 1.6205385097056983, + "grad_norm": 0.8307292461395264, + "learning_rate": 1.9859889214727927e-05, + "loss": 0.202, + "step": 10352 + }, + { + "epoch": 1.6206950532247966, + "grad_norm": 0.9230514764785767, + "learning_rate": 1.9851743238840014e-05, + "loss": 0.288, + "step": 10353 + }, + { + "epoch": 1.6208515967438948, + 
"grad_norm": 1.8937816619873047, + "learning_rate": 1.98435972629521e-05, + "loss": 0.3437, + "step": 10354 + }, + { + "epoch": 1.621008140262993, + "grad_norm": 1.674739956855774, + "learning_rate": 1.983545128706419e-05, + "loss": 0.6769, + "step": 10355 + }, + { + "epoch": 1.6211646837820914, + "grad_norm": 1.5428707599639893, + "learning_rate": 1.9827305311176282e-05, + "loss": 0.4623, + "step": 10356 + }, + { + "epoch": 1.6213212273011899, + "grad_norm": 1.847981572151184, + "learning_rate": 1.9819159335288366e-05, + "loss": 0.3554, + "step": 10357 + }, + { + "epoch": 1.621477770820288, + "grad_norm": 2.339102029800415, + "learning_rate": 1.9811013359400456e-05, + "loss": 0.3956, + "step": 10358 + }, + { + "epoch": 1.6216343143393863, + "grad_norm": 1.8228871822357178, + "learning_rate": 1.9802867383512547e-05, + "loss": 0.4937, + "step": 10359 + }, + { + "epoch": 1.6217908578584845, + "grad_norm": 2.309626340866089, + "learning_rate": 1.9794721407624634e-05, + "loss": 0.459, + "step": 10360 + }, + { + "epoch": 1.621947401377583, + "grad_norm": 1.9432828426361084, + "learning_rate": 1.9786575431736725e-05, + "loss": 0.4191, + "step": 10361 + }, + { + "epoch": 1.6221039448966814, + "grad_norm": 3.1549131870269775, + "learning_rate": 1.9778429455848812e-05, + "loss": 0.5561, + "step": 10362 + }, + { + "epoch": 1.6222604884157796, + "grad_norm": 3.6030447483062744, + "learning_rate": 1.97702834799609e-05, + "loss": 0.6711, + "step": 10363 + }, + { + "epoch": 1.6224170319348779, + "grad_norm": 2.4683148860931396, + "learning_rate": 1.976213750407299e-05, + "loss": 0.5967, + "step": 10364 + }, + { + "epoch": 1.622573575453976, + "grad_norm": 3.2946598529815674, + "learning_rate": 1.975399152818508e-05, + "loss": 0.7823, + "step": 10365 + }, + { + "epoch": 1.6227301189730745, + "grad_norm": 4.065118789672852, + "learning_rate": 1.9745845552297164e-05, + "loss": 0.73, + "step": 10366 + }, + { + "epoch": 1.622886662492173, + "grad_norm": 1.6865403652191162, + 
"learning_rate": 1.9737699576409254e-05, + "loss": 0.6575, + "step": 10367 + }, + { + "epoch": 1.6230432060112712, + "grad_norm": 9.835071563720703, + "learning_rate": 1.9729553600521345e-05, + "loss": 0.7292, + "step": 10368 + }, + { + "epoch": 1.6231997495303694, + "grad_norm": 1.8819154500961304, + "learning_rate": 1.9721407624633432e-05, + "loss": 0.755, + "step": 10369 + }, + { + "epoch": 1.6233562930494676, + "grad_norm": 1.7671360969543457, + "learning_rate": 1.9713261648745522e-05, + "loss": 0.3372, + "step": 10370 + }, + { + "epoch": 1.623512836568566, + "grad_norm": 3.7930731773376465, + "learning_rate": 1.970511567285761e-05, + "loss": 0.5812, + "step": 10371 + }, + { + "epoch": 1.6236693800876645, + "grad_norm": 4.344659328460693, + "learning_rate": 1.9696969696969697e-05, + "loss": 0.5842, + "step": 10372 + }, + { + "epoch": 1.6238259236067627, + "grad_norm": 2.0720763206481934, + "learning_rate": 1.9688823721081787e-05, + "loss": 0.698, + "step": 10373 + }, + { + "epoch": 1.623982467125861, + "grad_norm": 3.1723203659057617, + "learning_rate": 1.9680677745193878e-05, + "loss": 0.8667, + "step": 10374 + }, + { + "epoch": 1.6241390106449591, + "grad_norm": 1.4990514516830444, + "learning_rate": 1.967253176930596e-05, + "loss": 0.3833, + "step": 10375 + }, + { + "epoch": 1.6242955541640576, + "grad_norm": 2.605715751647949, + "learning_rate": 1.9664385793418052e-05, + "loss": 0.7601, + "step": 10376 + }, + { + "epoch": 1.624452097683156, + "grad_norm": 4.497624397277832, + "learning_rate": 1.9656239817530143e-05, + "loss": 1.2623, + "step": 10377 + }, + { + "epoch": 1.6246086412022542, + "grad_norm": 2.12886118888855, + "learning_rate": 1.964809384164223e-05, + "loss": 0.5559, + "step": 10378 + }, + { + "epoch": 1.6247651847213525, + "grad_norm": 3.734893798828125, + "learning_rate": 1.9639947865754317e-05, + "loss": 1.0857, + "step": 10379 + }, + { + "epoch": 1.624921728240451, + "grad_norm": 2.6834025382995605, + "learning_rate": 
1.9631801889866407e-05, + "loss": 1.3174, + "step": 10380 + }, + { + "epoch": 1.625078271759549, + "grad_norm": 2.0532753467559814, + "learning_rate": 1.9623655913978494e-05, + "loss": 0.6142, + "step": 10381 + }, + { + "epoch": 1.6252348152786475, + "grad_norm": 4.229538440704346, + "learning_rate": 1.9615509938090585e-05, + "loss": 1.001, + "step": 10382 + }, + { + "epoch": 1.6253913587977458, + "grad_norm": 5.919605731964111, + "learning_rate": 1.9607363962202676e-05, + "loss": 1.2016, + "step": 10383 + }, + { + "epoch": 1.625547902316844, + "grad_norm": 1.8070989847183228, + "learning_rate": 1.959921798631476e-05, + "loss": 0.3329, + "step": 10384 + }, + { + "epoch": 1.6257044458359424, + "grad_norm": 1.7401186227798462, + "learning_rate": 1.959107201042685e-05, + "loss": 0.6896, + "step": 10385 + }, + { + "epoch": 1.6258609893550409, + "grad_norm": 1.8397754430770874, + "learning_rate": 1.958292603453894e-05, + "loss": 0.5725, + "step": 10386 + }, + { + "epoch": 1.626017532874139, + "grad_norm": 1.8710349798202515, + "learning_rate": 1.9574780058651027e-05, + "loss": 0.5131, + "step": 10387 + }, + { + "epoch": 1.6261740763932373, + "grad_norm": 2.5483853816986084, + "learning_rate": 1.9566634082763115e-05, + "loss": 0.5336, + "step": 10388 + }, + { + "epoch": 1.6263306199123355, + "grad_norm": 0.5854129195213318, + "learning_rate": 1.9558488106875205e-05, + "loss": 0.1864, + "step": 10389 + }, + { + "epoch": 1.626487163431434, + "grad_norm": 0.641940176486969, + "learning_rate": 1.9550342130987292e-05, + "loss": 0.1616, + "step": 10390 + }, + { + "epoch": 1.6266437069505324, + "grad_norm": 0.3776359558105469, + "learning_rate": 1.9542196155099383e-05, + "loss": 0.1865, + "step": 10391 + }, + { + "epoch": 1.6268002504696306, + "grad_norm": 0.6387605667114258, + "learning_rate": 1.9534050179211473e-05, + "loss": 0.1527, + "step": 10392 + }, + { + "epoch": 1.6269567939887288, + "grad_norm": 0.8531847596168518, + "learning_rate": 1.9525904203323557e-05, + "loss": 
0.2163, + "step": 10393 + }, + { + "epoch": 1.627113337507827, + "grad_norm": 0.6515390872955322, + "learning_rate": 1.9517758227435648e-05, + "loss": 0.1678, + "step": 10394 + }, + { + "epoch": 1.6272698810269255, + "grad_norm": 0.6404340267181396, + "learning_rate": 1.9509612251547738e-05, + "loss": 0.154, + "step": 10395 + }, + { + "epoch": 1.627426424546024, + "grad_norm": 0.773704469203949, + "learning_rate": 1.9501466275659825e-05, + "loss": 0.2568, + "step": 10396 + }, + { + "epoch": 1.6275829680651221, + "grad_norm": 0.5345539450645447, + "learning_rate": 1.9493320299771912e-05, + "loss": 0.1307, + "step": 10397 + }, + { + "epoch": 1.6277395115842204, + "grad_norm": 1.2479101419448853, + "learning_rate": 1.9485174323884003e-05, + "loss": 0.3319, + "step": 10398 + }, + { + "epoch": 1.6278960551033186, + "grad_norm": 0.9615517258644104, + "learning_rate": 1.947702834799609e-05, + "loss": 0.2343, + "step": 10399 + }, + { + "epoch": 1.628052598622417, + "grad_norm": 0.7906052470207214, + "learning_rate": 1.946888237210818e-05, + "loss": 0.1962, + "step": 10400 + }, + { + "epoch": 1.6282091421415155, + "grad_norm": 0.6494764089584351, + "learning_rate": 1.9460736396220268e-05, + "loss": 0.2678, + "step": 10401 + }, + { + "epoch": 1.6283656856606137, + "grad_norm": 1.9376046657562256, + "learning_rate": 1.9452590420332355e-05, + "loss": 0.4899, + "step": 10402 + }, + { + "epoch": 1.628522229179712, + "grad_norm": 1.5597764253616333, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.4206, + "step": 10403 + }, + { + "epoch": 1.6286787726988101, + "grad_norm": 0.8758235573768616, + "learning_rate": 1.9436298468556536e-05, + "loss": 0.1916, + "step": 10404 + }, + { + "epoch": 1.6288353162179086, + "grad_norm": 1.682101845741272, + "learning_rate": 1.9428152492668623e-05, + "loss": 0.2877, + "step": 10405 + }, + { + "epoch": 1.628991859737007, + "grad_norm": 4.584434986114502, + "learning_rate": 1.942000651678071e-05, + "loss": 0.4074, + "step": 10406 + }, + { + 
"epoch": 1.6291484032561052, + "grad_norm": 1.5655168294906616, + "learning_rate": 1.94118605408928e-05, + "loss": 0.2538, + "step": 10407 + }, + { + "epoch": 1.6293049467752034, + "grad_norm": 1.920690894126892, + "learning_rate": 1.9403714565004888e-05, + "loss": 0.3523, + "step": 10408 + }, + { + "epoch": 1.6294614902943017, + "grad_norm": 5.293962001800537, + "learning_rate": 1.939556858911698e-05, + "loss": 0.4287, + "step": 10409 + }, + { + "epoch": 1.6296180338134, + "grad_norm": 1.1078988313674927, + "learning_rate": 1.9387422613229066e-05, + "loss": 0.2734, + "step": 10410 + }, + { + "epoch": 1.6297745773324985, + "grad_norm": 1.3927046060562134, + "learning_rate": 1.9379276637341153e-05, + "loss": 0.3648, + "step": 10411 + }, + { + "epoch": 1.6299311208515967, + "grad_norm": 2.488992929458618, + "learning_rate": 1.9371130661453243e-05, + "loss": 0.6421, + "step": 10412 + }, + { + "epoch": 1.630087664370695, + "grad_norm": 1.305591344833374, + "learning_rate": 1.9362984685565334e-05, + "loss": 0.2653, + "step": 10413 + }, + { + "epoch": 1.6302442078897934, + "grad_norm": 2.5287530422210693, + "learning_rate": 1.935483870967742e-05, + "loss": 0.5749, + "step": 10414 + }, + { + "epoch": 1.6304007514088916, + "grad_norm": 3.120553493499756, + "learning_rate": 1.9346692733789508e-05, + "loss": 0.6221, + "step": 10415 + }, + { + "epoch": 1.63055729492799, + "grad_norm": 2.15724515914917, + "learning_rate": 1.93385467579016e-05, + "loss": 0.4143, + "step": 10416 + }, + { + "epoch": 1.6307138384470883, + "grad_norm": 3.1805694103240967, + "learning_rate": 1.9330400782013686e-05, + "loss": 0.6857, + "step": 10417 + }, + { + "epoch": 1.6308703819661865, + "grad_norm": 2.376474142074585, + "learning_rate": 1.9322254806125776e-05, + "loss": 0.6402, + "step": 10418 + }, + { + "epoch": 1.631026925485285, + "grad_norm": 3.42748761177063, + "learning_rate": 1.9314108830237863e-05, + "loss": 0.455, + "step": 10419 + }, + { + "epoch": 1.6311834690043834, + "grad_norm": 
3.2777836322784424, + "learning_rate": 1.930596285434995e-05, + "loss": 0.8638, + "step": 10420 + }, + { + "epoch": 1.6313400125234816, + "grad_norm": 5.949195384979248, + "learning_rate": 1.929781687846204e-05, + "loss": 0.7097, + "step": 10421 + }, + { + "epoch": 1.6314965560425798, + "grad_norm": 2.927468776702881, + "learning_rate": 1.928967090257413e-05, + "loss": 0.7502, + "step": 10422 + }, + { + "epoch": 1.631653099561678, + "grad_norm": 1.7552094459533691, + "learning_rate": 1.928152492668622e-05, + "loss": 0.6281, + "step": 10423 + }, + { + "epoch": 1.6318096430807765, + "grad_norm": 3.8784427642822266, + "learning_rate": 1.9273378950798306e-05, + "loss": 0.6782, + "step": 10424 + }, + { + "epoch": 1.631966186599875, + "grad_norm": 4.423348903656006, + "learning_rate": 1.9265232974910393e-05, + "loss": 0.7605, + "step": 10425 + }, + { + "epoch": 1.6321227301189731, + "grad_norm": 1.9665762186050415, + "learning_rate": 1.9257086999022483e-05, + "loss": 0.6701, + "step": 10426 + }, + { + "epoch": 1.6322792736380713, + "grad_norm": 4.929471015930176, + "learning_rate": 1.9248941023134574e-05, + "loss": 0.9173, + "step": 10427 + }, + { + "epoch": 1.6324358171571696, + "grad_norm": 3.3410558700561523, + "learning_rate": 1.924079504724666e-05, + "loss": 1.0929, + "step": 10428 + }, + { + "epoch": 1.632592360676268, + "grad_norm": 15.012572288513184, + "learning_rate": 1.9232649071358748e-05, + "loss": 0.8039, + "step": 10429 + }, + { + "epoch": 1.6327489041953664, + "grad_norm": 5.181441307067871, + "learning_rate": 1.922450309547084e-05, + "loss": 1.0478, + "step": 10430 + }, + { + "epoch": 1.6329054477144647, + "grad_norm": 2.307860851287842, + "learning_rate": 1.9216357119582926e-05, + "loss": 0.7457, + "step": 10431 + }, + { + "epoch": 1.6330619912335629, + "grad_norm": 8.263679504394531, + "learning_rate": 1.9208211143695016e-05, + "loss": 1.2377, + "step": 10432 + }, + { + "epoch": 1.633218534752661, + "grad_norm": 3.9643726348876953, + "learning_rate": 
1.9200065167807104e-05, + "loss": 0.9566, + "step": 10433 + }, + { + "epoch": 1.6333750782717595, + "grad_norm": 3.3757569789886475, + "learning_rate": 1.919191919191919e-05, + "loss": 0.4091, + "step": 10434 + }, + { + "epoch": 1.633531621790858, + "grad_norm": 1.2500364780426025, + "learning_rate": 1.918377321603128e-05, + "loss": 0.2138, + "step": 10435 + }, + { + "epoch": 1.6336881653099562, + "grad_norm": 7.9198222160339355, + "learning_rate": 1.9175627240143372e-05, + "loss": 1.071, + "step": 10436 + }, + { + "epoch": 1.6338447088290544, + "grad_norm": 4.370776176452637, + "learning_rate": 1.916748126425546e-05, + "loss": 1.33, + "step": 10437 + }, + { + "epoch": 1.6340012523481526, + "grad_norm": 4.519410610198975, + "learning_rate": 1.9159335288367546e-05, + "loss": 1.3273, + "step": 10438 + }, + { + "epoch": 1.634157795867251, + "grad_norm": 0.5807533264160156, + "learning_rate": 1.9151189312479637e-05, + "loss": 0.1813, + "step": 10439 + }, + { + "epoch": 1.6343143393863495, + "grad_norm": 0.6870834827423096, + "learning_rate": 1.9143043336591724e-05, + "loss": 0.19, + "step": 10440 + }, + { + "epoch": 1.6344708829054477, + "grad_norm": 1.1598180532455444, + "learning_rate": 1.9134897360703814e-05, + "loss": 0.2398, + "step": 10441 + }, + { + "epoch": 1.634627426424546, + "grad_norm": 1.0232431888580322, + "learning_rate": 1.91267513848159e-05, + "loss": 0.2107, + "step": 10442 + }, + { + "epoch": 1.6347839699436444, + "grad_norm": 0.4688561260700226, + "learning_rate": 1.911860540892799e-05, + "loss": 0.2047, + "step": 10443 + }, + { + "epoch": 1.6349405134627426, + "grad_norm": 0.4733808636665344, + "learning_rate": 1.911045943304008e-05, + "loss": 0.1382, + "step": 10444 + }, + { + "epoch": 1.635097056981841, + "grad_norm": 1.259488582611084, + "learning_rate": 1.910231345715217e-05, + "loss": 0.1861, + "step": 10445 + }, + { + "epoch": 1.6352536005009393, + "grad_norm": 0.48728692531585693, + "learning_rate": 1.9094167481264257e-05, + "loss": 0.1813, 
+ "step": 10446 + }, + { + "epoch": 1.6354101440200375, + "grad_norm": 1.1155749559402466, + "learning_rate": 1.9086021505376344e-05, + "loss": 0.2762, + "step": 10447 + }, + { + "epoch": 1.635566687539136, + "grad_norm": 1.1619758605957031, + "learning_rate": 1.9077875529488434e-05, + "loss": 0.2568, + "step": 10448 + }, + { + "epoch": 1.6357232310582341, + "grad_norm": 0.7947984337806702, + "learning_rate": 1.906972955360052e-05, + "loss": 0.2475, + "step": 10449 + }, + { + "epoch": 1.6358797745773326, + "grad_norm": 1.6944808959960938, + "learning_rate": 1.9061583577712612e-05, + "loss": 0.3041, + "step": 10450 + }, + { + "epoch": 1.6360363180964308, + "grad_norm": 1.4346870183944702, + "learning_rate": 1.90534376018247e-05, + "loss": 0.4468, + "step": 10451 + }, + { + "epoch": 1.636192861615529, + "grad_norm": 1.8415066003799438, + "learning_rate": 1.9045291625936786e-05, + "loss": 0.3455, + "step": 10452 + }, + { + "epoch": 1.6363494051346275, + "grad_norm": 1.3417218923568726, + "learning_rate": 1.9037145650048877e-05, + "loss": 0.5414, + "step": 10453 + }, + { + "epoch": 1.636505948653726, + "grad_norm": 1.4156824350357056, + "learning_rate": 1.9028999674160967e-05, + "loss": 0.3822, + "step": 10454 + }, + { + "epoch": 1.6366624921728241, + "grad_norm": 1.1268956661224365, + "learning_rate": 1.9020853698273054e-05, + "loss": 0.3603, + "step": 10455 + }, + { + "epoch": 1.6368190356919223, + "grad_norm": 0.9540091753005981, + "learning_rate": 1.901270772238514e-05, + "loss": 0.2579, + "step": 10456 + }, + { + "epoch": 1.6369755792110205, + "grad_norm": 0.6858832240104675, + "learning_rate": 1.9004561746497232e-05, + "loss": 0.1559, + "step": 10457 + }, + { + "epoch": 1.637132122730119, + "grad_norm": 2.2919726371765137, + "learning_rate": 1.899641577060932e-05, + "loss": 0.3928, + "step": 10458 + }, + { + "epoch": 1.6372886662492174, + "grad_norm": 5.207332611083984, + "learning_rate": 1.898826979472141e-05, + "loss": 0.536, + "step": 10459 + }, + { + "epoch": 
1.6374452097683156, + "grad_norm": 1.2791147232055664, + "learning_rate": 1.8980123818833497e-05, + "loss": 0.328, + "step": 10460 + }, + { + "epoch": 1.6376017532874139, + "grad_norm": 2.2712600231170654, + "learning_rate": 1.8971977842945584e-05, + "loss": 0.3607, + "step": 10461 + }, + { + "epoch": 1.637758296806512, + "grad_norm": 2.513448476791382, + "learning_rate": 1.8963831867057675e-05, + "loss": 0.5717, + "step": 10462 + }, + { + "epoch": 1.6379148403256105, + "grad_norm": 1.5005974769592285, + "learning_rate": 1.8955685891169765e-05, + "loss": 0.3209, + "step": 10463 + }, + { + "epoch": 1.638071383844709, + "grad_norm": 1.7003542184829712, + "learning_rate": 1.8947539915281852e-05, + "loss": 0.3342, + "step": 10464 + }, + { + "epoch": 1.6382279273638072, + "grad_norm": 2.6806154251098633, + "learning_rate": 1.893939393939394e-05, + "loss": 0.448, + "step": 10465 + }, + { + "epoch": 1.6383844708829054, + "grad_norm": 1.799599289894104, + "learning_rate": 1.893124796350603e-05, + "loss": 0.7687, + "step": 10466 + }, + { + "epoch": 1.6385410144020036, + "grad_norm": 2.265245199203491, + "learning_rate": 1.8923101987618117e-05, + "loss": 0.4969, + "step": 10467 + }, + { + "epoch": 1.638697557921102, + "grad_norm": 1.8843276500701904, + "learning_rate": 1.8914956011730208e-05, + "loss": 0.3539, + "step": 10468 + }, + { + "epoch": 1.6388541014402005, + "grad_norm": 4.397439479827881, + "learning_rate": 1.8906810035842295e-05, + "loss": 0.7999, + "step": 10469 + }, + { + "epoch": 1.6390106449592987, + "grad_norm": 1.657614827156067, + "learning_rate": 1.8898664059954382e-05, + "loss": 0.52, + "step": 10470 + }, + { + "epoch": 1.639167188478397, + "grad_norm": 6.214556694030762, + "learning_rate": 1.8890518084066472e-05, + "loss": 1.0033, + "step": 10471 + }, + { + "epoch": 1.6393237319974951, + "grad_norm": 3.1205849647521973, + "learning_rate": 1.8882372108178563e-05, + "loss": 0.8434, + "step": 10472 + }, + { + "epoch": 1.6394802755165936, + "grad_norm": 
2.4963576793670654, + "learning_rate": 1.8874226132290647e-05, + "loss": 0.881, + "step": 10473 + }, + { + "epoch": 1.639636819035692, + "grad_norm": 8.218254089355469, + "learning_rate": 1.8866080156402737e-05, + "loss": 0.9169, + "step": 10474 + }, + { + "epoch": 1.6397933625547902, + "grad_norm": 4.61908483505249, + "learning_rate": 1.8857934180514828e-05, + "loss": 0.7904, + "step": 10475 + }, + { + "epoch": 1.6399499060738885, + "grad_norm": 1.7322115898132324, + "learning_rate": 1.8849788204626915e-05, + "loss": 0.5687, + "step": 10476 + }, + { + "epoch": 1.640106449592987, + "grad_norm": 3.0872113704681396, + "learning_rate": 1.8841642228739005e-05, + "loss": 0.6594, + "step": 10477 + }, + { + "epoch": 1.6402629931120851, + "grad_norm": 4.348513603210449, + "learning_rate": 1.8833496252851092e-05, + "loss": 1.2372, + "step": 10478 + }, + { + "epoch": 1.6404195366311836, + "grad_norm": 3.6448991298675537, + "learning_rate": 1.882535027696318e-05, + "loss": 1.3721, + "step": 10479 + }, + { + "epoch": 1.6405760801502818, + "grad_norm": 9.712137222290039, + "learning_rate": 1.881720430107527e-05, + "loss": 1.2509, + "step": 10480 + }, + { + "epoch": 1.64073262366938, + "grad_norm": 3.8527541160583496, + "learning_rate": 1.880905832518736e-05, + "loss": 0.6989, + "step": 10481 + }, + { + "epoch": 1.6408891671884784, + "grad_norm": 2.073819875717163, + "learning_rate": 1.8800912349299444e-05, + "loss": 1.2777, + "step": 10482 + }, + { + "epoch": 1.6410457107075767, + "grad_norm": 5.369659423828125, + "learning_rate": 1.8792766373411535e-05, + "loss": 0.7038, + "step": 10483 + }, + { + "epoch": 1.641202254226675, + "grad_norm": 5.677603244781494, + "learning_rate": 1.8784620397523625e-05, + "loss": 0.7346, + "step": 10484 + }, + { + "epoch": 1.6413587977457733, + "grad_norm": 2.1812517642974854, + "learning_rate": 1.8776474421635713e-05, + "loss": 0.4874, + "step": 10485 + }, + { + "epoch": 1.6415153412648715, + "grad_norm": 3.9122581481933594, + "learning_rate": 
1.8768328445747803e-05, + "loss": 0.265, + "step": 10486 + }, + { + "epoch": 1.64167188478397, + "grad_norm": 4.69173002243042, + "learning_rate": 1.876018246985989e-05, + "loss": 1.1737, + "step": 10487 + }, + { + "epoch": 1.6418284283030684, + "grad_norm": 3.5798797607421875, + "learning_rate": 1.8752036493971977e-05, + "loss": 1.0261, + "step": 10488 + }, + { + "epoch": 1.6419849718221666, + "grad_norm": 0.7883862257003784, + "learning_rate": 1.8743890518084068e-05, + "loss": 0.1632, + "step": 10489 + }, + { + "epoch": 1.6421415153412648, + "grad_norm": 0.756984293460846, + "learning_rate": 1.873574454219616e-05, + "loss": 0.223, + "step": 10490 + }, + { + "epoch": 1.642298058860363, + "grad_norm": 0.6652536392211914, + "learning_rate": 1.8727598566308242e-05, + "loss": 0.3277, + "step": 10491 + }, + { + "epoch": 1.6424546023794615, + "grad_norm": 0.7470530271530151, + "learning_rate": 1.8719452590420333e-05, + "loss": 0.2387, + "step": 10492 + }, + { + "epoch": 1.64261114589856, + "grad_norm": 0.38469719886779785, + "learning_rate": 1.8711306614532423e-05, + "loss": 0.1798, + "step": 10493 + }, + { + "epoch": 1.6427676894176582, + "grad_norm": 0.5543297529220581, + "learning_rate": 1.870316063864451e-05, + "loss": 0.1849, + "step": 10494 + }, + { + "epoch": 1.6429242329367564, + "grad_norm": 0.4288400709629059, + "learning_rate": 1.8695014662756598e-05, + "loss": 0.1185, + "step": 10495 + }, + { + "epoch": 1.6430807764558546, + "grad_norm": 0.6918376684188843, + "learning_rate": 1.8686868686868688e-05, + "loss": 0.1763, + "step": 10496 + }, + { + "epoch": 1.643237319974953, + "grad_norm": 1.2025117874145508, + "learning_rate": 1.8678722710980775e-05, + "loss": 0.3773, + "step": 10497 + }, + { + "epoch": 1.6433938634940515, + "grad_norm": 1.1309044361114502, + "learning_rate": 1.8670576735092866e-05, + "loss": 0.2554, + "step": 10498 + }, + { + "epoch": 1.6435504070131497, + "grad_norm": 0.974139392375946, + "learning_rate": 1.8662430759204956e-05, + "loss": 
0.2452, + "step": 10499 + }, + { + "epoch": 1.643706950532248, + "grad_norm": 1.0606727600097656, + "learning_rate": 1.865428478331704e-05, + "loss": 0.37, + "step": 10500 + }, + { + "epoch": 1.6438634940513461, + "grad_norm": 0.7894330024719238, + "learning_rate": 1.864613880742913e-05, + "loss": 0.2543, + "step": 10501 + }, + { + "epoch": 1.6440200375704446, + "grad_norm": 1.6304712295532227, + "learning_rate": 1.863799283154122e-05, + "loss": 0.4082, + "step": 10502 + }, + { + "epoch": 1.644176581089543, + "grad_norm": 1.8314026594161987, + "learning_rate": 1.8629846855653308e-05, + "loss": 0.5395, + "step": 10503 + }, + { + "epoch": 1.6443331246086412, + "grad_norm": 1.3564115762710571, + "learning_rate": 1.8621700879765395e-05, + "loss": 0.4099, + "step": 10504 + }, + { + "epoch": 1.6444896681277394, + "grad_norm": 1.2427698373794556, + "learning_rate": 1.8613554903877486e-05, + "loss": 0.3883, + "step": 10505 + }, + { + "epoch": 1.6446462116468377, + "grad_norm": 2.610727310180664, + "learning_rate": 1.8605408927989573e-05, + "loss": 0.592, + "step": 10506 + }, + { + "epoch": 1.644802755165936, + "grad_norm": 2.9664864540100098, + "learning_rate": 1.8597262952101664e-05, + "loss": 0.4026, + "step": 10507 + }, + { + "epoch": 1.6449592986850345, + "grad_norm": 1.502127766609192, + "learning_rate": 1.8589116976213754e-05, + "loss": 0.3093, + "step": 10508 + }, + { + "epoch": 1.6451158422041328, + "grad_norm": 2.3732850551605225, + "learning_rate": 1.8580971000325838e-05, + "loss": 0.3656, + "step": 10509 + }, + { + "epoch": 1.645272385723231, + "grad_norm": 2.8172760009765625, + "learning_rate": 1.857282502443793e-05, + "loss": 0.7389, + "step": 10510 + }, + { + "epoch": 1.6454289292423294, + "grad_norm": 2.821566343307495, + "learning_rate": 1.856467904855002e-05, + "loss": 0.6717, + "step": 10511 + }, + { + "epoch": 1.6455854727614276, + "grad_norm": 2.025930643081665, + "learning_rate": 1.8556533072662106e-05, + "loss": 0.4698, + "step": 10512 + }, + { + 
"epoch": 1.645742016280526, + "grad_norm": 1.8048934936523438, + "learning_rate": 1.8548387096774193e-05, + "loss": 0.3937, + "step": 10513 + }, + { + "epoch": 1.6458985597996243, + "grad_norm": 3.688737154006958, + "learning_rate": 1.8540241120886284e-05, + "loss": 0.5115, + "step": 10514 + }, + { + "epoch": 1.6460551033187225, + "grad_norm": 2.016831636428833, + "learning_rate": 1.853209514499837e-05, + "loss": 0.5547, + "step": 10515 + }, + { + "epoch": 1.646211646837821, + "grad_norm": 4.018406867980957, + "learning_rate": 1.852394916911046e-05, + "loss": 0.8293, + "step": 10516 + }, + { + "epoch": 1.6463681903569192, + "grad_norm": 1.454301118850708, + "learning_rate": 1.851580319322255e-05, + "loss": 0.3424, + "step": 10517 + }, + { + "epoch": 1.6465247338760176, + "grad_norm": 1.8230268955230713, + "learning_rate": 1.8507657217334636e-05, + "loss": 0.3279, + "step": 10518 + }, + { + "epoch": 1.6466812773951158, + "grad_norm": 1.872459888458252, + "learning_rate": 1.8499511241446726e-05, + "loss": 0.4709, + "step": 10519 + }, + { + "epoch": 1.646837820914214, + "grad_norm": 1.787115216255188, + "learning_rate": 1.8491365265558817e-05, + "loss": 0.5852, + "step": 10520 + }, + { + "epoch": 1.6469943644333125, + "grad_norm": 1.6910972595214844, + "learning_rate": 1.8483219289670904e-05, + "loss": 0.3302, + "step": 10521 + }, + { + "epoch": 1.647150907952411, + "grad_norm": 3.1752471923828125, + "learning_rate": 1.847507331378299e-05, + "loss": 0.5311, + "step": 10522 + }, + { + "epoch": 1.6473074514715091, + "grad_norm": 1.8416131734848022, + "learning_rate": 1.846692733789508e-05, + "loss": 0.7343, + "step": 10523 + }, + { + "epoch": 1.6474639949906074, + "grad_norm": 4.845549583435059, + "learning_rate": 1.845878136200717e-05, + "loss": 1.141, + "step": 10524 + }, + { + "epoch": 1.6476205385097056, + "grad_norm": 2.4235122203826904, + "learning_rate": 1.845063538611926e-05, + "loss": 0.5773, + "step": 10525 + }, + { + "epoch": 1.647777082028804, + "grad_norm": 
2.7588307857513428, + "learning_rate": 1.8442489410231346e-05, + "loss": 0.5784, + "step": 10526 + }, + { + "epoch": 1.6479336255479025, + "grad_norm": 5.646317005157471, + "learning_rate": 1.8434343434343433e-05, + "loss": 1.0736, + "step": 10527 + }, + { + "epoch": 1.6480901690670007, + "grad_norm": 4.926019191741943, + "learning_rate": 1.8426197458455524e-05, + "loss": 1.4173, + "step": 10528 + }, + { + "epoch": 1.648246712586099, + "grad_norm": 3.313913583755493, + "learning_rate": 1.8418051482567614e-05, + "loss": 0.4662, + "step": 10529 + }, + { + "epoch": 1.6484032561051971, + "grad_norm": 6.088518142700195, + "learning_rate": 1.84099055066797e-05, + "loss": 1.3867, + "step": 10530 + }, + { + "epoch": 1.6485597996242956, + "grad_norm": 4.187114715576172, + "learning_rate": 1.840175953079179e-05, + "loss": 1.4968, + "step": 10531 + }, + { + "epoch": 1.648716343143394, + "grad_norm": 1.7943652868270874, + "learning_rate": 1.839361355490388e-05, + "loss": 1.0489, + "step": 10532 + }, + { + "epoch": 1.6488728866624922, + "grad_norm": 3.7895212173461914, + "learning_rate": 1.8385467579015966e-05, + "loss": 1.3061, + "step": 10533 + }, + { + "epoch": 1.6490294301815904, + "grad_norm": 2.7494237422943115, + "learning_rate": 1.8377321603128057e-05, + "loss": 0.8524, + "step": 10534 + }, + { + "epoch": 1.6491859737006886, + "grad_norm": 2.3531219959259033, + "learning_rate": 1.8369175627240144e-05, + "loss": 0.4926, + "step": 10535 + }, + { + "epoch": 1.649342517219787, + "grad_norm": 4.134480953216553, + "learning_rate": 1.836102965135223e-05, + "loss": 0.6415, + "step": 10536 + }, + { + "epoch": 1.6494990607388855, + "grad_norm": 4.211005210876465, + "learning_rate": 1.835288367546432e-05, + "loss": 0.6828, + "step": 10537 + }, + { + "epoch": 1.6496556042579837, + "grad_norm": 3.0589451789855957, + "learning_rate": 1.8344737699576412e-05, + "loss": 0.7836, + "step": 10538 + }, + { + "epoch": 1.649812147777082, + "grad_norm": 0.3590056896209717, + "learning_rate": 
1.83365917236885e-05, + "loss": 0.2022, + "step": 10539 + }, + { + "epoch": 1.6499686912961802, + "grad_norm": 0.6708316206932068, + "learning_rate": 1.8328445747800586e-05, + "loss": 0.1931, + "step": 10540 + }, + { + "epoch": 1.6501252348152786, + "grad_norm": 0.381491094827652, + "learning_rate": 1.8320299771912677e-05, + "loss": 0.1529, + "step": 10541 + }, + { + "epoch": 1.650281778334377, + "grad_norm": 0.5007537007331848, + "learning_rate": 1.8312153796024764e-05, + "loss": 0.1548, + "step": 10542 + }, + { + "epoch": 1.6504383218534753, + "grad_norm": 0.8182041049003601, + "learning_rate": 1.8304007820136855e-05, + "loss": 0.2212, + "step": 10543 + }, + { + "epoch": 1.6505948653725735, + "grad_norm": 1.4365794658660889, + "learning_rate": 1.8295861844248942e-05, + "loss": 0.2391, + "step": 10544 + }, + { + "epoch": 1.650751408891672, + "grad_norm": 0.8610919117927551, + "learning_rate": 1.828771586836103e-05, + "loss": 0.2637, + "step": 10545 + }, + { + "epoch": 1.6509079524107702, + "grad_norm": 0.8198899626731873, + "learning_rate": 1.827956989247312e-05, + "loss": 0.3198, + "step": 10546 + }, + { + "epoch": 1.6510644959298686, + "grad_norm": 1.2725788354873657, + "learning_rate": 1.827142391658521e-05, + "loss": 0.2035, + "step": 10547 + }, + { + "epoch": 1.6512210394489668, + "grad_norm": 0.9212202429771423, + "learning_rate": 1.8263277940697297e-05, + "loss": 0.2692, + "step": 10548 + }, + { + "epoch": 1.651377582968065, + "grad_norm": 0.847491979598999, + "learning_rate": 1.8255131964809384e-05, + "loss": 0.2754, + "step": 10549 + }, + { + "epoch": 1.6515341264871635, + "grad_norm": 2.70503830909729, + "learning_rate": 1.8246985988921475e-05, + "loss": 0.2712, + "step": 10550 + }, + { + "epoch": 1.6516906700062617, + "grad_norm": 1.2173309326171875, + "learning_rate": 1.8238840013033562e-05, + "loss": 0.4128, + "step": 10551 + }, + { + "epoch": 1.6518472135253601, + "grad_norm": 1.1534054279327393, + "learning_rate": 1.8230694037145652e-05, + "loss": 
0.2259, + "step": 10552 + }, + { + "epoch": 1.6520037570444583, + "grad_norm": 0.9784389138221741, + "learning_rate": 1.822254806125774e-05, + "loss": 0.363, + "step": 10553 + }, + { + "epoch": 1.6521603005635566, + "grad_norm": 1.022369146347046, + "learning_rate": 1.8214402085369827e-05, + "loss": 0.2543, + "step": 10554 + }, + { + "epoch": 1.652316844082655, + "grad_norm": 1.5359954833984375, + "learning_rate": 1.8206256109481917e-05, + "loss": 0.319, + "step": 10555 + }, + { + "epoch": 1.6524733876017534, + "grad_norm": 1.443113088607788, + "learning_rate": 1.8198110133594004e-05, + "loss": 0.3573, + "step": 10556 + }, + { + "epoch": 1.6526299311208517, + "grad_norm": 0.9496574997901917, + "learning_rate": 1.8189964157706095e-05, + "loss": 0.2859, + "step": 10557 + }, + { + "epoch": 1.6527864746399499, + "grad_norm": 1.932607889175415, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.4025, + "step": 10558 + }, + { + "epoch": 1.652943018159048, + "grad_norm": 1.9212493896484375, + "learning_rate": 1.817367220593027e-05, + "loss": 0.4343, + "step": 10559 + }, + { + "epoch": 1.6530995616781465, + "grad_norm": 2.273987054824829, + "learning_rate": 1.816552623004236e-05, + "loss": 0.3141, + "step": 10560 + }, + { + "epoch": 1.653256105197245, + "grad_norm": 2.5833003520965576, + "learning_rate": 1.815738025415445e-05, + "loss": 0.3694, + "step": 10561 + }, + { + "epoch": 1.6534126487163432, + "grad_norm": 1.4924031496047974, + "learning_rate": 1.8149234278266537e-05, + "loss": 0.3777, + "step": 10562 + }, + { + "epoch": 1.6535691922354414, + "grad_norm": 1.8515427112579346, + "learning_rate": 1.8141088302378625e-05, + "loss": 0.3225, + "step": 10563 + }, + { + "epoch": 1.6537257357545396, + "grad_norm": 2.1420302391052246, + "learning_rate": 1.8132942326490715e-05, + "loss": 0.5157, + "step": 10564 + }, + { + "epoch": 1.653882279273638, + "grad_norm": 2.0534403324127197, + "learning_rate": 1.8124796350602802e-05, + "loss": 0.6893, + "step": 10565 + }, + { + 
"epoch": 1.6540388227927365, + "grad_norm": 2.754812717437744, + "learning_rate": 1.8116650374714893e-05, + "loss": 0.629, + "step": 10566 + }, + { + "epoch": 1.6541953663118347, + "grad_norm": 1.6254382133483887, + "learning_rate": 1.810850439882698e-05, + "loss": 0.4514, + "step": 10567 + }, + { + "epoch": 1.654351909830933, + "grad_norm": 3.9840450286865234, + "learning_rate": 1.8100358422939067e-05, + "loss": 0.6012, + "step": 10568 + }, + { + "epoch": 1.6545084533500312, + "grad_norm": 3.472200870513916, + "learning_rate": 1.8092212447051158e-05, + "loss": 0.803, + "step": 10569 + }, + { + "epoch": 1.6546649968691296, + "grad_norm": 1.7345845699310303, + "learning_rate": 1.8084066471163248e-05, + "loss": 0.4838, + "step": 10570 + }, + { + "epoch": 1.654821540388228, + "grad_norm": 3.820539712905884, + "learning_rate": 1.8075920495275335e-05, + "loss": 0.8839, + "step": 10571 + }, + { + "epoch": 1.6549780839073263, + "grad_norm": 3.3414077758789062, + "learning_rate": 1.8067774519387422e-05, + "loss": 1.176, + "step": 10572 + }, + { + "epoch": 1.6551346274264245, + "grad_norm": 2.756244421005249, + "learning_rate": 1.8059628543499513e-05, + "loss": 0.4607, + "step": 10573 + }, + { + "epoch": 1.6552911709455227, + "grad_norm": 3.492840051651001, + "learning_rate": 1.80514825676116e-05, + "loss": 1.1212, + "step": 10574 + }, + { + "epoch": 1.6554477144646211, + "grad_norm": 3.9332220554351807, + "learning_rate": 1.804333659172369e-05, + "loss": 0.993, + "step": 10575 + }, + { + "epoch": 1.6556042579837196, + "grad_norm": 1.9590574502944946, + "learning_rate": 1.8035190615835778e-05, + "loss": 0.5943, + "step": 10576 + }, + { + "epoch": 1.6557608015028178, + "grad_norm": 4.627928256988525, + "learning_rate": 1.8027044639947865e-05, + "loss": 1.5429, + "step": 10577 + }, + { + "epoch": 1.655917345021916, + "grad_norm": 5.595308303833008, + "learning_rate": 1.8018898664059955e-05, + "loss": 1.0205, + "step": 10578 + }, + { + "epoch": 1.6560738885410144, + 
"grad_norm": 8.836652755737305, + "learning_rate": 1.8010752688172046e-05, + "loss": 1.4687, + "step": 10579 + }, + { + "epoch": 1.6562304320601127, + "grad_norm": 3.4835598468780518, + "learning_rate": 1.8002606712284133e-05, + "loss": 1.5541, + "step": 10580 + }, + { + "epoch": 1.656386975579211, + "grad_norm": 8.778911590576172, + "learning_rate": 1.799446073639622e-05, + "loss": 1.5601, + "step": 10581 + }, + { + "epoch": 1.6565435190983093, + "grad_norm": 1.5130410194396973, + "learning_rate": 1.798631476050831e-05, + "loss": 0.6957, + "step": 10582 + }, + { + "epoch": 1.6567000626174075, + "grad_norm": 3.543071746826172, + "learning_rate": 1.7978168784620398e-05, + "loss": 1.0516, + "step": 10583 + }, + { + "epoch": 1.656856606136506, + "grad_norm": NaN, + "learning_rate": 1.7978168784620398e-05, + "loss": 0.0, + "step": 10584 + }, + { + "epoch": 1.6570131496556044, + "grad_norm": 1.8775664567947388, + "learning_rate": 1.7970022808732488e-05, + "loss": 0.6027, + "step": 10585 + }, + { + "epoch": 1.6571696931747026, + "grad_norm": 3.628800392150879, + "learning_rate": 1.7961876832844575e-05, + "loss": 0.6545, + "step": 10586 + }, + { + "epoch": 1.6573262366938009, + "grad_norm": 3.224200963973999, + "learning_rate": 1.7953730856956663e-05, + "loss": 1.1387, + "step": 10587 + }, + { + "epoch": 1.657482780212899, + "grad_norm": 2.53369140625, + "learning_rate": 1.7945584881068753e-05, + "loss": 1.0349, + "step": 10588 + }, + { + "epoch": 1.6576393237319975, + "grad_norm": 0.799846351146698, + "learning_rate": 1.7937438905180844e-05, + "loss": 0.2416, + "step": 10589 + }, + { + "epoch": 1.657795867251096, + "grad_norm": 0.4707060754299164, + "learning_rate": 1.7929292929292927e-05, + "loss": 0.202, + "step": 10590 + }, + { + "epoch": 1.6579524107701942, + "grad_norm": 0.7624747157096863, + "learning_rate": 1.7921146953405018e-05, + "loss": 0.2207, + "step": 10591 + }, + { + "epoch": 1.6581089542892924, + "grad_norm": 0.5540100932121277, + "learning_rate": 
1.791300097751711e-05, + "loss": 0.1808, + "step": 10592 + }, + { + "epoch": 1.6582654978083906, + "grad_norm": 0.5180730223655701, + "learning_rate": 1.7904855001629196e-05, + "loss": 0.2087, + "step": 10593 + }, + { + "epoch": 1.658422041327489, + "grad_norm": 0.6686544418334961, + "learning_rate": 1.7896709025741286e-05, + "loss": 0.2374, + "step": 10594 + }, + { + "epoch": 1.6585785848465875, + "grad_norm": 1.5307831764221191, + "learning_rate": 1.7888563049853373e-05, + "loss": 0.3269, + "step": 10595 + }, + { + "epoch": 1.6587351283656857, + "grad_norm": 0.6407783031463623, + "learning_rate": 1.788041707396546e-05, + "loss": 0.3055, + "step": 10596 + }, + { + "epoch": 1.658891671884784, + "grad_norm": 0.9872055053710938, + "learning_rate": 1.787227109807755e-05, + "loss": 0.152, + "step": 10597 + }, + { + "epoch": 1.6590482154038821, + "grad_norm": 1.14183509349823, + "learning_rate": 1.786412512218964e-05, + "loss": 0.3095, + "step": 10598 + }, + { + "epoch": 1.6592047589229806, + "grad_norm": 1.5578346252441406, + "learning_rate": 1.7855979146301725e-05, + "loss": 0.4721, + "step": 10599 + }, + { + "epoch": 1.659361302442079, + "grad_norm": 0.5551625490188599, + "learning_rate": 1.7847833170413816e-05, + "loss": 0.1558, + "step": 10600 + }, + { + "epoch": 1.6595178459611772, + "grad_norm": 0.762615978717804, + "learning_rate": 1.7839687194525906e-05, + "loss": 0.242, + "step": 10601 + }, + { + "epoch": 1.6596743894802755, + "grad_norm": 1.6262027025222778, + "learning_rate": 1.7831541218637993e-05, + "loss": 0.3308, + "step": 10602 + }, + { + "epoch": 1.6598309329993737, + "grad_norm": 0.7559671401977539, + "learning_rate": 1.7823395242750084e-05, + "loss": 0.2548, + "step": 10603 + }, + { + "epoch": 1.6599874765184721, + "grad_norm": 1.7238185405731201, + "learning_rate": 1.781524926686217e-05, + "loss": 0.4473, + "step": 10604 + }, + { + "epoch": 1.6601440200375706, + "grad_norm": 1.5889941453933716, + "learning_rate": 1.7807103290974258e-05, + "loss": 
0.446, + "step": 10605 + }, + { + "epoch": 1.6603005635566688, + "grad_norm": 1.2120749950408936, + "learning_rate": 1.779895731508635e-05, + "loss": 0.2628, + "step": 10606 + }, + { + "epoch": 1.660457107075767, + "grad_norm": 0.8435314893722534, + "learning_rate": 1.779081133919844e-05, + "loss": 0.1482, + "step": 10607 + }, + { + "epoch": 1.6606136505948652, + "grad_norm": 1.4240500926971436, + "learning_rate": 1.7782665363310523e-05, + "loss": 0.1846, + "step": 10608 + }, + { + "epoch": 1.6607701941139636, + "grad_norm": 1.3905706405639648, + "learning_rate": 1.7774519387422613e-05, + "loss": 0.2374, + "step": 10609 + }, + { + "epoch": 1.660926737633062, + "grad_norm": 1.2325713634490967, + "learning_rate": 1.7766373411534704e-05, + "loss": 0.4726, + "step": 10610 + }, + { + "epoch": 1.6610832811521603, + "grad_norm": 2.1726176738739014, + "learning_rate": 1.775822743564679e-05, + "loss": 0.4449, + "step": 10611 + }, + { + "epoch": 1.6612398246712585, + "grad_norm": 1.5986297130584717, + "learning_rate": 1.7750081459758878e-05, + "loss": 0.5422, + "step": 10612 + }, + { + "epoch": 1.661396368190357, + "grad_norm": 1.7150894403457642, + "learning_rate": 1.774193548387097e-05, + "loss": 0.4116, + "step": 10613 + }, + { + "epoch": 1.6615529117094552, + "grad_norm": 2.091102123260498, + "learning_rate": 1.7733789507983056e-05, + "loss": 0.5327, + "step": 10614 + }, + { + "epoch": 1.6617094552285536, + "grad_norm": 2.5948214530944824, + "learning_rate": 1.7725643532095146e-05, + "loss": 0.5052, + "step": 10615 + }, + { + "epoch": 1.6618659987476518, + "grad_norm": 4.558361053466797, + "learning_rate": 1.7717497556207237e-05, + "loss": 0.8733, + "step": 10616 + }, + { + "epoch": 1.66202254226675, + "grad_norm": 2.3315396308898926, + "learning_rate": 1.770935158031932e-05, + "loss": 0.5619, + "step": 10617 + }, + { + "epoch": 1.6621790857858485, + "grad_norm": 2.5836989879608154, + "learning_rate": 1.770120560443141e-05, + "loss": 0.7879, + "step": 10618 + }, + { + 
"epoch": 1.662335629304947, + "grad_norm": 2.6634888648986816, + "learning_rate": 1.7693059628543502e-05, + "loss": 0.7805, + "step": 10619 + }, + { + "epoch": 1.6624921728240452, + "grad_norm": 3.213376045227051, + "learning_rate": 1.768491365265559e-05, + "loss": 0.8346, + "step": 10620 + }, + { + "epoch": 1.6626487163431434, + "grad_norm": 1.8537222146987915, + "learning_rate": 1.7676767676767676e-05, + "loss": 0.4213, + "step": 10621 + }, + { + "epoch": 1.6628052598622416, + "grad_norm": 2.966496706008911, + "learning_rate": 1.7668621700879767e-05, + "loss": 0.5277, + "step": 10622 + }, + { + "epoch": 1.66296180338134, + "grad_norm": 6.182995319366455, + "learning_rate": 1.7660475724991854e-05, + "loss": 0.7999, + "step": 10623 + }, + { + "epoch": 1.6631183469004385, + "grad_norm": 3.163299560546875, + "learning_rate": 1.7652329749103944e-05, + "loss": 0.9887, + "step": 10624 + }, + { + "epoch": 1.6632748904195367, + "grad_norm": 3.838970422744751, + "learning_rate": 1.7644183773216035e-05, + "loss": 1.0528, + "step": 10625 + }, + { + "epoch": 1.663431433938635, + "grad_norm": 3.786155939102173, + "learning_rate": 1.763603779732812e-05, + "loss": 0.546, + "step": 10626 + }, + { + "epoch": 1.6635879774577331, + "grad_norm": 2.2259976863861084, + "learning_rate": 1.762789182144021e-05, + "loss": 0.9043, + "step": 10627 + }, + { + "epoch": 1.6637445209768316, + "grad_norm": 3.3571889400482178, + "learning_rate": 1.76197458455523e-05, + "loss": 0.8942, + "step": 10628 + }, + { + "epoch": 1.66390106449593, + "grad_norm": 4.330054759979248, + "learning_rate": 1.7611599869664387e-05, + "loss": 1.0563, + "step": 10629 + }, + { + "epoch": 1.6640576080150282, + "grad_norm": 2.7561450004577637, + "learning_rate": 1.7603453893776474e-05, + "loss": 1.2318, + "step": 10630 + }, + { + "epoch": 1.6642141515341264, + "grad_norm": 3.3229217529296875, + "learning_rate": 1.7595307917888564e-05, + "loss": 1.6032, + "step": 10631 + }, + { + "epoch": 1.6643706950532247, + 
"grad_norm": 4.740453243255615, + "learning_rate": 1.758716194200065e-05, + "loss": 1.3419, + "step": 10632 + }, + { + "epoch": 1.664527238572323, + "grad_norm": 1.8521314859390259, + "learning_rate": 1.7579015966112742e-05, + "loss": 0.8976, + "step": 10633 + }, + { + "epoch": 1.6646837820914215, + "grad_norm": 3.7052669525146484, + "learning_rate": 1.757086999022483e-05, + "loss": 1.1802, + "step": 10634 + }, + { + "epoch": 1.6648403256105198, + "grad_norm": 2.0368690490722656, + "learning_rate": 1.7562724014336916e-05, + "loss": 0.6979, + "step": 10635 + }, + { + "epoch": 1.664996869129618, + "grad_norm": 3.547532796859741, + "learning_rate": 1.7554578038449007e-05, + "loss": 0.7969, + "step": 10636 + }, + { + "epoch": 1.6651534126487162, + "grad_norm": 5.864954471588135, + "learning_rate": 1.7546432062561097e-05, + "loss": 0.8398, + "step": 10637 + }, + { + "epoch": 1.6653099561678146, + "grad_norm": 4.420790672302246, + "learning_rate": 1.7538286086673185e-05, + "loss": 1.1865, + "step": 10638 + }, + { + "epoch": 1.665466499686913, + "grad_norm": 1.1587281227111816, + "learning_rate": 1.753014011078527e-05, + "loss": 0.6271, + "step": 10639 + }, + { + "epoch": 1.6656230432060113, + "grad_norm": 0.4132951498031616, + "learning_rate": 1.7521994134897362e-05, + "loss": 0.1833, + "step": 10640 + }, + { + "epoch": 1.6657795867251095, + "grad_norm": 0.8129231333732605, + "learning_rate": 1.751384815900945e-05, + "loss": 0.2733, + "step": 10641 + }, + { + "epoch": 1.6659361302442077, + "grad_norm": 0.9017931222915649, + "learning_rate": 1.750570218312154e-05, + "loss": 0.2534, + "step": 10642 + }, + { + "epoch": 1.6660926737633062, + "grad_norm": 0.7999629378318787, + "learning_rate": 1.7497556207233627e-05, + "loss": 0.2764, + "step": 10643 + }, + { + "epoch": 1.6662492172824046, + "grad_norm": 0.7539124488830566, + "learning_rate": 1.7489410231345714e-05, + "loss": 0.3253, + "step": 10644 + }, + { + "epoch": 1.6664057608015028, + "grad_norm": 0.5520537495613098, + 
"learning_rate": 1.7481264255457805e-05, + "loss": 0.1617, + "step": 10645 + }, + { + "epoch": 1.666562304320601, + "grad_norm": 1.1641591787338257, + "learning_rate": 1.7473118279569895e-05, + "loss": 0.2499, + "step": 10646 + }, + { + "epoch": 1.6667188478396995, + "grad_norm": 0.8076048493385315, + "learning_rate": 1.7464972303681982e-05, + "loss": 0.2431, + "step": 10647 + }, + { + "epoch": 1.6668753913587977, + "grad_norm": 1.0835829973220825, + "learning_rate": 1.745682632779407e-05, + "loss": 0.2818, + "step": 10648 + }, + { + "epoch": 1.6670319348778961, + "grad_norm": 1.0381416082382202, + "learning_rate": 1.744868035190616e-05, + "loss": 0.3314, + "step": 10649 + }, + { + "epoch": 1.6671884783969944, + "grad_norm": 1.025383710861206, + "learning_rate": 1.7440534376018247e-05, + "loss": 0.4069, + "step": 10650 + }, + { + "epoch": 1.6673450219160926, + "grad_norm": 1.1895415782928467, + "learning_rate": 1.7432388400130338e-05, + "loss": 0.2927, + "step": 10651 + }, + { + "epoch": 1.667501565435191, + "grad_norm": 1.6129794120788574, + "learning_rate": 1.7424242424242425e-05, + "loss": 0.4202, + "step": 10652 + }, + { + "epoch": 1.6676581089542895, + "grad_norm": 1.6220108270645142, + "learning_rate": 1.7416096448354512e-05, + "loss": 0.4774, + "step": 10653 + }, + { + "epoch": 1.6678146524733877, + "grad_norm": 2.3107805252075195, + "learning_rate": 1.7407950472466602e-05, + "loss": 0.3792, + "step": 10654 + }, + { + "epoch": 1.6679711959924859, + "grad_norm": 1.4484260082244873, + "learning_rate": 1.7399804496578693e-05, + "loss": 0.3157, + "step": 10655 + }, + { + "epoch": 1.668127739511584, + "grad_norm": 1.414920687675476, + "learning_rate": 1.739165852069078e-05, + "loss": 0.4033, + "step": 10656 + }, + { + "epoch": 1.6682842830306825, + "grad_norm": 1.3913631439208984, + "learning_rate": 1.7383512544802867e-05, + "loss": 0.3115, + "step": 10657 + }, + { + "epoch": 1.668440826549781, + "grad_norm": 2.336583375930786, + "learning_rate": 
1.7375366568914958e-05, + "loss": 0.6219, + "step": 10658 + }, + { + "epoch": 1.6685973700688792, + "grad_norm": 15.505657196044922, + "learning_rate": 1.7367220593027045e-05, + "loss": 1.1709, + "step": 10659 + }, + { + "epoch": 1.6687539135879774, + "grad_norm": 1.5668660402297974, + "learning_rate": 1.7359074617139135e-05, + "loss": 0.6125, + "step": 10660 + }, + { + "epoch": 1.6689104571070756, + "grad_norm": 2.2632315158843994, + "learning_rate": 1.7350928641251223e-05, + "loss": 0.4212, + "step": 10661 + }, + { + "epoch": 1.669067000626174, + "grad_norm": 1.7592647075653076, + "learning_rate": 1.734278266536331e-05, + "loss": 0.4359, + "step": 10662 + }, + { + "epoch": 1.6692235441452725, + "grad_norm": 4.973824501037598, + "learning_rate": 1.73346366894754e-05, + "loss": 0.3661, + "step": 10663 + }, + { + "epoch": 1.6693800876643707, + "grad_norm": 3.2474687099456787, + "learning_rate": 1.732649071358749e-05, + "loss": 0.9877, + "step": 10664 + }, + { + "epoch": 1.669536631183469, + "grad_norm": 2.729857921600342, + "learning_rate": 1.7318344737699578e-05, + "loss": 0.743, + "step": 10665 + }, + { + "epoch": 1.6696931747025672, + "grad_norm": 3.607320785522461, + "learning_rate": 1.7310198761811665e-05, + "loss": 0.5537, + "step": 10666 + }, + { + "epoch": 1.6698497182216656, + "grad_norm": 2.016511917114258, + "learning_rate": 1.7302052785923756e-05, + "loss": 0.2914, + "step": 10667 + }, + { + "epoch": 1.670006261740764, + "grad_norm": 2.8036999702453613, + "learning_rate": 1.7293906810035843e-05, + "loss": 0.3818, + "step": 10668 + }, + { + "epoch": 1.6701628052598623, + "grad_norm": 1.5871727466583252, + "learning_rate": 1.7285760834147933e-05, + "loss": 0.3924, + "step": 10669 + }, + { + "epoch": 1.6703193487789605, + "grad_norm": 2.4036543369293213, + "learning_rate": 1.727761485826002e-05, + "loss": 0.824, + "step": 10670 + }, + { + "epoch": 1.6704758922980587, + "grad_norm": 1.6844873428344727, + "learning_rate": 1.7269468882372107e-05, + "loss": 
0.4344, + "step": 10671 + }, + { + "epoch": 1.6706324358171571, + "grad_norm": 2.293459415435791, + "learning_rate": 1.7261322906484198e-05, + "loss": 0.5435, + "step": 10672 + }, + { + "epoch": 1.6707889793362556, + "grad_norm": 4.456692695617676, + "learning_rate": 1.725317693059629e-05, + "loss": 0.7345, + "step": 10673 + }, + { + "epoch": 1.6709455228553538, + "grad_norm": 9.932607650756836, + "learning_rate": 1.7245030954708376e-05, + "loss": 1.0749, + "step": 10674 + }, + { + "epoch": 1.671102066374452, + "grad_norm": 4.654472827911377, + "learning_rate": 1.7236884978820463e-05, + "loss": 0.5913, + "step": 10675 + }, + { + "epoch": 1.6712586098935505, + "grad_norm": 2.8303797245025635, + "learning_rate": 1.7228739002932553e-05, + "loss": 0.7852, + "step": 10676 + }, + { + "epoch": 1.6714151534126487, + "grad_norm": 4.485992908477783, + "learning_rate": 1.722059302704464e-05, + "loss": 0.5992, + "step": 10677 + }, + { + "epoch": 1.6715716969317471, + "grad_norm": 3.593834638595581, + "learning_rate": 1.721244705115673e-05, + "loss": 1.2802, + "step": 10678 + }, + { + "epoch": 1.6717282404508453, + "grad_norm": 2.4445931911468506, + "learning_rate": 1.7204301075268818e-05, + "loss": 0.489, + "step": 10679 + }, + { + "epoch": 1.6718847839699436, + "grad_norm": 3.9984238147735596, + "learning_rate": 1.7196155099380905e-05, + "loss": 1.1252, + "step": 10680 + }, + { + "epoch": 1.672041327489042, + "grad_norm": 2.0315189361572266, + "learning_rate": 1.7188009123492996e-05, + "loss": 0.5981, + "step": 10681 + }, + { + "epoch": 1.6721978710081402, + "grad_norm": 4.797017574310303, + "learning_rate": 1.7179863147605083e-05, + "loss": 0.9645, + "step": 10682 + }, + { + "epoch": 1.6723544145272387, + "grad_norm": 4.392819404602051, + "learning_rate": 1.7171717171717173e-05, + "loss": 1.2699, + "step": 10683 + }, + { + "epoch": 1.6725109580463369, + "grad_norm": 3.0914857387542725, + "learning_rate": 1.716357119582926e-05, + "loss": 0.7269, + "step": 10684 + }, + { + 
"epoch": 1.672667501565435, + "grad_norm": 3.4829630851745605, + "learning_rate": 1.7155425219941348e-05, + "loss": 0.7868, + "step": 10685 + }, + { + "epoch": 1.6728240450845335, + "grad_norm": 2.280256509780884, + "learning_rate": 1.7147279244053438e-05, + "loss": 0.3504, + "step": 10686 + }, + { + "epoch": 1.672980588603632, + "grad_norm": 2.6548142433166504, + "learning_rate": 1.713913326816553e-05, + "loss": 0.5787, + "step": 10687 + }, + { + "epoch": 1.6731371321227302, + "grad_norm": 4.180248260498047, + "learning_rate": 1.7130987292277616e-05, + "loss": 1.058, + "step": 10688 + }, + { + "epoch": 1.6732936756418284, + "grad_norm": 0.48792019486427307, + "learning_rate": 1.7122841316389703e-05, + "loss": 0.1716, + "step": 10689 + }, + { + "epoch": 1.6734502191609266, + "grad_norm": 0.38258567452430725, + "learning_rate": 1.7114695340501794e-05, + "loss": 0.1427, + "step": 10690 + }, + { + "epoch": 1.673606762680025, + "grad_norm": 0.5613877773284912, + "learning_rate": 1.710654936461388e-05, + "loss": 0.2217, + "step": 10691 + }, + { + "epoch": 1.6737633061991235, + "grad_norm": 0.48668795824050903, + "learning_rate": 1.709840338872597e-05, + "loss": 0.1985, + "step": 10692 + }, + { + "epoch": 1.6739198497182217, + "grad_norm": 0.8017076849937439, + "learning_rate": 1.709025741283806e-05, + "loss": 0.1985, + "step": 10693 + }, + { + "epoch": 1.67407639323732, + "grad_norm": 0.6857833862304688, + "learning_rate": 1.7082111436950146e-05, + "loss": 0.1983, + "step": 10694 + }, + { + "epoch": 1.6742329367564182, + "grad_norm": 0.9418497085571289, + "learning_rate": 1.7073965461062236e-05, + "loss": 0.2906, + "step": 10695 + }, + { + "epoch": 1.6743894802755166, + "grad_norm": 0.7837926149368286, + "learning_rate": 1.7065819485174327e-05, + "loss": 0.263, + "step": 10696 + }, + { + "epoch": 1.674546023794615, + "grad_norm": 0.9161882996559143, + "learning_rate": 1.7057673509286414e-05, + "loss": 0.409, + "step": 10697 + }, + { + "epoch": 1.6747025673137133, + 
"grad_norm": 1.4246692657470703, + "learning_rate": 1.70495275333985e-05, + "loss": 0.3577, + "step": 10698 + }, + { + "epoch": 1.6748591108328115, + "grad_norm": 0.9723537564277649, + "learning_rate": 1.704138155751059e-05, + "loss": 0.2251, + "step": 10699 + }, + { + "epoch": 1.6750156543519097, + "grad_norm": 1.211083173751831, + "learning_rate": 1.703323558162268e-05, + "loss": 0.3502, + "step": 10700 + }, + { + "epoch": 1.6751721978710081, + "grad_norm": 1.5646439790725708, + "learning_rate": 1.702508960573477e-05, + "loss": 0.3397, + "step": 10701 + }, + { + "epoch": 1.6753287413901066, + "grad_norm": 1.2209651470184326, + "learning_rate": 1.7016943629846856e-05, + "loss": 0.2247, + "step": 10702 + }, + { + "epoch": 1.6754852849092048, + "grad_norm": 1.4091103076934814, + "learning_rate": 1.7008797653958943e-05, + "loss": 0.5756, + "step": 10703 + }, + { + "epoch": 1.675641828428303, + "grad_norm": 1.0909346342086792, + "learning_rate": 1.7000651678071034e-05, + "loss": 0.3375, + "step": 10704 + }, + { + "epoch": 1.6757983719474012, + "grad_norm": 2.4394993782043457, + "learning_rate": 1.6992505702183124e-05, + "loss": 0.3789, + "step": 10705 + }, + { + "epoch": 1.6759549154664997, + "grad_norm": 1.3222695589065552, + "learning_rate": 1.6984359726295208e-05, + "loss": 0.4327, + "step": 10706 + }, + { + "epoch": 1.676111458985598, + "grad_norm": 0.760452926158905, + "learning_rate": 1.69762137504073e-05, + "loss": 0.2119, + "step": 10707 + }, + { + "epoch": 1.6762680025046963, + "grad_norm": 1.4725582599639893, + "learning_rate": 1.696806777451939e-05, + "loss": 0.3631, + "step": 10708 + }, + { + "epoch": 1.6764245460237945, + "grad_norm": 2.54144549369812, + "learning_rate": 1.6959921798631476e-05, + "loss": 0.3003, + "step": 10709 + }, + { + "epoch": 1.676581089542893, + "grad_norm": 1.6258243322372437, + "learning_rate": 1.6951775822743567e-05, + "loss": 0.3845, + "step": 10710 + }, + { + "epoch": 1.6767376330619912, + "grad_norm": 1.0261036157608032, + 
"learning_rate": 1.6943629846855654e-05, + "loss": 0.3195, + "step": 10711 + }, + { + "epoch": 1.6768941765810896, + "grad_norm": 2.569061517715454, + "learning_rate": 1.693548387096774e-05, + "loss": 0.432, + "step": 10712 + }, + { + "epoch": 1.6770507201001879, + "grad_norm": 2.516355037689209, + "learning_rate": 1.692733789507983e-05, + "loss": 0.7629, + "step": 10713 + }, + { + "epoch": 1.677207263619286, + "grad_norm": 1.5534687042236328, + "learning_rate": 1.6919191919191922e-05, + "loss": 0.3394, + "step": 10714 + }, + { + "epoch": 1.6773638071383845, + "grad_norm": 2.599407196044922, + "learning_rate": 1.6911045943304006e-05, + "loss": 0.8401, + "step": 10715 + }, + { + "epoch": 1.6775203506574827, + "grad_norm": 2.3908746242523193, + "learning_rate": 1.6902899967416096e-05, + "loss": 0.6219, + "step": 10716 + }, + { + "epoch": 1.6776768941765812, + "grad_norm": 2.5521457195281982, + "learning_rate": 1.6894753991528187e-05, + "loss": 0.484, + "step": 10717 + }, + { + "epoch": 1.6778334376956794, + "grad_norm": 2.6944849491119385, + "learning_rate": 1.6886608015640274e-05, + "loss": 0.6966, + "step": 10718 + }, + { + "epoch": 1.6779899812147776, + "grad_norm": 2.6340200901031494, + "learning_rate": 1.6878462039752365e-05, + "loss": 0.5403, + "step": 10719 + }, + { + "epoch": 1.678146524733876, + "grad_norm": 2.8762052059173584, + "learning_rate": 1.6870316063864452e-05, + "loss": 0.6278, + "step": 10720 + }, + { + "epoch": 1.6783030682529745, + "grad_norm": 4.543339252471924, + "learning_rate": 1.686217008797654e-05, + "loss": 0.883, + "step": 10721 + }, + { + "epoch": 1.6784596117720727, + "grad_norm": 2.066063642501831, + "learning_rate": 1.685402411208863e-05, + "loss": 0.5486, + "step": 10722 + }, + { + "epoch": 1.678616155291171, + "grad_norm": 3.0602383613586426, + "learning_rate": 1.684587813620072e-05, + "loss": 0.7431, + "step": 10723 + }, + { + "epoch": 1.6787726988102691, + "grad_norm": 3.9862430095672607, + "learning_rate": 
1.6837732160312804e-05, + "loss": 0.7464, + "step": 10724 + }, + { + "epoch": 1.6789292423293676, + "grad_norm": 2.2118539810180664, + "learning_rate": 1.6829586184424894e-05, + "loss": 1.3138, + "step": 10725 + }, + { + "epoch": 1.679085785848466, + "grad_norm": 7.473895072937012, + "learning_rate": 1.6821440208536985e-05, + "loss": 1.0739, + "step": 10726 + }, + { + "epoch": 1.6792423293675642, + "grad_norm": 3.114844560623169, + "learning_rate": 1.6813294232649072e-05, + "loss": 0.7505, + "step": 10727 + }, + { + "epoch": 1.6793988728866625, + "grad_norm": 2.506728172302246, + "learning_rate": 1.680514825676116e-05, + "loss": 0.5948, + "step": 10728 + }, + { + "epoch": 1.6795554164057607, + "grad_norm": 3.632676601409912, + "learning_rate": 1.679700228087325e-05, + "loss": 0.6635, + "step": 10729 + }, + { + "epoch": 1.679711959924859, + "grad_norm": 2.0867960453033447, + "learning_rate": 1.6788856304985337e-05, + "loss": 0.7218, + "step": 10730 + }, + { + "epoch": 1.6798685034439576, + "grad_norm": 6.767341136932373, + "learning_rate": 1.6780710329097427e-05, + "loss": 1.5906, + "step": 10731 + }, + { + "epoch": 1.6800250469630558, + "grad_norm": 4.415426731109619, + "learning_rate": 1.6772564353209518e-05, + "loss": 0.5857, + "step": 10732 + }, + { + "epoch": 1.680181590482154, + "grad_norm": 3.975442886352539, + "learning_rate": 1.67644183773216e-05, + "loss": 1.4242, + "step": 10733 + }, + { + "epoch": 1.6803381340012522, + "grad_norm": 2.3388803005218506, + "learning_rate": 1.6756272401433692e-05, + "loss": 0.6661, + "step": 10734 + }, + { + "epoch": 1.6804946775203506, + "grad_norm": 1.394433617591858, + "learning_rate": 1.6748126425545783e-05, + "loss": 0.3153, + "step": 10735 + }, + { + "epoch": 1.680651221039449, + "grad_norm": 2.8544156551361084, + "learning_rate": 1.673998044965787e-05, + "loss": 1.0178, + "step": 10736 + }, + { + "epoch": 1.6808077645585473, + "grad_norm": 1.910866618156433, + "learning_rate": 1.6731834473769957e-05, + "loss": 0.581, 
+ "step": 10737 + }, + { + "epoch": 1.6809643080776455, + "grad_norm": 4.440711498260498, + "learning_rate": 1.6723688497882047e-05, + "loss": 0.7821, + "step": 10738 + }, + { + "epoch": 1.6811208515967437, + "grad_norm": 0.6553218960762024, + "learning_rate": 1.6715542521994134e-05, + "loss": 0.2376, + "step": 10739 + }, + { + "epoch": 1.6812773951158422, + "grad_norm": 0.5248884558677673, + "learning_rate": 1.6707396546106225e-05, + "loss": 0.1794, + "step": 10740 + }, + { + "epoch": 1.6814339386349406, + "grad_norm": 1.0284734964370728, + "learning_rate": 1.6699250570218316e-05, + "loss": 0.1707, + "step": 10741 + }, + { + "epoch": 1.6815904821540388, + "grad_norm": 1.219259262084961, + "learning_rate": 1.66911045943304e-05, + "loss": 0.2054, + "step": 10742 + }, + { + "epoch": 1.681747025673137, + "grad_norm": 0.5599666237831116, + "learning_rate": 1.668295861844249e-05, + "loss": 0.2986, + "step": 10743 + }, + { + "epoch": 1.6819035691922355, + "grad_norm": 1.0208908319473267, + "learning_rate": 1.667481264255458e-05, + "loss": 0.1874, + "step": 10744 + }, + { + "epoch": 1.6820601127113337, + "grad_norm": 0.9589442014694214, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.3238, + "step": 10745 + }, + { + "epoch": 1.6822166562304322, + "grad_norm": 0.8883544206619263, + "learning_rate": 1.6658520690778755e-05, + "loss": 0.3284, + "step": 10746 + }, + { + "epoch": 1.6823731997495304, + "grad_norm": 0.7634162306785583, + "learning_rate": 1.6650374714890845e-05, + "loss": 0.3499, + "step": 10747 + }, + { + "epoch": 1.6825297432686286, + "grad_norm": 0.7927228212356567, + "learning_rate": 1.6642228739002932e-05, + "loss": 0.209, + "step": 10748 + }, + { + "epoch": 1.682686286787727, + "grad_norm": 1.1994575262069702, + "learning_rate": 1.6634082763115023e-05, + "loss": 0.1914, + "step": 10749 + }, + { + "epoch": 1.6828428303068252, + "grad_norm": 0.6639792919158936, + "learning_rate": 1.662593678722711e-05, + "loss": 0.2615, + "step": 10750 + }, + { + 
"epoch": 1.6829993738259237, + "grad_norm": 1.13416588306427, + "learning_rate": 1.6617790811339197e-05, + "loss": 0.2839, + "step": 10751 + }, + { + "epoch": 1.683155917345022, + "grad_norm": 0.8506364226341248, + "learning_rate": 1.6609644835451288e-05, + "loss": 0.3841, + "step": 10752 + }, + { + "epoch": 1.6833124608641201, + "grad_norm": 0.7892373204231262, + "learning_rate": 1.6601498859563378e-05, + "loss": 0.3165, + "step": 10753 + }, + { + "epoch": 1.6834690043832186, + "grad_norm": 1.4440827369689941, + "learning_rate": 1.6593352883675465e-05, + "loss": 0.478, + "step": 10754 + }, + { + "epoch": 1.683625547902317, + "grad_norm": 1.5406244993209839, + "learning_rate": 1.6585206907787552e-05, + "loss": 0.49, + "step": 10755 + }, + { + "epoch": 1.6837820914214152, + "grad_norm": 1.2711836099624634, + "learning_rate": 1.6577060931899643e-05, + "loss": 0.3804, + "step": 10756 + }, + { + "epoch": 1.6839386349405134, + "grad_norm": 0.9156380295753479, + "learning_rate": 1.656891495601173e-05, + "loss": 0.327, + "step": 10757 + }, + { + "epoch": 1.6840951784596117, + "grad_norm": 1.3760141134262085, + "learning_rate": 1.656076898012382e-05, + "loss": 0.4496, + "step": 10758 + }, + { + "epoch": 1.68425172197871, + "grad_norm": 1.7052363157272339, + "learning_rate": 1.6552623004235908e-05, + "loss": 0.6252, + "step": 10759 + }, + { + "epoch": 1.6844082654978085, + "grad_norm": 2.1589176654815674, + "learning_rate": 1.6544477028347995e-05, + "loss": 0.4605, + "step": 10760 + }, + { + "epoch": 1.6845648090169068, + "grad_norm": 1.5091878175735474, + "learning_rate": 1.6536331052460085e-05, + "loss": 0.4637, + "step": 10761 + }, + { + "epoch": 1.684721352536005, + "grad_norm": 1.998953104019165, + "learning_rate": 1.6528185076572176e-05, + "loss": 0.3771, + "step": 10762 + }, + { + "epoch": 1.6848778960551032, + "grad_norm": 3.2356033325195312, + "learning_rate": 1.6520039100684263e-05, + "loss": 0.3644, + "step": 10763 + }, + { + "epoch": 1.6850344395742016, + 
"grad_norm": 1.9181002378463745, + "learning_rate": 1.651189312479635e-05, + "loss": 0.5956, + "step": 10764 + }, + { + "epoch": 1.6851909830933, + "grad_norm": 2.216979503631592, + "learning_rate": 1.650374714890844e-05, + "loss": 0.4901, + "step": 10765 + }, + { + "epoch": 1.6853475266123983, + "grad_norm": 2.867090940475464, + "learning_rate": 1.6495601173020528e-05, + "loss": 0.5327, + "step": 10766 + }, + { + "epoch": 1.6855040701314965, + "grad_norm": 1.9636167287826538, + "learning_rate": 1.648745519713262e-05, + "loss": 0.6604, + "step": 10767 + }, + { + "epoch": 1.6856606136505947, + "grad_norm": 2.585047483444214, + "learning_rate": 1.6479309221244706e-05, + "loss": 0.743, + "step": 10768 + }, + { + "epoch": 1.6858171571696932, + "grad_norm": 6.250885009765625, + "learning_rate": 1.6471163245356793e-05, + "loss": 0.3589, + "step": 10769 + }, + { + "epoch": 1.6859737006887916, + "grad_norm": 2.2458488941192627, + "learning_rate": 1.6463017269468883e-05, + "loss": 0.6556, + "step": 10770 + }, + { + "epoch": 1.6861302442078898, + "grad_norm": 3.216923475265503, + "learning_rate": 1.6454871293580974e-05, + "loss": 1.1533, + "step": 10771 + }, + { + "epoch": 1.686286787726988, + "grad_norm": 7.242591381072998, + "learning_rate": 1.644672531769306e-05, + "loss": 1.0455, + "step": 10772 + }, + { + "epoch": 1.6864433312460863, + "grad_norm": 3.1592044830322266, + "learning_rate": 1.6438579341805148e-05, + "loss": 1.1806, + "step": 10773 + }, + { + "epoch": 1.6865998747651847, + "grad_norm": 3.6646671295166016, + "learning_rate": 1.643043336591724e-05, + "loss": 0.7369, + "step": 10774 + }, + { + "epoch": 1.6867564182842831, + "grad_norm": 3.2307512760162354, + "learning_rate": 1.6422287390029326e-05, + "loss": 0.9366, + "step": 10775 + }, + { + "epoch": 1.6869129618033814, + "grad_norm": 5.847586631774902, + "learning_rate": 1.6414141414141416e-05, + "loss": 1.0386, + "step": 10776 + }, + { + "epoch": 1.6870695053224796, + "grad_norm": 2.9217686653137207, + 
"learning_rate": 1.6405995438253503e-05, + "loss": 1.083, + "step": 10777 + }, + { + "epoch": 1.687226048841578, + "grad_norm": 3.9936108589172363, + "learning_rate": 1.639784946236559e-05, + "loss": 1.0143, + "step": 10778 + }, + { + "epoch": 1.6873825923606762, + "grad_norm": 2.857028007507324, + "learning_rate": 1.638970348647768e-05, + "loss": 0.82, + "step": 10779 + }, + { + "epoch": 1.6875391358797747, + "grad_norm": 6.060074806213379, + "learning_rate": 1.638155751058977e-05, + "loss": 0.7787, + "step": 10780 + }, + { + "epoch": 1.6876956793988729, + "grad_norm": 3.4753334522247314, + "learning_rate": 1.637341153470186e-05, + "loss": 0.6829, + "step": 10781 + }, + { + "epoch": 1.687852222917971, + "grad_norm": 4.773415565490723, + "learning_rate": 1.6365265558813946e-05, + "loss": 1.5254, + "step": 10782 + }, + { + "epoch": 1.6880087664370695, + "grad_norm": 2.6325182914733887, + "learning_rate": 1.6357119582926036e-05, + "loss": 1.0234, + "step": 10783 + }, + { + "epoch": 1.688165309956168, + "grad_norm": 5.459670066833496, + "learning_rate": 1.6348973607038123e-05, + "loss": 0.683, + "step": 10784 + }, + { + "epoch": 1.6883218534752662, + "grad_norm": 2.7173705101013184, + "learning_rate": 1.6340827631150214e-05, + "loss": 0.738, + "step": 10785 + }, + { + "epoch": 1.6884783969943644, + "grad_norm": 1.9269850254058838, + "learning_rate": 1.63326816552623e-05, + "loss": 0.5118, + "step": 10786 + }, + { + "epoch": 1.6886349405134626, + "grad_norm": 2.3096821308135986, + "learning_rate": 1.6324535679374388e-05, + "loss": 0.5897, + "step": 10787 + }, + { + "epoch": 1.688791484032561, + "grad_norm": 3.4759583473205566, + "learning_rate": 1.631638970348648e-05, + "loss": 1.4275, + "step": 10788 + }, + { + "epoch": 1.6889480275516595, + "grad_norm": 0.626553475856781, + "learning_rate": 1.630824372759857e-05, + "loss": 0.2359, + "step": 10789 + }, + { + "epoch": 1.6891045710707577, + "grad_norm": 0.5802909135818481, + "learning_rate": 1.6300097751710656e-05, + 
"loss": 0.2368, + "step": 10790 + }, + { + "epoch": 1.689261114589856, + "grad_norm": 0.42067950963974, + "learning_rate": 1.6291951775822744e-05, + "loss": 0.1649, + "step": 10791 + }, + { + "epoch": 1.6894176581089542, + "grad_norm": 0.7835147976875305, + "learning_rate": 1.6283805799934834e-05, + "loss": 0.1718, + "step": 10792 + }, + { + "epoch": 1.6895742016280526, + "grad_norm": 0.7188643217086792, + "learning_rate": 1.627565982404692e-05, + "loss": 0.258, + "step": 10793 + }, + { + "epoch": 1.689730745147151, + "grad_norm": 1.0707365274429321, + "learning_rate": 1.6267513848159012e-05, + "loss": 0.3217, + "step": 10794 + }, + { + "epoch": 1.6898872886662493, + "grad_norm": 1.4417506456375122, + "learning_rate": 1.62593678722711e-05, + "loss": 0.3, + "step": 10795 + }, + { + "epoch": 1.6900438321853475, + "grad_norm": 0.8599969744682312, + "learning_rate": 1.6251221896383186e-05, + "loss": 0.3382, + "step": 10796 + }, + { + "epoch": 1.6902003757044457, + "grad_norm": 1.2747220993041992, + "learning_rate": 1.6243075920495277e-05, + "loss": 0.2402, + "step": 10797 + }, + { + "epoch": 1.6903569192235441, + "grad_norm": 0.8377679586410522, + "learning_rate": 1.6234929944607367e-05, + "loss": 0.3062, + "step": 10798 + }, + { + "epoch": 1.6905134627426426, + "grad_norm": 0.6755133867263794, + "learning_rate": 1.6226783968719454e-05, + "loss": 0.2966, + "step": 10799 + }, + { + "epoch": 1.6906700062617408, + "grad_norm": 1.7346014976501465, + "learning_rate": 1.621863799283154e-05, + "loss": 0.2674, + "step": 10800 + }, + { + "epoch": 1.690826549780839, + "grad_norm": 1.260880470275879, + "learning_rate": 1.6210492016943632e-05, + "loss": 0.4413, + "step": 10801 + }, + { + "epoch": 1.6909830932999372, + "grad_norm": 1.6336499452590942, + "learning_rate": 1.620234604105572e-05, + "loss": 0.3522, + "step": 10802 + }, + { + "epoch": 1.6911396368190357, + "grad_norm": 0.9170951247215271, + "learning_rate": 1.619420006516781e-05, + "loss": 0.3347, + "step": 10803 + }, + 
{ + "epoch": 1.6912961803381341, + "grad_norm": 1.1144932508468628, + "learning_rate": 1.6186054089279897e-05, + "loss": 0.3167, + "step": 10804 + }, + { + "epoch": 1.6914527238572323, + "grad_norm": 1.670179009437561, + "learning_rate": 1.6177908113391984e-05, + "loss": 0.3992, + "step": 10805 + }, + { + "epoch": 1.6916092673763305, + "grad_norm": 2.2547295093536377, + "learning_rate": 1.6169762137504074e-05, + "loss": 0.3701, + "step": 10806 + }, + { + "epoch": 1.6917658108954288, + "grad_norm": 1.1978129148483276, + "learning_rate": 1.6161616161616165e-05, + "loss": 0.3354, + "step": 10807 + }, + { + "epoch": 1.6919223544145272, + "grad_norm": 1.757881760597229, + "learning_rate": 1.6153470185728252e-05, + "loss": 0.5886, + "step": 10808 + }, + { + "epoch": 1.6920788979336256, + "grad_norm": 1.1403547525405884, + "learning_rate": 1.614532420984034e-05, + "loss": 0.3155, + "step": 10809 + }, + { + "epoch": 1.6922354414527239, + "grad_norm": 3.6774868965148926, + "learning_rate": 1.6137178233952426e-05, + "loss": 0.5249, + "step": 10810 + }, + { + "epoch": 1.692391984971822, + "grad_norm": 4.490832328796387, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.7013, + "step": 10811 + }, + { + "epoch": 1.6925485284909205, + "grad_norm": 3.3519363403320312, + "learning_rate": 1.6120886282176607e-05, + "loss": 0.6607, + "step": 10812 + }, + { + "epoch": 1.6927050720100187, + "grad_norm": 2.0062782764434814, + "learning_rate": 1.6112740306288694e-05, + "loss": 0.6092, + "step": 10813 + }, + { + "epoch": 1.6928616155291172, + "grad_norm": 1.857517123222351, + "learning_rate": 1.610459433040078e-05, + "loss": 0.5746, + "step": 10814 + }, + { + "epoch": 1.6930181590482154, + "grad_norm": 3.679738759994507, + "learning_rate": 1.6096448354512872e-05, + "loss": 0.5643, + "step": 10815 + }, + { + "epoch": 1.6931747025673136, + "grad_norm": 4.508304119110107, + "learning_rate": 1.608830237862496e-05, + "loss": 0.677, + "step": 10816 + }, + { + "epoch": 1.693331246086412, + 
"grad_norm": 1.718203067779541, + "learning_rate": 1.608015640273705e-05, + "loss": 0.6142, + "step": 10817 + }, + { + "epoch": 1.6934877896055105, + "grad_norm": 0.6615483164787292, + "learning_rate": 1.6072010426849137e-05, + "loss": 0.1336, + "step": 10818 + }, + { + "epoch": 1.6936443331246087, + "grad_norm": 5.044641494750977, + "learning_rate": 1.6063864450961224e-05, + "loss": 0.8872, + "step": 10819 + }, + { + "epoch": 1.693800876643707, + "grad_norm": 2.8641371726989746, + "learning_rate": 1.6055718475073315e-05, + "loss": 0.703, + "step": 10820 + }, + { + "epoch": 1.6939574201628051, + "grad_norm": 3.9028444290161133, + "learning_rate": 1.6047572499185405e-05, + "loss": 0.8069, + "step": 10821 + }, + { + "epoch": 1.6941139636819036, + "grad_norm": 1.744416356086731, + "learning_rate": 1.603942652329749e-05, + "loss": 0.5377, + "step": 10822 + }, + { + "epoch": 1.694270507201002, + "grad_norm": 1.5708686113357544, + "learning_rate": 1.603128054740958e-05, + "loss": 0.5499, + "step": 10823 + }, + { + "epoch": 1.6944270507201002, + "grad_norm": 2.4663615226745605, + "learning_rate": 1.602313457152167e-05, + "loss": 0.6108, + "step": 10824 + }, + { + "epoch": 1.6945835942391985, + "grad_norm": 4.5735955238342285, + "learning_rate": 1.6014988595633757e-05, + "loss": 1.536, + "step": 10825 + }, + { + "epoch": 1.6947401377582967, + "grad_norm": 2.713693380355835, + "learning_rate": 1.6006842619745848e-05, + "loss": 0.9196, + "step": 10826 + }, + { + "epoch": 1.6948966812773951, + "grad_norm": 5.214019775390625, + "learning_rate": 1.5998696643857935e-05, + "loss": 0.9622, + "step": 10827 + }, + { + "epoch": 1.6950532247964936, + "grad_norm": 2.1274044513702393, + "learning_rate": 1.5990550667970022e-05, + "loss": 0.9757, + "step": 10828 + }, + { + "epoch": 1.6952097683155918, + "grad_norm": 3.3608763217926025, + "learning_rate": 1.5982404692082112e-05, + "loss": 0.6345, + "step": 10829 + }, + { + "epoch": 1.69536631183469, + "grad_norm": 3.2238738536834717, + 
"learning_rate": 1.5974258716194203e-05, + "loss": 0.708, + "step": 10830 + }, + { + "epoch": 1.6955228553537882, + "grad_norm": 4.170705318450928, + "learning_rate": 1.5966112740306287e-05, + "loss": 1.0293, + "step": 10831 + }, + { + "epoch": 1.6956793988728867, + "grad_norm": 5.1115899085998535, + "learning_rate": 1.5957966764418377e-05, + "loss": 1.1045, + "step": 10832 + }, + { + "epoch": 1.695835942391985, + "grad_norm": 3.428966760635376, + "learning_rate": 1.5949820788530468e-05, + "loss": 0.8437, + "step": 10833 + }, + { + "epoch": 1.6959924859110833, + "grad_norm": 3.676954507827759, + "learning_rate": 1.5941674812642555e-05, + "loss": 1.079, + "step": 10834 + }, + { + "epoch": 1.6961490294301815, + "grad_norm": 2.2261290550231934, + "learning_rate": 1.5933528836754645e-05, + "loss": 0.7141, + "step": 10835 + }, + { + "epoch": 1.6963055729492797, + "grad_norm": 2.7491133213043213, + "learning_rate": 1.5925382860866732e-05, + "loss": 0.4004, + "step": 10836 + }, + { + "epoch": 1.6964621164683782, + "grad_norm": 2.3427395820617676, + "learning_rate": 1.591723688497882e-05, + "loss": 0.6848, + "step": 10837 + }, + { + "epoch": 1.6966186599874766, + "grad_norm": 2.714620590209961, + "learning_rate": 1.590909090909091e-05, + "loss": 0.6241, + "step": 10838 + }, + { + "epoch": 1.6967752035065748, + "grad_norm": 0.7214166522026062, + "learning_rate": 1.5900944933203e-05, + "loss": 0.2141, + "step": 10839 + }, + { + "epoch": 1.696931747025673, + "grad_norm": 0.4886230230331421, + "learning_rate": 1.5892798957315084e-05, + "loss": 0.2243, + "step": 10840 + }, + { + "epoch": 1.6970882905447713, + "grad_norm": 0.4372832775115967, + "learning_rate": 1.5884652981427175e-05, + "loss": 0.2474, + "step": 10841 + }, + { + "epoch": 1.6972448340638697, + "grad_norm": 0.719634473323822, + "learning_rate": 1.5876507005539265e-05, + "loss": 0.2155, + "step": 10842 + }, + { + "epoch": 1.6974013775829682, + "grad_norm": 0.6957513093948364, + "learning_rate": 
1.5868361029651353e-05, + "loss": 0.1856, + "step": 10843 + }, + { + "epoch": 1.6975579211020664, + "grad_norm": 0.5764868259429932, + "learning_rate": 1.586021505376344e-05, + "loss": 0.2201, + "step": 10844 + }, + { + "epoch": 1.6977144646211646, + "grad_norm": 0.781048595905304, + "learning_rate": 1.585206907787553e-05, + "loss": 0.1854, + "step": 10845 + }, + { + "epoch": 1.697871008140263, + "grad_norm": 0.8240617513656616, + "learning_rate": 1.5843923101987617e-05, + "loss": 0.2032, + "step": 10846 + }, + { + "epoch": 1.6980275516593613, + "grad_norm": 3.5250728130340576, + "learning_rate": 1.5835777126099708e-05, + "loss": 0.2867, + "step": 10847 + }, + { + "epoch": 1.6981840951784597, + "grad_norm": 1.2659281492233276, + "learning_rate": 1.58276311502118e-05, + "loss": 0.3301, + "step": 10848 + }, + { + "epoch": 1.698340638697558, + "grad_norm": 0.4548594355583191, + "learning_rate": 1.5819485174323882e-05, + "loss": 0.1268, + "step": 10849 + }, + { + "epoch": 1.6984971822166561, + "grad_norm": 0.5467552542686462, + "learning_rate": 1.5811339198435973e-05, + "loss": 0.2165, + "step": 10850 + }, + { + "epoch": 1.6986537257357546, + "grad_norm": 1.0635026693344116, + "learning_rate": 1.5803193222548063e-05, + "loss": 0.428, + "step": 10851 + }, + { + "epoch": 1.698810269254853, + "grad_norm": 1.539397954940796, + "learning_rate": 1.579504724666015e-05, + "loss": 0.4165, + "step": 10852 + }, + { + "epoch": 1.6989668127739512, + "grad_norm": 1.1764988899230957, + "learning_rate": 1.5786901270772238e-05, + "loss": 0.3315, + "step": 10853 + }, + { + "epoch": 1.6991233562930494, + "grad_norm": 2.165921449661255, + "learning_rate": 1.5778755294884328e-05, + "loss": 0.3019, + "step": 10854 + }, + { + "epoch": 1.6992798998121477, + "grad_norm": 1.5188761949539185, + "learning_rate": 1.5770609318996415e-05, + "loss": 0.3916, + "step": 10855 + }, + { + "epoch": 1.699436443331246, + "grad_norm": 2.8382749557495117, + "learning_rate": 1.5762463343108506e-05, + "loss": 
0.3745, + "step": 10856 + }, + { + "epoch": 1.6995929868503445, + "grad_norm": 1.605650544166565, + "learning_rate": 1.5754317367220596e-05, + "loss": 0.2872, + "step": 10857 + }, + { + "epoch": 1.6997495303694428, + "grad_norm": 0.9848734736442566, + "learning_rate": 1.574617139133268e-05, + "loss": 0.3113, + "step": 10858 + }, + { + "epoch": 1.699906073888541, + "grad_norm": 1.5894790887832642, + "learning_rate": 1.573802541544477e-05, + "loss": 0.3345, + "step": 10859 + }, + { + "epoch": 1.7000626174076392, + "grad_norm": 1.5297536849975586, + "learning_rate": 1.572987943955686e-05, + "loss": 0.3658, + "step": 10860 + }, + { + "epoch": 1.7002191609267376, + "grad_norm": 1.0898429155349731, + "learning_rate": 1.5721733463668948e-05, + "loss": 0.3934, + "step": 10861 + }, + { + "epoch": 1.700375704445836, + "grad_norm": 1.6934685707092285, + "learning_rate": 1.5713587487781035e-05, + "loss": 0.4526, + "step": 10862 + }, + { + "epoch": 1.7005322479649343, + "grad_norm": 4.341007232666016, + "learning_rate": 1.5705441511893126e-05, + "loss": 1.0416, + "step": 10863 + }, + { + "epoch": 1.7006887914840325, + "grad_norm": 2.027693033218384, + "learning_rate": 1.5697295536005213e-05, + "loss": 0.5171, + "step": 10864 + }, + { + "epoch": 1.7008453350031307, + "grad_norm": 1.8451496362686157, + "learning_rate": 1.5689149560117304e-05, + "loss": 0.2691, + "step": 10865 + }, + { + "epoch": 1.7010018785222292, + "grad_norm": 4.357911109924316, + "learning_rate": 1.568100358422939e-05, + "loss": 0.7132, + "step": 10866 + }, + { + "epoch": 1.7011584220413276, + "grad_norm": 1.2298561334609985, + "learning_rate": 1.5672857608341478e-05, + "loss": 0.4054, + "step": 10867 + }, + { + "epoch": 1.7013149655604258, + "grad_norm": 2.000014305114746, + "learning_rate": 1.566471163245357e-05, + "loss": 0.7138, + "step": 10868 + }, + { + "epoch": 1.701471509079524, + "grad_norm": 2.7842462062835693, + "learning_rate": 1.565656565656566e-05, + "loss": 0.8544, + "step": 10869 + }, + { + 
"epoch": 1.7016280525986223, + "grad_norm": 3.5621039867401123, + "learning_rate": 1.5648419680677746e-05, + "loss": 0.4655, + "step": 10870 + }, + { + "epoch": 1.7017845961177207, + "grad_norm": 3.2011475563049316, + "learning_rate": 1.5640273704789833e-05, + "loss": 0.7232, + "step": 10871 + }, + { + "epoch": 1.7019411396368191, + "grad_norm": 1.3633527755737305, + "learning_rate": 1.5632127728901924e-05, + "loss": 0.2997, + "step": 10872 + }, + { + "epoch": 1.7020976831559174, + "grad_norm": 2.3503477573394775, + "learning_rate": 1.562398175301401e-05, + "loss": 0.5524, + "step": 10873 + }, + { + "epoch": 1.7022542266750156, + "grad_norm": 3.070122480392456, + "learning_rate": 1.56158357771261e-05, + "loss": 1.2672, + "step": 10874 + }, + { + "epoch": 1.7024107701941138, + "grad_norm": 5.393155574798584, + "learning_rate": 1.560768980123819e-05, + "loss": 1.0202, + "step": 10875 + }, + { + "epoch": 1.7025673137132122, + "grad_norm": 3.7579212188720703, + "learning_rate": 1.5599543825350276e-05, + "loss": 1.2789, + "step": 10876 + }, + { + "epoch": 1.7027238572323107, + "grad_norm": 3.5953445434570312, + "learning_rate": 1.5591397849462366e-05, + "loss": 0.8793, + "step": 10877 + }, + { + "epoch": 1.702880400751409, + "grad_norm": 1.6577696800231934, + "learning_rate": 1.5583251873574457e-05, + "loss": 0.6488, + "step": 10878 + }, + { + "epoch": 1.7030369442705071, + "grad_norm": 3.5413718223571777, + "learning_rate": 1.5575105897686544e-05, + "loss": 1.1526, + "step": 10879 + }, + { + "epoch": 1.7031934877896056, + "grad_norm": 3.070261240005493, + "learning_rate": 1.556695992179863e-05, + "loss": 0.8621, + "step": 10880 + }, + { + "epoch": 1.7033500313087038, + "grad_norm": 3.2518649101257324, + "learning_rate": 1.555881394591072e-05, + "loss": 0.8298, + "step": 10881 + }, + { + "epoch": 1.7035065748278022, + "grad_norm": 5.195306777954102, + "learning_rate": 1.555066797002281e-05, + "loss": 1.5016, + "step": 10882 + }, + { + "epoch": 1.7036631183469004, + 
"grad_norm": 6.117228031158447, + "learning_rate": 1.55425219941349e-05, + "loss": 0.7659, + "step": 10883 + }, + { + "epoch": 1.7038196618659986, + "grad_norm": 4.653079986572266, + "learning_rate": 1.5534376018246986e-05, + "loss": 0.7859, + "step": 10884 + }, + { + "epoch": 1.703976205385097, + "grad_norm": 3.4052443504333496, + "learning_rate": 1.5526230042359073e-05, + "loss": 0.7957, + "step": 10885 + }, + { + "epoch": 1.7041327489041955, + "grad_norm": 3.037620782852173, + "learning_rate": 1.5518084066471164e-05, + "loss": 0.7206, + "step": 10886 + }, + { + "epoch": 1.7042892924232937, + "grad_norm": 3.335369348526001, + "learning_rate": 1.5509938090583254e-05, + "loss": 1.2171, + "step": 10887 + }, + { + "epoch": 1.704445835942392, + "grad_norm": 1.9354963302612305, + "learning_rate": 1.550179211469534e-05, + "loss": 0.5939, + "step": 10888 + }, + { + "epoch": 1.7046023794614902, + "grad_norm": 1.6480036973953247, + "learning_rate": 1.549364613880743e-05, + "loss": 0.3266, + "step": 10889 + }, + { + "epoch": 1.7047589229805886, + "grad_norm": 0.8435109257698059, + "learning_rate": 1.548550016291952e-05, + "loss": 0.2077, + "step": 10890 + }, + { + "epoch": 1.704915466499687, + "grad_norm": 0.7347594499588013, + "learning_rate": 1.5477354187031606e-05, + "loss": 0.2825, + "step": 10891 + }, + { + "epoch": 1.7050720100187853, + "grad_norm": 0.6220846772193909, + "learning_rate": 1.5469208211143697e-05, + "loss": 0.2889, + "step": 10892 + }, + { + "epoch": 1.7052285535378835, + "grad_norm": 1.3983346223831177, + "learning_rate": 1.5461062235255784e-05, + "loss": 0.1974, + "step": 10893 + }, + { + "epoch": 1.7053850970569817, + "grad_norm": 0.8054052591323853, + "learning_rate": 1.545291625936787e-05, + "loss": 0.1981, + "step": 10894 + }, + { + "epoch": 1.7055416405760802, + "grad_norm": 0.8532100915908813, + "learning_rate": 1.5444770283479962e-05, + "loss": 0.2371, + "step": 10895 + }, + { + "epoch": 1.7056981840951786, + "grad_norm": 0.9532986283302307, + 
"learning_rate": 1.5436624307592052e-05, + "loss": 0.2333, + "step": 10896 + }, + { + "epoch": 1.7058547276142768, + "grad_norm": 2.57015061378479, + "learning_rate": 1.542847833170414e-05, + "loss": 0.2825, + "step": 10897 + }, + { + "epoch": 1.706011271133375, + "grad_norm": 0.7419247031211853, + "learning_rate": 1.5420332355816226e-05, + "loss": 0.2044, + "step": 10898 + }, + { + "epoch": 1.7061678146524732, + "grad_norm": 0.7724635601043701, + "learning_rate": 1.5412186379928317e-05, + "loss": 0.2012, + "step": 10899 + }, + { + "epoch": 1.7063243581715717, + "grad_norm": 1.3937052488327026, + "learning_rate": 1.5404040404040404e-05, + "loss": 0.1953, + "step": 10900 + }, + { + "epoch": 1.7064809016906701, + "grad_norm": 1.1660184860229492, + "learning_rate": 1.5395894428152495e-05, + "loss": 0.3198, + "step": 10901 + }, + { + "epoch": 1.7066374452097683, + "grad_norm": 2.0943281650543213, + "learning_rate": 1.5387748452264582e-05, + "loss": 0.6124, + "step": 10902 + }, + { + "epoch": 1.7067939887288666, + "grad_norm": 1.4122178554534912, + "learning_rate": 1.537960247637667e-05, + "loss": 0.3679, + "step": 10903 + }, + { + "epoch": 1.7069505322479648, + "grad_norm": 0.843903124332428, + "learning_rate": 1.537145650048876e-05, + "loss": 0.5517, + "step": 10904 + }, + { + "epoch": 1.7071070757670632, + "grad_norm": 1.299999713897705, + "learning_rate": 1.536331052460085e-05, + "loss": 0.2431, + "step": 10905 + }, + { + "epoch": 1.7072636192861617, + "grad_norm": 1.5908440351486206, + "learning_rate": 1.5355164548712937e-05, + "loss": 0.4125, + "step": 10906 + }, + { + "epoch": 1.7074201628052599, + "grad_norm": 1.282630443572998, + "learning_rate": 1.5347018572825024e-05, + "loss": 0.4038, + "step": 10907 + }, + { + "epoch": 1.707576706324358, + "grad_norm": 1.3203190565109253, + "learning_rate": 1.5338872596937115e-05, + "loss": 0.3007, + "step": 10908 + }, + { + "epoch": 1.7077332498434565, + "grad_norm": 1.960978627204895, + "learning_rate": 
1.5330726621049202e-05, + "loss": 0.4774, + "step": 10909 + }, + { + "epoch": 1.7078897933625548, + "grad_norm": 0.9904780387878418, + "learning_rate": 1.5322580645161292e-05, + "loss": 0.4421, + "step": 10910 + }, + { + "epoch": 1.7080463368816532, + "grad_norm": 0.9318543076515198, + "learning_rate": 1.531443466927338e-05, + "loss": 0.2385, + "step": 10911 + }, + { + "epoch": 1.7082028804007514, + "grad_norm": 1.508102297782898, + "learning_rate": 1.5306288693385467e-05, + "loss": 0.3986, + "step": 10912 + }, + { + "epoch": 1.7083594239198496, + "grad_norm": 2.7925074100494385, + "learning_rate": 1.5298142717497557e-05, + "loss": 0.6397, + "step": 10913 + }, + { + "epoch": 1.708515967438948, + "grad_norm": 2.4962244033813477, + "learning_rate": 1.5289996741609648e-05, + "loss": 0.4187, + "step": 10914 + }, + { + "epoch": 1.7086725109580463, + "grad_norm": 1.7696136236190796, + "learning_rate": 1.5281850765721735e-05, + "loss": 0.5136, + "step": 10915 + }, + { + "epoch": 1.7088290544771447, + "grad_norm": 4.327839374542236, + "learning_rate": 1.5273704789833822e-05, + "loss": 0.9511, + "step": 10916 + }, + { + "epoch": 1.708985597996243, + "grad_norm": 2.6649222373962402, + "learning_rate": 1.5265558813945913e-05, + "loss": 1.3898, + "step": 10917 + }, + { + "epoch": 1.7091421415153412, + "grad_norm": 2.8928720951080322, + "learning_rate": 1.5257412838058e-05, + "loss": 0.7091, + "step": 10918 + }, + { + "epoch": 1.7092986850344396, + "grad_norm": 1.580301284790039, + "learning_rate": 1.5249266862170089e-05, + "loss": 0.3983, + "step": 10919 + }, + { + "epoch": 1.709455228553538, + "grad_norm": 3.2134532928466797, + "learning_rate": 1.5241120886282179e-05, + "loss": 0.5677, + "step": 10920 + }, + { + "epoch": 1.7096117720726363, + "grad_norm": 2.413642644882202, + "learning_rate": 1.5232974910394265e-05, + "loss": 0.2733, + "step": 10921 + }, + { + "epoch": 1.7097683155917345, + "grad_norm": 2.479295492172241, + "learning_rate": 1.5224828934506355e-05, + "loss": 
0.9866, + "step": 10922 + }, + { + "epoch": 1.7099248591108327, + "grad_norm": 4.333662986755371, + "learning_rate": 1.5216682958618444e-05, + "loss": 0.8518, + "step": 10923 + }, + { + "epoch": 1.7100814026299311, + "grad_norm": 3.6454429626464844, + "learning_rate": 1.5208536982730531e-05, + "loss": 0.7081, + "step": 10924 + }, + { + "epoch": 1.7102379461490296, + "grad_norm": 3.2687439918518066, + "learning_rate": 1.5200391006842622e-05, + "loss": 0.9766, + "step": 10925 + }, + { + "epoch": 1.7103944896681278, + "grad_norm": 2.5546724796295166, + "learning_rate": 1.519224503095471e-05, + "loss": 0.7251, + "step": 10926 + }, + { + "epoch": 1.710551033187226, + "grad_norm": 1.965653896331787, + "learning_rate": 1.5184099055066798e-05, + "loss": 0.8036, + "step": 10927 + }, + { + "epoch": 1.7107075767063242, + "grad_norm": 3.3602654933929443, + "learning_rate": 1.5175953079178886e-05, + "loss": 1.0767, + "step": 10928 + }, + { + "epoch": 1.7108641202254227, + "grad_norm": 4.633134841918945, + "learning_rate": 1.5167807103290977e-05, + "loss": 1.3198, + "step": 10929 + }, + { + "epoch": 1.711020663744521, + "grad_norm": 1.7418475151062012, + "learning_rate": 1.5159661127403062e-05, + "loss": 0.7141, + "step": 10930 + }, + { + "epoch": 1.7111772072636193, + "grad_norm": 3.3566174507141113, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.6937, + "step": 10931 + }, + { + "epoch": 1.7113337507827175, + "grad_norm": 3.7086143493652344, + "learning_rate": 1.5143369175627242e-05, + "loss": 1.7646, + "step": 10932 + }, + { + "epoch": 1.7114902943018158, + "grad_norm": 1.516882300376892, + "learning_rate": 1.5135223199739329e-05, + "loss": 1.0216, + "step": 10933 + }, + { + "epoch": 1.7116468378209142, + "grad_norm": 2.322099447250366, + "learning_rate": 1.512707722385142e-05, + "loss": 0.9449, + "step": 10934 + }, + { + "epoch": 1.7118033813400126, + "grad_norm": 1.3897013664245605, + "learning_rate": 1.5118931247963508e-05, + "loss": 0.1431, + "step": 10935 + }, + { 
+ "epoch": 1.7119599248591109, + "grad_norm": 3.662074089050293, + "learning_rate": 1.5110785272075595e-05, + "loss": 0.9071, + "step": 10936 + }, + { + "epoch": 1.712116468378209, + "grad_norm": 2.0742766857147217, + "learning_rate": 1.5102639296187684e-05, + "loss": 0.373, + "step": 10937 + }, + { + "epoch": 1.7122730118973073, + "grad_norm": 3.1452574729919434, + "learning_rate": 1.5094493320299771e-05, + "loss": 1.3186, + "step": 10938 + }, + { + "epoch": 1.7124295554164057, + "grad_norm": 0.45612671971321106, + "learning_rate": 1.508634734441186e-05, + "loss": 0.2549, + "step": 10939 + }, + { + "epoch": 1.7125860989355042, + "grad_norm": 0.7497422695159912, + "learning_rate": 1.507820136852395e-05, + "loss": 0.2717, + "step": 10940 + }, + { + "epoch": 1.7127426424546024, + "grad_norm": 0.5451535582542419, + "learning_rate": 1.5070055392636038e-05, + "loss": 0.2058, + "step": 10941 + }, + { + "epoch": 1.7128991859737006, + "grad_norm": 0.5076204538345337, + "learning_rate": 1.5061909416748127e-05, + "loss": 0.2021, + "step": 10942 + }, + { + "epoch": 1.713055729492799, + "grad_norm": 0.5423101782798767, + "learning_rate": 1.5053763440860215e-05, + "loss": 0.2484, + "step": 10943 + }, + { + "epoch": 1.7132122730118973, + "grad_norm": 0.4852851331233978, + "learning_rate": 1.5045617464972303e-05, + "loss": 0.1674, + "step": 10944 + }, + { + "epoch": 1.7133688165309957, + "grad_norm": 0.5821298360824585, + "learning_rate": 1.5037471489084393e-05, + "loss": 0.2903, + "step": 10945 + }, + { + "epoch": 1.713525360050094, + "grad_norm": 1.0704151391983032, + "learning_rate": 1.5029325513196482e-05, + "loss": 0.1795, + "step": 10946 + }, + { + "epoch": 1.7136819035691921, + "grad_norm": 0.5721018314361572, + "learning_rate": 1.5021179537308569e-05, + "loss": 0.1555, + "step": 10947 + }, + { + "epoch": 1.7138384470882906, + "grad_norm": 1.9287538528442383, + "learning_rate": 1.5013033561420658e-05, + "loss": 0.2743, + "step": 10948 + }, + { + "epoch": 
1.7139949906073888, + "grad_norm": 1.4354498386383057, + "learning_rate": 1.5004887585532748e-05, + "loss": 0.3259, + "step": 10949 + }, + { + "epoch": 1.7141515341264872, + "grad_norm": 0.9900325536727905, + "learning_rate": 1.4996741609644836e-05, + "loss": 0.2993, + "step": 10950 + }, + { + "epoch": 1.7143080776455855, + "grad_norm": 1.0948717594146729, + "learning_rate": 1.4988595633756924e-05, + "loss": 0.2298, + "step": 10951 + }, + { + "epoch": 1.7144646211646837, + "grad_norm": 2.187636375427246, + "learning_rate": 1.4980449657869013e-05, + "loss": 0.459, + "step": 10952 + }, + { + "epoch": 1.7146211646837821, + "grad_norm": 2.632906198501587, + "learning_rate": 1.49723036819811e-05, + "loss": 0.4231, + "step": 10953 + }, + { + "epoch": 1.7147777082028806, + "grad_norm": 2.1041159629821777, + "learning_rate": 1.4964157706093191e-05, + "loss": 0.4968, + "step": 10954 + }, + { + "epoch": 1.7149342517219788, + "grad_norm": 1.2961403131484985, + "learning_rate": 1.495601173020528e-05, + "loss": 0.2704, + "step": 10955 + }, + { + "epoch": 1.715090795241077, + "grad_norm": 1.7437032461166382, + "learning_rate": 1.4947865754317367e-05, + "loss": 0.5442, + "step": 10956 + }, + { + "epoch": 1.7152473387601752, + "grad_norm": 3.8962855339050293, + "learning_rate": 1.4939719778429456e-05, + "loss": 0.6939, + "step": 10957 + }, + { + "epoch": 1.7154038822792737, + "grad_norm": 2.2071609497070312, + "learning_rate": 1.4931573802541546e-05, + "loss": 0.8029, + "step": 10958 + }, + { + "epoch": 1.715560425798372, + "grad_norm": 2.4164621829986572, + "learning_rate": 1.4923427826653633e-05, + "loss": 0.4992, + "step": 10959 + }, + { + "epoch": 1.7157169693174703, + "grad_norm": 1.3851954936981201, + "learning_rate": 1.4915281850765722e-05, + "loss": 0.3692, + "step": 10960 + }, + { + "epoch": 1.7158735128365685, + "grad_norm": 7.847949981689453, + "learning_rate": 1.4907135874877811e-05, + "loss": 0.5723, + "step": 10961 + }, + { + "epoch": 1.7160300563556667, + 
"grad_norm": 1.478545069694519, + "learning_rate": 1.4898989898989898e-05, + "loss": 0.3232, + "step": 10962 + }, + { + "epoch": 1.7161865998747652, + "grad_norm": 1.811725378036499, + "learning_rate": 1.4890843923101989e-05, + "loss": 0.4915, + "step": 10963 + }, + { + "epoch": 1.7163431433938636, + "grad_norm": 4.678177833557129, + "learning_rate": 1.4882697947214078e-05, + "loss": 0.7376, + "step": 10964 + }, + { + "epoch": 1.7164996869129618, + "grad_norm": 1.2394559383392334, + "learning_rate": 1.4874551971326165e-05, + "loss": 0.4326, + "step": 10965 + }, + { + "epoch": 1.71665623043206, + "grad_norm": 1.687760829925537, + "learning_rate": 1.4866405995438253e-05, + "loss": 0.7007, + "step": 10966 + }, + { + "epoch": 1.7168127739511583, + "grad_norm": 2.1334099769592285, + "learning_rate": 1.4858260019550344e-05, + "loss": 0.714, + "step": 10967 + }, + { + "epoch": 1.7169693174702567, + "grad_norm": 2.873534917831421, + "learning_rate": 1.4850114043662431e-05, + "loss": 0.568, + "step": 10968 + }, + { + "epoch": 1.7171258609893552, + "grad_norm": 2.2236111164093018, + "learning_rate": 1.484196806777452e-05, + "loss": 0.566, + "step": 10969 + }, + { + "epoch": 1.7172824045084534, + "grad_norm": 3.711664915084839, + "learning_rate": 1.4833822091886609e-05, + "loss": 0.8507, + "step": 10970 + }, + { + "epoch": 1.7174389480275516, + "grad_norm": 2.522130012512207, + "learning_rate": 1.4825676115998696e-05, + "loss": 0.8755, + "step": 10971 + }, + { + "epoch": 1.7175954915466498, + "grad_norm": 3.3438189029693604, + "learning_rate": 1.4817530140110786e-05, + "loss": 0.7229, + "step": 10972 + }, + { + "epoch": 1.7177520350657483, + "grad_norm": 2.2689123153686523, + "learning_rate": 1.4809384164222875e-05, + "loss": 0.9623, + "step": 10973 + }, + { + "epoch": 1.7179085785848467, + "grad_norm": 2.945545196533203, + "learning_rate": 1.4801238188334962e-05, + "loss": 1.1212, + "step": 10974 + }, + { + "epoch": 1.718065122103945, + "grad_norm": 3.309260368347168, + 
"learning_rate": 1.4793092212447051e-05, + "loss": 0.6426, + "step": 10975 + }, + { + "epoch": 1.7182216656230431, + "grad_norm": 3.0183310508728027, + "learning_rate": 1.4784946236559142e-05, + "loss": 0.745, + "step": 10976 + }, + { + "epoch": 1.7183782091421416, + "grad_norm": 2.770991563796997, + "learning_rate": 1.4776800260671227e-05, + "loss": 0.9107, + "step": 10977 + }, + { + "epoch": 1.7185347526612398, + "grad_norm": 4.434696674346924, + "learning_rate": 1.4768654284783318e-05, + "loss": 1.1134, + "step": 10978 + }, + { + "epoch": 1.7186912961803382, + "grad_norm": 4.271946907043457, + "learning_rate": 1.4760508308895407e-05, + "loss": 0.8328, + "step": 10979 + }, + { + "epoch": 1.7188478396994364, + "grad_norm": 3.5022871494293213, + "learning_rate": 1.4752362333007494e-05, + "loss": 0.9531, + "step": 10980 + }, + { + "epoch": 1.7190043832185347, + "grad_norm": 3.2325711250305176, + "learning_rate": 1.4744216357119584e-05, + "loss": 1.1734, + "step": 10981 + }, + { + "epoch": 1.719160926737633, + "grad_norm": 2.6976304054260254, + "learning_rate": 1.4736070381231673e-05, + "loss": 0.3671, + "step": 10982 + }, + { + "epoch": 1.7193174702567313, + "grad_norm": 3.849717140197754, + "learning_rate": 1.472792440534376e-05, + "loss": 1.1952, + "step": 10983 + }, + { + "epoch": 1.7194740137758298, + "grad_norm": 4.971611499786377, + "learning_rate": 1.4719778429455849e-05, + "loss": 1.0526, + "step": 10984 + }, + { + "epoch": 1.719630557294928, + "grad_norm": 4.696512699127197, + "learning_rate": 1.471163245356794e-05, + "loss": 0.6794, + "step": 10985 + }, + { + "epoch": 1.7197871008140262, + "grad_norm": 2.6763622760772705, + "learning_rate": 1.4703486477680025e-05, + "loss": 0.7411, + "step": 10986 + }, + { + "epoch": 1.7199436443331246, + "grad_norm": 6.09776496887207, + "learning_rate": 1.4695340501792116e-05, + "loss": 0.7269, + "step": 10987 + }, + { + "epoch": 1.720100187852223, + "grad_norm": 2.5462090969085693, + "learning_rate": 
1.4687194525904204e-05, + "loss": 0.8563, + "step": 10988 + }, + { + "epoch": 1.7202567313713213, + "grad_norm": 0.72677081823349, + "learning_rate": 1.4679048550016292e-05, + "loss": 0.3856, + "step": 10989 + }, + { + "epoch": 1.7204132748904195, + "grad_norm": 0.49740156531333923, + "learning_rate": 1.467090257412838e-05, + "loss": 0.1668, + "step": 10990 + }, + { + "epoch": 1.7205698184095177, + "grad_norm": 0.48872700333595276, + "learning_rate": 1.4662756598240471e-05, + "loss": 0.1723, + "step": 10991 + }, + { + "epoch": 1.7207263619286162, + "grad_norm": 0.6738420128822327, + "learning_rate": 1.4654610622352558e-05, + "loss": 0.1924, + "step": 10992 + }, + { + "epoch": 1.7208829054477146, + "grad_norm": 1.63912832736969, + "learning_rate": 1.4646464646464647e-05, + "loss": 0.375, + "step": 10993 + }, + { + "epoch": 1.7210394489668128, + "grad_norm": 0.640859842300415, + "learning_rate": 1.4638318670576737e-05, + "loss": 0.1848, + "step": 10994 + }, + { + "epoch": 1.721195992485911, + "grad_norm": 0.7115835547447205, + "learning_rate": 1.4630172694688823e-05, + "loss": 0.3098, + "step": 10995 + }, + { + "epoch": 1.7213525360050093, + "grad_norm": 0.561629593372345, + "learning_rate": 1.4622026718800913e-05, + "loss": 0.2376, + "step": 10996 + }, + { + "epoch": 1.7215090795241077, + "grad_norm": 0.9497993588447571, + "learning_rate": 1.4613880742913002e-05, + "loss": 0.2142, + "step": 10997 + }, + { + "epoch": 1.7216656230432061, + "grad_norm": 0.6388746500015259, + "learning_rate": 1.460573476702509e-05, + "loss": 0.2118, + "step": 10998 + }, + { + "epoch": 1.7218221665623044, + "grad_norm": 0.8871611952781677, + "learning_rate": 1.4597588791137178e-05, + "loss": 0.2701, + "step": 10999 + }, + { + "epoch": 1.7219787100814026, + "grad_norm": 1.3162970542907715, + "learning_rate": 1.4589442815249269e-05, + "loss": 0.4085, + "step": 11000 + }, + { + "epoch": 1.7219787100814026, + "eval_loss": 0.4841917157173157, + "eval_runtime": 205.5455, + 
"eval_samples_per_second": 60.245, + "eval_steps_per_second": 3.766, + "eval_wer": 0.30816881654164097, + "step": 11000 + }, + { + "epoch": 1.7221352536005008, + "grad_norm": 1.128441333770752, + "learning_rate": 1.4581296839361356e-05, + "loss": 0.3299, + "step": 11001 + }, + { + "epoch": 1.7222917971195992, + "grad_norm": 1.0532206296920776, + "learning_rate": 1.4573150863473445e-05, + "loss": 0.3221, + "step": 11002 + }, + { + "epoch": 1.7224483406386977, + "grad_norm": 1.980502724647522, + "learning_rate": 1.4565004887585535e-05, + "loss": 0.4077, + "step": 11003 + }, + { + "epoch": 1.722604884157796, + "grad_norm": 1.3393118381500244, + "learning_rate": 1.455685891169762e-05, + "loss": 0.346, + "step": 11004 + }, + { + "epoch": 1.722761427676894, + "grad_norm": 2.3092494010925293, + "learning_rate": 1.4548712935809711e-05, + "loss": 0.4726, + "step": 11005 + }, + { + "epoch": 1.7229179711959923, + "grad_norm": 1.7505921125411987, + "learning_rate": 1.45405669599218e-05, + "loss": 0.4455, + "step": 11006 + }, + { + "epoch": 1.7230745147150908, + "grad_norm": 0.9154909253120422, + "learning_rate": 1.4532420984033887e-05, + "loss": 0.3035, + "step": 11007 + }, + { + "epoch": 1.7232310582341892, + "grad_norm": 3.279909372329712, + "learning_rate": 1.4524275008145976e-05, + "loss": 0.4316, + "step": 11008 + }, + { + "epoch": 1.7233876017532874, + "grad_norm": 2.9538204669952393, + "learning_rate": 1.4516129032258066e-05, + "loss": 0.6006, + "step": 11009 + }, + { + "epoch": 1.7235441452723856, + "grad_norm": 1.6598281860351562, + "learning_rate": 1.4507983056370154e-05, + "loss": 0.6712, + "step": 11010 + }, + { + "epoch": 1.723700688791484, + "grad_norm": 2.2328972816467285, + "learning_rate": 1.4499837080482242e-05, + "loss": 0.5141, + "step": 11011 + }, + { + "epoch": 1.7238572323105823, + "grad_norm": 1.3294422626495361, + "learning_rate": 1.4491691104594331e-05, + "loss": 0.3415, + "step": 11012 + }, + { + "epoch": 1.7240137758296807, + "grad_norm": 
1.6379685401916504, + "learning_rate": 1.4483545128706418e-05, + "loss": 0.5884, + "step": 11013 + }, + { + "epoch": 1.724170319348779, + "grad_norm": 3.4745569229125977, + "learning_rate": 1.4475399152818509e-05, + "loss": 0.9195, + "step": 11014 + }, + { + "epoch": 1.7243268628678772, + "grad_norm": 2.6542863845825195, + "learning_rate": 1.4467253176930598e-05, + "loss": 0.63, + "step": 11015 + }, + { + "epoch": 1.7244834063869756, + "grad_norm": 1.4448120594024658, + "learning_rate": 1.4459107201042685e-05, + "loss": 0.3622, + "step": 11016 + }, + { + "epoch": 1.724639949906074, + "grad_norm": 2.765899896621704, + "learning_rate": 1.4450961225154774e-05, + "loss": 0.4679, + "step": 11017 + }, + { + "epoch": 1.7247964934251723, + "grad_norm": 2.130628824234009, + "learning_rate": 1.4442815249266864e-05, + "loss": 0.4253, + "step": 11018 + }, + { + "epoch": 1.7249530369442705, + "grad_norm": 3.6800875663757324, + "learning_rate": 1.4434669273378951e-05, + "loss": 0.8234, + "step": 11019 + }, + { + "epoch": 1.7251095804633687, + "grad_norm": 2.2524399757385254, + "learning_rate": 1.442652329749104e-05, + "loss": 0.5869, + "step": 11020 + }, + { + "epoch": 1.7252661239824671, + "grad_norm": 1.4299160242080688, + "learning_rate": 1.4418377321603129e-05, + "loss": 0.4709, + "step": 11021 + }, + { + "epoch": 1.7254226675015656, + "grad_norm": 3.5032968521118164, + "learning_rate": 1.4410231345715216e-05, + "loss": 0.5702, + "step": 11022 + }, + { + "epoch": 1.7255792110206638, + "grad_norm": 3.0849051475524902, + "learning_rate": 1.4402085369827307e-05, + "loss": 1.1971, + "step": 11023 + }, + { + "epoch": 1.725735754539762, + "grad_norm": 4.295708656311035, + "learning_rate": 1.4393939393939396e-05, + "loss": 1.4816, + "step": 11024 + }, + { + "epoch": 1.7258922980588602, + "grad_norm": 2.870216131210327, + "learning_rate": 1.4385793418051483e-05, + "loss": 1.1903, + "step": 11025 + }, + { + "epoch": 1.7260488415779587, + "grad_norm": 3.844456911087036, + 
"learning_rate": 1.4377647442163572e-05, + "loss": 0.8682, + "step": 11026 + }, + { + "epoch": 1.7262053850970571, + "grad_norm": 2.001420259475708, + "learning_rate": 1.4369501466275662e-05, + "loss": 0.5794, + "step": 11027 + }, + { + "epoch": 1.7263619286161553, + "grad_norm": 4.810309886932373, + "learning_rate": 1.436135549038775e-05, + "loss": 1.1435, + "step": 11028 + }, + { + "epoch": 1.7265184721352536, + "grad_norm": 5.033125877380371, + "learning_rate": 1.4353209514499838e-05, + "loss": 1.5453, + "step": 11029 + }, + { + "epoch": 1.7266750156543518, + "grad_norm": 2.929131507873535, + "learning_rate": 1.4345063538611927e-05, + "loss": 0.8935, + "step": 11030 + }, + { + "epoch": 1.7268315591734502, + "grad_norm": 2.893683433532715, + "learning_rate": 1.4336917562724014e-05, + "loss": 1.1838, + "step": 11031 + }, + { + "epoch": 1.7269881026925487, + "grad_norm": 1.7176178693771362, + "learning_rate": 1.4328771586836105e-05, + "loss": 0.77, + "step": 11032 + }, + { + "epoch": 1.7271446462116469, + "grad_norm": 3.3683359622955322, + "learning_rate": 1.4320625610948193e-05, + "loss": 1.2434, + "step": 11033 + }, + { + "epoch": 1.727301189730745, + "grad_norm": 2.5576226711273193, + "learning_rate": 1.431247963506028e-05, + "loss": 0.5207, + "step": 11034 + }, + { + "epoch": 1.7274577332498433, + "grad_norm": 3.784641981124878, + "learning_rate": 1.430433365917237e-05, + "loss": 0.6256, + "step": 11035 + }, + { + "epoch": 1.7276142767689417, + "grad_norm": 2.0867278575897217, + "learning_rate": 1.429618768328446e-05, + "loss": 0.8195, + "step": 11036 + }, + { + "epoch": 1.7277708202880402, + "grad_norm": 3.372119903564453, + "learning_rate": 1.4288041707396547e-05, + "loss": 0.8738, + "step": 11037 + }, + { + "epoch": 1.7279273638071384, + "grad_norm": 3.146177053451538, + "learning_rate": 1.4279895731508636e-05, + "loss": 1.0599, + "step": 11038 + }, + { + "epoch": 1.7280839073262366, + "grad_norm": 0.4042954444885254, + "learning_rate": 
1.4271749755620725e-05, + "loss": 0.2017, + "step": 11039 + }, + { + "epoch": 1.7282404508453348, + "grad_norm": 0.5494428873062134, + "learning_rate": 1.4263603779732812e-05, + "loss": 0.214, + "step": 11040 + }, + { + "epoch": 1.7283969943644333, + "grad_norm": 1.4833433628082275, + "learning_rate": 1.4255457803844902e-05, + "loss": 0.2969, + "step": 11041 + }, + { + "epoch": 1.7285535378835317, + "grad_norm": 0.888079047203064, + "learning_rate": 1.4247311827956991e-05, + "loss": 0.1495, + "step": 11042 + }, + { + "epoch": 1.72871008140263, + "grad_norm": 0.888366162776947, + "learning_rate": 1.4239165852069078e-05, + "loss": 0.2224, + "step": 11043 + }, + { + "epoch": 1.7288666249217282, + "grad_norm": 1.6944013833999634, + "learning_rate": 1.4231019876181167e-05, + "loss": 0.4248, + "step": 11044 + }, + { + "epoch": 1.7290231684408266, + "grad_norm": 0.6291303038597107, + "learning_rate": 1.4222873900293258e-05, + "loss": 0.2178, + "step": 11045 + }, + { + "epoch": 1.7291797119599248, + "grad_norm": 0.7930209636688232, + "learning_rate": 1.4214727924405343e-05, + "loss": 0.1705, + "step": 11046 + }, + { + "epoch": 1.7293362554790233, + "grad_norm": 1.0002535581588745, + "learning_rate": 1.4206581948517434e-05, + "loss": 0.3167, + "step": 11047 + }, + { + "epoch": 1.7294927989981215, + "grad_norm": 0.9938116669654846, + "learning_rate": 1.4198435972629522e-05, + "loss": 0.3041, + "step": 11048 + }, + { + "epoch": 1.7296493425172197, + "grad_norm": 2.654334306716919, + "learning_rate": 1.419028999674161e-05, + "loss": 0.3993, + "step": 11049 + }, + { + "epoch": 1.7298058860363181, + "grad_norm": 1.517397403717041, + "learning_rate": 1.41821440208537e-05, + "loss": 0.3594, + "step": 11050 + }, + { + "epoch": 1.7299624295554166, + "grad_norm": 1.2718406915664673, + "learning_rate": 1.4173998044965789e-05, + "loss": 0.373, + "step": 11051 + }, + { + "epoch": 1.7301189730745148, + "grad_norm": 2.755364418029785, + "learning_rate": 1.4165852069077876e-05, + "loss": 
0.4237, + "step": 11052 + }, + { + "epoch": 1.730275516593613, + "grad_norm": 0.8983971476554871, + "learning_rate": 1.4157706093189965e-05, + "loss": 0.3015, + "step": 11053 + }, + { + "epoch": 1.7304320601127112, + "grad_norm": 1.094590425491333, + "learning_rate": 1.4149560117302055e-05, + "loss": 0.6404, + "step": 11054 + }, + { + "epoch": 1.7305886036318097, + "grad_norm": 1.787893295288086, + "learning_rate": 1.4141414141414141e-05, + "loss": 0.537, + "step": 11055 + }, + { + "epoch": 1.730745147150908, + "grad_norm": 1.2837941646575928, + "learning_rate": 1.4133268165526231e-05, + "loss": 0.2834, + "step": 11056 + }, + { + "epoch": 1.7309016906700063, + "grad_norm": 1.1924264430999756, + "learning_rate": 1.412512218963832e-05, + "loss": 0.2456, + "step": 11057 + }, + { + "epoch": 1.7310582341891045, + "grad_norm": 1.761860728263855, + "learning_rate": 1.4116976213750407e-05, + "loss": 0.4907, + "step": 11058 + }, + { + "epoch": 1.7312147777082028, + "grad_norm": 3.3419132232666016, + "learning_rate": 1.4108830237862496e-05, + "loss": 0.6633, + "step": 11059 + }, + { + "epoch": 1.7313713212273012, + "grad_norm": 0.9454516172409058, + "learning_rate": 1.4100684261974587e-05, + "loss": 0.3061, + "step": 11060 + }, + { + "epoch": 1.7315278647463996, + "grad_norm": 1.71548330783844, + "learning_rate": 1.4092538286086674e-05, + "loss": 0.3124, + "step": 11061 + }, + { + "epoch": 1.7316844082654979, + "grad_norm": 1.8046894073486328, + "learning_rate": 1.4084392310198763e-05, + "loss": 0.6363, + "step": 11062 + }, + { + "epoch": 1.731840951784596, + "grad_norm": 2.2335398197174072, + "learning_rate": 1.4076246334310853e-05, + "loss": 0.4444, + "step": 11063 + }, + { + "epoch": 1.7319974953036943, + "grad_norm": 2.4897379875183105, + "learning_rate": 1.4068100358422939e-05, + "loss": 0.9392, + "step": 11064 + }, + { + "epoch": 1.7321540388227927, + "grad_norm": 2.1237893104553223, + "learning_rate": 1.405995438253503e-05, + "loss": 0.6382, + "step": 11065 + }, + { + 
"epoch": 1.7323105823418912, + "grad_norm": 1.4361774921417236, + "learning_rate": 1.4051808406647116e-05, + "loss": 0.5239, + "step": 11066 + }, + { + "epoch": 1.7324671258609894, + "grad_norm": 2.0243310928344727, + "learning_rate": 1.4043662430759205e-05, + "loss": 0.3767, + "step": 11067 + }, + { + "epoch": 1.7326236693800876, + "grad_norm": 2.173426389694214, + "learning_rate": 1.4035516454871294e-05, + "loss": 0.5943, + "step": 11068 + }, + { + "epoch": 1.7327802128991858, + "grad_norm": 2.3418123722076416, + "learning_rate": 1.4027370478983381e-05, + "loss": 0.8858, + "step": 11069 + }, + { + "epoch": 1.7329367564182843, + "grad_norm": 2.048450231552124, + "learning_rate": 1.4019224503095472e-05, + "loss": 0.4687, + "step": 11070 + }, + { + "epoch": 1.7330932999373827, + "grad_norm": 1.155648946762085, + "learning_rate": 1.401107852720756e-05, + "loss": 0.2899, + "step": 11071 + }, + { + "epoch": 1.733249843456481, + "grad_norm": 1.2889111042022705, + "learning_rate": 1.4002932551319648e-05, + "loss": 0.7409, + "step": 11072 + }, + { + "epoch": 1.7334063869755791, + "grad_norm": 2.548475980758667, + "learning_rate": 1.3994786575431736e-05, + "loss": 0.518, + "step": 11073 + }, + { + "epoch": 1.7335629304946774, + "grad_norm": 3.5301177501678467, + "learning_rate": 1.3986640599543827e-05, + "loss": 0.5744, + "step": 11074 + }, + { + "epoch": 1.7337194740137758, + "grad_norm": 3.4305171966552734, + "learning_rate": 1.3978494623655914e-05, + "loss": 1.2421, + "step": 11075 + }, + { + "epoch": 1.7338760175328742, + "grad_norm": 4.2517991065979, + "learning_rate": 1.3970348647768003e-05, + "loss": 1.1578, + "step": 11076 + }, + { + "epoch": 1.7340325610519725, + "grad_norm": 2.356567144393921, + "learning_rate": 1.3962202671880092e-05, + "loss": 0.7411, + "step": 11077 + }, + { + "epoch": 1.7341891045710707, + "grad_norm": 2.2395572662353516, + "learning_rate": 1.3954056695992179e-05, + "loss": 0.8106, + "step": 11078 + }, + { + "epoch": 1.7343456480901691, + 
"grad_norm": 2.1255784034729004, + "learning_rate": 1.394591072010427e-05, + "loss": 1.1704, + "step": 11079 + }, + { + "epoch": 1.7345021916092673, + "grad_norm": 7.69715690612793, + "learning_rate": 1.3937764744216358e-05, + "loss": 1.594, + "step": 11080 + }, + { + "epoch": 1.7346587351283658, + "grad_norm": 4.034933567047119, + "learning_rate": 1.3929618768328445e-05, + "loss": 0.7924, + "step": 11081 + }, + { + "epoch": 1.734815278647464, + "grad_norm": 2.7860054969787598, + "learning_rate": 1.3921472792440534e-05, + "loss": 1.4574, + "step": 11082 + }, + { + "epoch": 1.7349718221665622, + "grad_norm": 2.3166098594665527, + "learning_rate": 1.3913326816552625e-05, + "loss": 0.9116, + "step": 11083 + }, + { + "epoch": 1.7351283656856606, + "grad_norm": 3.6081931591033936, + "learning_rate": 1.3905180840664712e-05, + "loss": 0.5904, + "step": 11084 + }, + { + "epoch": 1.735284909204759, + "grad_norm": 2.5938680171966553, + "learning_rate": 1.38970348647768e-05, + "loss": 0.6381, + "step": 11085 + }, + { + "epoch": 1.7354414527238573, + "grad_norm": 1.4842782020568848, + "learning_rate": 1.388888888888889e-05, + "loss": 0.5284, + "step": 11086 + }, + { + "epoch": 1.7355979962429555, + "grad_norm": 2.2379679679870605, + "learning_rate": 1.3880742913000977e-05, + "loss": 0.5046, + "step": 11087 + }, + { + "epoch": 1.7357545397620537, + "grad_norm": 2.281604290008545, + "learning_rate": 1.3872596937113067e-05, + "loss": 1.1319, + "step": 11088 + }, + { + "epoch": 1.7359110832811522, + "grad_norm": 1.0276175737380981, + "learning_rate": 1.3864450961225156e-05, + "loss": 0.2101, + "step": 11089 + }, + { + "epoch": 1.7360676268002506, + "grad_norm": 0.8456549048423767, + "learning_rate": 1.3856304985337243e-05, + "loss": 0.2296, + "step": 11090 + }, + { + "epoch": 1.7362241703193488, + "grad_norm": 0.7368896007537842, + "learning_rate": 1.3848159009449332e-05, + "loss": 0.175, + "step": 11091 + }, + { + "epoch": 1.736380713838447, + "grad_norm": 0.6099885702133179, + 
"learning_rate": 1.3840013033561423e-05, + "loss": 0.2025, + "step": 11092 + }, + { + "epoch": 1.7365372573575453, + "grad_norm": 0.7266945838928223, + "learning_rate": 1.3831867057673508e-05, + "loss": 0.3131, + "step": 11093 + }, + { + "epoch": 1.7366938008766437, + "grad_norm": 0.8554593324661255, + "learning_rate": 1.3823721081785599e-05, + "loss": 0.1465, + "step": 11094 + }, + { + "epoch": 1.7368503443957422, + "grad_norm": 0.543168306350708, + "learning_rate": 1.3815575105897687e-05, + "loss": 0.297, + "step": 11095 + }, + { + "epoch": 1.7370068879148404, + "grad_norm": 1.008362054824829, + "learning_rate": 1.3807429130009774e-05, + "loss": 0.2258, + "step": 11096 + }, + { + "epoch": 1.7371634314339386, + "grad_norm": 1.1930603981018066, + "learning_rate": 1.3799283154121865e-05, + "loss": 0.4065, + "step": 11097 + }, + { + "epoch": 1.7373199749530368, + "grad_norm": 0.9410476088523865, + "learning_rate": 1.3791137178233954e-05, + "loss": 0.3929, + "step": 11098 + }, + { + "epoch": 1.7374765184721352, + "grad_norm": 1.1978623867034912, + "learning_rate": 1.3782991202346041e-05, + "loss": 0.4864, + "step": 11099 + }, + { + "epoch": 1.7376330619912337, + "grad_norm": 1.3115553855895996, + "learning_rate": 1.377484522645813e-05, + "loss": 0.3199, + "step": 11100 + }, + { + "epoch": 1.737789605510332, + "grad_norm": 0.7031318545341492, + "learning_rate": 1.376669925057022e-05, + "loss": 0.2106, + "step": 11101 + }, + { + "epoch": 1.7379461490294301, + "grad_norm": 1.505841612815857, + "learning_rate": 1.3758553274682306e-05, + "loss": 0.2688, + "step": 11102 + }, + { + "epoch": 1.7381026925485283, + "grad_norm": 1.4812792539596558, + "learning_rate": 1.3750407298794396e-05, + "loss": 0.3409, + "step": 11103 + }, + { + "epoch": 1.7382592360676268, + "grad_norm": 1.6648534536361694, + "learning_rate": 1.3742261322906485e-05, + "loss": 0.5269, + "step": 11104 + }, + { + "epoch": 1.7384157795867252, + "grad_norm": 1.1038296222686768, + "learning_rate": 
1.3734115347018572e-05, + "loss": 0.2468, + "step": 11105 + }, + { + "epoch": 1.7385723231058234, + "grad_norm": 1.9155805110931396, + "learning_rate": 1.3725969371130661e-05, + "loss": 0.429, + "step": 11106 + }, + { + "epoch": 1.7387288666249217, + "grad_norm": 1.923473596572876, + "learning_rate": 1.3717823395242752e-05, + "loss": 0.3186, + "step": 11107 + }, + { + "epoch": 1.7388854101440199, + "grad_norm": 1.719010591506958, + "learning_rate": 1.3709677419354839e-05, + "loss": 0.2947, + "step": 11108 + }, + { + "epoch": 1.7390419536631183, + "grad_norm": 2.078354597091675, + "learning_rate": 1.3701531443466928e-05, + "loss": 0.6472, + "step": 11109 + }, + { + "epoch": 1.7391984971822168, + "grad_norm": 1.3921319246292114, + "learning_rate": 1.3693385467579018e-05, + "loss": 0.4766, + "step": 11110 + }, + { + "epoch": 1.739355040701315, + "grad_norm": 1.4101134538650513, + "learning_rate": 1.3685239491691104e-05, + "loss": 0.6361, + "step": 11111 + }, + { + "epoch": 1.7395115842204132, + "grad_norm": 2.111492872238159, + "learning_rate": 1.3677093515803194e-05, + "loss": 0.6464, + "step": 11112 + }, + { + "epoch": 1.7396681277395116, + "grad_norm": 2.003757953643799, + "learning_rate": 1.3668947539915283e-05, + "loss": 0.2802, + "step": 11113 + }, + { + "epoch": 1.7398246712586098, + "grad_norm": 1.3382511138916016, + "learning_rate": 1.366080156402737e-05, + "loss": 0.2563, + "step": 11114 + }, + { + "epoch": 1.7399812147777083, + "grad_norm": 2.5075886249542236, + "learning_rate": 1.3652655588139459e-05, + "loss": 0.7243, + "step": 11115 + }, + { + "epoch": 1.7401377582968065, + "grad_norm": 3.2145419120788574, + "learning_rate": 1.364450961225155e-05, + "loss": 0.6382, + "step": 11116 + }, + { + "epoch": 1.7402943018159047, + "grad_norm": 2.9483301639556885, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.612, + "step": 11117 + }, + { + "epoch": 1.7404508453350032, + "grad_norm": 2.611739158630371, + "learning_rate": 1.3628217660475725e-05, + "loss": 
0.541, + "step": 11118 + }, + { + "epoch": 1.7406073888541016, + "grad_norm": 2.2528750896453857, + "learning_rate": 1.3620071684587816e-05, + "loss": 0.3798, + "step": 11119 + }, + { + "epoch": 1.7407639323731998, + "grad_norm": 5.27735710144043, + "learning_rate": 1.3611925708699901e-05, + "loss": 0.5584, + "step": 11120 + }, + { + "epoch": 1.740920475892298, + "grad_norm": 2.6276936531066895, + "learning_rate": 1.3603779732811992e-05, + "loss": 0.6523, + "step": 11121 + }, + { + "epoch": 1.7410770194113963, + "grad_norm": 3.333059549331665, + "learning_rate": 1.359563375692408e-05, + "loss": 0.7421, + "step": 11122 + }, + { + "epoch": 1.7412335629304947, + "grad_norm": 2.6668002605438232, + "learning_rate": 1.3587487781036168e-05, + "loss": 0.4066, + "step": 11123 + }, + { + "epoch": 1.7413901064495931, + "grad_norm": 2.546180009841919, + "learning_rate": 1.3579341805148257e-05, + "loss": 0.8705, + "step": 11124 + }, + { + "epoch": 1.7415466499686914, + "grad_norm": 2.774930715560913, + "learning_rate": 1.3571195829260347e-05, + "loss": 0.7819, + "step": 11125 + }, + { + "epoch": 1.7417031934877896, + "grad_norm": 3.5177125930786133, + "learning_rate": 1.3563049853372434e-05, + "loss": 0.7683, + "step": 11126 + }, + { + "epoch": 1.7418597370068878, + "grad_norm": 6.536265850067139, + "learning_rate": 1.3554903877484523e-05, + "loss": 0.8697, + "step": 11127 + }, + { + "epoch": 1.7420162805259862, + "grad_norm": 4.45420503616333, + "learning_rate": 1.3546757901596612e-05, + "loss": 1.1992, + "step": 11128 + }, + { + "epoch": 1.7421728240450847, + "grad_norm": 3.7175912857055664, + "learning_rate": 1.3538611925708699e-05, + "loss": 1.4337, + "step": 11129 + }, + { + "epoch": 1.7423293675641829, + "grad_norm": 4.5392889976501465, + "learning_rate": 1.353046594982079e-05, + "loss": 1.2843, + "step": 11130 + }, + { + "epoch": 1.742485911083281, + "grad_norm": 4.926556587219238, + "learning_rate": 1.3522319973932879e-05, + "loss": 0.8924, + "step": 11131 + }, + { + 
"epoch": 1.7426424546023793, + "grad_norm": 1.393261194229126, + "learning_rate": 1.3514173998044966e-05, + "loss": 0.6791, + "step": 11132 + }, + { + "epoch": 1.7427989981214778, + "grad_norm": 3.0996763706207275, + "learning_rate": 1.3506028022157054e-05, + "loss": 1.5759, + "step": 11133 + }, + { + "epoch": 1.7429555416405762, + "grad_norm": 4.321694850921631, + "learning_rate": 1.3497882046269145e-05, + "loss": 0.569, + "step": 11134 + }, + { + "epoch": 1.7431120851596744, + "grad_norm": NaN, + "learning_rate": 1.3497882046269145e-05, + "loss": 0.0, + "step": 11135 + }, + { + "epoch": 1.7432686286787726, + "grad_norm": 1.2365198135375977, + "learning_rate": 1.3489736070381232e-05, + "loss": 0.4467, + "step": 11136 + }, + { + "epoch": 1.7434251721978709, + "grad_norm": 3.3895277976989746, + "learning_rate": 1.3481590094493321e-05, + "loss": 0.8632, + "step": 11137 + }, + { + "epoch": 1.7435817157169693, + "grad_norm": 3.3732104301452637, + "learning_rate": 1.347344411860541e-05, + "loss": 1.267, + "step": 11138 + }, + { + "epoch": 1.7437382592360677, + "grad_norm": 0.6181389093399048, + "learning_rate": 1.3465298142717497e-05, + "loss": 0.25, + "step": 11139 + }, + { + "epoch": 1.743894802755166, + "grad_norm": 0.8748602271080017, + "learning_rate": 1.3457152166829587e-05, + "loss": 0.3692, + "step": 11140 + }, + { + "epoch": 1.7440513462742642, + "grad_norm": 0.6879124045372009, + "learning_rate": 1.3449006190941676e-05, + "loss": 0.1802, + "step": 11141 + }, + { + "epoch": 1.7442078897933626, + "grad_norm": 0.6855342388153076, + "learning_rate": 1.3440860215053763e-05, + "loss": 0.2081, + "step": 11142 + }, + { + "epoch": 1.7443644333124608, + "grad_norm": 0.7994014620780945, + "learning_rate": 1.3432714239165852e-05, + "loss": 0.2225, + "step": 11143 + }, + { + "epoch": 1.7445209768315593, + "grad_norm": 0.7025110125541687, + "learning_rate": 1.3424568263277943e-05, + "loss": 0.2161, + "step": 11144 + }, + { + "epoch": 1.7446775203506575, + "grad_norm": 
0.7902966141700745, + "learning_rate": 1.341642228739003e-05, + "loss": 0.234, + "step": 11145 + }, + { + "epoch": 1.7448340638697557, + "grad_norm": 0.9090819954872131, + "learning_rate": 1.3408276311502119e-05, + "loss": 0.26, + "step": 11146 + }, + { + "epoch": 1.7449906073888541, + "grad_norm": 0.9897305965423584, + "learning_rate": 1.3400130335614208e-05, + "loss": 0.2202, + "step": 11147 + }, + { + "epoch": 1.7451471509079524, + "grad_norm": 0.6712114214897156, + "learning_rate": 1.3391984359726295e-05, + "loss": 0.3345, + "step": 11148 + }, + { + "epoch": 1.7453036944270508, + "grad_norm": 0.7444350719451904, + "learning_rate": 1.3383838383838385e-05, + "loss": 0.2694, + "step": 11149 + }, + { + "epoch": 1.745460237946149, + "grad_norm": 2.302060127258301, + "learning_rate": 1.3375692407950474e-05, + "loss": 0.1876, + "step": 11150 + }, + { + "epoch": 1.7456167814652472, + "grad_norm": 0.5736376047134399, + "learning_rate": 1.3367546432062561e-05, + "loss": 0.156, + "step": 11151 + }, + { + "epoch": 1.7457733249843457, + "grad_norm": 1.694002628326416, + "learning_rate": 1.335940045617465e-05, + "loss": 0.3639, + "step": 11152 + }, + { + "epoch": 1.7459298685034441, + "grad_norm": 1.2187213897705078, + "learning_rate": 1.335125448028674e-05, + "loss": 0.4369, + "step": 11153 + }, + { + "epoch": 1.7460864120225423, + "grad_norm": 1.3513669967651367, + "learning_rate": 1.3343108504398828e-05, + "loss": 0.3336, + "step": 11154 + }, + { + "epoch": 1.7462429555416406, + "grad_norm": 1.3094583749771118, + "learning_rate": 1.3334962528510917e-05, + "loss": 0.4482, + "step": 11155 + }, + { + "epoch": 1.7463994990607388, + "grad_norm": 2.427682638168335, + "learning_rate": 1.3326816552623005e-05, + "loss": 0.6552, + "step": 11156 + }, + { + "epoch": 1.7465560425798372, + "grad_norm": 2.3450098037719727, + "learning_rate": 1.3318670576735093e-05, + "loss": 0.7234, + "step": 11157 + }, + { + "epoch": 1.7467125860989356, + "grad_norm": 2.384768486022949, + 
"learning_rate": 1.3310524600847183e-05, + "loss": 0.6933, + "step": 11158 + }, + { + "epoch": 1.7468691296180339, + "grad_norm": 2.8505797386169434, + "learning_rate": 1.3302378624959272e-05, + "loss": 0.4153, + "step": 11159 + }, + { + "epoch": 1.747025673137132, + "grad_norm": 2.3834028244018555, + "learning_rate": 1.3294232649071359e-05, + "loss": 0.6205, + "step": 11160 + }, + { + "epoch": 1.7471822166562303, + "grad_norm": 1.7528411149978638, + "learning_rate": 1.3286086673183448e-05, + "loss": 0.3473, + "step": 11161 + }, + { + "epoch": 1.7473387601753287, + "grad_norm": 2.0563600063323975, + "learning_rate": 1.3277940697295538e-05, + "loss": 0.554, + "step": 11162 + }, + { + "epoch": 1.7474953036944272, + "grad_norm": 2.3344671726226807, + "learning_rate": 1.3269794721407624e-05, + "loss": 0.641, + "step": 11163 + }, + { + "epoch": 1.7476518472135254, + "grad_norm": 1.1553797721862793, + "learning_rate": 1.3261648745519714e-05, + "loss": 0.3614, + "step": 11164 + }, + { + "epoch": 1.7478083907326236, + "grad_norm": 3.7909886837005615, + "learning_rate": 1.3253502769631803e-05, + "loss": 0.7886, + "step": 11165 + }, + { + "epoch": 1.7479649342517218, + "grad_norm": 3.521713972091675, + "learning_rate": 1.324535679374389e-05, + "loss": 0.6157, + "step": 11166 + }, + { + "epoch": 1.7481214777708203, + "grad_norm": 4.800197124481201, + "learning_rate": 1.323721081785598e-05, + "loss": 0.5225, + "step": 11167 + }, + { + "epoch": 1.7482780212899187, + "grad_norm": 2.712125301361084, + "learning_rate": 1.322906484196807e-05, + "loss": 0.9144, + "step": 11168 + }, + { + "epoch": 1.748434564809017, + "grad_norm": 3.0555615425109863, + "learning_rate": 1.3220918866080157e-05, + "loss": 0.6644, + "step": 11169 + }, + { + "epoch": 1.7485911083281152, + "grad_norm": 2.5651137828826904, + "learning_rate": 1.3212772890192246e-05, + "loss": 0.5844, + "step": 11170 + }, + { + "epoch": 1.7487476518472134, + "grad_norm": 2.2256851196289062, + "learning_rate": 
1.3204626914304336e-05, + "loss": 0.872, + "step": 11171 + }, + { + "epoch": 1.7489041953663118, + "grad_norm": 3.0463991165161133, + "learning_rate": 1.3196480938416422e-05, + "loss": 0.8496, + "step": 11172 + }, + { + "epoch": 1.7490607388854102, + "grad_norm": 2.5116875171661377, + "learning_rate": 1.3188334962528512e-05, + "loss": 0.5277, + "step": 11173 + }, + { + "epoch": 1.7492172824045085, + "grad_norm": 2.287644624710083, + "learning_rate": 1.3180188986640601e-05, + "loss": 0.6614, + "step": 11174 + }, + { + "epoch": 1.7493738259236067, + "grad_norm": 2.644148826599121, + "learning_rate": 1.3172043010752688e-05, + "loss": 0.7361, + "step": 11175 + }, + { + "epoch": 1.7495303694427051, + "grad_norm": 5.015182971954346, + "learning_rate": 1.3163897034864777e-05, + "loss": 1.2309, + "step": 11176 + }, + { + "epoch": 1.7496869129618033, + "grad_norm": 6.205677509307861, + "learning_rate": 1.3155751058976867e-05, + "loss": 0.9968, + "step": 11177 + }, + { + "epoch": 1.7498434564809018, + "grad_norm": 2.3275938034057617, + "learning_rate": 1.3147605083088955e-05, + "loss": 0.9352, + "step": 11178 + }, + { + "epoch": 1.75, + "grad_norm": 1.8987526893615723, + "learning_rate": 1.3139459107201043e-05, + "loss": 0.4498, + "step": 11179 + }, + { + "epoch": 1.7501565435190982, + "grad_norm": 3.932171583175659, + "learning_rate": 1.3131313131313134e-05, + "loss": 0.881, + "step": 11180 + }, + { + "epoch": 1.7503130870381967, + "grad_norm": 2.8132128715515137, + "learning_rate": 1.312316715542522e-05, + "loss": 0.783, + "step": 11181 + }, + { + "epoch": 1.7504696305572949, + "grad_norm": 2.528136968612671, + "learning_rate": 1.311502117953731e-05, + "loss": 1.1264, + "step": 11182 + }, + { + "epoch": 1.7506261740763933, + "grad_norm": 3.0253169536590576, + "learning_rate": 1.3106875203649399e-05, + "loss": 1.0508, + "step": 11183 + }, + { + "epoch": 1.7507827175954915, + "grad_norm": 2.7386341094970703, + "learning_rate": 1.3098729227761486e-05, + "loss": 0.618, + 
"step": 11184 + }, + { + "epoch": 1.7509392611145898, + "grad_norm": 2.966038465499878, + "learning_rate": 1.3090583251873575e-05, + "loss": 0.9545, + "step": 11185 + }, + { + "epoch": 1.7510958046336882, + "grad_norm": 2.2485854625701904, + "learning_rate": 1.3082437275985665e-05, + "loss": 0.7687, + "step": 11186 + }, + { + "epoch": 1.7512523481527866, + "grad_norm": 3.2845797538757324, + "learning_rate": 1.3074291300097752e-05, + "loss": 0.6586, + "step": 11187 + }, + { + "epoch": 1.7514088916718848, + "grad_norm": 5.057204723358154, + "learning_rate": 1.3066145324209841e-05, + "loss": 0.988, + "step": 11188 + }, + { + "epoch": 1.751565435190983, + "grad_norm": 0.39695680141448975, + "learning_rate": 1.3057999348321932e-05, + "loss": 0.1492, + "step": 11189 + }, + { + "epoch": 1.7517219787100813, + "grad_norm": 0.45288753509521484, + "learning_rate": 1.3049853372434017e-05, + "loss": 0.2337, + "step": 11190 + }, + { + "epoch": 1.7518785222291797, + "grad_norm": 1.722946286201477, + "learning_rate": 1.3041707396546108e-05, + "loss": 0.3239, + "step": 11191 + }, + { + "epoch": 1.7520350657482782, + "grad_norm": 0.5941222310066223, + "learning_rate": 1.3033561420658197e-05, + "loss": 0.2035, + "step": 11192 + }, + { + "epoch": 1.7521916092673764, + "grad_norm": 0.5723540186882019, + "learning_rate": 1.3025415444770284e-05, + "loss": 0.2434, + "step": 11193 + }, + { + "epoch": 1.7523481527864746, + "grad_norm": 0.8469648361206055, + "learning_rate": 1.3017269468882373e-05, + "loss": 0.2789, + "step": 11194 + }, + { + "epoch": 1.7525046963055728, + "grad_norm": 0.865783154964447, + "learning_rate": 1.300912349299446e-05, + "loss": 0.2283, + "step": 11195 + }, + { + "epoch": 1.7526612398246713, + "grad_norm": 0.709560215473175, + "learning_rate": 1.300097751710655e-05, + "loss": 0.2813, + "step": 11196 + }, + { + "epoch": 1.7528177833437697, + "grad_norm": 1.9800435304641724, + "learning_rate": 1.2992831541218639e-05, + "loss": 0.4627, + "step": 11197 + }, + { + 
"epoch": 1.752974326862868, + "grad_norm": 0.8802345991134644, + "learning_rate": 1.2984685565330726e-05, + "loss": 0.2265, + "step": 11198 + }, + { + "epoch": 1.7531308703819661, + "grad_norm": 1.2330639362335205, + "learning_rate": 1.2976539589442815e-05, + "loss": 0.3693, + "step": 11199 + }, + { + "epoch": 1.7532874139010644, + "grad_norm": 4.89985466003418, + "learning_rate": 1.2968393613554905e-05, + "loss": 0.2873, + "step": 11200 + }, + { + "epoch": 1.7534439574201628, + "grad_norm": 0.8723368048667908, + "learning_rate": 1.2960247637666993e-05, + "loss": 0.3499, + "step": 11201 + }, + { + "epoch": 1.7536005009392612, + "grad_norm": 1.6780978441238403, + "learning_rate": 1.2952101661779081e-05, + "loss": 0.5462, + "step": 11202 + }, + { + "epoch": 1.7537570444583594, + "grad_norm": 1.6838902235031128, + "learning_rate": 1.294395568589117e-05, + "loss": 0.2689, + "step": 11203 + }, + { + "epoch": 1.7539135879774577, + "grad_norm": 1.0043216943740845, + "learning_rate": 1.2935809710003257e-05, + "loss": 0.5139, + "step": 11204 + }, + { + "epoch": 1.7540701314965559, + "grad_norm": 1.2944315671920776, + "learning_rate": 1.2927663734115348e-05, + "loss": 0.3513, + "step": 11205 + }, + { + "epoch": 1.7542266750156543, + "grad_norm": 1.0446572303771973, + "learning_rate": 1.2919517758227437e-05, + "loss": 0.2774, + "step": 11206 + }, + { + "epoch": 1.7543832185347528, + "grad_norm": 0.7330520749092102, + "learning_rate": 1.2911371782339524e-05, + "loss": 0.2587, + "step": 11207 + }, + { + "epoch": 1.754539762053851, + "grad_norm": 2.2921817302703857, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.5442, + "step": 11208 + }, + { + "epoch": 1.7546963055729492, + "grad_norm": 4.207590103149414, + "learning_rate": 1.2895079830563703e-05, + "loss": 0.6167, + "step": 11209 + }, + { + "epoch": 1.7548528490920476, + "grad_norm": 3.3627188205718994, + "learning_rate": 1.2886933854675789e-05, + "loss": 0.4575, + "step": 11210 + }, + { + "epoch": 1.7550093926111459, 
+ "grad_norm": 2.2908740043640137, + "learning_rate": 1.287878787878788e-05, + "loss": 0.3545, + "step": 11211 + }, + { + "epoch": 1.7551659361302443, + "grad_norm": 2.2918057441711426, + "learning_rate": 1.2870641902899968e-05, + "loss": 0.6062, + "step": 11212 + }, + { + "epoch": 1.7553224796493425, + "grad_norm": 1.8927359580993652, + "learning_rate": 1.2862495927012055e-05, + "loss": 0.5679, + "step": 11213 + }, + { + "epoch": 1.7554790231684407, + "grad_norm": 1.981913447380066, + "learning_rate": 1.2854349951124146e-05, + "loss": 0.4783, + "step": 11214 + }, + { + "epoch": 1.7556355666875392, + "grad_norm": 2.2491676807403564, + "learning_rate": 1.2846203975236235e-05, + "loss": 0.3974, + "step": 11215 + }, + { + "epoch": 1.7557921102066374, + "grad_norm": 2.2772786617279053, + "learning_rate": 1.2838057999348322e-05, + "loss": 0.3666, + "step": 11216 + }, + { + "epoch": 1.7559486537257358, + "grad_norm": 14.906514167785645, + "learning_rate": 1.282991202346041e-05, + "loss": 0.684, + "step": 11217 + }, + { + "epoch": 1.756105197244834, + "grad_norm": 1.3710500001907349, + "learning_rate": 1.2821766047572501e-05, + "loss": 0.3394, + "step": 11218 + }, + { + "epoch": 1.7562617407639323, + "grad_norm": 1.9396122694015503, + "learning_rate": 1.2813620071684587e-05, + "loss": 0.623, + "step": 11219 + }, + { + "epoch": 1.7564182842830307, + "grad_norm": 3.4568750858306885, + "learning_rate": 1.2805474095796677e-05, + "loss": 0.6246, + "step": 11220 + }, + { + "epoch": 1.7565748278021291, + "grad_norm": 2.413411855697632, + "learning_rate": 1.2797328119908766e-05, + "loss": 0.7009, + "step": 11221 + }, + { + "epoch": 1.7567313713212274, + "grad_norm": 3.460231065750122, + "learning_rate": 1.2789182144020853e-05, + "loss": 0.8002, + "step": 11222 + }, + { + "epoch": 1.7568879148403256, + "grad_norm": 3.4229321479797363, + "learning_rate": 1.2781036168132944e-05, + "loss": 1.0475, + "step": 11223 + }, + { + "epoch": 1.7570444583594238, + "grad_norm": 
1.9202378988265991, + "learning_rate": 1.2772890192245032e-05, + "loss": 0.5262, + "step": 11224 + }, + { + "epoch": 1.7572010018785222, + "grad_norm": 3.94421124458313, + "learning_rate": 1.276474421635712e-05, + "loss": 0.6533, + "step": 11225 + }, + { + "epoch": 1.7573575453976207, + "grad_norm": 1.7386069297790527, + "learning_rate": 1.2756598240469208e-05, + "loss": 0.3521, + "step": 11226 + }, + { + "epoch": 1.757514088916719, + "grad_norm": 2.67561411857605, + "learning_rate": 1.2748452264581299e-05, + "loss": 0.6668, + "step": 11227 + }, + { + "epoch": 1.7576706324358171, + "grad_norm": 6.835404396057129, + "learning_rate": 1.2740306288693384e-05, + "loss": 0.9724, + "step": 11228 + }, + { + "epoch": 1.7578271759549153, + "grad_norm": 2.9949982166290283, + "learning_rate": 1.2732160312805475e-05, + "loss": 1.5257, + "step": 11229 + }, + { + "epoch": 1.7579837194740138, + "grad_norm": 3.2012109756469727, + "learning_rate": 1.2724014336917564e-05, + "loss": 1.4039, + "step": 11230 + }, + { + "epoch": 1.7581402629931122, + "grad_norm": 5.6498847007751465, + "learning_rate": 1.271586836102965e-05, + "loss": 1.2329, + "step": 11231 + }, + { + "epoch": 1.7582968065122104, + "grad_norm": 2.866100311279297, + "learning_rate": 1.270772238514174e-05, + "loss": 0.6026, + "step": 11232 + }, + { + "epoch": 1.7584533500313086, + "grad_norm": 2.356135845184326, + "learning_rate": 1.269957640925383e-05, + "loss": 0.6573, + "step": 11233 + }, + { + "epoch": 1.7586098935504069, + "grad_norm": 1.678078055381775, + "learning_rate": 1.2691430433365917e-05, + "loss": 0.3628, + "step": 11234 + }, + { + "epoch": 1.7587664370695053, + "grad_norm": 3.0054755210876465, + "learning_rate": 1.2683284457478006e-05, + "loss": 0.6406, + "step": 11235 + }, + { + "epoch": 1.7589229805886037, + "grad_norm": 1.6612087488174438, + "learning_rate": 1.2675138481590097e-05, + "loss": 0.6679, + "step": 11236 + }, + { + "epoch": 1.759079524107702, + "grad_norm": 4.29382848739624, + "learning_rate": 
1.2666992505702182e-05, + "loss": 0.9542, + "step": 11237 + }, + { + "epoch": 1.7592360676268002, + "grad_norm": 5.598333835601807, + "learning_rate": 1.2658846529814273e-05, + "loss": 1.7399, + "step": 11238 + }, + { + "epoch": 1.7593926111458984, + "grad_norm": 0.7009952068328857, + "learning_rate": 1.2650700553926361e-05, + "loss": 0.2084, + "step": 11239 + }, + { + "epoch": 1.7595491546649968, + "grad_norm": 0.5526193976402283, + "learning_rate": 1.2642554578038449e-05, + "loss": 0.1885, + "step": 11240 + }, + { + "epoch": 1.7597056981840953, + "grad_norm": 0.5002678632736206, + "learning_rate": 1.2634408602150537e-05, + "loss": 0.1995, + "step": 11241 + }, + { + "epoch": 1.7598622417031935, + "grad_norm": 4.546533584594727, + "learning_rate": 1.2626262626262628e-05, + "loss": 0.5571, + "step": 11242 + }, + { + "epoch": 1.7600187852222917, + "grad_norm": 0.7882021069526672, + "learning_rate": 1.2618116650374715e-05, + "loss": 0.2593, + "step": 11243 + }, + { + "epoch": 1.7601753287413902, + "grad_norm": 0.6298576593399048, + "learning_rate": 1.2609970674486804e-05, + "loss": 0.2635, + "step": 11244 + }, + { + "epoch": 1.7603318722604884, + "grad_norm": 0.7514098286628723, + "learning_rate": 1.2601824698598893e-05, + "loss": 0.2705, + "step": 11245 + }, + { + "epoch": 1.7604884157795868, + "grad_norm": 1.5332111120224, + "learning_rate": 1.259367872271098e-05, + "loss": 0.2856, + "step": 11246 + }, + { + "epoch": 1.760644959298685, + "grad_norm": 2.279364585876465, + "learning_rate": 1.258553274682307e-05, + "loss": 0.1983, + "step": 11247 + }, + { + "epoch": 1.7608015028177832, + "grad_norm": 0.7114599347114563, + "learning_rate": 1.257738677093516e-05, + "loss": 0.2083, + "step": 11248 + }, + { + "epoch": 1.7609580463368817, + "grad_norm": 0.8126291632652283, + "learning_rate": 1.2569240795047246e-05, + "loss": 0.2382, + "step": 11249 + }, + { + "epoch": 1.7611145898559801, + "grad_norm": 1.3707486391067505, + "learning_rate": 1.2561094819159335e-05, + "loss": 
0.379, + "step": 11250 + }, + { + "epoch": 1.7612711333750783, + "grad_norm": 0.7469150424003601, + "learning_rate": 1.2552948843271426e-05, + "loss": 0.1781, + "step": 11251 + }, + { + "epoch": 1.7614276768941766, + "grad_norm": 1.026343822479248, + "learning_rate": 1.2544802867383513e-05, + "loss": 0.3182, + "step": 11252 + }, + { + "epoch": 1.7615842204132748, + "grad_norm": 1.611232042312622, + "learning_rate": 1.2536656891495602e-05, + "loss": 0.3607, + "step": 11253 + }, + { + "epoch": 1.7617407639323732, + "grad_norm": 1.3361122608184814, + "learning_rate": 1.252851091560769e-05, + "loss": 0.2699, + "step": 11254 + }, + { + "epoch": 1.7618973074514717, + "grad_norm": 1.215887188911438, + "learning_rate": 1.2520364939719778e-05, + "loss": 0.3496, + "step": 11255 + }, + { + "epoch": 1.7620538509705699, + "grad_norm": 4.778668403625488, + "learning_rate": 1.2512218963831868e-05, + "loss": 1.9189, + "step": 11256 + }, + { + "epoch": 1.762210394489668, + "grad_norm": 1.2309569120407104, + "learning_rate": 1.2504072987943957e-05, + "loss": 0.4269, + "step": 11257 + }, + { + "epoch": 1.7623669380087663, + "grad_norm": 0.9424862861633301, + "learning_rate": 1.2495927012056046e-05, + "loss": 0.3103, + "step": 11258 + }, + { + "epoch": 1.7625234815278648, + "grad_norm": 0.8906205892562866, + "learning_rate": 1.2487781036168133e-05, + "loss": 0.3639, + "step": 11259 + }, + { + "epoch": 1.7626800250469632, + "grad_norm": 1.165489912033081, + "learning_rate": 1.2479635060280222e-05, + "loss": 0.3672, + "step": 11260 + }, + { + "epoch": 1.7628365685660614, + "grad_norm": 0.9781424403190613, + "learning_rate": 1.247148908439231e-05, + "loss": 0.4436, + "step": 11261 + }, + { + "epoch": 1.7629931120851596, + "grad_norm": 1.188038945198059, + "learning_rate": 1.24633431085044e-05, + "loss": 0.455, + "step": 11262 + }, + { + "epoch": 1.7631496556042578, + "grad_norm": 2.636016607284546, + "learning_rate": 1.2455197132616488e-05, + "loss": 0.6, + "step": 11263 + }, + { + 
"epoch": 1.7633061991233563, + "grad_norm": 2.086601972579956, + "learning_rate": 1.2447051156728577e-05, + "loss": 0.4569, + "step": 11264 + }, + { + "epoch": 1.7634627426424547, + "grad_norm": 1.721804141998291, + "learning_rate": 1.2438905180840666e-05, + "loss": 0.4622, + "step": 11265 + }, + { + "epoch": 1.763619286161553, + "grad_norm": 1.311772108078003, + "learning_rate": 1.2430759204952753e-05, + "loss": 0.4748, + "step": 11266 + }, + { + "epoch": 1.7637758296806512, + "grad_norm": 1.8208191394805908, + "learning_rate": 1.2422613229064844e-05, + "loss": 0.6635, + "step": 11267 + }, + { + "epoch": 1.7639323731997494, + "grad_norm": 2.027522563934326, + "learning_rate": 1.241446725317693e-05, + "loss": 0.4219, + "step": 11268 + }, + { + "epoch": 1.7640889167188478, + "grad_norm": 1.8633061647415161, + "learning_rate": 1.240632127728902e-05, + "loss": 0.7502, + "step": 11269 + }, + { + "epoch": 1.7642454602379463, + "grad_norm": 3.2549304962158203, + "learning_rate": 1.2398175301401108e-05, + "loss": 0.6208, + "step": 11270 + }, + { + "epoch": 1.7644020037570445, + "grad_norm": 2.55618953704834, + "learning_rate": 1.2390029325513197e-05, + "loss": 0.8272, + "step": 11271 + }, + { + "epoch": 1.7645585472761427, + "grad_norm": 4.388089656829834, + "learning_rate": 1.2381883349625286e-05, + "loss": 0.9732, + "step": 11272 + }, + { + "epoch": 1.764715090795241, + "grad_norm": 3.623124361038208, + "learning_rate": 1.2373737373737375e-05, + "loss": 0.5193, + "step": 11273 + }, + { + "epoch": 1.7648716343143394, + "grad_norm": 2.6143596172332764, + "learning_rate": 1.2365591397849464e-05, + "loss": 1.0493, + "step": 11274 + }, + { + "epoch": 1.7650281778334378, + "grad_norm": 3.1999588012695312, + "learning_rate": 1.2357445421961551e-05, + "loss": 1.0442, + "step": 11275 + }, + { + "epoch": 1.765184721352536, + "grad_norm": 3.1361083984375, + "learning_rate": 1.2349299446073641e-05, + "loss": 1.1158, + "step": 11276 + }, + { + "epoch": 1.7653412648716342, + 
"grad_norm": 9.164917945861816, + "learning_rate": 1.2341153470185729e-05, + "loss": 0.67, + "step": 11277 + }, + { + "epoch": 1.7654978083907327, + "grad_norm": 2.8239083290100098, + "learning_rate": 1.2333007494297817e-05, + "loss": 1.0654, + "step": 11278 + }, + { + "epoch": 1.7656543519098309, + "grad_norm": 3.179630994796753, + "learning_rate": 1.2324861518409906e-05, + "loss": 1.0543, + "step": 11279 + }, + { + "epoch": 1.7658108954289293, + "grad_norm": 4.381399154663086, + "learning_rate": 1.2316715542521995e-05, + "loss": 1.1388, + "step": 11280 + }, + { + "epoch": 1.7659674389480275, + "grad_norm": 3.6765670776367188, + "learning_rate": 1.2308569566634084e-05, + "loss": 0.9212, + "step": 11281 + }, + { + "epoch": 1.7661239824671258, + "grad_norm": 2.6333236694335938, + "learning_rate": 1.2300423590746173e-05, + "loss": 1.2447, + "step": 11282 + }, + { + "epoch": 1.7662805259862242, + "grad_norm": 3.6881089210510254, + "learning_rate": 1.2292277614858262e-05, + "loss": 1.1317, + "step": 11283 + }, + { + "epoch": 1.7664370695053226, + "grad_norm": 8.721012115478516, + "learning_rate": 1.2284131638970349e-05, + "loss": 1.1485, + "step": 11284 + }, + { + "epoch": 1.7665936130244209, + "grad_norm": 1.8992232084274292, + "learning_rate": 1.227598566308244e-05, + "loss": 0.6704, + "step": 11285 + }, + { + "epoch": 1.766750156543519, + "grad_norm": 2.8548076152801514, + "learning_rate": 1.2267839687194526e-05, + "loss": 0.5809, + "step": 11286 + }, + { + "epoch": 1.7669067000626173, + "grad_norm": 2.5647830963134766, + "learning_rate": 1.2259693711306615e-05, + "loss": 0.7082, + "step": 11287 + }, + { + "epoch": 1.7670632435817157, + "grad_norm": 2.630845069885254, + "learning_rate": 1.2251547735418704e-05, + "loss": 0.6867, + "step": 11288 + }, + { + "epoch": 1.7672197871008142, + "grad_norm": 1.1455131769180298, + "learning_rate": 1.2243401759530793e-05, + "loss": 0.2535, + "step": 11289 + }, + { + "epoch": 1.7673763306199124, + "grad_norm": 0.4821718633174896, 
+ "learning_rate": 1.223525578364288e-05, + "loss": 0.1998, + "step": 11290 + }, + { + "epoch": 1.7675328741390106, + "grad_norm": 0.701033890247345, + "learning_rate": 1.2227109807754969e-05, + "loss": 0.2142, + "step": 11291 + }, + { + "epoch": 1.7676894176581088, + "grad_norm": 0.782828152179718, + "learning_rate": 1.2218963831867058e-05, + "loss": 0.2135, + "step": 11292 + }, + { + "epoch": 1.7678459611772073, + "grad_norm": 1.2897509336471558, + "learning_rate": 1.2210817855979146e-05, + "loss": 0.3488, + "step": 11293 + }, + { + "epoch": 1.7680025046963057, + "grad_norm": 0.541743814945221, + "learning_rate": 1.2202671880091235e-05, + "loss": 0.2305, + "step": 11294 + }, + { + "epoch": 1.768159048215404, + "grad_norm": 0.9344268441200256, + "learning_rate": 1.2194525904203324e-05, + "loss": 0.2783, + "step": 11295 + }, + { + "epoch": 1.7683155917345021, + "grad_norm": 1.3051669597625732, + "learning_rate": 1.2186379928315413e-05, + "loss": 0.4397, + "step": 11296 + }, + { + "epoch": 1.7684721352536004, + "grad_norm": 0.8748600482940674, + "learning_rate": 1.21782339524275e-05, + "loss": 0.2923, + "step": 11297 + }, + { + "epoch": 1.7686286787726988, + "grad_norm": 1.0334067344665527, + "learning_rate": 1.217008797653959e-05, + "loss": 0.3404, + "step": 11298 + }, + { + "epoch": 1.7687852222917972, + "grad_norm": 1.1683763265609741, + "learning_rate": 1.2161942000651678e-05, + "loss": 0.3091, + "step": 11299 + }, + { + "epoch": 1.7689417658108955, + "grad_norm": 0.6380636692047119, + "learning_rate": 1.2153796024763767e-05, + "loss": 0.2566, + "step": 11300 + }, + { + "epoch": 1.7690983093299937, + "grad_norm": 1.182677984237671, + "learning_rate": 1.2145650048875855e-05, + "loss": 0.4941, + "step": 11301 + }, + { + "epoch": 1.769254852849092, + "grad_norm": 6.255828380584717, + "learning_rate": 1.2137504072987944e-05, + "loss": 0.3963, + "step": 11302 + }, + { + "epoch": 1.7694113963681903, + "grad_norm": 1.3892639875411987, + "learning_rate": 
1.2129358097100033e-05, + "loss": 0.3052, + "step": 11303 + }, + { + "epoch": 1.7695679398872888, + "grad_norm": 6.636640548706055, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.3471, + "step": 11304 + }, + { + "epoch": 1.769724483406387, + "grad_norm": 0.9157004952430725, + "learning_rate": 1.211306614532421e-05, + "loss": 0.2445, + "step": 11305 + }, + { + "epoch": 1.7698810269254852, + "grad_norm": 2.316617012023926, + "learning_rate": 1.2104920169436298e-05, + "loss": 0.5864, + "step": 11306 + }, + { + "epoch": 1.7700375704445834, + "grad_norm": 3.4656407833099365, + "learning_rate": 1.2096774193548388e-05, + "loss": 0.4024, + "step": 11307 + }, + { + "epoch": 1.7701941139636819, + "grad_norm": 1.9527254104614258, + "learning_rate": 1.2088628217660476e-05, + "loss": 0.6642, + "step": 11308 + }, + { + "epoch": 1.7703506574827803, + "grad_norm": 1.3863706588745117, + "learning_rate": 1.2080482241772564e-05, + "loss": 0.3734, + "step": 11309 + }, + { + "epoch": 1.7705072010018785, + "grad_norm": 2.020282745361328, + "learning_rate": 1.2072336265884653e-05, + "loss": 0.5573, + "step": 11310 + }, + { + "epoch": 1.7706637445209767, + "grad_norm": 1.5206310749053955, + "learning_rate": 1.2064190289996742e-05, + "loss": 0.3658, + "step": 11311 + }, + { + "epoch": 1.7708202880400752, + "grad_norm": 2.116086721420288, + "learning_rate": 1.2056044314108831e-05, + "loss": 0.4899, + "step": 11312 + }, + { + "epoch": 1.7709768315591734, + "grad_norm": 1.4659829139709473, + "learning_rate": 1.204789833822092e-05, + "loss": 0.351, + "step": 11313 + }, + { + "epoch": 1.7711333750782718, + "grad_norm": 1.6455097198486328, + "learning_rate": 1.2039752362333009e-05, + "loss": 0.6155, + "step": 11314 + }, + { + "epoch": 1.77128991859737, + "grad_norm": 1.0801538228988647, + "learning_rate": 1.2031606386445096e-05, + "loss": 0.4512, + "step": 11315 + }, + { + "epoch": 1.7714464621164683, + "grad_norm": 2.813013792037964, + "learning_rate": 1.2023460410557186e-05, + "loss": 
0.6035, + "step": 11316 + }, + { + "epoch": 1.7716030056355667, + "grad_norm": 2.1752471923828125, + "learning_rate": 1.2015314434669273e-05, + "loss": 0.4823, + "step": 11317 + }, + { + "epoch": 1.7717595491546652, + "grad_norm": 4.710559368133545, + "learning_rate": 1.2007168458781362e-05, + "loss": 0.5984, + "step": 11318 + }, + { + "epoch": 1.7719160926737634, + "grad_norm": 1.9553247690200806, + "learning_rate": 1.1999022482893451e-05, + "loss": 0.4747, + "step": 11319 + }, + { + "epoch": 1.7720726361928616, + "grad_norm": 8.441060066223145, + "learning_rate": 1.199087650700554e-05, + "loss": 0.5674, + "step": 11320 + }, + { + "epoch": 1.7722291797119598, + "grad_norm": 2.059373617172241, + "learning_rate": 1.1982730531117629e-05, + "loss": 0.7324, + "step": 11321 + }, + { + "epoch": 1.7723857232310583, + "grad_norm": 2.880218505859375, + "learning_rate": 1.1974584555229718e-05, + "loss": 0.6096, + "step": 11322 + }, + { + "epoch": 1.7725422667501567, + "grad_norm": 4.529618740081787, + "learning_rate": 1.1966438579341806e-05, + "loss": 0.7907, + "step": 11323 + }, + { + "epoch": 1.772698810269255, + "grad_norm": 2.885565757751465, + "learning_rate": 1.1958292603453893e-05, + "loss": 1.0332, + "step": 11324 + }, + { + "epoch": 1.7728553537883531, + "grad_norm": 1.2587612867355347, + "learning_rate": 1.1950146627565984e-05, + "loss": 0.6517, + "step": 11325 + }, + { + "epoch": 1.7730118973074513, + "grad_norm": 3.478365659713745, + "learning_rate": 1.1942000651678071e-05, + "loss": 0.7966, + "step": 11326 + }, + { + "epoch": 1.7731684408265498, + "grad_norm": 1.7801717519760132, + "learning_rate": 1.193385467579016e-05, + "loss": 0.5703, + "step": 11327 + }, + { + "epoch": 1.7733249843456482, + "grad_norm": 2.39738130569458, + "learning_rate": 1.1925708699902249e-05, + "loss": 0.5486, + "step": 11328 + }, + { + "epoch": 1.7734815278647464, + "grad_norm": 2.78174090385437, + "learning_rate": 1.1917562724014338e-05, + "loss": 0.9421, + "step": 11329 + }, + { + 
"epoch": 1.7736380713838447, + "grad_norm": 2.870911121368408, + "learning_rate": 1.1909416748126426e-05, + "loss": 0.4509, + "step": 11330 + }, + { + "epoch": 1.7737946149029429, + "grad_norm": 2.493579387664795, + "learning_rate": 1.1901270772238515e-05, + "loss": 0.9425, + "step": 11331 + }, + { + "epoch": 1.7739511584220413, + "grad_norm": 3.8823800086975098, + "learning_rate": 1.1893124796350604e-05, + "loss": 1.1069, + "step": 11332 + }, + { + "epoch": 1.7741077019411398, + "grad_norm": 2.4571962356567383, + "learning_rate": 1.1884978820462691e-05, + "loss": 1.3737, + "step": 11333 + }, + { + "epoch": 1.774264245460238, + "grad_norm": 3.6859283447265625, + "learning_rate": 1.1876832844574782e-05, + "loss": 0.7225, + "step": 11334 + }, + { + "epoch": 1.7744207889793362, + "grad_norm": 4.111556529998779, + "learning_rate": 1.1868686868686869e-05, + "loss": 0.3971, + "step": 11335 + }, + { + "epoch": 1.7745773324984344, + "grad_norm": 5.8049726486206055, + "learning_rate": 1.1860540892798958e-05, + "loss": 0.7044, + "step": 11336 + }, + { + "epoch": 1.7747338760175329, + "grad_norm": 3.2043817043304443, + "learning_rate": 1.1852394916911047e-05, + "loss": 1.3356, + "step": 11337 + }, + { + "epoch": 1.7748904195366313, + "grad_norm": 1.5727969408035278, + "learning_rate": 1.1844248941023135e-05, + "loss": 0.9351, + "step": 11338 + }, + { + "epoch": 1.7750469630557295, + "grad_norm": 0.5627408027648926, + "learning_rate": 1.1836102965135224e-05, + "loss": 0.2962, + "step": 11339 + }, + { + "epoch": 1.7752035065748277, + "grad_norm": 1.10183846950531, + "learning_rate": 1.1827956989247313e-05, + "loss": 0.2025, + "step": 11340 + }, + { + "epoch": 1.775360050093926, + "grad_norm": 0.6861681342124939, + "learning_rate": 1.1819811013359402e-05, + "loss": 0.2076, + "step": 11341 + }, + { + "epoch": 1.7755165936130244, + "grad_norm": 1.1159563064575195, + "learning_rate": 1.1811665037471489e-05, + "loss": 0.1948, + "step": 11342 + }, + { + "epoch": 1.7756731371321228, + 
"grad_norm": 1.1723508834838867, + "learning_rate": 1.180351906158358e-05, + "loss": 0.3421, + "step": 11343 + }, + { + "epoch": 1.775829680651221, + "grad_norm": 0.9879608750343323, + "learning_rate": 1.1795373085695667e-05, + "loss": 0.417, + "step": 11344 + }, + { + "epoch": 1.7759862241703193, + "grad_norm": 0.7462827563285828, + "learning_rate": 1.1787227109807756e-05, + "loss": 0.3255, + "step": 11345 + }, + { + "epoch": 1.7761427676894177, + "grad_norm": 0.8231093883514404, + "learning_rate": 1.1779081133919844e-05, + "loss": 0.2926, + "step": 11346 + }, + { + "epoch": 1.776299311208516, + "grad_norm": 0.637130856513977, + "learning_rate": 1.1770935158031933e-05, + "loss": 0.2798, + "step": 11347 + }, + { + "epoch": 1.7764558547276144, + "grad_norm": 0.844779372215271, + "learning_rate": 1.176278918214402e-05, + "loss": 0.1998, + "step": 11348 + }, + { + "epoch": 1.7766123982467126, + "grad_norm": 1.0674197673797607, + "learning_rate": 1.1754643206256111e-05, + "loss": 0.3538, + "step": 11349 + }, + { + "epoch": 1.7767689417658108, + "grad_norm": 0.9423178434371948, + "learning_rate": 1.17464972303682e-05, + "loss": 0.3272, + "step": 11350 + }, + { + "epoch": 1.7769254852849092, + "grad_norm": 1.456839680671692, + "learning_rate": 1.1738351254480287e-05, + "loss": 0.2627, + "step": 11351 + }, + { + "epoch": 1.7770820288040077, + "grad_norm": 1.0717869997024536, + "learning_rate": 1.1730205278592377e-05, + "loss": 0.2629, + "step": 11352 + }, + { + "epoch": 1.777238572323106, + "grad_norm": 0.9330407977104187, + "learning_rate": 1.1722059302704465e-05, + "loss": 0.2141, + "step": 11353 + }, + { + "epoch": 1.777395115842204, + "grad_norm": 1.4045625925064087, + "learning_rate": 1.1713913326816553e-05, + "loss": 0.317, + "step": 11354 + }, + { + "epoch": 1.7775516593613023, + "grad_norm": 3.684098243713379, + "learning_rate": 1.170576735092864e-05, + "loss": 0.5541, + "step": 11355 + }, + { + "epoch": 1.7777082028804008, + "grad_norm": 2.156986713409424, + 
"learning_rate": 1.1697621375040731e-05, + "loss": 0.485, + "step": 11356 + }, + { + "epoch": 1.7778647463994992, + "grad_norm": 1.4683109521865845, + "learning_rate": 1.1689475399152818e-05, + "loss": 0.4144, + "step": 11357 + }, + { + "epoch": 1.7780212899185974, + "grad_norm": 1.6254905462265015, + "learning_rate": 1.1681329423264907e-05, + "loss": 0.3577, + "step": 11358 + }, + { + "epoch": 1.7781778334376956, + "grad_norm": 1.5611939430236816, + "learning_rate": 1.1673183447376996e-05, + "loss": 0.3565, + "step": 11359 + }, + { + "epoch": 1.7783343769567939, + "grad_norm": 2.9820313453674316, + "learning_rate": 1.1665037471489085e-05, + "loss": 0.5911, + "step": 11360 + }, + { + "epoch": 1.7784909204758923, + "grad_norm": 2.493563413619995, + "learning_rate": 1.1656891495601173e-05, + "loss": 0.5763, + "step": 11361 + }, + { + "epoch": 1.7786474639949907, + "grad_norm": 2.5377328395843506, + "learning_rate": 1.1648745519713262e-05, + "loss": 0.5694, + "step": 11362 + }, + { + "epoch": 1.778804007514089, + "grad_norm": 1.3267650604248047, + "learning_rate": 1.1640599543825351e-05, + "loss": 0.1881, + "step": 11363 + }, + { + "epoch": 1.7789605510331872, + "grad_norm": 1.9823884963989258, + "learning_rate": 1.1632453567937438e-05, + "loss": 0.5385, + "step": 11364 + }, + { + "epoch": 1.7791170945522854, + "grad_norm": 2.353557586669922, + "learning_rate": 1.1624307592049529e-05, + "loss": 0.583, + "step": 11365 + }, + { + "epoch": 1.7792736380713838, + "grad_norm": 2.493476629257202, + "learning_rate": 1.1616161616161616e-05, + "loss": 0.6125, + "step": 11366 + }, + { + "epoch": 1.7794301815904823, + "grad_norm": 1.7654962539672852, + "learning_rate": 1.1608015640273705e-05, + "loss": 0.4925, + "step": 11367 + }, + { + "epoch": 1.7795867251095805, + "grad_norm": 1.8148096799850464, + "learning_rate": 1.1599869664385794e-05, + "loss": 0.4637, + "step": 11368 + }, + { + "epoch": 1.7797432686286787, + "grad_norm": 2.4079103469848633, + "learning_rate": 
1.1591723688497882e-05, + "loss": 0.447, + "step": 11369 + }, + { + "epoch": 1.779899812147777, + "grad_norm": 1.3345768451690674, + "learning_rate": 1.1583577712609971e-05, + "loss": 0.4698, + "step": 11370 + }, + { + "epoch": 1.7800563556668754, + "grad_norm": 4.009058475494385, + "learning_rate": 1.157543173672206e-05, + "loss": 0.8138, + "step": 11371 + }, + { + "epoch": 1.7802128991859738, + "grad_norm": 3.0272624492645264, + "learning_rate": 1.1567285760834149e-05, + "loss": 0.5279, + "step": 11372 + }, + { + "epoch": 1.780369442705072, + "grad_norm": 2.545057535171509, + "learning_rate": 1.1559139784946236e-05, + "loss": 0.8395, + "step": 11373 + }, + { + "epoch": 1.7805259862241702, + "grad_norm": 4.381741046905518, + "learning_rate": 1.1550993809058327e-05, + "loss": 0.807, + "step": 11374 + }, + { + "epoch": 1.7806825297432687, + "grad_norm": 5.520626544952393, + "learning_rate": 1.1542847833170414e-05, + "loss": 1.1318, + "step": 11375 + }, + { + "epoch": 1.780839073262367, + "grad_norm": 1.848698616027832, + "learning_rate": 1.1534701857282503e-05, + "loss": 0.9281, + "step": 11376 + }, + { + "epoch": 1.7809956167814653, + "grad_norm": 2.5288639068603516, + "learning_rate": 1.1526555881394591e-05, + "loss": 1.0056, + "step": 11377 + }, + { + "epoch": 1.7811521603005636, + "grad_norm": 6.225196838378906, + "learning_rate": 1.151840990550668e-05, + "loss": 1.381, + "step": 11378 + }, + { + "epoch": 1.7813087038196618, + "grad_norm": 2.9526352882385254, + "learning_rate": 1.1510263929618769e-05, + "loss": 0.8465, + "step": 11379 + }, + { + "epoch": 1.7814652473387602, + "grad_norm": 3.226083517074585, + "learning_rate": 1.1502117953730858e-05, + "loss": 1.2561, + "step": 11380 + }, + { + "epoch": 1.7816217908578584, + "grad_norm": 5.500044345855713, + "learning_rate": 1.1493971977842947e-05, + "loss": 1.358, + "step": 11381 + }, + { + "epoch": 1.7817783343769569, + "grad_norm": 5.313345432281494, + "learning_rate": 1.1485826001955034e-05, + "loss": 1.1396, 
+ "step": 11382 + }, + { + "epoch": 1.781934877896055, + "grad_norm": 2.927936315536499, + "learning_rate": 1.1477680026067124e-05, + "loss": 0.911, + "step": 11383 + }, + { + "epoch": 1.7820914214151533, + "grad_norm": 1.690229058265686, + "learning_rate": 1.1469534050179212e-05, + "loss": 0.2426, + "step": 11384 + }, + { + "epoch": 1.7822479649342517, + "grad_norm": 2.472726345062256, + "learning_rate": 1.14613880742913e-05, + "loss": 0.4769, + "step": 11385 + }, + { + "epoch": 1.7824045084533502, + "grad_norm": 7.3422722816467285, + "learning_rate": 1.145324209840339e-05, + "loss": 0.7514, + "step": 11386 + }, + { + "epoch": 1.7825610519724484, + "grad_norm": 1.3262523412704468, + "learning_rate": 1.1445096122515478e-05, + "loss": 0.5605, + "step": 11387 + }, + { + "epoch": 1.7827175954915466, + "grad_norm": 4.813174247741699, + "learning_rate": 1.1436950146627567e-05, + "loss": 0.8202, + "step": 11388 + }, + { + "epoch": 1.7828741390106448, + "grad_norm": 0.4834008514881134, + "learning_rate": 1.1428804170739656e-05, + "loss": 0.2022, + "step": 11389 + }, + { + "epoch": 1.7830306825297433, + "grad_norm": 0.42499566078186035, + "learning_rate": 1.1420658194851745e-05, + "loss": 0.2035, + "step": 11390 + }, + { + "epoch": 1.7831872260488417, + "grad_norm": 0.500664472579956, + "learning_rate": 1.1412512218963832e-05, + "loss": 0.2318, + "step": 11391 + }, + { + "epoch": 1.78334376956794, + "grad_norm": 0.8191183805465698, + "learning_rate": 1.1404366243075922e-05, + "loss": 0.1831, + "step": 11392 + }, + { + "epoch": 1.7835003130870382, + "grad_norm": 1.2701098918914795, + "learning_rate": 1.139622026718801e-05, + "loss": 0.2851, + "step": 11393 + }, + { + "epoch": 1.7836568566061364, + "grad_norm": 0.6897211074829102, + "learning_rate": 1.1388074291300098e-05, + "loss": 0.2197, + "step": 11394 + }, + { + "epoch": 1.7838134001252348, + "grad_norm": 1.1059285402297974, + "learning_rate": 1.1379928315412187e-05, + "loss": 0.2807, + "step": 11395 + }, + { + "epoch": 
1.7839699436443333, + "grad_norm": 0.723738968372345, + "learning_rate": 1.1371782339524276e-05, + "loss": 0.2705, + "step": 11396 + }, + { + "epoch": 1.7841264871634315, + "grad_norm": 0.6553926467895508, + "learning_rate": 1.1363636363636365e-05, + "loss": 0.2282, + "step": 11397 + }, + { + "epoch": 1.7842830306825297, + "grad_norm": 2.486424207687378, + "learning_rate": 1.1355490387748453e-05, + "loss": 0.3524, + "step": 11398 + }, + { + "epoch": 1.784439574201628, + "grad_norm": 0.8778018355369568, + "learning_rate": 1.1347344411860542e-05, + "loss": 0.3092, + "step": 11399 + }, + { + "epoch": 1.7845961177207263, + "grad_norm": 1.9780429601669312, + "learning_rate": 1.133919843597263e-05, + "loss": 0.354, + "step": 11400 + }, + { + "epoch": 1.7847526612398248, + "grad_norm": 0.7541031837463379, + "learning_rate": 1.133105246008472e-05, + "loss": 0.2512, + "step": 11401 + }, + { + "epoch": 1.784909204758923, + "grad_norm": 1.428398847579956, + "learning_rate": 1.1322906484196807e-05, + "loss": 0.3377, + "step": 11402 + }, + { + "epoch": 1.7850657482780212, + "grad_norm": 0.9503200650215149, + "learning_rate": 1.1314760508308896e-05, + "loss": 0.3362, + "step": 11403 + }, + { + "epoch": 1.7852222917971194, + "grad_norm": 1.2927324771881104, + "learning_rate": 1.1306614532420985e-05, + "loss": 0.2797, + "step": 11404 + }, + { + "epoch": 1.7853788353162179, + "grad_norm": 0.945274829864502, + "learning_rate": 1.1298468556533074e-05, + "loss": 0.358, + "step": 11405 + }, + { + "epoch": 1.7855353788353163, + "grad_norm": 2.0974860191345215, + "learning_rate": 1.129032258064516e-05, + "loss": 0.4165, + "step": 11406 + }, + { + "epoch": 1.7856919223544145, + "grad_norm": 2.5062437057495117, + "learning_rate": 1.1282176604757251e-05, + "loss": 0.821, + "step": 11407 + }, + { + "epoch": 1.7858484658735128, + "grad_norm": 3.6130242347717285, + "learning_rate": 1.127403062886934e-05, + "loss": 0.3119, + "step": 11408 + }, + { + "epoch": 1.7860050093926112, + "grad_norm": 
2.189107894897461, + "learning_rate": 1.1265884652981427e-05, + "loss": 0.4642, + "step": 11409 + }, + { + "epoch": 1.7861615529117094, + "grad_norm": 1.6117579936981201, + "learning_rate": 1.1257738677093518e-05, + "loss": 0.4093, + "step": 11410 + }, + { + "epoch": 1.7863180964308079, + "grad_norm": 1.7168275117874146, + "learning_rate": 1.1249592701205605e-05, + "loss": 0.4523, + "step": 11411 + }, + { + "epoch": 1.786474639949906, + "grad_norm": 1.621882438659668, + "learning_rate": 1.1241446725317694e-05, + "loss": 0.6694, + "step": 11412 + }, + { + "epoch": 1.7866311834690043, + "grad_norm": 3.979172945022583, + "learning_rate": 1.1233300749429783e-05, + "loss": 0.5665, + "step": 11413 + }, + { + "epoch": 1.7867877269881027, + "grad_norm": 1.7836036682128906, + "learning_rate": 1.1225154773541871e-05, + "loss": 0.4069, + "step": 11414 + }, + { + "epoch": 1.786944270507201, + "grad_norm": 1.9014168977737427, + "learning_rate": 1.1217008797653959e-05, + "loss": 0.5766, + "step": 11415 + }, + { + "epoch": 1.7871008140262994, + "grad_norm": 2.6798295974731445, + "learning_rate": 1.1208862821766049e-05, + "loss": 0.7028, + "step": 11416 + }, + { + "epoch": 1.7872573575453976, + "grad_norm": 2.207080364227295, + "learning_rate": 1.1200716845878136e-05, + "loss": 0.5121, + "step": 11417 + }, + { + "epoch": 1.7874139010644958, + "grad_norm": 2.4557905197143555, + "learning_rate": 1.1192570869990225e-05, + "loss": 0.6439, + "step": 11418 + }, + { + "epoch": 1.7875704445835943, + "grad_norm": 1.5457416772842407, + "learning_rate": 1.1184424894102314e-05, + "loss": 0.3356, + "step": 11419 + }, + { + "epoch": 1.7877269881026927, + "grad_norm": 3.2951722145080566, + "learning_rate": 1.1176278918214403e-05, + "loss": 0.6463, + "step": 11420 + }, + { + "epoch": 1.787883531621791, + "grad_norm": 1.8164892196655273, + "learning_rate": 1.1168132942326492e-05, + "loss": 0.7731, + "step": 11421 + }, + { + "epoch": 1.7880400751408891, + "grad_norm": 3.3712100982666016, + 
"learning_rate": 1.1159986966438579e-05, + "loss": 0.6665, + "step": 11422 + }, + { + "epoch": 1.7881966186599874, + "grad_norm": 2.2472572326660156, + "learning_rate": 1.115184099055067e-05, + "loss": 0.5819, + "step": 11423 + }, + { + "epoch": 1.7883531621790858, + "grad_norm": 5.180994987487793, + "learning_rate": 1.1143695014662756e-05, + "loss": 0.6629, + "step": 11424 + }, + { + "epoch": 1.7885097056981842, + "grad_norm": 2.1343371868133545, + "learning_rate": 1.1135549038774845e-05, + "loss": 0.7565, + "step": 11425 + }, + { + "epoch": 1.7886662492172825, + "grad_norm": 3.8485007286071777, + "learning_rate": 1.1127403062886934e-05, + "loss": 0.7721, + "step": 11426 + }, + { + "epoch": 1.7888227927363807, + "grad_norm": 3.6648313999176025, + "learning_rate": 1.1119257086999023e-05, + "loss": 1.1196, + "step": 11427 + }, + { + "epoch": 1.788979336255479, + "grad_norm": 3.728807210922241, + "learning_rate": 1.1111111111111112e-05, + "loss": 1.0451, + "step": 11428 + }, + { + "epoch": 1.7891358797745773, + "grad_norm": 11.601908683776855, + "learning_rate": 1.11029651352232e-05, + "loss": 1.4664, + "step": 11429 + }, + { + "epoch": 1.7892924232936758, + "grad_norm": 2.9339852333068848, + "learning_rate": 1.109481915933529e-05, + "loss": 1.1624, + "step": 11430 + }, + { + "epoch": 1.789448966812774, + "grad_norm": 1.7590299844741821, + "learning_rate": 1.1086673183447376e-05, + "loss": 0.7483, + "step": 11431 + }, + { + "epoch": 1.7896055103318722, + "grad_norm": 3.845505714416504, + "learning_rate": 1.1078527207559467e-05, + "loss": 0.8321, + "step": 11432 + }, + { + "epoch": 1.7897620538509704, + "grad_norm": 1.8013699054718018, + "learning_rate": 1.1070381231671554e-05, + "loss": 0.6499, + "step": 11433 + }, + { + "epoch": 1.7899185973700689, + "grad_norm": 3.595322847366333, + "learning_rate": 1.1062235255783643e-05, + "loss": 0.1799, + "step": 11434 + }, + { + "epoch": 1.7900751408891673, + "grad_norm": 6.974082946777344, + "learning_rate": 
1.1054089279895732e-05, + "loss": 0.8715, + "step": 11435 + }, + { + "epoch": 1.7902316844082655, + "grad_norm": 5.303676605224609, + "learning_rate": 1.104594330400782e-05, + "loss": 0.8366, + "step": 11436 + }, + { + "epoch": 1.7903882279273637, + "grad_norm": 3.4770102500915527, + "learning_rate": 1.103779732811991e-05, + "loss": 0.9228, + "step": 11437 + }, + { + "epoch": 1.790544771446462, + "grad_norm": 2.6674957275390625, + "learning_rate": 1.1029651352231998e-05, + "loss": 0.6673, + "step": 11438 + }, + { + "epoch": 1.7907013149655604, + "grad_norm": 0.8222519159317017, + "learning_rate": 1.1021505376344087e-05, + "loss": 0.3726, + "step": 11439 + }, + { + "epoch": 1.7908578584846588, + "grad_norm": 0.6058958172798157, + "learning_rate": 1.1013359400456174e-05, + "loss": 0.2321, + "step": 11440 + }, + { + "epoch": 1.791014402003757, + "grad_norm": 0.6012743711471558, + "learning_rate": 1.1005213424568265e-05, + "loss": 0.2059, + "step": 11441 + }, + { + "epoch": 1.7911709455228553, + "grad_norm": 0.7750794887542725, + "learning_rate": 1.0997067448680352e-05, + "loss": 0.2087, + "step": 11442 + }, + { + "epoch": 1.7913274890419537, + "grad_norm": 0.51323002576828, + "learning_rate": 1.098892147279244e-05, + "loss": 0.1906, + "step": 11443 + }, + { + "epoch": 1.791484032561052, + "grad_norm": 0.9904295206069946, + "learning_rate": 1.098077549690453e-05, + "loss": 0.2871, + "step": 11444 + }, + { + "epoch": 1.7916405760801504, + "grad_norm": 1.6133145093917847, + "learning_rate": 1.0972629521016618e-05, + "loss": 0.2699, + "step": 11445 + }, + { + "epoch": 1.7917971195992486, + "grad_norm": 1.0600115060806274, + "learning_rate": 1.0964483545128707e-05, + "loss": 0.308, + "step": 11446 + }, + { + "epoch": 1.7919536631183468, + "grad_norm": 0.681742250919342, + "learning_rate": 1.0956337569240796e-05, + "loss": 0.3297, + "step": 11447 + }, + { + "epoch": 1.7921102066374452, + "grad_norm": 0.9680487513542175, + "learning_rate": 1.0948191593352885e-05, + "loss": 
0.2106, + "step": 11448 + }, + { + "epoch": 1.7922667501565435, + "grad_norm": 0.6009546518325806, + "learning_rate": 1.0940045617464972e-05, + "loss": 0.2759, + "step": 11449 + }, + { + "epoch": 1.792423293675642, + "grad_norm": 0.8416286110877991, + "learning_rate": 1.0931899641577063e-05, + "loss": 0.1923, + "step": 11450 + }, + { + "epoch": 1.7925798371947401, + "grad_norm": 0.8546473979949951, + "learning_rate": 1.092375366568915e-05, + "loss": 0.3043, + "step": 11451 + }, + { + "epoch": 1.7927363807138383, + "grad_norm": 0.954856812953949, + "learning_rate": 1.0915607689801239e-05, + "loss": 0.3214, + "step": 11452 + }, + { + "epoch": 1.7928929242329368, + "grad_norm": 1.5914499759674072, + "learning_rate": 1.0907461713913327e-05, + "loss": 0.3658, + "step": 11453 + }, + { + "epoch": 1.7930494677520352, + "grad_norm": 2.316108465194702, + "learning_rate": 1.0899315738025416e-05, + "loss": 0.4661, + "step": 11454 + }, + { + "epoch": 1.7932060112711334, + "grad_norm": 1.4224647283554077, + "learning_rate": 1.0891169762137505e-05, + "loss": 0.2579, + "step": 11455 + }, + { + "epoch": 1.7933625547902317, + "grad_norm": 1.1433172225952148, + "learning_rate": 1.0883023786249594e-05, + "loss": 0.2509, + "step": 11456 + }, + { + "epoch": 1.7935190983093299, + "grad_norm": 1.138683557510376, + "learning_rate": 1.0874877810361683e-05, + "loss": 0.376, + "step": 11457 + }, + { + "epoch": 1.7936756418284283, + "grad_norm": 2.624495029449463, + "learning_rate": 1.086673183447377e-05, + "loss": 0.914, + "step": 11458 + }, + { + "epoch": 1.7938321853475268, + "grad_norm": 1.3972575664520264, + "learning_rate": 1.085858585858586e-05, + "loss": 0.4626, + "step": 11459 + }, + { + "epoch": 1.793988728866625, + "grad_norm": 1.786301851272583, + "learning_rate": 1.0850439882697947e-05, + "loss": 0.382, + "step": 11460 + }, + { + "epoch": 1.7941452723857232, + "grad_norm": 1.5706777572631836, + "learning_rate": 1.0842293906810036e-05, + "loss": 0.4227, + "step": 11461 + }, + { + 
"epoch": 1.7943018159048214, + "grad_norm": 1.8428858518600464, + "learning_rate": 1.0834147930922125e-05, + "loss": 0.5098, + "step": 11462 + }, + { + "epoch": 1.7944583594239198, + "grad_norm": 1.930580735206604, + "learning_rate": 1.0826001955034214e-05, + "loss": 0.6691, + "step": 11463 + }, + { + "epoch": 1.7946149029430183, + "grad_norm": 3.0174548625946045, + "learning_rate": 1.0817855979146301e-05, + "loss": 0.4722, + "step": 11464 + }, + { + "epoch": 1.7947714464621165, + "grad_norm": 2.867929220199585, + "learning_rate": 1.0809710003258392e-05, + "loss": 0.8938, + "step": 11465 + }, + { + "epoch": 1.7949279899812147, + "grad_norm": 3.2756412029266357, + "learning_rate": 1.080156402737048e-05, + "loss": 0.4576, + "step": 11466 + }, + { + "epoch": 1.795084533500313, + "grad_norm": 11.233529090881348, + "learning_rate": 1.0793418051482568e-05, + "loss": 0.6885, + "step": 11467 + }, + { + "epoch": 1.7952410770194114, + "grad_norm": 1.4120447635650635, + "learning_rate": 1.0785272075594658e-05, + "loss": 0.4431, + "step": 11468 + }, + { + "epoch": 1.7953976205385098, + "grad_norm": 2.0411007404327393, + "learning_rate": 1.0777126099706745e-05, + "loss": 0.5502, + "step": 11469 + }, + { + "epoch": 1.795554164057608, + "grad_norm": 3.088266134262085, + "learning_rate": 1.0768980123818834e-05, + "loss": 0.611, + "step": 11470 + }, + { + "epoch": 1.7957107075767063, + "grad_norm": 2.3291232585906982, + "learning_rate": 1.0760834147930923e-05, + "loss": 0.8784, + "step": 11471 + }, + { + "epoch": 1.7958672510958045, + "grad_norm": 1.7932875156402588, + "learning_rate": 1.0752688172043012e-05, + "loss": 0.3142, + "step": 11472 + }, + { + "epoch": 1.796023794614903, + "grad_norm": 1.7420480251312256, + "learning_rate": 1.0744542196155099e-05, + "loss": 0.5155, + "step": 11473 + }, + { + "epoch": 1.7961803381340014, + "grad_norm": 3.9839770793914795, + "learning_rate": 1.073639622026719e-05, + "loss": 1.1188, + "step": 11474 + }, + { + "epoch": 1.7963368816530996, + 
"grad_norm": 2.2350587844848633, + "learning_rate": 1.0728250244379277e-05, + "loss": 0.7156, + "step": 11475 + }, + { + "epoch": 1.7964934251721978, + "grad_norm": 2.9410011768341064, + "learning_rate": 1.0720104268491365e-05, + "loss": 0.7034, + "step": 11476 + }, + { + "epoch": 1.7966499686912962, + "grad_norm": 4.064459800720215, + "learning_rate": 1.0711958292603454e-05, + "loss": 1.1777, + "step": 11477 + }, + { + "epoch": 1.7968065122103944, + "grad_norm": 1.6611049175262451, + "learning_rate": 1.0703812316715543e-05, + "loss": 0.5625, + "step": 11478 + }, + { + "epoch": 1.7969630557294929, + "grad_norm": 3.314251661300659, + "learning_rate": 1.0695666340827632e-05, + "loss": 1.2688, + "step": 11479 + }, + { + "epoch": 1.797119599248591, + "grad_norm": 3.4898741245269775, + "learning_rate": 1.068752036493972e-05, + "loss": 1.2398, + "step": 11480 + }, + { + "epoch": 1.7972761427676893, + "grad_norm": 2.4635121822357178, + "learning_rate": 1.067937438905181e-05, + "loss": 0.9398, + "step": 11481 + }, + { + "epoch": 1.7974326862867878, + "grad_norm": 3.593308448791504, + "learning_rate": 1.0671228413163897e-05, + "loss": 1.314, + "step": 11482 + }, + { + "epoch": 1.7975892298058862, + "grad_norm": 4.641136646270752, + "learning_rate": 1.0663082437275986e-05, + "loss": 1.1456, + "step": 11483 + }, + { + "epoch": 1.7977457733249844, + "grad_norm": 2.867286205291748, + "learning_rate": 1.0654936461388074e-05, + "loss": 0.6101, + "step": 11484 + }, + { + "epoch": 1.7979023168440826, + "grad_norm": 1.4076091051101685, + "learning_rate": 1.0646790485500163e-05, + "loss": 0.3793, + "step": 11485 + }, + { + "epoch": 1.7980588603631809, + "grad_norm": 1.5361912250518799, + "learning_rate": 1.0638644509612252e-05, + "loss": 0.2885, + "step": 11486 + }, + { + "epoch": 1.7982154038822793, + "grad_norm": 2.2617743015289307, + "learning_rate": 1.063049853372434e-05, + "loss": 0.4612, + "step": 11487 + }, + { + "epoch": 1.7983719474013777, + "grad_norm": 7.640021800994873, + 
"learning_rate": 1.062235255783643e-05, + "loss": 0.8459, + "step": 11488 + }, + { + "epoch": 1.798528490920476, + "grad_norm": 0.6483803391456604, + "learning_rate": 1.0614206581948517e-05, + "loss": 0.2377, + "step": 11489 + }, + { + "epoch": 1.7986850344395742, + "grad_norm": 1.2947741746902466, + "learning_rate": 1.0606060606060607e-05, + "loss": 0.4415, + "step": 11490 + }, + { + "epoch": 1.7988415779586724, + "grad_norm": 1.7544164657592773, + "learning_rate": 1.0597914630172694e-05, + "loss": 0.2503, + "step": 11491 + }, + { + "epoch": 1.7989981214777708, + "grad_norm": 0.6606625318527222, + "learning_rate": 1.0589768654284783e-05, + "loss": 0.2848, + "step": 11492 + }, + { + "epoch": 1.7991546649968693, + "grad_norm": 1.3329366445541382, + "learning_rate": 1.0581622678396872e-05, + "loss": 0.2349, + "step": 11493 + }, + { + "epoch": 1.7993112085159675, + "grad_norm": 0.9169349670410156, + "learning_rate": 1.0573476702508961e-05, + "loss": 0.1983, + "step": 11494 + }, + { + "epoch": 1.7994677520350657, + "grad_norm": 1.2347450256347656, + "learning_rate": 1.056533072662105e-05, + "loss": 0.2836, + "step": 11495 + }, + { + "epoch": 1.799624295554164, + "grad_norm": 1.5069764852523804, + "learning_rate": 1.0557184750733139e-05, + "loss": 0.2779, + "step": 11496 + }, + { + "epoch": 1.7997808390732624, + "grad_norm": 1.2083748579025269, + "learning_rate": 1.0549038774845227e-05, + "loss": 0.4379, + "step": 11497 + }, + { + "epoch": 1.7999373825923608, + "grad_norm": 1.6139681339263916, + "learning_rate": 1.0540892798957315e-05, + "loss": 0.401, + "step": 11498 + }, + { + "epoch": 1.800093926111459, + "grad_norm": 1.6186563968658447, + "learning_rate": 1.0532746823069405e-05, + "loss": 0.5289, + "step": 11499 + }, + { + "epoch": 1.8002504696305572, + "grad_norm": 1.170892357826233, + "learning_rate": 1.0524600847181492e-05, + "loss": 0.5881, + "step": 11500 + }, + { + "epoch": 1.8004070131496555, + "grad_norm": 0.9061486124992371, + "learning_rate": 
1.0516454871293581e-05, + "loss": 0.2886, + "step": 11501 + }, + { + "epoch": 1.800563556668754, + "grad_norm": 1.2766883373260498, + "learning_rate": 1.050830889540567e-05, + "loss": 0.3561, + "step": 11502 + }, + { + "epoch": 1.8007201001878523, + "grad_norm": 0.9397156238555908, + "learning_rate": 1.0500162919517759e-05, + "loss": 0.4285, + "step": 11503 + }, + { + "epoch": 1.8008766437069506, + "grad_norm": 4.670447826385498, + "learning_rate": 1.0492016943629848e-05, + "loss": 0.3989, + "step": 11504 + }, + { + "epoch": 1.8010331872260488, + "grad_norm": 0.8945463299751282, + "learning_rate": 1.0483870967741936e-05, + "loss": 0.4306, + "step": 11505 + }, + { + "epoch": 1.801189730745147, + "grad_norm": 0.9093192219734192, + "learning_rate": 1.0475724991854025e-05, + "loss": 0.2008, + "step": 11506 + }, + { + "epoch": 1.8013462742642454, + "grad_norm": 1.5513911247253418, + "learning_rate": 1.0467579015966112e-05, + "loss": 0.3673, + "step": 11507 + }, + { + "epoch": 1.8015028177833439, + "grad_norm": 0.9253044724464417, + "learning_rate": 1.0459433040078203e-05, + "loss": 0.3861, + "step": 11508 + }, + { + "epoch": 1.801659361302442, + "grad_norm": 2.2203598022460938, + "learning_rate": 1.045128706419029e-05, + "loss": 0.4454, + "step": 11509 + }, + { + "epoch": 1.8018159048215403, + "grad_norm": 1.532909631729126, + "learning_rate": 1.0443141088302379e-05, + "loss": 0.4374, + "step": 11510 + }, + { + "epoch": 1.8019724483406387, + "grad_norm": 2.5276856422424316, + "learning_rate": 1.0434995112414468e-05, + "loss": 0.5073, + "step": 11511 + }, + { + "epoch": 1.802128991859737, + "grad_norm": 1.6028140783309937, + "learning_rate": 1.0426849136526557e-05, + "loss": 0.4913, + "step": 11512 + }, + { + "epoch": 1.8022855353788354, + "grad_norm": 2.547666072845459, + "learning_rate": 1.0418703160638645e-05, + "loss": 0.6511, + "step": 11513 + }, + { + "epoch": 1.8024420788979336, + "grad_norm": 2.4167640209198, + "learning_rate": 1.0410557184750734e-05, + "loss": 
0.5151, + "step": 11514 + }, + { + "epoch": 1.8025986224170318, + "grad_norm": 2.298388719558716, + "learning_rate": 1.0402411208862823e-05, + "loss": 0.5297, + "step": 11515 + }, + { + "epoch": 1.8027551659361303, + "grad_norm": 9.25503921508789, + "learning_rate": 1.039426523297491e-05, + "loss": 0.8295, + "step": 11516 + }, + { + "epoch": 1.8029117094552287, + "grad_norm": 2.5711140632629395, + "learning_rate": 1.0386119257087e-05, + "loss": 0.4703, + "step": 11517 + }, + { + "epoch": 1.803068252974327, + "grad_norm": 1.804229497909546, + "learning_rate": 1.0377973281199088e-05, + "loss": 0.5517, + "step": 11518 + }, + { + "epoch": 1.8032247964934252, + "grad_norm": 2.847851514816284, + "learning_rate": 1.0369827305311177e-05, + "loss": 0.6392, + "step": 11519 + }, + { + "epoch": 1.8033813400125234, + "grad_norm": 3.6901378631591797, + "learning_rate": 1.0361681329423266e-05, + "loss": 0.9818, + "step": 11520 + }, + { + "epoch": 1.8035378835316218, + "grad_norm": 3.3479650020599365, + "learning_rate": 1.0353535353535354e-05, + "loss": 0.7069, + "step": 11521 + }, + { + "epoch": 1.8036944270507203, + "grad_norm": NaN, + "learning_rate": 1.0353535353535354e-05, + "loss": 0.0, + "step": 11522 + }, + { + "epoch": 1.8038509705698185, + "grad_norm": 2.7159531116485596, + "learning_rate": 1.0345389377647441e-05, + "loss": 0.8101, + "step": 11523 + }, + { + "epoch": 1.8040075140889167, + "grad_norm": 1.8939694166183472, + "learning_rate": 1.0337243401759532e-05, + "loss": 0.6789, + "step": 11524 + }, + { + "epoch": 1.804164057608015, + "grad_norm": 4.862288951873779, + "learning_rate": 1.032909742587162e-05, + "loss": 0.8738, + "step": 11525 + }, + { + "epoch": 1.8043206011271133, + "grad_norm": 3.0275449752807617, + "learning_rate": 1.0320951449983708e-05, + "loss": 0.9892, + "step": 11526 + }, + { + "epoch": 1.8044771446462118, + "grad_norm": 3.271939992904663, + "learning_rate": 1.0312805474095798e-05, + "loss": 1.0818, + "step": 11527 + }, + { + "epoch": 
1.80463368816531, + "grad_norm": 2.7519800662994385, + "learning_rate": 1.0304659498207886e-05, + "loss": 0.7421, + "step": 11528 + }, + { + "epoch": 1.8047902316844082, + "grad_norm": 4.898366451263428, + "learning_rate": 1.0296513522319974e-05, + "loss": 1.2326, + "step": 11529 + }, + { + "epoch": 1.8049467752035064, + "grad_norm": 4.901711463928223, + "learning_rate": 1.0288367546432063e-05, + "loss": 1.1845, + "step": 11530 + }, + { + "epoch": 1.8051033187226049, + "grad_norm": 2.1818737983703613, + "learning_rate": 1.0280221570544152e-05, + "loss": 0.9504, + "step": 11531 + }, + { + "epoch": 1.8052598622417033, + "grad_norm": 2.006155490875244, + "learning_rate": 1.027207559465624e-05, + "loss": 0.9656, + "step": 11532 + }, + { + "epoch": 1.8054164057608015, + "grad_norm": 2.1449594497680664, + "learning_rate": 1.026392961876833e-05, + "loss": 0.5978, + "step": 11533 + }, + { + "epoch": 1.8055729492798998, + "grad_norm": 3.978703022003174, + "learning_rate": 1.0255783642880417e-05, + "loss": 0.6061, + "step": 11534 + }, + { + "epoch": 1.805729492798998, + "grad_norm": 2.416445255279541, + "learning_rate": 1.0247637666992506e-05, + "loss": 0.6576, + "step": 11535 + }, + { + "epoch": 1.8058860363180964, + "grad_norm": 1.5412706136703491, + "learning_rate": 1.0239491691104596e-05, + "loss": 0.4612, + "step": 11536 + }, + { + "epoch": 1.8060425798371949, + "grad_norm": 4.403695106506348, + "learning_rate": 1.0231345715216683e-05, + "loss": 0.6169, + "step": 11537 + }, + { + "epoch": 1.806199123356293, + "grad_norm": 3.3523213863372803, + "learning_rate": 1.0223199739328772e-05, + "loss": 0.9353, + "step": 11538 + }, + { + "epoch": 1.8063556668753913, + "grad_norm": 0.6438788175582886, + "learning_rate": 1.0215053763440861e-05, + "loss": 0.282, + "step": 11539 + }, + { + "epoch": 1.8065122103944895, + "grad_norm": 0.4967971444129944, + "learning_rate": 1.020690778755295e-05, + "loss": 0.2361, + "step": 11540 + }, + { + "epoch": 1.806668753913588, + "grad_norm": 
0.4870172441005707, + "learning_rate": 1.0198761811665037e-05, + "loss": 0.2425, + "step": 11541 + }, + { + "epoch": 1.8068252974326864, + "grad_norm": 0.7733680605888367, + "learning_rate": 1.0190615835777128e-05, + "loss": 0.3016, + "step": 11542 + }, + { + "epoch": 1.8069818409517846, + "grad_norm": 0.7715198993682861, + "learning_rate": 1.0182469859889215e-05, + "loss": 0.3375, + "step": 11543 + }, + { + "epoch": 1.8071383844708828, + "grad_norm": 0.7579755783081055, + "learning_rate": 1.0174323884001304e-05, + "loss": 0.3387, + "step": 11544 + }, + { + "epoch": 1.8072949279899813, + "grad_norm": 0.4505300521850586, + "learning_rate": 1.0166177908113392e-05, + "loss": 0.237, + "step": 11545 + }, + { + "epoch": 1.8074514715090795, + "grad_norm": 0.5946091413497925, + "learning_rate": 1.0158031932225481e-05, + "loss": 0.2592, + "step": 11546 + }, + { + "epoch": 1.807608015028178, + "grad_norm": 1.2761658430099487, + "learning_rate": 1.014988595633757e-05, + "loss": 0.2879, + "step": 11547 + }, + { + "epoch": 1.8077645585472761, + "grad_norm": 1.6412012577056885, + "learning_rate": 1.0141739980449657e-05, + "loss": 0.2086, + "step": 11548 + }, + { + "epoch": 1.8079211020663744, + "grad_norm": 1.1315147876739502, + "learning_rate": 1.0133594004561748e-05, + "loss": 0.2425, + "step": 11549 + }, + { + "epoch": 1.8080776455854728, + "grad_norm": 1.8427296876907349, + "learning_rate": 1.0125448028673835e-05, + "loss": 0.3944, + "step": 11550 + }, + { + "epoch": 1.8082341891045712, + "grad_norm": 1.0719921588897705, + "learning_rate": 1.0117302052785924e-05, + "loss": 0.4052, + "step": 11551 + }, + { + "epoch": 1.8083907326236695, + "grad_norm": 1.1620182991027832, + "learning_rate": 1.0109156076898013e-05, + "loss": 0.4518, + "step": 11552 + }, + { + "epoch": 1.8085472761427677, + "grad_norm": 1.2084290981292725, + "learning_rate": 1.0101010101010101e-05, + "loss": 0.3695, + "step": 11553 + }, + { + "epoch": 1.8087038196618659, + "grad_norm": 1.0771416425704956, + 
"learning_rate": 1.009286412512219e-05, + "loss": 0.3103, + "step": 11554 + }, + { + "epoch": 1.8088603631809643, + "grad_norm": 1.6641353368759155, + "learning_rate": 1.0084718149234279e-05, + "loss": 0.3965, + "step": 11555 + }, + { + "epoch": 1.8090169067000628, + "grad_norm": 0.8575471043586731, + "learning_rate": 1.0076572173346368e-05, + "loss": 0.3338, + "step": 11556 + }, + { + "epoch": 1.809173450219161, + "grad_norm": 1.8776137828826904, + "learning_rate": 1.0068426197458455e-05, + "loss": 0.3329, + "step": 11557 + }, + { + "epoch": 1.8093299937382592, + "grad_norm": 1.6664118766784668, + "learning_rate": 1.0060280221570545e-05, + "loss": 0.5929, + "step": 11558 + }, + { + "epoch": 1.8094865372573574, + "grad_norm": 2.2073774337768555, + "learning_rate": 1.0052134245682633e-05, + "loss": 0.6012, + "step": 11559 + }, + { + "epoch": 1.8096430807764559, + "grad_norm": 1.0494983196258545, + "learning_rate": 1.0043988269794721e-05, + "loss": 0.3022, + "step": 11560 + }, + { + "epoch": 1.8097996242955543, + "grad_norm": 1.6907958984375, + "learning_rate": 1.003584229390681e-05, + "loss": 0.2952, + "step": 11561 + }, + { + "epoch": 1.8099561678146525, + "grad_norm": 1.4806150197982788, + "learning_rate": 1.0027696318018899e-05, + "loss": 0.4193, + "step": 11562 + }, + { + "epoch": 1.8101127113337507, + "grad_norm": 6.479229927062988, + "learning_rate": 1.0019550342130988e-05, + "loss": 0.3864, + "step": 11563 + }, + { + "epoch": 1.810269254852849, + "grad_norm": 1.673919916152954, + "learning_rate": 1.0011404366243077e-05, + "loss": 0.5323, + "step": 11564 + }, + { + "epoch": 1.8104257983719474, + "grad_norm": 1.7343610525131226, + "learning_rate": 1.0003258390355166e-05, + "loss": 0.3064, + "step": 11565 + }, + { + "epoch": 1.8105823418910458, + "grad_norm": 1.4792449474334717, + "learning_rate": 9.995112414467253e-06, + "loss": 0.5585, + "step": 11566 + }, + { + "epoch": 1.810738885410144, + "grad_norm": 2.5663065910339355, + "learning_rate": 
9.986966438579343e-06, + "loss": 0.7905, + "step": 11567 + }, + { + "epoch": 1.8108954289292423, + "grad_norm": 3.075145721435547, + "learning_rate": 9.97882046269143e-06, + "loss": 0.8959, + "step": 11568 + }, + { + "epoch": 1.8110519724483405, + "grad_norm": 2.214106559753418, + "learning_rate": 9.97067448680352e-06, + "loss": 0.4347, + "step": 11569 + }, + { + "epoch": 1.811208515967439, + "grad_norm": 1.6660383939743042, + "learning_rate": 9.962528510915608e-06, + "loss": 0.4975, + "step": 11570 + }, + { + "epoch": 1.8113650594865374, + "grad_norm": 3.408273935317993, + "learning_rate": 9.954382535027697e-06, + "loss": 0.7532, + "step": 11571 + }, + { + "epoch": 1.8115216030056356, + "grad_norm": 2.1117422580718994, + "learning_rate": 9.946236559139786e-06, + "loss": 0.5524, + "step": 11572 + }, + { + "epoch": 1.8116781465247338, + "grad_norm": 2.594554901123047, + "learning_rate": 9.938090583251875e-06, + "loss": 0.8709, + "step": 11573 + }, + { + "epoch": 1.811834690043832, + "grad_norm": 4.955292701721191, + "learning_rate": 9.929944607363963e-06, + "loss": 1.1312, + "step": 11574 + }, + { + "epoch": 1.8119912335629305, + "grad_norm": 2.2853951454162598, + "learning_rate": 9.92179863147605e-06, + "loss": 0.7814, + "step": 11575 + }, + { + "epoch": 1.812147777082029, + "grad_norm": 3.235279083251953, + "learning_rate": 9.913652655588141e-06, + "loss": 1.0779, + "step": 11576 + }, + { + "epoch": 1.8123043206011271, + "grad_norm": 1.064928412437439, + "learning_rate": 9.905506679700228e-06, + "loss": 0.2425, + "step": 11577 + }, + { + "epoch": 1.8124608641202253, + "grad_norm": 2.1797239780426025, + "learning_rate": 9.897360703812317e-06, + "loss": 0.6746, + "step": 11578 + }, + { + "epoch": 1.8126174076393238, + "grad_norm": 4.277167320251465, + "learning_rate": 9.889214727924406e-06, + "loss": 1.1449, + "step": 11579 + }, + { + "epoch": 1.812773951158422, + "grad_norm": 3.3256075382232666, + "learning_rate": 9.881068752036495e-06, + "loss": 1.0608, + "step": 
11580 + }, + { + "epoch": 1.8129304946775204, + "grad_norm": 5.4424214363098145, + "learning_rate": 9.872922776148582e-06, + "loss": 0.8845, + "step": 11581 + }, + { + "epoch": 1.8130870381966186, + "grad_norm": 4.91280460357666, + "learning_rate": 9.864776800260672e-06, + "loss": 1.5969, + "step": 11582 + }, + { + "epoch": 1.8132435817157169, + "grad_norm": 3.278050661087036, + "learning_rate": 9.856630824372761e-06, + "loss": 0.9091, + "step": 11583 + }, + { + "epoch": 1.8134001252348153, + "grad_norm": 2.659289598464966, + "learning_rate": 9.848484848484848e-06, + "loss": 1.3351, + "step": 11584 + }, + { + "epoch": 1.8135566687539137, + "grad_norm": 1.018247127532959, + "learning_rate": 9.840338872596939e-06, + "loss": 0.2378, + "step": 11585 + }, + { + "epoch": 1.813713212273012, + "grad_norm": 2.950469493865967, + "learning_rate": 9.832192896709026e-06, + "loss": 0.9474, + "step": 11586 + }, + { + "epoch": 1.8138697557921102, + "grad_norm": 3.0023889541625977, + "learning_rate": 9.824046920821115e-06, + "loss": 0.3856, + "step": 11587 + }, + { + "epoch": 1.8140262993112084, + "grad_norm": 2.9907493591308594, + "learning_rate": 9.815900944933204e-06, + "loss": 0.707, + "step": 11588 + }, + { + "epoch": 1.8141828428303068, + "grad_norm": 0.7878989577293396, + "learning_rate": 9.807754969045292e-06, + "loss": 0.3074, + "step": 11589 + }, + { + "epoch": 1.8143393863494053, + "grad_norm": 0.6317299008369446, + "learning_rate": 9.79960899315738e-06, + "loss": 0.3171, + "step": 11590 + }, + { + "epoch": 1.8144959298685035, + "grad_norm": 0.45333632826805115, + "learning_rate": 9.79146301726947e-06, + "loss": 0.2184, + "step": 11591 + }, + { + "epoch": 1.8146524733876017, + "grad_norm": 0.41124966740608215, + "learning_rate": 9.783317041381557e-06, + "loss": 0.2102, + "step": 11592 + }, + { + "epoch": 1.8148090169067, + "grad_norm": 0.5657354593276978, + "learning_rate": 9.775171065493646e-06, + "loss": 0.2054, + "step": 11593 + }, + { + "epoch": 1.8149655604257984, + 
"grad_norm": 0.8007012009620667, + "learning_rate": 9.767025089605737e-06, + "loss": 0.2968, + "step": 11594 + }, + { + "epoch": 1.8151221039448968, + "grad_norm": 0.6602072715759277, + "learning_rate": 9.758879113717824e-06, + "loss": 0.2719, + "step": 11595 + }, + { + "epoch": 1.815278647463995, + "grad_norm": 1.2626533508300781, + "learning_rate": 9.750733137829913e-06, + "loss": 0.3017, + "step": 11596 + }, + { + "epoch": 1.8154351909830932, + "grad_norm": 0.7368610501289368, + "learning_rate": 9.742587161942001e-06, + "loss": 0.2909, + "step": 11597 + }, + { + "epoch": 1.8155917345021915, + "grad_norm": 0.8155832290649414, + "learning_rate": 9.73444118605409e-06, + "loss": 0.2108, + "step": 11598 + }, + { + "epoch": 1.81574827802129, + "grad_norm": 0.7932040691375732, + "learning_rate": 9.726295210166177e-06, + "loss": 0.2302, + "step": 11599 + }, + { + "epoch": 1.8159048215403883, + "grad_norm": 0.6503439545631409, + "learning_rate": 9.718149234278268e-06, + "loss": 0.2572, + "step": 11600 + }, + { + "epoch": 1.8160613650594866, + "grad_norm": 0.7835712432861328, + "learning_rate": 9.710003258390355e-06, + "loss": 0.2356, + "step": 11601 + }, + { + "epoch": 1.8162179085785848, + "grad_norm": 1.4504755735397339, + "learning_rate": 9.701857282502444e-06, + "loss": 0.417, + "step": 11602 + }, + { + "epoch": 1.816374452097683, + "grad_norm": 0.8256363868713379, + "learning_rate": 9.693711306614533e-06, + "loss": 0.254, + "step": 11603 + }, + { + "epoch": 1.8165309956167814, + "grad_norm": 1.9095966815948486, + "learning_rate": 9.685565330726622e-06, + "loss": 0.4697, + "step": 11604 + }, + { + "epoch": 1.8166875391358799, + "grad_norm": 1.335070013999939, + "learning_rate": 9.67741935483871e-06, + "loss": 0.3801, + "step": 11605 + }, + { + "epoch": 1.816844082654978, + "grad_norm": 2.262422800064087, + "learning_rate": 9.6692733789508e-06, + "loss": 0.4537, + "step": 11606 + }, + { + "epoch": 1.8170006261740763, + "grad_norm": 1.1605421304702759, + 
"learning_rate": 9.661127403062888e-06, + "loss": 0.4632, + "step": 11607 + }, + { + "epoch": 1.8171571696931748, + "grad_norm": 1.5763481855392456, + "learning_rate": 9.652981427174975e-06, + "loss": 0.4108, + "step": 11608 + }, + { + "epoch": 1.817313713212273, + "grad_norm": 1.4513318538665771, + "learning_rate": 9.644835451287066e-06, + "loss": 0.4433, + "step": 11609 + }, + { + "epoch": 1.8174702567313714, + "grad_norm": 1.1283116340637207, + "learning_rate": 9.636689475399153e-06, + "loss": 0.3744, + "step": 11610 + }, + { + "epoch": 1.8176268002504696, + "grad_norm": 1.2909374237060547, + "learning_rate": 9.628543499511242e-06, + "loss": 0.3742, + "step": 11611 + }, + { + "epoch": 1.8177833437695678, + "grad_norm": 2.9342777729034424, + "learning_rate": 9.62039752362333e-06, + "loss": 0.3884, + "step": 11612 + }, + { + "epoch": 1.8179398872886663, + "grad_norm": 1.2897518873214722, + "learning_rate": 9.61225154773542e-06, + "loss": 0.4829, + "step": 11613 + }, + { + "epoch": 1.8180964308077645, + "grad_norm": 2.286391019821167, + "learning_rate": 9.604105571847508e-06, + "loss": 0.3498, + "step": 11614 + }, + { + "epoch": 1.818252974326863, + "grad_norm": 3.1702513694763184, + "learning_rate": 9.595959595959595e-06, + "loss": 0.9757, + "step": 11615 + }, + { + "epoch": 1.8184095178459612, + "grad_norm": 2.3737854957580566, + "learning_rate": 9.587813620071686e-06, + "loss": 0.5799, + "step": 11616 + }, + { + "epoch": 1.8185660613650594, + "grad_norm": 1.9054960012435913, + "learning_rate": 9.579667644183773e-06, + "loss": 0.5457, + "step": 11617 + }, + { + "epoch": 1.8187226048841578, + "grad_norm": 3.1369683742523193, + "learning_rate": 9.571521668295862e-06, + "loss": 0.5879, + "step": 11618 + }, + { + "epoch": 1.8188791484032563, + "grad_norm": 5.0215888023376465, + "learning_rate": 9.56337569240795e-06, + "loss": 1.0856, + "step": 11619 + }, + { + "epoch": 1.8190356919223545, + "grad_norm": 2.540959596633911, + "learning_rate": 9.55522971652004e-06, + 
"loss": 0.6339, + "step": 11620 + }, + { + "epoch": 1.8191922354414527, + "grad_norm": 1.4865546226501465, + "learning_rate": 9.547083740632128e-06, + "loss": 0.5639, + "step": 11621 + }, + { + "epoch": 1.819348778960551, + "grad_norm": 3.9226109981536865, + "learning_rate": 9.538937764744217e-06, + "loss": 0.8331, + "step": 11622 + }, + { + "epoch": 1.8195053224796494, + "grad_norm": 2.768131732940674, + "learning_rate": 9.530791788856306e-06, + "loss": 0.7281, + "step": 11623 + }, + { + "epoch": 1.8196618659987478, + "grad_norm": 2.2406165599823, + "learning_rate": 9.522645812968393e-06, + "loss": 0.5817, + "step": 11624 + }, + { + "epoch": 1.819818409517846, + "grad_norm": 3.9012610912323, + "learning_rate": 9.514499837080484e-06, + "loss": 0.5062, + "step": 11625 + }, + { + "epoch": 1.8199749530369442, + "grad_norm": 5.402362823486328, + "learning_rate": 9.50635386119257e-06, + "loss": 0.6049, + "step": 11626 + }, + { + "epoch": 1.8201314965560424, + "grad_norm": 4.4131550788879395, + "learning_rate": 9.49820788530466e-06, + "loss": 0.9396, + "step": 11627 + }, + { + "epoch": 1.820288040075141, + "grad_norm": 3.9317593574523926, + "learning_rate": 9.490061909416748e-06, + "loss": 1.3105, + "step": 11628 + }, + { + "epoch": 1.8204445835942393, + "grad_norm": 7.046682834625244, + "learning_rate": 9.481915933528837e-06, + "loss": 0.9261, + "step": 11629 + }, + { + "epoch": 1.8206011271133375, + "grad_norm": 5.662478446960449, + "learning_rate": 9.473769957640926e-06, + "loss": 0.7058, + "step": 11630 + }, + { + "epoch": 1.8207576706324358, + "grad_norm": 1.9200595617294312, + "learning_rate": 9.465623981753015e-06, + "loss": 0.9515, + "step": 11631 + }, + { + "epoch": 1.820914214151534, + "grad_norm": 4.417473316192627, + "learning_rate": 9.457478005865104e-06, + "loss": 0.8812, + "step": 11632 + }, + { + "epoch": 1.8210707576706324, + "grad_norm": 5.302669525146484, + "learning_rate": 9.449332029977191e-06, + "loss": 0.9013, + "step": 11633 + }, + { + "epoch": 
1.8212273011897309, + "grad_norm": 2.1977696418762207, + "learning_rate": 9.441186054089281e-06, + "loss": 1.0816, + "step": 11634 + }, + { + "epoch": 1.821383844708829, + "grad_norm": 8.764734268188477, + "learning_rate": 9.433040078201369e-06, + "loss": 0.6663, + "step": 11635 + }, + { + "epoch": 1.8215403882279273, + "grad_norm": 3.461695432662964, + "learning_rate": 9.424894102313457e-06, + "loss": 0.6909, + "step": 11636 + }, + { + "epoch": 1.8216969317470255, + "grad_norm": 2.565884828567505, + "learning_rate": 9.416748126425546e-06, + "loss": 0.7868, + "step": 11637 + }, + { + "epoch": 1.821853475266124, + "grad_norm": 2.5792698860168457, + "learning_rate": 9.408602150537635e-06, + "loss": 0.9469, + "step": 11638 + }, + { + "epoch": 1.8220100187852224, + "grad_norm": 0.5101920366287231, + "learning_rate": 9.400456174649722e-06, + "loss": 0.2689, + "step": 11639 + }, + { + "epoch": 1.8221665623043206, + "grad_norm": 0.5965200066566467, + "learning_rate": 9.392310198761813e-06, + "loss": 0.2786, + "step": 11640 + }, + { + "epoch": 1.8223231058234188, + "grad_norm": 0.4855404794216156, + "learning_rate": 9.384164222873902e-06, + "loss": 0.2816, + "step": 11641 + }, + { + "epoch": 1.8224796493425173, + "grad_norm": 0.7268877625465393, + "learning_rate": 9.376018246985989e-06, + "loss": 0.3213, + "step": 11642 + }, + { + "epoch": 1.8226361928616155, + "grad_norm": 0.5807152986526489, + "learning_rate": 9.36787227109808e-06, + "loss": 0.2285, + "step": 11643 + }, + { + "epoch": 1.822792736380714, + "grad_norm": 0.7081146240234375, + "learning_rate": 9.359726295210166e-06, + "loss": 0.3008, + "step": 11644 + }, + { + "epoch": 1.8229492798998121, + "grad_norm": 1.851911187171936, + "learning_rate": 9.351580319322255e-06, + "loss": 0.334, + "step": 11645 + }, + { + "epoch": 1.8231058234189104, + "grad_norm": 0.6624234318733215, + "learning_rate": 9.343434343434344e-06, + "loss": 0.2704, + "step": 11646 + }, + { + "epoch": 1.8232623669380088, + "grad_norm": 
0.8540716171264648, + "learning_rate": 9.335288367546433e-06, + "loss": 0.3549, + "step": 11647 + }, + { + "epoch": 1.823418910457107, + "grad_norm": 4.3904571533203125, + "learning_rate": 9.32714239165852e-06, + "loss": 0.4439, + "step": 11648 + }, + { + "epoch": 1.8235754539762055, + "grad_norm": 2.421752452850342, + "learning_rate": 9.31899641577061e-06, + "loss": 0.3644, + "step": 11649 + }, + { + "epoch": 1.8237319974953037, + "grad_norm": 1.0699958801269531, + "learning_rate": 9.310850439882698e-06, + "loss": 0.4576, + "step": 11650 + }, + { + "epoch": 1.823888541014402, + "grad_norm": 1.9442222118377686, + "learning_rate": 9.302704463994786e-06, + "loss": 0.5979, + "step": 11651 + }, + { + "epoch": 1.8240450845335003, + "grad_norm": 1.5284827947616577, + "learning_rate": 9.294558488106877e-06, + "loss": 0.352, + "step": 11652 + }, + { + "epoch": 1.8242016280525988, + "grad_norm": 0.7113910913467407, + "learning_rate": 9.286412512218964e-06, + "loss": 0.3088, + "step": 11653 + }, + { + "epoch": 1.824358171571697, + "grad_norm": 3.1050314903259277, + "learning_rate": 9.278266536331053e-06, + "loss": 0.4906, + "step": 11654 + }, + { + "epoch": 1.8245147150907952, + "grad_norm": 0.6633939743041992, + "learning_rate": 9.270120560443142e-06, + "loss": 0.2803, + "step": 11655 + }, + { + "epoch": 1.8246712586098934, + "grad_norm": 1.7206470966339111, + "learning_rate": 9.26197458455523e-06, + "loss": 0.4402, + "step": 11656 + }, + { + "epoch": 1.8248278021289919, + "grad_norm": 1.7291687726974487, + "learning_rate": 9.253828608667318e-06, + "loss": 0.4579, + "step": 11657 + }, + { + "epoch": 1.8249843456480903, + "grad_norm": 1.8336257934570312, + "learning_rate": 9.245682632779408e-06, + "loss": 0.4606, + "step": 11658 + }, + { + "epoch": 1.8251408891671885, + "grad_norm": 1.7609807252883911, + "learning_rate": 9.237536656891495e-06, + "loss": 0.3698, + "step": 11659 + }, + { + "epoch": 1.8252974326862867, + "grad_norm": 1.470382809638977, + "learning_rate": 
9.229390681003584e-06, + "loss": 0.5577, + "step": 11660 + }, + { + "epoch": 1.825453976205385, + "grad_norm": 3.147395610809326, + "learning_rate": 9.221244705115673e-06, + "loss": 0.6435, + "step": 11661 + }, + { + "epoch": 1.8256105197244834, + "grad_norm": 1.2224042415618896, + "learning_rate": 9.213098729227762e-06, + "loss": 0.3577, + "step": 11662 + }, + { + "epoch": 1.8257670632435818, + "grad_norm": 1.2344918251037598, + "learning_rate": 9.20495275333985e-06, + "loss": 0.3857, + "step": 11663 + }, + { + "epoch": 1.82592360676268, + "grad_norm": 2.9405295848846436, + "learning_rate": 9.19680677745194e-06, + "loss": 0.4911, + "step": 11664 + }, + { + "epoch": 1.8260801502817783, + "grad_norm": 2.026337146759033, + "learning_rate": 9.188660801564028e-06, + "loss": 0.5794, + "step": 11665 + }, + { + "epoch": 1.8262366938008765, + "grad_norm": 3.12939453125, + "learning_rate": 9.180514825676116e-06, + "loss": 0.2846, + "step": 11666 + }, + { + "epoch": 1.826393237319975, + "grad_norm": 1.7663549184799194, + "learning_rate": 9.172368849788206e-06, + "loss": 0.5435, + "step": 11667 + }, + { + "epoch": 1.8265497808390734, + "grad_norm": 1.4422333240509033, + "learning_rate": 9.164222873900293e-06, + "loss": 0.3843, + "step": 11668 + }, + { + "epoch": 1.8267063243581716, + "grad_norm": 2.9618587493896484, + "learning_rate": 9.156076898012382e-06, + "loss": 0.3806, + "step": 11669 + }, + { + "epoch": 1.8268628678772698, + "grad_norm": 4.7728986740112305, + "learning_rate": 9.147930922124471e-06, + "loss": 0.9397, + "step": 11670 + }, + { + "epoch": 1.827019411396368, + "grad_norm": 2.5527875423431396, + "learning_rate": 9.13978494623656e-06, + "loss": 0.5519, + "step": 11671 + }, + { + "epoch": 1.8271759549154665, + "grad_norm": 2.2215867042541504, + "learning_rate": 9.131638970348649e-06, + "loss": 1.1623, + "step": 11672 + }, + { + "epoch": 1.827332498434565, + "grad_norm": 3.0696918964385986, + "learning_rate": 9.123492994460737e-06, + "loss": 1.0414, + "step": 
11673 + }, + { + "epoch": 1.8274890419536631, + "grad_norm": 5.917203426361084, + "learning_rate": 9.115347018572826e-06, + "loss": 1.2842, + "step": 11674 + }, + { + "epoch": 1.8276455854727613, + "grad_norm": 3.2905144691467285, + "learning_rate": 9.107201042684913e-06, + "loss": 0.7984, + "step": 11675 + }, + { + "epoch": 1.8278021289918598, + "grad_norm": 4.635010719299316, + "learning_rate": 9.099055066797002e-06, + "loss": 0.8587, + "step": 11676 + }, + { + "epoch": 1.827958672510958, + "grad_norm": 4.220514297485352, + "learning_rate": 9.090909090909091e-06, + "loss": 1.199, + "step": 11677 + }, + { + "epoch": 1.8281152160300564, + "grad_norm": 3.3353922367095947, + "learning_rate": 9.08276311502118e-06, + "loss": 0.8626, + "step": 11678 + }, + { + "epoch": 1.8282717595491547, + "grad_norm": 2.780113697052002, + "learning_rate": 9.074617139133269e-06, + "loss": 1.191, + "step": 11679 + }, + { + "epoch": 1.8284283030682529, + "grad_norm": 5.155093669891357, + "learning_rate": 9.066471163245358e-06, + "loss": 1.3576, + "step": 11680 + }, + { + "epoch": 1.8285848465873513, + "grad_norm": 4.824505805969238, + "learning_rate": 9.058325187357446e-06, + "loss": 1.1529, + "step": 11681 + }, + { + "epoch": 1.8287413901064495, + "grad_norm": 7.58240270614624, + "learning_rate": 9.050179211469534e-06, + "loss": 0.9597, + "step": 11682 + }, + { + "epoch": 1.828897933625548, + "grad_norm": 2.4234585762023926, + "learning_rate": 9.042033235581624e-06, + "loss": 1.4587, + "step": 11683 + }, + { + "epoch": 1.8290544771446462, + "grad_norm": 4.772586822509766, + "learning_rate": 9.033887259693711e-06, + "loss": 0.6689, + "step": 11684 + }, + { + "epoch": 1.8292110206637444, + "grad_norm": 2.326843023300171, + "learning_rate": 9.0257412838058e-06, + "loss": 0.3395, + "step": 11685 + }, + { + "epoch": 1.8293675641828429, + "grad_norm": 3.6902923583984375, + "learning_rate": 9.017595307917889e-06, + "loss": 0.514, + "step": 11686 + }, + { + "epoch": 1.8295241077019413, + 
"grad_norm": 7.05543851852417, + "learning_rate": 9.009449332029978e-06, + "loss": 0.6536, + "step": 11687 + }, + { + "epoch": 1.8296806512210395, + "grad_norm": 3.3704092502593994, + "learning_rate": 9.001303356142066e-06, + "loss": 0.88, + "step": 11688 + }, + { + "epoch": 1.8298371947401377, + "grad_norm": 0.4961819350719452, + "learning_rate": 8.993157380254155e-06, + "loss": 0.2741, + "step": 11689 + }, + { + "epoch": 1.829993738259236, + "grad_norm": 0.8121910095214844, + "learning_rate": 8.985011404366244e-06, + "loss": 0.266, + "step": 11690 + }, + { + "epoch": 1.8301502817783344, + "grad_norm": 0.6254972219467163, + "learning_rate": 8.976865428478331e-06, + "loss": 0.2319, + "step": 11691 + }, + { + "epoch": 1.8303068252974328, + "grad_norm": 0.6259579062461853, + "learning_rate": 8.968719452590422e-06, + "loss": 0.2948, + "step": 11692 + }, + { + "epoch": 1.830463368816531, + "grad_norm": 1.360505223274231, + "learning_rate": 8.960573476702509e-06, + "loss": 0.3572, + "step": 11693 + }, + { + "epoch": 1.8306199123356293, + "grad_norm": 0.6930534243583679, + "learning_rate": 8.952427500814598e-06, + "loss": 0.2683, + "step": 11694 + }, + { + "epoch": 1.8307764558547275, + "grad_norm": 1.3367292881011963, + "learning_rate": 8.944281524926687e-06, + "loss": 0.3745, + "step": 11695 + }, + { + "epoch": 1.830932999373826, + "grad_norm": 0.4380413889884949, + "learning_rate": 8.936135549038775e-06, + "loss": 0.2572, + "step": 11696 + }, + { + "epoch": 1.8310895428929244, + "grad_norm": 1.2049227952957153, + "learning_rate": 8.927989573150863e-06, + "loss": 0.3096, + "step": 11697 + }, + { + "epoch": 1.8312460864120226, + "grad_norm": 1.1049116849899292, + "learning_rate": 8.919843597262953e-06, + "loss": 0.3874, + "step": 11698 + }, + { + "epoch": 1.8314026299311208, + "grad_norm": 0.9396835565567017, + "learning_rate": 8.911697621375042e-06, + "loss": 0.3347, + "step": 11699 + }, + { + "epoch": 1.831559173450219, + "grad_norm": 1.1928058862686157, + 
"learning_rate": 8.903551645487129e-06, + "loss": 0.3658, + "step": 11700 + }, + { + "epoch": 1.8317157169693175, + "grad_norm": 1.0069547891616821, + "learning_rate": 8.89540566959922e-06, + "loss": 0.3331, + "step": 11701 + }, + { + "epoch": 1.831872260488416, + "grad_norm": 0.9531508684158325, + "learning_rate": 8.887259693711307e-06, + "loss": 0.4072, + "step": 11702 + }, + { + "epoch": 1.8320288040075141, + "grad_norm": 0.7137613296508789, + "learning_rate": 8.879113717823396e-06, + "loss": 0.2736, + "step": 11703 + }, + { + "epoch": 1.8321853475266123, + "grad_norm": 0.820740282535553, + "learning_rate": 8.870967741935484e-06, + "loss": 0.2485, + "step": 11704 + }, + { + "epoch": 1.8323418910457105, + "grad_norm": 1.8023409843444824, + "learning_rate": 8.862821766047573e-06, + "loss": 0.5366, + "step": 11705 + }, + { + "epoch": 1.832498434564809, + "grad_norm": 0.8302339315414429, + "learning_rate": 8.85467579015966e-06, + "loss": 0.3491, + "step": 11706 + }, + { + "epoch": 1.8326549780839074, + "grad_norm": 1.6287059783935547, + "learning_rate": 8.846529814271751e-06, + "loss": 0.4141, + "step": 11707 + }, + { + "epoch": 1.8328115216030056, + "grad_norm": 2.1629910469055176, + "learning_rate": 8.838383838383838e-06, + "loss": 0.4707, + "step": 11708 + }, + { + "epoch": 1.8329680651221039, + "grad_norm": 1.5460114479064941, + "learning_rate": 8.830237862495927e-06, + "loss": 0.5824, + "step": 11709 + }, + { + "epoch": 1.8331246086412023, + "grad_norm": 2.1025335788726807, + "learning_rate": 8.822091886608017e-06, + "loss": 0.4737, + "step": 11710 + }, + { + "epoch": 1.8332811521603005, + "grad_norm": 1.7542686462402344, + "learning_rate": 8.813945910720105e-06, + "loss": 0.7119, + "step": 11711 + }, + { + "epoch": 1.833437695679399, + "grad_norm": 1.3527926206588745, + "learning_rate": 8.805799934832193e-06, + "loss": 0.4251, + "step": 11712 + }, + { + "epoch": 1.8335942391984972, + "grad_norm": 1.2564703226089478, + "learning_rate": 8.797653958944282e-06, + 
"loss": 0.536, + "step": 11713 + }, + { + "epoch": 1.8337507827175954, + "grad_norm": 1.32987380027771, + "learning_rate": 8.789507983056371e-06, + "loss": 0.4199, + "step": 11714 + }, + { + "epoch": 1.8339073262366938, + "grad_norm": 1.2668266296386719, + "learning_rate": 8.781362007168458e-06, + "loss": 0.458, + "step": 11715 + }, + { + "epoch": 1.8340638697557923, + "grad_norm": 2.371445894241333, + "learning_rate": 8.773216031280549e-06, + "loss": 0.5342, + "step": 11716 + }, + { + "epoch": 1.8342204132748905, + "grad_norm": 1.6443188190460205, + "learning_rate": 8.765070055392636e-06, + "loss": 0.6162, + "step": 11717 + }, + { + "epoch": 1.8343769567939887, + "grad_norm": 2.2438342571258545, + "learning_rate": 8.756924079504725e-06, + "loss": 0.5637, + "step": 11718 + }, + { + "epoch": 1.834533500313087, + "grad_norm": 2.5697834491729736, + "learning_rate": 8.748778103616813e-06, + "loss": 0.862, + "step": 11719 + }, + { + "epoch": 1.8346900438321854, + "grad_norm": 4.20882511138916, + "learning_rate": 8.740632127728902e-06, + "loss": 0.8871, + "step": 11720 + }, + { + "epoch": 1.8348465873512838, + "grad_norm": 3.804483652114868, + "learning_rate": 8.732486151840991e-06, + "loss": 0.7814, + "step": 11721 + }, + { + "epoch": 1.835003130870382, + "grad_norm": 3.23657488822937, + "learning_rate": 8.72434017595308e-06, + "loss": 0.9147, + "step": 11722 + }, + { + "epoch": 1.8351596743894802, + "grad_norm": 7.189966678619385, + "learning_rate": 8.716194200065169e-06, + "loss": 0.6773, + "step": 11723 + }, + { + "epoch": 1.8353162179085785, + "grad_norm": 2.6329920291900635, + "learning_rate": 8.708048224177256e-06, + "loss": 0.4555, + "step": 11724 + }, + { + "epoch": 1.835472761427677, + "grad_norm": 1.583086609840393, + "learning_rate": 8.699902248289346e-06, + "loss": 0.5877, + "step": 11725 + }, + { + "epoch": 1.8356293049467753, + "grad_norm": 1.7837693691253662, + "learning_rate": 8.691756272401434e-06, + "loss": 0.5421, + "step": 11726 + }, + { + "epoch": 
1.8357858484658736, + "grad_norm": 3.9418041706085205, + "learning_rate": 8.683610296513522e-06, + "loss": 1.0558, + "step": 11727 + }, + { + "epoch": 1.8359423919849718, + "grad_norm": 3.228860855102539, + "learning_rate": 8.675464320625611e-06, + "loss": 0.7857, + "step": 11728 + }, + { + "epoch": 1.83609893550407, + "grad_norm": 3.151782512664795, + "learning_rate": 8.6673183447377e-06, + "loss": 0.875, + "step": 11729 + }, + { + "epoch": 1.8362554790231684, + "grad_norm": 3.251871347427368, + "learning_rate": 8.659172368849789e-06, + "loss": 1.2028, + "step": 11730 + }, + { + "epoch": 1.8364120225422669, + "grad_norm": 6.767560958862305, + "learning_rate": 8.651026392961878e-06, + "loss": 1.137, + "step": 11731 + }, + { + "epoch": 1.836568566061365, + "grad_norm": 8.362618446350098, + "learning_rate": 8.642880417073967e-06, + "loss": 1.18, + "step": 11732 + }, + { + "epoch": 1.8367251095804633, + "grad_norm": 1.7389017343521118, + "learning_rate": 8.634734441186054e-06, + "loss": 0.7676, + "step": 11733 + }, + { + "epoch": 1.8368816530995615, + "grad_norm": 1.2509464025497437, + "learning_rate": 8.626588465298144e-06, + "loss": 0.3304, + "step": 11734 + }, + { + "epoch": 1.83703819661866, + "grad_norm": 1.9182140827178955, + "learning_rate": 8.618442489410231e-06, + "loss": 0.4409, + "step": 11735 + }, + { + "epoch": 1.8371947401377584, + "grad_norm": 1.7999547719955444, + "learning_rate": 8.61029651352232e-06, + "loss": 0.4232, + "step": 11736 + }, + { + "epoch": 1.8373512836568566, + "grad_norm": 2.9071903228759766, + "learning_rate": 8.602150537634409e-06, + "loss": 0.8191, + "step": 11737 + }, + { + "epoch": 1.8375078271759548, + "grad_norm": 1.2955018281936646, + "learning_rate": 8.594004561746498e-06, + "loss": 0.5754, + "step": 11738 + }, + { + "epoch": 1.837664370695053, + "grad_norm": 0.5868598818778992, + "learning_rate": 8.585858585858587e-06, + "loss": 0.2564, + "step": 11739 + }, + { + "epoch": 1.8378209142141515, + "grad_norm": 0.706810712814331, 
+ "learning_rate": 8.577712609970674e-06, + "loss": 0.2382, + "step": 11740 + }, + { + "epoch": 1.83797745773325, + "grad_norm": 0.5890569686889648, + "learning_rate": 8.569566634082764e-06, + "loss": 0.3361, + "step": 11741 + }, + { + "epoch": 1.8381340012523482, + "grad_norm": 0.4903320074081421, + "learning_rate": 8.561420658194852e-06, + "loss": 0.2086, + "step": 11742 + }, + { + "epoch": 1.8382905447714464, + "grad_norm": 0.729972243309021, + "learning_rate": 8.55327468230694e-06, + "loss": 0.2909, + "step": 11743 + }, + { + "epoch": 1.8384470882905448, + "grad_norm": 1.0089857578277588, + "learning_rate": 8.54512870641903e-06, + "loss": 0.3729, + "step": 11744 + }, + { + "epoch": 1.838603631809643, + "grad_norm": 1.1274482011795044, + "learning_rate": 8.536982730531118e-06, + "loss": 0.3682, + "step": 11745 + }, + { + "epoch": 1.8387601753287415, + "grad_norm": 0.8460453152656555, + "learning_rate": 8.528836754643207e-06, + "loss": 0.3152, + "step": 11746 + }, + { + "epoch": 1.8389167188478397, + "grad_norm": 0.8790549039840698, + "learning_rate": 8.520690778755296e-06, + "loss": 0.2967, + "step": 11747 + }, + { + "epoch": 1.839073262366938, + "grad_norm": 1.0532732009887695, + "learning_rate": 8.512544802867385e-06, + "loss": 0.3391, + "step": 11748 + }, + { + "epoch": 1.8392298058860364, + "grad_norm": 3.924938678741455, + "learning_rate": 8.504398826979472e-06, + "loss": 0.9881, + "step": 11749 + }, + { + "epoch": 1.8393863494051348, + "grad_norm": 1.0835224390029907, + "learning_rate": 8.496252851091562e-06, + "loss": 0.4015, + "step": 11750 + }, + { + "epoch": 1.839542892924233, + "grad_norm": 1.1198031902313232, + "learning_rate": 8.48810687520365e-06, + "loss": 0.3661, + "step": 11751 + }, + { + "epoch": 1.8396994364433312, + "grad_norm": 1.0097121000289917, + "learning_rate": 8.479960899315738e-06, + "loss": 0.5398, + "step": 11752 + }, + { + "epoch": 1.8398559799624294, + "grad_norm": 1.0740890502929688, + "learning_rate": 8.471814923427827e-06, + 
"loss": 0.395, + "step": 11753 + }, + { + "epoch": 1.8400125234815279, + "grad_norm": 1.7675318717956543, + "learning_rate": 8.463668947539916e-06, + "loss": 0.4393, + "step": 11754 + }, + { + "epoch": 1.8401690670006263, + "grad_norm": 1.49519944190979, + "learning_rate": 8.455522971652003e-06, + "loss": 0.7297, + "step": 11755 + }, + { + "epoch": 1.8403256105197245, + "grad_norm": 1.7148771286010742, + "learning_rate": 8.447376995764093e-06, + "loss": 0.3852, + "step": 11756 + }, + { + "epoch": 1.8404821540388228, + "grad_norm": 2.5274970531463623, + "learning_rate": 8.439231019876182e-06, + "loss": 0.484, + "step": 11757 + }, + { + "epoch": 1.840638697557921, + "grad_norm": 1.6765005588531494, + "learning_rate": 8.43108504398827e-06, + "loss": 0.4564, + "step": 11758 + }, + { + "epoch": 1.8407952410770194, + "grad_norm": 1.6483417749404907, + "learning_rate": 8.42293906810036e-06, + "loss": 0.4769, + "step": 11759 + }, + { + "epoch": 1.8409517845961179, + "grad_norm": 3.7762560844421387, + "learning_rate": 8.414793092212447e-06, + "loss": 0.5111, + "step": 11760 + }, + { + "epoch": 1.841108328115216, + "grad_norm": 1.2559213638305664, + "learning_rate": 8.406647116324536e-06, + "loss": 0.3424, + "step": 11761 + }, + { + "epoch": 1.8412648716343143, + "grad_norm": 2.1985714435577393, + "learning_rate": 8.398501140436625e-06, + "loss": 0.5078, + "step": 11762 + }, + { + "epoch": 1.8414214151534125, + "grad_norm": 3.331117868423462, + "learning_rate": 8.390355164548714e-06, + "loss": 0.5274, + "step": 11763 + }, + { + "epoch": 1.841577958672511, + "grad_norm": 1.7841341495513916, + "learning_rate": 8.3822091886608e-06, + "loss": 0.6537, + "step": 11764 + }, + { + "epoch": 1.8417345021916094, + "grad_norm": 1.5916430950164795, + "learning_rate": 8.374063212772891e-06, + "loss": 0.4835, + "step": 11765 + }, + { + "epoch": 1.8418910457107076, + "grad_norm": 2.3381009101867676, + "learning_rate": 8.365917236884978e-06, + "loss": 0.425, + "step": 11766 + }, + { + 
"epoch": 1.8420475892298058, + "grad_norm": 1.6873114109039307, + "learning_rate": 8.357771260997067e-06, + "loss": 0.6125, + "step": 11767 + }, + { + "epoch": 1.842204132748904, + "grad_norm": 2.0549557209014893, + "learning_rate": 8.349625285109158e-06, + "loss": 0.4734, + "step": 11768 + }, + { + "epoch": 1.8423606762680025, + "grad_norm": 2.1856656074523926, + "learning_rate": 8.341479309221245e-06, + "loss": 0.7077, + "step": 11769 + }, + { + "epoch": 1.842517219787101, + "grad_norm": 3.987504482269287, + "learning_rate": 8.333333333333334e-06, + "loss": 0.914, + "step": 11770 + }, + { + "epoch": 1.8426737633061991, + "grad_norm": 3.673947811126709, + "learning_rate": 8.325187357445423e-06, + "loss": 0.3835, + "step": 11771 + }, + { + "epoch": 1.8428303068252974, + "grad_norm": 7.217020511627197, + "learning_rate": 8.317041381557511e-06, + "loss": 1.3389, + "step": 11772 + }, + { + "epoch": 1.8429868503443956, + "grad_norm": 2.710909843444824, + "learning_rate": 8.308895405669599e-06, + "loss": 0.7825, + "step": 11773 + }, + { + "epoch": 1.843143393863494, + "grad_norm": 1.9978668689727783, + "learning_rate": 8.300749429781689e-06, + "loss": 0.9369, + "step": 11774 + }, + { + "epoch": 1.8432999373825925, + "grad_norm": 1.7362598180770874, + "learning_rate": 8.292603453893776e-06, + "loss": 0.6759, + "step": 11775 + }, + { + "epoch": 1.8434564809016907, + "grad_norm": 3.0308406352996826, + "learning_rate": 8.284457478005865e-06, + "loss": 0.8257, + "step": 11776 + }, + { + "epoch": 1.843613024420789, + "grad_norm": 3.3358776569366455, + "learning_rate": 8.276311502117954e-06, + "loss": 0.6291, + "step": 11777 + }, + { + "epoch": 1.8437695679398873, + "grad_norm": 5.6964545249938965, + "learning_rate": 8.268165526230043e-06, + "loss": 0.6029, + "step": 11778 + }, + { + "epoch": 1.8439261114589856, + "grad_norm": 2.5133185386657715, + "learning_rate": 8.260019550342132e-06, + "loss": 0.9179, + "step": 11779 + }, + { + "epoch": 1.844082654978084, + "grad_norm": 
2.7045984268188477, + "learning_rate": 8.25187357445422e-06, + "loss": 0.9287, + "step": 11780 + }, + { + "epoch": 1.8442391984971822, + "grad_norm": 4.924438953399658, + "learning_rate": 8.24372759856631e-06, + "loss": 1.6219, + "step": 11781 + }, + { + "epoch": 1.8443957420162804, + "grad_norm": 2.2275640964508057, + "learning_rate": 8.235581622678396e-06, + "loss": 0.68, + "step": 11782 + }, + { + "epoch": 1.8445522855353789, + "grad_norm": 2.8599636554718018, + "learning_rate": 8.227435646790487e-06, + "loss": 0.9042, + "step": 11783 + }, + { + "epoch": 1.8447088290544773, + "grad_norm": 4.035522937774658, + "learning_rate": 8.219289670902574e-06, + "loss": 0.6686, + "step": 11784 + }, + { + "epoch": 1.8448653725735755, + "grad_norm": 4.046782493591309, + "learning_rate": 8.211143695014663e-06, + "loss": 1.079, + "step": 11785 + }, + { + "epoch": 1.8450219160926737, + "grad_norm": 4.835042953491211, + "learning_rate": 8.202997719126752e-06, + "loss": 0.9839, + "step": 11786 + }, + { + "epoch": 1.845178459611772, + "grad_norm": 1.1570558547973633, + "learning_rate": 8.19485174323884e-06, + "loss": 0.277, + "step": 11787 + }, + { + "epoch": 1.8453350031308704, + "grad_norm": 3.0108695030212402, + "learning_rate": 8.18670576735093e-06, + "loss": 0.8872, + "step": 11788 + }, + { + "epoch": 1.8454915466499688, + "grad_norm": 1.9415451288223267, + "learning_rate": 8.178559791463018e-06, + "loss": 0.7302, + "step": 11789 + }, + { + "epoch": 1.845648090169067, + "grad_norm": 0.7170361280441284, + "learning_rate": 8.170413815575107e-06, + "loss": 0.2928, + "step": 11790 + }, + { + "epoch": 1.8458046336881653, + "grad_norm": 0.7283549308776855, + "learning_rate": 8.162267839687194e-06, + "loss": 0.2457, + "step": 11791 + }, + { + "epoch": 1.8459611772072635, + "grad_norm": 0.7685917019844055, + "learning_rate": 8.154121863799285e-06, + "loss": 0.3586, + "step": 11792 + }, + { + "epoch": 1.846117720726362, + "grad_norm": 0.5815606713294983, + "learning_rate": 
8.145975887911372e-06, + "loss": 0.3029, + "step": 11793 + }, + { + "epoch": 1.8462742642454604, + "grad_norm": 2.702866554260254, + "learning_rate": 8.13782991202346e-06, + "loss": 0.7, + "step": 11794 + }, + { + "epoch": 1.8464308077645586, + "grad_norm": 0.8654584288597107, + "learning_rate": 8.12968393613555e-06, + "loss": 0.4315, + "step": 11795 + }, + { + "epoch": 1.8465873512836568, + "grad_norm": 0.4988161325454712, + "learning_rate": 8.121537960247638e-06, + "loss": 0.2423, + "step": 11796 + }, + { + "epoch": 1.846743894802755, + "grad_norm": 1.1385351419448853, + "learning_rate": 8.113391984359727e-06, + "loss": 0.2798, + "step": 11797 + }, + { + "epoch": 1.8469004383218535, + "grad_norm": 1.0791492462158203, + "learning_rate": 8.105246008471816e-06, + "loss": 0.3496, + "step": 11798 + }, + { + "epoch": 1.847056981840952, + "grad_norm": 0.7237057089805603, + "learning_rate": 8.097100032583905e-06, + "loss": 0.2282, + "step": 11799 + }, + { + "epoch": 1.8472135253600501, + "grad_norm": 1.497847318649292, + "learning_rate": 8.088954056695992e-06, + "loss": 0.3215, + "step": 11800 + }, + { + "epoch": 1.8473700688791483, + "grad_norm": 1.2425721883773804, + "learning_rate": 8.080808080808082e-06, + "loss": 0.4945, + "step": 11801 + }, + { + "epoch": 1.8475266123982466, + "grad_norm": 1.2899259328842163, + "learning_rate": 8.07266210492017e-06, + "loss": 0.4255, + "step": 11802 + }, + { + "epoch": 1.847683155917345, + "grad_norm": 2.1028807163238525, + "learning_rate": 8.064516129032258e-06, + "loss": 0.625, + "step": 11803 + }, + { + "epoch": 1.8478396994364434, + "grad_norm": 2.020739793777466, + "learning_rate": 8.056370153144347e-06, + "loss": 0.4226, + "step": 11804 + }, + { + "epoch": 1.8479962429555417, + "grad_norm": 1.8419381380081177, + "learning_rate": 8.048224177256436e-06, + "loss": 0.4814, + "step": 11805 + }, + { + "epoch": 1.8481527864746399, + "grad_norm": 1.8107738494873047, + "learning_rate": 8.040078201368525e-06, + "loss": 0.7619, + 
"step": 11806 + }, + { + "epoch": 1.8483093299937383, + "grad_norm": 1.8694567680358887, + "learning_rate": 8.031932225480612e-06, + "loss": 0.5669, + "step": 11807 + }, + { + "epoch": 1.8484658735128365, + "grad_norm": 1.8140857219696045, + "learning_rate": 8.023786249592703e-06, + "loss": 0.4763, + "step": 11808 + }, + { + "epoch": 1.848622417031935, + "grad_norm": 2.439490795135498, + "learning_rate": 8.01564027370479e-06, + "loss": 0.5047, + "step": 11809 + }, + { + "epoch": 1.8487789605510332, + "grad_norm": 1.2802966833114624, + "learning_rate": 8.007494297816879e-06, + "loss": 0.4416, + "step": 11810 + }, + { + "epoch": 1.8489355040701314, + "grad_norm": 1.6420679092407227, + "learning_rate": 7.999348321928967e-06, + "loss": 0.7416, + "step": 11811 + }, + { + "epoch": 1.8490920475892298, + "grad_norm": 4.21354866027832, + "learning_rate": 7.991202346041056e-06, + "loss": 0.6025, + "step": 11812 + }, + { + "epoch": 1.849248591108328, + "grad_norm": 2.1171875, + "learning_rate": 7.983056370153143e-06, + "loss": 0.5339, + "step": 11813 + }, + { + "epoch": 1.8494051346274265, + "grad_norm": 3.136996269226074, + "learning_rate": 7.974910394265234e-06, + "loss": 0.4322, + "step": 11814 + }, + { + "epoch": 1.8495616781465247, + "grad_norm": 0.7915540337562561, + "learning_rate": 7.966764418377323e-06, + "loss": 0.2209, + "step": 11815 + }, + { + "epoch": 1.849718221665623, + "grad_norm": 3.299316883087158, + "learning_rate": 7.95861844248941e-06, + "loss": 0.524, + "step": 11816 + }, + { + "epoch": 1.8498747651847214, + "grad_norm": 8.559708595275879, + "learning_rate": 7.9504724666015e-06, + "loss": 0.4822, + "step": 11817 + }, + { + "epoch": 1.8500313087038198, + "grad_norm": 2.9693355560302734, + "learning_rate": 7.942326490713587e-06, + "loss": 0.8424, + "step": 11818 + }, + { + "epoch": 1.850187852222918, + "grad_norm": 3.014822483062744, + "learning_rate": 7.934180514825676e-06, + "loss": 0.8551, + "step": 11819 + }, + { + "epoch": 1.8503443957420163, + 
"grad_norm": 2.474665641784668, + "learning_rate": 7.926034538937765e-06, + "loss": 0.7265, + "step": 11820 + }, + { + "epoch": 1.8505009392611145, + "grad_norm": 5.718163967132568, + "learning_rate": 7.917888563049854e-06, + "loss": 0.8167, + "step": 11821 + }, + { + "epoch": 1.850657482780213, + "grad_norm": 2.108369827270508, + "learning_rate": 7.909742587161941e-06, + "loss": 0.6556, + "step": 11822 + }, + { + "epoch": 1.8508140262993114, + "grad_norm": 4.806756496429443, + "learning_rate": 7.901596611274032e-06, + "loss": 0.9971, + "step": 11823 + }, + { + "epoch": 1.8509705698184096, + "grad_norm": 4.392995834350586, + "learning_rate": 7.893450635386119e-06, + "loss": 0.7158, + "step": 11824 + }, + { + "epoch": 1.8511271133375078, + "grad_norm": 2.177574634552002, + "learning_rate": 7.885304659498208e-06, + "loss": 0.7189, + "step": 11825 + }, + { + "epoch": 1.851283656856606, + "grad_norm": 2.4923274517059326, + "learning_rate": 7.877158683610298e-06, + "loss": 0.8327, + "step": 11826 + }, + { + "epoch": 1.8514402003757044, + "grad_norm": 2.9412729740142822, + "learning_rate": 7.869012707722385e-06, + "loss": 0.6929, + "step": 11827 + }, + { + "epoch": 1.8515967438948029, + "grad_norm": 3.562372922897339, + "learning_rate": 7.860866731834474e-06, + "loss": 1.289, + "step": 11828 + }, + { + "epoch": 1.851753287413901, + "grad_norm": 3.3577752113342285, + "learning_rate": 7.852720755946563e-06, + "loss": 0.8949, + "step": 11829 + }, + { + "epoch": 1.8519098309329993, + "grad_norm": 3.8627145290374756, + "learning_rate": 7.844574780058652e-06, + "loss": 1.3695, + "step": 11830 + }, + { + "epoch": 1.8520663744520975, + "grad_norm": 3.92171573638916, + "learning_rate": 7.836428804170739e-06, + "loss": 0.8118, + "step": 11831 + }, + { + "epoch": 1.852222917971196, + "grad_norm": 8.423507690429688, + "learning_rate": 7.82828282828283e-06, + "loss": 0.6805, + "step": 11832 + }, + { + "epoch": 1.8523794614902944, + "grad_norm": 5.796451091766357, + "learning_rate": 
7.820136852394917e-06, + "loss": 1.7295, + "step": 11833 + }, + { + "epoch": 1.8525360050093926, + "grad_norm": 1.7552770376205444, + "learning_rate": 7.811990876507005e-06, + "loss": 0.5674, + "step": 11834 + }, + { + "epoch": 1.8526925485284909, + "grad_norm": 3.995020866394043, + "learning_rate": 7.803844900619094e-06, + "loss": 0.6358, + "step": 11835 + }, + { + "epoch": 1.852849092047589, + "grad_norm": 3.156050682067871, + "learning_rate": 7.795698924731183e-06, + "loss": 1.1918, + "step": 11836 + }, + { + "epoch": 1.8530056355666875, + "grad_norm": 1.9720836877822876, + "learning_rate": 7.787552948843272e-06, + "loss": 0.7886, + "step": 11837 + }, + { + "epoch": 1.853162179085786, + "grad_norm": 1.946423888206482, + "learning_rate": 7.77940697295536e-06, + "loss": 0.4409, + "step": 11838 + }, + { + "epoch": 1.8533187226048842, + "grad_norm": 0.6291620135307312, + "learning_rate": 7.77126099706745e-06, + "loss": 0.2979, + "step": 11839 + }, + { + "epoch": 1.8534752661239824, + "grad_norm": 0.7084575891494751, + "learning_rate": 7.763115021179537e-06, + "loss": 0.2566, + "step": 11840 + }, + { + "epoch": 1.8536318096430808, + "grad_norm": 0.6545860767364502, + "learning_rate": 7.754969045291627e-06, + "loss": 0.3109, + "step": 11841 + }, + { + "epoch": 1.853788353162179, + "grad_norm": 0.9005463123321533, + "learning_rate": 7.746823069403714e-06, + "loss": 0.3324, + "step": 11842 + }, + { + "epoch": 1.8539448966812775, + "grad_norm": 0.6053066253662109, + "learning_rate": 7.738677093515803e-06, + "loss": 0.2779, + "step": 11843 + }, + { + "epoch": 1.8541014402003757, + "grad_norm": 0.583955705165863, + "learning_rate": 7.730531117627892e-06, + "loss": 0.2898, + "step": 11844 + }, + { + "epoch": 1.854257983719474, + "grad_norm": 0.8470158576965332, + "learning_rate": 7.722385141739981e-06, + "loss": 0.3137, + "step": 11845 + }, + { + "epoch": 1.8544145272385724, + "grad_norm": 0.5816637277603149, + "learning_rate": 7.71423916585207e-06, + "loss": 0.2475, + 
"step": 11846 + }, + { + "epoch": 1.8545710707576706, + "grad_norm": 0.8662569522857666, + "learning_rate": 7.706093189964159e-06, + "loss": 0.4307, + "step": 11847 + }, + { + "epoch": 1.854727614276769, + "grad_norm": 6.9525532722473145, + "learning_rate": 7.697947214076247e-06, + "loss": 0.8509, + "step": 11848 + }, + { + "epoch": 1.8548841577958672, + "grad_norm": 0.9900915622711182, + "learning_rate": 7.689801238188334e-06, + "loss": 0.4358, + "step": 11849 + }, + { + "epoch": 1.8550407013149655, + "grad_norm": 3.09839129447937, + "learning_rate": 7.681655262300425e-06, + "loss": 0.5641, + "step": 11850 + }, + { + "epoch": 1.855197244834064, + "grad_norm": 1.3057446479797363, + "learning_rate": 7.673509286412512e-06, + "loss": 0.4088, + "step": 11851 + }, + { + "epoch": 1.8553537883531623, + "grad_norm": 0.9218551516532898, + "learning_rate": 7.665363310524601e-06, + "loss": 0.2636, + "step": 11852 + }, + { + "epoch": 1.8555103318722606, + "grad_norm": 2.0346827507019043, + "learning_rate": 7.65721733463669e-06, + "loss": 0.3227, + "step": 11853 + }, + { + "epoch": 1.8556668753913588, + "grad_norm": 1.8452653884887695, + "learning_rate": 7.649071358748779e-06, + "loss": 0.5004, + "step": 11854 + }, + { + "epoch": 1.855823418910457, + "grad_norm": 1.5288772583007812, + "learning_rate": 7.640925382860867e-06, + "loss": 0.4756, + "step": 11855 + }, + { + "epoch": 1.8559799624295554, + "grad_norm": 2.3722097873687744, + "learning_rate": 7.632779406972956e-06, + "loss": 0.3684, + "step": 11856 + }, + { + "epoch": 1.8561365059486539, + "grad_norm": 1.376643180847168, + "learning_rate": 7.624633431085044e-06, + "loss": 0.3401, + "step": 11857 + }, + { + "epoch": 1.856293049467752, + "grad_norm": 1.1166343688964844, + "learning_rate": 7.616487455197132e-06, + "loss": 0.3335, + "step": 11858 + }, + { + "epoch": 1.8564495929868503, + "grad_norm": 1.0469523668289185, + "learning_rate": 7.608341479309222e-06, + "loss": 0.2453, + "step": 11859 + }, + { + "epoch": 
1.8566061365059485, + "grad_norm": 3.94055438041687, + "learning_rate": 7.600195503421311e-06, + "loss": 0.608, + "step": 11860 + }, + { + "epoch": 1.856762680025047, + "grad_norm": 7.441219329833984, + "learning_rate": 7.592049527533399e-06, + "loss": 1.0575, + "step": 11861 + }, + { + "epoch": 1.8569192235441454, + "grad_norm": 2.77823805809021, + "learning_rate": 7.5839035516454884e-06, + "loss": 0.3747, + "step": 11862 + }, + { + "epoch": 1.8570757670632436, + "grad_norm": 1.5626510381698608, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.3043, + "step": 11863 + }, + { + "epoch": 1.8572323105823418, + "grad_norm": 4.193922519683838, + "learning_rate": 7.567611599869664e-06, + "loss": 0.5742, + "step": 11864 + }, + { + "epoch": 1.85738885410144, + "grad_norm": 1.9951577186584473, + "learning_rate": 7.559465623981754e-06, + "loss": 0.7077, + "step": 11865 + }, + { + "epoch": 1.8575453976205385, + "grad_norm": 1.8019089698791504, + "learning_rate": 7.551319648093842e-06, + "loss": 0.6193, + "step": 11866 + }, + { + "epoch": 1.857701941139637, + "grad_norm": 4.907468318939209, + "learning_rate": 7.54317367220593e-06, + "loss": 0.8638, + "step": 11867 + }, + { + "epoch": 1.8578584846587352, + "grad_norm": 3.8551406860351562, + "learning_rate": 7.535027696318019e-06, + "loss": 0.7946, + "step": 11868 + }, + { + "epoch": 1.8580150281778334, + "grad_norm": 4.182845115661621, + "learning_rate": 7.526881720430108e-06, + "loss": 0.4607, + "step": 11869 + }, + { + "epoch": 1.8581715716969316, + "grad_norm": 3.771397352218628, + "learning_rate": 7.5187357445421966e-06, + "loss": 0.7112, + "step": 11870 + }, + { + "epoch": 1.85832811521603, + "grad_norm": 4.569880962371826, + "learning_rate": 7.5105897686542845e-06, + "loss": 1.0636, + "step": 11871 + }, + { + "epoch": 1.8584846587351285, + "grad_norm": 2.4522337913513184, + "learning_rate": 7.502443792766374e-06, + "loss": 1.061, + "step": 11872 + }, + { + "epoch": 1.8586412022542267, + "grad_norm": 
5.520784854888916, + "learning_rate": 7.494297816878462e-06, + "loss": 1.1818, + "step": 11873 + }, + { + "epoch": 1.858797745773325, + "grad_norm": 2.5617034435272217, + "learning_rate": 7.48615184099055e-06, + "loss": 0.3922, + "step": 11874 + }, + { + "epoch": 1.8589542892924233, + "grad_norm": 1.7915377616882324, + "learning_rate": 7.47800586510264e-06, + "loss": 0.5387, + "step": 11875 + }, + { + "epoch": 1.8591108328115216, + "grad_norm": 9.17439079284668, + "learning_rate": 7.469859889214728e-06, + "loss": 1.2299, + "step": 11876 + }, + { + "epoch": 1.85926737633062, + "grad_norm": 1.7967581748962402, + "learning_rate": 7.461713913326817e-06, + "loss": 0.7351, + "step": 11877 + }, + { + "epoch": 1.8594239198497182, + "grad_norm": 3.144145965576172, + "learning_rate": 7.4535679374389055e-06, + "loss": 0.6259, + "step": 11878 + }, + { + "epoch": 1.8595804633688164, + "grad_norm": 2.4115328788757324, + "learning_rate": 7.445421961550994e-06, + "loss": 0.7576, + "step": 11879 + }, + { + "epoch": 1.8597370068879149, + "grad_norm": 4.4977288246154785, + "learning_rate": 7.437275985663082e-06, + "loss": 0.7998, + "step": 11880 + }, + { + "epoch": 1.859893550407013, + "grad_norm": 5.419710159301758, + "learning_rate": 7.429130009775172e-06, + "loss": 1.2603, + "step": 11881 + }, + { + "epoch": 1.8600500939261115, + "grad_norm": 3.9099700450897217, + "learning_rate": 7.42098403388726e-06, + "loss": 0.8301, + "step": 11882 + }, + { + "epoch": 1.8602066374452098, + "grad_norm": 3.4485881328582764, + "learning_rate": 7.412838057999348e-06, + "loss": 0.8124, + "step": 11883 + }, + { + "epoch": 1.860363180964308, + "grad_norm": 4.141242027282715, + "learning_rate": 7.404692082111438e-06, + "loss": 0.6474, + "step": 11884 + }, + { + "epoch": 1.8605197244834064, + "grad_norm": 0.918839693069458, + "learning_rate": 7.396546106223526e-06, + "loss": 0.1674, + "step": 11885 + }, + { + "epoch": 1.8606762680025049, + "grad_norm": 3.54884672164917, + "learning_rate": 
7.388400130335614e-06, + "loss": 0.4473, + "step": 11886 + }, + { + "epoch": 1.860832811521603, + "grad_norm": 6.076399326324463, + "learning_rate": 7.380254154447703e-06, + "loss": 0.9997, + "step": 11887 + }, + { + "epoch": 1.8609893550407013, + "grad_norm": 2.7559165954589844, + "learning_rate": 7.372108178559792e-06, + "loss": 0.8234, + "step": 11888 + }, + { + "epoch": 1.8611458985597995, + "grad_norm": 0.6447470188140869, + "learning_rate": 7.36396220267188e-06, + "loss": 0.2956, + "step": 11889 + }, + { + "epoch": 1.861302442078898, + "grad_norm": 0.588019073009491, + "learning_rate": 7.35581622678397e-06, + "loss": 0.2464, + "step": 11890 + }, + { + "epoch": 1.8614589855979964, + "grad_norm": 0.576622486114502, + "learning_rate": 7.347670250896058e-06, + "loss": 0.3089, + "step": 11891 + }, + { + "epoch": 1.8616155291170946, + "grad_norm": 0.8565312623977661, + "learning_rate": 7.339524275008146e-06, + "loss": 0.2628, + "step": 11892 + }, + { + "epoch": 1.8617720726361928, + "grad_norm": 0.988175630569458, + "learning_rate": 7.3313782991202354e-06, + "loss": 0.3239, + "step": 11893 + }, + { + "epoch": 1.861928616155291, + "grad_norm": 0.6450760364532471, + "learning_rate": 7.3232323232323234e-06, + "loss": 0.2698, + "step": 11894 + }, + { + "epoch": 1.8620851596743895, + "grad_norm": 0.7986873984336853, + "learning_rate": 7.315086347344411e-06, + "loss": 0.2754, + "step": 11895 + }, + { + "epoch": 1.862241703193488, + "grad_norm": 0.5439180731773376, + "learning_rate": 7.306940371456501e-06, + "loss": 0.236, + "step": 11896 + }, + { + "epoch": 1.8623982467125861, + "grad_norm": 0.9225128293037415, + "learning_rate": 7.298794395568589e-06, + "loss": 0.3113, + "step": 11897 + }, + { + "epoch": 1.8625547902316844, + "grad_norm": 0.8254417777061462, + "learning_rate": 7.290648419680678e-06, + "loss": 0.295, + "step": 11898 + }, + { + "epoch": 1.8627113337507826, + "grad_norm": 0.5982120633125305, + "learning_rate": 7.282502443792768e-06, + "loss": 0.2243, + 
"step": 11899 + }, + { + "epoch": 1.862867877269881, + "grad_norm": 1.0442785024642944, + "learning_rate": 7.2743564679048556e-06, + "loss": 0.2961, + "step": 11900 + }, + { + "epoch": 1.8630244207889795, + "grad_norm": 1.318932294845581, + "learning_rate": 7.2662104920169436e-06, + "loss": 0.4241, + "step": 11901 + }, + { + "epoch": 1.8631809643080777, + "grad_norm": 0.7742049694061279, + "learning_rate": 7.258064516129033e-06, + "loss": 0.3099, + "step": 11902 + }, + { + "epoch": 1.8633375078271759, + "grad_norm": 1.7873265743255615, + "learning_rate": 7.249918540241121e-06, + "loss": 0.4658, + "step": 11903 + }, + { + "epoch": 1.863494051346274, + "grad_norm": 1.3983709812164307, + "learning_rate": 7.241772564353209e-06, + "loss": 0.2219, + "step": 11904 + }, + { + "epoch": 1.8636505948653725, + "grad_norm": 1.116650938987732, + "learning_rate": 7.233626588465299e-06, + "loss": 0.3475, + "step": 11905 + }, + { + "epoch": 1.863807138384471, + "grad_norm": 1.3055055141448975, + "learning_rate": 7.225480612577387e-06, + "loss": 0.2443, + "step": 11906 + }, + { + "epoch": 1.8639636819035692, + "grad_norm": 1.4229230880737305, + "learning_rate": 7.217334636689476e-06, + "loss": 0.2588, + "step": 11907 + }, + { + "epoch": 1.8641202254226674, + "grad_norm": 0.7412163615226746, + "learning_rate": 7.2091886608015645e-06, + "loss": 0.2496, + "step": 11908 + }, + { + "epoch": 1.8642767689417659, + "grad_norm": 1.935438871383667, + "learning_rate": 7.201042684913653e-06, + "loss": 0.3681, + "step": 11909 + }, + { + "epoch": 1.864433312460864, + "grad_norm": 3.0918591022491455, + "learning_rate": 7.192896709025741e-06, + "loss": 0.7463, + "step": 11910 + }, + { + "epoch": 1.8645898559799625, + "grad_norm": 1.8490350246429443, + "learning_rate": 7.184750733137831e-06, + "loss": 0.5944, + "step": 11911 + }, + { + "epoch": 1.8647463994990607, + "grad_norm": 1.4115583896636963, + "learning_rate": 7.176604757249919e-06, + "loss": 0.6366, + "step": 11912 + }, + { + "epoch": 
1.864902943018159, + "grad_norm": 1.5966428518295288, + "learning_rate": 7.168458781362007e-06, + "loss": 0.4462, + "step": 11913 + }, + { + "epoch": 1.8650594865372574, + "grad_norm": 1.4645309448242188, + "learning_rate": 7.160312805474097e-06, + "loss": 0.4233, + "step": 11914 + }, + { + "epoch": 1.8652160300563556, + "grad_norm": 2.1976349353790283, + "learning_rate": 7.152166829586185e-06, + "loss": 0.7757, + "step": 11915 + }, + { + "epoch": 1.865372573575454, + "grad_norm": 2.0167181491851807, + "learning_rate": 7.1440208536982735e-06, + "loss": 0.6941, + "step": 11916 + }, + { + "epoch": 1.8655291170945523, + "grad_norm": 3.1768338680267334, + "learning_rate": 7.135874877810362e-06, + "loss": 0.6868, + "step": 11917 + }, + { + "epoch": 1.8656856606136505, + "grad_norm": 3.279249668121338, + "learning_rate": 7.127728901922451e-06, + "loss": 0.9719, + "step": 11918 + }, + { + "epoch": 1.865842204132749, + "grad_norm": 1.9883712530136108, + "learning_rate": 7.119582926034539e-06, + "loss": 0.4257, + "step": 11919 + }, + { + "epoch": 1.8659987476518474, + "grad_norm": 1.6789515018463135, + "learning_rate": 7.111436950146629e-06, + "loss": 0.3604, + "step": 11920 + }, + { + "epoch": 1.8661552911709456, + "grad_norm": 2.4897546768188477, + "learning_rate": 7.103290974258717e-06, + "loss": 0.425, + "step": 11921 + }, + { + "epoch": 1.8663118346900438, + "grad_norm": 1.9659010171890259, + "learning_rate": 7.095144998370805e-06, + "loss": 0.5875, + "step": 11922 + }, + { + "epoch": 1.866468378209142, + "grad_norm": 4.42153787612915, + "learning_rate": 7.0869990224828945e-06, + "loss": 1.0139, + "step": 11923 + }, + { + "epoch": 1.8666249217282405, + "grad_norm": 4.387969970703125, + "learning_rate": 7.0788530465949824e-06, + "loss": 0.5941, + "step": 11924 + }, + { + "epoch": 1.866781465247339, + "grad_norm": 6.353248596191406, + "learning_rate": 7.0707070707070704e-06, + "loss": 0.5839, + "step": 11925 + }, + { + "epoch": 1.8669380087664371, + "grad_norm": 
5.590850353240967, + "learning_rate": 7.06256109481916e-06, + "loss": 1.0906, + "step": 11926 + }, + { + "epoch": 1.8670945522855353, + "grad_norm": 3.0824270248413086, + "learning_rate": 7.054415118931248e-06, + "loss": 1.143, + "step": 11927 + }, + { + "epoch": 1.8672510958046336, + "grad_norm": 2.970263719558716, + "learning_rate": 7.046269143043337e-06, + "loss": 0.5793, + "step": 11928 + }, + { + "epoch": 1.867407639323732, + "grad_norm": 6.330408096313477, + "learning_rate": 7.038123167155427e-06, + "loss": 1.4466, + "step": 11929 + }, + { + "epoch": 1.8675641828428304, + "grad_norm": 3.046933174133301, + "learning_rate": 7.029977191267515e-06, + "loss": 1.049, + "step": 11930 + }, + { + "epoch": 1.8677207263619287, + "grad_norm": 3.8629543781280518, + "learning_rate": 7.0218312153796026e-06, + "loss": 1.9499, + "step": 11931 + }, + { + "epoch": 1.8678772698810269, + "grad_norm": 3.915165901184082, + "learning_rate": 7.0136852394916906e-06, + "loss": 1.0887, + "step": 11932 + }, + { + "epoch": 1.868033813400125, + "grad_norm": NaN, + "learning_rate": 7.0136852394916906e-06, + "loss": 0.0, + "step": 11933 + }, + { + "epoch": 1.8681903569192235, + "grad_norm": 2.6858162879943848, + "learning_rate": 7.00553926360378e-06, + "loss": 0.9222, + "step": 11934 + }, + { + "epoch": 1.868346900438322, + "grad_norm": 3.5079078674316406, + "learning_rate": 6.997393287715868e-06, + "loss": 1.1746, + "step": 11935 + }, + { + "epoch": 1.8685034439574202, + "grad_norm": 2.0634641647338867, + "learning_rate": 6.989247311827957e-06, + "loss": 0.5171, + "step": 11936 + }, + { + "epoch": 1.8686599874765184, + "grad_norm": 16.523466110229492, + "learning_rate": 6.981101335940046e-06, + "loss": 0.598, + "step": 11937 + }, + { + "epoch": 1.8688165309956166, + "grad_norm": 4.374631404876709, + "learning_rate": 6.972955360052135e-06, + "loss": 0.9199, + "step": 11938 + }, + { + "epoch": 1.868973074514715, + "grad_norm": 0.49263325333595276, + "learning_rate": 6.964809384164223e-06, + 
"loss": 0.2379, + "step": 11939 + }, + { + "epoch": 1.8691296180338135, + "grad_norm": 0.5886602997779846, + "learning_rate": 6.956663408276312e-06, + "loss": 0.305, + "step": 11940 + }, + { + "epoch": 1.8692861615529117, + "grad_norm": 0.8126084804534912, + "learning_rate": 6.9485174323884e-06, + "loss": 0.334, + "step": 11941 + }, + { + "epoch": 1.86944270507201, + "grad_norm": 0.4176965653896332, + "learning_rate": 6.940371456500488e-06, + "loss": 0.2456, + "step": 11942 + }, + { + "epoch": 1.8695992485911084, + "grad_norm": 0.8928771615028381, + "learning_rate": 6.932225480612578e-06, + "loss": 0.5099, + "step": 11943 + }, + { + "epoch": 1.8697557921102066, + "grad_norm": 0.6065268516540527, + "learning_rate": 6.924079504724666e-06, + "loss": 0.2695, + "step": 11944 + }, + { + "epoch": 1.869912335629305, + "grad_norm": 0.6004766225814819, + "learning_rate": 6.915933528836754e-06, + "loss": 0.2351, + "step": 11945 + }, + { + "epoch": 1.8700688791484033, + "grad_norm": 1.2223328351974487, + "learning_rate": 6.907787552948844e-06, + "loss": 0.3536, + "step": 11946 + }, + { + "epoch": 1.8702254226675015, + "grad_norm": 1.0061784982681274, + "learning_rate": 6.8996415770609325e-06, + "loss": 0.3183, + "step": 11947 + }, + { + "epoch": 1.8703819661866, + "grad_norm": 0.8203244805335999, + "learning_rate": 6.8914956011730205e-06, + "loss": 0.3301, + "step": 11948 + }, + { + "epoch": 1.8705385097056983, + "grad_norm": 1.665073037147522, + "learning_rate": 6.88334962528511e-06, + "loss": 0.4206, + "step": 11949 + }, + { + "epoch": 1.8706950532247966, + "grad_norm": 0.9251136183738708, + "learning_rate": 6.875203649397198e-06, + "loss": 0.4656, + "step": 11950 + }, + { + "epoch": 1.8708515967438948, + "grad_norm": 1.5275501012802124, + "learning_rate": 6.867057673509286e-06, + "loss": 0.4114, + "step": 11951 + }, + { + "epoch": 1.871008140262993, + "grad_norm": 1.8447656631469727, + "learning_rate": 6.858911697621376e-06, + "loss": 0.4902, + "step": 11952 + }, + { + 
"epoch": 1.8711646837820914, + "grad_norm": 2.3487658500671387, + "learning_rate": 6.850765721733464e-06, + "loss": 0.5196, + "step": 11953 + }, + { + "epoch": 1.8713212273011899, + "grad_norm": 1.162658452987671, + "learning_rate": 6.842619745845552e-06, + "loss": 0.4274, + "step": 11954 + }, + { + "epoch": 1.871477770820288, + "grad_norm": 1.4686226844787598, + "learning_rate": 6.8344737699576415e-06, + "loss": 0.4342, + "step": 11955 + }, + { + "epoch": 1.8716343143393863, + "grad_norm": 1.709752082824707, + "learning_rate": 6.8263277940697294e-06, + "loss": 0.4458, + "step": 11956 + }, + { + "epoch": 1.8717908578584845, + "grad_norm": 1.0055660009384155, + "learning_rate": 6.818181818181818e-06, + "loss": 0.4253, + "step": 11957 + }, + { + "epoch": 1.871947401377583, + "grad_norm": 1.840532898902893, + "learning_rate": 6.810035842293908e-06, + "loss": 0.4999, + "step": 11958 + }, + { + "epoch": 1.8721039448966814, + "grad_norm": 2.436182737350464, + "learning_rate": 6.801889866405996e-06, + "loss": 0.5004, + "step": 11959 + }, + { + "epoch": 1.8722604884157796, + "grad_norm": 4.032283782958984, + "learning_rate": 6.793743890518084e-06, + "loss": 0.6779, + "step": 11960 + }, + { + "epoch": 1.8724170319348779, + "grad_norm": 1.9112107753753662, + "learning_rate": 6.785597914630174e-06, + "loss": 0.7059, + "step": 11961 + }, + { + "epoch": 1.872573575453976, + "grad_norm": 2.073430061340332, + "learning_rate": 6.777451938742262e-06, + "loss": 0.3299, + "step": 11962 + }, + { + "epoch": 1.8727301189730745, + "grad_norm": 1.5926822423934937, + "learning_rate": 6.7693059628543496e-06, + "loss": 0.765, + "step": 11963 + }, + { + "epoch": 1.872886662492173, + "grad_norm": 2.5569851398468018, + "learning_rate": 6.761159986966439e-06, + "loss": 0.4777, + "step": 11964 + }, + { + "epoch": 1.8730432060112712, + "grad_norm": 2.837730646133423, + "learning_rate": 6.753014011078527e-06, + "loss": 0.4322, + "step": 11965 + }, + { + "epoch": 1.8731997495303694, + "grad_norm": 
3.3506898880004883, + "learning_rate": 6.744868035190616e-06, + "loss": 0.6601, + "step": 11966 + }, + { + "epoch": 1.8733562930494676, + "grad_norm": 4.273726940155029, + "learning_rate": 6.736722059302705e-06, + "loss": 0.6115, + "step": 11967 + }, + { + "epoch": 1.873512836568566, + "grad_norm": 1.747701644897461, + "learning_rate": 6.728576083414794e-06, + "loss": 0.4191, + "step": 11968 + }, + { + "epoch": 1.8736693800876645, + "grad_norm": 2.5972836017608643, + "learning_rate": 6.720430107526882e-06, + "loss": 0.5715, + "step": 11969 + }, + { + "epoch": 1.8738259236067627, + "grad_norm": 1.855442762374878, + "learning_rate": 6.712284131638971e-06, + "loss": 0.6263, + "step": 11970 + }, + { + "epoch": 1.873982467125861, + "grad_norm": 1.8909631967544556, + "learning_rate": 6.704138155751059e-06, + "loss": 0.589, + "step": 11971 + }, + { + "epoch": 1.8741390106449591, + "grad_norm": 4.692439079284668, + "learning_rate": 6.695992179863147e-06, + "loss": 0.741, + "step": 11972 + }, + { + "epoch": 1.8742955541640576, + "grad_norm": 8.81668758392334, + "learning_rate": 6.687846203975237e-06, + "loss": 0.7612, + "step": 11973 + }, + { + "epoch": 1.874452097683156, + "grad_norm": 2.4300053119659424, + "learning_rate": 6.679700228087325e-06, + "loss": 0.5055, + "step": 11974 + }, + { + "epoch": 1.8746086412022542, + "grad_norm": 3.6211280822753906, + "learning_rate": 6.671554252199414e-06, + "loss": 0.9829, + "step": 11975 + }, + { + "epoch": 1.8747651847213525, + "grad_norm": 3.444044589996338, + "learning_rate": 6.663408276311503e-06, + "loss": 1.0994, + "step": 11976 + }, + { + "epoch": 1.874921728240451, + "grad_norm": 3.5625085830688477, + "learning_rate": 6.6552623004235915e-06, + "loss": 0.5628, + "step": 11977 + }, + { + "epoch": 1.875078271759549, + "grad_norm": 3.6905524730682373, + "learning_rate": 6.6471163245356795e-06, + "loss": 0.9668, + "step": 11978 + }, + { + "epoch": 1.8752348152786475, + "grad_norm": 5.353758811950684, + "learning_rate": 
6.638970348647769e-06, + "loss": 1.2314, + "step": 11979 + }, + { + "epoch": 1.8753913587977458, + "grad_norm": 3.0839314460754395, + "learning_rate": 6.630824372759857e-06, + "loss": 1.0769, + "step": 11980 + }, + { + "epoch": 1.875547902316844, + "grad_norm": 3.682119607925415, + "learning_rate": 6.622678396871945e-06, + "loss": 1.1167, + "step": 11981 + }, + { + "epoch": 1.8757044458359424, + "grad_norm": 2.510239601135254, + "learning_rate": 6.614532420984035e-06, + "loss": 0.6673, + "step": 11982 + }, + { + "epoch": 1.8758609893550409, + "grad_norm": 2.1695849895477295, + "learning_rate": 6.606386445096123e-06, + "loss": 0.5115, + "step": 11983 + }, + { + "epoch": 1.876017532874139, + "grad_norm": 11.790565490722656, + "learning_rate": 6.598240469208211e-06, + "loss": 0.4903, + "step": 11984 + }, + { + "epoch": 1.8761740763932373, + "grad_norm": 1.7815672159194946, + "learning_rate": 6.5900944933203005e-06, + "loss": 0.3129, + "step": 11985 + }, + { + "epoch": 1.8763306199123355, + "grad_norm": 3.781043291091919, + "learning_rate": 6.5819485174323885e-06, + "loss": 0.9739, + "step": 11986 + }, + { + "epoch": 1.876487163431434, + "grad_norm": 4.2882609367370605, + "learning_rate": 6.573802541544477e-06, + "loss": 0.9844, + "step": 11987 + }, + { + "epoch": 1.8766437069505324, + "grad_norm": 2.978605031967163, + "learning_rate": 6.565656565656567e-06, + "loss": 1.1775, + "step": 11988 + }, + { + "epoch": 1.8768002504696306, + "grad_norm": 0.777816653251648, + "learning_rate": 6.557510589768655e-06, + "loss": 0.4325, + "step": 11989 + }, + { + "epoch": 1.8769567939887288, + "grad_norm": 0.6157427430152893, + "learning_rate": 6.549364613880743e-06, + "loss": 0.3309, + "step": 11990 + }, + { + "epoch": 1.877113337507827, + "grad_norm": 0.8688997626304626, + "learning_rate": 6.541218637992833e-06, + "loss": 0.533, + "step": 11991 + }, + { + "epoch": 1.8772698810269255, + "grad_norm": 0.914389431476593, + "learning_rate": 6.533072662104921e-06, + "loss": 0.3781, + 
"step": 11992 + }, + { + "epoch": 1.877426424546024, + "grad_norm": 0.9454541802406311, + "learning_rate": 6.524926686217009e-06, + "loss": 0.4797, + "step": 11993 + }, + { + "epoch": 1.8775829680651221, + "grad_norm": 2.5074470043182373, + "learning_rate": 6.516780710329098e-06, + "loss": 0.4727, + "step": 11994 + }, + { + "epoch": 1.8777395115842204, + "grad_norm": 0.7996684312820435, + "learning_rate": 6.508634734441186e-06, + "loss": 0.3727, + "step": 11995 + }, + { + "epoch": 1.8778960551033186, + "grad_norm": 0.7908637523651123, + "learning_rate": 6.500488758553275e-06, + "loss": 0.4176, + "step": 11996 + }, + { + "epoch": 1.878052598622417, + "grad_norm": 0.7815083861351013, + "learning_rate": 6.492342782665363e-06, + "loss": 0.4402, + "step": 11997 + }, + { + "epoch": 1.8782091421415155, + "grad_norm": 0.8773549199104309, + "learning_rate": 6.484196806777453e-06, + "loss": 0.3736, + "step": 11998 + }, + { + "epoch": 1.8783656856606137, + "grad_norm": 1.30736243724823, + "learning_rate": 6.476050830889541e-06, + "loss": 0.4292, + "step": 11999 + }, + { + "epoch": 1.878522229179712, + "grad_norm": 1.1465976238250732, + "learning_rate": 6.467904855001629e-06, + "loss": 0.3529, + "step": 12000 + }, + { + "epoch": 1.878522229179712, + "eval_loss": 0.5416739583015442, + "eval_runtime": 206.2717, + "eval_samples_per_second": 60.032, + "eval_steps_per_second": 3.752, + "eval_wer": 0.31977176615593855, + "step": 12000 + }, + { + "epoch": 1.8786787726988101, + "grad_norm": 2.6031224727630615, + "learning_rate": 6.459758879113718e-06, + "loss": 0.8113, + "step": 12001 + }, + { + "epoch": 1.8788353162179086, + "grad_norm": 2.8572726249694824, + "learning_rate": 6.451612903225806e-06, + "loss": 0.6138, + "step": 12002 + }, + { + "epoch": 1.878991859737007, + "grad_norm": 0.8965590000152588, + "learning_rate": 6.443466927337894e-06, + "loss": 0.3103, + "step": 12003 + }, + { + "epoch": 1.8791484032561052, + "grad_norm": 0.9854152798652649, + "learning_rate": 
6.435320951449984e-06, + "loss": 0.3569, + "step": 12004 + }, + { + "epoch": 1.8793049467752034, + "grad_norm": 1.7655422687530518, + "learning_rate": 6.427174975562073e-06, + "loss": 0.5636, + "step": 12005 + }, + { + "epoch": 1.8794614902943017, + "grad_norm": 1.7787092924118042, + "learning_rate": 6.419028999674161e-06, + "loss": 0.4998, + "step": 12006 + }, + { + "epoch": 1.8796180338134, + "grad_norm": 1.8858742713928223, + "learning_rate": 6.4108830237862505e-06, + "loss": 0.4857, + "step": 12007 + }, + { + "epoch": 1.8797745773324985, + "grad_norm": 3.161541223526001, + "learning_rate": 6.4027370478983385e-06, + "loss": 0.5449, + "step": 12008 + }, + { + "epoch": 1.8799311208515967, + "grad_norm": 1.61588454246521, + "learning_rate": 6.3945910720104265e-06, + "loss": 0.6103, + "step": 12009 + }, + { + "epoch": 1.880087664370695, + "grad_norm": 3.096891164779663, + "learning_rate": 6.386445096122516e-06, + "loss": 0.6997, + "step": 12010 + }, + { + "epoch": 1.8802442078897934, + "grad_norm": 3.2030820846557617, + "learning_rate": 6.378299120234604e-06, + "loss": 0.4494, + "step": 12011 + }, + { + "epoch": 1.8804007514088916, + "grad_norm": 1.729018211364746, + "learning_rate": 6.370153144346692e-06, + "loss": 0.5669, + "step": 12012 + }, + { + "epoch": 1.88055729492799, + "grad_norm": 1.8659478425979614, + "learning_rate": 6.362007168458782e-06, + "loss": 0.5895, + "step": 12013 + }, + { + "epoch": 1.8807138384470883, + "grad_norm": 1.2703486680984497, + "learning_rate": 6.35386119257087e-06, + "loss": 0.4197, + "step": 12014 + }, + { + "epoch": 1.8808703819661865, + "grad_norm": 2.5146729946136475, + "learning_rate": 6.345715216682959e-06, + "loss": 0.6426, + "step": 12015 + }, + { + "epoch": 1.881026925485285, + "grad_norm": 1.1665806770324707, + "learning_rate": 6.337569240795048e-06, + "loss": 0.3034, + "step": 12016 + }, + { + "epoch": 1.8811834690043834, + "grad_norm": 2.1721248626708984, + "learning_rate": 6.329423264907136e-06, + "loss": 0.519, + 
"step": 12017 + }, + { + "epoch": 1.8813400125234816, + "grad_norm": 1.4636507034301758, + "learning_rate": 6.321277289019224e-06, + "loss": 0.4571, + "step": 12018 + }, + { + "epoch": 1.8814965560425798, + "grad_norm": 2.3797831535339355, + "learning_rate": 6.313131313131314e-06, + "loss": 0.8121, + "step": 12019 + }, + { + "epoch": 1.881653099561678, + "grad_norm": 3.795231819152832, + "learning_rate": 6.304985337243402e-06, + "loss": 0.7672, + "step": 12020 + }, + { + "epoch": 1.8818096430807765, + "grad_norm": 2.166515827178955, + "learning_rate": 6.29683936135549e-06, + "loss": 0.9353, + "step": 12021 + }, + { + "epoch": 1.881966186599875, + "grad_norm": 3.937155246734619, + "learning_rate": 6.28869338546758e-06, + "loss": 0.9647, + "step": 12022 + }, + { + "epoch": 1.8821227301189731, + "grad_norm": 4.790675163269043, + "learning_rate": 6.280547409579668e-06, + "loss": 0.85, + "step": 12023 + }, + { + "epoch": 1.8822792736380713, + "grad_norm": 3.7363228797912598, + "learning_rate": 6.2724014336917564e-06, + "loss": 0.804, + "step": 12024 + }, + { + "epoch": 1.8824358171571696, + "grad_norm": 2.5283005237579346, + "learning_rate": 6.264255457803845e-06, + "loss": 0.9215, + "step": 12025 + }, + { + "epoch": 1.882592360676268, + "grad_norm": 4.601849555969238, + "learning_rate": 6.256109481915934e-06, + "loss": 1.0995, + "step": 12026 + }, + { + "epoch": 1.8827489041953664, + "grad_norm": 4.328213691711426, + "learning_rate": 6.247963506028023e-06, + "loss": 0.9302, + "step": 12027 + }, + { + "epoch": 1.8829054477144647, + "grad_norm": 2.6439504623413086, + "learning_rate": 6.239817530140111e-06, + "loss": 0.9935, + "step": 12028 + }, + { + "epoch": 1.8830619912335629, + "grad_norm": 3.7033910751342773, + "learning_rate": 6.2316715542522e-06, + "loss": 1.0871, + "step": 12029 + }, + { + "epoch": 1.883218534752661, + "grad_norm": 2.395552158355713, + "learning_rate": 6.223525578364289e-06, + "loss": 1.018, + "step": 12030 + }, + { + "epoch": 1.8833750782717595, 
+ "grad_norm": 5.874751567840576, + "learning_rate": 6.2153796024763766e-06, + "loss": 1.3241, + "step": 12031 + }, + { + "epoch": 1.883531621790858, + "grad_norm": 4.5470709800720215, + "learning_rate": 6.207233626588465e-06, + "loss": 1.1738, + "step": 12032 + }, + { + "epoch": 1.8836881653099562, + "grad_norm": 3.1424667835235596, + "learning_rate": 6.199087650700554e-06, + "loss": 0.6351, + "step": 12033 + }, + { + "epoch": 1.8838447088290544, + "grad_norm": 3.422464370727539, + "learning_rate": 6.190941674812643e-06, + "loss": 0.4433, + "step": 12034 + }, + { + "epoch": 1.8840012523481526, + "grad_norm": 3.469778060913086, + "learning_rate": 6.182795698924732e-06, + "loss": 0.4298, + "step": 12035 + }, + { + "epoch": 1.884157795867251, + "grad_norm": 2.3466837406158447, + "learning_rate": 6.174649723036821e-06, + "loss": 0.5307, + "step": 12036 + }, + { + "epoch": 1.8843143393863495, + "grad_norm": 4.561711311340332, + "learning_rate": 6.166503747148909e-06, + "loss": 0.8104, + "step": 12037 + }, + { + "epoch": 1.8844708829054477, + "grad_norm": 2.3072192668914795, + "learning_rate": 6.1583577712609975e-06, + "loss": 0.632, + "step": 12038 + }, + { + "epoch": 1.884627426424546, + "grad_norm": 0.712828516960144, + "learning_rate": 6.150211795373086e-06, + "loss": 0.3753, + "step": 12039 + }, + { + "epoch": 1.8847839699436444, + "grad_norm": 0.6680353879928589, + "learning_rate": 6.142065819485174e-06, + "loss": 0.3713, + "step": 12040 + }, + { + "epoch": 1.8849405134627426, + "grad_norm": 1.0650885105133057, + "learning_rate": 6.133919843597263e-06, + "loss": 0.449, + "step": 12041 + }, + { + "epoch": 1.885097056981841, + "grad_norm": 0.8658413887023926, + "learning_rate": 6.125773867709352e-06, + "loss": 0.3681, + "step": 12042 + }, + { + "epoch": 1.8852536005009393, + "grad_norm": 0.6537708640098572, + "learning_rate": 6.11762789182144e-06, + "loss": 0.3385, + "step": 12043 + }, + { + "epoch": 1.8854101440200375, + "grad_norm": 0.8216213583946228, + 
"learning_rate": 6.109481915933529e-06, + "loss": 0.3818, + "step": 12044 + }, + { + "epoch": 1.885566687539136, + "grad_norm": 1.5654932260513306, + "learning_rate": 6.101335940045618e-06, + "loss": 0.4754, + "step": 12045 + }, + { + "epoch": 1.8857232310582341, + "grad_norm": 1.1929435729980469, + "learning_rate": 6.0931899641577065e-06, + "loss": 0.3636, + "step": 12046 + }, + { + "epoch": 1.8858797745773326, + "grad_norm": 0.5765544772148132, + "learning_rate": 6.085043988269795e-06, + "loss": 0.2977, + "step": 12047 + }, + { + "epoch": 1.8860363180964308, + "grad_norm": 1.1112456321716309, + "learning_rate": 6.076898012381883e-06, + "loss": 0.4089, + "step": 12048 + }, + { + "epoch": 1.886192861615529, + "grad_norm": 0.6730085611343384, + "learning_rate": 6.068752036493972e-06, + "loss": 0.3208, + "step": 12049 + }, + { + "epoch": 1.8863494051346275, + "grad_norm": 1.0832507610321045, + "learning_rate": 6.060606060606061e-06, + "loss": 0.422, + "step": 12050 + }, + { + "epoch": 1.886505948653726, + "grad_norm": 0.7013895511627197, + "learning_rate": 6.052460084718149e-06, + "loss": 0.3752, + "step": 12051 + }, + { + "epoch": 1.8866624921728241, + "grad_norm": 0.9578732848167419, + "learning_rate": 6.044314108830238e-06, + "loss": 0.5127, + "step": 12052 + }, + { + "epoch": 1.8868190356919223, + "grad_norm": 1.243189811706543, + "learning_rate": 6.036168132942327e-06, + "loss": 0.3366, + "step": 12053 + }, + { + "epoch": 1.8869755792110205, + "grad_norm": 0.9274179935455322, + "learning_rate": 6.0280221570544155e-06, + "loss": 0.419, + "step": 12054 + }, + { + "epoch": 1.887132122730119, + "grad_norm": 2.1944453716278076, + "learning_rate": 6.019876181166504e-06, + "loss": 0.8151, + "step": 12055 + }, + { + "epoch": 1.8872886662492174, + "grad_norm": 2.121548652648926, + "learning_rate": 6.011730205278593e-06, + "loss": 0.7492, + "step": 12056 + }, + { + "epoch": 1.8874452097683156, + "grad_norm": 1.70314359664917, + "learning_rate": 6.003584229390681e-06, + 
"loss": 0.5907, + "step": 12057 + }, + { + "epoch": 1.8876017532874139, + "grad_norm": 1.411765456199646, + "learning_rate": 5.99543825350277e-06, + "loss": 0.3359, + "step": 12058 + }, + { + "epoch": 1.887758296806512, + "grad_norm": 2.0746073722839355, + "learning_rate": 5.987292277614859e-06, + "loss": 0.8708, + "step": 12059 + }, + { + "epoch": 1.8879148403256105, + "grad_norm": 3.6200900077819824, + "learning_rate": 5.979146301726947e-06, + "loss": 0.6571, + "step": 12060 + }, + { + "epoch": 1.888071383844709, + "grad_norm": 1.6842200756072998, + "learning_rate": 5.971000325839036e-06, + "loss": 0.5662, + "step": 12061 + }, + { + "epoch": 1.8882279273638072, + "grad_norm": 2.275728940963745, + "learning_rate": 5.962854349951124e-06, + "loss": 0.6034, + "step": 12062 + }, + { + "epoch": 1.8883844708829054, + "grad_norm": 3.925459146499634, + "learning_rate": 5.954708374063213e-06, + "loss": 0.5545, + "step": 12063 + }, + { + "epoch": 1.8885410144020036, + "grad_norm": 2.036801338195801, + "learning_rate": 5.946562398175302e-06, + "loss": 0.5547, + "step": 12064 + }, + { + "epoch": 1.888697557921102, + "grad_norm": 3.1853623390197754, + "learning_rate": 5.938416422287391e-06, + "loss": 0.6387, + "step": 12065 + }, + { + "epoch": 1.8888541014402005, + "grad_norm": 1.3382679224014282, + "learning_rate": 5.930270446399479e-06, + "loss": 0.4008, + "step": 12066 + }, + { + "epoch": 1.8890106449592987, + "grad_norm": 3.84051251411438, + "learning_rate": 5.922124470511568e-06, + "loss": 0.6837, + "step": 12067 + }, + { + "epoch": 1.889167188478397, + "grad_norm": 4.388838768005371, + "learning_rate": 5.9139784946236566e-06, + "loss": 1.0494, + "step": 12068 + }, + { + "epoch": 1.8893237319974951, + "grad_norm": 2.8012328147888184, + "learning_rate": 5.9058325187357445e-06, + "loss": 0.7789, + "step": 12069 + }, + { + "epoch": 1.8894802755165936, + "grad_norm": 4.523769378662109, + "learning_rate": 5.897686542847833e-06, + "loss": 0.7916, + "step": 12070 + }, + { + 
"epoch": 1.889636819035692, + "grad_norm": 2.7708323001861572, + "learning_rate": 5.889540566959922e-06, + "loss": 0.8891, + "step": 12071 + }, + { + "epoch": 1.8897933625547902, + "grad_norm": 3.477240562438965, + "learning_rate": 5.88139459107201e-06, + "loss": 0.9363, + "step": 12072 + }, + { + "epoch": 1.8899499060738885, + "grad_norm": 2.68642258644104, + "learning_rate": 5.8732486151841e-06, + "loss": 0.7428, + "step": 12073 + }, + { + "epoch": 1.890106449592987, + "grad_norm": 3.560378313064575, + "learning_rate": 5.865102639296189e-06, + "loss": 0.6878, + "step": 12074 + }, + { + "epoch": 1.8902629931120851, + "grad_norm": 4.687848091125488, + "learning_rate": 5.856956663408277e-06, + "loss": 1.256, + "step": 12075 + }, + { + "epoch": 1.8904195366311836, + "grad_norm": 5.7802910804748535, + "learning_rate": 5.8488106875203655e-06, + "loss": 1.0475, + "step": 12076 + }, + { + "epoch": 1.8905760801502818, + "grad_norm": 3.1628963947296143, + "learning_rate": 5.8406647116324535e-06, + "loss": 1.1098, + "step": 12077 + }, + { + "epoch": 1.89073262366938, + "grad_norm": 3.312086343765259, + "learning_rate": 5.832518735744542e-06, + "loss": 0.9315, + "step": 12078 + }, + { + "epoch": 1.8908891671884784, + "grad_norm": 2.6078884601593018, + "learning_rate": 5.824372759856631e-06, + "loss": 0.842, + "step": 12079 + }, + { + "epoch": 1.8910457107075767, + "grad_norm": 3.877609968185425, + "learning_rate": 5.816226783968719e-06, + "loss": 0.9026, + "step": 12080 + }, + { + "epoch": 1.891202254226675, + "grad_norm": 7.133406639099121, + "learning_rate": 5.808080808080808e-06, + "loss": 1.5367, + "step": 12081 + }, + { + "epoch": 1.8913587977457733, + "grad_norm": 3.3148412704467773, + "learning_rate": 5.799934832192897e-06, + "loss": 1.2304, + "step": 12082 + }, + { + "epoch": 1.8915153412648715, + "grad_norm": 2.687420606613159, + "learning_rate": 5.791788856304986e-06, + "loss": 0.7266, + "step": 12083 + }, + { + "epoch": 1.89167188478397, + "grad_norm": 
3.2409512996673584, + "learning_rate": 5.7836428804170745e-06, + "loss": 0.7197, + "step": 12084 + }, + { + "epoch": 1.8918284283030684, + "grad_norm": 2.346021890640259, + "learning_rate": 5.775496904529163e-06, + "loss": 0.6739, + "step": 12085 + }, + { + "epoch": 1.8919849718221666, + "grad_norm": 2.4460196495056152, + "learning_rate": 5.767350928641251e-06, + "loss": 0.5937, + "step": 12086 + }, + { + "epoch": 1.8921415153412648, + "grad_norm": 5.112000942230225, + "learning_rate": 5.75920495275334e-06, + "loss": 1.4154, + "step": 12087 + }, + { + "epoch": 1.892298058860363, + "grad_norm": 2.411132335662842, + "learning_rate": 5.751058976865429e-06, + "loss": 0.8145, + "step": 12088 + }, + { + "epoch": 1.8924546023794615, + "grad_norm": 0.571246325969696, + "learning_rate": 5.742913000977517e-06, + "loss": 0.4517, + "step": 12089 + }, + { + "epoch": 1.89261114589856, + "grad_norm": 0.512128472328186, + "learning_rate": 5.734767025089606e-06, + "loss": 0.3557, + "step": 12090 + }, + { + "epoch": 1.8927676894176582, + "grad_norm": 0.81144118309021, + "learning_rate": 5.726621049201695e-06, + "loss": 0.3893, + "step": 12091 + }, + { + "epoch": 1.8929242329367564, + "grad_norm": 0.8612348437309265, + "learning_rate": 5.7184750733137834e-06, + "loss": 0.4439, + "step": 12092 + }, + { + "epoch": 1.8930807764558546, + "grad_norm": 0.655714213848114, + "learning_rate": 5.710329097425872e-06, + "loss": 0.3709, + "step": 12093 + }, + { + "epoch": 1.893237319974953, + "grad_norm": 0.6706739664077759, + "learning_rate": 5.702183121537961e-06, + "loss": 0.352, + "step": 12094 + }, + { + "epoch": 1.8933938634940515, + "grad_norm": 0.5924631357192993, + "learning_rate": 5.694037145650049e-06, + "loss": 0.4143, + "step": 12095 + }, + { + "epoch": 1.8935504070131497, + "grad_norm": 1.4839768409729004, + "learning_rate": 5.685891169762138e-06, + "loss": 0.5061, + "step": 12096 + }, + { + "epoch": 1.893706950532248, + "grad_norm": 1.3477853536605835, + "learning_rate": 
5.677745193874227e-06, + "loss": 0.5497, + "step": 12097 + }, + { + "epoch": 1.8938634940513461, + "grad_norm": 1.0705461502075195, + "learning_rate": 5.669599217986315e-06, + "loss": 0.4558, + "step": 12098 + }, + { + "epoch": 1.8940200375704446, + "grad_norm": 1.9132755994796753, + "learning_rate": 5.6614532420984036e-06, + "loss": 0.4396, + "step": 12099 + }, + { + "epoch": 1.894176581089543, + "grad_norm": 1.1979432106018066, + "learning_rate": 5.653307266210492e-06, + "loss": 0.4402, + "step": 12100 + }, + { + "epoch": 1.8943331246086412, + "grad_norm": 1.0470112562179565, + "learning_rate": 5.64516129032258e-06, + "loss": 0.4556, + "step": 12101 + }, + { + "epoch": 1.8944896681277394, + "grad_norm": 3.478754758834839, + "learning_rate": 5.63701531443467e-06, + "loss": 0.589, + "step": 12102 + }, + { + "epoch": 1.8946462116468377, + "grad_norm": 1.068916916847229, + "learning_rate": 5.628869338546759e-06, + "loss": 0.5314, + "step": 12103 + }, + { + "epoch": 1.894802755165936, + "grad_norm": 1.7096710205078125, + "learning_rate": 5.620723362658847e-06, + "loss": 0.6918, + "step": 12104 + }, + { + "epoch": 1.8949592986850345, + "grad_norm": 1.2677874565124512, + "learning_rate": 5.612577386770936e-06, + "loss": 0.4113, + "step": 12105 + }, + { + "epoch": 1.8951158422041328, + "grad_norm": 1.9167366027832031, + "learning_rate": 5.6044314108830245e-06, + "loss": 0.5665, + "step": 12106 + }, + { + "epoch": 1.895272385723231, + "grad_norm": 1.4515151977539062, + "learning_rate": 5.5962854349951125e-06, + "loss": 0.6061, + "step": 12107 + }, + { + "epoch": 1.8954289292423294, + "grad_norm": 1.7091184854507446, + "learning_rate": 5.588139459107201e-06, + "loss": 0.5918, + "step": 12108 + }, + { + "epoch": 1.8955854727614276, + "grad_norm": 1.7163666486740112, + "learning_rate": 5.579993483219289e-06, + "loss": 0.5352, + "step": 12109 + }, + { + "epoch": 1.895742016280526, + "grad_norm": 1.613120198249817, + "learning_rate": 5.571847507331378e-06, + "loss": 0.3983, + 
"step": 12110 + }, + { + "epoch": 1.8958985597996243, + "grad_norm": 1.154775857925415, + "learning_rate": 5.563701531443467e-06, + "loss": 0.4066, + "step": 12111 + }, + { + "epoch": 1.8960551033187225, + "grad_norm": 1.42263925075531, + "learning_rate": 5.555555555555556e-06, + "loss": 0.5864, + "step": 12112 + }, + { + "epoch": 1.896211646837821, + "grad_norm": 1.7473114728927612, + "learning_rate": 5.547409579667645e-06, + "loss": 0.4791, + "step": 12113 + }, + { + "epoch": 1.8963681903569192, + "grad_norm": 4.9935526847839355, + "learning_rate": 5.5392636037797335e-06, + "loss": 0.7678, + "step": 12114 + }, + { + "epoch": 1.8965247338760176, + "grad_norm": 1.3416955471038818, + "learning_rate": 5.5311176278918215e-06, + "loss": 0.4203, + "step": 12115 + }, + { + "epoch": 1.8966812773951158, + "grad_norm": 1.4085800647735596, + "learning_rate": 5.52297165200391e-06, + "loss": 0.5648, + "step": 12116 + }, + { + "epoch": 1.896837820914214, + "grad_norm": 1.8643602132797241, + "learning_rate": 5.514825676115999e-06, + "loss": 0.8382, + "step": 12117 + }, + { + "epoch": 1.8969943644333125, + "grad_norm": 1.8720816373825073, + "learning_rate": 5.506679700228087e-06, + "loss": 0.4916, + "step": 12118 + }, + { + "epoch": 1.897150907952411, + "grad_norm": 3.8971235752105713, + "learning_rate": 5.498533724340176e-06, + "loss": 0.9057, + "step": 12119 + }, + { + "epoch": 1.8973074514715091, + "grad_norm": 2.8982672691345215, + "learning_rate": 5.490387748452265e-06, + "loss": 0.7079, + "step": 12120 + }, + { + "epoch": 1.8974639949906074, + "grad_norm": 2.3151278495788574, + "learning_rate": 5.482241772564354e-06, + "loss": 0.5268, + "step": 12121 + }, + { + "epoch": 1.8976205385097056, + "grad_norm": 3.9375417232513428, + "learning_rate": 5.4740957966764424e-06, + "loss": 0.6303, + "step": 12122 + }, + { + "epoch": 1.897777082028804, + "grad_norm": 2.3662750720977783, + "learning_rate": 5.465949820788531e-06, + "loss": 0.6648, + "step": 12123 + }, + { + "epoch": 
1.8979336255479025, + "grad_norm": 4.089177131652832, + "learning_rate": 5.457803844900619e-06, + "loss": 0.7405, + "step": 12124 + }, + { + "epoch": 1.8980901690670007, + "grad_norm": 2.896233081817627, + "learning_rate": 5.449657869012708e-06, + "loss": 1.0828, + "step": 12125 + }, + { + "epoch": 1.898246712586099, + "grad_norm": 4.039260387420654, + "learning_rate": 5.441511893124797e-06, + "loss": 0.7396, + "step": 12126 + }, + { + "epoch": 1.8984032561051971, + "grad_norm": 3.610041379928589, + "learning_rate": 5.433365917236885e-06, + "loss": 0.8061, + "step": 12127 + }, + { + "epoch": 1.8985597996242956, + "grad_norm": 3.475969076156616, + "learning_rate": 5.425219941348974e-06, + "loss": 0.8796, + "step": 12128 + }, + { + "epoch": 1.898716343143394, + "grad_norm": 1.896490454673767, + "learning_rate": 5.4170739654610626e-06, + "loss": 0.6444, + "step": 12129 + }, + { + "epoch": 1.8988728866624922, + "grad_norm": 3.178431272506714, + "learning_rate": 5.4089279895731506e-06, + "loss": 1.2458, + "step": 12130 + }, + { + "epoch": 1.8990294301815904, + "grad_norm": 3.733795642852783, + "learning_rate": 5.40078201368524e-06, + "loss": 1.1101, + "step": 12131 + }, + { + "epoch": 1.8991859737006886, + "grad_norm": 3.162736415863037, + "learning_rate": 5.392636037797329e-06, + "loss": 0.8628, + "step": 12132 + }, + { + "epoch": 1.899342517219787, + "grad_norm": 4.925237655639648, + "learning_rate": 5.384490061909417e-06, + "loss": 1.0745, + "step": 12133 + }, + { + "epoch": 1.8994990607388855, + "grad_norm": 1.9824382066726685, + "learning_rate": 5.376344086021506e-06, + "loss": 0.4875, + "step": 12134 + }, + { + "epoch": 1.8996556042579837, + "grad_norm": 1.5211036205291748, + "learning_rate": 5.368198110133595e-06, + "loss": 0.331, + "step": 12135 + }, + { + "epoch": 1.899812147777082, + "grad_norm": 2.0881617069244385, + "learning_rate": 5.360052134245683e-06, + "loss": 0.4835, + "step": 12136 + }, + { + "epoch": 1.8999686912961802, + "grad_norm": 
5.893341541290283, + "learning_rate": 5.3519061583577715e-06, + "loss": 1.2941, + "step": 12137 + }, + { + "epoch": 1.9001252348152786, + "grad_norm": 2.3720452785491943, + "learning_rate": 5.34376018246986e-06, + "loss": 0.5348, + "step": 12138 + }, + { + "epoch": 1.900281778334377, + "grad_norm": 0.47745466232299805, + "learning_rate": 5.335614206581948e-06, + "loss": 0.3781, + "step": 12139 + }, + { + "epoch": 1.9004383218534753, + "grad_norm": 0.5446431040763855, + "learning_rate": 5.327468230694037e-06, + "loss": 0.4063, + "step": 12140 + }, + { + "epoch": 1.9005948653725735, + "grad_norm": 0.5415022373199463, + "learning_rate": 5.319322254806126e-06, + "loss": 0.3884, + "step": 12141 + }, + { + "epoch": 1.900751408891672, + "grad_norm": 0.8470726013183594, + "learning_rate": 5.311176278918215e-06, + "loss": 0.3743, + "step": 12142 + }, + { + "epoch": 1.9009079524107702, + "grad_norm": 0.47008344531059265, + "learning_rate": 5.303030303030304e-06, + "loss": 0.3214, + "step": 12143 + }, + { + "epoch": 1.9010644959298686, + "grad_norm": 0.9170559644699097, + "learning_rate": 5.294884327142392e-06, + "loss": 0.3768, + "step": 12144 + }, + { + "epoch": 1.9012210394489668, + "grad_norm": 1.2423782348632812, + "learning_rate": 5.2867383512544805e-06, + "loss": 0.3643, + "step": 12145 + }, + { + "epoch": 1.901377582968065, + "grad_norm": 1.2667381763458252, + "learning_rate": 5.278592375366569e-06, + "loss": 0.4292, + "step": 12146 + }, + { + "epoch": 1.9015341264871635, + "grad_norm": 0.9966252446174622, + "learning_rate": 5.270446399478657e-06, + "loss": 0.349, + "step": 12147 + }, + { + "epoch": 1.9016906700062617, + "grad_norm": 0.9408605694770813, + "learning_rate": 5.262300423590746e-06, + "loss": 0.4925, + "step": 12148 + }, + { + "epoch": 1.9018472135253601, + "grad_norm": 1.0370075702667236, + "learning_rate": 5.254154447702835e-06, + "loss": 0.3721, + "step": 12149 + }, + { + "epoch": 1.9020037570444583, + "grad_norm": 0.6432914733886719, + "learning_rate": 
5.246008471814924e-06, + "loss": 0.398, + "step": 12150 + }, + { + "epoch": 1.9021603005635566, + "grad_norm": 0.7563079595565796, + "learning_rate": 5.237862495927013e-06, + "loss": 0.4204, + "step": 12151 + }, + { + "epoch": 1.902316844082655, + "grad_norm": 1.045837640762329, + "learning_rate": 5.2297165200391015e-06, + "loss": 0.3448, + "step": 12152 + }, + { + "epoch": 1.9024733876017534, + "grad_norm": 0.6337746381759644, + "learning_rate": 5.2215705441511894e-06, + "loss": 0.3641, + "step": 12153 + }, + { + "epoch": 1.9026299311208517, + "grad_norm": 12.366353034973145, + "learning_rate": 5.213424568263278e-06, + "loss": 1.0567, + "step": 12154 + }, + { + "epoch": 1.9027864746399499, + "grad_norm": 1.3487145900726318, + "learning_rate": 5.205278592375367e-06, + "loss": 0.4239, + "step": 12155 + }, + { + "epoch": 1.902943018159048, + "grad_norm": 1.5277602672576904, + "learning_rate": 5.197132616487455e-06, + "loss": 0.4517, + "step": 12156 + }, + { + "epoch": 1.9030995616781465, + "grad_norm": 2.087116241455078, + "learning_rate": 5.188986640599544e-06, + "loss": 0.4374, + "step": 12157 + }, + { + "epoch": 1.903256105197245, + "grad_norm": 1.3922455310821533, + "learning_rate": 5.180840664711633e-06, + "loss": 0.4693, + "step": 12158 + }, + { + "epoch": 1.9034126487163432, + "grad_norm": 1.3364778757095337, + "learning_rate": 5.172694688823721e-06, + "loss": 0.579, + "step": 12159 + }, + { + "epoch": 1.9035691922354414, + "grad_norm": 1.820658802986145, + "learning_rate": 5.16454871293581e-06, + "loss": 0.5387, + "step": 12160 + }, + { + "epoch": 1.9037257357545396, + "grad_norm": 3.138976812362671, + "learning_rate": 5.156402737047899e-06, + "loss": 0.5018, + "step": 12161 + }, + { + "epoch": 1.903882279273638, + "grad_norm": 1.7904644012451172, + "learning_rate": 5.148256761159987e-06, + "loss": 0.4481, + "step": 12162 + }, + { + "epoch": 1.9040388227927365, + "grad_norm": 2.1207456588745117, + "learning_rate": 5.140110785272076e-06, + "loss": 0.6604, + 
"step": 12163 + }, + { + "epoch": 1.9041953663118347, + "grad_norm": 2.416285514831543, + "learning_rate": 5.131964809384165e-06, + "loss": 0.7814, + "step": 12164 + }, + { + "epoch": 1.904351909830933, + "grad_norm": 4.391247272491455, + "learning_rate": 5.123818833496253e-06, + "loss": 0.8447, + "step": 12165 + }, + { + "epoch": 1.9045084533500312, + "grad_norm": 3.187286853790283, + "learning_rate": 5.115672857608342e-06, + "loss": 0.5802, + "step": 12166 + }, + { + "epoch": 1.9046649968691296, + "grad_norm": 3.0097200870513916, + "learning_rate": 5.1075268817204305e-06, + "loss": 0.8371, + "step": 12167 + }, + { + "epoch": 1.904821540388228, + "grad_norm": 3.213592290878296, + "learning_rate": 5.0993809058325185e-06, + "loss": 0.8348, + "step": 12168 + }, + { + "epoch": 1.9049780839073263, + "grad_norm": 3.0626487731933594, + "learning_rate": 5.091234929944607e-06, + "loss": 0.8177, + "step": 12169 + }, + { + "epoch": 1.9051346274264245, + "grad_norm": 2.944638252258301, + "learning_rate": 5.083088954056696e-06, + "loss": 0.7884, + "step": 12170 + }, + { + "epoch": 1.9052911709455227, + "grad_norm": 3.0563809871673584, + "learning_rate": 5.074942978168785e-06, + "loss": 0.6647, + "step": 12171 + }, + { + "epoch": 1.9054477144646211, + "grad_norm": 1.7454148530960083, + "learning_rate": 5.066797002280874e-06, + "loss": 0.6343, + "step": 12172 + }, + { + "epoch": 1.9056042579837196, + "grad_norm": 3.4159984588623047, + "learning_rate": 5.058651026392962e-06, + "loss": 0.6691, + "step": 12173 + }, + { + "epoch": 1.9057608015028178, + "grad_norm": 4.3006463050842285, + "learning_rate": 5.050505050505051e-06, + "loss": 0.8727, + "step": 12174 + }, + { + "epoch": 1.905917345021916, + "grad_norm": 5.110257625579834, + "learning_rate": 5.0423590746171395e-06, + "loss": 0.7059, + "step": 12175 + }, + { + "epoch": 1.9060738885410144, + "grad_norm": 5.129769802093506, + "learning_rate": 5.0342130987292275e-06, + "loss": 0.6679, + "step": 12176 + }, + { + "epoch": 
1.9062304320601127, + "grad_norm": 2.064796209335327, + "learning_rate": 5.026067122841316e-06, + "loss": 0.644, + "step": 12177 + }, + { + "epoch": 1.906386975579211, + "grad_norm": 4.305102348327637, + "learning_rate": 5.017921146953405e-06, + "loss": 0.7756, + "step": 12178 + }, + { + "epoch": 1.9065435190983093, + "grad_norm": 4.668224334716797, + "learning_rate": 5.009775171065494e-06, + "loss": 0.7846, + "step": 12179 + }, + { + "epoch": 1.9067000626174075, + "grad_norm": 3.0236222743988037, + "learning_rate": 5.001629195177583e-06, + "loss": 0.9604, + "step": 12180 + }, + { + "epoch": 1.906856606136506, + "grad_norm": 5.0217790603637695, + "learning_rate": 4.993483219289672e-06, + "loss": 0.9204, + "step": 12181 + }, + { + "epoch": 1.9070131496556044, + "grad_norm": 1.9195283651351929, + "learning_rate": 4.98533724340176e-06, + "loss": 0.6231, + "step": 12182 + }, + { + "epoch": 1.9071696931747026, + "grad_norm": 5.867606163024902, + "learning_rate": 4.9771912675138485e-06, + "loss": 0.3518, + "step": 12183 + }, + { + "epoch": 1.9073262366938009, + "grad_norm": 2.870250701904297, + "learning_rate": 4.969045291625937e-06, + "loss": 0.9855, + "step": 12184 + }, + { + "epoch": 1.907482780212899, + "grad_norm": 1.5654443502426147, + "learning_rate": 4.960899315738025e-06, + "loss": 0.466, + "step": 12185 + }, + { + "epoch": 1.9076393237319975, + "grad_norm": 4.2845234870910645, + "learning_rate": 4.952753339850114e-06, + "loss": 0.2218, + "step": 12186 + }, + { + "epoch": 1.907795867251096, + "grad_norm": 2.0230870246887207, + "learning_rate": 4.944607363962203e-06, + "loss": 0.4333, + "step": 12187 + }, + { + "epoch": 1.9079524107701942, + "grad_norm": 3.797625780105591, + "learning_rate": 4.936461388074291e-06, + "loss": 1.3573, + "step": 12188 + }, + { + "epoch": 1.9081089542892924, + "grad_norm": 0.5633471608161926, + "learning_rate": 4.928315412186381e-06, + "loss": 0.3947, + "step": 12189 + }, + { + "epoch": 1.9082654978083906, + "grad_norm": 
0.8402072787284851, + "learning_rate": 4.9201694362984694e-06, + "loss": 0.462, + "step": 12190 + }, + { + "epoch": 1.908422041327489, + "grad_norm": 0.6543194651603699, + "learning_rate": 4.912023460410557e-06, + "loss": 0.3529, + "step": 12191 + }, + { + "epoch": 1.9085785848465875, + "grad_norm": 1.4853767156600952, + "learning_rate": 4.903877484522646e-06, + "loss": 0.445, + "step": 12192 + }, + { + "epoch": 1.9087351283656857, + "grad_norm": 1.5066040754318237, + "learning_rate": 4.895731508634735e-06, + "loss": 0.4998, + "step": 12193 + }, + { + "epoch": 1.908891671884784, + "grad_norm": 0.7844545841217041, + "learning_rate": 4.887585532746823e-06, + "loss": 0.5016, + "step": 12194 + }, + { + "epoch": 1.9090482154038821, + "grad_norm": 1.9727526903152466, + "learning_rate": 4.879439556858912e-06, + "loss": 0.4844, + "step": 12195 + }, + { + "epoch": 1.9092047589229806, + "grad_norm": 0.9637848734855652, + "learning_rate": 4.871293580971001e-06, + "loss": 0.3927, + "step": 12196 + }, + { + "epoch": 1.909361302442079, + "grad_norm": 0.8825727105140686, + "learning_rate": 4.863147605083089e-06, + "loss": 0.4172, + "step": 12197 + }, + { + "epoch": 1.9095178459611772, + "grad_norm": 1.2922227382659912, + "learning_rate": 4.8550016291951775e-06, + "loss": 0.4031, + "step": 12198 + }, + { + "epoch": 1.9096743894802755, + "grad_norm": 0.7790793180465698, + "learning_rate": 4.846855653307266e-06, + "loss": 0.4107, + "step": 12199 + }, + { + "epoch": 1.9098309329993737, + "grad_norm": 0.9185672998428345, + "learning_rate": 4.838709677419355e-06, + "loss": 0.4083, + "step": 12200 + }, + { + "epoch": 1.9099874765184721, + "grad_norm": 1.2111402750015259, + "learning_rate": 4.830563701531444e-06, + "loss": 0.3885, + "step": 12201 + }, + { + "epoch": 1.9101440200375706, + "grad_norm": 1.2835332155227661, + "learning_rate": 4.822417725643533e-06, + "loss": 0.4843, + "step": 12202 + }, + { + "epoch": 1.9103005635566688, + "grad_norm": 1.921718955039978, + "learning_rate": 
4.814271749755621e-06, + "loss": 0.4476, + "step": 12203 + }, + { + "epoch": 1.910457107075767, + "grad_norm": 1.9995290040969849, + "learning_rate": 4.80612577386771e-06, + "loss": 0.4664, + "step": 12204 + }, + { + "epoch": 1.9106136505948652, + "grad_norm": 1.2947126626968384, + "learning_rate": 4.797979797979798e-06, + "loss": 0.4854, + "step": 12205 + }, + { + "epoch": 1.9107701941139636, + "grad_norm": 0.9404786825180054, + "learning_rate": 4.7898338220918865e-06, + "loss": 0.4612, + "step": 12206 + }, + { + "epoch": 1.910926737633062, + "grad_norm": 1.636773943901062, + "learning_rate": 4.781687846203975e-06, + "loss": 0.4891, + "step": 12207 + }, + { + "epoch": 1.9110832811521603, + "grad_norm": 1.5782272815704346, + "learning_rate": 4.773541870316064e-06, + "loss": 0.6078, + "step": 12208 + }, + { + "epoch": 1.9112398246712585, + "grad_norm": 1.5976572036743164, + "learning_rate": 4.765395894428153e-06, + "loss": 0.5009, + "step": 12209 + }, + { + "epoch": 1.911396368190357, + "grad_norm": 2.419039487838745, + "learning_rate": 4.757249918540242e-06, + "loss": 0.5428, + "step": 12210 + }, + { + "epoch": 1.9115529117094552, + "grad_norm": 1.0265878438949585, + "learning_rate": 4.74910394265233e-06, + "loss": 0.4198, + "step": 12211 + }, + { + "epoch": 1.9117094552285536, + "grad_norm": 1.9617303609848022, + "learning_rate": 4.740957966764419e-06, + "loss": 0.5374, + "step": 12212 + }, + { + "epoch": 1.9118659987476518, + "grad_norm": 2.8070554733276367, + "learning_rate": 4.7328119908765075e-06, + "loss": 0.7035, + "step": 12213 + }, + { + "epoch": 1.91202254226675, + "grad_norm": 2.103036642074585, + "learning_rate": 4.7246660149885955e-06, + "loss": 0.6372, + "step": 12214 + }, + { + "epoch": 1.9121790857858485, + "grad_norm": 1.7591677904129028, + "learning_rate": 4.716520039100684e-06, + "loss": 0.4961, + "step": 12215 + }, + { + "epoch": 1.912335629304947, + "grad_norm": 1.753316044807434, + "learning_rate": 4.708374063212773e-06, + "loss": 0.5335, + 
"step": 12216 + }, + { + "epoch": 1.9124921728240452, + "grad_norm": 2.5238351821899414, + "learning_rate": 4.700228087324861e-06, + "loss": 0.9168, + "step": 12217 + }, + { + "epoch": 1.9126487163431434, + "grad_norm": 3.997856855392456, + "learning_rate": 4.692082111436951e-06, + "loss": 0.9899, + "step": 12218 + }, + { + "epoch": 1.9128052598622416, + "grad_norm": 3.372671365737915, + "learning_rate": 4.68393613554904e-06, + "loss": 0.6294, + "step": 12219 + }, + { + "epoch": 1.91296180338134, + "grad_norm": 2.543013572692871, + "learning_rate": 4.675790159661128e-06, + "loss": 0.6197, + "step": 12220 + }, + { + "epoch": 1.9131183469004385, + "grad_norm": 3.2301077842712402, + "learning_rate": 4.6676441837732164e-06, + "loss": 0.8553, + "step": 12221 + }, + { + "epoch": 1.9132748904195367, + "grad_norm": 1.4770371913909912, + "learning_rate": 4.659498207885305e-06, + "loss": 0.6037, + "step": 12222 + }, + { + "epoch": 1.913431433938635, + "grad_norm": 2.5242908000946045, + "learning_rate": 4.651352231997393e-06, + "loss": 0.9243, + "step": 12223 + }, + { + "epoch": 1.9135879774577331, + "grad_norm": 2.862241506576538, + "learning_rate": 4.643206256109482e-06, + "loss": 0.5744, + "step": 12224 + }, + { + "epoch": 1.9137445209768316, + "grad_norm": 2.068598508834839, + "learning_rate": 4.635060280221571e-06, + "loss": 0.5734, + "step": 12225 + }, + { + "epoch": 1.91390106449593, + "grad_norm": 3.0680291652679443, + "learning_rate": 4.626914304333659e-06, + "loss": 0.9402, + "step": 12226 + }, + { + "epoch": 1.9140576080150282, + "grad_norm": 3.229787826538086, + "learning_rate": 4.618768328445748e-06, + "loss": 1.1124, + "step": 12227 + }, + { + "epoch": 1.9142141515341264, + "grad_norm": 3.7756009101867676, + "learning_rate": 4.6106223525578366e-06, + "loss": 1.1396, + "step": 12228 + }, + { + "epoch": 1.9143706950532247, + "grad_norm": 3.7954931259155273, + "learning_rate": 4.602476376669925e-06, + "loss": 1.0493, + "step": 12229 + }, + { + "epoch": 
1.914527238572323, + "grad_norm": 3.183464527130127, + "learning_rate": 4.594330400782014e-06, + "loss": 1.1832, + "step": 12230 + }, + { + "epoch": 1.9146837820914215, + "grad_norm": 2.024658203125, + "learning_rate": 4.586184424894103e-06, + "loss": 0.7312, + "step": 12231 + }, + { + "epoch": 1.9148403256105198, + "grad_norm": 5.799678802490234, + "learning_rate": 4.578038449006191e-06, + "loss": 0.9536, + "step": 12232 + }, + { + "epoch": 1.914996869129618, + "grad_norm": 2.353919506072998, + "learning_rate": 4.56989247311828e-06, + "loss": 0.7779, + "step": 12233 + }, + { + "epoch": 1.9151534126487162, + "grad_norm": 2.1309850215911865, + "learning_rate": 4.561746497230369e-06, + "loss": 0.1664, + "step": 12234 + }, + { + "epoch": 1.9153099561678146, + "grad_norm": 6.6623969078063965, + "learning_rate": 4.553600521342457e-06, + "loss": 0.4313, + "step": 12235 + }, + { + "epoch": 1.915466499686913, + "grad_norm": 2.032729387283325, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.2644, + "step": 12236 + }, + { + "epoch": 1.9156230432060113, + "grad_norm": 4.312324047088623, + "learning_rate": 4.537308569566634e-06, + "loss": 0.8969, + "step": 12237 + }, + { + "epoch": 1.9157795867251095, + "grad_norm": 3.237133026123047, + "learning_rate": 4.529162593678723e-06, + "loss": 1.0131, + "step": 12238 + }, + { + "epoch": 1.9159361302442077, + "grad_norm": 0.5485087633132935, + "learning_rate": 4.521016617790812e-06, + "loss": 0.4278, + "step": 12239 + }, + { + "epoch": 1.9160926737633062, + "grad_norm": 0.6898446679115295, + "learning_rate": 4.5128706419029e-06, + "loss": 0.4222, + "step": 12240 + }, + { + "epoch": 1.9162492172824046, + "grad_norm": 1.0803757905960083, + "learning_rate": 4.504724666014989e-06, + "loss": 0.4502, + "step": 12241 + }, + { + "epoch": 1.9164057608015028, + "grad_norm": 0.6849081516265869, + "learning_rate": 4.496578690127078e-06, + "loss": 0.4078, + "step": 12242 + }, + { + "epoch": 1.916562304320601, + "grad_norm": 
2.0615830421447754, + "learning_rate": 4.488432714239166e-06, + "loss": 0.5392, + "step": 12243 + }, + { + "epoch": 1.9167188478396995, + "grad_norm": 0.7548286318778992, + "learning_rate": 4.4802867383512545e-06, + "loss": 0.4804, + "step": 12244 + }, + { + "epoch": 1.9168753913587977, + "grad_norm": 0.931703507900238, + "learning_rate": 4.472140762463343e-06, + "loss": 0.4292, + "step": 12245 + }, + { + "epoch": 1.9170319348778961, + "grad_norm": 1.4103442430496216, + "learning_rate": 4.463994786575431e-06, + "loss": 0.6253, + "step": 12246 + }, + { + "epoch": 1.9171884783969944, + "grad_norm": 0.7882108688354492, + "learning_rate": 4.455848810687521e-06, + "loss": 0.449, + "step": 12247 + }, + { + "epoch": 1.9173450219160926, + "grad_norm": 1.2497607469558716, + "learning_rate": 4.44770283479961e-06, + "loss": 0.5515, + "step": 12248 + }, + { + "epoch": 1.917501565435191, + "grad_norm": 3.555234909057617, + "learning_rate": 4.439556858911698e-06, + "loss": 0.5165, + "step": 12249 + }, + { + "epoch": 1.9176581089542895, + "grad_norm": 1.3324576616287231, + "learning_rate": 4.431410883023787e-06, + "loss": 0.5429, + "step": 12250 + }, + { + "epoch": 1.9178146524733877, + "grad_norm": 1.3891175985336304, + "learning_rate": 4.4232649071358754e-06, + "loss": 0.4455, + "step": 12251 + }, + { + "epoch": 1.9179711959924859, + "grad_norm": 1.1567541360855103, + "learning_rate": 4.4151189312479634e-06, + "loss": 0.5254, + "step": 12252 + }, + { + "epoch": 1.918127739511584, + "grad_norm": 1.1293220520019531, + "learning_rate": 4.406972955360052e-06, + "loss": 0.541, + "step": 12253 + }, + { + "epoch": 1.9182842830306825, + "grad_norm": 2.559788942337036, + "learning_rate": 4.398826979472141e-06, + "loss": 0.4821, + "step": 12254 + }, + { + "epoch": 1.918440826549781, + "grad_norm": 1.2801285982131958, + "learning_rate": 4.390681003584229e-06, + "loss": 0.636, + "step": 12255 + }, + { + "epoch": 1.9185973700688792, + "grad_norm": 1.9216235876083374, + "learning_rate": 
4.382535027696318e-06, + "loss": 0.5553, + "step": 12256 + }, + { + "epoch": 1.9187539135879774, + "grad_norm": 3.950186014175415, + "learning_rate": 4.374389051808407e-06, + "loss": 0.6175, + "step": 12257 + }, + { + "epoch": 1.9189104571070756, + "grad_norm": 1.226547360420227, + "learning_rate": 4.3662430759204956e-06, + "loss": 0.5013, + "step": 12258 + }, + { + "epoch": 1.919067000626174, + "grad_norm": 4.9684953689575195, + "learning_rate": 4.358097100032584e-06, + "loss": 0.5581, + "step": 12259 + }, + { + "epoch": 1.9192235441452725, + "grad_norm": 1.9547481536865234, + "learning_rate": 4.349951124144673e-06, + "loss": 0.6081, + "step": 12260 + }, + { + "epoch": 1.9193800876643707, + "grad_norm": 2.5153794288635254, + "learning_rate": 4.341805148256761e-06, + "loss": 0.6061, + "step": 12261 + }, + { + "epoch": 1.919536631183469, + "grad_norm": 8.693881034851074, + "learning_rate": 4.33365917236885e-06, + "loss": 1.4576, + "step": 12262 + }, + { + "epoch": 1.9196931747025672, + "grad_norm": 5.9853949546813965, + "learning_rate": 4.325513196480939e-06, + "loss": 0.519, + "step": 12263 + }, + { + "epoch": 1.9198497182216656, + "grad_norm": 1.3229608535766602, + "learning_rate": 4.317367220593027e-06, + "loss": 0.5119, + "step": 12264 + }, + { + "epoch": 1.920006261740764, + "grad_norm": 1.7647383213043213, + "learning_rate": 4.309221244705116e-06, + "loss": 0.5075, + "step": 12265 + }, + { + "epoch": 1.9201628052598623, + "grad_norm": 2.200968027114868, + "learning_rate": 4.3010752688172045e-06, + "loss": 0.6982, + "step": 12266 + }, + { + "epoch": 1.9203193487789605, + "grad_norm": 3.3955066204071045, + "learning_rate": 4.292929292929293e-06, + "loss": 0.7177, + "step": 12267 + }, + { + "epoch": 1.9204758922980587, + "grad_norm": 2.9563262462615967, + "learning_rate": 4.284783317041382e-06, + "loss": 0.5249, + "step": 12268 + }, + { + "epoch": 1.9206324358171571, + "grad_norm": 2.7296011447906494, + "learning_rate": 4.27663734115347e-06, + "loss": 0.516, + 
"step": 12269 + }, + { + "epoch": 1.9207889793362556, + "grad_norm": 4.111399173736572, + "learning_rate": 4.268491365265559e-06, + "loss": 0.923, + "step": 12270 + }, + { + "epoch": 1.9209455228553538, + "grad_norm": 4.194766521453857, + "learning_rate": 4.260345389377648e-06, + "loss": 0.5377, + "step": 12271 + }, + { + "epoch": 1.921102066374452, + "grad_norm": 2.709955930709839, + "learning_rate": 4.252199413489736e-06, + "loss": 0.5446, + "step": 12272 + }, + { + "epoch": 1.9212586098935505, + "grad_norm": 1.7555557489395142, + "learning_rate": 4.244053437601825e-06, + "loss": 0.5536, + "step": 12273 + }, + { + "epoch": 1.9214151534126487, + "grad_norm": 2.63270902633667, + "learning_rate": 4.2359074617139135e-06, + "loss": 0.5698, + "step": 12274 + }, + { + "epoch": 1.9215716969317471, + "grad_norm": 3.293694496154785, + "learning_rate": 4.2277614858260015e-06, + "loss": 1.4645, + "step": 12275 + }, + { + "epoch": 1.9217282404508453, + "grad_norm": 4.148124694824219, + "learning_rate": 4.219615509938091e-06, + "loss": 1.307, + "step": 12276 + }, + { + "epoch": 1.9218847839699436, + "grad_norm": 4.237106800079346, + "learning_rate": 4.21146953405018e-06, + "loss": 0.8331, + "step": 12277 + }, + { + "epoch": 1.922041327489042, + "grad_norm": 3.0846943855285645, + "learning_rate": 4.203323558162268e-06, + "loss": 1.0966, + "step": 12278 + }, + { + "epoch": 1.9221978710081402, + "grad_norm": 4.519121170043945, + "learning_rate": 4.195177582274357e-06, + "loss": 1.3145, + "step": 12279 + }, + { + "epoch": 1.9223544145272387, + "grad_norm": 5.143624782562256, + "learning_rate": 4.187031606386446e-06, + "loss": 1.6395, + "step": 12280 + }, + { + "epoch": 1.9225109580463369, + "grad_norm": 6.827467441558838, + "learning_rate": 4.178885630498534e-06, + "loss": 0.9814, + "step": 12281 + }, + { + "epoch": 1.922667501565435, + "grad_norm": 4.37494421005249, + "learning_rate": 4.1707396546106225e-06, + "loss": 0.8208, + "step": 12282 + }, + { + "epoch": 
1.9228240450845335, + "grad_norm": 4.758868217468262, + "learning_rate": 4.162593678722711e-06, + "loss": 0.6683, + "step": 12283 + }, + { + "epoch": 1.922980588603632, + "grad_norm": 1.6269840002059937, + "learning_rate": 4.154447702834799e-06, + "loss": 0.4096, + "step": 12284 + }, + { + "epoch": 1.9231371321227302, + "grad_norm": 2.7258100509643555, + "learning_rate": 4.146301726946888e-06, + "loss": 0.4772, + "step": 12285 + }, + { + "epoch": 1.9232936756418284, + "grad_norm": 0.9410831928253174, + "learning_rate": 4.138155751058977e-06, + "loss": 0.2929, + "step": 12286 + }, + { + "epoch": 1.9234502191609266, + "grad_norm": 2.578202724456787, + "learning_rate": 4.130009775171066e-06, + "loss": 0.2682, + "step": 12287 + }, + { + "epoch": 1.923606762680025, + "grad_norm": 2.5475270748138428, + "learning_rate": 4.121863799283155e-06, + "loss": 0.4827, + "step": 12288 + }, + { + "epoch": 1.9237633061991235, + "grad_norm": 0.7865843176841736, + "learning_rate": 4.1137178233952434e-06, + "loss": 0.4582, + "step": 12289 + }, + { + "epoch": 1.9239198497182217, + "grad_norm": 0.6502735614776611, + "learning_rate": 4.105571847507331e-06, + "loss": 0.4213, + "step": 12290 + }, + { + "epoch": 1.92407639323732, + "grad_norm": 0.5582402348518372, + "learning_rate": 4.09742587161942e-06, + "loss": 0.4766, + "step": 12291 + }, + { + "epoch": 1.9242329367564182, + "grad_norm": 0.49286600947380066, + "learning_rate": 4.089279895731509e-06, + "loss": 0.4667, + "step": 12292 + }, + { + "epoch": 1.9243894802755166, + "grad_norm": 0.6890012621879578, + "learning_rate": 4.081133919843597e-06, + "loss": 0.4679, + "step": 12293 + }, + { + "epoch": 1.924546023794615, + "grad_norm": 0.8104533553123474, + "learning_rate": 4.072987943955686e-06, + "loss": 0.4504, + "step": 12294 + }, + { + "epoch": 1.9247025673137133, + "grad_norm": 0.635417640209198, + "learning_rate": 4.064841968067775e-06, + "loss": 0.415, + "step": 12295 + }, + { + "epoch": 1.9248591108328115, + "grad_norm": 
1.0436673164367676, + "learning_rate": 4.0566959921798636e-06, + "loss": 0.4531, + "step": 12296 + }, + { + "epoch": 1.9250156543519097, + "grad_norm": 0.9951387643814087, + "learning_rate": 4.048550016291952e-06, + "loss": 0.4173, + "step": 12297 + }, + { + "epoch": 1.9251721978710081, + "grad_norm": 0.763129472732544, + "learning_rate": 4.040404040404041e-06, + "loss": 0.4326, + "step": 12298 + }, + { + "epoch": 1.9253287413901066, + "grad_norm": 1.005190134048462, + "learning_rate": 4.032258064516129e-06, + "loss": 0.4754, + "step": 12299 + }, + { + "epoch": 1.9254852849092048, + "grad_norm": 1.699729323387146, + "learning_rate": 4.024112088628218e-06, + "loss": 0.6165, + "step": 12300 + }, + { + "epoch": 1.925641828428303, + "grad_norm": 1.398398518562317, + "learning_rate": 4.015966112740306e-06, + "loss": 0.4969, + "step": 12301 + }, + { + "epoch": 1.9257983719474012, + "grad_norm": 1.8227745294570923, + "learning_rate": 4.007820136852395e-06, + "loss": 0.7169, + "step": 12302 + }, + { + "epoch": 1.9259549154664997, + "grad_norm": 1.4116772413253784, + "learning_rate": 3.999674160964484e-06, + "loss": 0.4784, + "step": 12303 + }, + { + "epoch": 1.926111458985598, + "grad_norm": 1.2180135250091553, + "learning_rate": 3.991528185076572e-06, + "loss": 0.5157, + "step": 12304 + }, + { + "epoch": 1.9262680025046963, + "grad_norm": 1.0050441026687622, + "learning_rate": 3.983382209188661e-06, + "loss": 0.3727, + "step": 12305 + }, + { + "epoch": 1.9264245460237945, + "grad_norm": 1.2989635467529297, + "learning_rate": 3.97523623330075e-06, + "loss": 0.4661, + "step": 12306 + }, + { + "epoch": 1.926581089542893, + "grad_norm": 1.9092564582824707, + "learning_rate": 3.967090257412838e-06, + "loss": 0.7188, + "step": 12307 + }, + { + "epoch": 1.9267376330619912, + "grad_norm": 2.5080084800720215, + "learning_rate": 3.958944281524927e-06, + "loss": 0.5712, + "step": 12308 + }, + { + "epoch": 1.9268941765810896, + "grad_norm": 1.2770705223083496, + "learning_rate": 
3.950798305637016e-06, + "loss": 0.5929, + "step": 12309 + }, + { + "epoch": 1.9270507201001879, + "grad_norm": 1.8223532438278198, + "learning_rate": 3.942652329749104e-06, + "loss": 0.6549, + "step": 12310 + }, + { + "epoch": 1.927207263619286, + "grad_norm": 3.352853775024414, + "learning_rate": 3.934506353861193e-06, + "loss": 0.7281, + "step": 12311 + }, + { + "epoch": 1.9273638071383845, + "grad_norm": 2.9821829795837402, + "learning_rate": 3.9263603779732815e-06, + "loss": 0.7428, + "step": 12312 + }, + { + "epoch": 1.9275203506574827, + "grad_norm": 1.960609793663025, + "learning_rate": 3.9182144020853695e-06, + "loss": 0.5792, + "step": 12313 + }, + { + "epoch": 1.9276768941765812, + "grad_norm": 2.51800537109375, + "learning_rate": 3.910068426197458e-06, + "loss": 0.6948, + "step": 12314 + }, + { + "epoch": 1.9278334376956794, + "grad_norm": 2.207981824874878, + "learning_rate": 3.901922450309547e-06, + "loss": 0.8966, + "step": 12315 + }, + { + "epoch": 1.9279899812147776, + "grad_norm": 2.392934560775757, + "learning_rate": 3.893776474421636e-06, + "loss": 0.6497, + "step": 12316 + }, + { + "epoch": 1.928146524733876, + "grad_norm": 1.797955870628357, + "learning_rate": 3.885630498533725e-06, + "loss": 0.6908, + "step": 12317 + }, + { + "epoch": 1.9283030682529745, + "grad_norm": 3.298415422439575, + "learning_rate": 3.877484522645814e-06, + "loss": 0.7658, + "step": 12318 + }, + { + "epoch": 1.9284596117720727, + "grad_norm": 3.3987529277801514, + "learning_rate": 3.869338546757902e-06, + "loss": 0.813, + "step": 12319 + }, + { + "epoch": 1.928616155291171, + "grad_norm": 4.554177761077881, + "learning_rate": 3.8611925708699904e-06, + "loss": 0.966, + "step": 12320 + }, + { + "epoch": 1.9287726988102691, + "grad_norm": 2.609259605407715, + "learning_rate": 3.853046594982079e-06, + "loss": 0.7133, + "step": 12321 + }, + { + "epoch": 1.9289292423293676, + "grad_norm": 2.9040400981903076, + "learning_rate": 3.844900619094167e-06, + "loss": 0.8297, + 
"step": 12322 + }, + { + "epoch": 1.929085785848466, + "grad_norm": 1.5103906393051147, + "learning_rate": 3.836754643206256e-06, + "loss": 0.6802, + "step": 12323 + }, + { + "epoch": 1.9292423293675642, + "grad_norm": 2.03440523147583, + "learning_rate": 3.828608667318345e-06, + "loss": 1.0669, + "step": 12324 + }, + { + "epoch": 1.9293988728866625, + "grad_norm": 2.663066864013672, + "learning_rate": 3.820462691430434e-06, + "loss": 1.2842, + "step": 12325 + }, + { + "epoch": 1.9295554164057607, + "grad_norm": 2.8208634853363037, + "learning_rate": 3.812316715542522e-06, + "loss": 1.1513, + "step": 12326 + }, + { + "epoch": 1.929711959924859, + "grad_norm": 3.5021708011627197, + "learning_rate": 3.804170739654611e-06, + "loss": 0.8698, + "step": 12327 + }, + { + "epoch": 1.9298685034439576, + "grad_norm": 4.520351886749268, + "learning_rate": 3.7960247637666994e-06, + "loss": 1.1543, + "step": 12328 + }, + { + "epoch": 1.9300250469630558, + "grad_norm": 4.526183128356934, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.881, + "step": 12329 + }, + { + "epoch": 1.930181590482154, + "grad_norm": 2.522676706314087, + "learning_rate": 3.779732811990877e-06, + "loss": 1.1748, + "step": 12330 + }, + { + "epoch": 1.9303381340012522, + "grad_norm": 1.5897367000579834, + "learning_rate": 3.771586836102965e-06, + "loss": 0.4718, + "step": 12331 + }, + { + "epoch": 1.9304946775203506, + "grad_norm": 4.659087657928467, + "learning_rate": 3.763440860215054e-06, + "loss": 1.0346, + "step": 12332 + }, + { + "epoch": 1.930651221039449, + "grad_norm": 3.8022749423980713, + "learning_rate": 3.7552948843271423e-06, + "loss": 1.4969, + "step": 12333 + }, + { + "epoch": 1.9308077645585473, + "grad_norm": 5.910645008087158, + "learning_rate": 3.747148908439231e-06, + "loss": 0.5906, + "step": 12334 + }, + { + "epoch": 1.9309643080776455, + "grad_norm": 1.386872410774231, + "learning_rate": 3.73900293255132e-06, + "loss": 0.8477, + "step": 12335 + }, + { + "epoch": 
1.9311208515967437, + "grad_norm": 1.4564473628997803, + "learning_rate": 3.7308569566634083e-06, + "loss": 0.6392, + "step": 12336 + }, + { + "epoch": 1.9312773951158422, + "grad_norm": 2.188255548477173, + "learning_rate": 3.722710980775497e-06, + "loss": 0.3747, + "step": 12337 + }, + { + "epoch": 1.9314339386349406, + "grad_norm": 3.2779624462127686, + "learning_rate": 3.714565004887586e-06, + "loss": 1.0401, + "step": 12338 + }, + { + "epoch": 1.9315904821540388, + "grad_norm": 5.670164585113525, + "learning_rate": 3.706419028999674e-06, + "loss": 0.8095, + "step": 12339 + }, + { + "epoch": 1.931747025673137, + "grad_norm": 0.5211887359619141, + "learning_rate": 3.698273053111763e-06, + "loss": 0.4698, + "step": 12340 + }, + { + "epoch": 1.9319035691922355, + "grad_norm": 1.2946819067001343, + "learning_rate": 3.6901270772238517e-06, + "loss": 0.469, + "step": 12341 + }, + { + "epoch": 1.9320601127113337, + "grad_norm": 0.6890685558319092, + "learning_rate": 3.68198110133594e-06, + "loss": 0.4381, + "step": 12342 + }, + { + "epoch": 1.9322166562304322, + "grad_norm": 0.8155226707458496, + "learning_rate": 3.673835125448029e-06, + "loss": 0.4793, + "step": 12343 + }, + { + "epoch": 1.9323731997495304, + "grad_norm": 0.5776665210723877, + "learning_rate": 3.6656891495601177e-06, + "loss": 0.4282, + "step": 12344 + }, + { + "epoch": 1.9325297432686286, + "grad_norm": 1.7510740756988525, + "learning_rate": 3.6575431736722057e-06, + "loss": 0.4315, + "step": 12345 + }, + { + "epoch": 1.932686286787727, + "grad_norm": 0.8292139768600464, + "learning_rate": 3.6493971977842945e-06, + "loss": 0.476, + "step": 12346 + }, + { + "epoch": 1.9328428303068252, + "grad_norm": 0.8631847500801086, + "learning_rate": 3.641251221896384e-06, + "loss": 0.509, + "step": 12347 + }, + { + "epoch": 1.9329993738259237, + "grad_norm": 0.8454664349555969, + "learning_rate": 3.6331052460084718e-06, + "loss": 0.5025, + "step": 12348 + }, + { + "epoch": 1.933155917345022, + "grad_norm": 
1.474377989768982, + "learning_rate": 3.6249592701205606e-06, + "loss": 0.5362, + "step": 12349 + }, + { + "epoch": 1.9333124608641201, + "grad_norm": 1.137299656867981, + "learning_rate": 3.6168132942326494e-06, + "loss": 0.8018, + "step": 12350 + }, + { + "epoch": 1.9334690043832186, + "grad_norm": 1.2379460334777832, + "learning_rate": 3.608667318344738e-06, + "loss": 0.5229, + "step": 12351 + }, + { + "epoch": 1.933625547902317, + "grad_norm": 1.6893112659454346, + "learning_rate": 3.6005213424568267e-06, + "loss": 0.8152, + "step": 12352 + }, + { + "epoch": 1.9337820914214152, + "grad_norm": 1.1529065370559692, + "learning_rate": 3.5923753665689155e-06, + "loss": 0.4855, + "step": 12353 + }, + { + "epoch": 1.9339386349405134, + "grad_norm": 0.9757954478263855, + "learning_rate": 3.5842293906810035e-06, + "loss": 0.5353, + "step": 12354 + }, + { + "epoch": 1.9340951784596117, + "grad_norm": 3.7397408485412598, + "learning_rate": 3.5760834147930923e-06, + "loss": 0.9009, + "step": 12355 + }, + { + "epoch": 1.93425172197871, + "grad_norm": 1.438539981842041, + "learning_rate": 3.567937438905181e-06, + "loss": 0.73, + "step": 12356 + }, + { + "epoch": 1.9344082654978085, + "grad_norm": 1.5457494258880615, + "learning_rate": 3.5597914630172696e-06, + "loss": 0.6449, + "step": 12357 + }, + { + "epoch": 1.9345648090169068, + "grad_norm": 1.4765822887420654, + "learning_rate": 3.5516454871293584e-06, + "loss": 0.5462, + "step": 12358 + }, + { + "epoch": 1.934721352536005, + "grad_norm": 0.9319648146629333, + "learning_rate": 3.5434995112414472e-06, + "loss": 0.3627, + "step": 12359 + }, + { + "epoch": 1.9348778960551032, + "grad_norm": 1.9773396253585815, + "learning_rate": 3.5353535353535352e-06, + "loss": 0.6468, + "step": 12360 + }, + { + "epoch": 1.9350344395742016, + "grad_norm": 2.1247777938842773, + "learning_rate": 3.527207559465624e-06, + "loss": 0.6442, + "step": 12361 + }, + { + "epoch": 1.9351909830933, + "grad_norm": 1.4147021770477295, + "learning_rate": 
3.5190615835777133e-06, + "loss": 0.5081, + "step": 12362 + }, + { + "epoch": 1.9353475266123983, + "grad_norm": 3.162578582763672, + "learning_rate": 3.5109156076898013e-06, + "loss": 0.7972, + "step": 12363 + }, + { + "epoch": 1.9355040701314965, + "grad_norm": 3.5179080963134766, + "learning_rate": 3.50276963180189e-06, + "loss": 0.7712, + "step": 12364 + }, + { + "epoch": 1.9356606136505947, + "grad_norm": 2.2432847023010254, + "learning_rate": 3.4946236559139785e-06, + "loss": 0.8779, + "step": 12365 + }, + { + "epoch": 1.9358171571696932, + "grad_norm": 4.697100639343262, + "learning_rate": 3.4864776800260674e-06, + "loss": 1.1074, + "step": 12366 + }, + { + "epoch": 1.9359737006887916, + "grad_norm": 1.3010350465774536, + "learning_rate": 3.478331704138156e-06, + "loss": 0.5955, + "step": 12367 + }, + { + "epoch": 1.9361302442078898, + "grad_norm": 1.99745774269104, + "learning_rate": 3.470185728250244e-06, + "loss": 0.8274, + "step": 12368 + }, + { + "epoch": 1.936286787726988, + "grad_norm": 1.7645330429077148, + "learning_rate": 3.462039752362333e-06, + "loss": 0.7012, + "step": 12369 + }, + { + "epoch": 1.9364433312460863, + "grad_norm": 2.224867105484009, + "learning_rate": 3.453893776474422e-06, + "loss": 0.8946, + "step": 12370 + }, + { + "epoch": 1.9365998747651847, + "grad_norm": 2.5902857780456543, + "learning_rate": 3.4457478005865102e-06, + "loss": 0.501, + "step": 12371 + }, + { + "epoch": 1.9367564182842831, + "grad_norm": 2.897432565689087, + "learning_rate": 3.437601824698599e-06, + "loss": 0.5169, + "step": 12372 + }, + { + "epoch": 1.9369129618033814, + "grad_norm": 2.764101266860962, + "learning_rate": 3.429455848810688e-06, + "loss": 0.9126, + "step": 12373 + }, + { + "epoch": 1.9370695053224796, + "grad_norm": 3.0184249877929688, + "learning_rate": 3.421309872922776e-06, + "loss": 1.137, + "step": 12374 + }, + { + "epoch": 1.937226048841578, + "grad_norm": 3.3190903663635254, + "learning_rate": 3.4131638970348647e-06, + "loss": 0.9312, + 
"step": 12375 + }, + { + "epoch": 1.9373825923606762, + "grad_norm": 3.7195041179656982, + "learning_rate": 3.405017921146954e-06, + "loss": 1.2583, + "step": 12376 + }, + { + "epoch": 1.9375391358797747, + "grad_norm": 4.191839218139648, + "learning_rate": 3.396871945259042e-06, + "loss": 1.121, + "step": 12377 + }, + { + "epoch": 1.9376956793988729, + "grad_norm": 4.055298805236816, + "learning_rate": 3.388725969371131e-06, + "loss": 0.7471, + "step": 12378 + }, + { + "epoch": 1.937852222917971, + "grad_norm": 2.5017004013061523, + "learning_rate": 3.3805799934832196e-06, + "loss": 1.1873, + "step": 12379 + }, + { + "epoch": 1.9380087664370695, + "grad_norm": 2.278543710708618, + "learning_rate": 3.372434017595308e-06, + "loss": 0.5338, + "step": 12380 + }, + { + "epoch": 1.938165309956168, + "grad_norm": 4.969339847564697, + "learning_rate": 3.364288041707397e-06, + "loss": 0.5705, + "step": 12381 + }, + { + "epoch": 1.9383218534752662, + "grad_norm": 3.9476213455200195, + "learning_rate": 3.3561420658194857e-06, + "loss": 1.0075, + "step": 12382 + }, + { + "epoch": 1.9384783969943644, + "grad_norm": 2.871112108230591, + "learning_rate": 3.3479960899315737e-06, + "loss": 0.9583, + "step": 12383 + }, + { + "epoch": 1.9386349405134626, + "grad_norm": 7.562802791595459, + "learning_rate": 3.3398501140436625e-06, + "loss": 0.9112, + "step": 12384 + }, + { + "epoch": 1.938791484032561, + "grad_norm": 1.9348034858703613, + "learning_rate": 3.3317041381557513e-06, + "loss": 0.5697, + "step": 12385 + }, + { + "epoch": 1.9389480275516595, + "grad_norm": 1.9943476915359497, + "learning_rate": 3.3235581622678398e-06, + "loss": 0.7415, + "step": 12386 + }, + { + "epoch": 1.9391045710707577, + "grad_norm": 1.4422982931137085, + "learning_rate": 3.3154121863799286e-06, + "loss": 0.2817, + "step": 12387 + }, + { + "epoch": 1.939261114589856, + "grad_norm": 5.48392391204834, + "learning_rate": 3.3072662104920174e-06, + "loss": 1.0759, + "step": 12388 + }, + { + "epoch": 
1.9394176581089542, + "grad_norm": 0.8248450756072998, + "learning_rate": 3.2991202346041054e-06, + "loss": 0.4756, + "step": 12389 + }, + { + "epoch": 1.9395742016280526, + "grad_norm": 0.7185633182525635, + "learning_rate": 3.2909742587161942e-06, + "loss": 0.4963, + "step": 12390 + }, + { + "epoch": 1.939730745147151, + "grad_norm": 0.8231492042541504, + "learning_rate": 3.2828282828282835e-06, + "loss": 0.4797, + "step": 12391 + }, + { + "epoch": 1.9398872886662493, + "grad_norm": 0.7531884908676147, + "learning_rate": 3.2746823069403715e-06, + "loss": 0.5238, + "step": 12392 + }, + { + "epoch": 1.9400438321853475, + "grad_norm": 0.749457836151123, + "learning_rate": 3.2665363310524603e-06, + "loss": 0.4587, + "step": 12393 + }, + { + "epoch": 1.9402003757044457, + "grad_norm": 0.5850194096565247, + "learning_rate": 3.258390355164549e-06, + "loss": 0.3888, + "step": 12394 + }, + { + "epoch": 1.9403569192235441, + "grad_norm": 0.7998384237289429, + "learning_rate": 3.2502443792766375e-06, + "loss": 0.4646, + "step": 12395 + }, + { + "epoch": 1.9405134627426426, + "grad_norm": 0.9250445365905762, + "learning_rate": 3.2420984033887264e-06, + "loss": 0.5338, + "step": 12396 + }, + { + "epoch": 1.9406700062617408, + "grad_norm": 2.1011929512023926, + "learning_rate": 3.2339524275008144e-06, + "loss": 0.5327, + "step": 12397 + }, + { + "epoch": 1.940826549780839, + "grad_norm": 1.0535062551498413, + "learning_rate": 3.225806451612903e-06, + "loss": 0.5184, + "step": 12398 + }, + { + "epoch": 1.9409830932999372, + "grad_norm": 0.8322818279266357, + "learning_rate": 3.217660475724992e-06, + "loss": 0.4945, + "step": 12399 + }, + { + "epoch": 1.9411396368190357, + "grad_norm": 1.231059193611145, + "learning_rate": 3.2095144998370804e-06, + "loss": 0.4736, + "step": 12400 + }, + { + "epoch": 1.9412961803381341, + "grad_norm": 0.7877229452133179, + "learning_rate": 3.2013685239491693e-06, + "loss": 0.535, + "step": 12401 + }, + { + "epoch": 1.9414527238572323, + 
"grad_norm": 0.9182453751564026, + "learning_rate": 3.193222548061258e-06, + "loss": 0.5139, + "step": 12402 + }, + { + "epoch": 1.9416092673763305, + "grad_norm": 1.3340604305267334, + "learning_rate": 3.185076572173346e-06, + "loss": 0.4828, + "step": 12403 + }, + { + "epoch": 1.9417658108954288, + "grad_norm": 1.1967307329177856, + "learning_rate": 3.176930596285435e-06, + "loss": 0.4015, + "step": 12404 + }, + { + "epoch": 1.9419223544145272, + "grad_norm": 1.7335708141326904, + "learning_rate": 3.168784620397524e-06, + "loss": 0.4162, + "step": 12405 + }, + { + "epoch": 1.9420788979336256, + "grad_norm": 0.7990303039550781, + "learning_rate": 3.160638644509612e-06, + "loss": 0.479, + "step": 12406 + }, + { + "epoch": 1.9422354414527239, + "grad_norm": 1.2409868240356445, + "learning_rate": 3.152492668621701e-06, + "loss": 0.5126, + "step": 12407 + }, + { + "epoch": 1.942391984971822, + "grad_norm": 2.782005548477173, + "learning_rate": 3.14434669273379e-06, + "loss": 0.6158, + "step": 12408 + }, + { + "epoch": 1.9425485284909205, + "grad_norm": 2.5291197299957275, + "learning_rate": 3.1362007168458782e-06, + "loss": 0.6529, + "step": 12409 + }, + { + "epoch": 1.9427050720100187, + "grad_norm": 2.732254981994629, + "learning_rate": 3.128054740957967e-06, + "loss": 0.6239, + "step": 12410 + }, + { + "epoch": 1.9428616155291172, + "grad_norm": 1.1715056896209717, + "learning_rate": 3.1199087650700555e-06, + "loss": 0.4038, + "step": 12411 + }, + { + "epoch": 1.9430181590482154, + "grad_norm": 1.9417810440063477, + "learning_rate": 3.1117627891821443e-06, + "loss": 0.6368, + "step": 12412 + }, + { + "epoch": 1.9431747025673136, + "grad_norm": 2.3235907554626465, + "learning_rate": 3.1036168132942327e-06, + "loss": 0.845, + "step": 12413 + }, + { + "epoch": 1.943331246086412, + "grad_norm": 4.0234785079956055, + "learning_rate": 3.0954708374063215e-06, + "loss": 0.657, + "step": 12414 + }, + { + "epoch": 1.9434877896055105, + "grad_norm": 1.999584674835205, + 
"learning_rate": 3.0873248615184104e-06, + "loss": 0.8528, + "step": 12415 + }, + { + "epoch": 1.9436443331246087, + "grad_norm": 2.0865719318389893, + "learning_rate": 3.0791788856304988e-06, + "loss": 0.4496, + "step": 12416 + }, + { + "epoch": 1.943800876643707, + "grad_norm": 3.2667462825775146, + "learning_rate": 3.071032909742587e-06, + "loss": 1.1298, + "step": 12417 + }, + { + "epoch": 1.9439574201628051, + "grad_norm": 2.803001880645752, + "learning_rate": 3.062886933854676e-06, + "loss": 0.7189, + "step": 12418 + }, + { + "epoch": 1.9441139636819036, + "grad_norm": 1.2814937829971313, + "learning_rate": 3.0547409579667644e-06, + "loss": 0.6112, + "step": 12419 + }, + { + "epoch": 1.944270507201002, + "grad_norm": 3.1790101528167725, + "learning_rate": 3.0465949820788532e-06, + "loss": 0.8489, + "step": 12420 + }, + { + "epoch": 1.9444270507201002, + "grad_norm": 1.1260640621185303, + "learning_rate": 3.0384490061909417e-06, + "loss": 0.4086, + "step": 12421 + }, + { + "epoch": 1.9445835942391985, + "grad_norm": 2.684689521789551, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.5702, + "step": 12422 + }, + { + "epoch": 1.9447401377582967, + "grad_norm": 1.4722059965133667, + "learning_rate": 3.022157054415119e-06, + "loss": 0.4123, + "step": 12423 + }, + { + "epoch": 1.9448966812773951, + "grad_norm": 2.5959038734436035, + "learning_rate": 3.0140110785272077e-06, + "loss": 0.5439, + "step": 12424 + }, + { + "epoch": 1.9450532247964936, + "grad_norm": 3.157130241394043, + "learning_rate": 3.0058651026392966e-06, + "loss": 1.3185, + "step": 12425 + }, + { + "epoch": 1.9452097683155918, + "grad_norm": 2.6138229370117188, + "learning_rate": 2.997719126751385e-06, + "loss": 0.9478, + "step": 12426 + }, + { + "epoch": 1.94536631183469, + "grad_norm": 3.228604316711426, + "learning_rate": 2.9895731508634734e-06, + "loss": 0.7625, + "step": 12427 + }, + { + "epoch": 1.9455228553537882, + "grad_norm": 3.902863025665283, + "learning_rate": 
2.981427174975562e-06, + "loss": 1.0758, + "step": 12428 + }, + { + "epoch": 1.9456793988728867, + "grad_norm": 9.919913291931152, + "learning_rate": 2.973281199087651e-06, + "loss": 1.58, + "step": 12429 + }, + { + "epoch": 1.945835942391985, + "grad_norm": 6.73028564453125, + "learning_rate": 2.9651352231997394e-06, + "loss": 1.0516, + "step": 12430 + }, + { + "epoch": 1.9459924859110833, + "grad_norm": 3.821019411087036, + "learning_rate": 2.9569892473118283e-06, + "loss": 1.4604, + "step": 12431 + }, + { + "epoch": 1.9461490294301815, + "grad_norm": 1.7347142696380615, + "learning_rate": 2.9488432714239167e-06, + "loss": 0.7962, + "step": 12432 + }, + { + "epoch": 1.9463055729492797, + "grad_norm": 2.469947576522827, + "learning_rate": 2.940697295536005e-06, + "loss": 0.944, + "step": 12433 + }, + { + "epoch": 1.9464621164683782, + "grad_norm": 3.449458122253418, + "learning_rate": 2.9325513196480943e-06, + "loss": 0.9924, + "step": 12434 + }, + { + "epoch": 1.9466186599874766, + "grad_norm": 2.3762338161468506, + "learning_rate": 2.9244053437601828e-06, + "loss": 0.6725, + "step": 12435 + }, + { + "epoch": 1.9467752035065748, + "grad_norm": 3.3178560733795166, + "learning_rate": 2.916259367872271e-06, + "loss": 0.6303, + "step": 12436 + }, + { + "epoch": 1.946931747025673, + "grad_norm": 6.668274402618408, + "learning_rate": 2.9081133919843596e-06, + "loss": 0.7688, + "step": 12437 + }, + { + "epoch": 1.9470882905447713, + "grad_norm": 2.859844923019409, + "learning_rate": 2.8999674160964484e-06, + "loss": 0.6958, + "step": 12438 + }, + { + "epoch": 1.9472448340638697, + "grad_norm": 0.4906415343284607, + "learning_rate": 2.8918214402085372e-06, + "loss": 0.4761, + "step": 12439 + }, + { + "epoch": 1.9474013775829682, + "grad_norm": 0.526695728302002, + "learning_rate": 2.8836754643206256e-06, + "loss": 0.3706, + "step": 12440 + }, + { + "epoch": 1.9475579211020664, + "grad_norm": 0.7069143056869507, + "learning_rate": 2.8755294884327145e-06, + "loss": 0.4662, 
+ "step": 12441 + }, + { + "epoch": 1.9477144646211646, + "grad_norm": 0.8008819818496704, + "learning_rate": 2.867383512544803e-06, + "loss": 0.4161, + "step": 12442 + }, + { + "epoch": 1.947871008140263, + "grad_norm": 0.7434290051460266, + "learning_rate": 2.8592375366568917e-06, + "loss": 0.4598, + "step": 12443 + }, + { + "epoch": 1.9480275516593613, + "grad_norm": 0.7274467349052429, + "learning_rate": 2.8510915607689805e-06, + "loss": 0.4445, + "step": 12444 + }, + { + "epoch": 1.9481840951784597, + "grad_norm": 1.1765246391296387, + "learning_rate": 2.842945584881069e-06, + "loss": 0.3969, + "step": 12445 + }, + { + "epoch": 1.948340638697558, + "grad_norm": 0.902952253818512, + "learning_rate": 2.8347996089931574e-06, + "loss": 0.4653, + "step": 12446 + }, + { + "epoch": 1.9484971822166561, + "grad_norm": 0.866496741771698, + "learning_rate": 2.826653633105246e-06, + "loss": 0.4662, + "step": 12447 + }, + { + "epoch": 1.9486537257357546, + "grad_norm": 1.3634753227233887, + "learning_rate": 2.818507657217335e-06, + "loss": 0.5302, + "step": 12448 + }, + { + "epoch": 1.948810269254853, + "grad_norm": 0.7554190158843994, + "learning_rate": 2.8103616813294234e-06, + "loss": 0.4277, + "step": 12449 + }, + { + "epoch": 1.9489668127739512, + "grad_norm": 0.7139448523521423, + "learning_rate": 2.8022157054415123e-06, + "loss": 0.342, + "step": 12450 + }, + { + "epoch": 1.9491233562930494, + "grad_norm": 1.5309877395629883, + "learning_rate": 2.7940697295536007e-06, + "loss": 0.5586, + "step": 12451 + }, + { + "epoch": 1.9492798998121477, + "grad_norm": 1.544829249382019, + "learning_rate": 2.785923753665689e-06, + "loss": 0.4697, + "step": 12452 + }, + { + "epoch": 1.949436443331246, + "grad_norm": 0.8845558762550354, + "learning_rate": 2.777777777777778e-06, + "loss": 0.3777, + "step": 12453 + }, + { + "epoch": 1.9495929868503445, + "grad_norm": 1.9516104459762573, + "learning_rate": 2.7696318018898667e-06, + "loss": 0.4169, + "step": 12454 + }, + { + "epoch": 
1.9497495303694428, + "grad_norm": 1.3536854982376099, + "learning_rate": 2.761485826001955e-06, + "loss": 0.7733, + "step": 12455 + }, + { + "epoch": 1.949906073888541, + "grad_norm": 1.850222110748291, + "learning_rate": 2.7533398501140436e-06, + "loss": 0.6303, + "step": 12456 + }, + { + "epoch": 1.9500626174076392, + "grad_norm": 2.048166036605835, + "learning_rate": 2.7451938742261324e-06, + "loss": 0.5738, + "step": 12457 + }, + { + "epoch": 1.9502191609267376, + "grad_norm": 1.272202730178833, + "learning_rate": 2.7370478983382212e-06, + "loss": 0.5231, + "step": 12458 + }, + { + "epoch": 1.950375704445836, + "grad_norm": 1.6545017957687378, + "learning_rate": 2.7289019224503096e-06, + "loss": 0.5988, + "step": 12459 + }, + { + "epoch": 1.9505322479649343, + "grad_norm": 1.5944536924362183, + "learning_rate": 2.7207559465623985e-06, + "loss": 0.4729, + "step": 12460 + }, + { + "epoch": 1.9506887914840325, + "grad_norm": 2.106208086013794, + "learning_rate": 2.712609970674487e-06, + "loss": 0.871, + "step": 12461 + }, + { + "epoch": 1.9508453350031307, + "grad_norm": 3.190800666809082, + "learning_rate": 2.7044639947865753e-06, + "loss": 0.7182, + "step": 12462 + }, + { + "epoch": 1.9510018785222292, + "grad_norm": 1.8175286054611206, + "learning_rate": 2.6963180188986645e-06, + "loss": 0.8795, + "step": 12463 + }, + { + "epoch": 1.9511584220413276, + "grad_norm": 1.5429362058639526, + "learning_rate": 2.688172043010753e-06, + "loss": 0.3982, + "step": 12464 + }, + { + "epoch": 1.9513149655604258, + "grad_norm": 2.0826821327209473, + "learning_rate": 2.6800260671228413e-06, + "loss": 0.7055, + "step": 12465 + }, + { + "epoch": 1.951471509079524, + "grad_norm": 3.9329092502593994, + "learning_rate": 2.67188009123493e-06, + "loss": 0.5561, + "step": 12466 + }, + { + "epoch": 1.9516280525986223, + "grad_norm": 1.356522798538208, + "learning_rate": 2.6637341153470186e-06, + "loss": 0.418, + "step": 12467 + }, + { + "epoch": 1.9517845961177207, + "grad_norm": 
3.068420648574829, + "learning_rate": 2.6555881394591074e-06, + "loss": 0.8437, + "step": 12468 + }, + { + "epoch": 1.9519411396368191, + "grad_norm": 2.2558155059814453, + "learning_rate": 2.647442163571196e-06, + "loss": 0.843, + "step": 12469 + }, + { + "epoch": 1.9520976831559174, + "grad_norm": 5.175942420959473, + "learning_rate": 2.6392961876832847e-06, + "loss": 0.8001, + "step": 12470 + }, + { + "epoch": 1.9522542266750156, + "grad_norm": 2.374220132827759, + "learning_rate": 2.631150211795373e-06, + "loss": 0.9845, + "step": 12471 + }, + { + "epoch": 1.9524107701941138, + "grad_norm": 2.2067580223083496, + "learning_rate": 2.623004235907462e-06, + "loss": 0.5315, + "step": 12472 + }, + { + "epoch": 1.9525673137132122, + "grad_norm": 2.708930492401123, + "learning_rate": 2.6148582600195507e-06, + "loss": 1.1427, + "step": 12473 + }, + { + "epoch": 1.9527238572323107, + "grad_norm": 3.6878769397735596, + "learning_rate": 2.606712284131639e-06, + "loss": 1.0274, + "step": 12474 + }, + { + "epoch": 1.952880400751409, + "grad_norm": 5.223238468170166, + "learning_rate": 2.5985663082437275e-06, + "loss": 1.4042, + "step": 12475 + }, + { + "epoch": 1.9530369442705071, + "grad_norm": 2.905867576599121, + "learning_rate": 2.5904203323558164e-06, + "loss": 0.6455, + "step": 12476 + }, + { + "epoch": 1.9531934877896056, + "grad_norm": 2.379636764526367, + "learning_rate": 2.582274356467905e-06, + "loss": 0.7131, + "step": 12477 + }, + { + "epoch": 1.9533500313087038, + "grad_norm": 6.733752250671387, + "learning_rate": 2.5741283805799936e-06, + "loss": 0.4845, + "step": 12478 + }, + { + "epoch": 1.9535065748278022, + "grad_norm": 2.4714713096618652, + "learning_rate": 2.5659824046920824e-06, + "loss": 0.9429, + "step": 12479 + }, + { + "epoch": 1.9536631183469004, + "grad_norm": 3.247493028640747, + "learning_rate": 2.557836428804171e-06, + "loss": 0.4817, + "step": 12480 + }, + { + "epoch": 1.9538196618659986, + "grad_norm": 2.730489492416382, + "learning_rate": 
2.5496904529162593e-06, + "loss": 0.6679, + "step": 12481 + }, + { + "epoch": 1.953976205385097, + "grad_norm": 3.607422351837158, + "learning_rate": 2.541544477028348e-06, + "loss": 0.6755, + "step": 12482 + }, + { + "epoch": 1.9541327489041955, + "grad_norm": 3.5614240169525146, + "learning_rate": 2.533398501140437e-06, + "loss": 1.2731, + "step": 12483 + }, + { + "epoch": 1.9542892924232937, + "grad_norm": 2.3518991470336914, + "learning_rate": 2.5252525252525253e-06, + "loss": 0.9453, + "step": 12484 + }, + { + "epoch": 1.954445835942392, + "grad_norm": 11.119514465332031, + "learning_rate": 2.5171065493646137e-06, + "loss": 1.2475, + "step": 12485 + }, + { + "epoch": 1.9546023794614902, + "grad_norm": 3.7905397415161133, + "learning_rate": 2.5089605734767026e-06, + "loss": 0.4377, + "step": 12486 + }, + { + "epoch": 1.9547589229805886, + "grad_norm": 4.5544047355651855, + "learning_rate": 2.5008145975887914e-06, + "loss": 0.9544, + "step": 12487 + }, + { + "epoch": 1.954915466499687, + "grad_norm": 1.6368591785430908, + "learning_rate": 2.49266862170088e-06, + "loss": 0.3551, + "step": 12488 + }, + { + "epoch": 1.9550720100187853, + "grad_norm": 0.5549148917198181, + "learning_rate": 2.4845226458129686e-06, + "loss": 0.4761, + "step": 12489 + }, + { + "epoch": 1.9552285535378835, + "grad_norm": 0.7118874788284302, + "learning_rate": 2.476376669925057e-06, + "loss": 0.4323, + "step": 12490 + }, + { + "epoch": 1.9553850970569817, + "grad_norm": 0.9520952105522156, + "learning_rate": 2.4682306940371455e-06, + "loss": 0.405, + "step": 12491 + }, + { + "epoch": 1.9555416405760802, + "grad_norm": 0.5535159111022949, + "learning_rate": 2.4600847181492347e-06, + "loss": 0.4175, + "step": 12492 + }, + { + "epoch": 1.9556981840951786, + "grad_norm": 0.5248305201530457, + "learning_rate": 2.451938742261323e-06, + "loss": 0.4643, + "step": 12493 + }, + { + "epoch": 1.9558547276142768, + "grad_norm": 0.667699933052063, + "learning_rate": 2.4437927663734115e-06, + "loss": 
0.5422, + "step": 12494 + }, + { + "epoch": 1.956011271133375, + "grad_norm": 0.7815746665000916, + "learning_rate": 2.4356467904855004e-06, + "loss": 0.3675, + "step": 12495 + }, + { + "epoch": 1.9561678146524732, + "grad_norm": 0.5756655931472778, + "learning_rate": 2.4275008145975888e-06, + "loss": 0.4316, + "step": 12496 + }, + { + "epoch": 1.9563243581715717, + "grad_norm": 0.6417515277862549, + "learning_rate": 2.4193548387096776e-06, + "loss": 0.4249, + "step": 12497 + }, + { + "epoch": 1.9564809016906701, + "grad_norm": 0.9999462366104126, + "learning_rate": 2.4112088628217664e-06, + "loss": 0.4345, + "step": 12498 + }, + { + "epoch": 1.9566374452097683, + "grad_norm": 1.5714811086654663, + "learning_rate": 2.403062886933855e-06, + "loss": 0.5784, + "step": 12499 + }, + { + "epoch": 1.9567939887288666, + "grad_norm": 0.9044635891914368, + "learning_rate": 2.3949169110459433e-06, + "loss": 0.5055, + "step": 12500 + }, + { + "epoch": 1.9569505322479648, + "grad_norm": 0.9703615307807922, + "learning_rate": 2.386770935158032e-06, + "loss": 0.5056, + "step": 12501 + }, + { + "epoch": 1.9571070757670632, + "grad_norm": 0.7934814691543579, + "learning_rate": 2.378624959270121e-06, + "loss": 0.4664, + "step": 12502 + }, + { + "epoch": 1.9572636192861617, + "grad_norm": 1.605263113975525, + "learning_rate": 2.3704789833822093e-06, + "loss": 0.5733, + "step": 12503 + }, + { + "epoch": 1.9574201628052599, + "grad_norm": 1.0729097127914429, + "learning_rate": 2.3623330074942977e-06, + "loss": 0.5316, + "step": 12504 + }, + { + "epoch": 1.957576706324358, + "grad_norm": 1.7226802110671997, + "learning_rate": 2.3541870316063866e-06, + "loss": 0.5415, + "step": 12505 + }, + { + "epoch": 1.9577332498434565, + "grad_norm": 1.2247395515441895, + "learning_rate": 2.3460410557184754e-06, + "loss": 0.5274, + "step": 12506 + }, + { + "epoch": 1.9578897933625548, + "grad_norm": 1.7237412929534912, + "learning_rate": 2.337895079830564e-06, + "loss": 0.5251, + "step": 12507 + }, + 
{ + "epoch": 1.9580463368816532, + "grad_norm": 1.4121756553649902, + "learning_rate": 2.3297491039426526e-06, + "loss": 0.5167, + "step": 12508 + }, + { + "epoch": 1.9582028804007514, + "grad_norm": 2.362248182296753, + "learning_rate": 2.321603128054741e-06, + "loss": 0.5271, + "step": 12509 + }, + { + "epoch": 1.9583594239198496, + "grad_norm": 5.198141098022461, + "learning_rate": 2.3134571521668294e-06, + "loss": 0.812, + "step": 12510 + }, + { + "epoch": 1.958515967438948, + "grad_norm": 2.080247640609741, + "learning_rate": 2.3053111762789183e-06, + "loss": 0.5275, + "step": 12511 + }, + { + "epoch": 1.9586725109580463, + "grad_norm": 2.879849910736084, + "learning_rate": 2.297165200391007e-06, + "loss": 0.837, + "step": 12512 + }, + { + "epoch": 1.9588290544771447, + "grad_norm": 2.14780330657959, + "learning_rate": 2.2890192245030955e-06, + "loss": 0.8173, + "step": 12513 + }, + { + "epoch": 1.958985597996243, + "grad_norm": 1.8872599601745605, + "learning_rate": 2.2808732486151844e-06, + "loss": 0.4584, + "step": 12514 + }, + { + "epoch": 1.9591421415153412, + "grad_norm": 1.7228397130966187, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.6098, + "step": 12515 + }, + { + "epoch": 1.9592986850344396, + "grad_norm": 2.2226197719573975, + "learning_rate": 2.2645812968393616e-06, + "loss": 0.4732, + "step": 12516 + }, + { + "epoch": 1.959455228553538, + "grad_norm": 2.886376142501831, + "learning_rate": 2.25643532095145e-06, + "loss": 0.938, + "step": 12517 + }, + { + "epoch": 1.9596117720726363, + "grad_norm": 1.5765841007232666, + "learning_rate": 2.248289345063539e-06, + "loss": 0.5125, + "step": 12518 + }, + { + "epoch": 1.9597683155917345, + "grad_norm": 5.303073406219482, + "learning_rate": 2.2401433691756272e-06, + "loss": 1.1083, + "step": 12519 + }, + { + "epoch": 1.9599248591108327, + "grad_norm": 1.996748685836792, + "learning_rate": 2.2319973932877156e-06, + "loss": 0.6831, + "step": 12520 + }, + { + "epoch": 1.9600814026299311, + 
"grad_norm": 2.738168239593506, + "learning_rate": 2.223851417399805e-06, + "loss": 0.5483, + "step": 12521 + }, + { + "epoch": 1.9602379461490296, + "grad_norm": 4.373471260070801, + "learning_rate": 2.2157054415118933e-06, + "loss": 0.576, + "step": 12522 + }, + { + "epoch": 1.9603944896681278, + "grad_norm": 4.65053129196167, + "learning_rate": 2.2075594656239817e-06, + "loss": 0.5165, + "step": 12523 + }, + { + "epoch": 1.960551033187226, + "grad_norm": 2.9686362743377686, + "learning_rate": 2.1994134897360705e-06, + "loss": 0.8665, + "step": 12524 + }, + { + "epoch": 1.9607075767063242, + "grad_norm": 3.9991953372955322, + "learning_rate": 2.191267513848159e-06, + "loss": 0.9928, + "step": 12525 + }, + { + "epoch": 1.9608641202254227, + "grad_norm": 3.885099172592163, + "learning_rate": 2.1831215379602478e-06, + "loss": 0.7923, + "step": 12526 + }, + { + "epoch": 1.961020663744521, + "grad_norm": 2.474177360534668, + "learning_rate": 2.1749755620723366e-06, + "loss": 0.6301, + "step": 12527 + }, + { + "epoch": 1.9611772072636193, + "grad_norm": 5.740012168884277, + "learning_rate": 2.166829586184425e-06, + "loss": 1.2681, + "step": 12528 + }, + { + "epoch": 1.9613337507827175, + "grad_norm": 2.5809266567230225, + "learning_rate": 2.1586836102965134e-06, + "loss": 0.851, + "step": 12529 + }, + { + "epoch": 1.9614902943018158, + "grad_norm": 4.45579195022583, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.8772, + "step": 12530 + }, + { + "epoch": 1.9616468378209142, + "grad_norm": 2.3404927253723145, + "learning_rate": 2.142391658520691e-06, + "loss": 0.7364, + "step": 12531 + }, + { + "epoch": 1.9618033813400126, + "grad_norm": 4.528717994689941, + "learning_rate": 2.1342456826327795e-06, + "loss": 1.4627, + "step": 12532 + }, + { + "epoch": 1.9619599248591109, + "grad_norm": 3.925095558166504, + "learning_rate": 2.126099706744868e-06, + "loss": 0.8307, + "step": 12533 + }, + { + "epoch": 1.962116468378209, + "grad_norm": 3.46557354927063, + 
"learning_rate": 2.1179537308569567e-06, + "loss": 1.2435, + "step": 12534 + }, + { + "epoch": 1.9622730118973073, + "grad_norm": 2.692558526992798, + "learning_rate": 2.1098077549690456e-06, + "loss": 0.6647, + "step": 12535 + }, + { + "epoch": 1.9624295554164057, + "grad_norm": 3.5295166969299316, + "learning_rate": 2.101661779081134e-06, + "loss": 0.628, + "step": 12536 + }, + { + "epoch": 1.9625860989355042, + "grad_norm": 3.0950043201446533, + "learning_rate": 2.093515803193223e-06, + "loss": 0.8713, + "step": 12537 + }, + { + "epoch": 1.9627426424546024, + "grad_norm": 3.21415376663208, + "learning_rate": 2.0853698273053112e-06, + "loss": 0.5647, + "step": 12538 + }, + { + "epoch": 1.9628991859737006, + "grad_norm": 0.6747961640357971, + "learning_rate": 2.0772238514173996e-06, + "loss": 0.5195, + "step": 12539 + }, + { + "epoch": 1.963055729492799, + "grad_norm": 0.5338968634605408, + "learning_rate": 2.0690778755294885e-06, + "loss": 0.3856, + "step": 12540 + }, + { + "epoch": 1.9632122730118973, + "grad_norm": 0.6332517862319946, + "learning_rate": 2.0609318996415773e-06, + "loss": 0.4302, + "step": 12541 + }, + { + "epoch": 1.9633688165309957, + "grad_norm": 0.7040651440620422, + "learning_rate": 2.0527859237536657e-06, + "loss": 0.4684, + "step": 12542 + }, + { + "epoch": 1.963525360050094, + "grad_norm": 0.92644202709198, + "learning_rate": 2.0446399478657545e-06, + "loss": 0.5014, + "step": 12543 + }, + { + "epoch": 1.9636819035691921, + "grad_norm": 0.7603963613510132, + "learning_rate": 2.036493971977843e-06, + "loss": 0.53, + "step": 12544 + }, + { + "epoch": 1.9638384470882906, + "grad_norm": 0.8620895743370056, + "learning_rate": 2.0283479960899318e-06, + "loss": 0.4224, + "step": 12545 + }, + { + "epoch": 1.9639949906073888, + "grad_norm": 0.9935777187347412, + "learning_rate": 2.0202020202020206e-06, + "loss": 0.481, + "step": 12546 + }, + { + "epoch": 1.9641515341264872, + "grad_norm": 0.7658788561820984, + "learning_rate": 
2.012056044314109e-06, + "loss": 0.3729, + "step": 12547 + }, + { + "epoch": 1.9643080776455855, + "grad_norm": 0.6804404854774475, + "learning_rate": 2.0039100684261974e-06, + "loss": 0.3914, + "step": 12548 + }, + { + "epoch": 1.9644646211646837, + "grad_norm": 0.7658693194389343, + "learning_rate": 1.995764092538286e-06, + "loss": 0.3798, + "step": 12549 + }, + { + "epoch": 1.9646211646837821, + "grad_norm": 0.7099312543869019, + "learning_rate": 1.987618116650375e-06, + "loss": 0.4826, + "step": 12550 + }, + { + "epoch": 1.9647777082028806, + "grad_norm": 1.4372090101242065, + "learning_rate": 1.9794721407624635e-06, + "loss": 0.4691, + "step": 12551 + }, + { + "epoch": 1.9649342517219788, + "grad_norm": 1.2906445264816284, + "learning_rate": 1.971326164874552e-06, + "loss": 0.5686, + "step": 12552 + }, + { + "epoch": 1.965090795241077, + "grad_norm": 1.4649921655654907, + "learning_rate": 1.9631801889866407e-06, + "loss": 0.6194, + "step": 12553 + }, + { + "epoch": 1.9652473387601752, + "grad_norm": 1.28202223777771, + "learning_rate": 1.955034213098729e-06, + "loss": 0.5953, + "step": 12554 + }, + { + "epoch": 1.9654038822792737, + "grad_norm": 0.9350935220718384, + "learning_rate": 1.946888237210818e-06, + "loss": 0.4643, + "step": 12555 + }, + { + "epoch": 1.965560425798372, + "grad_norm": 1.8001290559768677, + "learning_rate": 1.938742261322907e-06, + "loss": 0.4927, + "step": 12556 + }, + { + "epoch": 1.9657169693174703, + "grad_norm": 1.2543338537216187, + "learning_rate": 1.9305962854349952e-06, + "loss": 0.4767, + "step": 12557 + }, + { + "epoch": 1.9658735128365685, + "grad_norm": 1.3619269132614136, + "learning_rate": 1.9224503095470836e-06, + "loss": 0.5729, + "step": 12558 + }, + { + "epoch": 1.9660300563556667, + "grad_norm": 1.8355375528335571, + "learning_rate": 1.9143043336591725e-06, + "loss": 0.7498, + "step": 12559 + }, + { + "epoch": 1.9661865998747652, + "grad_norm": 1.488260269165039, + "learning_rate": 1.906158357771261e-06, + "loss": 
0.486, + "step": 12560 + }, + { + "epoch": 1.9663431433938636, + "grad_norm": 1.5359816551208496, + "learning_rate": 1.8980123818833497e-06, + "loss": 0.5697, + "step": 12561 + }, + { + "epoch": 1.9664996869129618, + "grad_norm": 2.863281011581421, + "learning_rate": 1.8898664059954385e-06, + "loss": 0.6473, + "step": 12562 + }, + { + "epoch": 1.96665623043206, + "grad_norm": 2.545659065246582, + "learning_rate": 1.881720430107527e-06, + "loss": 0.7462, + "step": 12563 + }, + { + "epoch": 1.9668127739511583, + "grad_norm": 2.1165542602539062, + "learning_rate": 1.8735744542196156e-06, + "loss": 0.7499, + "step": 12564 + }, + { + "epoch": 1.9669693174702567, + "grad_norm": 3.251842498779297, + "learning_rate": 1.8654284783317042e-06, + "loss": 0.6362, + "step": 12565 + }, + { + "epoch": 1.9671258609893552, + "grad_norm": 1.743667483329773, + "learning_rate": 1.857282502443793e-06, + "loss": 0.5575, + "step": 12566 + }, + { + "epoch": 1.9672824045084534, + "grad_norm": 2.2528867721557617, + "learning_rate": 1.8491365265558814e-06, + "loss": 0.5585, + "step": 12567 + }, + { + "epoch": 1.9674389480275516, + "grad_norm": 2.930398941040039, + "learning_rate": 1.84099055066797e-06, + "loss": 0.7752, + "step": 12568 + }, + { + "epoch": 1.9675954915466498, + "grad_norm": 2.7118191719055176, + "learning_rate": 1.8328445747800589e-06, + "loss": 0.6576, + "step": 12569 + }, + { + "epoch": 1.9677520350657483, + "grad_norm": 1.7636281251907349, + "learning_rate": 1.8246985988921473e-06, + "loss": 0.4407, + "step": 12570 + }, + { + "epoch": 1.9679085785848467, + "grad_norm": 3.2213099002838135, + "learning_rate": 1.8165526230042359e-06, + "loss": 1.05, + "step": 12571 + }, + { + "epoch": 1.968065122103945, + "grad_norm": 3.018411159515381, + "learning_rate": 1.8084066471163247e-06, + "loss": 0.6936, + "step": 12572 + }, + { + "epoch": 1.9682216656230431, + "grad_norm": 4.541343688964844, + "learning_rate": 1.8002606712284133e-06, + "loss": 0.7085, + "step": 12573 + }, + { + 
"epoch": 1.9683782091421416, + "grad_norm": 2.6957478523254395, + "learning_rate": 1.7921146953405017e-06, + "loss": 0.9425, + "step": 12574 + }, + { + "epoch": 1.9685347526612398, + "grad_norm": 9.620570182800293, + "learning_rate": 1.7839687194525906e-06, + "loss": 0.8896, + "step": 12575 + }, + { + "epoch": 1.9686912961803382, + "grad_norm": 2.751086950302124, + "learning_rate": 1.7758227435646792e-06, + "loss": 0.666, + "step": 12576 + }, + { + "epoch": 1.9688478396994364, + "grad_norm": 3.495507001876831, + "learning_rate": 1.7676767676767676e-06, + "loss": 0.8227, + "step": 12577 + }, + { + "epoch": 1.9690043832185347, + "grad_norm": 7.878044605255127, + "learning_rate": 1.7595307917888567e-06, + "loss": 0.8045, + "step": 12578 + }, + { + "epoch": 1.969160926737633, + "grad_norm": 4.12336540222168, + "learning_rate": 1.751384815900945e-06, + "loss": 1.0205, + "step": 12579 + }, + { + "epoch": 1.9693174702567313, + "grad_norm": 4.54221773147583, + "learning_rate": 1.7432388400130337e-06, + "loss": 0.6806, + "step": 12580 + }, + { + "epoch": 1.9694740137758298, + "grad_norm": 11.912877082824707, + "learning_rate": 1.735092864125122e-06, + "loss": 0.7734, + "step": 12581 + }, + { + "epoch": 1.969630557294928, + "grad_norm": 3.142104387283325, + "learning_rate": 1.726946888237211e-06, + "loss": 1.6051, + "step": 12582 + }, + { + "epoch": 1.9697871008140262, + "grad_norm": 3.3377885818481445, + "learning_rate": 1.7188009123492995e-06, + "loss": 1.5716, + "step": 12583 + }, + { + "epoch": 1.9699436443331246, + "grad_norm": 4.839450836181641, + "learning_rate": 1.710654936461388e-06, + "loss": 0.5367, + "step": 12584 + }, + { + "epoch": 1.970100187852223, + "grad_norm": 2.7434420585632324, + "learning_rate": 1.702508960573477e-06, + "loss": 0.2669, + "step": 12585 + }, + { + "epoch": 1.9702567313713213, + "grad_norm": 3.0508460998535156, + "learning_rate": 1.6943629846855654e-06, + "loss": 0.6074, + "step": 12586 + }, + { + "epoch": 1.9704132748904195, + 
"grad_norm": 2.3277575969696045, + "learning_rate": 1.686217008797654e-06, + "loss": 0.7205, + "step": 12587 + }, + { + "epoch": 1.9705698184095177, + "grad_norm": 2.8714818954467773, + "learning_rate": 1.6780710329097428e-06, + "loss": 0.7183, + "step": 12588 + }, + { + "epoch": 1.9707263619286162, + "grad_norm": 0.5738904476165771, + "learning_rate": 1.6699250570218313e-06, + "loss": 0.4566, + "step": 12589 + }, + { + "epoch": 1.9708829054477146, + "grad_norm": 0.6275519728660583, + "learning_rate": 1.6617790811339199e-06, + "loss": 0.4555, + "step": 12590 + }, + { + "epoch": 1.9710394489668128, + "grad_norm": 0.7495302557945251, + "learning_rate": 1.6536331052460087e-06, + "loss": 0.4765, + "step": 12591 + }, + { + "epoch": 1.971195992485911, + "grad_norm": 0.9187777638435364, + "learning_rate": 1.6454871293580971e-06, + "loss": 0.5541, + "step": 12592 + }, + { + "epoch": 1.9713525360050093, + "grad_norm": 1.1001012325286865, + "learning_rate": 1.6373411534701857e-06, + "loss": 0.5624, + "step": 12593 + }, + { + "epoch": 1.9715090795241077, + "grad_norm": 0.7525448799133301, + "learning_rate": 1.6291951775822746e-06, + "loss": 0.4712, + "step": 12594 + }, + { + "epoch": 1.9716656230432061, + "grad_norm": 0.8630020022392273, + "learning_rate": 1.6210492016943632e-06, + "loss": 0.4403, + "step": 12595 + }, + { + "epoch": 1.9718221665623044, + "grad_norm": 0.7084668874740601, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.436, + "step": 12596 + }, + { + "epoch": 1.9719787100814026, + "grad_norm": 1.4396096467971802, + "learning_rate": 1.6047572499185402e-06, + "loss": 0.448, + "step": 12597 + }, + { + "epoch": 1.9721352536005008, + "grad_norm": 0.8294693827629089, + "learning_rate": 1.596611274030629e-06, + "loss": 0.4966, + "step": 12598 + }, + { + "epoch": 1.9722917971195992, + "grad_norm": 0.5781842470169067, + "learning_rate": 1.5884652981427175e-06, + "loss": 0.4831, + "step": 12599 + }, + { + "epoch": 1.9724483406386977, + "grad_norm": 
1.141119360923767, + "learning_rate": 1.580319322254806e-06, + "loss": 0.5014, + "step": 12600 + }, + { + "epoch": 1.972604884157796, + "grad_norm": 1.386080265045166, + "learning_rate": 1.572173346366895e-06, + "loss": 0.5926, + "step": 12601 + }, + { + "epoch": 1.972761427676894, + "grad_norm": 0.949116587638855, + "learning_rate": 1.5640273704789835e-06, + "loss": 0.4605, + "step": 12602 + }, + { + "epoch": 1.9729179711959923, + "grad_norm": 1.141287922859192, + "learning_rate": 1.5558813945910721e-06, + "loss": 0.4941, + "step": 12603 + }, + { + "epoch": 1.9730745147150908, + "grad_norm": 1.98452627658844, + "learning_rate": 1.5477354187031608e-06, + "loss": 0.7161, + "step": 12604 + }, + { + "epoch": 1.9732310582341892, + "grad_norm": 1.5774983167648315, + "learning_rate": 1.5395894428152494e-06, + "loss": 0.584, + "step": 12605 + }, + { + "epoch": 1.9733876017532874, + "grad_norm": 1.142909288406372, + "learning_rate": 1.531443466927338e-06, + "loss": 0.6523, + "step": 12606 + }, + { + "epoch": 1.9735441452723856, + "grad_norm": 2.2721924781799316, + "learning_rate": 1.5232974910394266e-06, + "loss": 0.6355, + "step": 12607 + }, + { + "epoch": 1.973700688791484, + "grad_norm": 2.5435497760772705, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7732, + "step": 12608 + }, + { + "epoch": 1.9738572323105823, + "grad_norm": 4.525137901306152, + "learning_rate": 1.5070055392636039e-06, + "loss": 0.8268, + "step": 12609 + }, + { + "epoch": 1.9740137758296807, + "grad_norm": 2.0381767749786377, + "learning_rate": 1.4988595633756925e-06, + "loss": 0.7933, + "step": 12610 + }, + { + "epoch": 1.974170319348779, + "grad_norm": 2.677743434906006, + "learning_rate": 1.490713587487781e-06, + "loss": 0.6641, + "step": 12611 + }, + { + "epoch": 1.9743268628678772, + "grad_norm": 1.4609029293060303, + "learning_rate": 1.4825676115998697e-06, + "loss": 0.6375, + "step": 12612 + }, + { + "epoch": 1.9744834063869756, + "grad_norm": 1.5940873622894287, + "learning_rate": 
1.4744216357119583e-06, + "loss": 0.5479, + "step": 12613 + }, + { + "epoch": 1.974639949906074, + "grad_norm": 2.0391147136688232, + "learning_rate": 1.4662756598240472e-06, + "loss": 0.5695, + "step": 12614 + }, + { + "epoch": 1.9747964934251723, + "grad_norm": 2.03641676902771, + "learning_rate": 1.4581296839361356e-06, + "loss": 0.456, + "step": 12615 + }, + { + "epoch": 1.9749530369442705, + "grad_norm": 1.8017065525054932, + "learning_rate": 1.4499837080482242e-06, + "loss": 0.4814, + "step": 12616 + }, + { + "epoch": 1.9751095804633687, + "grad_norm": 2.7681915760040283, + "learning_rate": 1.4418377321603128e-06, + "loss": 0.6496, + "step": 12617 + }, + { + "epoch": 1.9752661239824671, + "grad_norm": 5.8109025955200195, + "learning_rate": 1.4336917562724014e-06, + "loss": 0.9668, + "step": 12618 + }, + { + "epoch": 1.9754226675015656, + "grad_norm": 1.6775314807891846, + "learning_rate": 1.4255457803844903e-06, + "loss": 0.46, + "step": 12619 + }, + { + "epoch": 1.9755792110206638, + "grad_norm": 3.138233184814453, + "learning_rate": 1.4173998044965787e-06, + "loss": 0.6725, + "step": 12620 + }, + { + "epoch": 1.975735754539762, + "grad_norm": 2.860448122024536, + "learning_rate": 1.4092538286086675e-06, + "loss": 0.7415, + "step": 12621 + }, + { + "epoch": 1.9758922980588602, + "grad_norm": 2.7847864627838135, + "learning_rate": 1.4011078527207561e-06, + "loss": 0.8426, + "step": 12622 + }, + { + "epoch": 1.9760488415779587, + "grad_norm": 3.4924819469451904, + "learning_rate": 1.3929618768328445e-06, + "loss": 0.6309, + "step": 12623 + }, + { + "epoch": 1.9762053850970571, + "grad_norm": 4.635035037994385, + "learning_rate": 1.3848159009449334e-06, + "loss": 1.2704, + "step": 12624 + }, + { + "epoch": 1.9763619286161553, + "grad_norm": 4.683971405029297, + "learning_rate": 1.3766699250570218e-06, + "loss": 1.6822, + "step": 12625 + }, + { + "epoch": 1.9765184721352536, + "grad_norm": 5.2948126792907715, + "learning_rate": 1.3685239491691106e-06, + "loss": 
0.9501, + "step": 12626 + }, + { + "epoch": 1.9766750156543518, + "grad_norm": 3.913292646408081, + "learning_rate": 1.3603779732811992e-06, + "loss": 0.9936, + "step": 12627 + }, + { + "epoch": 1.9768315591734502, + "grad_norm": 3.2585413455963135, + "learning_rate": 1.3522319973932876e-06, + "loss": 1.0419, + "step": 12628 + }, + { + "epoch": 1.9769881026925487, + "grad_norm": 2.941061496734619, + "learning_rate": 1.3440860215053765e-06, + "loss": 1.4858, + "step": 12629 + }, + { + "epoch": 1.9771446462116469, + "grad_norm": 6.044554233551025, + "learning_rate": 1.335940045617465e-06, + "loss": 1.1805, + "step": 12630 + }, + { + "epoch": 1.977301189730745, + "grad_norm": 4.4481000900268555, + "learning_rate": 1.3277940697295537e-06, + "loss": 0.8087, + "step": 12631 + }, + { + "epoch": 1.9774577332498433, + "grad_norm": 2.427474021911621, + "learning_rate": 1.3196480938416423e-06, + "loss": 1.1698, + "step": 12632 + }, + { + "epoch": 1.9776142767689417, + "grad_norm": 6.705784797668457, + "learning_rate": 1.311502117953731e-06, + "loss": 0.93, + "step": 12633 + }, + { + "epoch": 1.9777708202880402, + "grad_norm": 3.1424403190612793, + "learning_rate": 1.3033561420658196e-06, + "loss": 0.3697, + "step": 12634 + }, + { + "epoch": 1.9779273638071384, + "grad_norm": 3.473153591156006, + "learning_rate": 1.2952101661779082e-06, + "loss": 0.789, + "step": 12635 + }, + { + "epoch": 1.9780839073262366, + "grad_norm": 1.7377632856369019, + "learning_rate": 1.2870641902899968e-06, + "loss": 0.2485, + "step": 12636 + }, + { + "epoch": 1.9782404508453348, + "grad_norm": 1.9757336378097534, + "learning_rate": 1.2789182144020854e-06, + "loss": 0.6167, + "step": 12637 + }, + { + "epoch": 1.9783969943644333, + "grad_norm": 4.1722235679626465, + "learning_rate": 1.270772238514174e-06, + "loss": 0.8527, + "step": 12638 + }, + { + "epoch": 1.9785535378835317, + "grad_norm": 0.5516929030418396, + "learning_rate": 1.2626262626262627e-06, + "loss": 0.4815, + "step": 12639 + }, + { + 
"epoch": 1.97871008140263, + "grad_norm": 0.9486571550369263, + "learning_rate": 1.2544802867383513e-06, + "loss": 0.5113, + "step": 12640 + }, + { + "epoch": 1.9788666249217282, + "grad_norm": 0.4964858591556549, + "learning_rate": 1.24633431085044e-06, + "loss": 0.4545, + "step": 12641 + }, + { + "epoch": 1.9790231684408266, + "grad_norm": 0.7588146924972534, + "learning_rate": 1.2381883349625285e-06, + "loss": 0.4221, + "step": 12642 + }, + { + "epoch": 1.9791797119599248, + "grad_norm": 0.731907069683075, + "learning_rate": 1.2300423590746174e-06, + "loss": 0.4768, + "step": 12643 + }, + { + "epoch": 1.9793362554790233, + "grad_norm": 0.6124377846717834, + "learning_rate": 1.2218963831867058e-06, + "loss": 0.4734, + "step": 12644 + }, + { + "epoch": 1.9794927989981215, + "grad_norm": 0.6235209107398987, + "learning_rate": 1.2137504072987944e-06, + "loss": 0.4104, + "step": 12645 + }, + { + "epoch": 1.9796493425172197, + "grad_norm": 0.7038710713386536, + "learning_rate": 1.2056044314108832e-06, + "loss": 0.4309, + "step": 12646 + }, + { + "epoch": 1.9798058860363181, + "grad_norm": 0.7130526304244995, + "learning_rate": 1.1974584555229716e-06, + "loss": 0.4752, + "step": 12647 + }, + { + "epoch": 1.9799624295554166, + "grad_norm": 0.7917520999908447, + "learning_rate": 1.1893124796350605e-06, + "loss": 0.5286, + "step": 12648 + }, + { + "epoch": 1.9801189730745148, + "grad_norm": 0.7305119633674622, + "learning_rate": 1.1811665037471489e-06, + "loss": 0.4605, + "step": 12649 + }, + { + "epoch": 1.980275516593613, + "grad_norm": 0.9296625256538391, + "learning_rate": 1.1730205278592377e-06, + "loss": 0.3947, + "step": 12650 + }, + { + "epoch": 1.9804320601127112, + "grad_norm": 1.215757131576538, + "learning_rate": 1.1648745519713263e-06, + "loss": 0.6275, + "step": 12651 + }, + { + "epoch": 1.9805886036318097, + "grad_norm": 1.1434177160263062, + "learning_rate": 1.1567285760834147e-06, + "loss": 0.532, + "step": 12652 + }, + { + "epoch": 1.980745147150908, + 
"grad_norm": 0.6825506687164307, + "learning_rate": 1.1485826001955036e-06, + "loss": 0.3796, + "step": 12653 + }, + { + "epoch": 1.9809016906700063, + "grad_norm": 0.8476182222366333, + "learning_rate": 1.1404366243075922e-06, + "loss": 0.4834, + "step": 12654 + }, + { + "epoch": 1.9810582341891045, + "grad_norm": 0.9957939386367798, + "learning_rate": 1.1322906484196808e-06, + "loss": 0.4402, + "step": 12655 + }, + { + "epoch": 1.9812147777082028, + "grad_norm": 2.3601388931274414, + "learning_rate": 1.1241446725317694e-06, + "loss": 0.4899, + "step": 12656 + }, + { + "epoch": 1.9813713212273012, + "grad_norm": 1.3694380521774292, + "learning_rate": 1.1159986966438578e-06, + "loss": 0.5199, + "step": 12657 + }, + { + "epoch": 1.9815278647463996, + "grad_norm": 5.307101249694824, + "learning_rate": 1.1078527207559467e-06, + "loss": 0.7887, + "step": 12658 + }, + { + "epoch": 1.9816844082654979, + "grad_norm": 1.6431567668914795, + "learning_rate": 1.0997067448680353e-06, + "loss": 0.5869, + "step": 12659 + }, + { + "epoch": 1.981840951784596, + "grad_norm": 1.0768793821334839, + "learning_rate": 1.0915607689801239e-06, + "loss": 0.4316, + "step": 12660 + }, + { + "epoch": 1.9819974953036943, + "grad_norm": 1.6530342102050781, + "learning_rate": 1.0834147930922125e-06, + "loss": 0.5503, + "step": 12661 + }, + { + "epoch": 1.9821540388227927, + "grad_norm": 1.6875410079956055, + "learning_rate": 1.0752688172043011e-06, + "loss": 0.6933, + "step": 12662 + }, + { + "epoch": 1.9823105823418912, + "grad_norm": 0.9060271978378296, + "learning_rate": 1.0671228413163898e-06, + "loss": 0.4328, + "step": 12663 + }, + { + "epoch": 1.9824671258609894, + "grad_norm": 5.510295867919922, + "learning_rate": 1.0589768654284784e-06, + "loss": 0.9723, + "step": 12664 + }, + { + "epoch": 1.9826236693800876, + "grad_norm": 2.379274606704712, + "learning_rate": 1.050830889540567e-06, + "loss": 0.8763, + "step": 12665 + }, + { + "epoch": 1.9827802128991858, + "grad_norm": 
4.948631763458252, + "learning_rate": 1.0426849136526556e-06, + "loss": 0.9918, + "step": 12666 + }, + { + "epoch": 1.9829367564182843, + "grad_norm": 2.5527384281158447, + "learning_rate": 1.0345389377647442e-06, + "loss": 0.587, + "step": 12667 + }, + { + "epoch": 1.9830932999373827, + "grad_norm": 4.615874767303467, + "learning_rate": 1.0263929618768329e-06, + "loss": 0.6437, + "step": 12668 + }, + { + "epoch": 1.983249843456481, + "grad_norm": 2.517007827758789, + "learning_rate": 1.0182469859889215e-06, + "loss": 0.5513, + "step": 12669 + }, + { + "epoch": 1.9834063869755791, + "grad_norm": 3.227851629257202, + "learning_rate": 1.0101010101010103e-06, + "loss": 0.5543, + "step": 12670 + }, + { + "epoch": 1.9835629304946774, + "grad_norm": 2.9871068000793457, + "learning_rate": 1.0019550342130987e-06, + "loss": 0.5225, + "step": 12671 + }, + { + "epoch": 1.9837194740137758, + "grad_norm": 2.249817371368408, + "learning_rate": 9.938090583251875e-07, + "loss": 0.7247, + "step": 12672 + }, + { + "epoch": 1.9838760175328742, + "grad_norm": 2.996314525604248, + "learning_rate": 9.85663082437276e-07, + "loss": 0.5341, + "step": 12673 + }, + { + "epoch": 1.9840325610519725, + "grad_norm": 6.011224269866943, + "learning_rate": 9.775171065493646e-07, + "loss": 0.7168, + "step": 12674 + }, + { + "epoch": 1.9841891045710707, + "grad_norm": 1.9925557374954224, + "learning_rate": 9.693711306614534e-07, + "loss": 0.6254, + "step": 12675 + }, + { + "epoch": 1.9843456480901691, + "grad_norm": 4.672172546386719, + "learning_rate": 9.612251547735418e-07, + "loss": 0.946, + "step": 12676 + }, + { + "epoch": 1.9845021916092673, + "grad_norm": 3.6110551357269287, + "learning_rate": 9.530791788856305e-07, + "loss": 0.8769, + "step": 12677 + }, + { + "epoch": 1.9846587351283658, + "grad_norm": 2.6954398155212402, + "learning_rate": 9.449332029977193e-07, + "loss": 0.6626, + "step": 12678 + }, + { + "epoch": 1.984815278647464, + "grad_norm": 2.7507688999176025, + "learning_rate": 
9.367872271098078e-07, + "loss": 0.8546, + "step": 12679 + }, + { + "epoch": 1.9849718221665622, + "grad_norm": 3.1203064918518066, + "learning_rate": 9.286412512218965e-07, + "loss": 0.5548, + "step": 12680 + }, + { + "epoch": 1.9851283656856606, + "grad_norm": 2.9063162803649902, + "learning_rate": 9.20495275333985e-07, + "loss": 1.3771, + "step": 12681 + }, + { + "epoch": 1.985284909204759, + "grad_norm": 3.1988065242767334, + "learning_rate": 9.123492994460736e-07, + "loss": 0.9996, + "step": 12682 + }, + { + "epoch": 1.9854414527238573, + "grad_norm": 2.589529514312744, + "learning_rate": 9.042033235581624e-07, + "loss": 0.7506, + "step": 12683 + }, + { + "epoch": 1.9855979962429555, + "grad_norm": 3.5598721504211426, + "learning_rate": 8.960573476702509e-07, + "loss": 0.3569, + "step": 12684 + }, + { + "epoch": 1.9857545397620537, + "grad_norm": 2.114825487136841, + "learning_rate": 8.879113717823396e-07, + "loss": 0.6154, + "step": 12685 + }, + { + "epoch": 1.9859110832811522, + "grad_norm": 1.574540138244629, + "learning_rate": 8.797653958944283e-07, + "loss": 0.7488, + "step": 12686 + }, + { + "epoch": 1.9860676268002506, + "grad_norm": 1.1412297487258911, + "learning_rate": 8.716194200065168e-07, + "loss": 0.4626, + "step": 12687 + }, + { + "epoch": 1.9862241703193488, + "grad_norm": 2.1670050621032715, + "learning_rate": 8.634734441186055e-07, + "loss": 0.7718, + "step": 12688 + }, + { + "epoch": 1.986380713838447, + "grad_norm": 0.8819579482078552, + "learning_rate": 8.55327468230694e-07, + "loss": 0.5036, + "step": 12689 + }, + { + "epoch": 1.9865372573575453, + "grad_norm": 0.4713776409626007, + "learning_rate": 8.471814923427827e-07, + "loss": 0.4385, + "step": 12690 + }, + { + "epoch": 1.9866938008766437, + "grad_norm": 0.7063671946525574, + "learning_rate": 8.390355164548714e-07, + "loss": 0.4692, + "step": 12691 + }, + { + "epoch": 1.9868503443957422, + "grad_norm": 0.7027537226676941, + "learning_rate": 8.308895405669599e-07, + "loss": 0.4983, + 
"step": 12692 + }, + { + "epoch": 1.9870068879148404, + "grad_norm": 0.5473253726959229, + "learning_rate": 8.227435646790486e-07, + "loss": 0.3895, + "step": 12693 + }, + { + "epoch": 1.9871634314339386, + "grad_norm": 1.1711751222610474, + "learning_rate": 8.145975887911373e-07, + "loss": 0.5704, + "step": 12694 + }, + { + "epoch": 1.9873199749530368, + "grad_norm": 0.7432283759117126, + "learning_rate": 8.064516129032258e-07, + "loss": 0.466, + "step": 12695 + }, + { + "epoch": 1.9874765184721352, + "grad_norm": 1.2065191268920898, + "learning_rate": 7.983056370153145e-07, + "loss": 0.635, + "step": 12696 + }, + { + "epoch": 1.9876330619912337, + "grad_norm": 0.8933687806129456, + "learning_rate": 7.90159661127403e-07, + "loss": 0.3887, + "step": 12697 + }, + { + "epoch": 1.987789605510332, + "grad_norm": 0.9646480679512024, + "learning_rate": 7.820136852394918e-07, + "loss": 0.5523, + "step": 12698 + }, + { + "epoch": 1.9879461490294301, + "grad_norm": 0.7593737244606018, + "learning_rate": 7.738677093515804e-07, + "loss": 0.4322, + "step": 12699 + }, + { + "epoch": 1.9881026925485283, + "grad_norm": 1.2853738069534302, + "learning_rate": 7.65721733463669e-07, + "loss": 0.5032, + "step": 12700 + }, + { + "epoch": 1.9882592360676268, + "grad_norm": 1.292449712753296, + "learning_rate": 7.575757575757576e-07, + "loss": 0.498, + "step": 12701 + }, + { + "epoch": 1.9884157795867252, + "grad_norm": 1.1544909477233887, + "learning_rate": 7.494297816878462e-07, + "loss": 0.5187, + "step": 12702 + }, + { + "epoch": 1.9885723231058234, + "grad_norm": 1.3005468845367432, + "learning_rate": 7.412838057999349e-07, + "loss": 0.5354, + "step": 12703 + }, + { + "epoch": 1.9887288666249217, + "grad_norm": 1.322045087814331, + "learning_rate": 7.331378299120236e-07, + "loss": 0.4939, + "step": 12704 + }, + { + "epoch": 1.9888854101440199, + "grad_norm": 1.3371986150741577, + "learning_rate": 7.249918540241121e-07, + "loss": 0.6322, + "step": 12705 + }, + { + "epoch": 
1.9890419536631183, + "grad_norm": 1.1698259115219116, + "learning_rate": 7.168458781362007e-07, + "loss": 0.3984, + "step": 12706 + }, + { + "epoch": 1.9891984971822168, + "grad_norm": 2.5473201274871826, + "learning_rate": 7.086999022482893e-07, + "loss": 0.6787, + "step": 12707 + }, + { + "epoch": 1.989355040701315, + "grad_norm": 1.3283898830413818, + "learning_rate": 7.005539263603781e-07, + "loss": 0.3876, + "step": 12708 + }, + { + "epoch": 1.9895115842204132, + "grad_norm": 1.0113199949264526, + "learning_rate": 6.924079504724667e-07, + "loss": 0.5401, + "step": 12709 + }, + { + "epoch": 1.9896681277395116, + "grad_norm": 2.6165335178375244, + "learning_rate": 6.842619745845553e-07, + "loss": 0.7897, + "step": 12710 + }, + { + "epoch": 1.9898246712586098, + "grad_norm": 2.7641260623931885, + "learning_rate": 6.761159986966438e-07, + "loss": 0.5928, + "step": 12711 + }, + { + "epoch": 1.9899812147777083, + "grad_norm": 1.8080928325653076, + "learning_rate": 6.679700228087325e-07, + "loss": 0.7623, + "step": 12712 + }, + { + "epoch": 1.9901377582968065, + "grad_norm": 3.4666502475738525, + "learning_rate": 6.598240469208212e-07, + "loss": 0.6928, + "step": 12713 + }, + { + "epoch": 1.9902943018159047, + "grad_norm": 1.8880454301834106, + "learning_rate": 6.516780710329098e-07, + "loss": 0.5481, + "step": 12714 + }, + { + "epoch": 1.9904508453350032, + "grad_norm": 2.123213291168213, + "learning_rate": 6.435320951449984e-07, + "loss": 0.5147, + "step": 12715 + }, + { + "epoch": 1.9906073888541016, + "grad_norm": 3.8532602787017822, + "learning_rate": 6.35386119257087e-07, + "loss": 0.5835, + "step": 12716 + }, + { + "epoch": 1.9907639323731998, + "grad_norm": 3.845989942550659, + "learning_rate": 6.272401433691756e-07, + "loss": 0.4829, + "step": 12717 + }, + { + "epoch": 1.990920475892298, + "grad_norm": 2.2973721027374268, + "learning_rate": 6.190941674812643e-07, + "loss": 0.8729, + "step": 12718 + }, + { + "epoch": 1.9910770194113963, + "grad_norm": 
4.127100944519043, + "learning_rate": 6.109481915933529e-07, + "loss": 0.6353, + "step": 12719 + }, + { + "epoch": 1.9912335629304947, + "grad_norm": 2.5507097244262695, + "learning_rate": 6.028022157054416e-07, + "loss": 0.9386, + "step": 12720 + }, + { + "epoch": 1.9913901064495931, + "grad_norm": 3.8045060634613037, + "learning_rate": 5.946562398175302e-07, + "loss": 0.6392, + "step": 12721 + }, + { + "epoch": 1.9915466499686914, + "grad_norm": 3.7975494861602783, + "learning_rate": 5.865102639296188e-07, + "loss": 0.6772, + "step": 12722 + }, + { + "epoch": 1.9917031934877896, + "grad_norm": 2.302764654159546, + "learning_rate": 5.783642880417074e-07, + "loss": 0.986, + "step": 12723 + }, + { + "epoch": 1.9918597370068878, + "grad_norm": 5.310937404632568, + "learning_rate": 5.702183121537961e-07, + "loss": 0.7184, + "step": 12724 + }, + { + "epoch": 1.9920162805259862, + "grad_norm": 2.0385098457336426, + "learning_rate": 5.620723362658847e-07, + "loss": 0.5593, + "step": 12725 + }, + { + "epoch": 1.9921728240450847, + "grad_norm": 4.123274803161621, + "learning_rate": 5.539263603779733e-07, + "loss": 0.774, + "step": 12726 + }, + { + "epoch": 1.9923293675641829, + "grad_norm": 4.707087516784668, + "learning_rate": 5.457803844900619e-07, + "loss": 0.6349, + "step": 12727 + }, + { + "epoch": 1.992485911083281, + "grad_norm": 3.0110580921173096, + "learning_rate": 5.376344086021506e-07, + "loss": 1.0548, + "step": 12728 + }, + { + "epoch": 1.9926424546023793, + "grad_norm": 3.8284475803375244, + "learning_rate": 5.294884327142392e-07, + "loss": 1.5722, + "step": 12729 + }, + { + "epoch": 1.9927989981214778, + "grad_norm": 4.324825286865234, + "learning_rate": 5.213424568263278e-07, + "loss": 1.0087, + "step": 12730 + }, + { + "epoch": 1.9929555416405762, + "grad_norm": 2.772529363632202, + "learning_rate": 5.131964809384164e-07, + "loss": 0.8376, + "step": 12731 + }, + { + "epoch": 1.9931120851596744, + "grad_norm": 2.1954023838043213, + "learning_rate": 
5.050505050505052e-07, + "loss": 0.5357, + "step": 12732 + }, + { + "epoch": 1.9932686286787726, + "grad_norm": NaN, + "learning_rate": 5.050505050505052e-07, + "loss": 0.0, + "step": 12733 + }, + { + "epoch": 1.9934251721978709, + "grad_norm": 3.4823226928710938, + "learning_rate": 4.969045291625938e-07, + "loss": 0.7102, + "step": 12734 + }, + { + "epoch": 1.9935817157169693, + "grad_norm": 8.106609344482422, + "learning_rate": 4.887585532746823e-07, + "loss": 0.7125, + "step": 12735 + }, + { + "epoch": 1.9937382592360677, + "grad_norm": 3.1230461597442627, + "learning_rate": 4.806125773867709e-07, + "loss": 0.5551, + "step": 12736 + }, + { + "epoch": 1.993894802755166, + "grad_norm": 7.425134181976318, + "learning_rate": 4.7246660149885963e-07, + "loss": 1.0922, + "step": 12737 + }, + { + "epoch": 1.9940513462742642, + "grad_norm": 3.2609171867370605, + "learning_rate": 4.6432062561094825e-07, + "loss": 1.1325, + "step": 12738 + }, + { + "epoch": 1.9942078897933626, + "grad_norm": 0.5436992645263672, + "learning_rate": 4.561746497230368e-07, + "loss": 0.4923, + "step": 12739 + }, + { + "epoch": 1.9943644333124608, + "grad_norm": 1.958966612815857, + "learning_rate": 4.4802867383512544e-07, + "loss": 0.5633, + "step": 12740 + }, + { + "epoch": 1.9945209768315593, + "grad_norm": 0.46318382024765015, + "learning_rate": 4.3988269794721416e-07, + "loss": 0.4742, + "step": 12741 + }, + { + "epoch": 1.9946775203506575, + "grad_norm": 0.4730134606361389, + "learning_rate": 4.3173672205930273e-07, + "loss": 0.3989, + "step": 12742 + }, + { + "epoch": 1.9948340638697557, + "grad_norm": 2.257380485534668, + "learning_rate": 4.2359074617139135e-07, + "loss": 0.4152, + "step": 12743 + }, + { + "epoch": 1.9949906073888541, + "grad_norm": 0.620560348033905, + "learning_rate": 4.1544477028347997e-07, + "loss": 0.4866, + "step": 12744 + }, + { + "epoch": 1.9951471509079524, + "grad_norm": 0.9186817407608032, + "learning_rate": 4.0729879439556864e-07, + "loss": 0.5411, + "step": 
12745 + }, + { + "epoch": 1.9953036944270508, + "grad_norm": 0.5580697059631348, + "learning_rate": 3.9915281850765726e-07, + "loss": 0.4342, + "step": 12746 + }, + { + "epoch": 1.995460237946149, + "grad_norm": 0.6352057456970215, + "learning_rate": 3.910068426197459e-07, + "loss": 0.4198, + "step": 12747 + }, + { + "epoch": 1.9956167814652472, + "grad_norm": 1.092692255973816, + "learning_rate": 3.828608667318345e-07, + "loss": 0.4932, + "step": 12748 + }, + { + "epoch": 1.9957733249843457, + "grad_norm": 0.6617681980133057, + "learning_rate": 3.747148908439231e-07, + "loss": 0.4655, + "step": 12749 + }, + { + "epoch": 1.9959298685034441, + "grad_norm": 1.0020225048065186, + "learning_rate": 3.665689149560118e-07, + "loss": 0.5525, + "step": 12750 + }, + { + "epoch": 1.9960864120225423, + "grad_norm": 1.7259622812271118, + "learning_rate": 3.5842293906810036e-07, + "loss": 0.6287, + "step": 12751 + }, + { + "epoch": 1.9962429555416406, + "grad_norm": 1.5244617462158203, + "learning_rate": 3.5027696318018903e-07, + "loss": 0.499, + "step": 12752 + }, + { + "epoch": 1.9963994990607388, + "grad_norm": 1.0839905738830566, + "learning_rate": 3.4213098729227765e-07, + "loss": 0.581, + "step": 12753 + }, + { + "epoch": 1.9965560425798372, + "grad_norm": 1.0463849306106567, + "learning_rate": 3.3398501140436627e-07, + "loss": 0.3825, + "step": 12754 + }, + { + "epoch": 1.9967125860989356, + "grad_norm": 1.4220589399337769, + "learning_rate": 3.258390355164549e-07, + "loss": 0.4434, + "step": 12755 + }, + { + "epoch": 1.9968691296180339, + "grad_norm": 1.950783371925354, + "learning_rate": 3.176930596285435e-07, + "loss": 0.6159, + "step": 12756 + }, + { + "epoch": 1.997025673137132, + "grad_norm": 2.076777219772339, + "learning_rate": 3.0954708374063213e-07, + "loss": 0.5037, + "step": 12757 + }, + { + "epoch": 1.9971822166562303, + "grad_norm": 1.2784727811813354, + "learning_rate": 3.014011078527208e-07, + "loss": 0.4053, + "step": 12758 + }, + { + "epoch": 
1.9973387601753287, + "grad_norm": 2.417647361755371, + "learning_rate": 2.932551319648094e-07, + "loss": 0.4317, + "step": 12759 + }, + { + "epoch": 1.9974953036944272, + "grad_norm": 3.1708128452301025, + "learning_rate": 2.8510915607689804e-07, + "loss": 0.5067, + "step": 12760 + }, + { + "epoch": 1.9976518472135254, + "grad_norm": 3.7259669303894043, + "learning_rate": 2.7696318018898666e-07, + "loss": 0.732, + "step": 12761 + }, + { + "epoch": 1.9978083907326236, + "grad_norm": 2.2094645500183105, + "learning_rate": 2.688172043010753e-07, + "loss": 0.8503, + "step": 12762 + }, + { + "epoch": 1.9979649342517218, + "grad_norm": 2.3610074520111084, + "learning_rate": 2.606712284131639e-07, + "loss": 0.7234, + "step": 12763 + }, + { + "epoch": 1.9981214777708203, + "grad_norm": 2.5917131900787354, + "learning_rate": 2.525252525252526e-07, + "loss": 0.8078, + "step": 12764 + }, + { + "epoch": 1.9982780212899187, + "grad_norm": 3.30293607711792, + "learning_rate": 2.4437927663734114e-07, + "loss": 1.0961, + "step": 12765 + }, + { + "epoch": 1.998434564809017, + "grad_norm": 2.7565383911132812, + "learning_rate": 2.3623330074942982e-07, + "loss": 0.9726, + "step": 12766 + }, + { + "epoch": 1.9985911083281152, + "grad_norm": 2.694244861602783, + "learning_rate": 2.280873248615184e-07, + "loss": 0.8652, + "step": 12767 + }, + { + "epoch": 1.9987476518472134, + "grad_norm": 4.686280250549316, + "learning_rate": 2.1994134897360708e-07, + "loss": 1.5486, + "step": 12768 + }, + { + "epoch": 1.9989041953663118, + "grad_norm": 2.9849295616149902, + "learning_rate": 2.1179537308569567e-07, + "loss": 1.0779, + "step": 12769 + }, + { + "epoch": 1.9990607388854102, + "grad_norm": 4.789377689361572, + "learning_rate": 2.0364939719778432e-07, + "loss": 1.6121, + "step": 12770 + }, + { + "epoch": 1.9992172824045085, + "grad_norm": 3.951878309249878, + "learning_rate": 1.9550342130987294e-07, + "loss": 1.0633, + "step": 12771 + }, + { + "epoch": 1.9993738259236067, + "grad_norm": 
2.656249761581421, + "learning_rate": 1.8735744542196156e-07, + "loss": 0.9705, + "step": 12772 + }, + { + "epoch": 1.9995303694427051, + "grad_norm": 3.799205780029297, + "learning_rate": 1.7921146953405018e-07, + "loss": 0.4256, + "step": 12773 + }, + { + "epoch": 1.9996869129618033, + "grad_norm": 1.307655692100525, + "learning_rate": 1.7106549364613883e-07, + "loss": 0.3585, + "step": 12774 + }, + { + "epoch": 1.9998434564809018, + "grad_norm": 3.594616651535034, + "learning_rate": 1.6291951775822745e-07, + "loss": 1.408, + "step": 12775 + }, + { + "epoch": 2.0, + "grad_norm": 5.270784378051758, + "learning_rate": 1.5477354187031607e-07, + "loss": 1.1571, + "step": 12776 + }, + { + "epoch": 2.0, + "step": 12776, + "total_flos": 1.799905283771071e+19, + "train_loss": 0.9291712401461605, + "train_runtime": 7691.3711, + "train_samples_per_second": 26.575, + "train_steps_per_second": 1.661 + } + ], + "logging_steps": 1.0, + "max_steps": 12776, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.799905283771071e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}