|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.976, |
|
"eval_steps": 125, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002, |
|
"grad_norm": 0.06923668831586838, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4175, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002, |
|
"eval_loss": 0.4618559181690216, |
|
"eval_runtime": 137.9356, |
|
"eval_samples_per_second": 4.002, |
|
"eval_steps_per_second": 0.5, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004, |
|
"grad_norm": 0.09036832302808762, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.5159, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.006, |
|
"grad_norm": 0.06212183088064194, |
|
"learning_rate": 3e-06, |
|
"loss": 0.3274, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 0.089068204164505, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.5353, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.08060520887374878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5229, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.012, |
|
"grad_norm": 0.08129512518644333, |
|
"learning_rate": 6e-06, |
|
"loss": 0.416, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.014, |
|
"grad_norm": 0.13881395757198334, |
|
"learning_rate": 7e-06, |
|
"loss": 0.4797, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.09156442433595657, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.4808, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.018, |
|
"grad_norm": 0.09145132452249527, |
|
"learning_rate": 9e-06, |
|
"loss": 0.4991, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.08622220903635025, |
|
"learning_rate": 1e-05, |
|
"loss": 0.484, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.022, |
|
"grad_norm": 0.07630373537540436, |
|
"learning_rate": 9.999974825027756e-06, |
|
"loss": 0.3951, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.06840338557958603, |
|
"learning_rate": 9.999899300364534e-06, |
|
"loss": 0.4058, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.026, |
|
"grad_norm": 0.09991295635700226, |
|
"learning_rate": 9.999773426770864e-06, |
|
"loss": 0.5737, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.028, |
|
"grad_norm": 0.09987013041973114, |
|
"learning_rate": 9.999597205514298e-06, |
|
"loss": 0.4535, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.07334341108798981, |
|
"learning_rate": 9.999370638369377e-06, |
|
"loss": 0.4047, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.10504010319709778, |
|
"learning_rate": 9.99909372761763e-06, |
|
"loss": 0.4587, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.034, |
|
"grad_norm": 0.12481511384248734, |
|
"learning_rate": 9.998766476047546e-06, |
|
"loss": 0.5568, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.036, |
|
"grad_norm": 0.10193619877099991, |
|
"learning_rate": 9.998388886954546e-06, |
|
"loss": 0.58, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.038, |
|
"grad_norm": 0.09747433662414551, |
|
"learning_rate": 9.997960964140946e-06, |
|
"loss": 0.4248, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.10985693335533142, |
|
"learning_rate": 9.997482711915926e-06, |
|
"loss": 0.5813, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.042, |
|
"grad_norm": 0.08061390370130539, |
|
"learning_rate": 9.99695413509548e-06, |
|
"loss": 0.3419, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.044, |
|
"grad_norm": 0.09820478409528732, |
|
"learning_rate": 9.99637523900237e-06, |
|
"loss": 0.336, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.046, |
|
"grad_norm": 0.11657540500164032, |
|
"learning_rate": 9.995746029466071e-06, |
|
"loss": 0.4634, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.0904548391699791, |
|
"learning_rate": 9.99506651282272e-06, |
|
"loss": 0.4085, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.1137523204088211, |
|
"learning_rate": 9.994336695915041e-06, |
|
"loss": 0.6002, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.052, |
|
"grad_norm": 0.08930382132530212, |
|
"learning_rate": 9.993556586092281e-06, |
|
"loss": 0.4007, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.054, |
|
"grad_norm": 0.10268951207399368, |
|
"learning_rate": 9.992726191210139e-06, |
|
"loss": 0.5762, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 0.11000809073448181, |
|
"learning_rate": 9.991845519630679e-06, |
|
"loss": 0.5878, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.058, |
|
"grad_norm": 0.08394967019557953, |
|
"learning_rate": 9.990914580222258e-06, |
|
"loss": 0.4447, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.10849784314632416, |
|
"learning_rate": 9.989933382359423e-06, |
|
"loss": 0.6129, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.062, |
|
"grad_norm": 0.09749893844127655, |
|
"learning_rate": 9.988901935922826e-06, |
|
"loss": 0.4993, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.09867393970489502, |
|
"learning_rate": 9.987820251299121e-06, |
|
"loss": 0.4415, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.066, |
|
"grad_norm": 0.07566885650157928, |
|
"learning_rate": 9.986688339380863e-06, |
|
"loss": 0.3669, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.068, |
|
"grad_norm": 0.08246949315071106, |
|
"learning_rate": 9.985506211566388e-06, |
|
"loss": 0.4102, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.10148797929286957, |
|
"learning_rate": 9.984273879759713e-06, |
|
"loss": 0.5327, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.08779735118150711, |
|
"learning_rate": 9.982991356370404e-06, |
|
"loss": 0.4914, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.074, |
|
"grad_norm": 0.09165964275598526, |
|
"learning_rate": 9.981658654313458e-06, |
|
"loss": 0.4136, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.076, |
|
"grad_norm": 0.10425784438848495, |
|
"learning_rate": 9.98027578700917e-06, |
|
"loss": 0.6063, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.078, |
|
"grad_norm": 0.09124460816383362, |
|
"learning_rate": 9.978842768382999e-06, |
|
"loss": 0.5461, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.0863451436161995, |
|
"learning_rate": 9.977359612865424e-06, |
|
"loss": 0.5108, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.082, |
|
"grad_norm": 0.11560487747192383, |
|
"learning_rate": 9.975826335391808e-06, |
|
"loss": 0.4965, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.084, |
|
"grad_norm": 0.1319773942232132, |
|
"learning_rate": 9.974242951402236e-06, |
|
"loss": 0.4754, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.086, |
|
"grad_norm": 0.08868485689163208, |
|
"learning_rate": 9.972609476841368e-06, |
|
"loss": 0.4958, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.12390384823083878, |
|
"learning_rate": 9.970925928158275e-06, |
|
"loss": 0.5641, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.095445416867733, |
|
"learning_rate": 9.969192322306271e-06, |
|
"loss": 0.5145, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.092, |
|
"grad_norm": 0.09656377136707306, |
|
"learning_rate": 9.96740867674275e-06, |
|
"loss": 0.3749, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.094, |
|
"grad_norm": 0.07841179519891739, |
|
"learning_rate": 9.965575009429006e-06, |
|
"loss": 0.4113, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.07786890119314194, |
|
"learning_rate": 9.963691338830045e-06, |
|
"loss": 0.4374, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.098, |
|
"grad_norm": 0.09050661325454712, |
|
"learning_rate": 9.961757683914406e-06, |
|
"loss": 0.5285, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.11070208251476288, |
|
"learning_rate": 9.959774064153977e-06, |
|
"loss": 0.5326, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.102, |
|
"grad_norm": 0.09067952632904053, |
|
"learning_rate": 9.957740499523787e-06, |
|
"loss": 0.5613, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.08883544057607651, |
|
"learning_rate": 9.955657010501807e-06, |
|
"loss": 0.4599, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.106, |
|
"grad_norm": 0.10251513868570328, |
|
"learning_rate": 9.95352361806875e-06, |
|
"loss": 0.5354, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.108, |
|
"grad_norm": 0.07133735716342926, |
|
"learning_rate": 9.951340343707852e-06, |
|
"loss": 0.3696, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.061642151325941086, |
|
"learning_rate": 9.949107209404664e-06, |
|
"loss": 0.3472, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.08950634300708771, |
|
"learning_rate": 9.946824237646823e-06, |
|
"loss": 0.4969, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.114, |
|
"grad_norm": 0.08016358315944672, |
|
"learning_rate": 9.944491451423829e-06, |
|
"loss": 0.5239, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.116, |
|
"grad_norm": 0.12512832880020142, |
|
"learning_rate": 9.942108874226812e-06, |
|
"loss": 0.5365, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.118, |
|
"grad_norm": 0.09220532327890396, |
|
"learning_rate": 9.9396765300483e-06, |
|
"loss": 0.4783, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.0885612890124321, |
|
"learning_rate": 9.937194443381972e-06, |
|
"loss": 0.5459, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.122, |
|
"grad_norm": 0.08592379838228226, |
|
"learning_rate": 9.934662639222412e-06, |
|
"loss": 0.4545, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.124, |
|
"grad_norm": 0.08418423682451248, |
|
"learning_rate": 9.93208114306486e-06, |
|
"loss": 0.5105, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.126, |
|
"grad_norm": 0.07870952039957047, |
|
"learning_rate": 9.929449980904952e-06, |
|
"loss": 0.4593, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.08841884881258011, |
|
"learning_rate": 9.926769179238467e-06, |
|
"loss": 0.4812, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.07493194192647934, |
|
"learning_rate": 9.924038765061042e-06, |
|
"loss": 0.5065, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.132, |
|
"grad_norm": 0.08470446616411209, |
|
"learning_rate": 9.921258765867919e-06, |
|
"loss": 0.4676, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.134, |
|
"grad_norm": 0.0656595379114151, |
|
"learning_rate": 9.918429209653662e-06, |
|
"loss": 0.3227, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.06501025706529617, |
|
"learning_rate": 9.915550124911866e-06, |
|
"loss": 0.2777, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.138, |
|
"grad_norm": 0.08443128317594528, |
|
"learning_rate": 9.912621540634889e-06, |
|
"loss": 0.4357, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.07121642678976059, |
|
"learning_rate": 9.909643486313533e-06, |
|
"loss": 0.3545, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.142, |
|
"grad_norm": 0.09408602863550186, |
|
"learning_rate": 9.906615991936781e-06, |
|
"loss": 0.3916, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.05998094752430916, |
|
"learning_rate": 9.903539087991462e-06, |
|
"loss": 0.2739, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.146, |
|
"grad_norm": 0.08949826657772064, |
|
"learning_rate": 9.900412805461968e-06, |
|
"loss": 0.3722, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.148, |
|
"grad_norm": 0.0731697678565979, |
|
"learning_rate": 9.897237175829927e-06, |
|
"loss": 0.2906, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.07855986058712006, |
|
"learning_rate": 9.894012231073895e-06, |
|
"loss": 0.4149, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.0791892409324646, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 0.4383, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.154, |
|
"grad_norm": 0.07980603724718094, |
|
"learning_rate": 9.887414526586764e-06, |
|
"loss": 0.4867, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.156, |
|
"grad_norm": 0.08503536880016327, |
|
"learning_rate": 9.884041833294477e-06, |
|
"loss": 0.4644, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.158, |
|
"grad_norm": 0.09240555018186569, |
|
"learning_rate": 9.880619957755151e-06, |
|
"loss": 0.3107, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.08195238560438156, |
|
"learning_rate": 9.877148934427037e-06, |
|
"loss": 0.3414, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.162, |
|
"grad_norm": 0.09512759745121002, |
|
"learning_rate": 9.873628798263297e-06, |
|
"loss": 0.4745, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.164, |
|
"grad_norm": 0.07976000756025314, |
|
"learning_rate": 9.870059584711668e-06, |
|
"loss": 0.3925, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.166, |
|
"grad_norm": 0.11229317635297775, |
|
"learning_rate": 9.86644132971409e-06, |
|
"loss": 0.4921, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.07479218393564224, |
|
"learning_rate": 9.862774069706346e-06, |
|
"loss": 0.3607, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.08530927449464798, |
|
"learning_rate": 9.859057841617709e-06, |
|
"loss": 0.4116, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.172, |
|
"grad_norm": 0.05544688552618027, |
|
"learning_rate": 9.855292682870552e-06, |
|
"loss": 0.2043, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.174, |
|
"grad_norm": 0.08539939671754837, |
|
"learning_rate": 9.851478631379982e-06, |
|
"loss": 0.4437, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.08732863515615463, |
|
"learning_rate": 9.847615725553457e-06, |
|
"loss": 0.4449, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.178, |
|
"grad_norm": 0.08848625421524048, |
|
"learning_rate": 9.843704004290393e-06, |
|
"loss": 0.5191, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1142885684967041, |
|
"learning_rate": 9.839743506981783e-06, |
|
"loss": 0.3788, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.182, |
|
"grad_norm": 0.0678037703037262, |
|
"learning_rate": 9.835734273509787e-06, |
|
"loss": 0.3655, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.08179458975791931, |
|
"learning_rate": 9.831676344247343e-06, |
|
"loss": 0.4804, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.186, |
|
"grad_norm": 0.10821828246116638, |
|
"learning_rate": 9.827569760057755e-06, |
|
"loss": 0.4946, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.188, |
|
"grad_norm": 0.06980521976947784, |
|
"learning_rate": 9.82341456229428e-06, |
|
"loss": 0.3301, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.07966768741607666, |
|
"learning_rate": 9.819210792799711e-06, |
|
"loss": 0.4377, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.08750802278518677, |
|
"learning_rate": 9.814958493905962e-06, |
|
"loss": 0.4137, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.194, |
|
"grad_norm": 0.08171187341213226, |
|
"learning_rate": 9.810657708433637e-06, |
|
"loss": 0.5154, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.196, |
|
"grad_norm": 0.07627864181995392, |
|
"learning_rate": 9.806308479691595e-06, |
|
"loss": 0.3593, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.198, |
|
"grad_norm": 0.07038850337266922, |
|
"learning_rate": 9.801910851476524e-06, |
|
"loss": 0.3882, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.09910848736763, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": 0.5034, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.202, |
|
"grad_norm": 0.08382704854011536, |
|
"learning_rate": 9.792970574250493e-06, |
|
"loss": 0.4769, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.204, |
|
"grad_norm": 0.07511335611343384, |
|
"learning_rate": 9.788428015268027e-06, |
|
"loss": 0.3703, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.206, |
|
"grad_norm": 0.08155877888202667, |
|
"learning_rate": 9.78383723686861e-06, |
|
"loss": 0.4102, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.06436574459075928, |
|
"learning_rate": 9.779198285281326e-06, |
|
"loss": 0.3253, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.06901544332504272, |
|
"learning_rate": 9.774511207220369e-06, |
|
"loss": 0.2842, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.212, |
|
"grad_norm": 0.08444689959287643, |
|
"learning_rate": 9.769776049884564e-06, |
|
"loss": 0.4212, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.214, |
|
"grad_norm": 0.08550014346837997, |
|
"learning_rate": 9.76499286095689e-06, |
|
"loss": 0.4404, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 0.09659305214881897, |
|
"learning_rate": 9.760161688604008e-06, |
|
"loss": 0.5841, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.218, |
|
"grad_norm": 0.06201549619436264, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 0.2246, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.07813581079244614, |
|
"learning_rate": 9.750355588704728e-06, |
|
"loss": 0.4415, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.222, |
|
"grad_norm": 0.10021974891424179, |
|
"learning_rate": 9.745380759905648e-06, |
|
"loss": 0.3042, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.10321412235498428, |
|
"learning_rate": 9.740358145174999e-06, |
|
"loss": 0.4837, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.226, |
|
"grad_norm": 0.11536537110805511, |
|
"learning_rate": 9.735287795090455e-06, |
|
"loss": 0.5586, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.228, |
|
"grad_norm": 0.07521039247512817, |
|
"learning_rate": 9.730169760710385e-06, |
|
"loss": 0.361, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.07128458470106125, |
|
"learning_rate": 9.725004093573343e-06, |
|
"loss": 0.3511, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.08504608273506165, |
|
"learning_rate": 9.719790845697534e-06, |
|
"loss": 0.4472, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.234, |
|
"grad_norm": 0.08541107177734375, |
|
"learning_rate": 9.71453006958031e-06, |
|
"loss": 0.3195, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.236, |
|
"grad_norm": 0.085638627409935, |
|
"learning_rate": 9.709221818197626e-06, |
|
"loss": 0.4343, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.238, |
|
"grad_norm": 0.06405656784772873, |
|
"learning_rate": 9.703866145003512e-06, |
|
"loss": 0.2905, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.12191811949014664, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.4092, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.242, |
|
"grad_norm": 0.08051154762506485, |
|
"learning_rate": 9.69301274938428e-06, |
|
"loss": 0.3362, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.244, |
|
"grad_norm": 0.09473302215337753, |
|
"learning_rate": 9.687515136252732e-06, |
|
"loss": 0.3941, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.246, |
|
"grad_norm": 0.09992998838424683, |
|
"learning_rate": 9.681970319895804e-06, |
|
"loss": 0.4603, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 0.08887780457735062, |
|
"learning_rate": 9.676378356149733e-06, |
|
"loss": 0.3082, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.08823645859956741, |
|
"learning_rate": 9.670739301325534e-06, |
|
"loss": 0.4301, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.3706146478652954, |
|
"eval_runtime": 76.5201, |
|
"eval_samples_per_second": 7.214, |
|
"eval_steps_per_second": 0.902, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.252, |
|
"grad_norm": 0.10688935965299606, |
|
"learning_rate": 9.665053212208426e-06, |
|
"loss": 0.3065, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.254, |
|
"grad_norm": 0.09517981857061386, |
|
"learning_rate": 9.659320146057263e-06, |
|
"loss": 0.5437, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.11310486495494843, |
|
"learning_rate": 9.653540160603956e-06, |
|
"loss": 0.6087, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.258, |
|
"grad_norm": 0.08851969987154007, |
|
"learning_rate": 9.647713314052896e-06, |
|
"loss": 0.3598, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.09503145515918732, |
|
"learning_rate": 9.641839665080363e-06, |
|
"loss": 0.338, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.262, |
|
"grad_norm": 0.09553948044776917, |
|
"learning_rate": 9.635919272833938e-06, |
|
"loss": 0.3801, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 0.09811339527368546, |
|
"learning_rate": 9.629952196931902e-06, |
|
"loss": 0.3866, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.266, |
|
"grad_norm": 0.0865439921617508, |
|
"learning_rate": 9.623938497462647e-06, |
|
"loss": 0.4466, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.268, |
|
"grad_norm": 0.09298735857009888, |
|
"learning_rate": 9.617878234984056e-06, |
|
"loss": 0.4413, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.10931612551212311, |
|
"learning_rate": 9.611771470522908e-06, |
|
"loss": 0.3974, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.08798681199550629, |
|
"learning_rate": 9.60561826557425e-06, |
|
"loss": 0.4052, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.274, |
|
"grad_norm": 0.09892652928829193, |
|
"learning_rate": 9.599418682100793e-06, |
|
"loss": 0.4645, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.276, |
|
"grad_norm": 0.10193604230880737, |
|
"learning_rate": 9.59317278253227e-06, |
|
"loss": 0.4064, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.278, |
|
"grad_norm": 0.07900392264127731, |
|
"learning_rate": 9.586880629764817e-06, |
|
"loss": 0.3229, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.08284664154052734, |
|
"learning_rate": 9.580542287160348e-06, |
|
"loss": 0.3703, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.282, |
|
"grad_norm": 0.08164459466934204, |
|
"learning_rate": 9.574157818545902e-06, |
|
"loss": 0.2879, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.284, |
|
"grad_norm": 0.1115422248840332, |
|
"learning_rate": 9.567727288213005e-06, |
|
"loss": 0.4593, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.286, |
|
"grad_norm": 0.09770838916301727, |
|
"learning_rate": 9.561250760917026e-06, |
|
"loss": 0.4133, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.12189961224794388, |
|
"learning_rate": 9.554728301876525e-06, |
|
"loss": 0.5928, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.14093732833862305, |
|
"learning_rate": 9.548159976772593e-06, |
|
"loss": 0.415, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.292, |
|
"grad_norm": 0.11479732394218445, |
|
"learning_rate": 9.541545851748186e-06, |
|
"loss": 0.3691, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.294, |
|
"grad_norm": 0.09249378740787506, |
|
"learning_rate": 9.534885993407474e-06, |
|
"loss": 0.3394, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 0.10194878280162811, |
|
"learning_rate": 9.528180468815155e-06, |
|
"loss": 0.3745, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.298, |
|
"grad_norm": 0.09345925599336624, |
|
"learning_rate": 9.521429345495787e-06, |
|
"loss": 0.3934, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.09919178485870361, |
|
"learning_rate": 9.514632691433108e-06, |
|
"loss": 0.4053, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.302, |
|
"grad_norm": 0.10807909071445465, |
|
"learning_rate": 9.507790575069347e-06, |
|
"loss": 0.4631, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.10555636882781982, |
|
"learning_rate": 9.50090306530454e-06, |
|
"loss": 0.4952, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.306, |
|
"grad_norm": 0.10507559776306152, |
|
"learning_rate": 9.493970231495836e-06, |
|
"loss": 0.294, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.308, |
|
"grad_norm": 0.08718883246183395, |
|
"learning_rate": 9.486992143456792e-06, |
|
"loss": 0.3044, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.10039477050304413, |
|
"learning_rate": 9.47996887145668e-06, |
|
"loss": 0.3736, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 0.09952064603567123, |
|
"learning_rate": 9.47290048621977e-06, |
|
"loss": 0.4359, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.314, |
|
"grad_norm": 0.10663799196481705, |
|
"learning_rate": 9.46578705892462e-06, |
|
"loss": 0.3939, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.316, |
|
"grad_norm": 0.10759017616510391, |
|
"learning_rate": 9.458628661203368e-06, |
|
"loss": 0.4575, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.318, |
|
"grad_norm": 0.08924371749162674, |
|
"learning_rate": 9.451425365140997e-06, |
|
"loss": 0.3525, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.13670168817043304, |
|
"learning_rate": 9.444177243274619e-06, |
|
"loss": 0.5385, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.322, |
|
"grad_norm": 0.10520858317613602, |
|
"learning_rate": 9.43688436859274e-06, |
|
"loss": 0.2964, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.324, |
|
"grad_norm": 0.10608810931444168, |
|
"learning_rate": 9.429546814534528e-06, |
|
"loss": 0.4369, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.326, |
|
"grad_norm": 0.08399061113595963, |
|
"learning_rate": 9.422164654989073e-06, |
|
"loss": 0.3246, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 0.11295214295387268, |
|
"learning_rate": 9.414737964294636e-06, |
|
"loss": 0.4766, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.1255977749824524, |
|
"learning_rate": 9.40726681723791e-06, |
|
"loss": 0.5263, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.332, |
|
"grad_norm": 0.0891086682677269, |
|
"learning_rate": 9.399751289053267e-06, |
|
"loss": 0.2796, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.334, |
|
"grad_norm": 0.12856395542621613, |
|
"learning_rate": 9.392191455421989e-06, |
|
"loss": 0.4485, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.1172974556684494, |
|
"learning_rate": 9.384587392471516e-06, |
|
"loss": 0.542, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.338, |
|
"grad_norm": 0.08675208687782288, |
|
"learning_rate": 9.376939176774678e-06, |
|
"loss": 0.2899, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.11079028248786926, |
|
"learning_rate": 9.369246885348926e-06, |
|
"loss": 0.3732, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.342, |
|
"grad_norm": 0.12667471170425415, |
|
"learning_rate": 9.361510595655545e-06, |
|
"loss": 0.54, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 0.08692082017660141, |
|
"learning_rate": 9.353730385598887e-06, |
|
"loss": 0.3873, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.346, |
|
"grad_norm": 0.1013069748878479, |
|
"learning_rate": 9.345906333525582e-06, |
|
"loss": 0.438, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.348, |
|
"grad_norm": 0.09999188780784607, |
|
"learning_rate": 9.338038518223746e-06, |
|
"loss": 0.4467, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.11317498981952667, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.3912, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.10574603080749512, |
|
"learning_rate": 9.322171915289635e-06, |
|
"loss": 0.3808, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.354, |
|
"grad_norm": 0.1281527876853943, |
|
"learning_rate": 9.314173287433874e-06, |
|
"loss": 0.423, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.356, |
|
"grad_norm": 0.12899580597877502, |
|
"learning_rate": 9.306131215901004e-06, |
|
"loss": 0.4509, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.358, |
|
"grad_norm": 0.10952267050743103, |
|
"learning_rate": 9.298045781674595e-06, |
|
"loss": 0.3512, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1423255354166031, |
|
"learning_rate": 9.289917066174887e-06, |
|
"loss": 0.3631, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.362, |
|
"grad_norm": 0.13039131462574005, |
|
"learning_rate": 9.281745151257946e-06, |
|
"loss": 0.3762, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.364, |
|
"grad_norm": 0.10448655486106873, |
|
"learning_rate": 9.273530119214868e-06, |
|
"loss": 0.3694, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.366, |
|
"grad_norm": 0.0945306122303009, |
|
"learning_rate": 9.265272052770936e-06, |
|
"loss": 0.28, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.10995735973119736, |
|
"learning_rate": 9.256971035084786e-06, |
|
"loss": 0.4849, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.11014600843191147, |
|
"learning_rate": 9.248627149747573e-06, |
|
"loss": 0.3213, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.372, |
|
"grad_norm": 0.09283925592899323, |
|
"learning_rate": 9.24024048078213e-06, |
|
"loss": 0.4077, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.374, |
|
"grad_norm": 0.14395715296268463, |
|
"learning_rate": 9.231811112642121e-06, |
|
"loss": 0.4869, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 0.10785488784313202, |
|
"learning_rate": 9.223339130211194e-06, |
|
"loss": 0.4122, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.378, |
|
"grad_norm": 0.09983161091804504, |
|
"learning_rate": 9.214824618802108e-06, |
|
"loss": 0.3027, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.10121427476406097, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.3055, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.382, |
|
"grad_norm": 0.11393419653177261, |
|
"learning_rate": 9.197668352441025e-06, |
|
"loss": 0.3567, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.132842019200325, |
|
"learning_rate": 9.189026770252437e-06, |
|
"loss": 0.3556, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.386, |
|
"grad_norm": 0.1139449030160904, |
|
"learning_rate": 9.18034300461078e-06, |
|
"loss": 0.4298, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.388, |
|
"grad_norm": 0.09980877488851547, |
|
"learning_rate": 9.171617142961477e-06, |
|
"loss": 0.3853, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.12531818449497223, |
|
"learning_rate": 9.162849273173857e-06, |
|
"loss": 0.4845, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.11148197203874588, |
|
"learning_rate": 9.154039483540273e-06, |
|
"loss": 0.4091, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.394, |
|
"grad_norm": 0.11962081491947174, |
|
"learning_rate": 9.145187862775208e-06, |
|
"loss": 0.371, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.396, |
|
"grad_norm": 0.10789982974529266, |
|
"learning_rate": 9.136294500014387e-06, |
|
"loss": 0.4268, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.398, |
|
"grad_norm": 0.15846121311187744, |
|
"learning_rate": 9.12735948481387e-06, |
|
"loss": 0.6264, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1426246613264084, |
|
"learning_rate": 9.118382907149164e-06, |
|
"loss": 0.4769, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.402, |
|
"grad_norm": 0.1069459393620491, |
|
"learning_rate": 9.109364857414306e-06, |
|
"loss": 0.3708, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.404, |
|
"grad_norm": 0.10732389986515045, |
|
"learning_rate": 9.100305426420957e-06, |
|
"loss": 0.3962, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.406, |
|
"grad_norm": 0.1436106562614441, |
|
"learning_rate": 9.091204705397485e-06, |
|
"loss": 0.4549, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 0.10230587422847748, |
|
"learning_rate": 9.08206278598805e-06, |
|
"loss": 0.3926, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.11367027461528778, |
|
"learning_rate": 9.07287976025168e-06, |
|
"loss": 0.3378, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.412, |
|
"grad_norm": 0.14832234382629395, |
|
"learning_rate": 9.06365572066134e-06, |
|
"loss": 0.4202, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.414, |
|
"grad_norm": 0.10567332804203033, |
|
"learning_rate": 9.05439076010301e-06, |
|
"loss": 0.2904, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.11918513476848602, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.2632, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.418, |
|
"grad_norm": 0.13223537802696228, |
|
"learning_rate": 9.035738449685707e-06, |
|
"loss": 0.4208, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.12573251128196716, |
|
"learning_rate": 9.026351287655294e-06, |
|
"loss": 0.4609, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.422, |
|
"grad_norm": 0.11943136155605316, |
|
"learning_rate": 9.016923580312114e-06, |
|
"loss": 0.3323, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.13152974843978882, |
|
"learning_rate": 9.007455422593077e-06, |
|
"loss": 0.4258, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.426, |
|
"grad_norm": 0.13339808583259583, |
|
"learning_rate": 8.997946909842426e-06, |
|
"loss": 0.5303, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.428, |
|
"grad_norm": 0.11746034771203995, |
|
"learning_rate": 8.988398137810778e-06, |
|
"loss": 0.4109, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.11518029868602753, |
|
"learning_rate": 8.978809202654161e-06, |
|
"loss": 0.4154, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.15307952463626862, |
|
"learning_rate": 8.969180200933048e-06, |
|
"loss": 0.4196, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.434, |
|
"grad_norm": 0.11385340988636017, |
|
"learning_rate": 8.959511229611377e-06, |
|
"loss": 0.3713, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.436, |
|
"grad_norm": 0.1380355805158615, |
|
"learning_rate": 8.949802386055582e-06, |
|
"loss": 0.3891, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.438, |
|
"grad_norm": 0.09614066779613495, |
|
"learning_rate": 8.94005376803361e-06, |
|
"loss": 0.2527, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.12352288514375687, |
|
"learning_rate": 8.930265473713939e-06, |
|
"loss": 0.3737, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.442, |
|
"grad_norm": 0.18210633099079132, |
|
"learning_rate": 8.92043760166458e-06, |
|
"loss": 0.3839, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.444, |
|
"grad_norm": 0.1087498739361763, |
|
"learning_rate": 8.910570250852098e-06, |
|
"loss": 0.3141, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.446, |
|
"grad_norm": 0.11985889822244644, |
|
"learning_rate": 8.900663520640605e-06, |
|
"loss": 0.4606, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.146299347281456, |
|
"learning_rate": 8.890717510790763e-06, |
|
"loss": 0.4094, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.09788361191749573, |
|
"learning_rate": 8.880732321458785e-06, |
|
"loss": 0.2964, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.452, |
|
"grad_norm": 0.09735774993896484, |
|
"learning_rate": 8.870708053195414e-06, |
|
"loss": 0.2646, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.454, |
|
"grad_norm": 0.1293504238128662, |
|
"learning_rate": 8.860644806944917e-06, |
|
"loss": 0.2991, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 0.13126921653747559, |
|
"learning_rate": 8.850542684044078e-06, |
|
"loss": 0.4474, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.458, |
|
"grad_norm": 0.11488878726959229, |
|
"learning_rate": 8.84040178622116e-06, |
|
"loss": 0.3628, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.13861073553562164, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.4022, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.462, |
|
"grad_norm": 0.16164664924144745, |
|
"learning_rate": 8.820004074673433e-06, |
|
"loss": 0.4217, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.10550030320882797, |
|
"learning_rate": 8.809747466353356e-06, |
|
"loss": 0.2927, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.466, |
|
"grad_norm": 0.1035122275352478, |
|
"learning_rate": 8.799452493918586e-06, |
|
"loss": 0.2453, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.468, |
|
"grad_norm": 0.15530018508434296, |
|
"learning_rate": 8.789119261039385e-06, |
|
"loss": 0.3758, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.13951483368873596, |
|
"learning_rate": 8.778747871771293e-06, |
|
"loss": 0.4502, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.13241475820541382, |
|
"learning_rate": 8.768338430554083e-06, |
|
"loss": 0.5012, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.474, |
|
"grad_norm": 0.11370962113142014, |
|
"learning_rate": 8.757891042210713e-06, |
|
"loss": 0.2801, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.476, |
|
"grad_norm": 0.1501305103302002, |
|
"learning_rate": 8.747405811946272e-06, |
|
"loss": 0.4888, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.478, |
|
"grad_norm": 0.1636514514684677, |
|
"learning_rate": 8.736882845346906e-06, |
|
"loss": 0.518, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.11505798250436783, |
|
"learning_rate": 8.726322248378775e-06, |
|
"loss": 0.2627, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.482, |
|
"grad_norm": 0.15717971324920654, |
|
"learning_rate": 8.715724127386971e-06, |
|
"loss": 0.3299, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.484, |
|
"grad_norm": 0.13042742013931274, |
|
"learning_rate": 8.705088589094458e-06, |
|
"loss": 0.351, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.486, |
|
"grad_norm": 0.1414385885000229, |
|
"learning_rate": 8.69441574060099e-06, |
|
"loss": 0.471, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 0.10110446810722351, |
|
"learning_rate": 8.683705689382025e-06, |
|
"loss": 0.2369, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1549258530139923, |
|
"learning_rate": 8.672958543287666e-06, |
|
"loss": 0.4333, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.492, |
|
"grad_norm": 0.11834664642810822, |
|
"learning_rate": 8.662174410541556e-06, |
|
"loss": 0.3182, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.494, |
|
"grad_norm": 0.1529727429151535, |
|
"learning_rate": 8.651353399739787e-06, |
|
"loss": 0.4963, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.14854104816913605, |
|
"learning_rate": 8.640495619849821e-06, |
|
"loss": 0.4514, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.498, |
|
"grad_norm": 0.12271202355623245, |
|
"learning_rate": 8.629601180209382e-06, |
|
"loss": 0.3694, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.11352905631065369, |
|
"learning_rate": 8.61867019052535e-06, |
|
"loss": 0.2978, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.32808247208595276, |
|
"eval_runtime": 76.51, |
|
"eval_samples_per_second": 7.215, |
|
"eval_steps_per_second": 0.902, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.502, |
|
"grad_norm": 0.1511523425579071, |
|
"learning_rate": 8.607702760872679e-06, |
|
"loss": 0.4037, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 0.13344620168209076, |
|
"learning_rate": 8.596699001693257e-06, |
|
"loss": 0.2303, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.506, |
|
"grad_norm": 0.12220989167690277, |
|
"learning_rate": 8.585659023794818e-06, |
|
"loss": 0.4347, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.508, |
|
"grad_norm": 0.1094481498003006, |
|
"learning_rate": 8.574582938349818e-06, |
|
"loss": 0.3089, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.11940666288137436, |
|
"learning_rate": 8.563470856894316e-06, |
|
"loss": 0.2699, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.139656201004982, |
|
"learning_rate": 8.552322891326846e-06, |
|
"loss": 0.2763, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.514, |
|
"grad_norm": 0.11665194481611252, |
|
"learning_rate": 8.541139153907296e-06, |
|
"loss": 0.2695, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.516, |
|
"grad_norm": 0.12714596092700958, |
|
"learning_rate": 8.529919757255783e-06, |
|
"loss": 0.2489, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.518, |
|
"grad_norm": 0.12326015532016754, |
|
"learning_rate": 8.518664814351502e-06, |
|
"loss": 0.3067, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.13826797902584076, |
|
"learning_rate": 8.507374438531606e-06, |
|
"loss": 0.3119, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.522, |
|
"grad_norm": 0.15031856298446655, |
|
"learning_rate": 8.496048743490053e-06, |
|
"loss": 0.3112, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.524, |
|
"grad_norm": 0.14100715517997742, |
|
"learning_rate": 8.48468784327647e-06, |
|
"loss": 0.3878, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.526, |
|
"grad_norm": 0.15813864767551422, |
|
"learning_rate": 8.473291852294986e-06, |
|
"loss": 0.3382, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.15911728143692017, |
|
"learning_rate": 8.461860885303116e-06, |
|
"loss": 0.4177, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.15685637295246124, |
|
"learning_rate": 8.450395057410561e-06, |
|
"loss": 0.3557, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.532, |
|
"grad_norm": 0.13905856013298035, |
|
"learning_rate": 8.438894484078086e-06, |
|
"loss": 0.3323, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.534, |
|
"grad_norm": 0.13344989717006683, |
|
"learning_rate": 8.427359281116335e-06, |
|
"loss": 0.3475, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 0.16016146540641785, |
|
"learning_rate": 8.415789564684673e-06, |
|
"loss": 0.3789, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.538, |
|
"grad_norm": 0.11681054532527924, |
|
"learning_rate": 8.404185451290017e-06, |
|
"loss": 0.2061, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.14662593603134155, |
|
"learning_rate": 8.392547057785662e-06, |
|
"loss": 0.4173, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.542, |
|
"grad_norm": 0.21970625221729279, |
|
"learning_rate": 8.380874501370098e-06, |
|
"loss": 0.5602, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.11630596220493317, |
|
"learning_rate": 8.36916789958584e-06, |
|
"loss": 0.2674, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.546, |
|
"grad_norm": 0.14212217926979065, |
|
"learning_rate": 8.357427370318239e-06, |
|
"loss": 0.2776, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.548, |
|
"grad_norm": 0.14911417663097382, |
|
"learning_rate": 8.345653031794292e-06, |
|
"loss": 0.4463, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.142579585313797, |
|
"learning_rate": 8.33384500258146e-06, |
|
"loss": 0.4963, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 0.14713557064533234, |
|
"learning_rate": 8.322003401586463e-06, |
|
"loss": 0.2642, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.554, |
|
"grad_norm": 0.24756528437137604, |
|
"learning_rate": 8.310128348054093e-06, |
|
"loss": 0.5423, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.556, |
|
"grad_norm": 0.13731062412261963, |
|
"learning_rate": 8.298219961566008e-06, |
|
"loss": 0.3333, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.558, |
|
"grad_norm": 0.18075144290924072, |
|
"learning_rate": 8.286278362039527e-06, |
|
"loss": 0.3733, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.1650344282388687, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": 0.383, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.562, |
|
"grad_norm": 0.18053463101387024, |
|
"learning_rate": 8.262296005211722e-06, |
|
"loss": 0.4359, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.564, |
|
"grad_norm": 0.16192179918289185, |
|
"learning_rate": 8.250255489412464e-06, |
|
"loss": 0.3839, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.566, |
|
"grad_norm": 0.16045285761356354, |
|
"learning_rate": 8.238182243576512e-06, |
|
"loss": 0.4185, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 0.14847232401371002, |
|
"learning_rate": 8.226076389281316e-06, |
|
"loss": 0.43, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.1868700236082077, |
|
"learning_rate": 8.213938048432697e-06, |
|
"loss": 0.3437, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.572, |
|
"grad_norm": 0.1744498908519745, |
|
"learning_rate": 8.201767343263612e-06, |
|
"loss": 0.4926, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.574, |
|
"grad_norm": 0.13156633079051971, |
|
"learning_rate": 8.189564396332927e-06, |
|
"loss": 0.4245, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.17716287076473236, |
|
"learning_rate": 8.177329330524182e-06, |
|
"loss": 0.3134, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.578, |
|
"grad_norm": 0.15387575328350067, |
|
"learning_rate": 8.165062269044353e-06, |
|
"loss": 0.3723, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.11926203221082687, |
|
"learning_rate": 8.152763335422612e-06, |
|
"loss": 0.251, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.582, |
|
"grad_norm": 0.14692164957523346, |
|
"learning_rate": 8.140432653509089e-06, |
|
"loss": 0.3068, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 0.12874449789524078, |
|
"learning_rate": 8.128070347473609e-06, |
|
"loss": 0.3449, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.586, |
|
"grad_norm": 0.1284901350736618, |
|
"learning_rate": 8.115676541804456e-06, |
|
"loss": 0.2336, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.588, |
|
"grad_norm": 0.18448615074157715, |
|
"learning_rate": 8.10325136130712e-06, |
|
"loss": 0.4497, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.18793466687202454, |
|
"learning_rate": 8.090794931103026e-06, |
|
"loss": 0.446, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.11833447217941284, |
|
"learning_rate": 8.078307376628292e-06, |
|
"loss": 0.286, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.594, |
|
"grad_norm": 0.14963407814502716, |
|
"learning_rate": 8.065788823632451e-06, |
|
"loss": 0.329, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.596, |
|
"grad_norm": 0.1394645869731903, |
|
"learning_rate": 8.053239398177191e-06, |
|
"loss": 0.2671, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.598, |
|
"grad_norm": 0.17401300370693207, |
|
"learning_rate": 8.04065922663509e-06, |
|
"loss": 0.5106, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1559733897447586, |
|
"learning_rate": 8.028048435688333e-06, |
|
"loss": 0.259, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.602, |
|
"grad_norm": 0.14853116869926453, |
|
"learning_rate": 8.015407152327448e-06, |
|
"loss": 0.4095, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.604, |
|
"grad_norm": 0.13665775954723358, |
|
"learning_rate": 8.002735503850016e-06, |
|
"loss": 0.379, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.606, |
|
"grad_norm": 0.15187975764274597, |
|
"learning_rate": 7.990033617859396e-06, |
|
"loss": 0.336, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.17993216216564178, |
|
"learning_rate": 7.97730162226344e-06, |
|
"loss": 0.4718, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.14840970933437347, |
|
"learning_rate": 7.964539645273204e-06, |
|
"loss": 0.3572, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.612, |
|
"grad_norm": 0.2386975884437561, |
|
"learning_rate": 7.951747815401651e-06, |
|
"loss": 0.3185, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.614, |
|
"grad_norm": 0.21291233599185944, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 0.362, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 0.16196957230567932, |
|
"learning_rate": 7.92607511256826e-06, |
|
"loss": 0.3024, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.618, |
|
"grad_norm": 0.2727487087249756, |
|
"learning_rate": 7.913194498130252e-06, |
|
"loss": 0.5212, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.1640804558992386, |
|
"learning_rate": 7.900284547855992e-06, |
|
"loss": 0.3948, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.622, |
|
"grad_norm": 0.22003543376922607, |
|
"learning_rate": 7.887345391748533e-06, |
|
"loss": 0.3745, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.1896262764930725, |
|
"learning_rate": 7.874377160105037e-06, |
|
"loss": 0.4448, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.626, |
|
"grad_norm": 0.18609432876110077, |
|
"learning_rate": 7.861379983515449e-06, |
|
"loss": 0.3685, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.628, |
|
"grad_norm": 0.14590106904506683, |
|
"learning_rate": 7.848353992861195e-06, |
|
"loss": 0.3338, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.13211271166801453, |
|
"learning_rate": 7.835299319313854e-06, |
|
"loss": 0.3297, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 0.16736850142478943, |
|
"learning_rate": 7.822216094333847e-06, |
|
"loss": 0.3118, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.634, |
|
"grad_norm": 0.17553502321243286, |
|
"learning_rate": 7.8091044496691e-06, |
|
"loss": 0.3447, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.636, |
|
"grad_norm": 0.17292480170726776, |
|
"learning_rate": 7.795964517353734e-06, |
|
"loss": 0.3152, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.638, |
|
"grad_norm": 0.13962873816490173, |
|
"learning_rate": 7.782796429706721e-06, |
|
"loss": 0.2142, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.19501662254333496, |
|
"learning_rate": 7.769600319330553e-06, |
|
"loss": 0.3923, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.642, |
|
"grad_norm": 0.1338018923997879, |
|
"learning_rate": 7.756376319109917e-06, |
|
"loss": 0.3381, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.644, |
|
"grad_norm": 0.1579694300889969, |
|
"learning_rate": 7.743124562210351e-06, |
|
"loss": 0.37, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.646, |
|
"grad_norm": 0.12136895209550858, |
|
"learning_rate": 7.729845182076896e-06, |
|
"loss": 0.212, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 0.2188921570777893, |
|
"learning_rate": 7.716538312432767e-06, |
|
"loss": 0.3732, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.1570715606212616, |
|
"learning_rate": 7.703204087277989e-06, |
|
"loss": 0.321, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.652, |
|
"grad_norm": 0.19729937613010406, |
|
"learning_rate": 7.689842640888063e-06, |
|
"loss": 0.3955, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.654, |
|
"grad_norm": 0.20023679733276367, |
|
"learning_rate": 7.676454107812608e-06, |
|
"loss": 0.4399, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.14793503284454346, |
|
"learning_rate": 7.663038622873999e-06, |
|
"loss": 0.2922, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.658, |
|
"grad_norm": 0.16386426985263824, |
|
"learning_rate": 7.649596321166024e-06, |
|
"loss": 0.3495, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.15845847129821777, |
|
"learning_rate": 7.636127338052513e-06, |
|
"loss": 0.3607, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.662, |
|
"grad_norm": 0.17752616107463837, |
|
"learning_rate": 7.622631809165972e-06, |
|
"loss": 0.2863, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 0.2213558405637741, |
|
"learning_rate": 7.60910987040623e-06, |
|
"loss": 0.4411, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.666, |
|
"grad_norm": 0.2018650323152542, |
|
"learning_rate": 7.595561657939061e-06, |
|
"loss": 0.418, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.668, |
|
"grad_norm": 0.20029357075691223, |
|
"learning_rate": 7.5819873081948105e-06, |
|
"loss": 0.3025, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1478874832391739, |
|
"learning_rate": 7.568386957867033e-06, |
|
"loss": 0.2437, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.18909971415996552, |
|
"learning_rate": 7.554760743911104e-06, |
|
"loss": 0.3974, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.674, |
|
"grad_norm": 0.16544924676418304, |
|
"learning_rate": 7.541108803542846e-06, |
|
"loss": 0.336, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.676, |
|
"grad_norm": 0.19204874336719513, |
|
"learning_rate": 7.527431274237149e-06, |
|
"loss": 0.3617, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.678, |
|
"grad_norm": 0.1770397573709488, |
|
"learning_rate": 7.5137282937265796e-06, |
|
"loss": 0.3617, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.15880927443504333, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.2993, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.682, |
|
"grad_norm": 0.4031960368156433, |
|
"learning_rate": 7.486246531301178e-06, |
|
"loss": 0.3137, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.684, |
|
"grad_norm": 0.17426829040050507, |
|
"learning_rate": 7.472468026127385e-06, |
|
"loss": 0.3712, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.686, |
|
"grad_norm": 0.16782499849796295, |
|
"learning_rate": 7.45866462322802e-06, |
|
"loss": 0.359, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.20207028090953827, |
|
"learning_rate": 7.444836461603195e-06, |
|
"loss": 0.4301, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.18788397312164307, |
|
"learning_rate": 7.430983680502344e-06, |
|
"loss": 0.3609, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.692, |
|
"grad_norm": 0.16447116434574127, |
|
"learning_rate": 7.4171064194228196e-06, |
|
"loss": 0.3514, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.694, |
|
"grad_norm": 0.15939724445343018, |
|
"learning_rate": 7.403204818108487e-06, |
|
"loss": 0.2747, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 0.2825759947299957, |
|
"learning_rate": 7.3892790165483164e-06, |
|
"loss": 0.5376, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.698, |
|
"grad_norm": 0.15753747522830963, |
|
"learning_rate": 7.3753291549749764e-06, |
|
"loss": 0.2741, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.19103243947029114, |
|
"learning_rate": 7.361355373863415e-06, |
|
"loss": 0.3088, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.702, |
|
"grad_norm": 0.18185654282569885, |
|
"learning_rate": 7.347357813929455e-06, |
|
"loss": 0.3204, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.15075427293777466, |
|
"learning_rate": 7.333336616128369e-06, |
|
"loss": 0.2885, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.706, |
|
"grad_norm": 0.14092062413692474, |
|
"learning_rate": 7.319291921653464e-06, |
|
"loss": 0.2423, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.708, |
|
"grad_norm": 0.11944609135389328, |
|
"learning_rate": 7.305223871934657e-06, |
|
"loss": 0.1367, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2248326539993286, |
|
"learning_rate": 7.291132608637053e-06, |
|
"loss": 0.4119, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 0.1844269186258316, |
|
"learning_rate": 7.2770182736595164e-06, |
|
"loss": 0.2714, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.714, |
|
"grad_norm": 0.19066232442855835, |
|
"learning_rate": 7.262881009133242e-06, |
|
"loss": 0.432, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.716, |
|
"grad_norm": 0.21767167747020721, |
|
"learning_rate": 7.24872095742033e-06, |
|
"loss": 0.3804, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.718, |
|
"grad_norm": 0.14823076128959656, |
|
"learning_rate": 7.234538261112342e-06, |
|
"loss": 0.3182, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1661371886730194, |
|
"learning_rate": 7.2203330630288714e-06, |
|
"loss": 0.3078, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.722, |
|
"grad_norm": 0.18412846326828003, |
|
"learning_rate": 7.206105506216107e-06, |
|
"loss": 0.4066, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.724, |
|
"grad_norm": 0.17892518639564514, |
|
"learning_rate": 7.191855733945388e-06, |
|
"loss": 0.4772, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.726, |
|
"grad_norm": 0.24270282685756683, |
|
"learning_rate": 7.177583889711763e-06, |
|
"loss": 0.3902, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 0.187135249376297, |
|
"learning_rate": 7.163290117232542e-06, |
|
"loss": 0.3154, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.20502962172031403, |
|
"learning_rate": 7.148974560445859e-06, |
|
"loss": 0.3599, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.732, |
|
"grad_norm": 0.1704569160938263, |
|
"learning_rate": 7.1346373635092095e-06, |
|
"loss": 0.3705, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.734, |
|
"grad_norm": 0.20562830567359924, |
|
"learning_rate": 7.12027867079801e-06, |
|
"loss": 0.3169, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.19051577150821686, |
|
"learning_rate": 7.105898626904134e-06, |
|
"loss": 0.4571, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.738, |
|
"grad_norm": 0.18842366337776184, |
|
"learning_rate": 7.0914973766344645e-06, |
|
"loss": 0.2771, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.14864154160022736, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 0.2184, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.742, |
|
"grad_norm": 0.1662212610244751, |
|
"learning_rate": 7.062631837261556e-06, |
|
"loss": 0.2706, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.15230734646320343, |
|
"learning_rate": 7.048167838833977e-06, |
|
"loss": 0.2611, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.746, |
|
"grad_norm": 0.16176356375217438, |
|
"learning_rate": 7.033683215379002e-06, |
|
"loss": 0.3144, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.748, |
|
"grad_norm": 0.16796669363975525, |
|
"learning_rate": 7.019178112756625e-06, |
|
"loss": 0.3742, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.16455894708633423, |
|
"learning_rate": 7.004652677033069e-06, |
|
"loss": 0.2426, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.2979236841201782, |
|
"eval_runtime": 76.5795, |
|
"eval_samples_per_second": 7.208, |
|
"eval_steps_per_second": 0.901, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.22792088985443115, |
|
"learning_rate": 6.990107054479313e-06, |
|
"loss": 0.319, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.754, |
|
"grad_norm": 0.24258168041706085, |
|
"learning_rate": 6.9755413915696105e-06, |
|
"loss": 0.5036, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.756, |
|
"grad_norm": 0.17646639049053192, |
|
"learning_rate": 6.960955834980028e-06, |
|
"loss": 0.3024, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.758, |
|
"grad_norm": 0.15006083250045776, |
|
"learning_rate": 6.946350531586959e-06, |
|
"loss": 0.2702, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.15430916845798492, |
|
"learning_rate": 6.931725628465643e-06, |
|
"loss": 0.2492, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.762, |
|
"grad_norm": 0.13274860382080078, |
|
"learning_rate": 6.917081272888697e-06, |
|
"loss": 0.2188, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.764, |
|
"grad_norm": 0.12552917003631592, |
|
"learning_rate": 6.902417612324615e-06, |
|
"loss": 0.2275, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.766, |
|
"grad_norm": 0.14306232333183289, |
|
"learning_rate": 6.887734794436301e-06, |
|
"loss": 0.3204, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.18567156791687012, |
|
"learning_rate": 6.873032967079562e-06, |
|
"loss": 0.4079, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.18761208653450012, |
|
"learning_rate": 6.858312278301638e-06, |
|
"loss": 0.2944, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.772, |
|
"grad_norm": 0.18265055119991302, |
|
"learning_rate": 6.8435728763397045e-06, |
|
"loss": 0.4399, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.774, |
|
"grad_norm": 0.18840709328651428, |
|
"learning_rate": 6.828814909619374e-06, |
|
"loss": 0.4057, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 0.19235002994537354, |
|
"learning_rate": 6.814038526753205e-06, |
|
"loss": 0.2826, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.778, |
|
"grad_norm": 0.1880473792552948, |
|
"learning_rate": 6.799243876539213e-06, |
|
"loss": 0.3739, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.29550889134407043, |
|
"learning_rate": 6.78443110795936e-06, |
|
"loss": 0.3594, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.782, |
|
"grad_norm": 0.19335615634918213, |
|
"learning_rate": 6.76960037017806e-06, |
|
"loss": 0.4026, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.14000019431114197, |
|
"learning_rate": 6.75475181254068e-06, |
|
"loss": 0.2576, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.786, |
|
"grad_norm": 0.15106743574142456, |
|
"learning_rate": 6.739885584572026e-06, |
|
"loss": 0.2538, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.788, |
|
"grad_norm": 0.19910076260566711, |
|
"learning_rate": 6.725001835974854e-06, |
|
"loss": 0.2867, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.22941169142723083, |
|
"learning_rate": 6.710100716628345e-06, |
|
"loss": 0.3183, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.1540730744600296, |
|
"learning_rate": 6.695182376586603e-06, |
|
"loss": 0.31, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.794, |
|
"grad_norm": 0.18420648574829102, |
|
"learning_rate": 6.680246966077151e-06, |
|
"loss": 0.388, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.796, |
|
"grad_norm": 0.14336371421813965, |
|
"learning_rate": 6.665294635499404e-06, |
|
"loss": 0.3359, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.798, |
|
"grad_norm": 0.21092049777507782, |
|
"learning_rate": 6.650325535423166e-06, |
|
"loss": 0.2935, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.23870034515857697, |
|
"learning_rate": 6.635339816587109e-06, |
|
"loss": 0.3413, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.802, |
|
"grad_norm": 0.21548299491405487, |
|
"learning_rate": 6.6203376298972535e-06, |
|
"loss": 0.4255, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.804, |
|
"grad_norm": 0.21555306017398834, |
|
"learning_rate": 6.605319126425455e-06, |
|
"loss": 0.4044, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.806, |
|
"grad_norm": 0.212354838848114, |
|
"learning_rate": 6.590284457407876e-06, |
|
"loss": 0.3225, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 0.17822064459323883, |
|
"learning_rate": 6.5752337742434644e-06, |
|
"loss": 0.3449, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.15272925794124603, |
|
"learning_rate": 6.560167228492436e-06, |
|
"loss": 0.2732, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.812, |
|
"grad_norm": 0.18225990235805511, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.3326, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.814, |
|
"grad_norm": 0.1854051798582077, |
|
"learning_rate": 6.529987156268527e-06, |
|
"loss": 0.3603, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.17678527534008026, |
|
"learning_rate": 6.514873933708637e-06, |
|
"loss": 0.2996, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.818, |
|
"grad_norm": 0.35500454902648926, |
|
"learning_rate": 6.499745456385054e-06, |
|
"loss": 0.4185, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.18555931746959686, |
|
"learning_rate": 6.484601876641375e-06, |
|
"loss": 0.2208, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.822, |
|
"grad_norm": 0.16834326088428497, |
|
"learning_rate": 6.469443346973281e-06, |
|
"loss": 0.3684, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 0.1469370424747467, |
|
"learning_rate": 6.454270020026996e-06, |
|
"loss": 0.2526, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.826, |
|
"grad_norm": 0.19754226505756378, |
|
"learning_rate": 6.439082048597755e-06, |
|
"loss": 0.3341, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.828, |
|
"grad_norm": 0.15154729783535004, |
|
"learning_rate": 6.423879585628262e-06, |
|
"loss": 0.2402, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.20265011489391327, |
|
"learning_rate": 6.408662784207149e-06, |
|
"loss": 0.374, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.2674030065536499, |
|
"learning_rate": 6.39343179756744e-06, |
|
"loss": 0.3057, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.834, |
|
"grad_norm": 0.1473691463470459, |
|
"learning_rate": 6.378186779084996e-06, |
|
"loss": 0.3684, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.836, |
|
"grad_norm": 0.2826951742172241, |
|
"learning_rate": 6.362927882276991e-06, |
|
"loss": 0.2585, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.838, |
|
"grad_norm": 0.20093302428722382, |
|
"learning_rate": 6.34765526080034e-06, |
|
"loss": 0.3041, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.1346312314271927, |
|
"learning_rate": 6.332369068450175e-06, |
|
"loss": 0.2105, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.842, |
|
"grad_norm": 0.16400040686130524, |
|
"learning_rate": 6.317069459158284e-06, |
|
"loss": 0.2832, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.844, |
|
"grad_norm": 0.19443334639072418, |
|
"learning_rate": 6.301756586991561e-06, |
|
"loss": 0.3353, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.846, |
|
"grad_norm": 0.22223643958568573, |
|
"learning_rate": 6.286430606150458e-06, |
|
"loss": 0.384, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.16762332618236542, |
|
"learning_rate": 6.271091670967437e-06, |
|
"loss": 0.3826, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.26455458998680115, |
|
"learning_rate": 6.255739935905396e-06, |
|
"loss": 0.4419, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.852, |
|
"grad_norm": 0.1570374071598053, |
|
"learning_rate": 6.240375555556145e-06, |
|
"loss": 0.2199, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.854, |
|
"grad_norm": 0.16800148785114288, |
|
"learning_rate": 6.22499868463882e-06, |
|
"loss": 0.2561, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 0.17082828283309937, |
|
"learning_rate": 6.209609477998339e-06, |
|
"loss": 0.3317, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.858, |
|
"grad_norm": 0.26214951276779175, |
|
"learning_rate": 6.194208090603845e-06, |
|
"loss": 0.4105, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.17318500578403473, |
|
"learning_rate": 6.178794677547138e-06, |
|
"loss": 0.2216, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.862, |
|
"grad_norm": 0.18394838273525238, |
|
"learning_rate": 6.163369394041112e-06, |
|
"loss": 0.3251, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.2352125197649002, |
|
"learning_rate": 6.1479323954182055e-06, |
|
"loss": 0.349, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.866, |
|
"grad_norm": 0.18627074360847473, |
|
"learning_rate": 6.132483837128823e-06, |
|
"loss": 0.3048, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.868, |
|
"grad_norm": 0.2253945916891098, |
|
"learning_rate": 6.1170238747397715e-06, |
|
"loss": 0.3081, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.1479015201330185, |
|
"learning_rate": 6.101552663932704e-06, |
|
"loss": 0.192, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 0.1954430192708969, |
|
"learning_rate": 6.08607036050254e-06, |
|
"loss": 0.2251, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.874, |
|
"grad_norm": 0.16169880330562592, |
|
"learning_rate": 6.070577120355903e-06, |
|
"loss": 0.2765, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.876, |
|
"grad_norm": 0.19537843763828278, |
|
"learning_rate": 6.055073099509549e-06, |
|
"loss": 0.2724, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.878, |
|
"grad_norm": 0.1675713211297989, |
|
"learning_rate": 6.039558454088796e-06, |
|
"loss": 0.3164, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.27977389097213745, |
|
"learning_rate": 6.024033340325954e-06, |
|
"loss": 0.4432, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.882, |
|
"grad_norm": 0.1879289448261261, |
|
"learning_rate": 6.0084979145587444e-06, |
|
"loss": 0.3558, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.884, |
|
"grad_norm": 0.16285355389118195, |
|
"learning_rate": 5.9929523332287275e-06, |
|
"loss": 0.3014, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.886, |
|
"grad_norm": 0.2135494202375412, |
|
"learning_rate": 5.977396752879742e-06, |
|
"loss": 0.3124, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 0.21992646157741547, |
|
"learning_rate": 5.961831330156306e-06, |
|
"loss": 0.3152, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.34824761748313904, |
|
"learning_rate": 5.946256221802052e-06, |
|
"loss": 0.4022, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.892, |
|
"grad_norm": 0.3176579177379608, |
|
"learning_rate": 5.930671584658151e-06, |
|
"loss": 0.3373, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.894, |
|
"grad_norm": 0.13881681859493256, |
|
"learning_rate": 5.915077575661723e-06, |
|
"loss": 0.2732, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.23585429787635803, |
|
"learning_rate": 5.89947435184427e-06, |
|
"loss": 0.383, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.898, |
|
"grad_norm": 0.20338225364685059, |
|
"learning_rate": 5.883862070330079e-06, |
|
"loss": 0.3929, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5738399028778076, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.3834, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.902, |
|
"grad_norm": 0.16114148497581482, |
|
"learning_rate": 5.85261096316312e-06, |
|
"loss": 0.2351, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 0.16090261936187744, |
|
"learning_rate": 5.8369724522086545e-06, |
|
"loss": 0.2264, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.906, |
|
"grad_norm": 0.1992426961660385, |
|
"learning_rate": 5.821325512950886e-06, |
|
"loss": 0.3239, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.908, |
|
"grad_norm": 0.1780838966369629, |
|
"learning_rate": 5.805670302954322e-06, |
|
"loss": 0.2997, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.24148645997047424, |
|
"learning_rate": 5.79000697986675e-06, |
|
"loss": 0.3701, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.1544380933046341, |
|
"learning_rate": 5.774335701417662e-06, |
|
"loss": 0.1843, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.914, |
|
"grad_norm": 0.20772896707057953, |
|
"learning_rate": 5.758656625416659e-06, |
|
"loss": 0.3617, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.916, |
|
"grad_norm": 0.2054608017206192, |
|
"learning_rate": 5.7429699097518585e-06, |
|
"loss": 0.3286, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.918, |
|
"grad_norm": 0.1513553261756897, |
|
"learning_rate": 5.727275712388318e-06, |
|
"loss": 0.2149, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20221109688282013, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": 0.2895, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.922, |
|
"grad_norm": 0.26075002551078796, |
|
"learning_rate": 5.695865504800328e-06, |
|
"loss": 0.3115, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.924, |
|
"grad_norm": 0.2223353236913681, |
|
"learning_rate": 5.680149810876322e-06, |
|
"loss": 0.3065, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.926, |
|
"grad_norm": 0.18663600087165833, |
|
"learning_rate": 5.664427267851271e-06, |
|
"loss": 0.2444, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.19538210332393646, |
|
"learning_rate": 5.648698034051009e-06, |
|
"loss": 0.3877, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.1691403090953827, |
|
"learning_rate": 5.632962267868747e-06, |
|
"loss": 0.2445, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.932, |
|
"grad_norm": 0.1581772416830063, |
|
"learning_rate": 5.617220127763474e-06, |
|
"loss": 0.3217, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.934, |
|
"grad_norm": 0.20001822710037231, |
|
"learning_rate": 5.601471772258368e-06, |
|
"loss": 0.3184, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 0.3052047789096832, |
|
"learning_rate": 5.585717359939192e-06, |
|
"loss": 0.3479, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.938, |
|
"grad_norm": 0.23681974411010742, |
|
"learning_rate": 5.569957049452703e-06, |
|
"loss": 0.3403, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.12364782392978668, |
|
"learning_rate": 5.5541909995050554e-06, |
|
"loss": 0.2085, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.942, |
|
"grad_norm": 0.1526976227760315, |
|
"learning_rate": 5.538419368860196e-06, |
|
"loss": 0.2281, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.2230585813522339, |
|
"learning_rate": 5.522642316338268e-06, |
|
"loss": 0.3351, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.946, |
|
"grad_norm": 0.17690080404281616, |
|
"learning_rate": 5.506860000814017e-06, |
|
"loss": 0.2985, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.948, |
|
"grad_norm": 0.1738656908273697, |
|
"learning_rate": 5.491072581215186e-06, |
|
"loss": 0.247, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.18501204252243042, |
|
"learning_rate": 5.475280216520913e-06, |
|
"loss": 0.2646, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 0.19721092283725739, |
|
"learning_rate": 5.459483065760138e-06, |
|
"loss": 0.2876, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.954, |
|
"grad_norm": 0.16680027544498444, |
|
"learning_rate": 5.443681288009991e-06, |
|
"loss": 0.2167, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.956, |
|
"grad_norm": 0.17918136715888977, |
|
"learning_rate": 5.4278750423942e-06, |
|
"loss": 0.3997, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.958, |
|
"grad_norm": 0.15725551545619965, |
|
"learning_rate": 5.412064488081482e-06, |
|
"loss": 0.2829, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.19459596276283264, |
|
"learning_rate": 5.396249784283943e-06, |
|
"loss": 0.3373, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.962, |
|
"grad_norm": 0.32756415009498596, |
|
"learning_rate": 5.380431090255475e-06, |
|
"loss": 0.4206, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.964, |
|
"grad_norm": 0.19843968749046326, |
|
"learning_rate": 5.364608565290154e-06, |
|
"loss": 0.3385, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.966, |
|
"grad_norm": 0.15863648056983948, |
|
"learning_rate": 5.348782368720627e-06, |
|
"loss": 0.2524, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 0.21220897138118744, |
|
"learning_rate": 5.33295265991652e-06, |
|
"loss": 0.2326, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.24547149240970612, |
|
"learning_rate": 5.317119598282823e-06, |
|
"loss": 0.3854, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.972, |
|
"grad_norm": 0.2009747326374054, |
|
"learning_rate": 5.301283343258293e-06, |
|
"loss": 0.3141, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.974, |
|
"grad_norm": 0.22629286348819733, |
|
"learning_rate": 5.285444054313841e-06, |
|
"loss": 0.3044, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.18528909981250763, |
|
"learning_rate": 5.26960189095093e-06, |
|
"loss": 0.3056, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.978, |
|
"grad_norm": 0.18446871638298035, |
|
"learning_rate": 5.253757012699972e-06, |
|
"loss": 0.3206, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.1961178332567215, |
|
"learning_rate": 5.237909579118713e-06, |
|
"loss": 0.386, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.982, |
|
"grad_norm": 0.20445547997951508, |
|
"learning_rate": 5.2220597497906315e-06, |
|
"loss": 0.3997, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 0.17709751427173615, |
|
"learning_rate": 5.206207684323337e-06, |
|
"loss": 0.3212, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.986, |
|
"grad_norm": 0.15768595039844513, |
|
"learning_rate": 5.190353542346951e-06, |
|
"loss": 0.2752, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.988, |
|
"grad_norm": 0.14925841987133026, |
|
"learning_rate": 5.174497483512506e-06, |
|
"loss": 0.2593, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.2051381766796112, |
|
"learning_rate": 5.15863966749034e-06, |
|
"loss": 0.3941, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.2395932674407959, |
|
"learning_rate": 5.142780253968481e-06, |
|
"loss": 0.3136, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.994, |
|
"grad_norm": 0.2152215540409088, |
|
"learning_rate": 5.126919402651053e-06, |
|
"loss": 0.3083, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.996, |
|
"grad_norm": 0.17021948099136353, |
|
"learning_rate": 5.111057273256648e-06, |
|
"loss": 0.3185, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.998, |
|
"grad_norm": 0.22681966423988342, |
|
"learning_rate": 5.095194025516733e-06, |
|
"loss": 0.4107, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.22234933078289032, |
|
"learning_rate": 5.07932981917404e-06, |
|
"loss": 0.3672, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.27911150455474854, |
|
"eval_runtime": 76.7158, |
|
"eval_samples_per_second": 7.195, |
|
"eval_steps_per_second": 0.899, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.002, |
|
"grad_norm": 0.18890836834907532, |
|
"learning_rate": 5.063464813980948e-06, |
|
"loss": 0.2277, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.004, |
|
"grad_norm": 0.19094686210155487, |
|
"learning_rate": 5.0475991696978844e-06, |
|
"loss": 0.3602, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.006, |
|
"grad_norm": 0.24123992025852203, |
|
"learning_rate": 5.03173304609171e-06, |
|
"loss": 0.2796, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.2091682106256485, |
|
"learning_rate": 5.015866602934112e-06, |
|
"loss": 0.333, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.21148917078971863, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4005, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.012, |
|
"grad_norm": 0.14547854661941528, |
|
"learning_rate": 4.984133397065889e-06, |
|
"loss": 0.2223, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.014, |
|
"grad_norm": 0.23349957168102264, |
|
"learning_rate": 4.9682669539082914e-06, |
|
"loss": 0.3264, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 0.16822971403598785, |
|
"learning_rate": 4.952400830302117e-06, |
|
"loss": 0.3151, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.018, |
|
"grad_norm": 0.1795063018798828, |
|
"learning_rate": 4.936535186019053e-06, |
|
"loss": 0.2896, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.19863282144069672, |
|
"learning_rate": 4.9206701808259605e-06, |
|
"loss": 0.2481, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.022, |
|
"grad_norm": 0.18788766860961914, |
|
"learning_rate": 4.904805974483267e-06, |
|
"loss": 0.3513, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.1949293315410614, |
|
"learning_rate": 4.888942726743353e-06, |
|
"loss": 0.2264, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.002, |
|
"grad_norm": 0.16474653780460358, |
|
"learning_rate": 4.873080597348948e-06, |
|
"loss": 0.2793, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.004, |
|
"grad_norm": 0.20230461657047272, |
|
"learning_rate": 4.85721974603152e-06, |
|
"loss": 0.3618, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.006, |
|
"grad_norm": 0.16907107830047607, |
|
"learning_rate": 4.841360332509663e-06, |
|
"loss": 0.2708, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 0.22199520468711853, |
|
"learning_rate": 4.825502516487497e-06, |
|
"loss": 0.3405, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.17370116710662842, |
|
"learning_rate": 4.809646457653051e-06, |
|
"loss": 0.2715, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.012, |
|
"grad_norm": 0.21842899918556213, |
|
"learning_rate": 4.793792315676665e-06, |
|
"loss": 0.1802, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.014, |
|
"grad_norm": 0.1792248785495758, |
|
"learning_rate": 4.777940250209369e-06, |
|
"loss": 0.1912, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 0.24431253969669342, |
|
"learning_rate": 4.762090420881289e-06, |
|
"loss": 0.3494, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.018, |
|
"grad_norm": 0.1893794983625412, |
|
"learning_rate": 4.74624298730003e-06, |
|
"loss": 0.246, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.29100745916366577, |
|
"learning_rate": 4.7303981090490715e-06, |
|
"loss": 0.4553, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.022, |
|
"grad_norm": 0.21313871443271637, |
|
"learning_rate": 4.71455594568616e-06, |
|
"loss": 0.3414, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.257988840341568, |
|
"learning_rate": 4.6987166567417085e-06, |
|
"loss": 0.3223, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.026, |
|
"grad_norm": 0.1500207781791687, |
|
"learning_rate": 4.682880401717178e-06, |
|
"loss": 0.2883, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.028, |
|
"grad_norm": 0.2195630818605423, |
|
"learning_rate": 4.667047340083481e-06, |
|
"loss": 0.4185, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.24663732945919037, |
|
"learning_rate": 4.651217631279374e-06, |
|
"loss": 0.312, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 0.23168163001537323, |
|
"learning_rate": 4.635391434709847e-06, |
|
"loss": 0.3826, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.034, |
|
"grad_norm": 0.20334544777870178, |
|
"learning_rate": 4.619568909744524e-06, |
|
"loss": 0.302, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.036, |
|
"grad_norm": 0.2471403032541275, |
|
"learning_rate": 4.603750215716057e-06, |
|
"loss": 0.3024, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.038, |
|
"grad_norm": 0.19385652244091034, |
|
"learning_rate": 4.587935511918521e-06, |
|
"loss": 0.2803, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.24697639048099518, |
|
"learning_rate": 4.572124957605803e-06, |
|
"loss": 0.4114, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.042, |
|
"grad_norm": 0.24823316931724548, |
|
"learning_rate": 4.55631871199001e-06, |
|
"loss": 0.3705, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.044, |
|
"grad_norm": 0.1970013827085495, |
|
"learning_rate": 4.5405169342398634e-06, |
|
"loss": 0.3608, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.046, |
|
"grad_norm": 0.20955346524715424, |
|
"learning_rate": 4.524719783479088e-06, |
|
"loss": 0.347, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 0.1911235898733139, |
|
"learning_rate": 4.5089274187848144e-06, |
|
"loss": 0.2342, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.22940923273563385, |
|
"learning_rate": 4.493139999185984e-06, |
|
"loss": 0.2803, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.052, |
|
"grad_norm": 0.24347023665905, |
|
"learning_rate": 4.477357683661734e-06, |
|
"loss": 0.3833, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.054, |
|
"grad_norm": 0.24687382578849792, |
|
"learning_rate": 4.461580631139806e-06, |
|
"loss": 0.3467, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.15779221057891846, |
|
"learning_rate": 4.445809000494945e-06, |
|
"loss": 0.2781, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.058, |
|
"grad_norm": 0.20665578544139862, |
|
"learning_rate": 4.430042950547298e-06, |
|
"loss": 0.4656, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.24457348883152008, |
|
"learning_rate": 4.414282640060809e-06, |
|
"loss": 0.2684, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.062, |
|
"grad_norm": 0.20804962515830994, |
|
"learning_rate": 4.398528227741634e-06, |
|
"loss": 0.3577, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.2586953043937683, |
|
"learning_rate": 4.382779872236527e-06, |
|
"loss": 0.3492, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.066, |
|
"grad_norm": 0.26488688588142395, |
|
"learning_rate": 4.367037732131254e-06, |
|
"loss": 0.3954, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.068, |
|
"grad_norm": 0.15630888938903809, |
|
"learning_rate": 4.3513019659489916e-06, |
|
"loss": 0.1673, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.15465758740901947, |
|
"learning_rate": 4.33557273214873e-06, |
|
"loss": 0.2532, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.25680503249168396, |
|
"learning_rate": 4.319850189123681e-06, |
|
"loss": 0.3065, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.074, |
|
"grad_norm": 0.24224849045276642, |
|
"learning_rate": 4.304134495199675e-06, |
|
"loss": 0.4157, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.076, |
|
"grad_norm": 0.1849289834499359, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": 0.3611, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.078, |
|
"grad_norm": 0.2488396316766739, |
|
"learning_rate": 4.272724287611684e-06, |
|
"loss": 0.313, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.23535999655723572, |
|
"learning_rate": 4.257030090248142e-06, |
|
"loss": 0.3165, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.082, |
|
"grad_norm": 0.19105635583400726, |
|
"learning_rate": 4.241343374583343e-06, |
|
"loss": 0.2779, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.084, |
|
"grad_norm": 0.22108493745326996, |
|
"learning_rate": 4.225664298582339e-06, |
|
"loss": 0.3312, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.086, |
|
"grad_norm": 0.18127895891666412, |
|
"learning_rate": 4.209993020133251e-06, |
|
"loss": 0.2099, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.304030179977417, |
|
"learning_rate": 4.194329697045681e-06, |
|
"loss": 0.4397, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.16876006126403809, |
|
"learning_rate": 4.178674487049116e-06, |
|
"loss": 0.253, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.092, |
|
"grad_norm": 0.18693579733371735, |
|
"learning_rate": 4.163027547791347e-06, |
|
"loss": 0.2696, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.094, |
|
"grad_norm": 0.2209119349718094, |
|
"learning_rate": 4.147389036836881e-06, |
|
"loss": 0.2225, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 0.1712501347064972, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.2205, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.098, |
|
"grad_norm": 0.18427731096744537, |
|
"learning_rate": 4.116137929669921e-06, |
|
"loss": 0.2527, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.16298742592334747, |
|
"learning_rate": 4.100525648155731e-06, |
|
"loss": 0.2583, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.102, |
|
"grad_norm": 0.1921571046113968, |
|
"learning_rate": 4.084922424338277e-06, |
|
"loss": 0.2931, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.1696956604719162, |
|
"learning_rate": 4.06932841534185e-06, |
|
"loss": 0.2686, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.106, |
|
"grad_norm": 0.2463129460811615, |
|
"learning_rate": 4.053743778197951e-06, |
|
"loss": 0.301, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.108, |
|
"grad_norm": 0.15761299431324005, |
|
"learning_rate": 4.038168669843698e-06, |
|
"loss": 0.1756, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.1688557118177414, |
|
"learning_rate": 4.02260324712026e-06, |
|
"loss": 0.2969, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 0.21805354952812195, |
|
"learning_rate": 4.007047666771274e-06, |
|
"loss": 0.2739, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.114, |
|
"grad_norm": 0.17749401926994324, |
|
"learning_rate": 3.991502085441259e-06, |
|
"loss": 0.2698, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.116, |
|
"grad_norm": 0.2537892758846283, |
|
"learning_rate": 3.975966659674048e-06, |
|
"loss": 0.4131, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.1179999999999999, |
|
"grad_norm": 0.15672741830348969, |
|
"learning_rate": 3.960441545911205e-06, |
|
"loss": 0.2118, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.23960451781749725, |
|
"learning_rate": 3.944926900490452e-06, |
|
"loss": 0.2715, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.1219999999999999, |
|
"grad_norm": 0.17803031206130981, |
|
"learning_rate": 3.929422879644099e-06, |
|
"loss": 0.24, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.124, |
|
"grad_norm": 0.2676704525947571, |
|
"learning_rate": 3.913929639497462e-06, |
|
"loss": 0.3247, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.126, |
|
"grad_norm": 0.1522570550441742, |
|
"learning_rate": 3.898447336067297e-06, |
|
"loss": 0.2298, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.23372875154018402, |
|
"learning_rate": 3.882976125260229e-06, |
|
"loss": 0.4375, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.3442481756210327, |
|
"learning_rate": 3.867516162871177e-06, |
|
"loss": 0.2883, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.1320000000000001, |
|
"grad_norm": 0.2335498332977295, |
|
"learning_rate": 3.8520676045817945e-06, |
|
"loss": 0.2602, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.134, |
|
"grad_norm": 0.29386457800865173, |
|
"learning_rate": 3.8366306059588885e-06, |
|
"loss": 0.3826, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.18141314387321472, |
|
"learning_rate": 3.821205322452863e-06, |
|
"loss": 0.205, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.138, |
|
"grad_norm": 0.21235667169094086, |
|
"learning_rate": 3.8057919093961554e-06, |
|
"loss": 0.2511, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.1400000000000001, |
|
"grad_norm": 0.15281343460083008, |
|
"learning_rate": 3.790390522001662e-06, |
|
"loss": 0.1908, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.142, |
|
"grad_norm": 0.1883106231689453, |
|
"learning_rate": 3.775001315361183e-06, |
|
"loss": 0.2896, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 0.19878095388412476, |
|
"learning_rate": 3.7596244444438577e-06, |
|
"loss": 0.2847, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.146, |
|
"grad_norm": 0.18822817504405975, |
|
"learning_rate": 3.7442600640946045e-06, |
|
"loss": 0.3134, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.148, |
|
"grad_norm": 0.21552503108978271, |
|
"learning_rate": 3.7289083290325668e-06, |
|
"loss": 0.3323, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.25933748483657837, |
|
"learning_rate": 3.7135693938495433e-06, |
|
"loss": 0.3463, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.23867465555667877, |
|
"learning_rate": 3.69824341300844e-06, |
|
"loss": 0.3601, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.154, |
|
"grad_norm": 0.3167083263397217, |
|
"learning_rate": 3.682930540841717e-06, |
|
"loss": 0.4182, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.156, |
|
"grad_norm": 0.31397873163223267, |
|
"learning_rate": 3.667630931549826e-06, |
|
"loss": 0.3287, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.158, |
|
"grad_norm": 0.18764562904834747, |
|
"learning_rate": 3.6523447391996613e-06, |
|
"loss": 0.276, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.29411885142326355, |
|
"learning_rate": 3.637072117723012e-06, |
|
"loss": 0.3956, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.162, |
|
"grad_norm": 0.19027218222618103, |
|
"learning_rate": 3.6218132209150047e-06, |
|
"loss": 0.2753, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.164, |
|
"grad_norm": 0.20175009965896606, |
|
"learning_rate": 3.606568202432562e-06, |
|
"loss": 0.3459, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.166, |
|
"grad_norm": 0.2005695253610611, |
|
"learning_rate": 3.5913372157928515e-06, |
|
"loss": 0.2125, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.22972247004508972, |
|
"learning_rate": 3.5761204143717387e-06, |
|
"loss": 0.2925, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.22252865135669708, |
|
"learning_rate": 3.560917951402245e-06, |
|
"loss": 0.3467, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.172, |
|
"grad_norm": 0.2404780089855194, |
|
"learning_rate": 3.5457299799730047e-06, |
|
"loss": 0.3268, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.174, |
|
"grad_norm": 0.24187296628952026, |
|
"learning_rate": 3.5305566530267217e-06, |
|
"loss": 0.3654, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 0.23365625739097595, |
|
"learning_rate": 3.5153981233586277e-06, |
|
"loss": 0.3168, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.178, |
|
"grad_norm": 0.20350268483161926, |
|
"learning_rate": 3.5002545436149478e-06, |
|
"loss": 0.2618, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.22084195911884308, |
|
"learning_rate": 3.4851260662913643e-06, |
|
"loss": 0.381, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.182, |
|
"grad_norm": 0.5043354630470276, |
|
"learning_rate": 3.470012843731476e-06, |
|
"loss": 0.426, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.23615571856498718, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.3891, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.186, |
|
"grad_norm": 0.1776285469532013, |
|
"learning_rate": 3.439832771507565e-06, |
|
"loss": 0.2032, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.188, |
|
"grad_norm": 0.23352046310901642, |
|
"learning_rate": 3.4247662257565372e-06, |
|
"loss": 0.2098, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.19145451486110687, |
|
"learning_rate": 3.4097155425921256e-06, |
|
"loss": 0.2612, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.19671331346035004, |
|
"learning_rate": 3.394680873574546e-06, |
|
"loss": 0.2941, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.194, |
|
"grad_norm": 0.2002706378698349, |
|
"learning_rate": 3.3796623701027477e-06, |
|
"loss": 0.1828, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.196, |
|
"grad_norm": 0.23058104515075684, |
|
"learning_rate": 3.3646601834128924e-06, |
|
"loss": 0.2983, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.198, |
|
"grad_norm": 0.13006491959095, |
|
"learning_rate": 3.349674464576834e-06, |
|
"loss": 0.1306, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.29587817192077637, |
|
"learning_rate": 3.3347053645005965e-06, |
|
"loss": 0.3542, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.202, |
|
"grad_norm": 0.23100513219833374, |
|
"learning_rate": 3.319753033922849e-06, |
|
"loss": 0.4051, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.204, |
|
"grad_norm": 0.24775229394435883, |
|
"learning_rate": 3.3048176234133967e-06, |
|
"loss": 0.2378, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.206, |
|
"grad_norm": 0.18648101389408112, |
|
"learning_rate": 3.289899283371657e-06, |
|
"loss": 0.2141, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 0.24682392179965973, |
|
"learning_rate": 3.274998164025148e-06, |
|
"loss": 0.3123, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.25237175822257996, |
|
"learning_rate": 3.260114415427975e-06, |
|
"loss": 0.4471, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.212, |
|
"grad_norm": 0.20262058079242706, |
|
"learning_rate": 3.2452481874593234e-06, |
|
"loss": 0.2694, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.214, |
|
"grad_norm": 0.23342056572437286, |
|
"learning_rate": 3.230399629821942e-06, |
|
"loss": 0.3093, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.17575059831142426, |
|
"learning_rate": 3.2155688920406415e-06, |
|
"loss": 0.2923, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.218, |
|
"grad_norm": 0.2357223480939865, |
|
"learning_rate": 3.200756123460788e-06, |
|
"loss": 0.3569, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.3179761469364166, |
|
"learning_rate": 3.1859614732467957e-06, |
|
"loss": 0.4442, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.222, |
|
"grad_norm": 0.28770139813423157, |
|
"learning_rate": 3.171185090380628e-06, |
|
"loss": 0.3325, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 0.18547223508358002, |
|
"learning_rate": 3.156427123660297e-06, |
|
"loss": 0.2269, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.226, |
|
"grad_norm": 0.21385949850082397, |
|
"learning_rate": 3.141687721698363e-06, |
|
"loss": 0.2615, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.226, |
|
"eval_loss": 0.2700715959072113, |
|
"eval_runtime": 76.6157, |
|
"eval_samples_per_second": 7.205, |
|
"eval_steps_per_second": 0.901, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.228, |
|
"grad_norm": 0.3386872708797455, |
|
"learning_rate": 3.12696703292044e-06, |
|
"loss": 0.3519, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.19794243574142456, |
|
"learning_rate": 3.1122652055637014e-06, |
|
"loss": 0.2581, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 0.1912515014410019, |
|
"learning_rate": 3.097582387675385e-06, |
|
"loss": 0.3286, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.234, |
|
"grad_norm": 0.18073877692222595, |
|
"learning_rate": 3.0829187271113035e-06, |
|
"loss": 0.2411, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.236, |
|
"grad_norm": 0.24173890054225922, |
|
"learning_rate": 3.0682743715343565e-06, |
|
"loss": 0.3853, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.238, |
|
"grad_norm": 0.17611730098724365, |
|
"learning_rate": 3.053649468413043e-06, |
|
"loss": 0.1971, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.22723500430583954, |
|
"learning_rate": 3.0390441650199727e-06, |
|
"loss": 0.2852, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.242, |
|
"grad_norm": 0.2124418169260025, |
|
"learning_rate": 3.0244586084303908e-06, |
|
"loss": 0.329, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.244, |
|
"grad_norm": 0.24569527804851532, |
|
"learning_rate": 3.0098929455206905e-06, |
|
"loss": 0.4141, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.246, |
|
"grad_norm": 0.2651529312133789, |
|
"learning_rate": 2.995347322966933e-06, |
|
"loss": 0.2759, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 0.3110187351703644, |
|
"learning_rate": 2.980821887243377e-06, |
|
"loss": 0.3405, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.23818974196910858, |
|
"learning_rate": 2.966316784621e-06, |
|
"loss": 0.2185, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.252, |
|
"grad_norm": 0.32177677750587463, |
|
"learning_rate": 2.951832161166024e-06, |
|
"loss": 0.4972, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.254, |
|
"grad_norm": 0.21647526323795319, |
|
"learning_rate": 2.937368162738445e-06, |
|
"loss": 0.4215, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.1766624003648758, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.2439, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.258, |
|
"grad_norm": 0.34441429376602173, |
|
"learning_rate": 2.9085026233655367e-06, |
|
"loss": 0.4078, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.30576056241989136, |
|
"learning_rate": 2.8941013730958674e-06, |
|
"loss": 0.4071, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.262, |
|
"grad_norm": 0.22246578335762024, |
|
"learning_rate": 2.8797213292019927e-06, |
|
"loss": 0.3456, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 0.21253855526447296, |
|
"learning_rate": 2.8653626364907918e-06, |
|
"loss": 0.2257, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.266, |
|
"grad_norm": 0.22427724301815033, |
|
"learning_rate": 2.851025439554142e-06, |
|
"loss": 0.298, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.268, |
|
"grad_norm": 0.19472835958003998, |
|
"learning_rate": 2.8367098827674575e-06, |
|
"loss": 0.3093, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.19399920105934143, |
|
"learning_rate": 2.82241611028824e-06, |
|
"loss": 0.2254, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 0.23820382356643677, |
|
"learning_rate": 2.8081442660546126e-06, |
|
"loss": 0.2909, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.274, |
|
"grad_norm": 0.1856381893157959, |
|
"learning_rate": 2.7938944937838924e-06, |
|
"loss": 0.2367, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.276, |
|
"grad_norm": 0.16763170063495636, |
|
"learning_rate": 2.7796669369711294e-06, |
|
"loss": 0.1991, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.278, |
|
"grad_norm": 0.25936460494995117, |
|
"learning_rate": 2.7654617388876612e-06, |
|
"loss": 0.3244, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.37680599093437195, |
|
"learning_rate": 2.751279042579672e-06, |
|
"loss": 0.409, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.282, |
|
"grad_norm": 0.2094666063785553, |
|
"learning_rate": 2.7371189908667604e-06, |
|
"loss": 0.3523, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.284, |
|
"grad_norm": 0.25615018606185913, |
|
"learning_rate": 2.722981726340487e-06, |
|
"loss": 0.3496, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.286, |
|
"grad_norm": 0.2155938446521759, |
|
"learning_rate": 2.708867391362948e-06, |
|
"loss": 0.2099, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 0.2571382522583008, |
|
"learning_rate": 2.694776128065345e-06, |
|
"loss": 0.2505, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.25513583421707153, |
|
"learning_rate": 2.6807080783465376e-06, |
|
"loss": 0.3528, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.292, |
|
"grad_norm": 0.21190734207630157, |
|
"learning_rate": 2.6666633838716317e-06, |
|
"loss": 0.3892, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.294, |
|
"grad_norm": 0.2990153133869171, |
|
"learning_rate": 2.6526421860705474e-06, |
|
"loss": 0.3916, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 0.22129324078559875, |
|
"learning_rate": 2.6386446261365874e-06, |
|
"loss": 0.2596, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.298, |
|
"grad_norm": 0.2187465876340866, |
|
"learning_rate": 2.6246708450250256e-06, |
|
"loss": 0.3962, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.17136049270629883, |
|
"learning_rate": 2.6107209834516857e-06, |
|
"loss": 0.3483, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.302, |
|
"grad_norm": 0.25110378861427307, |
|
"learning_rate": 2.5967951818915137e-06, |
|
"loss": 0.4098, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 0.3335612118244171, |
|
"learning_rate": 2.5828935805771804e-06, |
|
"loss": 0.3407, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.306, |
|
"grad_norm": 0.23392237722873688, |
|
"learning_rate": 2.5690163194976576e-06, |
|
"loss": 0.3893, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.308, |
|
"grad_norm": 0.21025826036930084, |
|
"learning_rate": 2.5551635383968063e-06, |
|
"loss": 0.3047, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.20678383111953735, |
|
"learning_rate": 2.5413353767719805e-06, |
|
"loss": 0.3068, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 0.255937397480011, |
|
"learning_rate": 2.527531973872617e-06, |
|
"loss": 0.2963, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.314, |
|
"grad_norm": 0.3448125422000885, |
|
"learning_rate": 2.5137534686988265e-06, |
|
"loss": 0.3944, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.316, |
|
"grad_norm": 0.21276655793190002, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.2955, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.318, |
|
"grad_norm": 0.2522459030151367, |
|
"learning_rate": 2.486271706273421e-06, |
|
"loss": 0.3536, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.2182285189628601, |
|
"learning_rate": 2.4725687257628533e-06, |
|
"loss": 0.3541, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.322, |
|
"grad_norm": 0.30204272270202637, |
|
"learning_rate": 2.4588911964571557e-06, |
|
"loss": 0.268, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.324, |
|
"grad_norm": 0.27727144956588745, |
|
"learning_rate": 2.445239256088898e-06, |
|
"loss": 0.3061, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.326, |
|
"grad_norm": 0.22263972461223602, |
|
"learning_rate": 2.4316130421329696e-06, |
|
"loss": 0.3317, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 0.23461495339870453, |
|
"learning_rate": 2.418012691805191e-06, |
|
"loss": 0.3153, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.1453184336423874, |
|
"learning_rate": 2.404438342060941e-06, |
|
"loss": 0.1933, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.332, |
|
"grad_norm": 0.20232437551021576, |
|
"learning_rate": 2.3908901295937713e-06, |
|
"loss": 0.1941, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.334, |
|
"grad_norm": 0.23894034326076508, |
|
"learning_rate": 2.3773681908340284e-06, |
|
"loss": 0.3198, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 0.3079819977283478, |
|
"learning_rate": 2.363872661947488e-06, |
|
"loss": 0.3761, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.338, |
|
"grad_norm": 0.20794443786144257, |
|
"learning_rate": 2.3504036788339763e-06, |
|
"loss": 0.3837, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.2881450057029724, |
|
"learning_rate": 2.3369613771260006e-06, |
|
"loss": 0.2904, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.342, |
|
"grad_norm": 0.20050355792045593, |
|
"learning_rate": 2.323545892187393e-06, |
|
"loss": 0.2323, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.22167599201202393, |
|
"learning_rate": 2.310157359111938e-06, |
|
"loss": 0.2501, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.346, |
|
"grad_norm": 0.29652273654937744, |
|
"learning_rate": 2.296795912722014e-06, |
|
"loss": 0.3702, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.3479999999999999, |
|
"grad_norm": 0.20178988575935364, |
|
"learning_rate": 2.2834616875672362e-06, |
|
"loss": 0.2581, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.25368136167526245, |
|
"learning_rate": 2.2701548179231048e-06, |
|
"loss": 0.3034, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.20186640322208405, |
|
"learning_rate": 2.2568754377896516e-06, |
|
"loss": 0.2991, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.354, |
|
"grad_norm": 0.2289544939994812, |
|
"learning_rate": 2.2436236808900846e-06, |
|
"loss": 0.3188, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.3559999999999999, |
|
"grad_norm": 0.2351309210062027, |
|
"learning_rate": 2.230399680669449e-06, |
|
"loss": 0.2942, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.358, |
|
"grad_norm": 0.19411875307559967, |
|
"learning_rate": 2.2172035702932828e-06, |
|
"loss": 0.3415, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.23344936966896057, |
|
"learning_rate": 2.204035482646267e-06, |
|
"loss": 0.2904, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.362, |
|
"grad_norm": 0.17623913288116455, |
|
"learning_rate": 2.190895550330899e-06, |
|
"loss": 0.1493, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.3639999999999999, |
|
"grad_norm": 0.22438128292560577, |
|
"learning_rate": 2.1777839056661555e-06, |
|
"loss": 0.3669, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.366, |
|
"grad_norm": 0.25720444321632385, |
|
"learning_rate": 2.1647006806861472e-06, |
|
"loss": 0.4394, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 0.17176856100559235, |
|
"learning_rate": 2.1516460071388062e-06, |
|
"loss": 0.2309, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.26110807061195374, |
|
"learning_rate": 2.1386200164845527e-06, |
|
"loss": 0.4329, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.3719999999999999, |
|
"grad_norm": 0.24240969121456146, |
|
"learning_rate": 2.125622839894964e-06, |
|
"loss": 0.2596, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.374, |
|
"grad_norm": 0.202704519033432, |
|
"learning_rate": 2.1126546082514665e-06, |
|
"loss": 0.2737, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.20342108607292175, |
|
"learning_rate": 2.09971545214401e-06, |
|
"loss": 0.2692, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3780000000000001, |
|
"grad_norm": 0.3197811543941498, |
|
"learning_rate": 2.086805501869749e-06, |
|
"loss": 0.3117, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.29925206303596497, |
|
"learning_rate": 2.073924887431744e-06, |
|
"loss": 0.2391, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.3820000000000001, |
|
"grad_norm": 0.2412380427122116, |
|
"learning_rate": 2.061073738537635e-06, |
|
"loss": 0.2434, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 0.25253570079803467, |
|
"learning_rate": 2.0482521845983522e-06, |
|
"loss": 0.3284, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.3860000000000001, |
|
"grad_norm": 0.18548652529716492, |
|
"learning_rate": 2.0354603547267985e-06, |
|
"loss": 0.2562, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.388, |
|
"grad_norm": 0.2307010442018509, |
|
"learning_rate": 2.0226983777365604e-06, |
|
"loss": 0.2445, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.3900000000000001, |
|
"grad_norm": 0.1840142160654068, |
|
"learning_rate": 2.009966382140606e-06, |
|
"loss": 0.3521, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 0.2078990340232849, |
|
"learning_rate": 1.9972644961499853e-06, |
|
"loss": 0.2887, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.3940000000000001, |
|
"grad_norm": 0.20442235469818115, |
|
"learning_rate": 1.9845928476725522e-06, |
|
"loss": 0.3453, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.396, |
|
"grad_norm": 0.1933489441871643, |
|
"learning_rate": 1.971951564311668e-06, |
|
"loss": 0.3581, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3980000000000001, |
|
"grad_norm": 0.19691258668899536, |
|
"learning_rate": 1.959340773364911e-06, |
|
"loss": 0.2933, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.1842382252216339, |
|
"learning_rate": 1.946760601822809e-06, |
|
"loss": 0.2894, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.4020000000000001, |
|
"grad_norm": 0.35139110684394836, |
|
"learning_rate": 1.9342111763675512e-06, |
|
"loss": 0.3405, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.404, |
|
"grad_norm": 0.19070106744766235, |
|
"learning_rate": 1.9216926233717087e-06, |
|
"loss": 0.213, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.4060000000000001, |
|
"grad_norm": 0.20061296224594116, |
|
"learning_rate": 1.9092050688969736e-06, |
|
"loss": 0.2858, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.30167287588119507, |
|
"learning_rate": 1.8967486386928819e-06, |
|
"loss": 0.4004, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.21128444373607635, |
|
"learning_rate": 1.8843234581955444e-06, |
|
"loss": 0.2326, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.412, |
|
"grad_norm": 0.23791776597499847, |
|
"learning_rate": 1.8719296525263925e-06, |
|
"loss": 0.2337, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.414, |
|
"grad_norm": 0.27308812737464905, |
|
"learning_rate": 1.859567346490913e-06, |
|
"loss": 0.2667, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 0.19012384116649628, |
|
"learning_rate": 1.8472366645773892e-06, |
|
"loss": 0.2042, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.418, |
|
"grad_norm": 0.2819920480251312, |
|
"learning_rate": 1.8349377309556487e-06, |
|
"loss": 0.3546, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.16963627934455872, |
|
"learning_rate": 1.8226706694758194e-06, |
|
"loss": 0.2087, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.422, |
|
"grad_norm": 0.222882941365242, |
|
"learning_rate": 1.810435603667075e-06, |
|
"loss": 0.3519, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 0.200264573097229, |
|
"learning_rate": 1.798232656736389e-06, |
|
"loss": 0.2172, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.426, |
|
"grad_norm": 0.25277942419052124, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 0.3984, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.428, |
|
"grad_norm": 0.24608227610588074, |
|
"learning_rate": 1.7739236107186858e-06, |
|
"loss": 0.2575, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.30379989743232727, |
|
"learning_rate": 1.7618177564234907e-06, |
|
"loss": 0.2949, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 0.15659303963184357, |
|
"learning_rate": 1.7497445105875377e-06, |
|
"loss": 0.1913, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.434, |
|
"grad_norm": 0.2043537199497223, |
|
"learning_rate": 1.7377039947882802e-06, |
|
"loss": 0.2716, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.436, |
|
"grad_norm": 0.20367324352264404, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": 0.2358, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.438, |
|
"grad_norm": 0.28134340047836304, |
|
"learning_rate": 1.7137216379604727e-06, |
|
"loss": 0.2814, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.2837545871734619, |
|
"learning_rate": 1.7017800384339928e-06, |
|
"loss": 0.3792, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.442, |
|
"grad_norm": 0.22841040790081024, |
|
"learning_rate": 1.6898716519459074e-06, |
|
"loss": 0.2819, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.444, |
|
"grad_norm": 0.21164868772029877, |
|
"learning_rate": 1.6779965984135376e-06, |
|
"loss": 0.2676, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.446, |
|
"grad_norm": 0.2656158208847046, |
|
"learning_rate": 1.6661549974185426e-06, |
|
"loss": 0.284, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.2675846815109253, |
|
"learning_rate": 1.6543469682057105e-06, |
|
"loss": 0.3098, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.2900715172290802, |
|
"learning_rate": 1.6425726296817634e-06, |
|
"loss": 0.3378, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.452, |
|
"grad_norm": 0.27534744143486023, |
|
"learning_rate": 1.6308321004141609e-06, |
|
"loss": 0.3497, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.454, |
|
"grad_norm": 0.30499523878097534, |
|
"learning_rate": 1.6191254986299044e-06, |
|
"loss": 0.3271, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 0.1775362193584442, |
|
"learning_rate": 1.6074529422143398e-06, |
|
"loss": 0.1754, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.458, |
|
"grad_norm": 0.25734683871269226, |
|
"learning_rate": 1.5958145487099829e-06, |
|
"loss": 0.3568, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.22716552019119263, |
|
"learning_rate": 1.5842104353153286e-06, |
|
"loss": 0.2856, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.462, |
|
"grad_norm": 0.2042451947927475, |
|
"learning_rate": 1.5726407188836672e-06, |
|
"loss": 0.2623, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 0.26923978328704834, |
|
"learning_rate": 1.561105515921915e-06, |
|
"loss": 0.4326, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.466, |
|
"grad_norm": 0.22442659735679626, |
|
"learning_rate": 1.549604942589441e-06, |
|
"loss": 0.2867, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.468, |
|
"grad_norm": 0.16880613565444946, |
|
"learning_rate": 1.5381391146968866e-06, |
|
"loss": 0.1821, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.24349483847618103, |
|
"learning_rate": 1.5267081477050132e-06, |
|
"loss": 0.2753, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.27072674036026, |
|
"learning_rate": 1.5153121567235334e-06, |
|
"loss": 0.2222, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.474, |
|
"grad_norm": 0.291255921125412, |
|
"learning_rate": 1.5039512565099468e-06, |
|
"loss": 0.3485, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.476, |
|
"grad_norm": 0.20078301429748535, |
|
"learning_rate": 1.4926255614683931e-06, |
|
"loss": 0.2959, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.476, |
|
"eval_loss": 0.2654268741607666, |
|
"eval_runtime": 76.2376, |
|
"eval_samples_per_second": 7.241, |
|
"eval_steps_per_second": 0.905, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.478, |
|
"grad_norm": 0.2795911431312561, |
|
"learning_rate": 1.4813351856484981e-06, |
|
"loss": 0.1859, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.35663336515426636, |
|
"learning_rate": 1.470080242744218e-06, |
|
"loss": 0.3358, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.482, |
|
"grad_norm": 0.23237483203411102, |
|
"learning_rate": 1.458860846092705e-06, |
|
"loss": 0.2874, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.484, |
|
"grad_norm": 0.19958510994911194, |
|
"learning_rate": 1.4476771086731567e-06, |
|
"loss": 0.3507, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.486, |
|
"grad_norm": 0.22077733278274536, |
|
"learning_rate": 1.4365291431056871e-06, |
|
"loss": 0.3085, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 0.31041693687438965, |
|
"learning_rate": 1.4254170616501828e-06, |
|
"loss": 0.3724, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.18345925211906433, |
|
"learning_rate": 1.4143409762051829e-06, |
|
"loss": 0.1957, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.492, |
|
"grad_norm": 0.1973162293434143, |
|
"learning_rate": 1.4033009983067454e-06, |
|
"loss": 0.2304, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.494, |
|
"grad_norm": 0.2636561095714569, |
|
"learning_rate": 1.3922972391273226e-06, |
|
"loss": 0.3215, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 0.22231453657150269, |
|
"learning_rate": 1.3813298094746491e-06, |
|
"loss": 0.2346, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.498, |
|
"grad_norm": 0.21096548438072205, |
|
"learning_rate": 1.3703988197906209e-06, |
|
"loss": 0.297, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.29171353578567505, |
|
"learning_rate": 1.3595043801501794e-06, |
|
"loss": 0.362, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.502, |
|
"grad_norm": 0.2302405834197998, |
|
"learning_rate": 1.3486466002602133e-06, |
|
"loss": 0.3468, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.1669236272573471, |
|
"learning_rate": 1.3378255894584463e-06, |
|
"loss": 0.2525, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.506, |
|
"grad_norm": 0.22917306423187256, |
|
"learning_rate": 1.3270414567123342e-06, |
|
"loss": 0.34, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.508, |
|
"grad_norm": 0.22837324440479279, |
|
"learning_rate": 1.3162943106179748e-06, |
|
"loss": 0.516, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.1973070204257965, |
|
"learning_rate": 1.305584259399013e-06, |
|
"loss": 0.2083, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 0.25936761498451233, |
|
"learning_rate": 1.2949114109055417e-06, |
|
"loss": 0.4483, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.514, |
|
"grad_norm": 0.23405812680721283, |
|
"learning_rate": 1.2842758726130283e-06, |
|
"loss": 0.3334, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.516, |
|
"grad_norm": 0.2227783501148224, |
|
"learning_rate": 1.2736777516212267e-06, |
|
"loss": 0.3724, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.518, |
|
"grad_norm": 0.23398268222808838, |
|
"learning_rate": 1.263117154653097e-06, |
|
"loss": 0.2008, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.16665144264698029, |
|
"learning_rate": 1.2525941880537307e-06, |
|
"loss": 0.2177, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.522, |
|
"grad_norm": 0.21703177690505981, |
|
"learning_rate": 1.242108957789287e-06, |
|
"loss": 0.2668, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.524, |
|
"grad_norm": 0.3440599739551544, |
|
"learning_rate": 1.2316615694459188e-06, |
|
"loss": 0.3352, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.526, |
|
"grad_norm": 0.2005206048488617, |
|
"learning_rate": 1.2212521282287093e-06, |
|
"loss": 0.2719, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 0.2054724395275116, |
|
"learning_rate": 1.210880738960616e-06, |
|
"loss": 0.3181, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.2903349995613098, |
|
"learning_rate": 1.200547506081416e-06, |
|
"loss": 0.3382, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.532, |
|
"grad_norm": 0.22862407565116882, |
|
"learning_rate": 1.1902525336466465e-06, |
|
"loss": 0.2544, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.534, |
|
"grad_norm": 0.20812873542308807, |
|
"learning_rate": 1.1799959253265668e-06, |
|
"loss": 0.3118, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.2820591330528259, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 0.3646, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.538, |
|
"grad_norm": 0.21943072974681854, |
|
"learning_rate": 1.1595982137788403e-06, |
|
"loss": 0.1957, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.1949055939912796, |
|
"learning_rate": 1.1494573159559214e-06, |
|
"loss": 0.253, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.542, |
|
"grad_norm": 0.20829080045223236, |
|
"learning_rate": 1.1393551930550828e-06, |
|
"loss": 0.2558, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.20741114020347595, |
|
"learning_rate": 1.1292919468045876e-06, |
|
"loss": 0.2221, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.546, |
|
"grad_norm": 0.24327073991298676, |
|
"learning_rate": 1.1192676785412154e-06, |
|
"loss": 0.2616, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.548, |
|
"grad_norm": 0.2541949152946472, |
|
"learning_rate": 1.1092824892092375e-06, |
|
"loss": 0.2435, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.2096426635980606, |
|
"learning_rate": 1.099336479359398e-06, |
|
"loss": 0.2448, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 0.24535740911960602, |
|
"learning_rate": 1.0894297491479044e-06, |
|
"loss": 0.2892, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.554, |
|
"grad_norm": 0.2067105919122696, |
|
"learning_rate": 1.0795623983354214e-06, |
|
"loss": 0.2584, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.556, |
|
"grad_norm": 0.2478252500295639, |
|
"learning_rate": 1.0697345262860638e-06, |
|
"loss": 0.3474, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.558, |
|
"grad_norm": 0.17269453406333923, |
|
"learning_rate": 1.0599462319663906e-06, |
|
"loss": 0.2793, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.2102997750043869, |
|
"learning_rate": 1.0501976139444191e-06, |
|
"loss": 0.3124, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.562, |
|
"grad_norm": 0.29494714736938477, |
|
"learning_rate": 1.0404887703886252e-06, |
|
"loss": 0.2693, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.564, |
|
"grad_norm": 0.19094854593276978, |
|
"learning_rate": 1.0308197990669538e-06, |
|
"loss": 0.3593, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.5659999999999998, |
|
"grad_norm": 0.20082080364227295, |
|
"learning_rate": 1.0211907973458391e-06, |
|
"loss": 0.2296, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.24483440816402435, |
|
"learning_rate": 1.0116018621892237e-06, |
|
"loss": 0.344, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.5699999999999998, |
|
"grad_norm": 0.21700353920459747, |
|
"learning_rate": 1.0020530901575754e-06, |
|
"loss": 0.2562, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.572, |
|
"grad_norm": 0.18885864317417145, |
|
"learning_rate": 9.925445774069232e-07, |
|
"loss": 0.2155, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.5739999999999998, |
|
"grad_norm": 0.2546456754207611, |
|
"learning_rate": 9.830764196878872e-07, |
|
"loss": 0.3539, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.20347674190998077, |
|
"learning_rate": 9.73648712344707e-07, |
|
"loss": 0.2864, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.5779999999999998, |
|
"grad_norm": 0.3315930962562561, |
|
"learning_rate": 9.642615503142927e-07, |
|
"loss": 0.3753, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.18244577944278717, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.2116, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.5819999999999999, |
|
"grad_norm": 0.24047374725341797, |
|
"learning_rate": 9.456092398969902e-07, |
|
"loss": 0.3352, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 0.2712211012840271, |
|
"learning_rate": 9.363442793386606e-07, |
|
"loss": 0.4647, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.5859999999999999, |
|
"grad_norm": 0.15284787118434906, |
|
"learning_rate": 9.271202397483214e-07, |
|
"loss": 0.2296, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.588, |
|
"grad_norm": 0.2665194571018219, |
|
"learning_rate": 9.179372140119524e-07, |
|
"loss": 0.353, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.5899999999999999, |
|
"grad_norm": 0.2965538799762726, |
|
"learning_rate": 9.087952946025175e-07, |
|
"loss": 0.2863, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 0.19379866123199463, |
|
"learning_rate": 8.996945735790447e-07, |
|
"loss": 0.3056, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.5939999999999999, |
|
"grad_norm": 0.2339809238910675, |
|
"learning_rate": 8.906351425856952e-07, |
|
"loss": 0.3741, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.596, |
|
"grad_norm": 0.2753208577632904, |
|
"learning_rate": 8.816170928508367e-07, |
|
"loss": 0.2715, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5979999999999999, |
|
"grad_norm": 0.2367635816335678, |
|
"learning_rate": 8.7264051518613e-07, |
|
"loss": 0.3268, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.2004977911710739, |
|
"learning_rate": 8.637054999856148e-07, |
|
"loss": 0.2217, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.6019999999999999, |
|
"grad_norm": 0.3549105226993561, |
|
"learning_rate": 8.54812137224792e-07, |
|
"loss": 0.3371, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.604, |
|
"grad_norm": 0.27921661734580994, |
|
"learning_rate": 8.459605164597268e-07, |
|
"loss": 0.3983, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.6059999999999999, |
|
"grad_norm": 0.2014499306678772, |
|
"learning_rate": 8.371507268261436e-07, |
|
"loss": 0.2413, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.20690080523490906, |
|
"learning_rate": 8.283828570385239e-07, |
|
"loss": 0.2012, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.6099999999999999, |
|
"grad_norm": 0.21998871862888336, |
|
"learning_rate": 8.196569953892202e-07, |
|
"loss": 0.3298, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.612, |
|
"grad_norm": 0.3980468511581421, |
|
"learning_rate": 8.109732297475637e-07, |
|
"loss": 0.3194, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.6139999999999999, |
|
"grad_norm": 0.20355728268623352, |
|
"learning_rate": 8.023316475589754e-07, |
|
"loss": 0.1823, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 0.17916588485240936, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": 0.2189, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6179999999999999, |
|
"grad_norm": 0.3024926781654358, |
|
"learning_rate": 7.851753811978924e-07, |
|
"loss": 0.3149, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.20770519971847534, |
|
"learning_rate": 7.766608697888095e-07, |
|
"loss": 0.2967, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.6219999999999999, |
|
"grad_norm": 0.2985385060310364, |
|
"learning_rate": 7.681888873578786e-07, |
|
"loss": 0.3245, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 0.238825723528862, |
|
"learning_rate": 7.597595192178702e-07, |
|
"loss": 0.2024, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.626, |
|
"grad_norm": 0.24210689961910248, |
|
"learning_rate": 7.513728502524286e-07, |
|
"loss": 0.3364, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.6280000000000001, |
|
"grad_norm": 0.2465432733297348, |
|
"learning_rate": 7.430289649152156e-07, |
|
"loss": 0.3643, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.37851664423942566, |
|
"learning_rate": 7.347279472290647e-07, |
|
"loss": 0.4549, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.29046836495399475, |
|
"learning_rate": 7.264698807851328e-07, |
|
"loss": 0.3777, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.634, |
|
"grad_norm": 0.17954066395759583, |
|
"learning_rate": 7.182548487420555e-07, |
|
"loss": 0.1817, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 1.6360000000000001, |
|
"grad_norm": 0.21587719023227692, |
|
"learning_rate": 7.100829338251147e-07, |
|
"loss": 0.3208, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.638, |
|
"grad_norm": 0.24211935698986053, |
|
"learning_rate": 7.019542183254047e-07, |
|
"loss": 0.302, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.3430536389350891, |
|
"learning_rate": 6.938687840989972e-07, |
|
"loss": 0.3358, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.642, |
|
"grad_norm": 0.26358646154403687, |
|
"learning_rate": 6.858267125661272e-07, |
|
"loss": 0.3329, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 1.6440000000000001, |
|
"grad_norm": 0.21013550460338593, |
|
"learning_rate": 6.778280847103668e-07, |
|
"loss": 0.247, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.646, |
|
"grad_norm": 0.17694292962551117, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 0.2205, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 0.15793128311634064, |
|
"learning_rate": 6.619614817762537e-07, |
|
"loss": 0.1541, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.18143923580646515, |
|
"learning_rate": 6.540936664744197e-07, |
|
"loss": 0.2367, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.6520000000000001, |
|
"grad_norm": 0.21212640404701233, |
|
"learning_rate": 6.462696144011149e-07, |
|
"loss": 0.3049, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.654, |
|
"grad_norm": 0.21567395329475403, |
|
"learning_rate": 6.384894043444568e-07, |
|
"loss": 0.2519, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 0.17464697360992432, |
|
"learning_rate": 6.307531146510754e-07, |
|
"loss": 0.1692, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.658, |
|
"grad_norm": 0.23152326047420502, |
|
"learning_rate": 6.230608232253227e-07, |
|
"loss": 0.2823, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.6600000000000001, |
|
"grad_norm": 0.3341864049434662, |
|
"learning_rate": 6.154126075284855e-07, |
|
"loss": 0.2823, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.662, |
|
"grad_norm": 0.24136964976787567, |
|
"learning_rate": 6.07808544578013e-07, |
|
"loss": 0.3713, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.21439406275749207, |
|
"learning_rate": 6.002487109467347e-07, |
|
"loss": 0.2631, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.666, |
|
"grad_norm": 0.3102458715438843, |
|
"learning_rate": 5.927331827620902e-07, |
|
"loss": 0.3513, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.6680000000000001, |
|
"grad_norm": 0.20326466858386993, |
|
"learning_rate": 5.852620357053651e-07, |
|
"loss": 0.2738, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.185090109705925, |
|
"learning_rate": 5.778353450109286e-07, |
|
"loss": 0.2665, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.17061105370521545, |
|
"learning_rate": 5.704531854654721e-07, |
|
"loss": 0.2018, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.674, |
|
"grad_norm": 0.18026676774024963, |
|
"learning_rate": 5.631156314072605e-07, |
|
"loss": 0.2182, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 1.6760000000000002, |
|
"grad_norm": 0.24431855976581573, |
|
"learning_rate": 5.558227567253832e-07, |
|
"loss": 0.3036, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.678, |
|
"grad_norm": 0.1817561835050583, |
|
"learning_rate": 5.485746348590048e-07, |
|
"loss": 0.2786, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.20034758746623993, |
|
"learning_rate": 5.413713387966329e-07, |
|
"loss": 0.2073, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.682, |
|
"grad_norm": 0.23046346008777618, |
|
"learning_rate": 5.34212941075381e-07, |
|
"loss": 0.2456, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 1.6840000000000002, |
|
"grad_norm": 0.28231683373451233, |
|
"learning_rate": 5.270995137802315e-07, |
|
"loss": 0.2962, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.686, |
|
"grad_norm": 0.20535282790660858, |
|
"learning_rate": 5.200311285433213e-07, |
|
"loss": 0.2003, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 0.27334460616111755, |
|
"learning_rate": 5.130078565432089e-07, |
|
"loss": 0.2784, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.2541443109512329, |
|
"learning_rate": 5.06029768504166e-07, |
|
"loss": 0.3575, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 1.692, |
|
"grad_norm": 0.20568181574344635, |
|
"learning_rate": 4.990969346954611e-07, |
|
"loss": 0.3116, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.694, |
|
"grad_norm": 0.2725497782230377, |
|
"learning_rate": 4.922094249306559e-07, |
|
"loss": 0.2698, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 0.2767050862312317, |
|
"learning_rate": 4.853673085668947e-07, |
|
"loss": 0.3246, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.698, |
|
"grad_norm": 0.27081194519996643, |
|
"learning_rate": 4.785706545042141e-07, |
|
"loss": 0.3067, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.2148142009973526, |
|
"learning_rate": 4.7181953118484556e-07, |
|
"loss": 0.335, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.702, |
|
"grad_norm": 0.20924992859363556, |
|
"learning_rate": 4.651140065925269e-07, |
|
"loss": 0.2473, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.1969323456287384, |
|
"learning_rate": 4.58454148251814e-07, |
|
"loss": 0.2384, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.706, |
|
"grad_norm": 0.21272586286067963, |
|
"learning_rate": 4.5184002322740784e-07, |
|
"loss": 0.1894, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.708, |
|
"grad_norm": 0.22230306267738342, |
|
"learning_rate": 4.4527169812347446e-07, |
|
"loss": 0.2878, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.23957069218158722, |
|
"learning_rate": 4.387492390829734e-07, |
|
"loss": 0.2608, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 0.19603803753852844, |
|
"learning_rate": 4.322727117869951e-07, |
|
"loss": 0.2291, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.714, |
|
"grad_norm": 0.19814668595790863, |
|
"learning_rate": 4.2584218145409916e-07, |
|
"loss": 0.2933, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 1.716, |
|
"grad_norm": 0.2840145230293274, |
|
"learning_rate": 4.194577128396521e-07, |
|
"loss": 0.2678, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.718, |
|
"grad_norm": 0.3841419816017151, |
|
"learning_rate": 4.131193702351827e-07, |
|
"loss": 0.4492, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.1749158352613449, |
|
"learning_rate": 4.0682721746773346e-07, |
|
"loss": 0.2205, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.722, |
|
"grad_norm": 0.22776730358600616, |
|
"learning_rate": 4.005813178992091e-07, |
|
"loss": 0.2634, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.724, |
|
"grad_norm": 0.20322760939598083, |
|
"learning_rate": 3.9438173442575e-07, |
|
"loss": 0.3125, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.726, |
|
"grad_norm": 0.24371430277824402, |
|
"learning_rate": 3.882285294770938e-07, |
|
"loss": 0.3223, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.726, |
|
"eval_loss": 0.26352861523628235, |
|
"eval_runtime": 76.577, |
|
"eval_samples_per_second": 7.208, |
|
"eval_steps_per_second": 0.901, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 0.2777194678783417, |
|
"learning_rate": 3.821217650159453e-07, |
|
"loss": 0.3117, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.21060119569301605, |
|
"learning_rate": 3.760615025373543e-07, |
|
"loss": 0.2444, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 1.732, |
|
"grad_norm": 0.19364982843399048, |
|
"learning_rate": 3.7004780306809873e-07, |
|
"loss": 0.2534, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.734, |
|
"grad_norm": 0.2388126105070114, |
|
"learning_rate": 3.6408072716606346e-07, |
|
"loss": 0.5307, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.21501779556274414, |
|
"learning_rate": 3.581603349196372e-07, |
|
"loss": 0.299, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.738, |
|
"grad_norm": 0.2748852074146271, |
|
"learning_rate": 3.522866859471047e-07, |
|
"loss": 0.4626, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.2657471299171448, |
|
"learning_rate": 3.46459839396045e-07, |
|
"loss": 0.2947, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.742, |
|
"grad_norm": 0.1825701743364334, |
|
"learning_rate": 3.406798539427386e-07, |
|
"loss": 0.2525, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 0.18898171186447144, |
|
"learning_rate": 3.3494678779157464e-07, |
|
"loss": 0.2188, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.746, |
|
"grad_norm": 0.2019154280424118, |
|
"learning_rate": 3.2926069867446673e-07, |
|
"loss": 0.2575, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.748, |
|
"grad_norm": 0.26931118965148926, |
|
"learning_rate": 3.2362164385026704e-07, |
|
"loss": 0.2867, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.25869134068489075, |
|
"learning_rate": 3.180296801041971e-07, |
|
"loss": 0.4233, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 0.24689964950084686, |
|
"learning_rate": 3.1248486374726884e-07, |
|
"loss": 0.3778, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.754, |
|
"grad_norm": 0.2961515486240387, |
|
"learning_rate": 3.069872506157212e-07, |
|
"loss": 0.3767, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 1.756, |
|
"grad_norm": 0.2758214473724365, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.4107, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.758, |
|
"grad_norm": 0.19258597493171692, |
|
"learning_rate": 2.9613385499648926e-07, |
|
"loss": 0.2285, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.21885156631469727, |
|
"learning_rate": 2.9077818180237693e-07, |
|
"loss": 0.2726, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.762, |
|
"grad_norm": 0.20850767195224762, |
|
"learning_rate": 2.8546993041969173e-07, |
|
"loss": 0.3443, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 1.764, |
|
"grad_norm": 0.22747254371643066, |
|
"learning_rate": 2.802091543024671e-07, |
|
"loss": 0.2785, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.766, |
|
"grad_norm": 0.18733809888362885, |
|
"learning_rate": 2.7499590642665773e-07, |
|
"loss": 0.2047, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 0.230934277176857, |
|
"learning_rate": 2.6983023928961406e-07, |
|
"loss": 0.2994, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.1833610087633133, |
|
"learning_rate": 2.647122049095463e-07, |
|
"loss": 0.2064, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 1.772, |
|
"grad_norm": 0.2077609896659851, |
|
"learning_rate": 2.596418548250029e-07, |
|
"loss": 0.2537, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.774, |
|
"grad_norm": 0.163072407245636, |
|
"learning_rate": 2.546192400943537e-07, |
|
"loss": 0.194, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 0.1943567395210266, |
|
"learning_rate": 2.4964441129527337e-07, |
|
"loss": 0.2519, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.778, |
|
"grad_norm": 0.18382684886455536, |
|
"learning_rate": 2.447174185242324e-07, |
|
"loss": 0.1944, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.20981475710868835, |
|
"learning_rate": 2.398383113959929e-07, |
|
"loss": 0.173, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.782, |
|
"grad_norm": 0.1996649205684662, |
|
"learning_rate": 2.3500713904311023e-07, |
|
"loss": 0.2536, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 0.2560986578464508, |
|
"learning_rate": 2.3022395011543687e-07, |
|
"loss": 0.374, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.786, |
|
"grad_norm": 0.20811672508716583, |
|
"learning_rate": 2.2548879277963065e-07, |
|
"loss": 0.3225, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.788, |
|
"grad_norm": 0.1996699571609497, |
|
"learning_rate": 2.2080171471867362e-07, |
|
"loss": 0.2632, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.20678700506687164, |
|
"learning_rate": 2.161627631313923e-07, |
|
"loss": 0.3513, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 0.20172181725502014, |
|
"learning_rate": 2.1157198473197417e-07, |
|
"loss": 0.2117, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.794, |
|
"grad_norm": 0.16854679584503174, |
|
"learning_rate": 2.0702942574950812e-07, |
|
"loss": 0.3006, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.796, |
|
"grad_norm": 0.1959567815065384, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": 0.2695, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.798, |
|
"grad_norm": 0.1726803481578827, |
|
"learning_rate": 1.9808914852347817e-07, |
|
"loss": 0.2635, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.22450147569179535, |
|
"learning_rate": 1.9369152030840553e-07, |
|
"loss": 0.2598, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.802, |
|
"grad_norm": 0.26783040165901184, |
|
"learning_rate": 1.8934229156636453e-07, |
|
"loss": 0.2029, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 1.804, |
|
"grad_norm": 0.2690034508705139, |
|
"learning_rate": 1.8504150609403858e-07, |
|
"loss": 0.2446, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.806, |
|
"grad_norm": 0.23306065797805786, |
|
"learning_rate": 1.807892072002898e-07, |
|
"loss": 0.3264, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 0.2681446075439453, |
|
"learning_rate": 1.765854377057219e-07, |
|
"loss": 0.302, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.19500699639320374, |
|
"learning_rate": 1.724302399422456e-07, |
|
"loss": 0.2066, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 1.812, |
|
"grad_norm": 0.2524206340312958, |
|
"learning_rate": 1.6832365575265742e-07, |
|
"loss": 0.3334, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.814, |
|
"grad_norm": 0.2076834887266159, |
|
"learning_rate": 1.6426572649021477e-07, |
|
"loss": 0.2737, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 0.28093916177749634, |
|
"learning_rate": 1.6025649301821877e-07, |
|
"loss": 0.3558, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.818, |
|
"grad_norm": 0.24566200375556946, |
|
"learning_rate": 1.562959957096072e-07, |
|
"loss": 0.3636, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 1.8199999999999998, |
|
"grad_norm": 0.2996765077114105, |
|
"learning_rate": 1.5238427444654368e-07, |
|
"loss": 0.3945, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.822, |
|
"grad_norm": 0.24855782091617584, |
|
"learning_rate": 1.4852136862001766e-07, |
|
"loss": 0.1894, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 0.2089153230190277, |
|
"learning_rate": 1.4470731712944885e-07, |
|
"loss": 0.3297, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.826, |
|
"grad_norm": 0.3130733072757721, |
|
"learning_rate": 1.4094215838229176e-07, |
|
"loss": 0.4001, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.8279999999999998, |
|
"grad_norm": 0.2722707688808441, |
|
"learning_rate": 1.372259302936546e-07, |
|
"loss": 0.356, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.15767575800418854, |
|
"learning_rate": 1.3355867028591209e-07, |
|
"loss": 0.2161, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.18771317601203918, |
|
"learning_rate": 1.2994041528833267e-07, |
|
"loss": 0.1912, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.834, |
|
"grad_norm": 0.15640737116336823, |
|
"learning_rate": 1.263712017367036e-07, |
|
"loss": 0.2173, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 1.8359999999999999, |
|
"grad_norm": 0.2588789463043213, |
|
"learning_rate": 1.2285106557296479e-07, |
|
"loss": 0.3506, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.838, |
|
"grad_norm": 0.21290963888168335, |
|
"learning_rate": 1.193800422448499e-07, |
|
"loss": 0.2377, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.198676198720932, |
|
"learning_rate": 1.1595816670552429e-07, |
|
"loss": 0.1823, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.842, |
|
"grad_norm": 0.23629765212535858, |
|
"learning_rate": 1.12585473413237e-07, |
|
"loss": 0.2565, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 1.8439999999999999, |
|
"grad_norm": 0.23395268619060516, |
|
"learning_rate": 1.0926199633097156e-07, |
|
"loss": 0.2184, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.846, |
|
"grad_norm": 0.2589554190635681, |
|
"learning_rate": 1.0598776892610685e-07, |
|
"loss": 0.369, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 0.22093115746974945, |
|
"learning_rate": 1.0276282417007399e-07, |
|
"loss": 0.3437, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.23697194457054138, |
|
"learning_rate": 9.958719453803278e-08, |
|
"loss": 0.3288, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 1.8519999999999999, |
|
"grad_norm": 0.22383596003055573, |
|
"learning_rate": 9.646091200853802e-08, |
|
"loss": 0.4897, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.854, |
|
"grad_norm": 0.20475724339485168, |
|
"learning_rate": 9.338400806321979e-08, |
|
"loss": 0.257, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.263615220785141, |
|
"learning_rate": 9.035651368646647e-08, |
|
"loss": 0.4592, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.858, |
|
"grad_norm": 0.24478185176849365, |
|
"learning_rate": 8.737845936511335e-08, |
|
"loss": 0.4337, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 1.8599999999999999, |
|
"grad_norm": 0.2436402142047882, |
|
"learning_rate": 8.444987508813451e-08, |
|
"loss": 0.3344, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.862, |
|
"grad_norm": 0.23337677121162415, |
|
"learning_rate": 8.157079034633974e-08, |
|
"loss": 0.2967, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.20073962211608887, |
|
"learning_rate": 7.874123413208145e-08, |
|
"loss": 0.1952, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.866, |
|
"grad_norm": 0.2582467496395111, |
|
"learning_rate": 7.59612349389599e-08, |
|
"loss": 0.372, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.8679999999999999, |
|
"grad_norm": 0.2121819704771042, |
|
"learning_rate": 7.32308207615351e-08, |
|
"loss": 0.2619, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.16836410760879517, |
|
"learning_rate": 7.055001909504755e-08, |
|
"loss": 0.293, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 0.18819768726825714, |
|
"learning_rate": 6.791885693514134e-08, |
|
"loss": 0.2476, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.874, |
|
"grad_norm": 0.2157561331987381, |
|
"learning_rate": 6.533736077758868e-08, |
|
"loss": 0.2615, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 1.876, |
|
"grad_norm": 0.24670301377773285, |
|
"learning_rate": 6.280555661802857e-08, |
|
"loss": 0.371, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8780000000000001, |
|
"grad_norm": 0.21483668684959412, |
|
"learning_rate": 6.032346995169968e-08, |
|
"loss": 0.2231, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.1763847917318344, |
|
"learning_rate": 5.7891125773187896e-08, |
|
"loss": 0.2074, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.8820000000000001, |
|
"grad_norm": 0.20190970599651337, |
|
"learning_rate": 5.550854857617194e-08, |
|
"loss": 0.3226, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 1.884, |
|
"grad_norm": 0.23266001045703888, |
|
"learning_rate": 5.3175762353177563e-08, |
|
"loss": 0.3055, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.8860000000000001, |
|
"grad_norm": 0.26426488161087036, |
|
"learning_rate": 5.089279059533658e-08, |
|
"loss": 0.3319, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 0.24322916567325592, |
|
"learning_rate": 4.865965629214819e-08, |
|
"loss": 0.2372, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.8900000000000001, |
|
"grad_norm": 0.23628686368465424, |
|
"learning_rate": 4.6476381931251366e-08, |
|
"loss": 0.3808, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.892, |
|
"grad_norm": 0.16934725642204285, |
|
"learning_rate": 4.434298949819449e-08, |
|
"loss": 0.1737, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.8940000000000001, |
|
"grad_norm": 0.30660754442214966, |
|
"learning_rate": 4.225950047621441e-08, |
|
"loss": 0.3483, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 0.27640894055366516, |
|
"learning_rate": 4.02259358460233e-08, |
|
"loss": 0.3264, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.8980000000000001, |
|
"grad_norm": 0.2123912125825882, |
|
"learning_rate": 3.8242316085594923e-08, |
|
"loss": 0.3876, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.2987152636051178, |
|
"learning_rate": 3.630866116995757e-08, |
|
"loss": 0.4525, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.9020000000000001, |
|
"grad_norm": 0.22001074254512787, |
|
"learning_rate": 3.44249905709948e-08, |
|
"loss": 0.1842, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 0.20775096118450165, |
|
"learning_rate": 3.25913232572489e-08, |
|
"loss": 0.3012, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.9060000000000001, |
|
"grad_norm": 0.19180834293365479, |
|
"learning_rate": 3.080767769372939e-08, |
|
"loss": 0.2681, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.908, |
|
"grad_norm": 0.22222468256950378, |
|
"learning_rate": 2.907407184172706e-08, |
|
"loss": 0.1809, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.9100000000000001, |
|
"grad_norm": 0.20555076003074646, |
|
"learning_rate": 2.7390523158633552e-08, |
|
"loss": 0.1482, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 0.29668375849723816, |
|
"learning_rate": 2.57570485977654e-08, |
|
"loss": 0.2179, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.9140000000000001, |
|
"grad_norm": 0.19830183684825897, |
|
"learning_rate": 2.4173664608193592e-08, |
|
"loss": 0.2677, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 1.916, |
|
"grad_norm": 0.23050029575824738, |
|
"learning_rate": 2.264038713457706e-08, |
|
"loss": 0.3348, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.9180000000000001, |
|
"grad_norm": 0.36921679973602295, |
|
"learning_rate": 2.1157231617002783e-08, |
|
"loss": 0.4821, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.16172367334365845, |
|
"learning_rate": 1.9724212990830938e-08, |
|
"loss": 0.2348, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.9220000000000002, |
|
"grad_norm": 0.18016183376312256, |
|
"learning_rate": 1.834134568654333e-08, |
|
"loss": 0.2486, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 1.924, |
|
"grad_norm": 0.32527899742126465, |
|
"learning_rate": 1.7008643629596866e-08, |
|
"loss": 0.3623, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.9260000000000002, |
|
"grad_norm": 0.21802493929862976, |
|
"learning_rate": 1.5726120240288632e-08, |
|
"loss": 0.2155, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.23393763601779938, |
|
"learning_rate": 1.449378843361271e-08, |
|
"loss": 0.284, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.9300000000000002, |
|
"grad_norm": 0.2498655915260315, |
|
"learning_rate": 1.3311660619138578e-08, |
|
"loss": 0.2816, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 1.932, |
|
"grad_norm": 0.20273719727993011, |
|
"learning_rate": 1.2179748700879013e-08, |
|
"loss": 0.2945, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.9340000000000002, |
|
"grad_norm": 0.16979333758354187, |
|
"learning_rate": 1.109806407717462e-08, |
|
"loss": 0.1949, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 0.18881943821907043, |
|
"learning_rate": 1.006661764057837e-08, |
|
"loss": 0.2681, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.938, |
|
"grad_norm": 0.23016507923603058, |
|
"learning_rate": 9.085419777743465e-09, |
|
"loss": 0.4162, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.21829769015312195, |
|
"learning_rate": 8.15448036932176e-09, |
|
"loss": 0.3911, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 1.942, |
|
"grad_norm": 0.192356139421463, |
|
"learning_rate": 7.273808789862724e-09, |
|
"loss": 0.3076, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 0.20806097984313965, |
|
"learning_rate": 6.4434139077201865e-09, |
|
"loss": 0.2808, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.946, |
|
"grad_norm": 0.2533554434776306, |
|
"learning_rate": 5.6633040849601865e-09, |
|
"loss": 0.264, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.948, |
|
"grad_norm": 0.25440603494644165, |
|
"learning_rate": 4.933487177280483e-09, |
|
"loss": 0.386, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.2403300553560257, |
|
"learning_rate": 4.253970533929508e-09, |
|
"loss": 0.2665, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.18095187842845917, |
|
"learning_rate": 3.6247609976319818e-09, |
|
"loss": 0.2414, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.954, |
|
"grad_norm": 0.43698740005493164, |
|
"learning_rate": 3.0458649045211897e-09, |
|
"loss": 0.4131, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 1.956, |
|
"grad_norm": 0.2908496856689453, |
|
"learning_rate": 2.5172880840745873e-09, |
|
"loss": 0.2955, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.958, |
|
"grad_norm": 0.19435322284698486, |
|
"learning_rate": 2.0390358590538507e-09, |
|
"loss": 0.1839, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.20639224350452423, |
|
"learning_rate": 1.61111304545436e-09, |
|
"loss": 0.336, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.962, |
|
"grad_norm": 0.18591168522834778, |
|
"learning_rate": 1.2335239524541298e-09, |
|
"loss": 0.2653, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 1.964, |
|
"grad_norm": 0.2295517921447754, |
|
"learning_rate": 9.062723823710651e-10, |
|
"loss": 0.3478, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.966, |
|
"grad_norm": 0.2810915410518646, |
|
"learning_rate": 6.293616306246586e-10, |
|
"loss": 0.3266, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 0.19316555559635162, |
|
"learning_rate": 4.027944857032395e-10, |
|
"loss": 0.2753, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.24243375658988953, |
|
"learning_rate": 2.265732291356626e-10, |
|
"loss": 0.2786, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 1.972, |
|
"grad_norm": 0.27688726782798767, |
|
"learning_rate": 1.0069963546743833e-10, |
|
"loss": 0.2615, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.974, |
|
"grad_norm": 0.18696589767932892, |
|
"learning_rate": 2.5174972244634834e-11, |
|
"loss": 0.2866, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 0.21791526675224304, |
|
"learning_rate": 0.0, |
|
"loss": 0.2074, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"eval_loss": 0.26330506801605225, |
|
"eval_runtime": 76.7272, |
|
"eval_samples_per_second": 7.194, |
|
"eval_steps_per_second": 0.899, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.531674674724864e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|