{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.981177899210686,
  "eval_steps": 26,
  "global_step": 822,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0024286581663630845,
      "grad_norm": 3.421875,
      "learning_rate": 1.25e-06,
      "loss": 0.9095,
      "step": 1
    },
    {
      "epoch": 0.0024286581663630845,
      "eval_loss": 0.8089314699172974,
      "eval_runtime": 98.8099,
      "eval_samples_per_second": 30.361,
      "eval_steps_per_second": 3.795,
      "step": 1
    },
    {
      "epoch": 0.004857316332726169,
      "grad_norm": 3.40625,
      "learning_rate": 2.5e-06,
      "loss": 0.8146,
      "step": 2
    },
    {
      "epoch": 0.007285974499089253,
      "grad_norm": 3.265625,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.806,
      "step": 3
    },
    {
      "epoch": 0.009714632665452338,
      "grad_norm": 2.453125,
      "learning_rate": 5e-06,
      "loss": 0.781,
      "step": 4
    },
    {
      "epoch": 0.012143290831815421,
      "grad_norm": 2.015625,
      "learning_rate": 6.25e-06,
      "loss": 0.7774,
      "step": 5
    },
    {
      "epoch": 0.014571948998178506,
      "grad_norm": 1.953125,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.776,
      "step": 6
    },
    {
      "epoch": 0.01700060716454159,
      "grad_norm": 2.03125,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.7554,
      "step": 7
    },
    {
      "epoch": 0.019429265330904676,
      "grad_norm": 1.1640625,
      "learning_rate": 1e-05,
      "loss": 0.7362,
      "step": 8
    },
    {
      "epoch": 0.02185792349726776,
      "grad_norm": 0.97265625,
      "learning_rate": 1.125e-05,
      "loss": 0.7365,
      "step": 9
    },
    {
      "epoch": 0.024286581663630843,
      "grad_norm": 1.1640625,
      "learning_rate": 1.25e-05,
      "loss": 0.7183,
      "step": 10
    },
    {
      "epoch": 0.02671523982999393,
      "grad_norm": 1.1953125,
      "learning_rate": 1.375e-05,
      "loss": 0.7153,
      "step": 11
    },
    {
      "epoch": 0.029143897996357013,
      "grad_norm": 1.2109375,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.8111,
      "step": 12
    },
    {
      "epoch": 0.031572556162720096,
      "grad_norm": 0.78125,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 0.6966,
      "step": 13
    },
    {
      "epoch": 0.03400121432908318,
      "grad_norm": 0.640625,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 0.7068,
      "step": 14
    },
    {
      "epoch": 0.03642987249544627,
      "grad_norm": 0.6484375,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.6915,
      "step": 15
    },
    {
      "epoch": 0.03885853066180935,
      "grad_norm": 0.6796875,
      "learning_rate": 2e-05,
      "loss": 0.6878,
      "step": 16
    },
    {
      "epoch": 0.041287188828172436,
      "grad_norm": 0.61328125,
      "learning_rate": 1.999992403752328e-05,
      "loss": 0.6902,
      "step": 17
    },
    {
      "epoch": 0.04371584699453552,
      "grad_norm": 0.55859375,
      "learning_rate": 1.999969615124717e-05,
      "loss": 0.6818,
      "step": 18
    },
    {
      "epoch": 0.0461445051608986,
      "grad_norm": 0.52734375,
      "learning_rate": 1.999931634463383e-05,
      "loss": 0.6732,
      "step": 19
    },
    {
      "epoch": 0.048573163327261686,
      "grad_norm": 0.47265625,
      "learning_rate": 1.9998784623453477e-05,
      "loss": 0.6693,
      "step": 20
    },
    {
      "epoch": 0.051001821493624776,
      "grad_norm": 0.47265625,
      "learning_rate": 1.999810099578428e-05,
      "loss": 0.6663,
      "step": 21
    },
    {
      "epoch": 0.05343047965998786,
      "grad_norm": 0.72265625,
      "learning_rate": 1.9997265472012247e-05,
      "loss": 0.7473,
      "step": 22
    },
    {
      "epoch": 0.05585913782635094,
      "grad_norm": 0.462890625,
      "learning_rate": 1.999627806483107e-05,
      "loss": 0.6457,
      "step": 23
    },
    {
      "epoch": 0.058287795992714025,
      "grad_norm": 0.70703125,
      "learning_rate": 1.999513878924193e-05,
      "loss": 0.7388,
      "step": 24
    },
    {
      "epoch": 0.06071645415907711,
      "grad_norm": 0.447265625,
      "learning_rate": 1.9993847662553264e-05,
      "loss": 0.6505,
      "step": 25
    },
    {
      "epoch": 0.06314511232544019,
      "grad_norm": 0.4453125,
      "learning_rate": 1.9992404704380513e-05,
      "loss": 0.6388,
      "step": 26
    },
    {
      "epoch": 0.06314511232544019,
      "eval_loss": 0.649046003818512,
      "eval_runtime": 97.2348,
      "eval_samples_per_second": 30.853,
      "eval_steps_per_second": 3.857,
      "step": 26
    },
    {
      "epoch": 0.06557377049180328,
      "grad_norm": 0.416015625,
      "learning_rate": 1.9990809936645804e-05,
      "loss": 0.6507,
      "step": 27
    },
    {
      "epoch": 0.06800242865816636,
      "grad_norm": 0.42578125,
      "learning_rate": 1.9989063383577644e-05,
      "loss": 0.6536,
      "step": 28
    },
    {
      "epoch": 0.07043108682452945,
      "grad_norm": 0.42578125,
      "learning_rate": 1.998716507171053e-05,
      "loss": 0.6508,
      "step": 29
    },
    {
      "epoch": 0.07285974499089254,
      "grad_norm": 0.412109375,
      "learning_rate": 1.9985115029884556e-05,
      "loss": 0.6465,
      "step": 30
    },
    {
      "epoch": 0.07528840315725562,
      "grad_norm": 0.5546875,
      "learning_rate": 1.9982913289244977e-05,
      "loss": 0.7309,
      "step": 31
    },
    {
      "epoch": 0.0777170613236187,
      "grad_norm": 0.396484375,
      "learning_rate": 1.9980559883241723e-05,
      "loss": 0.6319,
      "step": 32
    },
    {
      "epoch": 0.08014571948998178,
      "grad_norm": 0.392578125,
      "learning_rate": 1.9978054847628908e-05,
      "loss": 0.6309,
      "step": 33
    },
    {
      "epoch": 0.08257437765634487,
      "grad_norm": 0.392578125,
      "learning_rate": 1.9975398220464268e-05,
      "loss": 0.6301,
      "step": 34
    },
    {
      "epoch": 0.08500303582270795,
      "grad_norm": 0.392578125,
      "learning_rate": 1.9972590042108605e-05,
      "loss": 0.6364,
      "step": 35
    },
    {
      "epoch": 0.08743169398907104,
      "grad_norm": 0.400390625,
      "learning_rate": 1.996963035522515e-05,
      "loss": 0.6303,
      "step": 36
    },
    {
      "epoch": 0.08986035215543413,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9966519204778937e-05,
      "loss": 0.6374,
      "step": 37
    },
    {
      "epoch": 0.0922890103217972,
      "grad_norm": 0.390625,
      "learning_rate": 1.99632566380361e-05,
      "loss": 0.6177,
      "step": 38
    },
    {
      "epoch": 0.0947176684881603,
      "grad_norm": 0.38671875,
      "learning_rate": 1.995984270456317e-05,
      "loss": 0.6259,
      "step": 39
    },
    {
      "epoch": 0.09714632665452337,
      "grad_norm": 0.380859375,
      "learning_rate": 1.995627745622632e-05,
      "loss": 0.6311,
      "step": 40
    },
    {
      "epoch": 0.09957498482088646,
      "grad_norm": 0.39453125,
      "learning_rate": 1.9952560947190568e-05,
      "loss": 0.6254,
      "step": 41
    },
    {
      "epoch": 0.10200364298724955,
      "grad_norm": 0.376953125,
      "learning_rate": 1.994869323391895e-05,
      "loss": 0.6197,
      "step": 42
    },
    {
      "epoch": 0.10443230115361263,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9944674375171697e-05,
      "loss": 0.6147,
      "step": 43
    },
    {
      "epoch": 0.10686095931997572,
      "grad_norm": 0.380859375,
      "learning_rate": 1.9940504432005293e-05,
      "loss": 0.6281,
      "step": 44
    },
    {
      "epoch": 0.1092896174863388,
      "grad_norm": 0.36328125,
      "learning_rate": 1.993618346777158e-05,
      "loss": 0.6142,
      "step": 45
    },
    {
      "epoch": 0.11171827565270188,
      "grad_norm": 0.373046875,
      "learning_rate": 1.993171154811679e-05,
      "loss": 0.6182,
      "step": 46
    },
    {
      "epoch": 0.11414693381906496,
      "grad_norm": 0.376953125,
      "learning_rate": 1.992708874098054e-05,
      "loss": 0.6181,
      "step": 47
    },
    {
      "epoch": 0.11657559198542805,
      "grad_norm": 0.375,
      "learning_rate": 1.992231511659481e-05,
      "loss": 0.6136,
      "step": 48
    },
    {
      "epoch": 0.11900425015179114,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9917390747482855e-05,
      "loss": 0.6052,
      "step": 49
    },
    {
      "epoch": 0.12143290831815422,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9912315708458144e-05,
      "loss": 0.6087,
      "step": 50
    },
    {
      "epoch": 0.12386156648451731,
      "grad_norm": 0.359375,
      "learning_rate": 1.9907090076623174e-05,
      "loss": 0.6031,
      "step": 51
    },
    {
      "epoch": 0.12629022465088038,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9901713931368333e-05,
      "loss": 0.6131,
      "step": 52
    },
    {
      "epoch": 0.12629022465088038,
      "eval_loss": 0.612246572971344,
      "eval_runtime": 97.1281,
      "eval_samples_per_second": 30.887,
      "eval_steps_per_second": 3.861,
      "step": 52
    },
    {
      "epoch": 0.12871888281724347,
      "grad_norm": 0.455078125,
      "learning_rate": 1.989618735437069e-05,
      "loss": 0.702,
      "step": 53
    },
    {
      "epoch": 0.13114754098360656,
      "grad_norm": 0.380859375,
      "learning_rate": 1.989051042959273e-05,
      "loss": 0.6192,
      "step": 54
    },
    {
      "epoch": 0.13357619914996965,
      "grad_norm": 0.388671875,
      "learning_rate": 1.9884683243281117e-05,
      "loss": 0.612,
      "step": 55
    },
    {
      "epoch": 0.13600485731633272,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9878705883965342e-05,
      "loss": 0.6026,
      "step": 56
    },
    {
      "epoch": 0.1384335154826958,
      "grad_norm": 0.380859375,
      "learning_rate": 1.9872578442456415e-05,
      "loss": 0.6044,
      "step": 57
    },
    {
      "epoch": 0.1408621736490589,
      "grad_norm": 0.404296875,
      "learning_rate": 1.986630101184546e-05,
      "loss": 0.6061,
      "step": 58
    },
    {
      "epoch": 0.143290831815422,
      "grad_norm": 0.390625,
      "learning_rate": 1.9859873687502317e-05,
      "loss": 0.6113,
      "step": 59
    },
    {
      "epoch": 0.14571948998178508,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9853296567074075e-05,
      "loss": 0.5933,
      "step": 60
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.38671875,
      "learning_rate": 1.9846569750483605e-05,
      "loss": 0.6046,
      "step": 61
    },
    {
      "epoch": 0.15057680631451123,
      "grad_norm": 0.3984375,
      "learning_rate": 1.983969333992804e-05,
      "loss": 0.6079,
      "step": 62
    },
    {
      "epoch": 0.15300546448087432,
      "grad_norm": 0.392578125,
      "learning_rate": 1.9832667439877217e-05,
      "loss": 0.6098,
      "step": 63
    },
    {
      "epoch": 0.1554341226472374,
      "grad_norm": 0.373046875,
      "learning_rate": 1.982549215707209e-05,
      "loss": 0.5942,
      "step": 64
    },
    {
      "epoch": 0.15786278081360047,
      "grad_norm": 0.376953125,
      "learning_rate": 1.98181676005231e-05,
      "loss": 0.6082,
      "step": 65
    },
    {
      "epoch": 0.16029143897996356,
      "grad_norm": 0.478515625,
      "learning_rate": 1.9810693881508548e-05,
      "loss": 0.6838,
      "step": 66
    },
    {
      "epoch": 0.16272009714632665,
      "grad_norm": 0.390625,
      "learning_rate": 1.980307111357288e-05,
      "loss": 0.5919,
      "step": 67
    },
    {
      "epoch": 0.16514875531268974,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9795299412524948e-05,
      "loss": 0.5769,
      "step": 68
    },
    {
      "epoch": 0.16757741347905283,
      "grad_norm": 0.3828125,
      "learning_rate": 1.9787378896436292e-05,
      "loss": 0.6,
      "step": 69
    },
    {
      "epoch": 0.1700060716454159,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9779309685639317e-05,
      "loss": 0.5963,
      "step": 70
    },
    {
      "epoch": 0.172434729811779,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9771091902725465e-05,
      "loss": 0.5954,
      "step": 71
    },
    {
      "epoch": 0.17486338797814208,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9762725672543372e-05,
      "loss": 0.5892,
      "step": 72
    },
    {
      "epoch": 0.17729204614450517,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9754211122196945e-05,
      "loss": 0.5883,
      "step": 73
    },
    {
      "epoch": 0.17972070431086826,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9745548381043454e-05,
      "loss": 0.5925,
      "step": 74
    },
    {
      "epoch": 0.18214936247723132,
      "grad_norm": 0.39453125,
      "learning_rate": 1.9736737580691553e-05,
      "loss": 0.5867,
      "step": 75
    },
    {
      "epoch": 0.1845780206435944,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9727778854999283e-05,
      "loss": 0.5931,
      "step": 76
    },
    {
      "epoch": 0.1870066788099575,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9718672340072044e-05,
      "loss": 0.5858,
      "step": 77
    },
    {
      "epoch": 0.1894353369763206,
      "grad_norm": 0.380859375,
      "learning_rate": 1.9709418174260523e-05,
      "loss": 0.5933,
      "step": 78
    },
    {
      "epoch": 0.1894353369763206,
      "eval_loss": 0.5919594168663025,
      "eval_runtime": 97.3358,
      "eval_samples_per_second": 30.821,
      "eval_steps_per_second": 3.853,
      "step": 78
    },
    {
      "epoch": 0.19186399514268368,
      "grad_norm": 0.3671875,
      "learning_rate": 1.970001649815859e-05,
      "loss": 0.5753,
      "step": 79
    },
    {
      "epoch": 0.19429265330904674,
      "grad_norm": 0.380859375,
      "learning_rate": 1.969046745460116e-05,
      "loss": 0.5892,
      "step": 80
    },
    {
      "epoch": 0.19672131147540983,
      "grad_norm": 0.3828125,
      "learning_rate": 1.9680771188662044e-05,
      "loss": 0.5917,
      "step": 81
    },
    {
      "epoch": 0.19914996964177292,
      "grad_norm": 0.38671875,
      "learning_rate": 1.9670927847651707e-05,
      "loss": 0.5913,
      "step": 82
    },
    {
      "epoch": 0.201578627808136,
      "grad_norm": 0.37890625,
      "learning_rate": 1.9660937581115073e-05,
      "loss": 0.5787,
      "step": 83
    },
    {
      "epoch": 0.2040072859744991,
      "grad_norm": 0.37109375,
      "learning_rate": 1.9650800540829204e-05,
      "loss": 0.5779,
      "step": 84
    },
    {
      "epoch": 0.20643594414086217,
      "grad_norm": 0.376953125,
      "learning_rate": 1.964051688080105e-05,
      "loss": 0.5912,
      "step": 85
    },
    {
      "epoch": 0.20886460230722526,
      "grad_norm": 0.380859375,
      "learning_rate": 1.963008675726506e-05,
      "loss": 0.5879,
      "step": 86
    },
    {
      "epoch": 0.21129326047358835,
      "grad_norm": 0.3671875,
      "learning_rate": 1.9619510328680847e-05,
      "loss": 0.5905,
      "step": 87
    },
    {
      "epoch": 0.21372191863995144,
      "grad_norm": 0.375,
      "learning_rate": 1.9608787755730746e-05,
      "loss": 0.5789,
      "step": 88
    },
    {
      "epoch": 0.2161505768063145,
      "grad_norm": 0.37890625,
      "learning_rate": 1.9597919201317393e-05,
      "loss": 0.5824,
      "step": 89
    },
    {
      "epoch": 0.2185792349726776,
      "grad_norm": 0.37109375,
      "learning_rate": 1.958690483056126e-05,
      "loss": 0.5841,
      "step": 90
    },
    {
      "epoch": 0.22100789313904068,
      "grad_norm": 0.37109375,
      "learning_rate": 1.9575744810798118e-05,
      "loss": 0.5709,
      "step": 91
    },
    {
      "epoch": 0.22343655130540377,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9564439311576515e-05,
      "loss": 0.5799,
      "step": 92
    },
    {
      "epoch": 0.22586520947176686,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9552988504655194e-05,
      "loss": 0.5757,
      "step": 93
    },
    {
      "epoch": 0.22829386763812992,
      "grad_norm": 0.365234375,
      "learning_rate": 1.954139256400049e-05,
      "loss": 0.5768,
      "step": 94
    },
    {
      "epoch": 0.230722525804493,
      "grad_norm": 0.470703125,
      "learning_rate": 1.9529651665783675e-05,
      "loss": 0.6447,
      "step": 95
    },
    {
      "epoch": 0.2331511839708561,
      "grad_norm": 0.375,
      "learning_rate": 1.951776598837829e-05,
      "loss": 0.5888,
      "step": 96
    },
    {
      "epoch": 0.2355798421372192,
      "grad_norm": 0.4453125,
      "learning_rate": 1.9505735712357437e-05,
      "loss": 0.6567,
      "step": 97
    },
    {
      "epoch": 0.23800850030358228,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9493561020491024e-05,
      "loss": 0.5866,
      "step": 98
    },
    {
      "epoch": 0.24043715846994534,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9481242097743002e-05,
      "loss": 0.5775,
      "step": 99
    },
    {
      "epoch": 0.24286581663630843,
      "grad_norm": 0.369140625,
      "learning_rate": 1.9468779131268553e-05,
      "loss": 0.5796,
      "step": 100
    },
    {
      "epoch": 0.24529447480267152,
      "grad_norm": 0.375,
      "learning_rate": 1.9456172310411228e-05,
      "loss": 0.5763,
      "step": 101
    },
    {
      "epoch": 0.24772313296903462,
      "grad_norm": 0.3828125,
      "learning_rate": 1.9443421826700096e-05,
      "loss": 0.5766,
      "step": 102
    },
    {
      "epoch": 0.2501517911353977,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9430527873846826e-05,
      "loss": 0.5766,
      "step": 103
    },
    {
      "epoch": 0.25258044930176077,
      "grad_norm": 0.3671875,
      "learning_rate": 1.9417490647742738e-05,
      "loss": 0.5796,
      "step": 104
    },
    {
      "epoch": 0.25258044930176077,
      "eval_loss": 0.5772241950035095,
      "eval_runtime": 97.0571,
      "eval_samples_per_second": 30.91,
      "eval_steps_per_second": 3.864,
      "step": 104
    },
    {
      "epoch": 0.2550091074681239,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9404310346455822e-05,
      "loss": 0.5762,
      "step": 105
    },
    {
      "epoch": 0.25743776563448695,
      "grad_norm": 0.3828125,
      "learning_rate": 1.9390987170227746e-05,
      "loss": 0.5833,
      "step": 106
    },
    {
      "epoch": 0.25986642380085,
      "grad_norm": 0.37890625,
      "learning_rate": 1.9377521321470806e-05,
      "loss": 0.5739,
      "step": 107
    },
    {
      "epoch": 0.26229508196721313,
      "grad_norm": 0.380859375,
      "learning_rate": 1.9363913004764847e-05,
      "loss": 0.5771,
      "step": 108
    },
    {
      "epoch": 0.2647237401335762,
      "grad_norm": 0.361328125,
      "learning_rate": 1.9350162426854152e-05,
      "loss": 0.5674,
      "step": 109
    },
    {
      "epoch": 0.2671523982999393,
      "grad_norm": 0.36328125,
      "learning_rate": 1.9336269796644314e-05,
      "loss": 0.5698,
      "step": 110
    },
    {
      "epoch": 0.26958105646630237,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9322235325199054e-05,
      "loss": 0.5681,
      "step": 111
    },
    {
      "epoch": 0.27200971463266543,
      "grad_norm": 0.37109375,
      "learning_rate": 1.9308059225737015e-05,
      "loss": 0.5615,
      "step": 112
    },
    {
      "epoch": 0.27443837279902855,
      "grad_norm": 0.373046875,
      "learning_rate": 1.9293741713628518e-05,
      "loss": 0.5765,
      "step": 113
    },
    {
      "epoch": 0.2768670309653916,
      "grad_norm": 0.375,
      "learning_rate": 1.9279283006392304e-05,
      "loss": 0.5633,
      "step": 114
    },
    {
      "epoch": 0.27929568913175473,
      "grad_norm": 0.37890625,
      "learning_rate": 1.9264683323692213e-05,
      "loss": 0.5629,
      "step": 115
    },
    {
      "epoch": 0.2817243472981178,
      "grad_norm": 0.376953125,
      "learning_rate": 1.924994288733386e-05,
      "loss": 0.5707,
      "step": 116
    },
    {
      "epoch": 0.28415300546448086,
      "grad_norm": 0.3828125,
      "learning_rate": 1.9235061921261248e-05,
      "loss": 0.5658,
      "step": 117
    },
    {
      "epoch": 0.286581663630844,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9220040651553388e-05,
      "loss": 0.5672,
      "step": 118
    },
    {
      "epoch": 0.28901032179720704,
      "grad_norm": 0.427734375,
      "learning_rate": 1.9204879306420852e-05,
      "loss": 0.5644,
      "step": 119
    },
    {
      "epoch": 0.29143897996357016,
      "grad_norm": 0.515625,
      "learning_rate": 1.918957811620231e-05,
      "loss": 0.658,
      "step": 120
    },
    {
      "epoch": 0.2938676381299332,
      "grad_norm": 0.384765625,
      "learning_rate": 1.9174137313361012e-05,
      "loss": 0.5673,
      "step": 121
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.419921875,
      "learning_rate": 1.915855713248129e-05,
      "loss": 0.5713,
      "step": 122
    },
    {
      "epoch": 0.2987249544626594,
      "grad_norm": 0.376953125,
      "learning_rate": 1.9142837810264972e-05,
      "loss": 0.5605,
      "step": 123
    },
    {
      "epoch": 0.30115361262902246,
      "grad_norm": 0.451171875,
      "learning_rate": 1.912697958552778e-05,
      "loss": 0.634,
      "step": 124
    },
    {
      "epoch": 0.3035822707953855,
      "grad_norm": 0.39453125,
      "learning_rate": 1.9110982699195724e-05,
      "loss": 0.5743,
      "step": 125
    },
    {
      "epoch": 0.30601092896174864,
      "grad_norm": 0.41015625,
      "learning_rate": 1.9094847394301427e-05,
      "loss": 0.5743,
      "step": 126
    },
    {
      "epoch": 0.3084395871281117,
      "grad_norm": 0.388671875,
      "learning_rate": 1.907857391598043e-05,
      "loss": 0.5685,
      "step": 127
    },
    {
      "epoch": 0.3108682452944748,
      "grad_norm": 0.37890625,
      "learning_rate": 1.906216251146748e-05,
      "loss": 0.5718,
      "step": 128
    },
    {
      "epoch": 0.3132969034608379,
      "grad_norm": 0.392578125,
      "learning_rate": 1.904561343009276e-05,
      "loss": 0.5666,
      "step": 129
    },
    {
      "epoch": 0.31572556162720095,
      "grad_norm": 0.380859375,
      "learning_rate": 1.902892692327811e-05,
      "loss": 0.5487,
      "step": 130
    },
    {
      "epoch": 0.31572556162720095,
      "eval_loss": 0.565579354763031,
      "eval_runtime": 96.8785,
      "eval_samples_per_second": 30.967,
      "eval_steps_per_second": 3.871,
      "step": 130
    },
    {
      "epoch": 0.31815421979356406,
      "grad_norm": 0.400390625,
      "learning_rate": 1.9012103244533217e-05,
      "loss": 0.5662,
      "step": 131
    },
    {
      "epoch": 0.3205828779599271,
      "grad_norm": 0.376953125,
      "learning_rate": 1.899514264945173e-05,
      "loss": 0.5692,
      "step": 132
    },
    {
      "epoch": 0.32301153612629024,
      "grad_norm": 0.37109375,
      "learning_rate": 1.897804539570742e-05,
      "loss": 0.5571,
      "step": 133
    },
    {
      "epoch": 0.3254401942926533,
      "grad_norm": 0.384765625,
      "learning_rate": 1.8960811743050227e-05,
      "loss": 0.553,
      "step": 134
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 0.3828125,
      "learning_rate": 1.8943441953302346e-05,
      "loss": 0.5598,
      "step": 135
    },
    {
      "epoch": 0.3302975106253795,
      "grad_norm": 0.388671875,
      "learning_rate": 1.8925936290354224e-05,
      "loss": 0.5624,
      "step": 136
    },
    {
      "epoch": 0.33272616879174255,
      "grad_norm": 0.37109375,
      "learning_rate": 1.890829502016056e-05,
      "loss": 0.5597,
      "step": 137
    },
    {
      "epoch": 0.33515482695810567,
      "grad_norm": 0.380859375,
      "learning_rate": 1.8890518410736275e-05,
      "loss": 0.5575,
      "step": 138
    },
    {
      "epoch": 0.33758348512446873,
      "grad_norm": 0.37109375,
      "learning_rate": 1.8872606732152426e-05,
      "loss": 0.5575,
      "step": 139
    },
    {
      "epoch": 0.3400121432908318,
      "grad_norm": 0.373046875,
      "learning_rate": 1.8854560256532098e-05,
      "loss": 0.5549,
      "step": 140
    },
    {
      "epoch": 0.3424408014571949,
      "grad_norm": 0.388671875,
      "learning_rate": 1.8836379258046298e-05,
      "loss": 0.5671,
      "step": 141
    },
    {
      "epoch": 0.344869459623558,
      "grad_norm": 0.546875,
      "learning_rate": 1.8818064012909755e-05,
      "loss": 0.639,
      "step": 142
    },
    {
      "epoch": 0.3472981177899211,
      "grad_norm": 0.5078125,
      "learning_rate": 1.8799614799376743e-05,
      "loss": 0.6433,
      "step": 143
    },
    {
      "epoch": 0.34972677595628415,
      "grad_norm": 0.380859375,
      "learning_rate": 1.878103189773686e-05,
      "loss": 0.5656,
      "step": 144
    },
    {
      "epoch": 0.3521554341226472,
      "grad_norm": 0.404296875,
      "learning_rate": 1.876231559031075e-05,
      "loss": 0.5631,
      "step": 145
    },
    {
      "epoch": 0.35458409228901033,
      "grad_norm": 0.3828125,
      "learning_rate": 1.8743466161445823e-05,
      "loss": 0.5563,
      "step": 146
    },
    {
      "epoch": 0.3570127504553734,
      "grad_norm": 0.388671875,
      "learning_rate": 1.872448389751194e-05,
      "loss": 0.5569,
      "step": 147
    },
    {
      "epoch": 0.3594414086217365,
      "grad_norm": 0.3671875,
      "learning_rate": 1.8705369086897063e-05,
      "loss": 0.5548,
      "step": 148
    },
    {
      "epoch": 0.3618700667880996,
      "grad_norm": 0.39453125,
      "learning_rate": 1.8686122020002857e-05,
      "loss": 0.5587,
      "step": 149
    },
    {
      "epoch": 0.36429872495446264,
      "grad_norm": 0.390625,
      "learning_rate": 1.86667429892403e-05,
      "loss": 0.5508,
      "step": 150
    },
    {
      "epoch": 0.36672738312082576,
      "grad_norm": 0.373046875,
      "learning_rate": 1.8647232289025223e-05,
      "loss": 0.5594,
      "step": 151
    },
    {
      "epoch": 0.3691560412871888,
      "grad_norm": 0.37890625,
      "learning_rate": 1.862759021577385e-05,
      "loss": 0.5579,
      "step": 152
    },
    {
      "epoch": 0.37158469945355194,
      "grad_norm": 0.373046875,
      "learning_rate": 1.860781706789829e-05,
      "loss": 0.5503,
      "step": 153
    },
    {
      "epoch": 0.374013357619915,
      "grad_norm": 0.373046875,
      "learning_rate": 1.8587913145801998e-05,
      "loss": 0.5601,
      "step": 154
    },
    {
      "epoch": 0.37644201578627806,
      "grad_norm": 0.380859375,
      "learning_rate": 1.8567878751875218e-05,
      "loss": 0.5516,
      "step": 155
    },
    {
      "epoch": 0.3788706739526412,
      "grad_norm": 0.365234375,
      "learning_rate": 1.8547714190490385e-05,
      "loss": 0.552,
      "step": 156
    },
    {
      "epoch": 0.3788706739526412,
      "eval_loss": 0.5556911826133728,
      "eval_runtime": 96.9623,
      "eval_samples_per_second": 30.94,
      "eval_steps_per_second": 3.867,
      "step": 156
    },
    {
      "epoch": 0.38129933211900424,
      "grad_norm": 0.3828125,
      "learning_rate": 1.8527419767997506e-05,
      "loss": 0.5618,
      "step": 157
    },
    {
      "epoch": 0.38372799028536736,
      "grad_norm": 0.390625,
      "learning_rate": 1.8506995792719498e-05,
      "loss": 0.5561,
      "step": 158
    },
    {
      "epoch": 0.3861566484517304,
      "grad_norm": 0.3671875,
      "learning_rate": 1.848644257494751e-05,
      "loss": 0.5486,
      "step": 159
    },
    {
      "epoch": 0.3885853066180935,
      "grad_norm": 0.369140625,
      "learning_rate": 1.8465760426936212e-05,
      "loss": 0.5521,
      "step": 160
    },
    {
      "epoch": 0.3910139647844566,
      "grad_norm": 0.373046875,
      "learning_rate": 1.8444949662899038e-05,
      "loss": 0.5474,
      "step": 161
    },
    {
      "epoch": 0.39344262295081966,
      "grad_norm": 0.37109375,
      "learning_rate": 1.8424010599003424e-05,
      "loss": 0.5508,
      "step": 162
    },
    {
      "epoch": 0.3958712811171828,
      "grad_norm": 0.388671875,
      "learning_rate": 1.8402943553365998e-05,
      "loss": 0.5483,
      "step": 163
    },
    {
      "epoch": 0.39829993928354585,
      "grad_norm": 0.369140625,
      "learning_rate": 1.838174884604776e-05,
      "loss": 0.5525,
      "step": 164
    },
    {
      "epoch": 0.4007285974499089,
      "grad_norm": 0.376953125,
      "learning_rate": 1.8360426799049197e-05,
      "loss": 0.5512,
      "step": 165
    },
    {
      "epoch": 0.403157255616272,
      "grad_norm": 0.369140625,
      "learning_rate": 1.8338977736305408e-05,
      "loss": 0.5509,
      "step": 166
    },
    {
      "epoch": 0.4055859137826351,
      "grad_norm": 0.37890625,
      "learning_rate": 1.831740198368118e-05,
      "loss": 0.5403,
      "step": 167
    },
    {
      "epoch": 0.4080145719489982,
      "grad_norm": 0.3671875,
      "learning_rate": 1.8295699868966038e-05,
      "loss": 0.5507,
      "step": 168
    },
    {
      "epoch": 0.41044323011536127,
      "grad_norm": 0.376953125,
      "learning_rate": 1.8273871721869256e-05,
      "loss": 0.5354,
      "step": 169
    },
    {
      "epoch": 0.41287188828172433,
      "grad_norm": 0.361328125,
      "learning_rate": 1.8251917874014854e-05,
      "loss": 0.5483,
      "step": 170
    },
    {
      "epoch": 0.41530054644808745,
      "grad_norm": 0.373046875,
      "learning_rate": 1.8229838658936566e-05,
      "loss": 0.5416,
      "step": 171
    },
    {
      "epoch": 0.4177292046144505,
      "grad_norm": 0.38671875,
      "learning_rate": 1.8207634412072765e-05,
      "loss": 0.5547,
      "step": 172
    },
    {
      "epoch": 0.4201578627808136,
      "grad_norm": 0.384765625,
      "learning_rate": 1.8185305470761366e-05,
      "loss": 0.548,
      "step": 173
    },
    {
      "epoch": 0.4225865209471767,
      "grad_norm": 0.59375,
      "learning_rate": 1.8162852174234712e-05,
      "loss": 0.6328,
      "step": 174
    },
    {
      "epoch": 0.42501517911353975,
      "grad_norm": 0.37109375,
      "learning_rate": 1.81402748636144e-05,
      "loss": 0.5406,
      "step": 175
    },
    {
      "epoch": 0.42744383727990287,
      "grad_norm": 0.37890625,
      "learning_rate": 1.8117573881906114e-05,
      "loss": 0.5446,
      "step": 176
    },
    {
      "epoch": 0.42987249544626593,
      "grad_norm": 0.3828125,
      "learning_rate": 1.809474957399442e-05,
      "loss": 0.5591,
      "step": 177
    },
    {
      "epoch": 0.432301153612629,
      "grad_norm": 0.376953125,
      "learning_rate": 1.8071802286637505e-05,
      "loss": 0.5415,
      "step": 178
    },
    {
      "epoch": 0.4347298117789921,
      "grad_norm": 0.384765625,
      "learning_rate": 1.8048732368461927e-05,
      "loss": 0.5362,
      "step": 179
    },
    {
      "epoch": 0.4371584699453552,
      "grad_norm": 0.37890625,
      "learning_rate": 1.8025540169957315e-05,
      "loss": 0.5464,
      "step": 180
    },
    {
      "epoch": 0.4395871281117183,
      "grad_norm": 0.384765625,
      "learning_rate": 1.8002226043471025e-05,
      "loss": 0.544,
      "step": 181
    },
    {
      "epoch": 0.44201578627808136,
      "grad_norm": 0.388671875,
      "learning_rate": 1.7978790343202826e-05,
      "loss": 0.5567,
      "step": 182
    },
    {
      "epoch": 0.44201578627808136,
      "eval_loss": 0.5476920008659363,
      "eval_runtime": 97.0095,
      "eval_samples_per_second": 30.925,
      "eval_steps_per_second": 3.866,
      "step": 182
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.369140625,
      "learning_rate": 1.795523342519948e-05,
      "loss": 0.5349,
      "step": 183
    },
    {
      "epoch": 0.44687310261080754,
      "grad_norm": 0.38671875,
      "learning_rate": 1.7931555647349358e-05,
      "loss": 0.5494,
      "step": 184
    },
    {
      "epoch": 0.4493017607771706,
      "grad_norm": 0.37890625,
      "learning_rate": 1.7907757369376984e-05,
      "loss": 0.5431,
      "step": 185
    },
    {
      "epoch": 0.4517304189435337,
      "grad_norm": 0.380859375,
      "learning_rate": 1.7883838952837595e-05,
      "loss": 0.5455,
      "step": 186
    },
    {
      "epoch": 0.4541590771098968,
      "grad_norm": 0.40625,
      "learning_rate": 1.785980076111161e-05,
      "loss": 0.5475,
      "step": 187
    },
    {
      "epoch": 0.45658773527625984,
      "grad_norm": 0.384765625,
      "learning_rate": 1.7835643159399156e-05,
      "loss": 0.5426,
      "step": 188
    },
    {
      "epoch": 0.45901639344262296,
      "grad_norm": 0.384765625,
      "learning_rate": 1.7811366514714475e-05,
      "loss": 0.549,
      "step": 189
    },
    {
      "epoch": 0.461445051608986,
      "grad_norm": 0.3671875,
      "learning_rate": 1.778697119588039e-05,
      "loss": 0.5409,
      "step": 190
    },
    {
      "epoch": 0.46387370977534914,
      "grad_norm": 0.55078125,
      "learning_rate": 1.7762457573522658e-05,
      "loss": 0.6053,
      "step": 191
    },
    {
      "epoch": 0.4663023679417122,
      "grad_norm": 0.375,
      "learning_rate": 1.7737826020064377e-05,
      "loss": 0.5487,
      "step": 192
    },
    {
      "epoch": 0.46873102610807527,
      "grad_norm": 0.4140625,
      "learning_rate": 1.771307690972031e-05,
      "loss": 0.5347,
      "step": 193
    },
    {
      "epoch": 0.4711596842744384,
      "grad_norm": 0.37109375,
      "learning_rate": 1.76882106184912e-05,
      "loss": 0.5525,
      "step": 194
    },
    {
      "epoch": 0.47358834244080145,
      "grad_norm": 0.37890625,
      "learning_rate": 1.7663227524158053e-05,
      "loss": 0.5423,
      "step": 195
    },
    {
      "epoch": 0.47601700060716456,
      "grad_norm": 0.38671875,
      "learning_rate": 1.7638128006276422e-05,
      "loss": 0.5526,
      "step": 196
    },
    {
      "epoch": 0.4784456587735276,
      "grad_norm": 0.369140625,
      "learning_rate": 1.7612912446170615e-05,
      "loss": 0.5464,
      "step": 197
    },
    {
      "epoch": 0.4808743169398907,
      "grad_norm": 0.69921875,
      "learning_rate": 1.758758122692791e-05,
      "loss": 0.6096,
      "step": 198
    },
    {
      "epoch": 0.4833029751062538,
      "grad_norm": 0.384765625,
      "learning_rate": 1.7562134733392736e-05,
      "loss": 0.5399,
      "step": 199
    },
    {
      "epoch": 0.48573163327261687,
      "grad_norm": 0.39453125,
      "learning_rate": 1.753657335216083e-05,
      "loss": 0.5503,
      "step": 200
    },
    {
      "epoch": 0.48816029143898,
      "grad_norm": 0.373046875,
      "learning_rate": 1.751089747157336e-05,
      "loss": 0.5389,
      "step": 201
    },
    {
      "epoch": 0.49058894960534305,
      "grad_norm": 0.3828125,
      "learning_rate": 1.7485107481711014e-05,
      "loss": 0.548,
      "step": 202
    },
    {
      "epoch": 0.4930176077717061,
      "grad_norm": 0.412109375,
      "learning_rate": 1.7459203774388097e-05,
      "loss": 0.5404,
      "step": 203
    },
    {
      "epoch": 0.49544626593806923,
      "grad_norm": 0.37890625,
      "learning_rate": 1.743318674314656e-05,
      "loss": 0.5497,
      "step": 204
    },
    {
      "epoch": 0.4978749241044323,
      "grad_norm": 0.373046875,
      "learning_rate": 1.740705678325004e-05,
      "loss": 0.5313,
      "step": 205
    },
    {
      "epoch": 0.5003035822707954,
      "grad_norm": 0.375,
      "learning_rate": 1.7380814291677818e-05,
      "loss": 0.5446,
      "step": 206
    },
    {
      "epoch": 0.5027322404371585,
      "grad_norm": 0.50390625,
      "learning_rate": 1.7354459667118825e-05,
      "loss": 0.6115,
      "step": 207
    },
    {
      "epoch": 0.5051608986035215,
      "grad_norm": 0.37109375,
      "learning_rate": 1.7327993309965583e-05,
      "loss": 0.5263,
      "step": 208
    },
    {
      "epoch": 0.5051608986035215,
      "eval_loss": 0.5407972931861877,
      "eval_runtime": 97.5769,
      "eval_samples_per_second": 30.745,
      "eval_steps_per_second": 3.843,
      "step": 208
    },
    {
      "epoch": 0.5075895567698846,
      "grad_norm": 0.3671875,
      "learning_rate": 1.730141562230809e-05,
      "loss": 0.5454,
      "step": 209
    },
    {
      "epoch": 0.5100182149362478,
      "grad_norm": 0.373046875,
      "learning_rate": 1.7274727007927747e-05,
      "loss": 0.5417,
      "step": 210
    },
    {
      "epoch": 0.5124468731026108,
      "grad_norm": 0.365234375,
      "learning_rate": 1.72479278722912e-05,
      "loss": 0.5337,
      "step": 211
    },
    {
      "epoch": 0.5148755312689739,
      "grad_norm": 0.373046875,
      "learning_rate": 1.7221018622544197e-05,
      "loss": 0.5477,
      "step": 212
    },
    {
      "epoch": 0.517304189435337,
      "grad_norm": 0.373046875,
      "learning_rate": 1.7193999667505387e-05,
      "loss": 0.533,
      "step": 213
    },
    {
      "epoch": 0.5197328476017,
      "grad_norm": 0.369140625,
      "learning_rate": 1.7166871417660116e-05,
      "loss": 0.5203,
      "step": 214
    },
    {
      "epoch": 0.5221615057680632,
      "grad_norm": 0.373046875,
      "learning_rate": 1.7139634285154198e-05,
      "loss": 0.5326,
      "step": 215
    },
    {
      "epoch": 0.5245901639344263,
      "grad_norm": 0.57421875,
      "learning_rate": 1.7112288683787637e-05,
      "loss": 0.6092,
      "step": 216
    },
    {
      "epoch": 0.5270188221007893,
      "grad_norm": 0.3671875,
      "learning_rate": 1.708483502900836e-05,
      "loss": 0.5417,
      "step": 217
    },
    {
      "epoch": 0.5294474802671524,
      "grad_norm": 0.373046875,
      "learning_rate": 1.7057273737905887e-05,
      "loss": 0.5347,
      "step": 218
    },
    {
      "epoch": 0.5318761384335154,
      "grad_norm": 0.37890625,
      "learning_rate": 1.7029605229205005e-05,
      "loss": 0.523,
      "step": 219
    },
    {
      "epoch": 0.5343047965998786,
      "grad_norm": 0.37890625,
      "learning_rate": 1.70018299232594e-05,
      "loss": 0.5363,
      "step": 220
    },
    {
      "epoch": 0.5367334547662417,
      "grad_norm": 0.361328125,
      "learning_rate": 1.6973948242045284e-05,
      "loss": 0.5287,
      "step": 221
    },
    {
      "epoch": 0.5391621129326047,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6945960609154966e-05,
      "loss": 0.5396,
      "step": 222
    },
    {
      "epoch": 0.5415907710989678,
      "grad_norm": 0.3828125,
      "learning_rate": 1.6917867449790432e-05,
      "loss": 0.5198,
      "step": 223
    },
    {
      "epoch": 0.5440194292653309,
      "grad_norm": 0.44921875,
      "learning_rate": 1.688966919075687e-05,
      "loss": 0.6069,
      "step": 224
    },
    {
      "epoch": 0.546448087431694,
      "grad_norm": 0.380859375,
      "learning_rate": 1.68613662604562e-05,
      "loss": 0.5376,
      "step": 225
    },
    {
      "epoch": 0.5488767455980571,
      "grad_norm": 0.375,
      "learning_rate": 1.6832959088880557e-05,
      "loss": 0.5264,
      "step": 226
    },
    {
      "epoch": 0.5513054037644202,
      "grad_norm": 0.369140625,
      "learning_rate": 1.6804448107605767e-05,
      "loss": 0.5369,
      "step": 227
    },
    {
      "epoch": 0.5537340619307832,
      "grad_norm": 0.375,
      "learning_rate": 1.677583374978478e-05,
      "loss": 0.537,
      "step": 228
    },
    {
      "epoch": 0.5561627200971463,
      "grad_norm": 0.380859375,
      "learning_rate": 1.6747116450141092e-05,
      "loss": 0.5257,
      "step": 229
    },
    {
      "epoch": 0.5585913782635095,
      "grad_norm": 0.369140625,
      "learning_rate": 1.6718296644962146e-05,
      "loss": 0.532,
      "step": 230
    },
    {
      "epoch": 0.5610200364298725,
      "grad_norm": 0.3671875,
      "learning_rate": 1.6689374772092695e-05,
      "loss": 0.5382,
      "step": 231
    },
    {
      "epoch": 0.5634486945962356,
      "grad_norm": 0.373046875,
      "learning_rate": 1.6660351270928164e-05,
      "loss": 0.5313,
      "step": 232
    },
    {
      "epoch": 0.5658773527625987,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6631226582407954e-05,
      "loss": 0.5283,
      "step": 233
    },
    {
      "epoch": 0.5683060109289617,
      "grad_norm": 0.361328125,
      "learning_rate": 1.660200114900876e-05,
      "loss": 0.5466,
      "step": 234
    },
    {
      "epoch": 0.5683060109289617,
      "eval_loss": 0.5350908637046814,
      "eval_runtime": 97.0805,
      "eval_samples_per_second": 30.902,
      "eval_steps_per_second": 3.863,
      "step": 234
    },
    {
      "epoch": 0.5707346690953249,
      "grad_norm": 0.3671875,
      "learning_rate": 1.6572675414737844e-05,
      "loss": 0.5343,
      "step": 235
    },
    {
      "epoch": 0.573163327261688,
      "grad_norm": 0.375,
      "learning_rate": 1.6543249825126285e-05,
      "loss": 0.5405,
      "step": 236
    },
    {
      "epoch": 0.575591985428051,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6513724827222225e-05,
      "loss": 0.5252,
      "step": 237
    },
    {
      "epoch": 0.5780206435944141,
      "grad_norm": 0.365234375,
      "learning_rate": 1.6484100869584044e-05,
      "loss": 0.5295,
      "step": 238
    },
    {
      "epoch": 0.5804493017607771,
      "grad_norm": 0.361328125,
      "learning_rate": 1.645437840227359e-05,
      "loss": 0.5331,
      "step": 239
    },
    {
      "epoch": 0.5828779599271403,
      "grad_norm": 0.36328125,
      "learning_rate": 1.6424557876849308e-05,
      "loss": 0.5274,
      "step": 240
    },
    {
      "epoch": 0.5853066180935034,
      "grad_norm": 0.373046875,
      "learning_rate": 1.639463974635939e-05,
      "loss": 0.5303,
      "step": 241
    },
    {
      "epoch": 0.5877352762598664,
      "grad_norm": 0.369140625,
      "learning_rate": 1.636462446533489e-05,
      "loss": 0.5319,
      "step": 242
    },
    {
      "epoch": 0.5901639344262295,
      "grad_norm": 0.369140625,
      "learning_rate": 1.6334512489782833e-05,
      "loss": 0.5316,
      "step": 243
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.388671875,
      "learning_rate": 1.6304304277179267e-05,
      "loss": 0.5291,
      "step": 244
    },
    {
      "epoch": 0.5950212507589556,
      "grad_norm": 0.373046875,
      "learning_rate": 1.627400028646231e-05,
      "loss": 0.5341,
      "step": 245
    },
    {
      "epoch": 0.5974499089253188,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6243600978025215e-05,
      "loss": 0.5233,
      "step": 246
    },
    {
      "epoch": 0.5998785670916819,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6213106813709328e-05,
      "loss": 0.5251,
      "step": 247
    },
    {
      "epoch": 0.6023072252580449,
      "grad_norm": 0.376953125,
      "learning_rate": 1.6182518256797095e-05,
      "loss": 0.534,
      "step": 248
    },
    {
      "epoch": 0.604735883424408,
      "grad_norm": 0.365234375,
      "learning_rate": 1.6151835772005028e-05,
      "loss": 0.5215,
      "step": 249
    },
    {
      "epoch": 0.607164541590771,
      "grad_norm": 0.375,
      "learning_rate": 1.612105982547663e-05,
      "loss": 0.5391,
      "step": 250
    },
    {
      "epoch": 0.6095931997571342,
      "grad_norm": 0.37109375,
      "learning_rate": 1.6090190884775333e-05,
      "loss": 0.5316,
      "step": 251
    },
    {
      "epoch": 0.6120218579234973,
      "grad_norm": 0.3671875,
      "learning_rate": 1.605922941887737e-05,
      "loss": 0.5251,
      "step": 252
    },
    {
      "epoch": 0.6144505160898603,
      "grad_norm": 0.359375,
      "learning_rate": 1.6028175898164665e-05,
      "loss": 0.5239,
      "step": 253
    },
    {
      "epoch": 0.6168791742562234,
      "grad_norm": 0.3671875,
      "learning_rate": 1.599703079441769e-05,
      "loss": 0.5229,
      "step": 254
    },
    {
      "epoch": 0.6193078324225865,
      "grad_norm": 0.3828125,
      "learning_rate": 1.5965794580808292e-05,
      "loss": 0.5311,
      "step": 255
    },
    {
      "epoch": 0.6217364905889496,
      "grad_norm": 0.36328125,
      "learning_rate": 1.5934467731892497e-05,
      "loss": 0.5217,
      "step": 256
    },
    {
      "epoch": 0.6241651487553127,
      "grad_norm": 0.365234375,
      "learning_rate": 1.590305072360331e-05,
      "loss": 0.5299,
      "step": 257
    },
    {
      "epoch": 0.6265938069216758,
      "grad_norm": 0.375,
      "learning_rate": 1.5871544033243488e-05,
      "loss": 0.52,
      "step": 258
    },
    {
      "epoch": 0.6290224650880388,
      "grad_norm": 0.369140625,
      "learning_rate": 1.583994813947827e-05,
      "loss": 0.5168,
      "step": 259
    },
    {
      "epoch": 0.6314511232544019,
      "grad_norm": 0.494140625,
      "learning_rate": 1.5808263522328137e-05,
      "loss": 0.6037,
      "step": 260
    },
    {
      "epoch": 0.6314511232544019,
      "eval_loss": 0.5299703478813171,
      "eval_runtime": 96.9378,
      "eval_samples_per_second": 30.948,
      "eval_steps_per_second": 3.868,
      "step": 260
    },
    {
      "epoch": 0.6338797814207651,
      "grad_norm": 0.365234375,
      "learning_rate": 1.5776490663161474e-05,
      "loss": 0.517,
      "step": 261
    },
    {
      "epoch": 0.6363084395871281,
      "grad_norm": 0.357421875,
      "learning_rate": 1.5744630044687307e-05,
      "loss": 0.5182,
      "step": 262
    },
    {
      "epoch": 0.6387370977534912,
      "grad_norm": 0.369140625,
      "learning_rate": 1.5712682150947926e-05,
      "loss": 0.5219,
      "step": 263
    },
    {
      "epoch": 0.6411657559198543,
      "grad_norm": 0.373046875,
      "learning_rate": 1.568064746731156e-05,
      "loss": 0.5323,
      "step": 264
    },
    {
      "epoch": 0.6435944140862173,
      "grad_norm": 0.447265625,
      "learning_rate": 1.5648526480464995e-05,
      "loss": 0.5902,
      "step": 265
    },
    {
      "epoch": 0.6460230722525805,
      "grad_norm": 0.37109375,
      "learning_rate": 1.561631967840617e-05,
      "loss": 0.5374,
      "step": 266
    },
    {
      "epoch": 0.6484517304189436,
      "grad_norm": 0.36328125,
      "learning_rate": 1.558402755043677e-05,
      "loss": 0.5145,
      "step": 267
    },
    {
      "epoch": 0.6508803885853066,
      "grad_norm": 0.37109375,
      "learning_rate": 1.5551650587154815e-05,
      "loss": 0.5213,
      "step": 268
    },
    {
      "epoch": 0.6533090467516697,
      "grad_norm": 0.361328125,
      "learning_rate": 1.5519189280447153e-05,
      "loss": 0.5192,
      "step": 269
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.369140625,
      "learning_rate": 1.5486644123482047e-05,
      "loss": 0.5325,
      "step": 270
    },
    {
      "epoch": 0.6581663630843959,
      "grad_norm": 0.375,
      "learning_rate": 1.545401561070163e-05,
      "loss": 0.5286,
      "step": 271
    },
    {
      "epoch": 0.660595021250759,
      "grad_norm": 0.37890625,
      "learning_rate": 1.542130423781444e-05,
      "loss": 0.526,
      "step": 272
    },
    {
      "epoch": 0.663023679417122,
      "grad_norm": 0.37109375,
      "learning_rate": 1.5388510501787855e-05,
      "loss": 0.5317,
      "step": 273
    },
    {
      "epoch": 0.6654523375834851,
      "grad_norm": 0.361328125,
      "learning_rate": 1.5355634900840558e-05,
      "loss": 0.5204,
      "step": 274
    },
    {
      "epoch": 0.6678809957498482,
      "grad_norm": 0.369140625,
      "learning_rate": 1.5322677934434965e-05,
      "loss": 0.5215,
      "step": 275
    },
    {
      "epoch": 0.6703096539162113,
      "grad_norm": 0.36328125,
      "learning_rate": 1.5289640103269626e-05,
      "loss": 0.5247,
      "step": 276
    },
    {
      "epoch": 0.6727383120825744,
      "grad_norm": 0.369140625,
      "learning_rate": 1.5256521909271644e-05,
      "loss": 0.5163,
      "step": 277
    },
    {
      "epoch": 0.6751669702489375,
      "grad_norm": 0.36328125,
      "learning_rate": 1.5223323855589027e-05,
      "loss": 0.5335,
      "step": 278
    },
    {
      "epoch": 0.6775956284153005,
      "grad_norm": 0.36328125,
      "learning_rate": 1.519004644658305e-05,
      "loss": 0.5199,
      "step": 279
    },
    {
      "epoch": 0.6800242865816636,
      "grad_norm": 0.3671875,
      "learning_rate": 1.5156690187820596e-05,
      "loss": 0.5294,
      "step": 280
    },
    {
      "epoch": 0.6824529447480268,
      "grad_norm": 0.3671875,
      "learning_rate": 1.5123255586066467e-05,
      "loss": 0.5248,
      "step": 281
    },
    {
      "epoch": 0.6848816029143898,
      "grad_norm": 0.369140625,
      "learning_rate": 1.50897431492757e-05,
      "loss": 0.5261,
      "step": 282
    },
    {
      "epoch": 0.6873102610807529,
      "grad_norm": 0.36328125,
      "learning_rate": 1.5056153386585828e-05,
      "loss": 0.5246,
      "step": 283
    },
    {
      "epoch": 0.689738919247116,
      "grad_norm": 0.3671875,
      "learning_rate": 1.5022486808309171e-05,
      "loss": 0.518,
      "step": 284
    },
    {
      "epoch": 0.692167577413479,
      "grad_norm": 0.373046875,
      "learning_rate": 1.498874392592506e-05,
      "loss": 0.5222,
      "step": 285
    },
    {
      "epoch": 0.6945962355798422,
      "grad_norm": 0.36328125,
      "learning_rate": 1.4954925252072077e-05,
      "loss": 0.5333,
      "step": 286
    },
    {
      "epoch": 0.6945962355798422,
      "eval_loss": 0.5256316661834717,
      "eval_runtime": 97.1941,
      "eval_samples_per_second": 30.866,
      "eval_steps_per_second": 3.858,
      "step": 286
    },
    {
      "epoch": 0.6970248937462052,
      "grad_norm": 0.37109375,
      "learning_rate": 1.4921031300540268e-05,
      "loss": 0.5385,
      "step": 287
    },
    {
      "epoch": 0.6994535519125683,
      "grad_norm": 0.36328125,
      "learning_rate": 1.4887062586263334e-05,
      "loss": 0.5203,
      "step": 288
    },
    {
      "epoch": 0.7018822100789314,
      "grad_norm": 0.3671875,
      "learning_rate": 1.4853019625310813e-05,
      "loss": 0.5163,
      "step": 289
    },
    {
      "epoch": 0.7043108682452944,
      "grad_norm": 0.359375,
      "learning_rate": 1.4818902934880222e-05,
      "loss": 0.5211,
      "step": 290
    },
    {
      "epoch": 0.7067395264116576,
      "grad_norm": 0.36328125,
      "learning_rate": 1.4784713033289228e-05,
      "loss": 0.5251,
      "step": 291
    },
    {
      "epoch": 0.7091681845780207,
      "grad_norm": 0.4765625,
      "learning_rate": 1.4750450439967751e-05,
      "loss": 0.5817,
      "step": 292
    },
    {
      "epoch": 0.7115968427443837,
      "grad_norm": 0.373046875,
      "learning_rate": 1.4716115675450078e-05,
      "loss": 0.5178,
      "step": 293
    },
    {
      "epoch": 0.7140255009107468,
      "grad_norm": 0.3828125,
      "learning_rate": 1.4681709261366963e-05,
      "loss": 0.5317,
      "step": 294
    },
    {
      "epoch": 0.7164541590771099,
      "grad_norm": 0.3671875,
      "learning_rate": 1.4647231720437687e-05,
      "loss": 0.535,
      "step": 295
    },
    {
      "epoch": 0.718882817243473,
      "grad_norm": 0.376953125,
      "learning_rate": 1.4612683576462135e-05,
      "loss": 0.5263,
      "step": 296
    },
    {
      "epoch": 0.7213114754098361,
      "grad_norm": 0.376953125,
      "learning_rate": 1.4578065354312816e-05,
      "loss": 0.5162,
      "step": 297
    },
    {
      "epoch": 0.7237401335761992,
      "grad_norm": 0.369140625,
      "learning_rate": 1.4543377579926915e-05,
      "loss": 0.5262,
      "step": 298
    },
    {
      "epoch": 0.7261687917425622,
      "grad_norm": 0.390625,
      "learning_rate": 1.4508620780298288e-05,
      "loss": 0.5242,
      "step": 299
    },
    {
      "epoch": 0.7285974499089253,
      "grad_norm": 0.384765625,
      "learning_rate": 1.4473795483469442e-05,
      "loss": 0.5258,
      "step": 300
    },
    {
      "epoch": 0.7310261080752884,
      "grad_norm": 0.515625,
      "learning_rate": 1.4438902218523537e-05,
      "loss": 0.5909,
      "step": 301
    },
    {
      "epoch": 0.7334547662416515,
      "grad_norm": 0.375,
      "learning_rate": 1.4403941515576344e-05,
      "loss": 0.5213,
      "step": 302
    },
    {
      "epoch": 0.7358834244080146,
      "grad_norm": 0.3828125,
      "learning_rate": 1.4368913905768178e-05,
      "loss": 0.5192,
      "step": 303
    },
    {
      "epoch": 0.7383120825743776,
      "grad_norm": 0.55078125,
      "learning_rate": 1.4333819921255836e-05,
      "loss": 0.5678,
      "step": 304
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.3671875,
      "learning_rate": 1.4298660095204516e-05,
      "loss": 0.5247,
      "step": 305
    },
    {
      "epoch": 0.7431693989071039,
      "grad_norm": 0.380859375,
      "learning_rate": 1.4263434961779709e-05,
      "loss": 0.5291,
      "step": 306
    },
    {
      "epoch": 0.7455980570734669,
      "grad_norm": 0.392578125,
      "learning_rate": 1.4228145056139097e-05,
      "loss": 0.5241,
      "step": 307
    },
    {
      "epoch": 0.74802671523983,
      "grad_norm": 0.37109375,
      "learning_rate": 1.41927909144244e-05,
      "loss": 0.5199,
      "step": 308
    },
    {
      "epoch": 0.7504553734061931,
      "grad_norm": 0.361328125,
      "learning_rate": 1.4157373073753255e-05,
      "loss": 0.5341,
      "step": 309
    },
    {
      "epoch": 0.7528840315725561,
      "grad_norm": 0.375,
      "learning_rate": 1.412189207221104e-05,
      "loss": 0.5282,
      "step": 310
    },
    {
      "epoch": 0.7553126897389193,
      "grad_norm": 0.376953125,
      "learning_rate": 1.4086348448842707e-05,
      "loss": 0.5194,
      "step": 311
    },
    {
      "epoch": 0.7577413479052824,
      "grad_norm": 0.36328125,
      "learning_rate": 1.4050742743644588e-05,
      "loss": 0.5139,
      "step": 312
    },
    {
      "epoch": 0.7577413479052824,
      "eval_loss": 0.5217667818069458,
      "eval_runtime": 96.9922,
      "eval_samples_per_second": 30.93,
      "eval_steps_per_second": 3.866,
      "step": 312
    },
    {
      "epoch": 0.7601700060716454,
      "grad_norm": 0.37109375,
      "learning_rate": 1.4015075497556193e-05,
      "loss": 0.5176,
      "step": 313
    },
    {
      "epoch": 0.7625986642380085,
      "grad_norm": 0.38671875,
      "learning_rate": 1.3979347252451994e-05,
      "loss": 0.5178,
      "step": 314
    },
    {
      "epoch": 0.7650273224043715,
      "grad_norm": 0.3828125,
      "learning_rate": 1.3943558551133186e-05,
      "loss": 0.5258,
      "step": 315
    },
    {
      "epoch": 0.7674559805707347,
      "grad_norm": 0.3671875,
      "learning_rate": 1.3907709937319451e-05,
      "loss": 0.5176,
      "step": 316
    },
    {
      "epoch": 0.7698846387370978,
      "grad_norm": 0.625,
      "learning_rate": 1.3871801955640682e-05,
      "loss": 0.5865,
      "step": 317
    },
    {
      "epoch": 0.7723132969034608,
      "grad_norm": 0.380859375,
      "learning_rate": 1.3835835151628728e-05,
      "loss": 0.5194,
      "step": 318
    },
    {
      "epoch": 0.7747419550698239,
      "grad_norm": 0.396484375,
      "learning_rate": 1.3799810071709088e-05,
      "loss": 0.5213,
      "step": 319
    },
    {
      "epoch": 0.777170613236187,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3763727263192626e-05,
      "loss": 0.5276,
      "step": 320
    },
    {
      "epoch": 0.7795992714025501,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3727587274267235e-05,
      "loss": 0.5214,
      "step": 321
    },
    {
      "epoch": 0.7820279295689132,
      "grad_norm": 0.384765625,
      "learning_rate": 1.3691390653989536e-05,
      "loss": 0.5307,
      "step": 322
    },
    {
      "epoch": 0.7844565877352763,
      "grad_norm": 0.37890625,
      "learning_rate": 1.365513795227651e-05,
      "loss": 0.5252,
      "step": 323
    },
    {
      "epoch": 0.7868852459016393,
      "grad_norm": 0.359375,
      "learning_rate": 1.3618829719897158e-05,
      "loss": 0.5186,
      "step": 324
    },
    {
      "epoch": 0.7893139040680024,
      "grad_norm": 0.384765625,
      "learning_rate": 1.3582466508464132e-05,
      "loss": 0.5191,
      "step": 325
    },
    {
      "epoch": 0.7917425622343656,
      "grad_norm": 0.3671875,
      "learning_rate": 1.3546048870425356e-05,
      "loss": 0.5268,
      "step": 326
    },
    {
      "epoch": 0.7941712204007286,
      "grad_norm": 0.376953125,
      "learning_rate": 1.3509577359055627e-05,
      "loss": 0.53,
      "step": 327
    },
    {
      "epoch": 0.7965998785670917,
      "grad_norm": 0.3671875,
      "learning_rate": 1.3473052528448203e-05,
      "loss": 0.5142,
      "step": 328
    },
    {
      "epoch": 0.7990285367334548,
      "grad_norm": 0.384765625,
      "learning_rate": 1.3436474933506412e-05,
      "loss": 0.5148,
      "step": 329
    },
    {
      "epoch": 0.8014571948998178,
      "grad_norm": 0.37109375,
      "learning_rate": 1.3399845129935191e-05,
      "loss": 0.5223,
      "step": 330
    },
    {
      "epoch": 0.803885853066181,
      "grad_norm": 0.361328125,
      "learning_rate": 1.3363163674232663e-05,
      "loss": 0.5247,
      "step": 331
    },
    {
      "epoch": 0.806314511232544,
      "grad_norm": 0.365234375,
      "learning_rate": 1.3326431123681667e-05,
      "loss": 0.52,
      "step": 332
    },
    {
      "epoch": 0.8087431693989071,
      "grad_norm": 0.373046875,
      "learning_rate": 1.328964803634131e-05,
      "loss": 0.5172,
      "step": 333
    },
    {
      "epoch": 0.8111718275652702,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3252814971038477e-05,
      "loss": 0.5226,
      "step": 334
    },
    {
      "epoch": 0.8136004857316332,
      "grad_norm": 0.369140625,
      "learning_rate": 1.3215932487359338e-05,
      "loss": 0.5214,
      "step": 335
    },
    {
      "epoch": 0.8160291438979964,
      "grad_norm": 0.375,
      "learning_rate": 1.3179001145640856e-05,
      "loss": 0.5234,
      "step": 336
    },
    {
      "epoch": 0.8184578020643595,
      "grad_norm": 0.39453125,
      "learning_rate": 1.314202150696227e-05,
      "loss": 0.5195,
      "step": 337
    },
    {
      "epoch": 0.8208864602307225,
      "grad_norm": 0.359375,
      "learning_rate": 1.3104994133136563e-05,
      "loss": 0.5212,
      "step": 338
    },
    {
      "epoch": 0.8208864602307225,
      "eval_loss": 0.5185486674308777,
      "eval_runtime": 97.0358,
      "eval_samples_per_second": 30.916,
      "eval_steps_per_second": 3.865,
      "step": 338
    },
    {
      "epoch": 0.8233151183970856,
      "grad_norm": 0.369140625,
      "learning_rate": 1.3067919586701948e-05,
      "loss": 0.5108,
      "step": 339
    },
    {
      "epoch": 0.8257437765634487,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3030798430913289e-05,
      "loss": 0.5175,
      "step": 340
    },
    {
      "epoch": 0.8281724347298117,
      "grad_norm": 0.373046875,
      "learning_rate": 1.2993631229733584e-05,
      "loss": 0.5165,
      "step": 341
    },
    {
      "epoch": 0.8306010928961749,
      "grad_norm": 0.369140625,
      "learning_rate": 1.295641854782535e-05,
      "loss": 0.5096,
      "step": 342
    },
    {
      "epoch": 0.833029751062538,
      "grad_norm": 0.37109375,
      "learning_rate": 1.2919160950542095e-05,
      "loss": 0.5231,
      "step": 343
    },
    {
      "epoch": 0.835458409228901,
      "grad_norm": 0.373046875,
      "learning_rate": 1.2881859003919688e-05,
      "loss": 0.512,
      "step": 344
    },
    {
      "epoch": 0.8378870673952641,
      "grad_norm": 0.36328125,
      "learning_rate": 1.284451327466778e-05,
      "loss": 0.5081,
      "step": 345
    },
    {
      "epoch": 0.8403157255616271,
      "grad_norm": 0.369140625,
      "learning_rate": 1.2807124330161188e-05,
      "loss": 0.5181,
      "step": 346
    },
    {
      "epoch": 0.8427443837279903,
      "grad_norm": 0.36328125,
      "learning_rate": 1.2769692738431279e-05,
      "loss": 0.5191,
      "step": 347
    },
    {
      "epoch": 0.8451730418943534,
      "grad_norm": 0.357421875,
      "learning_rate": 1.2732219068157335e-05,
      "loss": 0.499,
      "step": 348
    },
    {
      "epoch": 0.8476017000607164,
      "grad_norm": 0.3828125,
      "learning_rate": 1.2694703888657915e-05,
      "loss": 0.5205,
      "step": 349
    },
    {
      "epoch": 0.8500303582270795,
      "grad_norm": 0.5390625,
      "learning_rate": 1.2657147769882215e-05,
      "loss": 0.5799,
      "step": 350
    },
    {
      "epoch": 0.8524590163934426,
      "grad_norm": 0.361328125,
      "learning_rate": 1.261955128240139e-05,
      "loss": 0.5102,
      "step": 351
    },
    {
      "epoch": 0.8548876745598057,
      "grad_norm": 0.36328125,
      "learning_rate": 1.2581914997399899e-05,
      "loss": 0.514,
      "step": 352
    },
    {
      "epoch": 0.8573163327261688,
      "grad_norm": 0.369140625,
      "learning_rate": 1.2544239486666831e-05,
      "loss": 0.5168,
      "step": 353
    },
    {
      "epoch": 0.8597449908925319,
      "grad_norm": 0.392578125,
      "learning_rate": 1.2506525322587207e-05,
      "loss": 0.5138,
      "step": 354
    },
    {
      "epoch": 0.8621736490588949,
      "grad_norm": 0.53125,
      "learning_rate": 1.2468773078133286e-05,
      "loss": 0.563,
      "step": 355
    },
    {
      "epoch": 0.864602307225258,
      "grad_norm": 0.365234375,
      "learning_rate": 1.2430983326855873e-05,
      "loss": 0.5064,
      "step": 356
    },
    {
      "epoch": 0.8670309653916212,
      "grad_norm": 0.369140625,
      "learning_rate": 1.2393156642875579e-05,
      "loss": 0.5148,
      "step": 357
    },
    {
      "epoch": 0.8694596235579842,
      "grad_norm": 0.380859375,
      "learning_rate": 1.2355293600874132e-05,
      "loss": 0.5147,
      "step": 358
    },
    {
      "epoch": 0.8718882817243473,
      "grad_norm": 0.376953125,
      "learning_rate": 1.2317394776085614e-05,
      "loss": 0.5164,
      "step": 359
    },
    {
      "epoch": 0.8743169398907104,
      "grad_norm": 0.37109375,
      "learning_rate": 1.2279460744287755e-05,
      "loss": 0.5109,
      "step": 360
    },
    {
      "epoch": 0.8767455980570734,
      "grad_norm": 0.375,
      "learning_rate": 1.2241492081793145e-05,
      "loss": 0.5184,
      "step": 361
    },
    {
      "epoch": 0.8791742562234366,
      "grad_norm": 0.56640625,
      "learning_rate": 1.220348936544052e-05,
      "loss": 0.5627,
      "step": 362
    },
    {
      "epoch": 0.8816029143897997,
      "grad_norm": 0.361328125,
      "learning_rate": 1.2165453172585964e-05,
      "loss": 0.5149,
      "step": 363
    },
    {
      "epoch": 0.8840315725561627,
      "grad_norm": 0.388671875,
      "learning_rate": 1.2127384081094167e-05,
      "loss": 0.5109,
      "step": 364
    },
    {
      "epoch": 0.8840315725561627,
      "eval_loss": 0.5158221125602722,
      "eval_runtime": 97.2582,
      "eval_samples_per_second": 30.846,
      "eval_steps_per_second": 3.856,
      "step": 364
    },
{ |
|
"epoch": 0.8864602307225258, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 1.2089282669329625e-05, |
|
"loss": 0.4993, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.205114951614785e-05, |
|
"loss": 0.5187, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.891317547055252, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.2012985200886602e-05, |
|
"loss": 0.5088, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.8937462052216151, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.197479030335706e-05, |
|
"loss": 0.5167, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.8961748633879781, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 1.1936565403835027e-05, |
|
"loss": 0.5138, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.8986035215543412, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.1898311083052113e-05, |
|
"loss": 0.5062, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9010321797207043, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1860027922186908e-05, |
|
"loss": 0.5122, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9034608378870674, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1821716502856154e-05, |
|
"loss": 0.5108, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9058894960534305, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.1783377407105907e-05, |
|
"loss": 0.5212, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9083181542197936, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 1.1745011217402709e-05, |
|
"loss": 0.5079, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9107468123861566, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 1.1706618516624712e-05, |
|
"loss": 0.5105, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9131754705525197, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.1668199888052844e-05, |
|
"loss": 0.5123, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9156041287188829, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.1629755915361947e-05, |
|
"loss": 0.5125, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9180327868852459, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.159128718261189e-05, |
|
"loss": 0.5021, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.920461445051609, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.1552794274238723e-05, |
|
"loss": 0.5158, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.922890103217972, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1514277775045768e-05, |
|
"loss": 0.5064, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9253187613843351, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.1475738270194767e-05, |
|
"loss": 0.512, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9277474195506983, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.1437176345196967e-05, |
|
"loss": 0.5236, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9301760777170613, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1398592585904234e-05, |
|
"loss": 0.5152, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9326047358834244, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.135998757850015e-05, |
|
"loss": 0.522, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9350333940497875, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.1321361909491108e-05, |
|
"loss": 0.5159, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9374620522161505, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.128271616569741e-05, |
|
"loss": 0.5042, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9398907103825137, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1244050934244333e-05, |
|
"loss": 0.5161, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9423193685488768, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.1205366802553231e-05, |
|
"loss": 0.5094, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9447480267152398, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.1166664358332595e-05, |
|
"loss": 0.5165, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9471766848816029, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.1127944189569122e-05, |
|
"loss": 0.5148, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9471766848816029, |
|
"eval_loss": 0.5134184956550598, |
|
"eval_runtime": 97.3787, |
|
"eval_samples_per_second": 30.808, |
|
"eval_steps_per_second": 3.851, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.949605343047966, |
|
"grad_norm": 0.375, |
|
"learning_rate": 1.1089206884518802e-05, |
|
"loss": 0.52, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9520340012143291, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.1050453031697958e-05, |
|
"loss": 0.5141, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9544626593806922, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.1011683219874324e-05, |
|
"loss": 0.5114, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9568913175470553, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.0972898038058077e-05, |
|
"loss": 0.5128, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9593199757134183, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.093409807549292e-05, |
|
"loss": 0.5107, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9617486338797814, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.0895283921647098e-05, |
|
"loss": 0.5607, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9641772920461446, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.085645616620446e-05, |
|
"loss": 0.5203, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9666059502125076, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.0817615399055513e-05, |
|
"loss": 0.511, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9690346083788707, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.0778762210288416e-05, |
|
"loss": 0.5017, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9714632665452337, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.0739897190180066e-05, |
|
"loss": 0.5149, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9738919247115968, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.0701020929187096e-05, |
|
"loss": 0.5721, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.97632058287796, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.0662134017936924e-05, |
|
"loss": 0.5081, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.978749241044323, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.0623237047218771e-05, |
|
"loss": 0.5709, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9811778992106861, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.0584330607974673e-05, |
|
"loss": 0.5015, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9836065573770492, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.054541529129054e-05, |
|
"loss": 0.5167, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9860352155434122, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 1.0506491688387128e-05, |
|
"loss": 0.5619, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9884638737097754, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.04675603906111e-05, |
|
"loss": 0.5261, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.9908925318761385, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.0428621989426016e-05, |
|
"loss": 0.4998, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.9933211900425015, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0389677076403351e-05, |
|
"loss": 0.5051, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.9957498482088646, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.0350726243213519e-05, |
|
"loss": 0.569, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9981785063752276, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.0311770081616864e-05, |
|
"loss": 0.514, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.0006071645415908, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.0272809183454701e-05, |
|
"loss": 0.5084, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.0030358227079539, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 1.0233844140640287e-05, |
|
"loss": 0.5605, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.005464480874317, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 1.0194875545149854e-05, |
|
"loss": 0.507, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.00789313904068, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.015590398901361e-05, |
|
"loss": 0.5133, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.010321797207043, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.0116930064306736e-05, |
|
"loss": 0.5121, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.010321797207043, |
|
"eval_loss": 0.5115101933479309, |
|
"eval_runtime": 96.8252, |
|
"eval_samples_per_second": 30.984, |
|
"eval_steps_per_second": 3.873, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.0127504553734061, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.0077954363140407e-05, |
|
"loss": 0.5109, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.0151791135397692, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.0038977477652779e-05, |
|
"loss": 0.4991, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.002428658166363, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4774, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.0048573163327261, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.961022522347226e-06, |
|
"loss": 0.475, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0072859744990892, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 9.922045636859596e-06, |
|
"loss": 0.4863, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.0097146326654522, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 9.883069935693267e-06, |
|
"loss": 0.4837, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.0121432908318153, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 9.844096010986392e-06, |
|
"loss": 0.479, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.0145719489981786, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.80512445485015e-06, |
|
"loss": 0.4849, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.0170006071645417, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 9.766155859359718e-06, |
|
"loss": 0.4765, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.0194292653309047, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.7271908165453e-06, |
|
"loss": 0.4773, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.0218579234972678, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9.688229918383138e-06, |
|
"loss": 0.5238, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.0242865816636308, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.649273756786486e-06, |
|
"loss": 0.483, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.026715239829994, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.610322923596652e-06, |
|
"loss": 0.4718, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.029143897996357, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.57137801057399e-06, |
|
"loss": 0.5207, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.03157255616272, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.532439609388901e-06, |
|
"loss": 0.4787, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.034001214329083, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.493508311612874e-06, |
|
"loss": 0.4768, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.0364298724954462, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.454584708709462e-06, |
|
"loss": 0.484, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.0388585306618094, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.415669392025329e-06, |
|
"loss": 0.4812, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.0412871888281725, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.376762952781234e-06, |
|
"loss": 0.475, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.0437158469945356, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.337865982063076e-06, |
|
"loss": 0.4726, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.0461445051608986, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.298979070812908e-06, |
|
"loss": 0.473, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.0485731633272617, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.260102809819939e-06, |
|
"loss": 0.4739, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.0510018214936248, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.221237789711587e-06, |
|
"loss": 0.4916, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.0534304796599878, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.182384600944494e-06, |
|
"loss": 0.4823, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0558591378263509, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 9.143543833795539e-06, |
|
"loss": 0.4737, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.058287795992714, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.104716078352906e-06, |
|
"loss": 0.4788, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.058287795992714, |
|
"eval_loss": 0.5110214352607727, |
|
"eval_runtime": 96.9242, |
|
"eval_samples_per_second": 30.952, |
|
"eval_steps_per_second": 3.869, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.060716454159077, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 9.065901924507085e-06, |
|
"loss": 0.4775, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.0631451123254403, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.027101961941925e-06, |
|
"loss": 0.4735, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.0655737704918034, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.98831678012568e-06, |
|
"loss": 0.4803, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.0680024286581664, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.949546968302042e-06, |
|
"loss": 0.4767, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.0704310868245295, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 8.910793115481201e-06, |
|
"loss": 0.4765, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.0728597449908925, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 8.872055810430881e-06, |
|
"loss": 0.4789, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.0752884031572556, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 8.833335641667408e-06, |
|
"loss": 0.5243, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.0777170613236187, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.79463319744677e-06, |
|
"loss": 0.4769, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0801457194899817, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.755949065755668e-06, |
|
"loss": 0.4774, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.0825743776563448, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 8.717283834302593e-06, |
|
"loss": 0.4792, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.0850030358227079, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.678638090508897e-06, |
|
"loss": 0.4768, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.0874316939890711, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.640012421499856e-06, |
|
"loss": 0.4738, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.0898603521554342, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 8.601407414095771e-06, |
|
"loss": 0.5251, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.0922890103217973, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.562823654803035e-06, |
|
"loss": 0.4847, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.0947176684881603, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.524261729805235e-06, |
|
"loss": 0.4815, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.0971463266545234, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 8.485722224954237e-06, |
|
"loss": 0.5586, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.0995749848208864, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.44720572576128e-06, |
|
"loss": 0.4716, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.1020036429872495, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.408712817388113e-06, |
|
"loss": 0.4782, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1044323011536126, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 8.370244084638055e-06, |
|
"loss": 0.5251, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.1068609593199756, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 8.331800111947158e-06, |
|
"loss": 0.5125, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.1092896174863387, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 8.293381483375293e-06, |
|
"loss": 0.5175, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.111718275652702, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 8.254988782597295e-06, |
|
"loss": 0.514, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.114146933819065, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.216622592894097e-06, |
|
"loss": 0.477, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.116575591985428, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.178283497143851e-06, |
|
"loss": 0.4873, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.1190042501517912, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 8.139972077813093e-06, |
|
"loss": 0.4805, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.1214329083181542, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.10168891694789e-06, |
|
"loss": 0.4738, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.1214329083181542, |
|
"eval_loss": 0.5099829435348511, |
|
"eval_runtime": 99.9835, |
|
"eval_samples_per_second": 30.005, |
|
"eval_steps_per_second": 3.751, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.1238615664845173, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.063434596164974e-06, |
|
"loss": 0.471, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.1262902246508804, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 8.025209696642942e-06, |
|
"loss": 0.4781, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1287188828172434, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.987014799113398e-06, |
|
"loss": 0.4806, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.1311475409836065, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 7.948850483852153e-06, |
|
"loss": 0.4737, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.1335761991499695, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 7.91071733067038e-06, |
|
"loss": 0.5262, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.1360048573163328, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.872615918905833e-06, |
|
"loss": 0.4892, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.138433515482696, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.83454682741404e-06, |
|
"loss": 0.4825, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.140862173649059, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.796510634559487e-06, |
|
"loss": 0.4708, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.143290831815422, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.758507918206859e-06, |
|
"loss": 0.474, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.145719489981785, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.720539255712252e-06, |
|
"loss": 0.4705, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.1481481481481481, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.682605223914386e-06, |
|
"loss": 0.4735, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.1505768063145112, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.644706399125871e-06, |
|
"loss": 0.4696, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1530054644808743, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 7.606843357124426e-06, |
|
"loss": 0.4953, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.1554341226472373, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.569016673144132e-06, |
|
"loss": 0.4749, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.1578627808136004, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.531226921866715e-06, |
|
"loss": 0.4755, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.1602914389799635, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.493474677412795e-06, |
|
"loss": 0.4661, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.1627200971463267, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 7.455760513333172e-06, |
|
"loss": 0.5152, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.1651487553126898, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 7.418085002600104e-06, |
|
"loss": 0.4787, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.1675774134790529, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.3804487175986135e-06, |
|
"loss": 0.4718, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.170006071645416, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.3428522301177894e-06, |
|
"loss": 0.4728, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.172434729811779, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.305296111342086e-06, |
|
"loss": 0.4771, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.174863387978142, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.267780931842667e-06, |
|
"loss": 0.479, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.177292046144505, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.230307261568725e-06, |
|
"loss": 0.468, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.1797207043108682, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.192875669838815e-06, |
|
"loss": 0.475, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.1821493624772312, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.155486725332224e-06, |
|
"loss": 0.4683, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.1845780206435945, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.118140996080313e-06, |
|
"loss": 0.4818, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.1845780206435945, |
|
"eval_loss": 0.5090214610099792, |
|
"eval_runtime": 100.5205, |
|
"eval_samples_per_second": 29.845, |
|
"eval_steps_per_second": 3.731, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.1870066788099576, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 7.080839049457908e-06, |
|
"loss": 0.513, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.1894353369763206, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.043581452174653e-06, |
|
"loss": 0.4799, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.1918639951426837, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.006368770266421e-06, |
|
"loss": 0.5165, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.1942926533090468, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.9692015690867135e-06, |
|
"loss": 0.4774, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.1967213114754098, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.932080413298055e-06, |
|
"loss": 0.4723, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.199149969641773, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.895005866863439e-06, |
|
"loss": 0.4679, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.201578627808136, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.857978493037734e-06, |
|
"loss": 0.4769, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.204007285974499, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.820998854359144e-06, |
|
"loss": 0.4752, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.206435944140862, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.784067512640666e-06, |
|
"loss": 0.4781, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.2088646023072251, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.7471850289615246e-06, |
|
"loss": 0.4705, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.2112932604735884, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.710351963658692e-06, |
|
"loss": 0.5441, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.2137219186399515, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.67356887631834e-06, |
|
"loss": 0.4712, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.2161505768063146, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.636836325767342e-06, |
|
"loss": 0.4824, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.2185792349726776, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.600154870064812e-06, |
|
"loss": 0.4772, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.2210078931390407, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.563525066493588e-06, |
|
"loss": 0.4641, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.2234365513054037, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.526947471551799e-06, |
|
"loss": 0.4711, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2258652094717668, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.490422640944378e-06, |
|
"loss": 0.4702, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.2282938676381299, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.453951129574644e-06, |
|
"loss": 0.4849, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.230722525804493, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 6.41753349153587e-06, |
|
"loss": 0.5051, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.2331511839708562, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.3811702801028465e-06, |
|
"loss": 0.4701, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.2355798421372193, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.344862047723495e-06, |
|
"loss": 0.4765, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.2380085003035823, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.30860934601047e-06, |
|
"loss": 0.4827, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.2404371584699454, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.272412725732767e-06, |
|
"loss": 0.4787, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.2428658166363085, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.236272736807378e-06, |
|
"loss": 0.4825, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.2452944748026715, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.200189928290916e-06, |
|
"loss": 0.4799, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.2477231329690346, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.1641648483712755e-06, |
|
"loss": 0.4719, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2477231329690346, |
|
"eval_loss": 0.5082234740257263, |
|
"eval_runtime": 98.6416, |
|
"eval_samples_per_second": 30.413, |
|
"eval_steps_per_second": 3.802, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2501517911353976, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.128198044359322e-06, |
|
"loss": 0.4689, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.2525804493017607, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.09229006268055e-06, |
|
"loss": 0.4821, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.255009107468124, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 6.056441448866817e-06, |
|
"loss": 0.4793, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.2574377656344868, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.020652747548008e-06, |
|
"loss": 0.4761, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.2598664238008501, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 5.984924502443807e-06, |
|
"loss": 0.482, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.2622950819672132, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.949257256355415e-06, |
|
"loss": 0.4674, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.2647237401335762, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.913651551157295e-06, |
|
"loss": 0.4733, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.2671523982999393, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.878107927788962e-06, |
|
"loss": 0.4742, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.2695810564663024, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.84262692624675e-06, |
|
"loss": 0.476, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.2720097146326654, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.8072090855756e-06, |
|
"loss": 0.4698, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2744383727990285, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.7718549438609085e-06, |
|
"loss": 0.4737, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.2768670309653916, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.736565038220289e-06, |
|
"loss": 0.4787, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.2792956891317546, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.701339904795486e-06, |
|
"loss": 0.4673, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.281724347298118, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.666180078744169e-06, |
|
"loss": 0.4786, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.2841530054644807, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.6310860942318235e-06, |
|
"loss": 0.4766, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.286581663630844, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.5960584844236565e-06, |
|
"loss": 0.4744, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.289010321797207, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.561097781476463e-06, |
|
"loss": 0.4706, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.2914389799635702, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.5262045165305615e-06, |
|
"loss": 0.474, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.2938676381299332, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.491379219701718e-06, |
|
"loss": 0.4737, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.2962962962962963, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.456622420073084e-06, |
|
"loss": 0.4797, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2987249544626593, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.421934645687185e-06, |
|
"loss": 0.4779, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.3011536126290224, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.387316423537869e-06, |
|
"loss": 0.476, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.3035822707953855, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.352768279562315e-06, |
|
"loss": 0.4792, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.3060109289617485, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 5.318290738633041e-06, |
|
"loss": 0.5148, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.3084395871281118, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.283884324549924e-06, |
|
"loss": 0.4741, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.3108682452944749, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.249549560032252e-06, |
|
"loss": 0.4643, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.3108682452944749, |
|
"eval_loss": 0.5077295899391174, |
|
"eval_runtime": 96.7746, |
|
"eval_samples_per_second": 31.0, |
|
"eval_steps_per_second": 3.875, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.313296903460838, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.215286966710774e-06, |
|
"loss": 0.4723, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.315725561627201, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.18109706511978e-06, |
|
"loss": 0.4812, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.318154219793564, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 5.146980374689192e-06, |
|
"loss": 0.4683, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.3205828779599271, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.112937413736667e-06, |
|
"loss": 0.4731, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.3230115361262902, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.078968699459736e-06, |
|
"loss": 0.4687, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.3254401942926533, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.045074747927927e-06, |
|
"loss": 0.4781, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.3278688524590163, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.011256074074945e-06, |
|
"loss": 0.4764, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.3302975106253796, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.977513191690834e-06, |
|
"loss": 0.4628, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.3327261687917424, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.943846613414172e-06, |
|
"loss": 0.4751, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.3351548269581057, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.910256850724306e-06, |
|
"loss": 0.4742, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.3375834851244688, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.8767444139335365e-06, |
|
"loss": 0.4653, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.3400121432908318, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.843309812179405e-06, |
|
"loss": 0.4779, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.342440801457195, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.809953553416954e-06, |
|
"loss": 0.4845, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.344869459623558, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.776676144410973e-06, |
|
"loss": 0.4687, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.347298117789921, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.743478090728356e-06, |
|
"loss": 0.4819, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.349726775956284, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.710359896730379e-06, |
|
"loss": 0.4757, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.3521554341226472, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.677322065565039e-06, |
|
"loss": 0.4692, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.3545840922890102, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.644365099159443e-06, |
|
"loss": 0.4787, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.3570127504553735, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 4.611489498212145e-06, |
|
"loss": 0.5029, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.3594414086217366, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.57869576218556e-06, |
|
"loss": 0.473, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.3618700667880996, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.545984389298371e-06, |
|
"loss": 0.4751, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.3642987249544627, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.5133558765179576e-06, |
|
"loss": 0.4757, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.3667273831208258, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.480810719552848e-06, |
|
"loss": 0.4691, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.3691560412871888, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.4483494128451885e-06, |
|
"loss": 0.477, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.3715846994535519, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.4159724495632295e-06, |
|
"loss": 0.4775, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.374013357619915, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.383680321593836e-06, |
|
"loss": 0.4783, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.374013357619915, |
|
"eval_loss": 0.5073318481445312, |
|
"eval_runtime": 103.2698, |
|
"eval_samples_per_second": 29.05, |
|
"eval_steps_per_second": 3.631, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.376442015786278, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.35147351953501e-06, |
|
"loss": 0.4735, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.3788706739526413, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.319352532688444e-06, |
|
"loss": 0.4667, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.3812993321190041, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 4.287317849052075e-06, |
|
"loss": 0.4788, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.3837279902853674, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.255369955312698e-06, |
|
"loss": 0.474, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.3861566484517305, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.223509336838528e-06, |
|
"loss": 0.4688, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.3885853066180935, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.191736477671864e-06, |
|
"loss": 0.4688, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.3910139647844566, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.160051860521731e-06, |
|
"loss": 0.4659, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.3934426229508197, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.128455966756512e-06, |
|
"loss": 0.4759, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.3958712811171827, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.096949276396694e-06, |
|
"loss": 0.4779, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.3982999392835458, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.065532268107507e-06, |
|
"loss": 0.4776, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.4007285974499089, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.034205419191709e-06, |
|
"loss": 0.4749, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.403157255616272, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.002969205582314e-06, |
|
"loss": 0.4791, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.4055859137826352, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.971824101835341e-06, |
|
"loss": 0.4723, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.4080145719489983, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.940770581122634e-06, |
|
"loss": 0.4803, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.4104432301153613, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.909809115224674e-06, |
|
"loss": 0.4667, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.4128718882817244, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.878940174523371e-06, |
|
"loss": 0.4795, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.4153005464480874, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 3.848164227994976e-06, |
|
"loss": 0.4631, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.4177292046144505, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.8174817432029125e-06, |
|
"loss": 0.4728, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.4201578627808136, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.7868931862906756e-06, |
|
"loss": 0.4658, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.4225865209471766, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.7563990219747857e-06, |
|
"loss": 0.4841, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.4250151791135397, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.725999713537689e-06, |
|
"loss": 0.4763, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.427443837279903, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.695695722820737e-06, |
|
"loss": 0.4804, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.4298724954462658, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.6654875102171683e-06, |
|
"loss": 0.4687, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.432301153612629, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.635375534665111e-06, |
|
"loss": 0.464, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.4347298117789922, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.605360253640614e-06, |
|
"loss": 0.4735, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.4371584699453552, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.5754421231506953e-06, |
|
"loss": 0.4782, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.4371584699453552, |
|
"eval_loss": 0.5070293545722961, |
|
"eval_runtime": 107.3386, |
|
"eval_samples_per_second": 27.949, |
|
"eval_steps_per_second": 3.494, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.4395871281117183, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.545621597726412e-06, |
|
"loss": 0.4721, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.4420157862780814, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.5158991304159572e-06, |
|
"loss": 0.4755, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.48627517277778e-06, |
|
"loss": 0.4827, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.4468731026108075, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.4567501748737153e-06, |
|
"loss": 0.4693, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.4493017607771705, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.427324585262156e-06, |
|
"loss": 0.468, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.4517304189435336, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.3979988509912443e-06, |
|
"loss": 0.4715, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.454159077109897, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.3687734175920505e-06, |
|
"loss": 0.4844, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.4565877352762597, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.339648729071836e-06, |
|
"loss": 0.4731, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.459016393442623, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.310625227907307e-06, |
|
"loss": 0.4744, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.461445051608986, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.281703355037854e-06, |
|
"loss": 0.4771, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.4638737097753491, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.2528835498589085e-06, |
|
"loss": 0.471, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.4663023679417122, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.2241662502152236e-06, |
|
"loss": 0.4773, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4687310261080753, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.195551892394234e-06, |
|
"loss": 0.4772, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.4711596842744383, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.1670409111194454e-06, |
|
"loss": 0.4707, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.4735883424408014, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.138633739543805e-06, |
|
"loss": 0.4759, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.4760170006071647, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.110330809243134e-06, |
|
"loss": 0.4693, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.4784456587735275, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.082132550209571e-06, |
|
"loss": 0.4666, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.4808743169398908, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.054039390845035e-06, |
|
"loss": 0.4731, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.4833029751062539, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.0260517579547166e-06, |
|
"loss": 0.4782, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.485731633272617, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.998170076740601e-06, |
|
"loss": 0.5016, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.48816029143898, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.9703947707949974e-06, |
|
"loss": 0.5092, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.490588949605343, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.9427262620941142e-06, |
|
"loss": 0.4768, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.4930176077717061, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.915164970991642e-06, |
|
"loss": 0.4699, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.4954462659380692, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.8877113162123637e-06, |
|
"loss": 0.4729, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.4978749241044322, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.8603657148458053e-06, |
|
"loss": 0.4698, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.5003035822707953, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.833128582339887e-06, |
|
"loss": 0.4812, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.5003035822707953, |
|
"eval_loss": 0.5068376660346985, |
|
"eval_runtime": 97.0195, |
|
"eval_samples_per_second": 30.922, |
|
"eval_steps_per_second": 3.865, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.5027322404371586, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.806000332494617e-06, |
|
"loss": 0.4651, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.5051608986035214, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.778981377455806e-06, |
|
"loss": 0.4681, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.5075895567698847, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.7520721277088023e-06, |
|
"loss": 0.4747, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.5100182149362478, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.7252729920722564e-06, |
|
"loss": 0.4736, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.5124468731026108, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.698584377691913e-06, |
|
"loss": 0.5096, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.514875531268974, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.6720066900344212e-06, |
|
"loss": 0.4703, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.517304189435337, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 2.6455403328811736e-06, |
|
"loss": 0.4765, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.5197328476017, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.6191857083221873e-06, |
|
"loss": 0.4819, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.522161505768063, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.5929432167499658e-06, |
|
"loss": 0.4673, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.5245901639344264, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.5668132568534377e-06, |
|
"loss": 0.4748, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.5270188221007892, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.540796225611907e-06, |
|
"loss": 0.4674, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.5294474802671525, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 2.514892518288988e-06, |
|
"loss": 0.5083, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.5318761384335153, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 2.4891025284266436e-06, |
|
"loss": 0.5049, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.5343047965998786, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.463426647839173e-06, |
|
"loss": 0.4701, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.5367334547662417, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.4378652666072646e-06, |
|
"loss": 0.4715, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.5391621129326047, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 2.4124187730720916e-06, |
|
"loss": 0.5031, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5415907710989678, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.387087553829386e-06, |
|
"loss": 0.4734, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.5440194292653309, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.361871993723579e-06, |
|
"loss": 0.4649, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.5464480874316942, |
|
"grad_norm": 0.5, |
|
"learning_rate": 2.3367724758419495e-06, |
|
"loss": 0.5191, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.548876745598057, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.3117893815088067e-06, |
|
"loss": 0.4755, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.5513054037644203, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.2869230902796934e-06, |
|
"loss": 0.4805, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.5537340619307831, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 2.2621739799356244e-06, |
|
"loss": 0.4807, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.5561627200971464, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.2375424264773447e-06, |
|
"loss": 0.4818, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.5585913782635095, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.2130288041196135e-06, |
|
"loss": 0.4773, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.5610200364298725, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.188633485285525e-06, |
|
"loss": 0.4696, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.5634486945962356, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.1643568406008476e-06, |
|
"loss": 0.4679, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.5634486945962356, |
|
"eval_loss": 0.5066995620727539, |
|
"eval_runtime": 98.2878, |
|
"eval_samples_per_second": 30.523, |
|
"eval_steps_per_second": 3.815, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.5658773527625987, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.1401992388883888e-06, |
|
"loss": 0.4672, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.5683060109289617, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.1161610471624084e-06, |
|
"loss": 0.4629, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.5707346690953248, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.092242630623016e-06, |
|
"loss": 0.4701, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.573163327261688, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.0684443526506415e-06, |
|
"loss": 0.4767, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.575591985428051, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.0447665748005206e-06, |
|
"loss": 0.4677, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.5780206435944142, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 2.021209656797174e-06, |
|
"loss": 0.5038, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.580449301760777, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.9977739565289743e-06, |
|
"loss": 0.4732, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.5828779599271403, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.974459830042691e-06, |
|
"loss": 0.4743, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.5853066180935034, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.951267631538072e-06, |
|
"loss": 0.4686, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.5877352762598664, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 1.928197713362495e-06, |
|
"loss": 0.5074, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.5901639344262295, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.9052504260055838e-06, |
|
"loss": 0.4701, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.5925925925925926, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.8824261180938875e-06, |
|
"loss": 0.4757, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.5950212507589556, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.8597251363856061e-06, |
|
"loss": 0.4754, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.5974499089253187, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.8371478257652908e-06, |
|
"loss": 0.4718, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.599878567091682, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.8146945292386343e-06, |
|
"loss": 0.4765, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.6023072252580448, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7923655879272395e-06, |
|
"loss": 0.4822, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.604735883424408, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 1.7701613410634367e-06, |
|
"loss": 0.4802, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.607164541590771, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7480821259851488e-06, |
|
"loss": 0.4741, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.6095931997571342, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7261282781307486e-06, |
|
"loss": 0.4686, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.6120218579234973, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7043001310339646e-06, |
|
"loss": 0.4672, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6144505160898603, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.6825980163188204e-06, |
|
"loss": 0.4727, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.6168791742562234, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.661022263694594e-06, |
|
"loss": 0.4805, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.6193078324225865, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.6395732009508058e-06, |
|
"loss": 0.469, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.6217364905889498, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.6182511539522427e-06, |
|
"loss": 0.4747, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.6241651487553126, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.5970564466340022e-06, |
|
"loss": 0.4635, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.6265938069216759, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.5759894009965793e-06, |
|
"loss": 0.4725, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.6265938069216759, |
|
"eval_loss": 0.5065969824790955, |
|
"eval_runtime": 97.0109, |
|
"eval_samples_per_second": 30.924, |
|
"eval_steps_per_second": 3.866, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.6290224650880387, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.5550503371009652e-06, |
|
"loss": 0.4762, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.631451123254402, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.5342395730637904e-06, |
|
"loss": 0.4738, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.633879781420765, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.5135574250524898e-06, |
|
"loss": 0.4787, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.6363084395871281, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.4930042072805062e-06, |
|
"loss": 0.4681, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6387370977534912, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.4725802320024985e-06, |
|
"loss": 0.4772, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.6411657559198543, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.452285809509617e-06, |
|
"loss": 0.4753, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.6435944140862173, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.432121248124786e-06, |
|
"loss": 0.4793, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.6460230722525804, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.4120868541980026e-06, |
|
"loss": 0.4766, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.6484517304189437, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.39218293210171e-06, |
|
"loss": 0.4742, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.6508803885853065, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.372409784226152e-06, |
|
"loss": 0.485, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.6533090467516698, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.3527677109747784e-06, |
|
"loss": 0.476, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.6557377049180326, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.333257010759702e-06, |
|
"loss": 0.4773, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.658166363084396, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.3138779799971446e-06, |
|
"loss": 0.4772, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.660595021250759, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.294630913102939e-06, |
|
"loss": 0.478, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.663023679417122, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.2755161024880602e-06, |
|
"loss": 0.472, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.665452337583485, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.2565338385541792e-06, |
|
"loss": 0.4716, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.6678809957498482, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.2376844096892526e-06, |
|
"loss": 0.4646, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.6703096539162114, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.2189681022631405e-06, |
|
"loss": 0.4743, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.6727383120825743, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.2003852006232564e-06, |
|
"loss": 0.4727, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.6751669702489376, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.181935987090247e-06, |
|
"loss": 0.463, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.6775956284153004, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.1636207419537038e-06, |
|
"loss": 0.4799, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.6800242865816637, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.1454397434679022e-06, |
|
"loss": 0.4795, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.6824529447480268, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.1273932678475764e-06, |
|
"loss": 0.4748, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.6848816029143898, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.1094815892637256e-06, |
|
"loss": 0.5055, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6873102610807529, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.0917049798394408e-06, |
|
"loss": 0.4721, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.689738919247116, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.0740637096457773e-06, |
|
"loss": 0.4645, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.689738919247116, |
|
"eval_loss": 0.5065945386886597, |
|
"eval_runtime": 97.1015, |
|
"eval_samples_per_second": 30.896, |
|
"eval_steps_per_second": 3.862, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.692167577413479, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.0565580466976566e-06, |
|
"loss": 0.4757, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.694596235579842, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.0391882569497758e-06, |
|
"loss": 0.475, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.6970248937462054, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.0219546042925842e-06, |
|
"loss": 0.4777, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.6994535519125682, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.0048573505482728e-06, |
|
"loss": 0.4712, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.7018822100789315, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.878967554667862e-07, |
|
"loss": 0.5034, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.7043108682452943, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.710730767218913e-07, |
|
"loss": 0.469, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.7067395264116576, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.54386569907244e-07, |
|
"loss": 0.4712, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.7091681845780207, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.378374885325225e-07, |
|
"loss": 0.4754, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.7115968427443837, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.214260840195732e-07, |
|
"loss": 0.4796, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.7140255009107468, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.051526056985737e-07, |
|
"loss": 0.467, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.7164541590771099, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.890173008042768e-07, |
|
"loss": 0.4749, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.7188828172434731, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 8.730204144722232e-07, |
|
"loss": 0.5046, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.721311475409836, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.571621897350312e-07, |
|
"loss": 0.4781, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.7237401335761993, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.414428675187114e-07, |
|
"loss": 0.4611, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.726168791742562, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 8.258626866389897e-07, |
|
"loss": 0.4659, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.7285974499089254, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.10421883797694e-07, |
|
"loss": 0.467, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.7310261080752884, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.951206935791478e-07, |
|
"loss": 0.4678, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.7334547662416515, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 7.799593484466139e-07, |
|
"loss": 0.4771, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7358834244080146, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.649380787387561e-07, |
|
"loss": 0.4725, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.7383120825743776, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.500571126661449e-07, |
|
"loss": 0.4732, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.7407407407407407, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.35316676307789e-07, |
|
"loss": 0.4716, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.7431693989071038, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.207169936076974e-07, |
|
"loss": 0.4721, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.745598057073467, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.06258286371484e-07, |
|
"loss": 0.4726, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.7480267152398299, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 6.919407742629891e-07, |
|
"loss": 0.5167, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.7504553734061932, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.77764674800947e-07, |
|
"loss": 0.4826, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.752884031572556, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.637302033556891e-07, |
|
"loss": 0.4792, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.752884031572556, |
|
"eval_loss": 0.5065528750419617, |
|
"eval_runtime": 97.2896, |
|
"eval_samples_per_second": 30.836, |
|
"eval_steps_per_second": 3.854, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.7553126897389193, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.498375731458529e-07, |
|
"loss": 0.4687, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.7577413479052824, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.360869952351568e-07, |
|
"loss": 0.4841, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.7601700060716454, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.22478678529197e-07, |
|
"loss": 0.4773, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.7625986642380085, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.090128297722564e-07, |
|
"loss": 0.476, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.7650273224043715, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.956896535441803e-07, |
|
"loss": 0.4749, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.7674559805707348, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.825093522572666e-07, |
|
"loss": 0.4828, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.7698846387370977, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 5.694721261531732e-07, |
|
"loss": 0.4682, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.772313296903461, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.565781732999043e-07, |
|
"loss": 0.4733, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.7747419550698238, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.438276895887761e-07, |
|
"loss": 0.4767, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.777170613236187, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 5.312208687314502e-07, |
|
"loss": 0.4758, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.7795992714025501, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.187579022569977e-07, |
|
"loss": 0.4839, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.7820279295689132, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 5.064389795089764e-07, |
|
"loss": 0.5067, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.7844565877352763, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 4.942642876425641e-07, |
|
"loss": 0.5085, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.7868852459016393, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.822340116217116e-07, |
|
"loss": 0.4757, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.7893139040680024, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.703483342163262e-07, |
|
"loss": 0.4792, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.7917425622343655, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.5860743599951186e-07, |
|
"loss": 0.4667, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.7941712204007287, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.470114953448079e-07, |
|
"loss": 0.4772, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.7965998785670916, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.3556068842348865e-07, |
|
"loss": 0.4801, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.7990285367334549, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 4.2425518920188536e-07, |
|
"loss": 0.4718, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.8014571948998177, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.1309516943874196e-07, |
|
"loss": 0.4731, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.803885853066181, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.0208079868260696e-07, |
|
"loss": 0.4812, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.806314511232544, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.9121224426925675e-07, |
|
"loss": 0.4739, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.8087431693989071, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.8048967131915414e-07, |
|
"loss": 0.4755, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.8111718275652702, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.699132427349383e-07, |
|
"loss": 0.4749, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.8136004857316332, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.594831191989523e-07, |
|
"loss": 0.4737, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.8160291438979965, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.49199459170797e-07, |
|
"loss": 0.4689, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.8160291438979965, |
|
"eval_loss": 0.506585955619812, |
|
"eval_runtime": 101.0452, |
|
"eval_samples_per_second": 29.69, |
|
"eval_steps_per_second": 3.711, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.8184578020643594, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.3906241888493005e-07, |
|
"loss": 0.4732, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.8208864602307226, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.2907215234829205e-07, |
|
"loss": 0.4814, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.8233151183970855, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.1922881133795827e-07, |
|
"loss": 0.4705, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.8257437765634488, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.095325453988385e-07, |
|
"loss": 0.4727, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.8281724347298116, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.999835018414143e-07, |
|
"loss": 0.4698, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.830601092896175, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.905818257394799e-07, |
|
"loss": 0.478, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.833029751062538, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.8132765992795797e-07, |
|
"loss": 0.4695, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.835458409228901, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.722211450007206e-07, |
|
"loss": 0.4722, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.837887067395264, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.632624193084499e-07, |
|
"loss": 0.4632, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.8403157255616271, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.544516189565482e-07, |
|
"loss": 0.4781, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.8427443837279904, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.4578887780305704e-07, |
|
"loss": 0.4755, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.8451730418943533, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.3727432745663025e-07, |
|
"loss": 0.4761, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.8476017000607166, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.2890809727453612e-07, |
|
"loss": 0.4747, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.8500303582270794, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.2069031436068643e-07, |
|
"loss": 0.4728, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.8524590163934427, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.1262110356371047e-07, |
|
"loss": 0.4824, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.8548876745598057, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.0470058747505516e-07, |
|
"loss": 0.4683, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.8573163327261688, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.969288864271246e-07, |
|
"loss": 0.4866, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.8597449908925319, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.8930611849145131e-07, |
|
"loss": 0.4797, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.862173649058895, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.8183239947690112e-07, |
|
"loss": 0.4676, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.864602307225258, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.7450784292791456e-07, |
|
"loss": 0.4668, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.867030965391621, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.6733256012278486e-07, |
|
"loss": 0.4742, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.8694596235579843, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.603066600719605e-07, |
|
"loss": 0.4728, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.8718882817243472, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.5343024951639752e-07, |
|
"loss": 0.47, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.8743169398907105, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.467034329259287e-07, |
|
"loss": 0.4656, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.8767455980570733, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.4012631249768592e-07, |
|
"loss": 0.4858, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.8791742562234366, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.336989881545403e-07, |
|
"loss": 0.4646, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8791742562234366, |
|
"eval_loss": 0.5065886974334717, |
|
"eval_runtime": 100.9737, |
|
"eval_samples_per_second": 29.711, |
|
"eval_steps_per_second": 3.714, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8816029143897997, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.2742155754358553e-07, |
|
"loss": 0.4823, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.8840315725561627, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.2129411603465924e-07, |
|
"loss": 0.4806, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.8864602307225258, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.1531675671888621e-07, |
|
"loss": 0.4909, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.0948957040727071e-07, |
|
"loss": 0.4798, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.8913175470552521, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.0381264562931426e-07, |
|
"loss": 0.4667, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.893746205221615, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.828606863166779e-08, |
|
"loss": 0.4703, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.8961748633879782, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 9.290992337682936e-08, |
|
"loss": 0.4799, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.898603521554341, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 8.768429154185853e-08, |
|
"loss": 0.478, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.9010321797207044, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.260925251714514e-08, |
|
"loss": 0.4779, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.9034608378870674, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.768488340519464e-08, |
|
"loss": 0.4801, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.9058894960534305, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 7.291125901946027e-08, |
|
"loss": 0.4716, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.9083181542197936, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.828845188321054e-08, |
|
"loss": 0.4739, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.9107468123861566, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.381653222842011e-08, |
|
"loss": 0.4673, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.9131754705525197, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 5.949556799470846e-08, |
|
"loss": 0.4853, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.9156041287188827, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 5.532562482830406e-08, |
|
"loss": 0.5203, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.918032786885246, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.1306766081048456e-08, |
|
"loss": 0.4728, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.9204614450516089, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.743905280943595e-08, |
|
"loss": 0.5187, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.9228901032179722, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.3722543773681016e-08, |
|
"loss": 0.4797, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.925318761384335, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.0157295436830116e-08, |
|
"loss": 0.4678, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.9277474195506983, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 3.674336196390238e-08, |
|
"loss": 0.5106, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.9301760777170613, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.3480795221066955e-08, |
|
"loss": 0.4749, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.9326047358834244, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.036964477485249e-08, |
|
"loss": 0.4735, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.9350333940497875, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.7409957891397775e-08, |
|
"loss": 0.476, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.9374620522161505, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.4601779535733394e-08, |
|
"loss": 0.4695, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.9398907103825138, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.1945152371094512e-08, |
|
"loss": 0.4808, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.9423193685488767, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.944011675827695e-08, |
|
"loss": 0.4719, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.9423193685488767, |
|
"eval_loss": 0.5065895318984985, |
|
"eval_runtime": 96.8435, |
|
"eval_samples_per_second": 30.978, |
|
"eval_steps_per_second": 3.872, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.94474802671524, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.7086710755024327e-08, |
|
"loss": 0.4724, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.9471766848816028, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.4884970115444097e-08, |
|
"loss": 0.4684, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.949605343047966, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.2834928289472415e-08, |
|
"loss": 0.4773, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.9520340012143291, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.0936616422358992e-08, |
|
"loss": 0.4767, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.9544626593806922, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.190063354198586e-09, |
|
"loss": 0.4771, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.9568913175470553, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.595295619490239e-09, |
|
"loss": 0.4729, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.9593199757134183, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.152337446736489e-09, |
|
"loss": 0.4754, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.9617486338797814, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.861210758071444e-09, |
|
"loss": 0.4906, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.9641772920461444, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.7219351689310455e-09, |
|
"loss": 0.4767, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.9666059502125077, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.734527987755531e-09, |
|
"loss": 0.4862, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.9690346083788706, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.899004215722977e-09, |
|
"loss": 0.4682, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.9714632665452339, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.2153765465250378e-09, |
|
"loss": 0.4798, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.9738919247115967, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.836553661715429e-10, |
|
"loss": 0.4743, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.97632058287796, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.038487528350675e-10, |
|
"loss": 0.4736, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.978749241044323, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 7.596247672325696e-11, |
|
"loss": 0.4709, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.981177899210686, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0, |
|
"loss": 0.4669, |
|
"step": 822 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 822, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 411, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8283124614508839e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|