|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 492,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
    { "epoch": 0.006097560975609756, "grad_norm": 1.763520359992981, "learning_rate": 0.0001, "loss": 1.1695, "step": 1 },
    { "epoch": 0.012195121951219513, "grad_norm": 1.4942116737365723, "learning_rate": 0.0001, "loss": 1.0553, "step": 2 },
    { "epoch": 0.018292682926829267, "grad_norm": 1.345508337020874, "learning_rate": 0.0001, "loss": 1.0218, "step": 3 },
    { "epoch": 0.024390243902439025, "grad_norm": 1.123505711555481, "learning_rate": 0.0001, "loss": 0.9164, "step": 4 },
    { "epoch": 0.03048780487804878, "grad_norm": 0.823020875453949, "learning_rate": 0.0001, "loss": 0.9038, "step": 5 },
    { "epoch": 0.036585365853658534, "grad_norm": 0.763806939125061, "learning_rate": 0.0001, "loss": 0.9062, "step": 6 },
    { "epoch": 0.042682926829268296, "grad_norm": 0.7193121314048767, "learning_rate": 0.0001, "loss": 0.8466, "step": 7 },
    { "epoch": 0.04878048780487805, "grad_norm": 0.7080236077308655, "learning_rate": 0.0001, "loss": 0.82, "step": 8 },
    { "epoch": 0.054878048780487805, "grad_norm": 0.6981502175331116, "learning_rate": 0.0001, "loss": 0.8546, "step": 9 },
    { "epoch": 0.06097560975609756, "grad_norm": 0.7174396514892578, "learning_rate": 0.0001, "loss": 0.8438, "step": 10 },
    { "epoch": 0.06707317073170732, "grad_norm": 0.6729642152786255, "learning_rate": 0.0001, "loss": 0.8285, "step": 11 },
    { "epoch": 0.07317073170731707, "grad_norm": 0.6757375001907349, "learning_rate": 0.0001, "loss": 0.8068, "step": 12 },
    { "epoch": 0.07926829268292683, "grad_norm": 0.6743811368942261, "learning_rate": 0.0001, "loss": 0.7853, "step": 13 },
    { "epoch": 0.08536585365853659, "grad_norm": 0.6855434775352478, "learning_rate": 0.0001, "loss": 0.7305, "step": 14 },
    { "epoch": 0.09146341463414634, "grad_norm": 0.7576789259910583, "learning_rate": 0.0001, "loss": 0.7894, "step": 15 },
    { "epoch": 0.0975609756097561, "grad_norm": 0.6285218000411987, "learning_rate": 0.0001, "loss": 0.7821, "step": 16 },
    { "epoch": 0.10365853658536585, "grad_norm": 0.6224460005760193, "learning_rate": 0.0001, "loss": 0.806, "step": 17 },
    { "epoch": 0.10975609756097561, "grad_norm": 0.6526975631713867, "learning_rate": 0.0001, "loss": 0.7725, "step": 18 },
    { "epoch": 0.11585365853658537, "grad_norm": 0.7162805795669556, "learning_rate": 0.0001, "loss": 0.7637, "step": 19 },
    { "epoch": 0.12195121951219512, "grad_norm": 0.6594821214675903, "learning_rate": 0.0001, "loss": 0.7953, "step": 20 },
    { "epoch": 0.12804878048780488, "grad_norm": 0.6285718679428101, "learning_rate": 0.0001, "loss": 0.7673, "step": 21 },
    { "epoch": 0.13414634146341464, "grad_norm": 0.6275126338005066, "learning_rate": 0.0001, "loss": 0.7639, "step": 22 },
    { "epoch": 0.1402439024390244, "grad_norm": 0.6683803200721741, "learning_rate": 0.0001, "loss": 0.7598, "step": 23 },
    { "epoch": 0.14634146341463414, "grad_norm": 0.6154472231864929, "learning_rate": 0.0001, "loss": 0.7243, "step": 24 },
    { "epoch": 0.1524390243902439, "grad_norm": 0.6709151864051819, "learning_rate": 0.0001, "loss": 0.7869, "step": 25 },
    { "epoch": 0.15853658536585366, "grad_norm": 0.6176601648330688, "learning_rate": 0.0001, "loss": 0.7181, "step": 26 },
    { "epoch": 0.16463414634146342, "grad_norm": 0.5965794324874878, "learning_rate": 0.0001, "loss": 0.6955, "step": 27 },
    { "epoch": 0.17073170731707318, "grad_norm": 0.5950392484664917, "learning_rate": 0.0001, "loss": 0.6866, "step": 28 },
    { "epoch": 0.17682926829268292, "grad_norm": 0.5902345180511475, "learning_rate": 0.0001, "loss": 0.7488, "step": 29 },
    { "epoch": 0.18292682926829268, "grad_norm": 0.5966442227363586, "learning_rate": 0.0001, "loss": 0.766, "step": 30 },
    { "epoch": 0.18902439024390244, "grad_norm": 0.6065996289253235, "learning_rate": 0.0001, "loss": 0.7602, "step": 31 },
    { "epoch": 0.1951219512195122, "grad_norm": 0.6001562476158142, "learning_rate": 0.0001, "loss": 0.7307, "step": 32 },
    { "epoch": 0.20121951219512196, "grad_norm": 0.5457689166069031, "learning_rate": 0.0001, "loss": 0.7117, "step": 33 },
    { "epoch": 0.2073170731707317, "grad_norm": 0.5943721532821655, "learning_rate": 0.0001, "loss": 0.7419, "step": 34 },
    { "epoch": 0.21341463414634146, "grad_norm": 0.5822892785072327, "learning_rate": 0.0001, "loss": 0.7199, "step": 35 },
    { "epoch": 0.21951219512195122, "grad_norm": 0.5900689959526062, "learning_rate": 0.0001, "loss": 0.7001, "step": 36 },
    { "epoch": 0.22560975609756098, "grad_norm": 0.6492246389389038, "learning_rate": 0.0001, "loss": 0.7024, "step": 37 },
    { "epoch": 0.23170731707317074, "grad_norm": 0.5830572247505188, "learning_rate": 0.0001, "loss": 0.7311, "step": 38 },
    { "epoch": 0.23780487804878048, "grad_norm": 0.6123290061950684, "learning_rate": 0.0001, "loss": 0.7593, "step": 39 },
    { "epoch": 0.24390243902439024, "grad_norm": 0.6116678714752197, "learning_rate": 0.0001, "loss": 0.7079, "step": 40 },
    { "epoch": 0.25, "grad_norm": 0.6134564876556396, "learning_rate": 0.0001, "loss": 0.7426, "step": 41 },
    { "epoch": 0.25609756097560976, "grad_norm": 0.5981906652450562, "learning_rate": 0.0001, "loss": 0.7207, "step": 42 },
    { "epoch": 0.2621951219512195, "grad_norm": 0.6197260022163391, "learning_rate": 0.0001, "loss": 0.743, "step": 43 },
    { "epoch": 0.2682926829268293, "grad_norm": 0.5889937877655029, "learning_rate": 0.0001, "loss": 0.7111, "step": 44 },
    { "epoch": 0.27439024390243905, "grad_norm": 0.5781967639923096, "learning_rate": 0.0001, "loss": 0.7088, "step": 45 },
    { "epoch": 0.2804878048780488, "grad_norm": 0.5735342502593994, "learning_rate": 0.0001, "loss": 0.7264, "step": 46 },
    { "epoch": 0.2865853658536585, "grad_norm": 0.6068210005760193, "learning_rate": 0.0001, "loss": 0.7217, "step": 47 },
    { "epoch": 0.2926829268292683, "grad_norm": 0.584036648273468, "learning_rate": 0.0001, "loss": 0.6998, "step": 48 },
    { "epoch": 0.29878048780487804, "grad_norm": 0.5738788843154907, "learning_rate": 0.0001, "loss": 0.6662, "step": 49 },
    { "epoch": 0.3048780487804878, "grad_norm": 0.5746581554412842, "learning_rate": 0.0001, "loss": 0.6702, "step": 50 },
    { "epoch": 0.31097560975609756, "grad_norm": 0.5572565793991089, "learning_rate": 0.0001, "loss": 0.6766, "step": 51 },
    { "epoch": 0.3170731707317073, "grad_norm": 0.6274172067642212, "learning_rate": 0.0001, "loss": 0.7509, "step": 52 },
    { "epoch": 0.3231707317073171, "grad_norm": 0.5426685810089111, "learning_rate": 0.0001, "loss": 0.7065, "step": 53 },
    { "epoch": 0.32926829268292684, "grad_norm": 0.5456064343452454, "learning_rate": 0.0001, "loss": 0.6069, "step": 54 },
    { "epoch": 0.3353658536585366, "grad_norm": 0.5641257762908936, "learning_rate": 0.0001, "loss": 0.6862, "step": 55 },
    { "epoch": 0.34146341463414637, "grad_norm": 0.5878000259399414, "learning_rate": 0.0001, "loss": 0.6827, "step": 56 },
    { "epoch": 0.3475609756097561, "grad_norm": 0.5976933240890503, "learning_rate": 0.0001, "loss": 0.6838, "step": 57 },
    { "epoch": 0.35365853658536583, "grad_norm": 0.5872485041618347, "learning_rate": 0.0001, "loss": 0.7017, "step": 58 },
    { "epoch": 0.3597560975609756, "grad_norm": 0.5930238366127014, "learning_rate": 0.0001, "loss": 0.6735, "step": 59 },
    { "epoch": 0.36585365853658536, "grad_norm": 0.5682117342948914, "learning_rate": 0.0001, "loss": 0.6686, "step": 60 },
    { "epoch": 0.3719512195121951, "grad_norm": 0.5660499334335327, "learning_rate": 0.0001, "loss": 0.6579, "step": 61 },
    { "epoch": 0.3780487804878049, "grad_norm": 0.5715780854225159, "learning_rate": 0.0001, "loss": 0.7127, "step": 62 },
    { "epoch": 0.38414634146341464, "grad_norm": 0.5816344022750854, "learning_rate": 0.0001, "loss": 0.7158, "step": 63 },
    { "epoch": 0.3902439024390244, "grad_norm": 0.5610223412513733, "learning_rate": 0.0001, "loss": 0.6527, "step": 64 },
    { "epoch": 0.39634146341463417, "grad_norm": 0.5489451885223389, "learning_rate": 0.0001, "loss": 0.6902, "step": 65 },
    { "epoch": 0.4024390243902439, "grad_norm": 0.5633963942527771, "learning_rate": 0.0001, "loss": 0.729, "step": 66 },
    { "epoch": 0.40853658536585363, "grad_norm": 0.5687914490699768, "learning_rate": 0.0001, "loss": 0.6558, "step": 67 },
    { "epoch": 0.4146341463414634, "grad_norm": 0.5886531472206116, "learning_rate": 0.0001, "loss": 0.7309, "step": 68 },
    { "epoch": 0.42073170731707316, "grad_norm": 0.546073853969574, "learning_rate": 0.0001, "loss": 0.6625, "step": 69 },
    { "epoch": 0.4268292682926829, "grad_norm": 0.5979751348495483, "learning_rate": 0.0001, "loss": 0.6942, "step": 70 },
    { "epoch": 0.4329268292682927, "grad_norm": 0.5324491858482361, "learning_rate": 0.0001, "loss": 0.6274, "step": 71 },
    { "epoch": 0.43902439024390244, "grad_norm": 0.6174746751785278, "learning_rate": 0.0001, "loss": 0.7073, "step": 72 },
    { "epoch": 0.4451219512195122, "grad_norm": 0.5845648646354675, "learning_rate": 0.0001, "loss": 0.6871, "step": 73 },
    { "epoch": 0.45121951219512196, "grad_norm": 0.5905411839485168, "learning_rate": 0.0001, "loss": 0.6846, "step": 74 },
    { "epoch": 0.4573170731707317, "grad_norm": 0.5970960855484009, "learning_rate": 0.0001, "loss": 0.6588, "step": 75 },
    { "epoch": 0.4634146341463415, "grad_norm": 0.5933733582496643, "learning_rate": 0.0001, "loss": 0.678, "step": 76 },
    { "epoch": 0.4695121951219512, "grad_norm": 0.5747849941253662, "learning_rate": 0.0001, "loss": 0.683, "step": 77 },
    { "epoch": 0.47560975609756095, "grad_norm": 0.5409815311431885, "learning_rate": 0.0001, "loss": 0.6287, "step": 78 },
    { "epoch": 0.4817073170731707, "grad_norm": 0.6004408001899719, "learning_rate": 0.0001, "loss": 0.6356, "step": 79 },
    { "epoch": 0.4878048780487805, "grad_norm": 0.5724059343338013, "learning_rate": 0.0001, "loss": 0.6388, "step": 80 },
    { "epoch": 0.49390243902439024, "grad_norm": 0.6099798083305359, "learning_rate": 0.0001, "loss": 0.7115, "step": 81 },
    { "epoch": 0.5, "grad_norm": 0.5958842039108276, "learning_rate": 0.0001, "loss": 0.655, "step": 82 },
    { "epoch": 0.5060975609756098, "grad_norm": 0.6181111335754395, "learning_rate": 0.0001, "loss": 0.6391, "step": 83 },
    { "epoch": 0.5121951219512195, "grad_norm": 0.5894577503204346, "learning_rate": 0.0001, "loss": 0.6791, "step": 84 },
    { "epoch": 0.5182926829268293, "grad_norm": 0.5830883979797363, "learning_rate": 0.0001, "loss": 0.6582, "step": 85 },
    { "epoch": 0.524390243902439, "grad_norm": 0.5686275362968445, "learning_rate": 0.0001, "loss": 0.678, "step": 86 },
    { "epoch": 0.5304878048780488, "grad_norm": 0.6119154095649719, "learning_rate": 0.0001, "loss": 0.6714, "step": 87 },
    { "epoch": 0.5365853658536586, "grad_norm": 0.5826413035392761, "learning_rate": 0.0001, "loss": 0.6746, "step": 88 },
    { "epoch": 0.5426829268292683, "grad_norm": 0.6128208041191101, "learning_rate": 0.0001, "loss": 0.6851, "step": 89 },
    { "epoch": 0.5487804878048781, "grad_norm": 0.575299859046936, "learning_rate": 0.0001, "loss": 0.6439, "step": 90 },
    { "epoch": 0.5548780487804879, "grad_norm": 0.6011075377464294, "learning_rate": 0.0001, "loss": 0.689, "step": 91 },
    { "epoch": 0.5609756097560976, "grad_norm": 0.5696834325790405, "learning_rate": 0.0001, "loss": 0.688, "step": 92 },
    { "epoch": 0.5670731707317073, "grad_norm": 0.5776868462562561, "learning_rate": 0.0001, "loss": 0.6237, "step": 93 },
    { "epoch": 0.573170731707317, "grad_norm": 0.5697721242904663, "learning_rate": 0.0001, "loss": 0.6551, "step": 94 },
    { "epoch": 0.5792682926829268, "grad_norm": 0.5542324185371399, "learning_rate": 0.0001, "loss": 0.615, "step": 95 },
    { "epoch": 0.5853658536585366, "grad_norm": 0.5746421217918396, "learning_rate": 0.0001, "loss": 0.6814, "step": 96 },
    { "epoch": 0.5914634146341463, "grad_norm": 0.5714977383613586, "learning_rate": 0.0001, "loss": 0.6698, "step": 97 },
    { "epoch": 0.5975609756097561, "grad_norm": 0.5868296027183533, "learning_rate": 0.0001, "loss": 0.6482, "step": 98 },
    { "epoch": 0.6036585365853658, "grad_norm": 0.5577363967895508, "learning_rate": 0.0001, "loss": 0.663, "step": 99 },
    { "epoch": 0.6097560975609756, "grad_norm": 0.51622474193573, "learning_rate": 0.0001, "loss": 0.5813, "step": 100 },
    { "epoch": 0.6158536585365854, "grad_norm": 0.5596529245376587, "learning_rate": 0.0001, "loss": 0.6157, "step": 101 },
    { "epoch": 0.6219512195121951, "grad_norm": 0.585007905960083, "learning_rate": 0.0001, "loss": 0.6734, "step": 102 },
    { "epoch": 0.6280487804878049, "grad_norm": 0.5682265758514404, "learning_rate": 0.0001, "loss": 0.6231, "step": 103 },
    { "epoch": 0.6341463414634146, "grad_norm": 0.6157271265983582, "learning_rate": 0.0001, "loss": 0.6679, "step": 104 },
    { "epoch": 0.6402439024390244, "grad_norm": 0.5796582698822021, "learning_rate": 0.0001, "loss": 0.6091, "step": 105 },
    { "epoch": 0.6463414634146342, "grad_norm": 0.5919722318649292, "learning_rate": 0.0001, "loss": 0.6744, "step": 106 },
    { "epoch": 0.6524390243902439, "grad_norm": 0.5803415775299072, "learning_rate": 0.0001, "loss": 0.6316, "step": 107 },
    { "epoch": 0.6585365853658537, "grad_norm": 0.5573592782020569, "learning_rate": 0.0001, "loss": 0.6028, "step": 108 },
    { "epoch": 0.6646341463414634, "grad_norm": 0.5864866375923157, "learning_rate": 0.0001, "loss": 0.6442, "step": 109 },
    { "epoch": 0.6707317073170732, "grad_norm": 0.5456053018569946, "learning_rate": 0.0001, "loss": 0.6233, "step": 110 },
    { "epoch": 0.676829268292683, "grad_norm": 0.575710654258728, "learning_rate": 0.0001, "loss": 0.6303, "step": 111 },
    { "epoch": 0.6829268292682927, "grad_norm": 0.6122698783874512, "learning_rate": 0.0001, "loss": 0.6676, "step": 112 },
    { "epoch": 0.6890243902439024, "grad_norm": 0.5976404547691345, "learning_rate": 0.0001, "loss": 0.6533, "step": 113 },
    { "epoch": 0.6951219512195121, "grad_norm": 0.6462607979774475, "learning_rate": 0.0001, "loss": 0.7024, "step": 114 },
    { "epoch": 0.7012195121951219, "grad_norm": 0.5650457143783569, "learning_rate": 0.0001, "loss": 0.6667, "step": 115 },
    { "epoch": 0.7073170731707317, "grad_norm": 0.5858912467956543, "learning_rate": 0.0001, "loss": 0.6492, "step": 116 },
    { "epoch": 0.7134146341463414, "grad_norm": 0.5636318325996399, "learning_rate": 0.0001, "loss": 0.6112, "step": 117 },
    { "epoch": 0.7195121951219512, "grad_norm": 0.5599079728126526, "learning_rate": 0.0001, "loss": 0.6817, "step": 118 },
    { "epoch": 0.725609756097561, "grad_norm": 0.551928699016571, "learning_rate": 0.0001, "loss": 0.6534, "step": 119 },
    { "epoch": 0.7317073170731707, "grad_norm": 0.5585001707077026, "learning_rate": 0.0001, "loss": 0.6517, "step": 120 },
    { "epoch": 0.7378048780487805, "grad_norm": 0.5939499139785767, "learning_rate": 0.0001, "loss": 0.637, "step": 121 },
    { "epoch": 0.7439024390243902, "grad_norm": 0.6028351187705994, "learning_rate": 0.0001, "loss": 0.6497, "step": 122 },
    { "epoch": 0.75, "grad_norm": 0.6053422689437866, "learning_rate": 0.0001, "loss": 0.6606, "step": 123 },
    { "epoch": 0.7560975609756098, "grad_norm": 0.5626771450042725, "learning_rate": 0.0001, "loss": 0.6475, "step": 124 },
    { "epoch": 0.7621951219512195, "grad_norm": 0.5561665892601013, "learning_rate": 0.0001, "loss": 0.6126, "step": 125 },
    { "epoch": 0.7682926829268293, "grad_norm": 0.5361859202384949, "learning_rate": 0.0001, "loss": 0.6737, "step": 126 },
    { "epoch": 0.774390243902439, "grad_norm": 0.5999827980995178, "learning_rate": 0.0001, "loss": 0.627, "step": 127 },
    { "epoch": 0.7804878048780488, "grad_norm": 0.5717467665672302, "learning_rate": 0.0001, "loss": 0.7242, "step": 128 },
    { "epoch": 0.7865853658536586, "grad_norm": 0.5655209422111511, "learning_rate": 0.0001, "loss": 0.6072, "step": 129 },
    { "epoch": 0.7926829268292683, "grad_norm": 0.5843133926391602, "learning_rate": 0.0001, "loss": 0.6727, "step": 130 },
    { "epoch": 0.7987804878048781, "grad_norm": 0.5787593722343445, "learning_rate": 0.0001, "loss": 0.6394, "step": 131 },
    { "epoch": 0.8048780487804879, "grad_norm": 0.5661312341690063, "learning_rate": 0.0001, "loss": 0.6122, "step": 132 },
    { "epoch": 0.8109756097560976, "grad_norm": 0.602393388748169, "learning_rate": 0.0001, "loss": 0.6193, "step": 133 },
    { "epoch": 0.8170731707317073, "grad_norm": 0.630905032157898, "learning_rate": 0.0001, "loss": 0.6427, "step": 134 },
    { "epoch": 0.823170731707317, "grad_norm": 0.6203592419624329, "learning_rate": 0.0001, "loss": 0.6491, "step": 135 },
    { "epoch": 0.8292682926829268, "grad_norm": 0.5753608345985413, "learning_rate": 0.0001, "loss": 0.6295, "step": 136 },
    { "epoch": 0.8353658536585366, "grad_norm": 0.5919385552406311, "learning_rate": 0.0001, "loss": 0.6262, "step": 137 },
    { "epoch": 0.8414634146341463, "grad_norm": 0.564659833908081, "learning_rate": 0.0001, "loss": 0.6437, "step": 138 },
    { "epoch": 0.8475609756097561, "grad_norm": 0.5595895648002625, "learning_rate": 0.0001, "loss": 0.628, "step": 139 },
    { "epoch": 0.8536585365853658, "grad_norm": 0.5651856064796448, "learning_rate": 0.0001, "loss": 0.622, "step": 140 },
    { "epoch": 0.8597560975609756, "grad_norm": 0.5735089778900146, "learning_rate": 0.0001, "loss": 0.6313, "step": 141 },
    { "epoch": 0.8658536585365854, "grad_norm": 0.6084374189376831, "learning_rate": 0.0001, "loss": 0.6528, "step": 142 },
    { "epoch": 0.8719512195121951, "grad_norm": 0.5673129558563232, "learning_rate": 0.0001, "loss": 0.6163, "step": 143 },
    { "epoch": 0.8780487804878049, "grad_norm": 0.5617730021476746, "learning_rate": 0.0001, "loss": 0.6397, "step": 144 },
    { "epoch": 0.8841463414634146, "grad_norm": 0.5928285121917725, "learning_rate": 0.0001, "loss": 0.64, "step": 145 },
    { "epoch": 0.8902439024390244, "grad_norm": 0.5878246426582336, "learning_rate": 0.0001, "loss": 0.6691, "step": 146 },
    { "epoch": 0.8963414634146342, "grad_norm": 0.5934311747550964, "learning_rate": 0.0001, "loss": 0.6325, "step": 147 },
    { "epoch": 0.9024390243902439, "grad_norm": 0.5465561151504517, "learning_rate": 0.0001, "loss": 0.663, "step": 148 },
    { "epoch": 0.9085365853658537, "grad_norm": 0.5870200991630554, "learning_rate": 0.0001, "loss": 0.6104, "step": 149 },
    { "epoch": 0.9146341463414634, "grad_norm": 0.6161399483680725, "learning_rate": 0.0001, "loss": 0.6553, "step": 150 },
    { "epoch": 0.9207317073170732, "grad_norm": 0.5733305811882019, "learning_rate": 0.0001, "loss": 0.6167, "step": 151 },
    { "epoch": 0.926829268292683, "grad_norm": 0.595331072807312, "learning_rate": 0.0001, "loss": 0.6594, "step": 152 },
    { "epoch": 0.9329268292682927, "grad_norm": 0.5634722709655762, "learning_rate": 0.0001, "loss": 0.6435, "step": 153 },
    { "epoch": 0.9390243902439024, "grad_norm": 0.5649352073669434, "learning_rate": 0.0001, "loss": 0.6338, "step": 154 },
    { "epoch": 0.9451219512195121, "grad_norm": 0.5804089903831482, "learning_rate": 0.0001, "loss": 0.6151, "step": 155 },
    { "epoch": 0.9512195121951219, "grad_norm": 0.5910571217536926, "learning_rate": 0.0001, "loss": 0.6083, "step": 156 },
    { "epoch": 0.9573170731707317, "grad_norm": 0.6512947082519531, "learning_rate": 0.0001, "loss": 0.652, "step": 157 },
    { "epoch": 0.9634146341463414, "grad_norm": 0.6277866363525391, "learning_rate": 0.0001, "loss": 0.6363, "step": 158 },
    { "epoch": 0.9695121951219512, "grad_norm": 0.5870842933654785, "learning_rate": 0.0001, "loss": 0.6417, "step": 159 },
    { "epoch": 0.975609756097561, "grad_norm": 0.546256422996521, "learning_rate": 0.0001, "loss": 0.5957, "step": 160 },
    { "epoch": 0.9817073170731707, "grad_norm": 0.5940456390380859, "learning_rate": 0.0001, "loss": 0.5774, "step": 161 },
    { "epoch": 0.9878048780487805, "grad_norm": 0.5390895009040833, "learning_rate": 0.0001, "loss": 0.6131, "step": 162 },
    { "epoch": 0.9939024390243902, "grad_norm": 0.5646426677703857, "learning_rate": 0.0001, "loss": 0.6247, "step": 163 },
    { "epoch": 1.0, "grad_norm": 0.5933319330215454, "learning_rate": 0.0001, "loss": 0.6107, "step": 164 },
    { "epoch": 1.0060975609756098, "grad_norm": 0.5555415749549866, "learning_rate": 0.0001, "loss": 0.5038, "step": 165 },
    { "epoch": 1.0121951219512195, "grad_norm": 0.5714491605758667, "learning_rate": 0.0001, "loss": 0.5403, "step": 166 },
    { "epoch": 1.0182926829268293, "grad_norm": 0.6099926829338074, "learning_rate": 0.0001, "loss": 0.4943, "step": 167 },
    { "epoch": 1.024390243902439, "grad_norm": 0.7038013339042664, "learning_rate": 0.0001, "loss": 0.4801, "step": 168 },
    { "epoch": 1.0304878048780488, "grad_norm": 0.6525987982749939, "learning_rate": 0.0001, "loss": 0.499, "step": 169 },
    { "epoch": 1.0365853658536586, "grad_norm": 0.5772536396980286, "learning_rate": 0.0001, "loss": 0.4899, "step": 170 },
    { "epoch": 1.0426829268292683, "grad_norm": 0.5953510999679565, "learning_rate": 0.0001, "loss": 0.5343, "step": 171 },
    { "epoch": 1.048780487804878, "grad_norm": 0.579450786113739, "learning_rate": 0.0001, "loss": 0.5222, "step": 172 },
    { "epoch": 1.0548780487804879, "grad_norm": 0.5960140228271484, "learning_rate": 0.0001, "loss": 0.4936, "step": 173 },
    { "epoch": 1.0609756097560976, "grad_norm": 0.5782721042633057, "learning_rate": 0.0001, "loss": 0.487, "step": 174 },
    { "epoch": 1.0670731707317074, "grad_norm": 0.6194652318954468, "learning_rate": 0.0001, "loss": 0.5045, "step": 175 },
    { "epoch": 1.0731707317073171, "grad_norm": 0.7137989401817322, "learning_rate": 0.0001, "loss": 0.5206, "step": 176 },
    { "epoch": 1.079268292682927, "grad_norm": 0.6591524481773376, "learning_rate": 0.0001, "loss": 0.5203, "step": 177 },
    { "epoch": 1.0853658536585367, "grad_norm": 0.5615283846855164, "learning_rate": 0.0001, "loss": 0.4845, "step": 178 },
    { "epoch": 1.0914634146341464, "grad_norm": 0.5729933381080627, "learning_rate": 0.0001, "loss": 0.5166, "step": 179 },
    { "epoch": 1.0975609756097562, "grad_norm": 0.5670926570892334, "learning_rate": 0.0001, "loss": 0.5311, "step": 180 },
    { "epoch": 1.103658536585366, "grad_norm": 0.5750375390052795, "learning_rate": 0.0001, "loss": 0.4739, "step": 181 },
    { "epoch": 1.1097560975609757, "grad_norm": 0.5616285800933838, "learning_rate": 0.0001, "loss": 0.513, "step": 182 },
    { "epoch": 1.1158536585365855, "grad_norm": 0.6150811910629272, "learning_rate": 0.0001, "loss": 0.53, "step": 183 },
    { "epoch": 1.1219512195121952, "grad_norm": 0.6283072233200073, "learning_rate": 0.0001, "loss": 0.5099, "step": 184 },
    { "epoch": 1.1280487804878048, "grad_norm": 0.5622886419296265, "learning_rate": 0.0001, "loss": 0.4663, "step": 185 },
    { "epoch": 1.1341463414634148, "grad_norm": 0.6202870607376099, "learning_rate": 0.0001, "loss": 0.5113, "step": 186 },
    { "epoch": 1.1402439024390243, "grad_norm": 0.5678901672363281, "learning_rate": 0.0001, "loss": 0.4595, "step": 187 },
    { "epoch": 1.146341463414634, "grad_norm": 0.6146119832992554, "learning_rate": 0.0001, "loss": 0.5248, "step": 188 },
    { "epoch": 1.1524390243902438, "grad_norm": 0.5726969838142395, "learning_rate": 0.0001, "loss": 0.5016, "step": 189 },
    { "epoch": 1.1585365853658536, "grad_norm": 0.5848289132118225, "learning_rate": 0.0001, "loss": 0.5236, "step": 190 },
    { "epoch": 1.1646341463414633, "grad_norm": 0.598795473575592, "learning_rate": 0.0001, "loss": 0.5444, "step": 191 },
    { "epoch": 1.170731707317073, "grad_norm": 0.5984260439872742, "learning_rate": 0.0001, "loss": 0.5291, "step": 192 },
    { "epoch": 1.1768292682926829, "grad_norm": 0.5640114545822144, "learning_rate": 0.0001, "loss": 0.5366, "step": 193 },
    { "epoch": 1.1829268292682926, "grad_norm": 0.5771395564079285, "learning_rate": 0.0001, "loss": 0.519, "step": 194 },
    { "epoch": 1.1890243902439024, "grad_norm": 0.5926110744476318, "learning_rate": 0.0001, "loss": 0.4945, "step": 195 },
    { "epoch": 1.1951219512195121, "grad_norm": 0.6406283974647522, "learning_rate": 0.0001, "loss": 0.5313, "step": 196 },
    { "epoch": 1.201219512195122, "grad_norm": 0.5671162009239197, "learning_rate": 0.0001, "loss": 0.4971, "step": 197 },
    { "epoch": 1.2073170731707317, "grad_norm": 0.5952590703964233, "learning_rate": 0.0001, "loss": 0.4886, "step": 198 },
    { "epoch": 1.2134146341463414, "grad_norm": 0.6368497014045715, "learning_rate": 0.0001, "loss": 0.4984, "step": 199 },
    { "epoch": 1.2195121951219512, "grad_norm": 0.6427241563796997, "learning_rate": 0.0001, "loss": 0.5201, "step": 200 },
    { "epoch": 1.225609756097561, "grad_norm": 0.5814225673675537, "learning_rate": 0.0001, "loss": 0.5021, "step": 201 },
    { "epoch": 1.2317073170731707, "grad_norm": 0.5985032916069031, "learning_rate": 0.0001, "loss": 0.4969, "step": 202 },
    { "epoch": 1.2378048780487805, "grad_norm": 0.5723533630371094, "learning_rate": 0.0001, "loss": 0.485, "step": 203 },
    { "epoch": 1.2439024390243902, "grad_norm": 0.598479688167572, "learning_rate": 0.0001, "loss": 0.496, "step": 204 },
    { "epoch": 1.25, "grad_norm": 0.6005733013153076, "learning_rate": 0.0001, "loss": 0.4746, "step": 205 },
    { "epoch": 1.2560975609756098, "grad_norm": 0.630957841873169, "learning_rate": 0.0001, "loss": 0.5069, "step": 206 },
    { "epoch": 1.2621951219512195, "grad_norm": 0.6369969248771667, "learning_rate": 0.0001, "loss": 0.4869, "step": 207 },
    { "epoch": 1.2682926829268293, "grad_norm": 0.6387524008750916, "learning_rate": 0.0001, "loss": 0.5133, "step": 208 },
    { "epoch": 1.274390243902439, "grad_norm": 0.6263754367828369, "learning_rate": 0.0001, "loss": 0.5444, "step": 209 },
    { "epoch": 1.2804878048780488, "grad_norm": 0.557532012462616, "learning_rate": 0.0001, "loss": 0.4726, "step": 210 },
    { "epoch": 1.2865853658536586, "grad_norm": 0.576702892780304, "learning_rate": 0.0001, "loss": 0.5325, "step": 211 },
    { "epoch": 1.2926829268292683, "grad_norm": 0.6313229203224182, "learning_rate": 0.0001, "loss": 0.5044, "step": 212 },
    { "epoch": 1.298780487804878, "grad_norm": 0.625912070274353, "learning_rate": 0.0001, "loss": 0.5381, "step": 213 },
    { "epoch": 1.3048780487804879, "grad_norm": 0.6148139238357544, "learning_rate": 0.0001, "loss": 0.4934, "step": 214 },
    { "epoch": 1.3109756097560976, "grad_norm": 0.6258604526519775, "learning_rate": 0.0001, "loss": 0.5239, "step": 215 },
    { "epoch": 1.3170731707317074, "grad_norm": 0.6130456924438477, "learning_rate": 0.0001, "loss": 0.5014, "step": 216 },
    { "epoch": 1.3231707317073171, "grad_norm": 0.606001615524292, "learning_rate": 0.0001, "loss": 0.5201, "step": 217 },
    { "epoch": 1.329268292682927, "grad_norm": 0.5635973215103149, "learning_rate": 0.0001, "loss": 0.4932, "step": 218 },
    { "epoch": 1.3353658536585367, "grad_norm": 0.5979434251785278, "learning_rate": 0.0001, "loss": 0.5142, "step": 219 },
    { "epoch": 1.3414634146341464, "grad_norm": 0.5663168430328369, "learning_rate": 0.0001, "loss": 0.524, "step": 220 },
    { "epoch": 1.3475609756097562, "grad_norm": 0.6072438955307007, "learning_rate": 0.0001, "loss": 0.4997, "step": 221 },
    { "epoch": 1.3536585365853657, "grad_norm": 0.601750373840332, "learning_rate": 0.0001, "loss": 0.4946, "step": 222 },
    { "epoch": 1.3597560975609757, "grad_norm": 0.6556447744369507, "learning_rate": 0.0001, "loss": 0.5114, "step": 223 },
    { "epoch": 1.3658536585365852, "grad_norm": 0.6329565048217773, "learning_rate": 0.0001, "loss": 0.512, "step": 224 },
    { "epoch": 1.3719512195121952, "grad_norm": 0.6002699136734009, "learning_rate": 0.0001, "loss": 0.494, "step": 225 },
    { "epoch": 1.3780487804878048, "grad_norm": 0.6447397470474243, "learning_rate": 0.0001, "loss": 0.548, "step": 226 },
    { "epoch": 1.3841463414634148, "grad_norm": 0.5840697288513184, "learning_rate": 0.0001, "loss": 0.5177, "step": 227 },
    { "epoch": 1.3902439024390243, "grad_norm": 0.5911181569099426, "learning_rate": 0.0001, "loss": 0.5183, "step": 228 },
    { "epoch": 1.3963414634146343, "grad_norm": 0.6022722125053406, "learning_rate": 0.0001, "loss": 0.476, "step": 229 },
    { "epoch": 1.4024390243902438, "grad_norm": 0.5788743495941162, "learning_rate": 0.0001, "loss": 0.5109, "step": 230 },
    { "epoch": 1.4085365853658536, "grad_norm": 0.5945917963981628, "learning_rate": 0.0001, "loss": 0.4869, "step": 231 },
    { "epoch": 1.4146341463414633, "grad_norm": 0.638956606388092, "learning_rate": 0.0001, "loss": 0.53, "step": 232 },
    { "epoch": 1.420731707317073, "grad_norm": 0.6204885840415955, "learning_rate": 0.0001, "loss": 0.5205, "step": 233 },
    { "epoch": 1.4268292682926829, "grad_norm": 0.5931024551391602, "learning_rate": 0.0001, "loss": 0.5167, "step": 234 },
    { "epoch": 1.4329268292682926, "grad_norm": 0.5996592044830322, "learning_rate": 0.0001, "loss": 0.4935, "step": 235 },
    { "epoch": 1.4390243902439024, "grad_norm": 0.6242860555648804, "learning_rate": 0.0001, "loss": 0.5047, "step": 236 },
    { "epoch": 1.4451219512195121, "grad_norm": 0.5914901494979858, "learning_rate": 0.0001, "loss": 0.5092, "step": 237 },
    { "epoch": 1.451219512195122, "grad_norm": 0.6710638999938965, "learning_rate": 0.0001, "loss": 0.5437, "step": 238 },
    { "epoch": 1.4573170731707317, "grad_norm": 0.6554276347160339, "learning_rate": 0.0001, "loss": 0.4906, "step": 239 },
    { "epoch": 1.4634146341463414, "grad_norm": 0.6532212495803833, "learning_rate": 0.0001, "loss": 0.5508, "step": 240 },
    { "epoch": 1.4695121951219512, "grad_norm": 0.5957479476928711, "learning_rate": 0.0001, "loss": 0.4902, "step": 241 },
    { "epoch": 1.475609756097561, "grad_norm": 0.5946776270866394, "learning_rate": 0.0001, "loss": 0.5085, "step": 242 },
    { "epoch": 1.4817073170731707, "grad_norm": 0.5819572806358337, "learning_rate": 0.0001, "loss": 0.4819, "step": 243 },
    { "epoch": 1.4878048780487805, "grad_norm": 0.6151570081710815, "learning_rate": 0.0001, "loss": 0.5058, "step": 244 },
    { "epoch": 1.4939024390243902, "grad_norm": 0.6580333709716797, "learning_rate": 0.0001, "loss": 0.506, "step": 245 },
    { "epoch": 1.5, "grad_norm": 0.6214548945426941, "learning_rate": 0.0001, "loss": 0.4739, "step": 246 },
    { "epoch": 1.5060975609756098, "grad_norm": 0.6240037083625793, "learning_rate": 0.0001, "loss": 0.4898, "step": 247 },
    { "epoch": 1.5121951219512195, "grad_norm": 0.6115790605545044, "learning_rate": 0.0001, "loss": 0.5143, "step": 248 },
    { "epoch": 1.5182926829268293, "grad_norm": 0.5654324293136597, "learning_rate": 0.0001, "loss": 0.4409, "step": 249 },
    { "epoch": 1.524390243902439, "grad_norm": 0.5737196207046509, "learning_rate": 0.0001, "loss": 0.4936, "step": 250 },
    { "epoch": 1.5304878048780488, "grad_norm": 0.6084273457527161, "learning_rate": 0.0001, "loss": 0.5182, "step": 251 },
    { "epoch": 1.5365853658536586, "grad_norm": 0.5695486664772034, "learning_rate": 0.0001, "loss": 0.4857, "step": 252 },
    { "epoch": 1.5426829268292683, "grad_norm": 0.5693416595458984, "learning_rate": 0.0001, "loss": 0.5028, "step": 253 },
    { "epoch": 1.548780487804878, "grad_norm": 0.5976539850234985, "learning_rate": 0.0001, "loss": 0.492, "step": 254 },
    { "epoch": 1.5548780487804879, "grad_norm": 0.6122463941574097, "learning_rate": 0.0001, "loss": 0.5412, "step": 255 },
    { "epoch": 1.5609756097560976, "grad_norm": 0.5977299213409424, "learning_rate": 0.0001, "loss": 0.5173, "step": 256 },
    { "epoch": 1.5670731707317072, "grad_norm": 0.5926475524902344, "learning_rate": 0.0001, "loss": 0.5037, "step": 257 },
    { "epoch": 1.5731707317073171, "grad_norm": 0.5920047163963318, "learning_rate": 0.0001, "loss": 0.4779, "step": 258 },
    { "epoch": 1.5792682926829267, "grad_norm": 0.5987219214439392, "learning_rate": 0.0001, "loss": 0.5132, "step": 259 },
    { "epoch": 1.5853658536585367, "grad_norm": 0.5943930149078369, "learning_rate": 0.0001, "loss": 0.4938, "step": 260 },
    { "epoch": 1.5914634146341462, "grad_norm": 0.6259720921516418, "learning_rate": 0.0001, "loss": 0.5295, "step": 261 },
    { "epoch": 1.5975609756097562, "grad_norm": 0.6168601512908936, "learning_rate": 0.0001, "loss": 0.4633, "step": 262 },
    { "epoch": 1.6036585365853657, "grad_norm": 0.6057328581809998, "learning_rate": 0.0001, "loss": 0.5074, "step": 263 },
    { "epoch": 1.6097560975609757, "grad_norm": 0.607790470123291, "learning_rate": 0.0001, "loss": 0.5068, "step": 264 },
    { "epoch": 1.6158536585365852, "grad_norm": 0.5669077634811401, "learning_rate": 0.0001, "loss": 0.4578, "step": 265 },
    { "epoch": 1.6219512195121952, "grad_norm": 0.58953458070755, "learning_rate": 0.0001, "loss": 0.512, "step": 266 },
    { "epoch": 1.6280487804878048, "grad_norm": 0.6138054728507996, "learning_rate": 0.0001, "loss": 0.5035, "step": 267 },
    { "epoch": 1.6341463414634148, "grad_norm": 0.6316951513290405, "learning_rate": 0.0001, "loss": 0.5374, "step": 268 },
    { "epoch": 1.6402439024390243, "grad_norm": 0.5779020190238953, "learning_rate": 0.0001, "loss": 0.4934, "step": 269 },
    { "epoch": 1.6463414634146343, "grad_norm": 0.6008270978927612, "learning_rate": 0.0001, "loss": 0.4628, "step": 270 },
    { "epoch": 1.6524390243902438, "grad_norm": 0.5894110202789307, "learning_rate": 0.0001, "loss": 0.5109, "step": 271 },
    { "epoch": 1.6585365853658538, "grad_norm": 0.5894849896430969, "learning_rate": 0.0001, "loss": 0.4861, "step": 272 },
    { "epoch": 1.6646341463414633, "grad_norm": 0.6085466146469116, "learning_rate": 0.0001, "loss": 0.5101, "step": 273 },
    { "epoch": 1.6707317073170733, "grad_norm": 0.6503622531890869, "learning_rate": 0.0001, "loss": 0.5508, "step": 274 },
    { "epoch": 1.6768292682926829, "grad_norm": 0.6089245676994324, "learning_rate": 0.0001, "loss": 0.4911, "step": 275 },
    { "epoch": 1.6829268292682928, "grad_norm": 0.6388260126113892, "learning_rate": 0.0001, "loss": 0.5165, "step": 276 },
    { "epoch": 1.6890243902439024, "grad_norm": 0.6048246622085571, "learning_rate": 0.0001, "loss": 0.5405, "step": 277 },
    { "epoch": 1.6951219512195121, "grad_norm": 0.5887222290039062, "learning_rate": 0.0001, "loss": 0.5205, "step": 278 },
    { "epoch": 1.701219512195122, "grad_norm": 0.6097093820571899, "learning_rate": 0.0001, "loss": 0.5139, "step": 279 },
    { "epoch": 1.7073170731707317, "grad_norm": 0.5547489523887634, "learning_rate": 0.0001, "loss": 0.4915, "step": 280 },
    { "epoch": 1.7134146341463414, "grad_norm": 0.6122882962226868, "learning_rate": 0.0001, "loss": 0.493, "step": 281 },
    { "epoch": 1.7195121951219512, "grad_norm": 0.6592060923576355, "learning_rate": 0.0001, "loss": 0.5314, "step": 282 },
    { "epoch": 1.725609756097561, "grad_norm": 0.6154331564903259, "learning_rate": 0.0001, "loss": 0.5025, "step": 283 },
    { "epoch": 1.7317073170731707, "grad_norm": 0.5997411608695984, "learning_rate": 0.0001, "loss": 0.5057, "step": 284 },
    { "epoch": 1.7378048780487805, "grad_norm": 0.615349292755127, "learning_rate": 0.0001, "loss": 0.5195, "step": 285 },
    { "epoch": 1.7439024390243902, "grad_norm": 0.6155688762664795, "learning_rate": 0.0001, "loss": 0.5194, "step": 286 },
    { "epoch": 1.75, "grad_norm": 0.5677372217178345, "learning_rate": 0.0001, "loss": 0.5433, "step": 287 },
    { "epoch": 1.7560975609756098, "grad_norm": 0.5937820672988892, "learning_rate": 0.0001, "loss": 0.5269, "step": 288 },
    { "epoch": 1.7621951219512195, "grad_norm": 0.5868131518363953, "learning_rate": 0.0001, "loss": 0.535, "step": 289 },
    { "epoch": 1.7682926829268293, "grad_norm": 0.6256383061408997, "learning_rate": 0.0001, "loss": 0.5196, "step": 290 },
    { "epoch": 1.774390243902439, "grad_norm": 0.6187792420387268, "learning_rate": 0.0001, "loss": 0.5027, "step": 291 },
    { "epoch": 1.7804878048780488, "grad_norm": 0.6260528564453125, "learning_rate": 0.0001, "loss": 0.5437, "step": 292 },
    { "epoch": 1.7865853658536586, "grad_norm": 0.5868582129478455, "learning_rate": 0.0001, "loss": 0.5133, "step": 293 },
    { "epoch": 1.7926829268292683, "grad_norm": 0.6079871654510498, "learning_rate": 0.0001, "loss": 0.5102, "step": 294 },
    { "epoch": 1.798780487804878, "grad_norm": 0.5693763494491577, "learning_rate": 0.0001, "loss": 0.4933, "step": 295 },
    { "epoch": 1.8048780487804879, "grad_norm": 0.6394689679145813, "learning_rate": 0.0001, "loss": 0.5452, "step": 296 },
    { "epoch": 1.8109756097560976, "grad_norm": 0.6318659782409668, "learning_rate": 0.0001, "loss": 0.5391, "step": 297 },
    { "epoch": 1.8170731707317072, "grad_norm": 0.5786278247833252, "learning_rate": 0.0001, "loss": 0.5129, "step": 298 },
    { "epoch": 1.8231707317073171, "grad_norm": 0.6378489136695862, "learning_rate": 0.0001, "loss": 0.4935, "step": 299 },
    { "epoch": 1.8292682926829267, "grad_norm": 0.637844979763031, "learning_rate": 0.0001, "loss": 0.5057, "step": 300 },
    { "epoch": 1.8353658536585367, "grad_norm": 0.6403583288192749, "learning_rate": 0.0001, "loss": 0.542, "step": 301 },
    { "epoch": 1.8414634146341462, "grad_norm": 0.6149348616600037, "learning_rate": 0.0001, "loss": 0.5108, "step": 302 },
    { "epoch": 1.8475609756097562, "grad_norm": 0.5945342779159546, "learning_rate": 0.0001, "loss": 0.496, "step": 303 },
    { "epoch": 1.8536585365853657, "grad_norm": 0.6346225142478943, "learning_rate": 0.0001, "loss": 0.5463, "step": 304 },
    { "epoch": 1.8597560975609757, "grad_norm": 0.590212881565094, "learning_rate": 0.0001, "loss": 0.5126, "step": 305 },
    { "epoch": 1.8658536585365852, "grad_norm": 0.5924628973007202, "learning_rate": 0.0001, "loss": 0.5096, "step": 306 },
    { "epoch": 1.8719512195121952, "grad_norm": 0.6342692375183105, "learning_rate": 0.0001, "loss": 0.5063, "step": 307 },
    { "epoch": 1.8780487804878048, "grad_norm": 0.6688621640205383, "learning_rate": 0.0001, "loss": 0.5534, "step": 308 },
    { "epoch": 1.8841463414634148, "grad_norm": 0.628839910030365, "learning_rate": 0.0001, "loss": 0.4975, "step": 309 },
    { "epoch": 1.8902439024390243, "grad_norm": 0.6141210794448853, "learning_rate": 0.0001, "loss": 0.4777, "step": 310 },
    { "epoch": 1.8963414634146343, "grad_norm": 0.6270496845245361, "learning_rate": 0.0001, "loss": 0.5019, "step": 311 },
    { "epoch": 1.9024390243902438, "grad_norm": 0.5861090421676636, "learning_rate": 0.0001, "loss": 0.5066, "step": 312 },
    { "epoch": 1.9085365853658538, "grad_norm": 0.5715667009353638, "learning_rate": 0.0001, "loss": 0.4766, "step": 313 },
    { "epoch": 1.9146341463414633, "grad_norm": 0.6288326978683472, "learning_rate": 0.0001, "loss": 0.5152, "step": 314 },
    { "epoch": 1.9207317073170733, "grad_norm": 0.5759385228157043, "learning_rate": 0.0001, "loss": 0.51, "step": 315 },
    { "epoch": 1.9268292682926829, "grad_norm": 0.6145620346069336, "learning_rate": 0.0001, "loss": 0.5104, "step": 316 },
    { "epoch": 1.9329268292682928, "grad_norm": 0.6138148903846741, "learning_rate": 0.0001, "loss": 0.4967, "step": 317 },
    { "epoch": 1.9390243902439024, "grad_norm": 0.6269311308860779, "learning_rate": 0.0001, "loss": 0.5479, "step": 318 },
    { "epoch": 1.9451219512195121, "grad_norm": 0.6406437754631042, "learning_rate": 0.0001, "loss": 0.5199, "step": 319 },
    { "epoch": 1.951219512195122, "grad_norm": 0.5639004707336426, "learning_rate": 0.0001, "loss": 0.4852, "step": 320 },
    { "epoch": 1.9573170731707317, "grad_norm": 0.5929526090621948, "learning_rate": 0.0001, "loss": 0.5253, "step": 321 },
    { "epoch": 1.9634146341463414, "grad_norm": 0.59356689453125, "learning_rate": 0.0001, "loss": 0.5094, "step": 322 },
    { "epoch": 1.9695121951219512, "grad_norm": 0.6183592677116394, "learning_rate": 0.0001, "loss": 0.495, "step": 323 },
    { "epoch": 1.975609756097561, "grad_norm": 0.5988680720329285, "learning_rate": 0.0001, "loss": 0.5017, "step": 324 },
    { "epoch": 1.9817073170731707, "grad_norm": 0.6253383159637451, "learning_rate": 0.0001, "loss": 0.5101, "step": 325 },
    { "epoch": 1.9878048780487805, "grad_norm": 0.6147765517234802, "learning_rate": 0.0001, "loss": 0.4952, "step": 326 },
    { "epoch": 1.9939024390243902, "grad_norm": 0.6041817665100098, "learning_rate": 0.0001, "loss": 0.5042, "step": 327 },
    { "epoch": 2.0, "grad_norm": 0.5927252769470215, "learning_rate": 0.0001, "loss": 0.4996, "step": 328 },
    { "epoch": 2.0060975609756095, "grad_norm": 0.6218935251235962, "learning_rate": 0.0001, "loss": 0.4171, "step": 329 },
    { "epoch": 2.0121951219512195, "grad_norm": 0.5569261312484741, "learning_rate": 0.0001, "loss": 0.3905, "step": 330 },
    { "epoch": 2.018292682926829, "grad_norm": 0.5948651432991028, "learning_rate": 0.0001, "loss": 0.3704, "step": 331 },
    { "epoch": 2.024390243902439, "grad_norm": 0.6893870830535889, "learning_rate": 0.0001, "loss": 0.3446, "step": 332 },
    { "epoch": 2.0304878048780486, "grad_norm": 0.6298575401306152, "learning_rate": 0.0001, "loss": 0.3657, "step": 333 },
    { "epoch": 2.0365853658536586, "grad_norm": 0.6463242173194885, "learning_rate": 0.0001, "loss": 0.3752, "step": 334 },
    { "epoch": 2.042682926829268, "grad_norm": 0.6220399141311646, "learning_rate": 0.0001, "loss": 0.4133, "step": 335 },
    { "epoch": 2.048780487804878, "grad_norm": 0.6175084710121155, "learning_rate": 0.0001, "loss": 0.3856, "step": 336 },
    { "epoch": 2.0548780487804876, "grad_norm": 0.5709812641143799, "learning_rate": 0.0001, "loss": 0.3791, "step": 337 },
    { "epoch": 2.0609756097560976, "grad_norm": 0.5842687487602234, "learning_rate": 0.0001, "loss": 0.3981, "step": 338 },
    { "epoch": 2.067073170731707, "grad_norm": 0.5711541771888733, "learning_rate": 0.0001, "loss": 0.3463, "step": 339 },
    { "epoch": 2.073170731707317, "grad_norm": 0.6160522103309631, "learning_rate": 0.0001, "loss": 0.3579, "step": 340 },
    { "epoch": 2.0792682926829267, "grad_norm": 0.6163449287414551, "learning_rate": 0.0001, "loss": 0.3651, "step": 341 },
    { "epoch": 2.0853658536585367, "grad_norm": 0.6386067271232605, "learning_rate": 0.0001, "loss": 0.4165, "step": 342 },
    { "epoch": 2.091463414634146, "grad_norm": 0.6074360609054565, "learning_rate": 0.0001, "loss": 0.383, "step": 343 },
    { "epoch": 2.097560975609756, "grad_norm": 0.5862374305725098, "learning_rate": 0.0001, "loss": 0.3658, "step": 344 },
    { "epoch": 2.1036585365853657, "grad_norm": 0.5639402270317078, "learning_rate": 0.0001, "loss": 0.3708, "step": 345 },
    { "epoch": 2.1097560975609757, "grad_norm": 0.5674434304237366, "learning_rate": 0.0001, "loss": 0.376, "step": 346 },
    { "epoch": 2.1158536585365852, "grad_norm": 0.641013503074646, "learning_rate": 0.0001, "loss": 0.3898, "step": 347 },
    { "epoch": 2.1219512195121952, "grad_norm": 0.6373003125190735, "learning_rate": 0.0001, "loss": 0.3998, "step": 348 },
    { "epoch": 2.1280487804878048, "grad_norm": 0.6026149392127991, "learning_rate": 0.0001, "loss": 0.3419, "step": 349 },
    { "epoch": 2.1341463414634148, "grad_norm": 0.5974167585372925, "learning_rate": 0.0001, "loss": 0.3501, "step": 350 },
    { "epoch": 2.1402439024390243, "grad_norm": 0.5709217190742493, "learning_rate": 0.0001, "loss": 0.4023, "step": 351 },
    { "epoch": 2.1463414634146343, "grad_norm": 0.6201815605163574, "learning_rate": 0.0001, "loss": 0.3801, "step": 352 },
    { "epoch": 2.152439024390244, "grad_norm": 0.5644124150276184, "learning_rate": 0.0001, "loss": 0.3536, "step": 353 },
    { "epoch": 2.158536585365854, "grad_norm": 0.5843915343284607, "learning_rate": 0.0001, "loss": 0.367, "step": 354 },
    { "epoch": 2.1646341463414633, "grad_norm": 0.6504707336425781, "learning_rate": 0.0001, "loss": 0.41, "step": 355 },
    { "epoch": 2.1707317073170733, "grad_norm": 0.6272132396697998, "learning_rate": 0.0001, "loss": 0.3642, "step": 356 },
    { "epoch": 2.176829268292683, "grad_norm": 0.6171401143074036, "learning_rate": 0.0001, "loss": 0.3709, "step": 357 },
    { "epoch": 2.182926829268293, "grad_norm": 0.5451359748840332, "learning_rate": 0.0001, "loss": 0.3699, "step": 358 },
    { "epoch": 2.1890243902439024, "grad_norm": 0.5557040572166443, "learning_rate": 0.0001, "loss": 0.3889, "step": 359 },
    { "epoch": 2.1951219512195124, "grad_norm": 0.5514318943023682, "learning_rate": 0.0001, "loss": 0.3595, "step": 360 },
    { "epoch": 2.201219512195122, "grad_norm": 0.6279582381248474, "learning_rate": 0.0001, "loss": 0.365, "step": 361 },
    { "epoch": 2.207317073170732, "grad_norm": 0.6362396478652954, "learning_rate": 0.0001, "loss": 0.3676, "step": 362 },
    { "epoch": 2.2134146341463414, "grad_norm": 0.6167373061180115, "learning_rate": 0.0001, "loss": 0.4047, "step": 363 },
    { "epoch": 2.2195121951219514, "grad_norm": 0.5988054871559143, "learning_rate": 0.0001, "loss": 0.3866, "step": 364 },
    { "epoch": 2.225609756097561, "grad_norm": 0.6260228753089905, "learning_rate": 0.0001, "loss": 0.3969, "step": 365 },
    { "epoch": 2.231707317073171, "grad_norm": 0.5669357180595398, "learning_rate": 0.0001, "loss": 0.3624, "step": 366 },
    { "epoch": 2.2378048780487805, "grad_norm": 0.5572336316108704, "learning_rate": 0.0001, "loss": 0.3802, "step": 367 },
    { "epoch": 2.2439024390243905, "grad_norm": 0.577407956123352, "learning_rate": 0.0001, "loss": 0.3814, "step": 368 },
    { "epoch": 2.25, "grad_norm": 0.5576046109199524, "learning_rate": 0.0001, "loss": 0.3529, "step": 369 },
    { "epoch": 2.2560975609756095, "grad_norm": 0.5899252891540527, "learning_rate": 0.0001, "loss": 0.361, "step": 370 },
    { "epoch": 2.2621951219512195, "grad_norm": 0.6026024222373962, "learning_rate": 0.0001, "loss": 0.3602, "step": 371 },
    { "epoch": 2.2682926829268295, "grad_norm": 0.651066780090332, "learning_rate": 0.0001, "loss": 0.3646, "step": 372 },
    { "epoch": 2.274390243902439, "grad_norm": 0.6255848407745361, "learning_rate": 0.0001, "loss": 0.3468, "step": 373 },
    { "epoch": 2.2804878048780486, "grad_norm": 0.6624294519424438, "learning_rate": 0.0001, "loss": 0.3928, "step": 374 },
    { "epoch": 2.2865853658536586, "grad_norm": 0.5514746308326721, "learning_rate": 0.0001, "loss": 0.374, "step": 375 },
    { "epoch": 2.292682926829268, "grad_norm": 0.5865519642829895, "learning_rate": 0.0001, "loss": 0.387, "step": 376 },
    { "epoch": 2.298780487804878, "grad_norm": 0.5901021957397461, "learning_rate": 0.0001, "loss": 0.3922, "step": 377 },
    { "epoch": 2.3048780487804876, "grad_norm": 0.5819031000137329, "learning_rate": 0.0001, "loss": 0.3825, "step": 378 },
    { "epoch": 2.3109756097560976, "grad_norm": 0.5795203447341919, "learning_rate": 0.0001, "loss": 0.3983, "step": 379 },
    { "epoch": 2.317073170731707, "grad_norm": 0.5817603468894958, "learning_rate": 0.0001, "loss": 0.3892, "step": 380 },
    { "epoch": 2.323170731707317, "grad_norm": 0.5905787348747253, "learning_rate": 0.0001, "loss": 0.3662, "step": 381 },
    { "epoch": 2.3292682926829267, "grad_norm": 0.6160801649093628, "learning_rate": 0.0001, "loss": 0.382, "step": 382 },
    { "epoch": 2.3353658536585367, "grad_norm": 0.6367721557617188, "learning_rate": 0.0001, "loss": 0.3684, "step": 383 },
    { "epoch": 2.341463414634146, "grad_norm": 0.6236375570297241, "learning_rate": 0.0001, "loss": 0.3671, "step": 384 },
    { "epoch": 2.347560975609756, "grad_norm": 0.5669872164726257, "learning_rate": 0.0001, "loss": 0.3634, "step": 385 },
    { "epoch": 2.3536585365853657, "grad_norm": 0.5991116166114807, "learning_rate": 0.0001, "loss": 0.3628, "step": 386 },
    { "epoch": 2.3597560975609757, "grad_norm": 0.5670086145401001, "learning_rate": 0.0001, "loss": 0.3635, "step": 387 },
    { "epoch": 2.3658536585365852, "grad_norm": 0.629401683807373, "learning_rate": 0.0001, "loss": 0.3925, "step": 388 },
    { "epoch": 2.3719512195121952, "grad_norm": 0.6248301267623901, "learning_rate": 0.0001, "loss": 0.3825, "step": 389 },
    { "epoch": 2.3780487804878048, "grad_norm": 0.5823646187782288, "learning_rate": 0.0001, "loss": 0.3775, "step": 390 },
    { "epoch": 2.3841463414634148, "grad_norm": 0.6670135855674744, "learning_rate": 0.0001, "loss": 0.3927, "step": 391 },
    { "epoch": 2.3902439024390243, "grad_norm": 0.6390913128852844, "learning_rate": 0.0001, "loss": 0.4057, "step": 392 },
    { "epoch": 2.3963414634146343, "grad_norm": 0.5848169922828674, "learning_rate": 0.0001, "loss": 0.3712, "step": 393 },
    { "epoch": 2.402439024390244, "grad_norm": 0.5966094732284546, "learning_rate": 0.0001, "loss": 0.3713, "step": 394 },
    { "epoch": 2.408536585365854, "grad_norm": 0.6144512891769409, "learning_rate": 0.0001, "loss": 0.3698, "step": 395 },
    { "epoch": 2.4146341463414633, "grad_norm": 0.5988245010375977, "learning_rate": 0.0001, "loss": 0.3686, "step": 396 },
    { "epoch": 2.4207317073170733, "grad_norm": 0.6109009981155396, "learning_rate": 0.0001, "loss": 0.3921, "step": 397 },
    { "epoch": 2.426829268292683, "grad_norm": 0.6432120203971863, "learning_rate": 0.0001, "loss": 0.4231, "step": 398 },
    { "epoch": 2.432926829268293, "grad_norm": 0.5902109742164612, "learning_rate": 0.0001, "loss": 0.3699, "step": 399 },
    { "epoch": 2.4390243902439024, "grad_norm": 0.6081752777099609, "learning_rate": 0.0001, "loss": 0.3836, "step": 400 },
    { "epoch": 2.4451219512195124, "grad_norm": 0.6146216988563538, "learning_rate": 0.0001, "loss": 0.3785, "step": 401 },
    { "epoch": 2.451219512195122, "grad_norm": 0.6472842693328857, "learning_rate": 0.0001, "loss": 0.373, "step": 402 },
    { "epoch": 2.457317073170732, "grad_norm": 0.60771644115448, "learning_rate": 0.0001, "loss": 0.3685, "step": 403 },
    { "epoch": 2.4634146341463414, "grad_norm": 0.6457931995391846, "learning_rate": 0.0001, "loss": 0.3746, "step": 404 },
    { "epoch": 2.4695121951219514, "grad_norm": 0.5895772576332092, "learning_rate": 0.0001, "loss": 0.3758, "step": 405 },
    { "epoch": 2.475609756097561, "grad_norm": 0.6693524718284607, "learning_rate": 0.0001, "loss": 0.3904, "step": 406 },
    { "epoch": 2.4817073170731705, "grad_norm": 0.6366068124771118, "learning_rate": 0.0001, "loss": 0.3923, "step": 407 },
    { "epoch": 2.4878048780487805, "grad_norm": 0.6241960525512695, "learning_rate": 0.0001, "loss": 0.3559, "step": 408 },
    { "epoch": 2.4939024390243905, "grad_norm": 0.6247851252555847, "learning_rate": 0.0001, "loss": 0.3881, "step": 409 },
    {
|
"epoch": 2.5, |
|
"grad_norm": 0.6421067714691162, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4021, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.5060975609756095, |
|
"grad_norm": 0.7222415804862976, |
|
"learning_rate": 0.0001, |
|
"loss": 0.391, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.5121951219512195, |
|
"grad_norm": 0.6274811625480652, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3817, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.5182926829268295, |
|
"grad_norm": 0.5927621126174927, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3595, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.524390243902439, |
|
"grad_norm": 0.5889265537261963, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3684, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.5304878048780486, |
|
"grad_norm": 0.6477332711219788, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4308, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.5365853658536586, |
|
"grad_norm": 0.6162149906158447, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4087, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.5426829268292686, |
|
"grad_norm": 0.6609845757484436, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4028, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.548780487804878, |
|
"grad_norm": 0.6425780057907104, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3832, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.5548780487804876, |
|
"grad_norm": 0.6117408275604248, |
|
"learning_rate": 0.0001, |
|
"loss": 0.368, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.5609756097560976, |
|
"grad_norm": 0.6596407890319824, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3848, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.567073170731707, |
|
"grad_norm": 0.6080613136291504, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3862, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.573170731707317, |
|
"grad_norm": 0.6160922646522522, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3797, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.5792682926829267, |
|
"grad_norm": 0.6346991658210754, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3702, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.5853658536585367, |
|
"grad_norm": 0.6169600486755371, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3931, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.591463414634146, |
|
"grad_norm": 0.6396271586418152, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4133, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.597560975609756, |
|
"grad_norm": 0.5953004360198975, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3732, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.6036585365853657, |
|
"grad_norm": 0.6704226732254028, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3924, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.6097560975609757, |
|
"grad_norm": 0.6755167245864868, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3891, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.6158536585365852, |
|
"grad_norm": 0.6189351677894592, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4072, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.6219512195121952, |
|
"grad_norm": 0.6409624218940735, |
|
"learning_rate": 0.0001, |
|
"loss": 0.382, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.6280487804878048, |
|
"grad_norm": 0.629356324672699, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3783, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.6341463414634148, |
|
"grad_norm": 0.6259102821350098, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3837, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.6402439024390243, |
|
"grad_norm": 0.6589633822441101, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3958, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.6463414634146343, |
|
"grad_norm": 0.6646971702575684, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3948, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.652439024390244, |
|
"grad_norm": 0.6579565405845642, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3749, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.658536585365854, |
|
"grad_norm": 0.6253348588943481, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3737, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.6646341463414633, |
|
"grad_norm": 0.6139116287231445, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4165, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.6707317073170733, |
|
"grad_norm": 0.6256686449050903, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3838, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.676829268292683, |
|
"grad_norm": 0.6139652729034424, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3751, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.682926829268293, |
|
"grad_norm": 0.6227155923843384, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3752, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.6890243902439024, |
|
"grad_norm": 0.590382993221283, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3896, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.6951219512195124, |
|
"grad_norm": 0.6084756255149841, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3725, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.701219512195122, |
|
"grad_norm": 0.6576021909713745, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4095, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.7073170731707314, |
|
"grad_norm": 0.6265486478805542, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3868, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.7134146341463414, |
|
"grad_norm": 0.651096761226654, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4042, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.7195121951219514, |
|
"grad_norm": 0.6373317241668701, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4209, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.725609756097561, |
|
"grad_norm": 0.6040897965431213, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4084, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.7317073170731705, |
|
"grad_norm": 0.6254827976226807, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3646, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.7378048780487805, |
|
"grad_norm": 0.6285514831542969, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3711, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.7439024390243905, |
|
"grad_norm": 0.675573468208313, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4191, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.6126376390457153, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3782, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.7560975609756095, |
|
"grad_norm": 0.6281729340553284, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3778, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.7621951219512195, |
|
"grad_norm": 0.5908406376838684, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3927, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.7682926829268295, |
|
"grad_norm": 0.6050170660018921, |
|
"learning_rate": 0.0001, |
|
"loss": 0.431, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.774390243902439, |
|
"grad_norm": 0.624231219291687, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3774, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.7804878048780486, |
|
"grad_norm": 0.6320463418960571, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4062, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.7865853658536586, |
|
"grad_norm": 0.6329071521759033, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3962, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.7926829268292686, |
|
"grad_norm": 0.6450055241584778, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4096, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.798780487804878, |
|
"grad_norm": 0.6559624671936035, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4015, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.8048780487804876, |
|
"grad_norm": 0.5944327116012573, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3864, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.8109756097560976, |
|
"grad_norm": 0.6524405479431152, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4245, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.817073170731707, |
|
"grad_norm": 0.6659778952598572, |
|
"learning_rate": 0.0001, |
|
"loss": 0.419, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.823170731707317, |
|
"grad_norm": 0.6520142555236816, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4163, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.8292682926829267, |
|
"grad_norm": 0.6226247549057007, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3898, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.8353658536585367, |
|
"grad_norm": 0.6132051348686218, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3854, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.841463414634146, |
|
"grad_norm": 0.6409340500831604, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3663, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.847560975609756, |
|
"grad_norm": 0.638858437538147, |
|
"learning_rate": 0.0001, |
|
"loss": 0.381, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.8536585365853657, |
|
"grad_norm": 0.6682012677192688, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4027, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.8597560975609757, |
|
"grad_norm": 0.6829751133918762, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4232, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.8658536585365852, |
|
"grad_norm": 0.6196625232696533, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3629, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.8719512195121952, |
|
"grad_norm": 0.6654703617095947, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4071, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.8780487804878048, |
|
"grad_norm": 0.6258810758590698, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3893, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.8841463414634148, |
|
"grad_norm": 0.6281041502952576, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3978, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.8902439024390243, |
|
"grad_norm": 0.6136834621429443, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4258, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.8963414634146343, |
|
"grad_norm": 0.6135198473930359, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3793, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.902439024390244, |
|
"grad_norm": 0.6039949059486389, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4034, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.908536585365854, |
|
"grad_norm": 0.6059561967849731, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3997, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.9146341463414633, |
|
"grad_norm": 0.6142321825027466, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3778, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.9207317073170733, |
|
"grad_norm": 0.6661014556884766, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4241, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.926829268292683, |
|
"grad_norm": 0.6781815886497498, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3969, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.932926829268293, |
|
"grad_norm": 0.6294031739234924, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3768, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.9390243902439024, |
|
"grad_norm": 0.6458147764205933, |
|
"learning_rate": 0.0001, |
|
"loss": 0.393, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.9451219512195124, |
|
"grad_norm": 0.5952702760696411, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3844, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.951219512195122, |
|
"grad_norm": 0.5768480896949768, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3893, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.9573170731707314, |
|
"grad_norm": 0.6429164409637451, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4078, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.9634146341463414, |
|
"grad_norm": 0.5966724753379822, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3689, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.9695121951219514, |
|
"grad_norm": 0.6305826306343079, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3982, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.975609756097561, |
|
"grad_norm": 0.6368945240974426, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4033, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.9817073170731705, |
|
"grad_norm": 0.6413828730583191, |
|
"learning_rate": 0.0001, |
|
"loss": 0.392, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.9878048780487805, |
|
"grad_norm": 0.626516580581665, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3908, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9939024390243905, |
|
"grad_norm": 0.6416463255882263, |
|
"learning_rate": 0.0001, |
|
"loss": 0.397, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6507825255393982, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3931, |
|
"step": 492 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 492, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.340951084078203e+17, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |