{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.44,
  "eval_steps": 500,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 0.8485889434814453,
      "learning_rate": 0.0001999964908278481,
      "loss": 1.2049,
      "step": 5
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.47789862751960754,
      "learning_rate": 0.00019998596355767805,
      "loss": 0.9333,
      "step": 10
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.017558217048645,
      "learning_rate": 0.00019996841892833,
      "loss": 0.8671,
      "step": 15
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.6610977053642273,
      "learning_rate": 0.00019994385817114646,
      "loss": 0.7979,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.6075429320335388,
      "learning_rate": 0.00019991228300988585,
      "loss": 0.7662,
      "step": 25
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.6595763564109802,
      "learning_rate": 0.00019987369566060176,
      "loss": 0.7929,
      "step": 30
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.6968618035316467,
      "learning_rate": 0.00019982809883148722,
      "loss": 0.7683,
      "step": 35
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.4889592230319977,
      "learning_rate": 0.00019977549572268468,
      "loss": 0.8667,
      "step": 40
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.6651108264923096,
      "learning_rate": 0.0001997158900260614,
      "loss": 0.8446,
      "step": 45
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.5898510217666626,
      "learning_rate": 0.00019964928592495045,
      "loss": 0.9051,
      "step": 50
    },
    {
      "epoch": 0.088,
      "grad_norm": 0.4398016035556793,
      "learning_rate": 0.00019957568809385694,
      "loss": 0.7235,
      "step": 55
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.6901968121528625,
      "learning_rate": 0.00019949510169813003,
      "loss": 0.8169,
      "step": 60
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.6267213225364685,
      "learning_rate": 0.00019940753239360047,
      "loss": 0.8266,
      "step": 65
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.48524895310401917,
      "learning_rate": 0.00019931298632618356,
      "loss": 0.758,
      "step": 70
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.5294132232666016,
      "learning_rate": 0.0001992114701314478,
      "loss": 0.7759,
      "step": 75
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.48957982659339905,
      "learning_rate": 0.0001991029909341493,
      "loss": 0.7797,
      "step": 80
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.645412802696228,
      "learning_rate": 0.00019898755634773158,
      "loss": 0.7437,
      "step": 85
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.43297675251960754,
      "learning_rate": 0.0001988651744737914,
      "loss": 0.8043,
      "step": 90
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.5513920783996582,
      "learning_rate": 0.00019873585390151003,
      "loss": 0.7701,
      "step": 95
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8462435007095337,
      "learning_rate": 0.0001985996037070505,
      "loss": 0.709,
      "step": 100
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.6892585158348083,
      "learning_rate": 0.00019845643345292054,
      "loss": 0.7377,
      "step": 105
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.4617864191532135,
      "learning_rate": 0.00019830635318730154,
      "loss": 0.8352,
      "step": 110
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.6300354599952698,
      "learning_rate": 0.0001981493734433433,
      "loss": 0.7738,
      "step": 115
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.8086859583854675,
      "learning_rate": 0.0001979855052384247,
      "loss": 0.8067,
      "step": 120
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.6272985935211182,
      "learning_rate": 0.00019781476007338058,
      "loss": 0.7456,
      "step": 125
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.44750839471817017,
      "learning_rate": 0.00019763714993169452,
      "loss": 0.758,
      "step": 130
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.5053977370262146,
      "learning_rate": 0.00019745268727865774,
      "loss": 0.7895,
      "step": 135
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.41920769214630127,
      "learning_rate": 0.00019726138506049438,
      "loss": 0.7302,
      "step": 140
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.38280290365219116,
      "learning_rate": 0.00019706325670345275,
      "loss": 0.8152,
      "step": 145
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.554710865020752,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.8461,
      "step": 150
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.5612509250640869,
      "learning_rate": 0.00019664657767216176,
      "loss": 0.7787,
      "step": 155
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.610614538192749,
      "learning_rate": 0.00019642805624188147,
      "loss": 0.7574,
      "step": 160
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.679517924785614,
      "learning_rate": 0.0001962027671586086,
      "loss": 0.8487,
      "step": 165
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.6685434579849243,
      "learning_rate": 0.00019597072623390668,
      "loss": 0.6611,
      "step": 170
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.480293869972229,
      "learning_rate": 0.00019573194975320673,
      "loss": 0.7802,
      "step": 175
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.7727369070053101,
      "learning_rate": 0.00019548645447466431,
      "loss": 0.6727,
      "step": 180
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.6371043920516968,
      "learning_rate": 0.00019523425762798329,
      "loss": 0.7502,
      "step": 185
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.6399966478347778,
      "learning_rate": 0.00019497537691320668,
      "loss": 0.8401,
      "step": 190
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.7263137698173523,
      "learning_rate": 0.00019470983049947444,
      "loss": 0.7494,
      "step": 195
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.402416467666626,
      "learning_rate": 0.00019443763702374812,
      "loss": 0.7842,
      "step": 200
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.6639626026153564,
      "learning_rate": 0.00019415881558950302,
      "loss": 0.8082,
      "step": 205
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.5801042914390564,
      "learning_rate": 0.00019387338576538744,
      "loss": 0.7883,
      "step": 210
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.5533607006072998,
      "learning_rate": 0.00019358136758384912,
      "loss": 0.7356,
      "step": 215
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.6019654273986816,
      "learning_rate": 0.00019328278153972947,
      "loss": 0.7891,
      "step": 220
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.5344104170799255,
      "learning_rate": 0.00019297764858882514,
      "loss": 0.7671,
      "step": 225
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.5494843125343323,
      "learning_rate": 0.0001926659901464172,
      "loss": 0.6608,
      "step": 230
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.465420126914978,
      "learning_rate": 0.00019234782808576824,
      "loss": 0.647,
      "step": 235
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.5202775001525879,
      "learning_rate": 0.00019202318473658705,
      "loss": 0.729,
      "step": 240
    },
    {
      "epoch": 0.392,
      "grad_norm": 0.5757818222045898,
      "learning_rate": 0.00019169208288346166,
      "loss": 0.6713,
      "step": 245
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.46555572748184204,
      "learning_rate": 0.0001913545457642601,
      "loss": 0.7049,
      "step": 250
    },
    {
      "epoch": 0.408,
      "grad_norm": 0.5101790428161621,
      "learning_rate": 0.00019101059706849957,
      "loss": 0.7419,
      "step": 255
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.6083744764328003,
      "learning_rate": 0.00019066026093568378,
      "loss": 0.7148,
      "step": 260
    },
    {
      "epoch": 0.424,
      "grad_norm": 0.4719640612602234,
      "learning_rate": 0.00019030356195360874,
      "loss": 0.7493,
      "step": 265
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.7365225553512573,
      "learning_rate": 0.0001899405251566371,
      "loss": 0.7652,
      "step": 270
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.4452705383300781,
      "learning_rate": 0.0001895711760239413,
      "loss": 0.7438,
      "step": 275
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.6071786284446716,
      "learning_rate": 0.0001891955404777151,
      "loss": 0.7683,
      "step": 280
    },
    {
      "epoch": 0.456,
      "grad_norm": 0.5774498581886292,
      "learning_rate": 0.00018881364488135448,
      "loss": 0.8115,
      "step": 285
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.6134682893753052,
      "learning_rate": 0.00018842551603760724,
      "loss": 0.8335,
      "step": 290
    },
    {
      "epoch": 0.472,
      "grad_norm": 0.4869893193244934,
      "learning_rate": 0.00018803118118669202,
      "loss": 0.6933,
      "step": 295
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.6457111239433289,
      "learning_rate": 0.00018763066800438636,
      "loss": 0.7515,
      "step": 300
    },
    {
      "epoch": 0.488,
      "grad_norm": 0.59674471616745,
      "learning_rate": 0.0001872240046000844,
      "loss": 0.6931,
      "step": 305
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.44608160853385925,
      "learning_rate": 0.00018681121951482393,
      "loss": 0.782,
      "step": 310
    },
    {
      "epoch": 0.504,
      "grad_norm": 0.5934664607048035,
      "learning_rate": 0.00018639234171928353,
      "loss": 0.7361,
      "step": 315
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.49716323614120483,
      "learning_rate": 0.0001859674006117491,
      "loss": 0.7443,
      "step": 320
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.47995495796203613,
      "learning_rate": 0.00018553642601605068,
      "loss": 0.7221,
      "step": 325
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.5177399516105652,
      "learning_rate": 0.00018509944817946922,
      "loss": 0.7622,
      "step": 330
    },
    {
      "epoch": 0.536,
      "grad_norm": 0.6638798713684082,
      "learning_rate": 0.0001846564977706138,
      "loss": 0.8556,
      "step": 335
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.5056771636009216,
      "learning_rate": 0.00018420760587726923,
      "loss": 0.7814,
      "step": 340
    },
    {
      "epoch": 0.552,
      "grad_norm": 0.44543707370758057,
      "learning_rate": 0.0001837528040042142,
      "loss": 0.722,
      "step": 345
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6765120625495911,
      "learning_rate": 0.00018329212407100994,
      "loss": 0.7903,
      "step": 350
    },
    {
      "epoch": 0.568,
      "grad_norm": 0.49232372641563416,
      "learning_rate": 0.00018282559840976042,
      "loss": 0.6996,
      "step": 355
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.47392791509628296,
      "learning_rate": 0.00018235325976284275,
      "loss": 0.773,
      "step": 360
    },
    {
      "epoch": 0.584,
      "grad_norm": 0.5056615471839905,
      "learning_rate": 0.00018187514128060946,
      "loss": 0.728,
      "step": 365
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.5857616662979126,
      "learning_rate": 0.00018139127651906184,
      "loss": 0.7659,
      "step": 370
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5966864228248596,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.7039,
      "step": 375
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.4524347484111786,
      "learning_rate": 0.00018040644439611348,
      "loss": 0.7125,
      "step": 380
    },
    {
      "epoch": 0.616,
      "grad_norm": 0.5570976138114929,
      "learning_rate": 0.00017990554615362198,
      "loss": 0.698,
      "step": 385
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.6045777201652527,
      "learning_rate": 0.00017939903986478355,
      "loss": 0.8255,
      "step": 390
    },
    {
      "epoch": 0.632,
      "grad_norm": 0.6149687767028809,
      "learning_rate": 0.00017888696107795342,
      "loss": 0.6616,
      "step": 395
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4873579144477844,
      "learning_rate": 0.000178369345732584,
      "loss": 0.7452,
      "step": 400
    },
    {
      "epoch": 0.648,
      "grad_norm": 0.5569061636924744,
      "learning_rate": 0.00017784623015670238,
      "loss": 0.7652,
      "step": 405
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.5825181603431702,
      "learning_rate": 0.00017731765106436073,
      "loss": 0.7793,
      "step": 410
    },
    {
      "epoch": 0.664,
      "grad_norm": 0.4047383666038513,
      "learning_rate": 0.00017678364555305978,
      "loss": 0.6875,
      "step": 415
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.5080836415290833,
      "learning_rate": 0.0001762442511011448,
      "loss": 0.7465,
      "step": 420
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.5825940370559692,
      "learning_rate": 0.00017569950556517566,
      "loss": 0.7205,
      "step": 425
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.476992666721344,
      "learning_rate": 0.00017514944717726962,
      "loss": 0.6589,
      "step": 430
    },
    {
      "epoch": 0.696,
      "grad_norm": 0.7424727082252502,
      "learning_rate": 0.00017459411454241822,
      "loss": 0.7035,
      "step": 435
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.6544787287712097,
      "learning_rate": 0.00017403354663577783,
      "loss": 0.787,
      "step": 440
    },
    {
      "epoch": 0.712,
      "grad_norm": 0.49425187706947327,
      "learning_rate": 0.00017346778279993415,
      "loss": 0.7515,
      "step": 445
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.5473236441612244,
      "learning_rate": 0.00017289686274214118,
      "loss": 0.7199,
      "step": 450
    },
    {
      "epoch": 0.728,
      "grad_norm": 0.6773544549942017,
      "learning_rate": 0.00017232082653153422,
      "loss": 0.8037,
      "step": 455
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.6355096101760864,
      "learning_rate": 0.00017173971459631787,
      "loss": 0.7502,
      "step": 460
    },
    {
      "epoch": 0.744,
      "grad_norm": 0.47867000102996826,
      "learning_rate": 0.00017115356772092857,
      "loss": 0.7446,
      "step": 465
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.5135357975959778,
      "learning_rate": 0.0001705624270431721,
      "loss": 0.6507,
      "step": 470
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.48866042494773865,
      "learning_rate": 0.00016996633405133655,
      "loss": 0.7164,
      "step": 475
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.5892354249954224,
      "learning_rate": 0.0001693653305812805,
      "loss": 0.7621,
      "step": 480
    },
    {
      "epoch": 0.776,
      "grad_norm": 0.6633970141410828,
      "learning_rate": 0.00016875945881349676,
      "loss": 0.7623,
      "step": 485
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.6444060802459717,
      "learning_rate": 0.000168148761270152,
      "loss": 0.6606,
      "step": 490
    },
    {
      "epoch": 0.792,
      "grad_norm": 0.7012648582458496,
      "learning_rate": 0.00016753328081210245,
      "loss": 0.6941,
      "step": 495
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7064160704612732,
      "learning_rate": 0.00016691306063588583,
      "loss": 0.6841,
      "step": 500
    },
    {
      "epoch": 0.808,
      "grad_norm": 0.7241398096084595,
      "learning_rate": 0.00016628814427068953,
      "loss": 0.6996,
      "step": 505
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.7807374596595764,
      "learning_rate": 0.00016565857557529566,
      "loss": 0.7542,
      "step": 510
    },
    {
      "epoch": 0.824,
      "grad_norm": 0.763768196105957,
      "learning_rate": 0.00016502439873500289,
      "loss": 0.7175,
      "step": 515
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.6105090379714966,
      "learning_rate": 0.0001643856582585254,
      "loss": 0.7565,
      "step": 520
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.5686540603637695,
      "learning_rate": 0.000163742398974869,
      "loss": 0.7339,
      "step": 525
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.5341500043869019,
      "learning_rate": 0.00016309466603018496,
      "loss": 0.569,
      "step": 530
    },
    {
      "epoch": 0.856,
      "grad_norm": 0.7274748682975769,
      "learning_rate": 0.00016244250488460158,
      "loss": 0.7556,
      "step": 535
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.7321165204048157,
      "learning_rate": 0.00016178596130903344,
      "loss": 0.7084,
      "step": 540
    },
    {
      "epoch": 0.872,
      "grad_norm": 0.5086159110069275,
      "learning_rate": 0.00016112508138196917,
      "loss": 0.6935,
      "step": 545
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.4714389443397522,
      "learning_rate": 0.0001604599114862375,
      "loss": 0.7076,
      "step": 550
    },
    {
      "epoch": 0.888,
      "grad_norm": 0.5031452178955078,
      "learning_rate": 0.0001597904983057519,
      "loss": 0.7151,
      "step": 555
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.7745943665504456,
      "learning_rate": 0.0001591168888222342,
      "loss": 0.7001,
      "step": 560
    },
    {
      "epoch": 0.904,
      "grad_norm": 0.6076303124427795,
      "learning_rate": 0.00015843913031191723,
      "loss": 0.7285,
      "step": 565
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.7456529140472412,
      "learning_rate": 0.00015775727034222675,
      "loss": 0.8041,
      "step": 570
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.5760998725891113,
      "learning_rate": 0.0001570713567684432,
      "loss": 0.7353,
      "step": 575
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.7057327032089233,
      "learning_rate": 0.00015638143773034267,
      "loss": 0.7792,
      "step": 580
    },
    {
      "epoch": 0.936,
      "grad_norm": 0.7615967392921448,
      "learning_rate": 0.00015568756164881882,
      "loss": 1.0121,
      "step": 585
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.6304950714111328,
      "learning_rate": 0.000154989777222484,
      "loss": 0.7727,
      "step": 590
    },
    {
      "epoch": 0.952,
      "grad_norm": 0.6852543950080872,
      "learning_rate": 0.00015428813342425177,
      "loss": 0.741,
      "step": 595
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.6379660964012146,
      "learning_rate": 0.00015358267949789966,
      "loss": 0.6919,
      "step": 600
    },
    {
      "epoch": 0.968,
      "grad_norm": 0.5846463441848755,
      "learning_rate": 0.00015287346495461315,
      "loss": 0.7163,
      "step": 605
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.5999557971954346,
      "learning_rate": 0.0001521605395695108,
      "loss": 0.8152,
      "step": 610
    },
    {
      "epoch": 0.984,
      "grad_norm": 0.5806307196617126,
      "learning_rate": 0.00015144395337815064,
      "loss": 0.6709,
      "step": 615
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.6559942960739136,
      "learning_rate": 0.00015072375667301893,
      "loss": 0.6527,
      "step": 620
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6287715435028076,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.8194,
      "step": 625
    },
    {
      "epoch": 1.008,
      "grad_norm": 0.616222620010376,
      "learning_rate": 0.00014927273415482915,
      "loss": 0.6627,
      "step": 630
    },
    {
      "epoch": 1.016,
      "grad_norm": 0.4750412106513977,
      "learning_rate": 0.0001485420101795274,
      "loss": 0.6366,
      "step": 635
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.5122964978218079,
      "learning_rate": 0.00014780787935881923,
      "loss": 0.6717,
      "step": 640
    },
    {
      "epoch": 1.032,
      "grad_norm": 0.7382633090019226,
      "learning_rate": 0.0001470703932165333,
      "loss": 0.6483,
      "step": 645
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.6540554761886597,
      "learning_rate": 0.00014632960351198618,
      "loss": 0.6151,
      "step": 650
    },
    {
      "epoch": 1.048,
      "grad_norm": 0.4776591956615448,
      "learning_rate": 0.00014558556223635003,
      "loss": 0.6707,
      "step": 655
    },
    {
      "epoch": 1.056,
      "grad_norm": 0.8012662529945374,
      "learning_rate": 0.00014483832160900326,
      "loss": 0.6125,
      "step": 660
    },
    {
      "epoch": 1.064,
      "grad_norm": 0.6735953092575073,
      "learning_rate": 0.00014408793407386588,
      "loss": 0.6206,
      "step": 665
    },
    {
      "epoch": 1.072,
      "grad_norm": 0.5640230774879456,
      "learning_rate": 0.00014333445229571873,
      "loss": 0.6161,
      "step": 670
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.5928654074668884,
      "learning_rate": 0.00014257792915650728,
      "loss": 0.6583,
      "step": 675
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.7347397208213806,
      "learning_rate": 0.00014181841775163013,
      "loss": 0.6222,
      "step": 680
    },
    {
      "epoch": 1.096,
      "grad_norm": 0.593773365020752,
      "learning_rate": 0.0001410559713862128,
      "loss": 0.716,
      "step": 685
    },
    {
      "epoch": 1.104,
      "grad_norm": 0.6244611144065857,
      "learning_rate": 0.00014029064357136628,
      "loss": 0.6198,
      "step": 690
    },
    {
      "epoch": 1.112,
      "grad_norm": 0.5083370804786682,
      "learning_rate": 0.00013952248802043165,
      "loss": 0.6389,
      "step": 695
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.5241413116455078,
      "learning_rate": 0.0001387515586452103,
      "loss": 0.6842,
      "step": 700
    },
    {
      "epoch": 1.1280000000000001,
      "grad_norm": 0.524029016494751,
      "learning_rate": 0.00013797790955218014,
      "loss": 0.6071,
      "step": 705
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 0.5097878575325012,
      "learning_rate": 0.00013720159503869815,
      "loss": 0.5915,
      "step": 710
    },
    {
      "epoch": 1.144,
      "grad_norm": 0.5782963037490845,
      "learning_rate": 0.00013642266958918984,
      "loss": 0.6794,
      "step": 715
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.6088266372680664,
      "learning_rate": 0.00013564118787132506,
      "loss": 0.6773,
      "step": 720
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.7768995761871338,
      "learning_rate": 0.00013485720473218154,
      "loss": 0.668,
      "step": 725
    },
    {
      "epoch": 1.168,
      "grad_norm": 0.6645551919937134,
      "learning_rate": 0.0001340707751943952,
      "loss": 0.6997,
      "step": 730
    },
    {
      "epoch": 1.176,
      "grad_norm": 0.9228842258453369,
      "learning_rate": 0.00013328195445229868,
      "loss": 0.831,
      "step": 735
    },
    {
      "epoch": 1.184,
      "grad_norm": 0.7556049823760986,
      "learning_rate": 0.00013249079786804765,
      "loss": 0.6378,
      "step": 740
    },
    {
      "epoch": 1.192,
      "grad_norm": 0.832775354385376,
      "learning_rate": 0.0001316973609677352,
      "loss": 0.6547,
      "step": 745
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.7329304814338684,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.5808,
      "step": 750
    },
    {
      "epoch": 1.208,
      "grad_norm": 0.7193475961685181,
      "learning_rate": 0.00013010386911959206,
      "loss": 0.5582,
      "step": 755
    },
    {
      "epoch": 1.216,
      "grad_norm": 0.6274734735488892,
      "learning_rate": 0.00012930392600850573,
      "loss": 0.5801,
      "step": 760
    },
    {
      "epoch": 1.224,
      "grad_norm": 0.6485865712165833,
      "learning_rate": 0.0001285019262469976,
      "loss": 0.65,
      "step": 765
    },
    {
      "epoch": 1.232,
      "grad_norm": 0.7164427042007446,
      "learning_rate": 0.00012769792612217224,
      "loss": 0.6627,
      "step": 770
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.600775957107544,
      "learning_rate": 0.00012689198206152657,
      "loss": 0.5603,
      "step": 775
    },
    {
      "epoch": 1.248,
      "grad_norm": 0.8377975225448608,
      "learning_rate": 0.00012608415062898972,
      "loss": 0.6525,
      "step": 780
    },
    {
      "epoch": 1.256,
      "grad_norm": 0.8069924116134644,
      "learning_rate": 0.00012527448852095295,
      "loss": 0.6731,
      "step": 785
    },
    {
      "epoch": 1.264,
      "grad_norm": 0.6501213908195496,
      "learning_rate": 0.00012446305256229073,
      "loss": 0.6255,
      "step": 790
    },
    {
      "epoch": 1.272,
      "grad_norm": 0.62812340259552,
      "learning_rate": 0.00012364989970237248,
      "loss": 0.6585,
      "step": 795
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.5702307820320129,
      "learning_rate": 0.00012283508701106557,
      "loss": 0.5996,
      "step": 800
    },
    {
      "epoch": 1.288,
      "grad_norm": 0.6311281323432922,
      "learning_rate": 0.00012201867167473015,
      "loss": 0.6355,
      "step": 805
    },
    {
      "epoch": 1.296,
      "grad_norm": 0.5885419249534607,
      "learning_rate": 0.00012120071099220549,
      "loss": 0.6615,
      "step": 810
    },
    {
      "epoch": 1.304,
      "grad_norm": 0.5239307284355164,
      "learning_rate": 0.0001203812623707885,
      "loss": 0.6096,
      "step": 815
    },
    {
      "epoch": 1.312,
      "grad_norm": 0.6101869940757751,
      "learning_rate": 0.00011956038332220483,
      "loss": 0.5984,
      "step": 820
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.4395413100719452,
      "learning_rate": 0.00011873813145857249,
      "loss": 0.5569,
      "step": 825
    },
    {
      "epoch": 1.328,
      "grad_norm": 0.8984820246696472,
      "learning_rate": 0.00011791456448835825,
      "loss": 0.7088,
      "step": 830
    },
    {
      "epoch": 1.336,
      "grad_norm": 0.7709664106369019,
      "learning_rate": 0.00011708974021232769,
      "loss": 0.6731,
      "step": 835
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.6782217025756836,
      "learning_rate": 0.00011626371651948838,
      "loss": 0.6188,
      "step": 840
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 0.6427358984947205,
      "learning_rate": 0.00011543655138302714,
      "loss": 0.7004,
      "step": 845
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 0.5902594923973083,
      "learning_rate": 0.00011460830285624118,
      "loss": 0.5884,
      "step": 850
    },
    {
      "epoch": 1.3679999999999999,
      "grad_norm": 0.5935835838317871,
      "learning_rate": 0.0001137790290684638,
      "loss": 0.5739,
      "step": 855
    },
    {
      "epoch": 1.376,
      "grad_norm": 0.6752728223800659,
      "learning_rate": 0.00011294878822098469,
      "loss": 0.6435,
      "step": 860
    },
    {
      "epoch": 1.384,
      "grad_norm": 0.7927135825157166,
      "learning_rate": 0.00011211763858296507,
      "loss": 0.6897,
      "step": 865
    },
    {
      "epoch": 1.392,
      "grad_norm": 0.714499294757843,
      "learning_rate": 0.00011128563848734816,
      "loss": 0.6641,
      "step": 870
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.7086356282234192,
      "learning_rate": 0.00011045284632676536,
      "loss": 0.6273,
      "step": 875
    },
    {
      "epoch": 1.408,
      "grad_norm": 0.6125518679618835,
      "learning_rate": 0.00010961932054943778,
      "loss": 0.6437,
      "step": 880
    },
    {
      "epoch": 1.416,
      "grad_norm": 0.5635287165641785,
      "learning_rate": 0.00010878511965507434,
      "loss": 0.6345,
      "step": 885
    },
    {
      "epoch": 1.424,
      "grad_norm": 0.47936007380485535,
      "learning_rate": 0.00010795030219076599,
      "loss": 0.5913,
      "step": 890
    },
    {
      "epoch": 1.432,
      "grad_norm": 0.7142558097839355,
      "learning_rate": 0.00010711492674687671,
      "loss": 0.6482,
      "step": 895
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.5252729058265686,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.6165,
      "step": 900
    }
  ],
  "logging_steps": 5,
  "max_steps": 1875,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.645753588278886e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}