Training in progress, step 3800, checkpoint
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:212ab0d0fa8e6b7844ee509ef0c2f36fa6a5b98f4314314e8ae31a26f8061e14
 size 1370666272
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2d52db6458b426b6bd77979be5e39e5045f2ca07bf67445e3563381fe27948ec
 size 697294462
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0763a0ee6c789e6c090cd10cd205450c6c3acb74dd37b4d57a02c7f9152991aa
 size 1064
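The three pointer diffs above only change the `oid` line; the payloads themselves live in LFS storage. A quick way to check that a downloaded blob matches its pointer is to recompute the digest and size. A minimal sketch, assuming the files were fetched into a local `last-checkpoint/` directory (the path is an assumption):

```python
import hashlib
from pathlib import Path

def lfs_digest(path: str) -> tuple[str, int]:
    """Return the (sha256 hex digest, byte size) of a downloaded LFS blob."""
    digest = hashlib.sha256()
    p = Path(path)
    with p.open("rb") as f:
        # Stream in 1 MiB chunks so large checkpoints don't need to fit in RAM.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest(), p.stat().st_size

# Compare against the scheduler.pt pointer above (local path is hypothetical).
oid, size = lfs_digest("last-checkpoint/scheduler.pt")
assert oid == "0763a0ee6c789e6c090cd10cd205450c6c3acb74dd37b4d57a02c7f9152991aa"
assert size == 1064
```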
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.8819774863641638,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 3800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -26257,6 +26257,356 @@
       "learning_rate": 0.00018545017643798129,
       "loss": 0.8464,
       "step": 3750
+    },
+    {
+      "epoch": 0.8706046187768365,
+      "grad_norm": 0.4808577299118042,
+      "learning_rate": 0.00018544259941234085,
+      "loss": 0.8261,
+      "step": 3751
+    },
+    {
+      "epoch": 0.8708367181153533,
+      "grad_norm": 0.4224403202533722,
+      "learning_rate": 0.00018543502056916536,
+      "loss": 0.8425,
+      "step": 3752
+    },
+    {
+      "epoch": 0.8710688174538702,
+      "grad_norm": 0.5509401559829712,
+      "learning_rate": 0.000185427439908616,
+      "loss": 0.7942,
+      "step": 3753
+    },
+    {
+      "epoch": 0.8713009167923872,
+      "grad_norm": 0.47354769706726074,
+      "learning_rate": 0.00018541985743085405,
+      "loss": 0.8791,
+      "step": 3754
+    },
+    {
+      "epoch": 0.8715330161309041,
+      "grad_norm": 0.48359885811805725,
+      "learning_rate": 0.00018541227313604078,
+      "loss": 0.8611,
+      "step": 3755
+    },
+    {
+      "epoch": 0.8717651154694209,
+      "grad_norm": 0.48973050713539124,
+      "learning_rate": 0.00018540468702433758,
+      "loss": 0.8084,
+      "step": 3756
+    },
+    {
+      "epoch": 0.8719972148079378,
+      "grad_norm": 0.4059913158416748,
+      "learning_rate": 0.00018539709909590576,
+      "loss": 0.8935,
+      "step": 3757
+    },
+    {
+      "epoch": 0.8722293141464547,
+      "grad_norm": 0.44113290309906006,
+      "learning_rate": 0.00018538950935090677,
+      "loss": 0.8441,
+      "step": 3758
+    },
+    {
+      "epoch": 0.8724614134849715,
+      "grad_norm": 0.4337928891181946,
+      "learning_rate": 0.00018538191778950204,
+      "loss": 0.8548,
+      "step": 3759
+    },
+    {
+      "epoch": 0.8726935128234885,
+      "grad_norm": 0.41244831681251526,
+      "learning_rate": 0.00018537432441185304,
+      "loss": 0.8356,
+      "step": 3760
+    },
+    {
+      "epoch": 0.8729256121620054,
+      "grad_norm": 0.44714102149009705,
+      "learning_rate": 0.00018536672921812134,
+      "loss": 0.8085,
+      "step": 3761
+    },
+    {
+      "epoch": 0.8731577115005222,
+      "grad_norm": 0.43154868483543396,
+      "learning_rate": 0.00018535913220846847,
+      "loss": 0.7995,
+      "step": 3762
+    },
+    {
+      "epoch": 0.8733898108390391,
+      "grad_norm": 0.4167262017726898,
+      "learning_rate": 0.00018535153338305603,
+      "loss": 0.8501,
+      "step": 3763
+    },
+    {
+      "epoch": 0.873621910177556,
+      "grad_norm": 0.398404598236084,
+      "learning_rate": 0.00018534393274204574,
+      "loss": 0.8162,
+      "step": 3764
+    },
+    {
+      "epoch": 0.8738540095160728,
+      "grad_norm": 0.3954335153102875,
+      "learning_rate": 0.00018533633028559917,
+      "loss": 0.8697,
+      "step": 3765
+    },
+    {
+      "epoch": 0.8740861088545898,
+      "grad_norm": 0.4284425377845764,
+      "learning_rate": 0.00018532872601387807,
+      "loss": 0.8538,
+      "step": 3766
+    },
+    {
+      "epoch": 0.8743182081931067,
+      "grad_norm": 0.3925730586051941,
+      "learning_rate": 0.00018532111992704424,
+      "loss": 0.8329,
+      "step": 3767
+    },
+    {
+      "epoch": 0.8745503075316235,
+      "grad_norm": 0.42586302757263184,
+      "learning_rate": 0.00018531351202525945,
+      "loss": 0.8452,
+      "step": 3768
+    },
+    {
+      "epoch": 0.8747824068701404,
+      "grad_norm": 0.41396793723106384,
+      "learning_rate": 0.00018530590230868556,
+      "loss": 0.84,
+      "step": 3769
+    },
+    {
+      "epoch": 0.8750145062086573,
+      "grad_norm": 0.421150267124176,
+      "learning_rate": 0.00018529829077748442,
+      "loss": 0.8413,
+      "step": 3770
+    },
+    {
+      "epoch": 0.8752466055471742,
+      "grad_norm": 0.4445338249206543,
+      "learning_rate": 0.00018529067743181793,
+      "loss": 0.8299,
+      "step": 3771
+    },
+    {
+      "epoch": 0.8754787048856911,
+      "grad_norm": 0.6780601143836975,
+      "learning_rate": 0.00018528306227184806,
+      "loss": 0.8644,
+      "step": 3772
+    },
+    {
+      "epoch": 0.875710804224208,
+      "grad_norm": 0.4485917091369629,
+      "learning_rate": 0.0001852754452977368,
+      "loss": 0.8571,
+      "step": 3773
+    },
+    {
+      "epoch": 0.8759429035627249,
+      "grad_norm": 0.43445441126823425,
+      "learning_rate": 0.00018526782650964618,
+      "loss": 0.8499,
+      "step": 3774
+    },
+    {
+      "epoch": 0.8761750029012417,
+      "grad_norm": 0.43933218717575073,
+      "learning_rate": 0.00018526020590773823,
+      "loss": 0.8504,
+      "step": 3775
+    },
+    {
+      "epoch": 0.8764071022397586,
+      "grad_norm": 0.4182621240615845,
+      "learning_rate": 0.0001852525834921751,
+      "loss": 0.8153,
+      "step": 3776
+    },
+    {
+      "epoch": 0.8766392015782755,
+      "grad_norm": 0.4337303638458252,
+      "learning_rate": 0.00018524495926311893,
+      "loss": 0.8355,
+      "step": 3777
+    },
+    {
+      "epoch": 0.8768713009167924,
+      "grad_norm": 0.45787665247917175,
+      "learning_rate": 0.00018523733322073192,
+      "loss": 0.8477,
+      "step": 3778
+    },
+    {
+      "epoch": 0.8771034002553093,
+      "grad_norm": 0.44240859150886536,
+      "learning_rate": 0.00018522970536517623,
+      "loss": 0.8366,
+      "step": 3779
+    },
+    {
+      "epoch": 0.8773354995938262,
+      "grad_norm": 0.5221067070960999,
+      "learning_rate": 0.00018522207569661415,
+      "loss": 0.7688,
+      "step": 3780
+    },
+    {
+      "epoch": 0.877567598932343,
+      "grad_norm": 0.41580215096473694,
+      "learning_rate": 0.000185214444215208,
+      "loss": 0.8615,
+      "step": 3781
+    },
+    {
+      "epoch": 0.8777996982708599,
+      "grad_norm": 0.43873414397239685,
+      "learning_rate": 0.00018520681092112006,
+      "loss": 0.8547,
+      "step": 3782
+    },
+    {
+      "epoch": 0.8780317976093768,
+      "grad_norm": 0.37557294964790344,
+      "learning_rate": 0.0001851991758145128,
+      "loss": 0.8184,
+      "step": 3783
+    },
+    {
+      "epoch": 0.8782638969478938,
+      "grad_norm": 0.4067942500114441,
+      "learning_rate": 0.00018519153889554854,
+      "loss": 0.8072,
+      "step": 3784
+    },
+    {
+      "epoch": 0.8784959962864106,
+      "grad_norm": 0.38345038890838623,
+      "learning_rate": 0.0001851839001643898,
+      "loss": 0.8175,
+      "step": 3785
+    },
+    {
+      "epoch": 0.8787280956249275,
+      "grad_norm": 0.4141753613948822,
+      "learning_rate": 0.00018517625962119905,
+      "loss": 0.7874,
+      "step": 3786
+    },
+    {
+      "epoch": 0.8789601949634444,
+      "grad_norm": 0.410163015127182,
+      "learning_rate": 0.00018516861726613877,
+      "loss": 0.8337,
+      "step": 3787
+    },
+    {
+      "epoch": 0.8791922943019612,
+      "grad_norm": 0.4118984639644623,
+      "learning_rate": 0.00018516097309937156,
+      "loss": 0.8398,
+      "step": 3788
+    },
+    {
+      "epoch": 0.8794243936404781,
+      "grad_norm": 0.42656370997428894,
+      "learning_rate": 0.0001851533271210601,
+      "loss": 0.8233,
+      "step": 3789
+    },
+    {
+      "epoch": 0.8796564929789951,
+      "grad_norm": 0.4185142517089844,
+      "learning_rate": 0.00018514567933136693,
+      "loss": 0.8372,
+      "step": 3790
+    },
+    {
+      "epoch": 0.8798885923175119,
+      "grad_norm": 0.4009787440299988,
+      "learning_rate": 0.0001851380297304548,
+      "loss": 0.8194,
+      "step": 3791
+    },
+    {
+      "epoch": 0.8801206916560288,
+      "grad_norm": 0.43166425824165344,
+      "learning_rate": 0.00018513037831848639,
+      "loss": 0.8333,
+      "step": 3792
+    },
+    {
+      "epoch": 0.8803527909945457,
+      "grad_norm": 0.4240741431713104,
+      "learning_rate": 0.00018512272509562446,
+      "loss": 0.8087,
+      "step": 3793
+    },
+    {
+      "epoch": 0.8805848903330625,
+      "grad_norm": 0.4606071710586548,
+      "learning_rate": 0.00018511507006203188,
+      "loss": 0.7864,
+      "step": 3794
+    },
+    {
+      "epoch": 0.8808169896715794,
+      "grad_norm": 0.4614354372024536,
+      "learning_rate": 0.0001851074132178714,
+      "loss": 0.8406,
+      "step": 3795
+    },
+    {
+      "epoch": 0.8810490890100964,
+      "grad_norm": 0.46533650159835815,
+      "learning_rate": 0.00018509975456330592,
+      "loss": 0.8695,
+      "step": 3796
+    },
+    {
+      "epoch": 0.8812811883486132,
+      "grad_norm": 0.46137019991874695,
+      "learning_rate": 0.00018509209409849843,
+      "loss": 0.8355,
+      "step": 3797
+    },
+    {
+      "epoch": 0.8815132876871301,
+      "grad_norm": 0.4653560221195221,
+      "learning_rate": 0.00018508443182361175,
+      "loss": 0.8749,
+      "step": 3798
+    },
+    {
+      "epoch": 0.881745387025647,
+      "grad_norm": 0.39684346318244934,
+      "learning_rate": 0.00018507676773880897,
+      "loss": 0.8362,
+      "step": 3799
+    },
+    {
+      "epoch": 0.8819774863641638,
+      "grad_norm": 0.4175236225128174,
+      "learning_rate": 0.0001850691018442531,
+      "loss": 0.8549,
+      "step": 3800
     }
   ],
   "logging_steps": 1,
@@ -26276,7 +26626,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.
+  "total_flos": 1.6867315219955712e+18,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
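The updated `trainer_state.json` records `global_step` 3800 at epoch ≈ 0.88, so the run can be resumed from this directory rather than restarted. A minimal sketch using the `transformers` `Trainer`; the base model id, dataset, and output paths below are placeholders that would come from the original launch script:

```python
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Placeholders: the real base model and dataset come from the original run.
model = AutoModelForCausalLM.from_pretrained("base-model-id")  # hypothetical id
train_dataset = ...  # rebuild the same training dataset used for steps 1-3800

args = TrainingArguments(
    output_dir="outputs",            # directory that contains last-checkpoint/
    per_device_train_batch_size=32,  # matches "train_batch_size": 32 above
    logging_steps=1,                 # matches "logging_steps": 1 above
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset)

# resume_from_checkpoint restores optimizer.pt, scheduler.pt, and
# trainer_state.json, then continues training from global_step 3800.
trainer.train(resume_from_checkpoint="outputs/last-checkpoint")
```

Note that the checkpoint stores a LoRA adapter (`adapter_model.safetensors`), so the real setup presumably wraps the base model with `peft` before handing it to the `Trainer`.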