Training in progress, step 2950, checkpoint
Browse files
last-checkpoint/adapter_config.json
CHANGED
@@ -26,13 +26,13 @@
|
|
26 |
"rank_pattern": {},
|
27 |
"revision": null,
|
28 |
"target_modules": [
|
29 |
-
"
|
|
|
30 |
"up_proj",
|
31 |
"q_proj",
|
32 |
-
"
|
33 |
"k_proj",
|
34 |
-
"
|
35 |
-
"down_proj"
|
36 |
],
|
37 |
"task_type": "CAUSAL_LM",
|
38 |
"use_dora": false,
|
|
|
26 |
"rank_pattern": {},
|
27 |
"revision": null,
|
28 |
"target_modules": [
|
29 |
+
"down_proj",
|
30 |
+
"gate_proj",
|
31 |
"up_proj",
|
32 |
"q_proj",
|
33 |
+
"v_proj",
|
34 |
"k_proj",
|
35 |
+
"o_proj"
|
|
|
36 |
],
|
37 |
"task_type": "CAUSAL_LM",
|
38 |
"use_dora": false,
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1370666272
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c2dded99c0ee7ca1abb8f2fabb96c5156afb103ab3ce6e1c5aaa23701b8648f
|
3 |
size 1370666272
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 697294462
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b045aafc33f03191b84693af1956fd4ffbf7ae3916ccc75dfaad968038476fff
|
3 |
size 697294462
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e748c5d7e6180ed81327ab2a4f0165f8cdc32090ab49af955d587517fb12cdd7
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 0.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -20307,6 +20307,356 @@
|
|
20307 |
"learning_rate": 0.00019121161303315963,
|
20308 |
"loss": 0.8731,
|
20309 |
"step": 2900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20310 |
}
|
20311 |
],
|
20312 |
"logging_steps": 1,
|
@@ -20326,7 +20676,7 @@
|
|
20326 |
"attributes": {}
|
20327 |
}
|
20328 |
},
|
20329 |
-
"total_flos": 1.
|
20330 |
"train_batch_size": 32,
|
20331 |
"trial_name": null,
|
20332 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.6846930486248114,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 2950,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
20307 |
"learning_rate": 0.00019121161303315963,
|
20308 |
"loss": 0.8731,
|
20309 |
"step": 2900
|
20310 |
+
},
|
20311 |
+
{
|
20312 |
+
"epoch": 0.673320181037484,
|
20313 |
+
"grad_norm": 0.4451583921909332,
|
20314 |
+
"learning_rate": 0.00019120563323736343,
|
20315 |
+
"loss": 0.8934,
|
20316 |
+
"step": 2901
|
20317 |
+
},
|
20318 |
+
{
|
20319 |
+
"epoch": 0.6735522803760009,
|
20320 |
+
"grad_norm": 0.41901981830596924,
|
20321 |
+
"learning_rate": 0.00019119965150144095,
|
20322 |
+
"loss": 0.8637,
|
20323 |
+
"step": 2902
|
20324 |
+
},
|
20325 |
+
{
|
20326 |
+
"epoch": 0.6737843797145178,
|
20327 |
+
"grad_norm": 0.42898762226104736,
|
20328 |
+
"learning_rate": 0.00019119366782551937,
|
20329 |
+
"loss": 0.8929,
|
20330 |
+
"step": 2903
|
20331 |
+
},
|
20332 |
+
{
|
20333 |
+
"epoch": 0.6740164790530347,
|
20334 |
+
"grad_norm": 0.4139856994152069,
|
20335 |
+
"learning_rate": 0.00019118768220972596,
|
20336 |
+
"loss": 0.8958,
|
20337 |
+
"step": 2904
|
20338 |
+
},
|
20339 |
+
{
|
20340 |
+
"epoch": 0.6742485783915516,
|
20341 |
+
"grad_norm": 0.4518340528011322,
|
20342 |
+
"learning_rate": 0.0001911816946541881,
|
20343 |
+
"loss": 0.884,
|
20344 |
+
"step": 2905
|
20345 |
+
},
|
20346 |
+
{
|
20347 |
+
"epoch": 0.6744806777300685,
|
20348 |
+
"grad_norm": 0.4949742555618286,
|
20349 |
+
"learning_rate": 0.00019117570515903313,
|
20350 |
+
"loss": 0.9065,
|
20351 |
+
"step": 2906
|
20352 |
+
},
|
20353 |
+
{
|
20354 |
+
"epoch": 0.6747127770685853,
|
20355 |
+
"grad_norm": 0.42285311222076416,
|
20356 |
+
"learning_rate": 0.00019116971372438847,
|
20357 |
+
"loss": 0.9126,
|
20358 |
+
"step": 2907
|
20359 |
+
},
|
20360 |
+
{
|
20361 |
+
"epoch": 0.6749448764071022,
|
20362 |
+
"grad_norm": 0.46767348051071167,
|
20363 |
+
"learning_rate": 0.00019116372035038153,
|
20364 |
+
"loss": 0.8784,
|
20365 |
+
"step": 2908
|
20366 |
+
},
|
20367 |
+
{
|
20368 |
+
"epoch": 0.6751769757456191,
|
20369 |
+
"grad_norm": 0.48399636149406433,
|
20370 |
+
"learning_rate": 0.00019115772503713985,
|
20371 |
+
"loss": 0.8913,
|
20372 |
+
"step": 2909
|
20373 |
+
},
|
20374 |
+
{
|
20375 |
+
"epoch": 0.6754090750841361,
|
20376 |
+
"grad_norm": 0.44633030891418457,
|
20377 |
+
"learning_rate": 0.00019115172778479093,
|
20378 |
+
"loss": 0.8711,
|
20379 |
+
"step": 2910
|
20380 |
+
},
|
20381 |
+
{
|
20382 |
+
"epoch": 0.6756411744226529,
|
20383 |
+
"grad_norm": 0.43487444519996643,
|
20384 |
+
"learning_rate": 0.00019114572859346235,
|
20385 |
+
"loss": 0.8847,
|
20386 |
+
"step": 2911
|
20387 |
+
},
|
20388 |
+
{
|
20389 |
+
"epoch": 0.6758732737611698,
|
20390 |
+
"grad_norm": 0.3979194760322571,
|
20391 |
+
"learning_rate": 0.00019113972746328178,
|
20392 |
+
"loss": 0.849,
|
20393 |
+
"step": 2912
|
20394 |
+
},
|
20395 |
+
{
|
20396 |
+
"epoch": 0.6761053730996867,
|
20397 |
+
"grad_norm": 0.4204396605491638,
|
20398 |
+
"learning_rate": 0.0001911337243943768,
|
20399 |
+
"loss": 0.8596,
|
20400 |
+
"step": 2913
|
20401 |
+
},
|
20402 |
+
{
|
20403 |
+
"epoch": 0.6763374724382035,
|
20404 |
+
"grad_norm": 0.41835030913352966,
|
20405 |
+
"learning_rate": 0.0001911277193868751,
|
20406 |
+
"loss": 0.8431,
|
20407 |
+
"step": 2914
|
20408 |
+
},
|
20409 |
+
{
|
20410 |
+
"epoch": 0.6765695717767204,
|
20411 |
+
"grad_norm": 0.4458625912666321,
|
20412 |
+
"learning_rate": 0.00019112171244090452,
|
20413 |
+
"loss": 0.8341,
|
20414 |
+
"step": 2915
|
20415 |
+
},
|
20416 |
+
{
|
20417 |
+
"epoch": 0.6768016711152374,
|
20418 |
+
"grad_norm": 0.4265308976173401,
|
20419 |
+
"learning_rate": 0.0001911157035565927,
|
20420 |
+
"loss": 0.8193,
|
20421 |
+
"step": 2916
|
20422 |
+
},
|
20423 |
+
{
|
20424 |
+
"epoch": 0.6770337704537542,
|
20425 |
+
"grad_norm": 0.4003806412220001,
|
20426 |
+
"learning_rate": 0.0001911096927340676,
|
20427 |
+
"loss": 0.8821,
|
20428 |
+
"step": 2917
|
20429 |
+
},
|
20430 |
+
{
|
20431 |
+
"epoch": 0.6772658697922711,
|
20432 |
+
"grad_norm": 0.44573527574539185,
|
20433 |
+
"learning_rate": 0.00019110367997345697,
|
20434 |
+
"loss": 0.864,
|
20435 |
+
"step": 2918
|
20436 |
+
},
|
20437 |
+
{
|
20438 |
+
"epoch": 0.677497969130788,
|
20439 |
+
"grad_norm": 0.4213849902153015,
|
20440 |
+
"learning_rate": 0.00019109766527488877,
|
20441 |
+
"loss": 0.8711,
|
20442 |
+
"step": 2919
|
20443 |
+
},
|
20444 |
+
{
|
20445 |
+
"epoch": 0.6777300684693048,
|
20446 |
+
"grad_norm": 0.41736915707588196,
|
20447 |
+
"learning_rate": 0.00019109164863849096,
|
20448 |
+
"loss": 0.8666,
|
20449 |
+
"step": 2920
|
20450 |
+
},
|
20451 |
+
{
|
20452 |
+
"epoch": 0.6779621678078217,
|
20453 |
+
"grad_norm": 0.4173840284347534,
|
20454 |
+
"learning_rate": 0.00019108563006439147,
|
20455 |
+
"loss": 0.8964,
|
20456 |
+
"step": 2921
|
20457 |
+
},
|
20458 |
+
{
|
20459 |
+
"epoch": 0.6781942671463387,
|
20460 |
+
"grad_norm": 0.4290173649787903,
|
20461 |
+
"learning_rate": 0.00019107960955271836,
|
20462 |
+
"loss": 0.8684,
|
20463 |
+
"step": 2922
|
20464 |
+
},
|
20465 |
+
{
|
20466 |
+
"epoch": 0.6784263664848555,
|
20467 |
+
"grad_norm": 0.4732690751552582,
|
20468 |
+
"learning_rate": 0.0001910735871035997,
|
20469 |
+
"loss": 0.844,
|
20470 |
+
"step": 2923
|
20471 |
+
},
|
20472 |
+
{
|
20473 |
+
"epoch": 0.6786584658233724,
|
20474 |
+
"grad_norm": 0.44380733370780945,
|
20475 |
+
"learning_rate": 0.00019106756271716362,
|
20476 |
+
"loss": 0.8779,
|
20477 |
+
"step": 2924
|
20478 |
+
},
|
20479 |
+
{
|
20480 |
+
"epoch": 0.6788905651618893,
|
20481 |
+
"grad_norm": 0.4828498959541321,
|
20482 |
+
"learning_rate": 0.00019106153639353822,
|
20483 |
+
"loss": 0.8606,
|
20484 |
+
"step": 2925
|
20485 |
+
},
|
20486 |
+
{
|
20487 |
+
"epoch": 0.6791226645004061,
|
20488 |
+
"grad_norm": 0.4402746260166168,
|
20489 |
+
"learning_rate": 0.00019105550813285175,
|
20490 |
+
"loss": 0.8463,
|
20491 |
+
"step": 2926
|
20492 |
+
},
|
20493 |
+
{
|
20494 |
+
"epoch": 0.679354763838923,
|
20495 |
+
"grad_norm": 0.44497203826904297,
|
20496 |
+
"learning_rate": 0.00019104947793523234,
|
20497 |
+
"loss": 0.8601,
|
20498 |
+
"step": 2927
|
20499 |
+
},
|
20500 |
+
{
|
20501 |
+
"epoch": 0.67958686317744,
|
20502 |
+
"grad_norm": 0.44765856862068176,
|
20503 |
+
"learning_rate": 0.00019104344580080838,
|
20504 |
+
"loss": 0.8867,
|
20505 |
+
"step": 2928
|
20506 |
+
},
|
20507 |
+
{
|
20508 |
+
"epoch": 0.6798189625159569,
|
20509 |
+
"grad_norm": 0.43054118752479553,
|
20510 |
+
"learning_rate": 0.00019103741172970818,
|
20511 |
+
"loss": 0.8119,
|
20512 |
+
"step": 2929
|
20513 |
+
},
|
20514 |
+
{
|
20515 |
+
"epoch": 0.6800510618544737,
|
20516 |
+
"grad_norm": 0.555328369140625,
|
20517 |
+
"learning_rate": 0.00019103137572206,
|
20518 |
+
"loss": 0.8219,
|
20519 |
+
"step": 2930
|
20520 |
+
},
|
20521 |
+
{
|
20522 |
+
"epoch": 0.6802831611929906,
|
20523 |
+
"grad_norm": 0.45921704173088074,
|
20524 |
+
"learning_rate": 0.0001910253377779923,
|
20525 |
+
"loss": 0.8887,
|
20526 |
+
"step": 2931
|
20527 |
+
},
|
20528 |
+
{
|
20529 |
+
"epoch": 0.6805152605315075,
|
20530 |
+
"grad_norm": 0.4183528423309326,
|
20531 |
+
"learning_rate": 0.00019101929789763354,
|
20532 |
+
"loss": 0.885,
|
20533 |
+
"step": 2932
|
20534 |
+
},
|
20535 |
+
{
|
20536 |
+
"epoch": 0.6807473598700243,
|
20537 |
+
"grad_norm": 0.4342934787273407,
|
20538 |
+
"learning_rate": 0.00019101325608111218,
|
20539 |
+
"loss": 0.9084,
|
20540 |
+
"step": 2933
|
20541 |
+
},
|
20542 |
+
{
|
20543 |
+
"epoch": 0.6809794592085413,
|
20544 |
+
"grad_norm": 0.41013672947883606,
|
20545 |
+
"learning_rate": 0.0001910072123285567,
|
20546 |
+
"loss": 0.8773,
|
20547 |
+
"step": 2934
|
20548 |
+
},
|
20549 |
+
{
|
20550 |
+
"epoch": 0.6812115585470582,
|
20551 |
+
"grad_norm": 0.4397852122783661,
|
20552 |
+
"learning_rate": 0.00019100116664009576,
|
20553 |
+
"loss": 0.8478,
|
20554 |
+
"step": 2935
|
20555 |
+
},
|
20556 |
+
{
|
20557 |
+
"epoch": 0.681443657885575,
|
20558 |
+
"grad_norm": 0.46658027172088623,
|
20559 |
+
"learning_rate": 0.00019099511901585786,
|
20560 |
+
"loss": 0.8682,
|
20561 |
+
"step": 2936
|
20562 |
+
},
|
20563 |
+
{
|
20564 |
+
"epoch": 0.6816757572240919,
|
20565 |
+
"grad_norm": 0.4161824584007263,
|
20566 |
+
"learning_rate": 0.00019098906945597168,
|
20567 |
+
"loss": 0.8447,
|
20568 |
+
"step": 2937
|
20569 |
+
},
|
20570 |
+
{
|
20571 |
+
"epoch": 0.6819078565626088,
|
20572 |
+
"grad_norm": 0.45820096135139465,
|
20573 |
+
"learning_rate": 0.00019098301796056593,
|
20574 |
+
"loss": 0.8632,
|
20575 |
+
"step": 2938
|
20576 |
+
},
|
20577 |
+
{
|
20578 |
+
"epoch": 0.6821399559011256,
|
20579 |
+
"grad_norm": 0.49335211515426636,
|
20580 |
+
"learning_rate": 0.00019097696452976935,
|
20581 |
+
"loss": 0.8543,
|
20582 |
+
"step": 2939
|
20583 |
+
},
|
20584 |
+
{
|
20585 |
+
"epoch": 0.6823720552396426,
|
20586 |
+
"grad_norm": 0.5060347318649292,
|
20587 |
+
"learning_rate": 0.00019097090916371062,
|
20588 |
+
"loss": 0.9283,
|
20589 |
+
"step": 2940
|
20590 |
+
},
|
20591 |
+
{
|
20592 |
+
"epoch": 0.6826041545781595,
|
20593 |
+
"grad_norm": 0.5007983446121216,
|
20594 |
+
"learning_rate": 0.00019096485186251866,
|
20595 |
+
"loss": 0.8542,
|
20596 |
+
"step": 2941
|
20597 |
+
},
|
20598 |
+
{
|
20599 |
+
"epoch": 0.6828362539166764,
|
20600 |
+
"grad_norm": 0.5087704062461853,
|
20601 |
+
"learning_rate": 0.00019095879262632227,
|
20602 |
+
"loss": 0.8908,
|
20603 |
+
"step": 2942
|
20604 |
+
},
|
20605 |
+
{
|
20606 |
+
"epoch": 0.6830683532551932,
|
20607 |
+
"grad_norm": 0.5069675445556641,
|
20608 |
+
"learning_rate": 0.0001909527314552503,
|
20609 |
+
"loss": 0.9079,
|
20610 |
+
"step": 2943
|
20611 |
+
},
|
20612 |
+
{
|
20613 |
+
"epoch": 0.6833004525937101,
|
20614 |
+
"grad_norm": 0.47137320041656494,
|
20615 |
+
"learning_rate": 0.00019094666834943179,
|
20616 |
+
"loss": 0.8626,
|
20617 |
+
"step": 2944
|
20618 |
+
},
|
20619 |
+
{
|
20620 |
+
"epoch": 0.683532551932227,
|
20621 |
+
"grad_norm": 0.4283658564090729,
|
20622 |
+
"learning_rate": 0.0001909406033089956,
|
20623 |
+
"loss": 0.8541,
|
20624 |
+
"step": 2945
|
20625 |
+
},
|
20626 |
+
{
|
20627 |
+
"epoch": 0.6837646512707439,
|
20628 |
+
"grad_norm": 0.46082451939582825,
|
20629 |
+
"learning_rate": 0.00019093453633407082,
|
20630 |
+
"loss": 0.8143,
|
20631 |
+
"step": 2946
|
20632 |
+
},
|
20633 |
+
{
|
20634 |
+
"epoch": 0.6839967506092608,
|
20635 |
+
"grad_norm": 0.4551635682582855,
|
20636 |
+
"learning_rate": 0.00019092846742478647,
|
20637 |
+
"loss": 0.8945,
|
20638 |
+
"step": 2947
|
20639 |
+
},
|
20640 |
+
{
|
20641 |
+
"epoch": 0.6842288499477777,
|
20642 |
+
"grad_norm": 0.5660843253135681,
|
20643 |
+
"learning_rate": 0.00019092239658127167,
|
20644 |
+
"loss": 0.8522,
|
20645 |
+
"step": 2948
|
20646 |
+
},
|
20647 |
+
{
|
20648 |
+
"epoch": 0.6844609492862945,
|
20649 |
+
"grad_norm": 0.481251060962677,
|
20650 |
+
"learning_rate": 0.00019091632380365553,
|
20651 |
+
"loss": 0.8549,
|
20652 |
+
"step": 2949
|
20653 |
+
},
|
20654 |
+
{
|
20655 |
+
"epoch": 0.6846930486248114,
|
20656 |
+
"grad_norm": 0.45565807819366455,
|
20657 |
+
"learning_rate": 0.00019091024909206729,
|
20658 |
+
"loss": 0.8892,
|
20659 |
+
"step": 2950
|
20660 |
}
|
20661 |
],
|
20662 |
"logging_steps": 1,
|
|
|
20676 |
"attributes": {}
|
20677 |
}
|
20678 |
},
|
20679 |
+
"total_flos": 1.3094363131281408e+18,
|
20680 |
"train_batch_size": 32,
|
20681 |
"trial_name": null,
|
20682 |
"trial_params": null
|