mosama commited on
Commit
e4e2196
·
verified ·
1 Parent(s): 267b083

Training in progress, step 2950, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -26,13 +26,13 @@
26
  "rank_pattern": {},
27
  "revision": null,
28
  "target_modules": [
29
- "v_proj",
 
30
  "up_proj",
31
  "q_proj",
32
- "o_proj",
33
  "k_proj",
34
- "gate_proj",
35
- "down_proj"
36
  ],
37
  "task_type": "CAUSAL_LM",
38
  "use_dora": false,
 
26
  "rank_pattern": {},
27
  "revision": null,
28
  "target_modules": [
29
+ "down_proj",
30
+ "gate_proj",
31
  "up_proj",
32
  "q_proj",
33
+ "v_proj",
34
  "k_proj",
35
+ "o_proj"
 
36
  ],
37
  "task_type": "CAUSAL_LM",
38
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85e10776ed7d2feec702f85a92294fc572495be458fad36bc37e21242039a14d
3
  size 1370666272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c2dded99c0ee7ca1abb8f2fabb96c5156afb103ab3ce6e1c5aaa23701b8648f
3
  size 1370666272
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f328674c88f0186255ee5dbf4ac7f148eb4bef19de18a361c1ed0eb9ce9660bb
3
  size 697294462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b045aafc33f03191b84693af1956fd4ffbf7ae3916ccc75dfaad968038476fff
3
  size 697294462
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:605d9f0439096f21199e65a6f7490d22d8285df735f81d56920505482985be35
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e748c5d7e6180ed81327ab2a4f0165f8cdc32090ab49af955d587517fb12cdd7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6730880816989672,
5
  "eval_steps": 500,
6
- "global_step": 2900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -20307,6 +20307,356 @@
20307
  "learning_rate": 0.00019121161303315963,
20308
  "loss": 0.8731,
20309
  "step": 2900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20310
  }
20311
  ],
20312
  "logging_steps": 1,
@@ -20326,7 +20676,7 @@
20326
  "attributes": {}
20327
  }
20328
  },
20329
- "total_flos": 1.2872424773124096e+18,
20330
  "train_batch_size": 32,
20331
  "trial_name": null,
20332
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6846930486248114,
5
  "eval_steps": 500,
6
+ "global_step": 2950,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
20307
  "learning_rate": 0.00019121161303315963,
20308
  "loss": 0.8731,
20309
  "step": 2900
20310
+ },
20311
+ {
20312
+ "epoch": 0.673320181037484,
20313
+ "grad_norm": 0.4451583921909332,
20314
+ "learning_rate": 0.00019120563323736343,
20315
+ "loss": 0.8934,
20316
+ "step": 2901
20317
+ },
20318
+ {
20319
+ "epoch": 0.6735522803760009,
20320
+ "grad_norm": 0.41901981830596924,
20321
+ "learning_rate": 0.00019119965150144095,
20322
+ "loss": 0.8637,
20323
+ "step": 2902
20324
+ },
20325
+ {
20326
+ "epoch": 0.6737843797145178,
20327
+ "grad_norm": 0.42898762226104736,
20328
+ "learning_rate": 0.00019119366782551937,
20329
+ "loss": 0.8929,
20330
+ "step": 2903
20331
+ },
20332
+ {
20333
+ "epoch": 0.6740164790530347,
20334
+ "grad_norm": 0.4139856994152069,
20335
+ "learning_rate": 0.00019118768220972596,
20336
+ "loss": 0.8958,
20337
+ "step": 2904
20338
+ },
20339
+ {
20340
+ "epoch": 0.6742485783915516,
20341
+ "grad_norm": 0.4518340528011322,
20342
+ "learning_rate": 0.0001911816946541881,
20343
+ "loss": 0.884,
20344
+ "step": 2905
20345
+ },
20346
+ {
20347
+ "epoch": 0.6744806777300685,
20348
+ "grad_norm": 0.4949742555618286,
20349
+ "learning_rate": 0.00019117570515903313,
20350
+ "loss": 0.9065,
20351
+ "step": 2906
20352
+ },
20353
+ {
20354
+ "epoch": 0.6747127770685853,
20355
+ "grad_norm": 0.42285311222076416,
20356
+ "learning_rate": 0.00019116971372438847,
20357
+ "loss": 0.9126,
20358
+ "step": 2907
20359
+ },
20360
+ {
20361
+ "epoch": 0.6749448764071022,
20362
+ "grad_norm": 0.46767348051071167,
20363
+ "learning_rate": 0.00019116372035038153,
20364
+ "loss": 0.8784,
20365
+ "step": 2908
20366
+ },
20367
+ {
20368
+ "epoch": 0.6751769757456191,
20369
+ "grad_norm": 0.48399636149406433,
20370
+ "learning_rate": 0.00019115772503713985,
20371
+ "loss": 0.8913,
20372
+ "step": 2909
20373
+ },
20374
+ {
20375
+ "epoch": 0.6754090750841361,
20376
+ "grad_norm": 0.44633030891418457,
20377
+ "learning_rate": 0.00019115172778479093,
20378
+ "loss": 0.8711,
20379
+ "step": 2910
20380
+ },
20381
+ {
20382
+ "epoch": 0.6756411744226529,
20383
+ "grad_norm": 0.43487444519996643,
20384
+ "learning_rate": 0.00019114572859346235,
20385
+ "loss": 0.8847,
20386
+ "step": 2911
20387
+ },
20388
+ {
20389
+ "epoch": 0.6758732737611698,
20390
+ "grad_norm": 0.3979194760322571,
20391
+ "learning_rate": 0.00019113972746328178,
20392
+ "loss": 0.849,
20393
+ "step": 2912
20394
+ },
20395
+ {
20396
+ "epoch": 0.6761053730996867,
20397
+ "grad_norm": 0.4204396605491638,
20398
+ "learning_rate": 0.0001911337243943768,
20399
+ "loss": 0.8596,
20400
+ "step": 2913
20401
+ },
20402
+ {
20403
+ "epoch": 0.6763374724382035,
20404
+ "grad_norm": 0.41835030913352966,
20405
+ "learning_rate": 0.0001911277193868751,
20406
+ "loss": 0.8431,
20407
+ "step": 2914
20408
+ },
20409
+ {
20410
+ "epoch": 0.6765695717767204,
20411
+ "grad_norm": 0.4458625912666321,
20412
+ "learning_rate": 0.00019112171244090452,
20413
+ "loss": 0.8341,
20414
+ "step": 2915
20415
+ },
20416
+ {
20417
+ "epoch": 0.6768016711152374,
20418
+ "grad_norm": 0.4265308976173401,
20419
+ "learning_rate": 0.0001911157035565927,
20420
+ "loss": 0.8193,
20421
+ "step": 2916
20422
+ },
20423
+ {
20424
+ "epoch": 0.6770337704537542,
20425
+ "grad_norm": 0.4003806412220001,
20426
+ "learning_rate": 0.0001911096927340676,
20427
+ "loss": 0.8821,
20428
+ "step": 2917
20429
+ },
20430
+ {
20431
+ "epoch": 0.6772658697922711,
20432
+ "grad_norm": 0.44573527574539185,
20433
+ "learning_rate": 0.00019110367997345697,
20434
+ "loss": 0.864,
20435
+ "step": 2918
20436
+ },
20437
+ {
20438
+ "epoch": 0.677497969130788,
20439
+ "grad_norm": 0.4213849902153015,
20440
+ "learning_rate": 0.00019109766527488877,
20441
+ "loss": 0.8711,
20442
+ "step": 2919
20443
+ },
20444
+ {
20445
+ "epoch": 0.6777300684693048,
20446
+ "grad_norm": 0.41736915707588196,
20447
+ "learning_rate": 0.00019109164863849096,
20448
+ "loss": 0.8666,
20449
+ "step": 2920
20450
+ },
20451
+ {
20452
+ "epoch": 0.6779621678078217,
20453
+ "grad_norm": 0.4173840284347534,
20454
+ "learning_rate": 0.00019108563006439147,
20455
+ "loss": 0.8964,
20456
+ "step": 2921
20457
+ },
20458
+ {
20459
+ "epoch": 0.6781942671463387,
20460
+ "grad_norm": 0.4290173649787903,
20461
+ "learning_rate": 0.00019107960955271836,
20462
+ "loss": 0.8684,
20463
+ "step": 2922
20464
+ },
20465
+ {
20466
+ "epoch": 0.6784263664848555,
20467
+ "grad_norm": 0.4732690751552582,
20468
+ "learning_rate": 0.0001910735871035997,
20469
+ "loss": 0.844,
20470
+ "step": 2923
20471
+ },
20472
+ {
20473
+ "epoch": 0.6786584658233724,
20474
+ "grad_norm": 0.44380733370780945,
20475
+ "learning_rate": 0.00019106756271716362,
20476
+ "loss": 0.8779,
20477
+ "step": 2924
20478
+ },
20479
+ {
20480
+ "epoch": 0.6788905651618893,
20481
+ "grad_norm": 0.4828498959541321,
20482
+ "learning_rate": 0.00019106153639353822,
20483
+ "loss": 0.8606,
20484
+ "step": 2925
20485
+ },
20486
+ {
20487
+ "epoch": 0.6791226645004061,
20488
+ "grad_norm": 0.4402746260166168,
20489
+ "learning_rate": 0.00019105550813285175,
20490
+ "loss": 0.8463,
20491
+ "step": 2926
20492
+ },
20493
+ {
20494
+ "epoch": 0.679354763838923,
20495
+ "grad_norm": 0.44497203826904297,
20496
+ "learning_rate": 0.00019104947793523234,
20497
+ "loss": 0.8601,
20498
+ "step": 2927
20499
+ },
20500
+ {
20501
+ "epoch": 0.67958686317744,
20502
+ "grad_norm": 0.44765856862068176,
20503
+ "learning_rate": 0.00019104344580080838,
20504
+ "loss": 0.8867,
20505
+ "step": 2928
20506
+ },
20507
+ {
20508
+ "epoch": 0.6798189625159569,
20509
+ "grad_norm": 0.43054118752479553,
20510
+ "learning_rate": 0.00019103741172970818,
20511
+ "loss": 0.8119,
20512
+ "step": 2929
20513
+ },
20514
+ {
20515
+ "epoch": 0.6800510618544737,
20516
+ "grad_norm": 0.555328369140625,
20517
+ "learning_rate": 0.00019103137572206,
20518
+ "loss": 0.8219,
20519
+ "step": 2930
20520
+ },
20521
+ {
20522
+ "epoch": 0.6802831611929906,
20523
+ "grad_norm": 0.45921704173088074,
20524
+ "learning_rate": 0.0001910253377779923,
20525
+ "loss": 0.8887,
20526
+ "step": 2931
20527
+ },
20528
+ {
20529
+ "epoch": 0.6805152605315075,
20530
+ "grad_norm": 0.4183528423309326,
20531
+ "learning_rate": 0.00019101929789763354,
20532
+ "loss": 0.885,
20533
+ "step": 2932
20534
+ },
20535
+ {
20536
+ "epoch": 0.6807473598700243,
20537
+ "grad_norm": 0.4342934787273407,
20538
+ "learning_rate": 0.00019101325608111218,
20539
+ "loss": 0.9084,
20540
+ "step": 2933
20541
+ },
20542
+ {
20543
+ "epoch": 0.6809794592085413,
20544
+ "grad_norm": 0.41013672947883606,
20545
+ "learning_rate": 0.0001910072123285567,
20546
+ "loss": 0.8773,
20547
+ "step": 2934
20548
+ },
20549
+ {
20550
+ "epoch": 0.6812115585470582,
20551
+ "grad_norm": 0.4397852122783661,
20552
+ "learning_rate": 0.00019100116664009576,
20553
+ "loss": 0.8478,
20554
+ "step": 2935
20555
+ },
20556
+ {
20557
+ "epoch": 0.681443657885575,
20558
+ "grad_norm": 0.46658027172088623,
20559
+ "learning_rate": 0.00019099511901585786,
20560
+ "loss": 0.8682,
20561
+ "step": 2936
20562
+ },
20563
+ {
20564
+ "epoch": 0.6816757572240919,
20565
+ "grad_norm": 0.4161824584007263,
20566
+ "learning_rate": 0.00019098906945597168,
20567
+ "loss": 0.8447,
20568
+ "step": 2937
20569
+ },
20570
+ {
20571
+ "epoch": 0.6819078565626088,
20572
+ "grad_norm": 0.45820096135139465,
20573
+ "learning_rate": 0.00019098301796056593,
20574
+ "loss": 0.8632,
20575
+ "step": 2938
20576
+ },
20577
+ {
20578
+ "epoch": 0.6821399559011256,
20579
+ "grad_norm": 0.49335211515426636,
20580
+ "learning_rate": 0.00019097696452976935,
20581
+ "loss": 0.8543,
20582
+ "step": 2939
20583
+ },
20584
+ {
20585
+ "epoch": 0.6823720552396426,
20586
+ "grad_norm": 0.5060347318649292,
20587
+ "learning_rate": 0.00019097090916371062,
20588
+ "loss": 0.9283,
20589
+ "step": 2940
20590
+ },
20591
+ {
20592
+ "epoch": 0.6826041545781595,
20593
+ "grad_norm": 0.5007983446121216,
20594
+ "learning_rate": 0.00019096485186251866,
20595
+ "loss": 0.8542,
20596
+ "step": 2941
20597
+ },
20598
+ {
20599
+ "epoch": 0.6828362539166764,
20600
+ "grad_norm": 0.5087704062461853,
20601
+ "learning_rate": 0.00019095879262632227,
20602
+ "loss": 0.8908,
20603
+ "step": 2942
20604
+ },
20605
+ {
20606
+ "epoch": 0.6830683532551932,
20607
+ "grad_norm": 0.5069675445556641,
20608
+ "learning_rate": 0.0001909527314552503,
20609
+ "loss": 0.9079,
20610
+ "step": 2943
20611
+ },
20612
+ {
20613
+ "epoch": 0.6833004525937101,
20614
+ "grad_norm": 0.47137320041656494,
20615
+ "learning_rate": 0.00019094666834943179,
20616
+ "loss": 0.8626,
20617
+ "step": 2944
20618
+ },
20619
+ {
20620
+ "epoch": 0.683532551932227,
20621
+ "grad_norm": 0.4283658564090729,
20622
+ "learning_rate": 0.0001909406033089956,
20623
+ "loss": 0.8541,
20624
+ "step": 2945
20625
+ },
20626
+ {
20627
+ "epoch": 0.6837646512707439,
20628
+ "grad_norm": 0.46082451939582825,
20629
+ "learning_rate": 0.00019093453633407082,
20630
+ "loss": 0.8143,
20631
+ "step": 2946
20632
+ },
20633
+ {
20634
+ "epoch": 0.6839967506092608,
20635
+ "grad_norm": 0.4551635682582855,
20636
+ "learning_rate": 0.00019092846742478647,
20637
+ "loss": 0.8945,
20638
+ "step": 2947
20639
+ },
20640
+ {
20641
+ "epoch": 0.6842288499477777,
20642
+ "grad_norm": 0.5660843253135681,
20643
+ "learning_rate": 0.00019092239658127167,
20644
+ "loss": 0.8522,
20645
+ "step": 2948
20646
+ },
20647
+ {
20648
+ "epoch": 0.6844609492862945,
20649
+ "grad_norm": 0.481251060962677,
20650
+ "learning_rate": 0.00019091632380365553,
20651
+ "loss": 0.8549,
20652
+ "step": 2949
20653
+ },
20654
+ {
20655
+ "epoch": 0.6846930486248114,
20656
+ "grad_norm": 0.45565807819366455,
20657
+ "learning_rate": 0.00019091024909206729,
20658
+ "loss": 0.8892,
20659
+ "step": 2950
20660
  }
20661
  ],
20662
  "logging_steps": 1,
 
20676
  "attributes": {}
20677
  }
20678
  },
20679
+ "total_flos": 1.3094363131281408e+18,
20680
  "train_batch_size": 32,
20681
  "trial_name": null,
20682
  "trial_params": null