mosama commited on
Commit
48ca390
·
verified ·
1 Parent(s): 4df635b

Training in progress, step 2850, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44791fb686c0f92a3ca53186840e2c8355789932001a028b47646af1a8a1b45c
3
  size 1370666272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:789f226e44ff6175f0650db489f0554e7f69dc5b63c5b19f6f8f90422e097bc3
3
  size 1370666272
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d586e1619bf49bc533732763da9dc62537363ab53027df7eb9192650c866d327
3
  size 697294462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53c7ca855a09c6703804528921ba002a4454692bef620396449f5abdd6380228
3
  size 697294462
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54c9effe6f4720ddc5037c1f923116bf7c70164eba829d93d939ea303faa1268
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afaac9fbe4271faaba5196ab94e52163e6bf1b95bd8386498fc1f2c58b28a4a4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6498781478472786,
5
  "eval_steps": 500,
6
- "global_step": 2800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -19607,6 +19607,356 @@
19607
  "learning_rate": 0.00019179977330980487,
19608
  "loss": 0.8965,
19609
  "step": 2800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19610
  }
19611
  ],
19612
  "logging_steps": 1,
@@ -19626,7 +19976,7 @@
19626
  "attributes": {}
19627
  }
19628
  },
19629
- "total_flos": 1.2428548056809472e+18,
19630
  "train_batch_size": 32,
19631
  "trial_name": null,
19632
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6614831147731229,
5
  "eval_steps": 500,
6
+ "global_step": 2850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
19607
  "learning_rate": 0.00019179977330980487,
19608
  "loss": 0.8965,
19609
  "step": 2800
19610
+ },
19611
+ {
19612
+ "epoch": 0.6501102471857956,
19613
+ "grad_norm": 0.4750087857246399,
19614
+ "learning_rate": 0.00019179398816212382,
19615
+ "loss": 0.9313,
19616
+ "step": 2801
19617
+ },
19618
+ {
19619
+ "epoch": 0.6503423465243124,
19620
+ "grad_norm": 0.4137982130050659,
19621
+ "learning_rate": 0.00019178820106180094,
19622
+ "loss": 0.8269,
19623
+ "step": 2802
19624
+ },
19625
+ {
19626
+ "epoch": 0.6505744458628293,
19627
+ "grad_norm": 0.48815712332725525,
19628
+ "learning_rate": 0.00019178241200895935,
19629
+ "loss": 0.8957,
19630
+ "step": 2803
19631
+ },
19632
+ {
19633
+ "epoch": 0.6508065452013462,
19634
+ "grad_norm": 0.5212056636810303,
19635
+ "learning_rate": 0.0001917766210037222,
19636
+ "loss": 0.826,
19637
+ "step": 2804
19638
+ },
19639
+ {
19640
+ "epoch": 0.651038644539863,
19641
+ "grad_norm": 0.46167051792144775,
19642
+ "learning_rate": 0.0001917708280462126,
19643
+ "loss": 0.9014,
19644
+ "step": 2805
19645
+ },
19646
+ {
19647
+ "epoch": 0.6512707438783799,
19648
+ "grad_norm": 0.47623032331466675,
19649
+ "learning_rate": 0.00019176503313655393,
19650
+ "loss": 0.8882,
19651
+ "step": 2806
19652
+ },
19653
+ {
19654
+ "epoch": 0.6515028432168968,
19655
+ "grad_norm": 0.46180862188339233,
19656
+ "learning_rate": 0.00019175923627486936,
19657
+ "loss": 0.9117,
19658
+ "step": 2807
19659
+ },
19660
+ {
19661
+ "epoch": 0.6517349425554138,
19662
+ "grad_norm": 0.4569379687309265,
19663
+ "learning_rate": 0.0001917534374612822,
19664
+ "loss": 0.8399,
19665
+ "step": 2808
19666
+ },
19667
+ {
19668
+ "epoch": 0.6519670418939306,
19669
+ "grad_norm": 0.42162245512008667,
19670
+ "learning_rate": 0.00019174763669591583,
19671
+ "loss": 0.8652,
19672
+ "step": 2809
19673
+ },
19674
+ {
19675
+ "epoch": 0.6521991412324475,
19676
+ "grad_norm": 0.4374902546405792,
19677
+ "learning_rate": 0.0001917418339788936,
19678
+ "loss": 0.895,
19679
+ "step": 2810
19680
+ },
19681
+ {
19682
+ "epoch": 0.6524312405709644,
19683
+ "grad_norm": 0.4497464895248413,
19684
+ "learning_rate": 0.000191736029310339,
19685
+ "loss": 0.8953,
19686
+ "step": 2811
19687
+ },
19688
+ {
19689
+ "epoch": 0.6526633399094812,
19690
+ "grad_norm": 0.4323320686817169,
19691
+ "learning_rate": 0.00019173022269037548,
19692
+ "loss": 0.8703,
19693
+ "step": 2812
19694
+ },
19695
+ {
19696
+ "epoch": 0.6528954392479981,
19697
+ "grad_norm": 0.45908528566360474,
19698
+ "learning_rate": 0.00019172441411912657,
19699
+ "loss": 0.8765,
19700
+ "step": 2813
19701
+ },
19702
+ {
19703
+ "epoch": 0.6531275385865151,
19704
+ "grad_norm": 0.41703182458877563,
19705
+ "learning_rate": 0.00019171860359671583,
19706
+ "loss": 0.8681,
19707
+ "step": 2814
19708
+ },
19709
+ {
19710
+ "epoch": 0.6533596379250319,
19711
+ "grad_norm": 0.45060259103775024,
19712
+ "learning_rate": 0.00019171279112326683,
19713
+ "loss": 0.8919,
19714
+ "step": 2815
19715
+ },
19716
+ {
19717
+ "epoch": 0.6535917372635488,
19718
+ "grad_norm": 0.4701296389102936,
19719
+ "learning_rate": 0.00019170697669890324,
19720
+ "loss": 0.8749,
19721
+ "step": 2816
19722
+ },
19723
+ {
19724
+ "epoch": 0.6538238366020657,
19725
+ "grad_norm": 0.4668188691139221,
19726
+ "learning_rate": 0.00019170116032374876,
19727
+ "loss": 0.8601,
19728
+ "step": 2817
19729
+ },
19730
+ {
19731
+ "epoch": 0.6540559359405825,
19732
+ "grad_norm": 0.42963141202926636,
19733
+ "learning_rate": 0.0001916953419979271,
19734
+ "loss": 0.884,
19735
+ "step": 2818
19736
+ },
19737
+ {
19738
+ "epoch": 0.6542880352790994,
19739
+ "grad_norm": 0.5206764340400696,
19740
+ "learning_rate": 0.00019168952172156202,
19741
+ "loss": 0.8831,
19742
+ "step": 2819
19743
+ },
19744
+ {
19745
+ "epoch": 0.6545201346176164,
19746
+ "grad_norm": 0.4822680652141571,
19747
+ "learning_rate": 0.0001916836994947773,
19748
+ "loss": 0.8141,
19749
+ "step": 2820
19750
+ },
19751
+ {
19752
+ "epoch": 0.6547522339561332,
19753
+ "grad_norm": 0.44132062792778015,
19754
+ "learning_rate": 0.00019167787531769684,
19755
+ "loss": 0.8837,
19756
+ "step": 2821
19757
+ },
19758
+ {
19759
+ "epoch": 0.6549843332946501,
19760
+ "grad_norm": 0.47267404198646545,
19761
+ "learning_rate": 0.00019167204919044451,
19762
+ "loss": 0.9059,
19763
+ "step": 2822
19764
+ },
19765
+ {
19766
+ "epoch": 0.655216432633167,
19767
+ "grad_norm": 0.4189220070838928,
19768
+ "learning_rate": 0.00019166622111314426,
19769
+ "loss": 0.8696,
19770
+ "step": 2823
19771
+ },
19772
+ {
19773
+ "epoch": 0.6554485319716838,
19774
+ "grad_norm": 0.41616180539131165,
19775
+ "learning_rate": 0.0001916603910859201,
19776
+ "loss": 0.8296,
19777
+ "step": 2824
19778
+ },
19779
+ {
19780
+ "epoch": 0.6556806313102007,
19781
+ "grad_norm": 0.4162457287311554,
19782
+ "learning_rate": 0.00019165455910889593,
19783
+ "loss": 0.8204,
19784
+ "step": 2825
19785
+ },
19786
+ {
19787
+ "epoch": 0.6559127306487177,
19788
+ "grad_norm": 0.4778987467288971,
19789
+ "learning_rate": 0.0001916487251821959,
19790
+ "loss": 0.8528,
19791
+ "step": 2826
19792
+ },
19793
+ {
19794
+ "epoch": 0.6561448299872346,
19795
+ "grad_norm": 0.4973873198032379,
19796
+ "learning_rate": 0.0001916428893059441,
19797
+ "loss": 0.8403,
19798
+ "step": 2827
19799
+ },
19800
+ {
19801
+ "epoch": 0.6563769293257514,
19802
+ "grad_norm": 0.4930678904056549,
19803
+ "learning_rate": 0.00019163705148026464,
19804
+ "loss": 0.8223,
19805
+ "step": 2828
19806
+ },
19807
+ {
19808
+ "epoch": 0.6566090286642683,
19809
+ "grad_norm": 0.44355422258377075,
19810
+ "learning_rate": 0.00019163121170528175,
19811
+ "loss": 0.8361,
19812
+ "step": 2829
19813
+ },
19814
+ {
19815
+ "epoch": 0.6568411280027852,
19816
+ "grad_norm": 0.45476454496383667,
19817
+ "learning_rate": 0.0001916253699811196,
19818
+ "loss": 0.8712,
19819
+ "step": 2830
19820
+ },
19821
+ {
19822
+ "epoch": 0.657073227341302,
19823
+ "grad_norm": 0.4533182382583618,
19824
+ "learning_rate": 0.00019161952630790248,
19825
+ "loss": 0.8984,
19826
+ "step": 2831
19827
+ },
19828
+ {
19829
+ "epoch": 0.657305326679819,
19830
+ "grad_norm": 0.4435712695121765,
19831
+ "learning_rate": 0.0001916136806857547,
19832
+ "loss": 0.8294,
19833
+ "step": 2832
19834
+ },
19835
+ {
19836
+ "epoch": 0.6575374260183359,
19837
+ "grad_norm": 0.5167298316955566,
19838
+ "learning_rate": 0.00019160783311480061,
19839
+ "loss": 0.9074,
19840
+ "step": 2833
19841
+ },
19842
+ {
19843
+ "epoch": 0.6577695253568527,
19844
+ "grad_norm": 0.48255985975265503,
19845
+ "learning_rate": 0.00019160198359516456,
19846
+ "loss": 0.8771,
19847
+ "step": 2834
19848
+ },
19849
+ {
19850
+ "epoch": 0.6580016246953696,
19851
+ "grad_norm": 0.49954113364219666,
19852
+ "learning_rate": 0.00019159613212697108,
19853
+ "loss": 0.837,
19854
+ "step": 2835
19855
+ },
19856
+ {
19857
+ "epoch": 0.6582337240338865,
19858
+ "grad_norm": 0.45875173807144165,
19859
+ "learning_rate": 0.00019159027871034452,
19860
+ "loss": 0.9007,
19861
+ "step": 2836
19862
+ },
19863
+ {
19864
+ "epoch": 0.6584658233724033,
19865
+ "grad_norm": 0.4180905818939209,
19866
+ "learning_rate": 0.00019158442334540947,
19867
+ "loss": 0.9139,
19868
+ "step": 2837
19869
+ },
19870
+ {
19871
+ "epoch": 0.6586979227109203,
19872
+ "grad_norm": 0.492866188287735,
19873
+ "learning_rate": 0.00019157856603229048,
19874
+ "loss": 0.8481,
19875
+ "step": 2838
19876
+ },
19877
+ {
19878
+ "epoch": 0.6589300220494372,
19879
+ "grad_norm": 0.45765408873558044,
19880
+ "learning_rate": 0.0001915727067711121,
19881
+ "loss": 0.8913,
19882
+ "step": 2839
19883
+ },
19884
+ {
19885
+ "epoch": 0.659162121387954,
19886
+ "grad_norm": 0.4523009657859802,
19887
+ "learning_rate": 0.00019156684556199903,
19888
+ "loss": 0.8815,
19889
+ "step": 2840
19890
+ },
19891
+ {
19892
+ "epoch": 0.6593942207264709,
19893
+ "grad_norm": 0.463329941034317,
19894
+ "learning_rate": 0.00019156098240507592,
19895
+ "loss": 0.8844,
19896
+ "step": 2841
19897
+ },
19898
+ {
19899
+ "epoch": 0.6596263200649878,
19900
+ "grad_norm": 0.4301539957523346,
19901
+ "learning_rate": 0.00019155511730046748,
19902
+ "loss": 0.8209,
19903
+ "step": 2842
19904
+ },
19905
+ {
19906
+ "epoch": 0.6598584194035046,
19907
+ "grad_norm": 0.4687608480453491,
19908
+ "learning_rate": 0.0001915492502482985,
19909
+ "loss": 0.8791,
19910
+ "step": 2843
19911
+ },
19912
+ {
19913
+ "epoch": 0.6600905187420216,
19914
+ "grad_norm": 0.46065258979797363,
19915
+ "learning_rate": 0.00019154338124869377,
19916
+ "loss": 0.8791,
19917
+ "step": 2844
19918
+ },
19919
+ {
19920
+ "epoch": 0.6603226180805385,
19921
+ "grad_norm": 0.4436477720737457,
19922
+ "learning_rate": 0.0001915375103017781,
19923
+ "loss": 0.879,
19924
+ "step": 2845
19925
+ },
19926
+ {
19927
+ "epoch": 0.6605547174190554,
19928
+ "grad_norm": 0.4415607750415802,
19929
+ "learning_rate": 0.0001915316374076764,
19930
+ "loss": 0.8601,
19931
+ "step": 2846
19932
+ },
19933
+ {
19934
+ "epoch": 0.6607868167575722,
19935
+ "grad_norm": 0.46711909770965576,
19936
+ "learning_rate": 0.00019152576256651366,
19937
+ "loss": 0.8796,
19938
+ "step": 2847
19939
+ },
19940
+ {
19941
+ "epoch": 0.6610189160960891,
19942
+ "grad_norm": 0.4268472194671631,
19943
+ "learning_rate": 0.0001915198857784148,
19944
+ "loss": 0.8689,
19945
+ "step": 2848
19946
+ },
19947
+ {
19948
+ "epoch": 0.661251015434606,
19949
+ "grad_norm": 0.3973580002784729,
19950
+ "learning_rate": 0.0001915140070435048,
19951
+ "loss": 0.8466,
19952
+ "step": 2849
19953
+ },
19954
+ {
19955
+ "epoch": 0.6614831147731229,
19956
+ "grad_norm": 0.4282270669937134,
19957
+ "learning_rate": 0.00019150812636190874,
19958
+ "loss": 0.8451,
19959
+ "step": 2850
19960
  }
19961
  ],
19962
  "logging_steps": 1,
 
19976
  "attributes": {}
19977
  }
19978
  },
19979
+ "total_flos": 1.2650486414966784e+18,
19980
  "train_batch_size": 32,
19981
  "trial_name": null,
19982
  "trial_params": null