mosama commited on
Commit
c584dd0
·
verified ·
1 Parent(s): 48ca390

Training in progress, step 2900, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:789f226e44ff6175f0650db489f0554e7f69dc5b63c5b19f6f8f90422e097bc3
3
  size 1370666272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85e10776ed7d2feec702f85a92294fc572495be458fad36bc37e21242039a14d
3
  size 1370666272
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53c7ca855a09c6703804528921ba002a4454692bef620396449f5abdd6380228
3
  size 697294462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f328674c88f0186255ee5dbf4ac7f148eb4bef19de18a361c1ed0eb9ce9660bb
3
  size 697294462
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afaac9fbe4271faaba5196ab94e52163e6bf1b95bd8386498fc1f2c58b28a4a4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:605d9f0439096f21199e65a6f7490d22d8285df735f81d56920505482985be35
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6614831147731229,
5
  "eval_steps": 500,
6
- "global_step": 2850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -19957,6 +19957,356 @@
19957
  "learning_rate": 0.00019150812636190874,
19958
  "loss": 0.8451,
19959
  "step": 2850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19960
  }
19961
  ],
19962
  "logging_steps": 1,
@@ -19976,7 +20326,7 @@
19976
  "attributes": {}
19977
  }
19978
  },
19979
- "total_flos": 1.2650486414966784e+18,
19980
  "train_batch_size": 32,
19981
  "trial_name": null,
19982
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6730880816989672,
5
  "eval_steps": 500,
6
+ "global_step": 2900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
19957
  "learning_rate": 0.00019150812636190874,
19958
  "loss": 0.8451,
19959
  "step": 2850
19960
+ },
19961
+ {
19962
+ "epoch": 0.6617152141116398,
19963
+ "grad_norm": 0.45293116569519043,
19964
+ "learning_rate": 0.00019150224373375174,
19965
+ "loss": 0.9152,
19966
+ "step": 2851
19967
+ },
19968
+ {
19969
+ "epoch": 0.6619473134501567,
19970
+ "grad_norm": 0.49578094482421875,
19971
+ "learning_rate": 0.00019149635915915889,
19972
+ "loss": 0.8429,
19973
+ "step": 2852
19974
+ },
19975
+ {
19976
+ "epoch": 0.6621794127886735,
19977
+ "grad_norm": 0.45070314407348633,
19978
+ "learning_rate": 0.00019149047263825538,
19979
+ "loss": 0.829,
19980
+ "step": 2853
19981
+ },
19982
+ {
19983
+ "epoch": 0.6624115121271904,
19984
+ "grad_norm": 0.44752323627471924,
19985
+ "learning_rate": 0.00019148458417116645,
19986
+ "loss": 0.874,
19987
+ "step": 2854
19988
+ },
19989
+ {
19990
+ "epoch": 0.6626436114657073,
19991
+ "grad_norm": 0.4903758466243744,
19992
+ "learning_rate": 0.00019147869375801734,
19993
+ "loss": 0.8787,
19994
+ "step": 2855
19995
+ },
19996
+ {
19997
+ "epoch": 0.6628757108042242,
19998
+ "grad_norm": 0.43119940161705017,
19999
+ "learning_rate": 0.00019147280139893337,
20000
+ "loss": 0.8978,
20001
+ "step": 2856
20002
+ },
20003
+ {
20004
+ "epoch": 0.6631078101427411,
20005
+ "grad_norm": 0.5306719541549683,
20006
+ "learning_rate": 0.00019146690709403988,
20007
+ "loss": 0.9067,
20008
+ "step": 2857
20009
+ },
20010
+ {
20011
+ "epoch": 0.663339909481258,
20012
+ "grad_norm": 0.45615947246551514,
20013
+ "learning_rate": 0.0001914610108434622,
20014
+ "loss": 0.839,
20015
+ "step": 2858
20016
+ },
20017
+ {
20018
+ "epoch": 0.6635720088197749,
20019
+ "grad_norm": 0.4449672996997833,
20020
+ "learning_rate": 0.00019145511264732584,
20021
+ "loss": 0.8675,
20022
+ "step": 2859
20023
+ },
20024
+ {
20025
+ "epoch": 0.6638041081582917,
20026
+ "grad_norm": 0.4791627526283264,
20027
+ "learning_rate": 0.00019144921250575619,
20028
+ "loss": 0.8853,
20029
+ "step": 2860
20030
+ },
20031
+ {
20032
+ "epoch": 0.6640362074968086,
20033
+ "grad_norm": 0.4788571298122406,
20034
+ "learning_rate": 0.00019144331041887882,
20035
+ "loss": 0.9273,
20036
+ "step": 2861
20037
+ },
20038
+ {
20039
+ "epoch": 0.6642683068353256,
20040
+ "grad_norm": 0.42612382769584656,
20041
+ "learning_rate": 0.00019143740638681922,
20042
+ "loss": 0.893,
20043
+ "step": 2862
20044
+ },
20045
+ {
20046
+ "epoch": 0.6645004061738424,
20047
+ "grad_norm": 0.47250261902809143,
20048
+ "learning_rate": 0.000191431500409703,
20049
+ "loss": 0.8795,
20050
+ "step": 2863
20051
+ },
20052
+ {
20053
+ "epoch": 0.6647325055123593,
20054
+ "grad_norm": 0.4971529543399811,
20055
+ "learning_rate": 0.00019142559248765587,
20056
+ "loss": 0.9454,
20057
+ "step": 2864
20058
+ },
20059
+ {
20060
+ "epoch": 0.6649646048508762,
20061
+ "grad_norm": 0.5304151177406311,
20062
+ "learning_rate": 0.00019141968262080335,
20063
+ "loss": 0.9761,
20064
+ "step": 2865
20065
+ },
20066
+ {
20067
+ "epoch": 0.665196704189393,
20068
+ "grad_norm": 0.4551432728767395,
20069
+ "learning_rate": 0.00019141377080927132,
20070
+ "loss": 0.8263,
20071
+ "step": 2866
20072
+ },
20073
+ {
20074
+ "epoch": 0.6654288035279099,
20075
+ "grad_norm": 0.4839153289794922,
20076
+ "learning_rate": 0.0001914078570531854,
20077
+ "loss": 0.8516,
20078
+ "step": 2867
20079
+ },
20080
+ {
20081
+ "epoch": 0.6656609028664269,
20082
+ "grad_norm": 0.501598060131073,
20083
+ "learning_rate": 0.0001914019413526715,
20084
+ "loss": 0.8753,
20085
+ "step": 2868
20086
+ },
20087
+ {
20088
+ "epoch": 0.6658930022049437,
20089
+ "grad_norm": 0.49526655673980713,
20090
+ "learning_rate": 0.00019139602370785538,
20091
+ "loss": 0.8342,
20092
+ "step": 2869
20093
+ },
20094
+ {
20095
+ "epoch": 0.6661251015434606,
20096
+ "grad_norm": 0.49142616987228394,
20097
+ "learning_rate": 0.00019139010411886291,
20098
+ "loss": 0.8389,
20099
+ "step": 2870
20100
+ },
20101
+ {
20102
+ "epoch": 0.6663572008819775,
20103
+ "grad_norm": 0.5267114639282227,
20104
+ "learning_rate": 0.00019138418258582006,
20105
+ "loss": 0.8339,
20106
+ "step": 2871
20107
+ },
20108
+ {
20109
+ "epoch": 0.6665893002204943,
20110
+ "grad_norm": 0.42393583059310913,
20111
+ "learning_rate": 0.0001913782591088528,
20112
+ "loss": 0.8576,
20113
+ "step": 2872
20114
+ },
20115
+ {
20116
+ "epoch": 0.6668213995590112,
20117
+ "grad_norm": 0.4962637722492218,
20118
+ "learning_rate": 0.0001913723336880871,
20119
+ "loss": 0.837,
20120
+ "step": 2873
20121
+ },
20122
+ {
20123
+ "epoch": 0.6670534988975282,
20124
+ "grad_norm": 0.4471946656703949,
20125
+ "learning_rate": 0.000191366406323649,
20126
+ "loss": 0.8259,
20127
+ "step": 2874
20128
+ },
20129
+ {
20130
+ "epoch": 0.667285598236045,
20131
+ "grad_norm": 0.48034703731536865,
20132
+ "learning_rate": 0.00019136047701566464,
20133
+ "loss": 0.8537,
20134
+ "step": 2875
20135
+ },
20136
+ {
20137
+ "epoch": 0.6675176975745619,
20138
+ "grad_norm": 0.47116121649742126,
20139
+ "learning_rate": 0.0001913545457642601,
20140
+ "loss": 0.8252,
20141
+ "step": 2876
20142
+ },
20143
+ {
20144
+ "epoch": 0.6677497969130788,
20145
+ "grad_norm": 0.5071761012077332,
20146
+ "learning_rate": 0.00019134861256956155,
20147
+ "loss": 0.898,
20148
+ "step": 2877
20149
+ },
20150
+ {
20151
+ "epoch": 0.6679818962515957,
20152
+ "grad_norm": 0.4993492662906647,
20153
+ "learning_rate": 0.00019134267743169524,
20154
+ "loss": 0.8555,
20155
+ "step": 2878
20156
+ },
20157
+ {
20158
+ "epoch": 0.6682139955901125,
20159
+ "grad_norm": 0.5150817036628723,
20160
+ "learning_rate": 0.00019133674035078736,
20161
+ "loss": 0.8624,
20162
+ "step": 2879
20163
+ },
20164
+ {
20165
+ "epoch": 0.6684460949286295,
20166
+ "grad_norm": 0.5153425931930542,
20167
+ "learning_rate": 0.00019133080132696426,
20168
+ "loss": 0.8093,
20169
+ "step": 2880
20170
+ },
20171
+ {
20172
+ "epoch": 0.6686781942671464,
20173
+ "grad_norm": 0.4248557686805725,
20174
+ "learning_rate": 0.00019132486036035226,
20175
+ "loss": 0.8488,
20176
+ "step": 2881
20177
+ },
20178
+ {
20179
+ "epoch": 0.6689102936056632,
20180
+ "grad_norm": 0.4647797644138336,
20181
+ "learning_rate": 0.0001913189174510777,
20182
+ "loss": 0.9239,
20183
+ "step": 2882
20184
+ },
20185
+ {
20186
+ "epoch": 0.6691423929441801,
20187
+ "grad_norm": 0.5158550143241882,
20188
+ "learning_rate": 0.00019131297259926706,
20189
+ "loss": 0.8746,
20190
+ "step": 2883
20191
+ },
20192
+ {
20193
+ "epoch": 0.669374492282697,
20194
+ "grad_norm": 0.4511086344718933,
20195
+ "learning_rate": 0.00019130702580504676,
20196
+ "loss": 0.897,
20197
+ "step": 2884
20198
+ },
20199
+ {
20200
+ "epoch": 0.6696065916212138,
20201
+ "grad_norm": 0.5059782862663269,
20202
+ "learning_rate": 0.0001913010770685433,
20203
+ "loss": 0.8666,
20204
+ "step": 2885
20205
+ },
20206
+ {
20207
+ "epoch": 0.6698386909597308,
20208
+ "grad_norm": 0.4928185045719147,
20209
+ "learning_rate": 0.00019129512638988322,
20210
+ "loss": 0.842,
20211
+ "step": 2886
20212
+ },
20213
+ {
20214
+ "epoch": 0.6700707902982477,
20215
+ "grad_norm": 0.5002438426017761,
20216
+ "learning_rate": 0.00019128917376919313,
20217
+ "loss": 0.9076,
20218
+ "step": 2887
20219
+ },
20220
+ {
20221
+ "epoch": 0.6703028896367645,
20222
+ "grad_norm": 0.427513986825943,
20223
+ "learning_rate": 0.0001912832192065996,
20224
+ "loss": 0.8238,
20225
+ "step": 2888
20226
+ },
20227
+ {
20228
+ "epoch": 0.6705349889752814,
20229
+ "grad_norm": 0.45401087403297424,
20230
+ "learning_rate": 0.0001912772627022294,
20231
+ "loss": 0.8605,
20232
+ "step": 2889
20233
+ },
20234
+ {
20235
+ "epoch": 0.6707670883137983,
20236
+ "grad_norm": 0.43657442927360535,
20237
+ "learning_rate": 0.0001912713042562091,
20238
+ "loss": 0.8506,
20239
+ "step": 2890
20240
+ },
20241
+ {
20242
+ "epoch": 0.6709991876523151,
20243
+ "grad_norm": 0.41969212889671326,
20244
+ "learning_rate": 0.00019126534386866556,
20245
+ "loss": 0.8791,
20246
+ "step": 2891
20247
+ },
20248
+ {
20249
+ "epoch": 0.6712312869908321,
20250
+ "grad_norm": 0.46783447265625,
20251
+ "learning_rate": 0.00019125938153972548,
20252
+ "loss": 0.8774,
20253
+ "step": 2892
20254
+ },
20255
+ {
20256
+ "epoch": 0.671463386329349,
20257
+ "grad_norm": 0.44763606786727905,
20258
+ "learning_rate": 0.00019125341726951577,
20259
+ "loss": 0.9214,
20260
+ "step": 2893
20261
+ },
20262
+ {
20263
+ "epoch": 0.6716954856678659,
20264
+ "grad_norm": 0.46709761023521423,
20265
+ "learning_rate": 0.00019124745105816325,
20266
+ "loss": 0.8276,
20267
+ "step": 2894
20268
+ },
20269
+ {
20270
+ "epoch": 0.6719275850063827,
20271
+ "grad_norm": 0.471754252910614,
20272
+ "learning_rate": 0.0001912414829057949,
20273
+ "loss": 0.8645,
20274
+ "step": 2895
20275
+ },
20276
+ {
20277
+ "epoch": 0.6721596843448996,
20278
+ "grad_norm": 0.4268680810928345,
20279
+ "learning_rate": 0.00019123551281253757,
20280
+ "loss": 0.8376,
20281
+ "step": 2896
20282
+ },
20283
+ {
20284
+ "epoch": 0.6723917836834165,
20285
+ "grad_norm": 0.4184396266937256,
20286
+ "learning_rate": 0.00019122954077851833,
20287
+ "loss": 0.87,
20288
+ "step": 2897
20289
+ },
20290
+ {
20291
+ "epoch": 0.6726238830219334,
20292
+ "grad_norm": 0.48813703656196594,
20293
+ "learning_rate": 0.0001912235668038642,
20294
+ "loss": 0.866,
20295
+ "step": 2898
20296
+ },
20297
+ {
20298
+ "epoch": 0.6728559823604503,
20299
+ "grad_norm": 0.4599473774433136,
20300
+ "learning_rate": 0.00019121759088870226,
20301
+ "loss": 0.834,
20302
+ "step": 2899
20303
+ },
20304
+ {
20305
+ "epoch": 0.6730880816989672,
20306
+ "grad_norm": 0.4024162292480469,
20307
+ "learning_rate": 0.00019121161303315963,
20308
+ "loss": 0.8731,
20309
+ "step": 2900
20310
  }
20311
  ],
20312
  "logging_steps": 1,
 
20326
  "attributes": {}
20327
  }
20328
  },
20329
+ "total_flos": 1.2872424773124096e+18,
20330
  "train_batch_size": 32,
20331
  "trial_name": null,
20332
  "trial_params": null