vantaa32 committed on
Commit
c78e8a8
·
verified ·
1 Parent(s): c41aab3

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- library_name: transformers
3
- tags: []
4
  ---
5
 
6
  # Model Card for Model ID
@@ -15,7 +15,7 @@ tags: []
15
 
16
  <!-- Provide a longer summary of what this model is. -->
17
 
18
- This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
 
20
  - **Developed by:** [More Information Needed]
21
  - **Funded by [optional]:** [More Information Needed]
@@ -196,4 +196,7 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
196
 
197
  ## Model Card Contact
198
 
199
- [More Information Needed]
 
 
 
 
1
  ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
  ---
5
 
6
  # Model Card for Model ID
 
15
 
16
  <!-- Provide a longer summary of what this model is. -->
17
 
18
+
19
 
20
  - **Developed by:** [More Information Needed]
21
  - **Funded by [optional]:** [More Information Needed]
 
196
 
197
  ## Model Card Contact
198
 
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
adapter_config.json CHANGED
@@ -16,8 +16,8 @@
16
  "revision": null,
17
  "scaling": 64.0,
18
  "target_modules": [
19
- "q_proj",
20
- "v_proj"
21
  ],
22
  "task_type": "CAUSAL_LM"
23
  }
 
16
  "revision": null,
17
  "scaling": 64.0,
18
  "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
  ],
22
  "task_type": "CAUSAL_LM"
23
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74416b783a361c9e37331d56c58c581071c3d44b5f7fe6b6c629b0ea2a18d0c0
3
  size 38408880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d69bc4c97b1766bc7558b33bbac997e4df8578d70bfcb47f300c56ee94ab771
3
  size 38408880
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3058638990c9146c6a757db139f5e8ca3ab6d1f6d59d3e4741ebcbc65fe68a
3
+ size 76854010
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc99f71a36fe65bf6a5e136aabd4a8cacdb3328a64f5d463f6de79f65a09eba
3
+ size 14180
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c27ec41164777d5a60d46ff725d04fa16ace6e4584569a815a0c88eecdd1c934
3
+ size 1064
tokenizer_config.json CHANGED
@@ -33,7 +33,7 @@
33
  "eos_token": "</s>",
34
  "extra_special_tokens": {},
35
  "legacy": false,
36
- "model_max_length": 4096,
37
  "pad_token": "<unk>",
38
  "padding_side": "right",
39
  "sp_model_kwargs": {},
 
33
  "eos_token": "</s>",
34
  "extra_special_tokens": {},
35
  "legacy": false,
36
+ "model_max_length": 2048,
37
  "pad_token": "<unk>",
38
  "padding_side": "right",
39
  "sp_model_kwargs": {},
trainer_state.json ADDED
@@ -0,0 +1,1142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 5,
6
+ "global_step": 7764,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.019319938176197836,
13
+ "grad_norm": 0.0015910037327557802,
14
+ "learning_rate": 0.000993560020607934,
15
+ "loss": 0.9793,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.03863987635239567,
20
+ "grad_norm": 0.0015677690971642733,
21
+ "learning_rate": 0.0009871200412158681,
22
+ "loss": 0.9245,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.05795981452859351,
27
+ "grad_norm": 0.0019664347637444735,
28
+ "learning_rate": 0.0009806800618238022,
29
+ "loss": 0.9161,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.07727975270479134,
34
+ "grad_norm": 0.0017669295193627477,
35
+ "learning_rate": 0.0009742400824317363,
36
+ "loss": 0.8992,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.09659969088098919,
41
+ "grad_norm": 0.0014395161997526884,
42
+ "learning_rate": 0.0009678001030396702,
43
+ "loss": 0.903,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.11591962905718702,
48
+ "grad_norm": 0.0019409521482884884,
49
+ "learning_rate": 0.0009613601236476044,
50
+ "loss": 0.9156,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.13523956723338484,
55
+ "grad_norm": 0.0017209590878337622,
56
+ "learning_rate": 0.0009549201442555384,
57
+ "loss": 0.8962,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.1545595054095827,
62
+ "grad_norm": 0.0018070234218612313,
63
+ "learning_rate": 0.0009484801648634724,
64
+ "loss": 0.901,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.17387944358578053,
69
+ "grad_norm": 0.0015913312090560794,
70
+ "learning_rate": 0.0009420401854714065,
71
+ "loss": 0.8757,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.19319938176197837,
76
+ "grad_norm": 0.001516812015324831,
77
+ "learning_rate": 0.0009356002060793406,
78
+ "loss": 0.9066,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.2125193199381762,
83
+ "grad_norm": 0.0024989296216517687,
84
+ "learning_rate": 0.0009291602266872746,
85
+ "loss": 0.9071,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.23183925811437403,
90
+ "grad_norm": 0.0012714399490505457,
91
+ "learning_rate": 0.0009227202472952086,
92
+ "loss": 0.9258,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.2511591962905719,
97
+ "grad_norm": 0.0011651200475171208,
98
+ "learning_rate": 0.0009162802679031428,
99
+ "loss": 0.8831,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.2704791344667697,
104
+ "grad_norm": 0.001427617622539401,
105
+ "learning_rate": 0.0009098402885110768,
106
+ "loss": 0.8902,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.28979907264296756,
111
+ "grad_norm": 0.0013865921646356583,
112
+ "learning_rate": 0.0009034003091190108,
113
+ "loss": 0.8841,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.3091190108191654,
118
+ "grad_norm": 0.0013833673438057303,
119
+ "learning_rate": 0.0008969603297269449,
120
+ "loss": 0.8972,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.3284389489953632,
125
+ "grad_norm": 0.0014730911934748292,
126
+ "learning_rate": 0.000890520350334879,
127
+ "loss": 0.9071,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.34775888717156106,
132
+ "grad_norm": 0.0017960412660613656,
133
+ "learning_rate": 0.000884080370942813,
134
+ "loss": 0.8876,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.3670788253477589,
139
+ "grad_norm": 0.0018052643863484263,
140
+ "learning_rate": 0.0008776403915507471,
141
+ "loss": 0.8858,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.38639876352395675,
146
+ "grad_norm": 0.0014563511358574033,
147
+ "learning_rate": 0.000871200412158681,
148
+ "loss": 0.8958,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.40571870170015456,
153
+ "grad_norm": 0.0019182608230039477,
154
+ "learning_rate": 0.0008647604327666152,
155
+ "loss": 0.8933,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.4250386398763524,
160
+ "grad_norm": 0.001481884391978383,
161
+ "learning_rate": 0.0008583204533745493,
162
+ "loss": 0.868,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.44435857805255025,
167
+ "grad_norm": 0.0013063091319054365,
168
+ "learning_rate": 0.0008518804739824832,
169
+ "loss": 0.8674,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.46367851622874806,
174
+ "grad_norm": 0.0016475298907607794,
175
+ "learning_rate": 0.0008454404945904173,
176
+ "loss": 0.8878,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.48299845440494593,
181
+ "grad_norm": 0.001497351098805666,
182
+ "learning_rate": 0.0008390005151983514,
183
+ "loss": 0.8949,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.5023183925811437,
188
+ "grad_norm": 0.0014308547833934426,
189
+ "learning_rate": 0.0008325605358062854,
190
+ "loss": 0.8982,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.5216383307573416,
195
+ "grad_norm": 0.0016038663452491164,
196
+ "learning_rate": 0.0008261205564142195,
197
+ "loss": 0.9057,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.5409582689335394,
202
+ "grad_norm": 0.001774997217580676,
203
+ "learning_rate": 0.0008196805770221536,
204
+ "loss": 0.8695,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.5602782071097373,
209
+ "grad_norm": 0.0019299176055938005,
210
+ "learning_rate": 0.0008132405976300876,
211
+ "loss": 0.8832,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.5795981452859351,
216
+ "grad_norm": 0.0020758837927132845,
217
+ "learning_rate": 0.0008068006182380216,
218
+ "loss": 0.892,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.5989180834621329,
223
+ "grad_norm": 0.0016486513195559382,
224
+ "learning_rate": 0.0008003606388459557,
225
+ "loss": 0.8869,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 0.6182380216383307,
230
+ "grad_norm": 0.0013964808313176036,
231
+ "learning_rate": 0.0007939206594538898,
232
+ "loss": 0.8946,
233
+ "step": 1600
234
+ },
235
+ {
236
+ "epoch": 0.6375579598145286,
237
+ "grad_norm": 0.0016684457659721375,
238
+ "learning_rate": 0.0007874806800618238,
239
+ "loss": 0.8884,
240
+ "step": 1650
241
+ },
242
+ {
243
+ "epoch": 0.6568778979907264,
244
+ "grad_norm": 0.0016556017799302936,
245
+ "learning_rate": 0.0007810407006697579,
246
+ "loss": 0.8877,
247
+ "step": 1700
248
+ },
249
+ {
250
+ "epoch": 0.6761978361669243,
251
+ "grad_norm": 0.002399856923148036,
252
+ "learning_rate": 0.0007746007212776918,
253
+ "loss": 0.9077,
254
+ "step": 1750
255
+ },
256
+ {
257
+ "epoch": 0.6955177743431221,
258
+ "grad_norm": 0.0016722239088267088,
259
+ "learning_rate": 0.000768160741885626,
260
+ "loss": 0.8986,
261
+ "step": 1800
262
+ },
263
+ {
264
+ "epoch": 0.7148377125193199,
265
+ "grad_norm": 0.001298186951316893,
266
+ "learning_rate": 0.0007617207624935601,
267
+ "loss": 0.8959,
268
+ "step": 1850
269
+ },
270
+ {
271
+ "epoch": 0.7341576506955177,
272
+ "grad_norm": 0.0012781182304024696,
273
+ "learning_rate": 0.000755280783101494,
274
+ "loss": 0.8788,
275
+ "step": 1900
276
+ },
277
+ {
278
+ "epoch": 0.7534775888717156,
279
+ "grad_norm": 0.001566325663588941,
280
+ "learning_rate": 0.0007488408037094282,
281
+ "loss": 0.8982,
282
+ "step": 1950
283
+ },
284
+ {
285
+ "epoch": 0.7727975270479135,
286
+ "grad_norm": 0.0017126682214438915,
287
+ "learning_rate": 0.0007424008243173622,
288
+ "loss": 0.8983,
289
+ "step": 2000
290
+ },
291
+ {
292
+ "epoch": 0.7921174652241113,
293
+ "grad_norm": 0.0013752984814345837,
294
+ "learning_rate": 0.0007359608449252962,
295
+ "loss": 0.8922,
296
+ "step": 2050
297
+ },
298
+ {
299
+ "epoch": 0.8114374034003091,
300
+ "grad_norm": 0.0020329777617007494,
301
+ "learning_rate": 0.0007295208655332303,
302
+ "loss": 0.8923,
303
+ "step": 2100
304
+ },
305
+ {
306
+ "epoch": 0.8307573415765069,
307
+ "grad_norm": 0.0018424971494823694,
308
+ "learning_rate": 0.0007230808861411644,
309
+ "loss": 0.8749,
310
+ "step": 2150
311
+ },
312
+ {
313
+ "epoch": 0.8500772797527048,
314
+ "grad_norm": 0.0019076282624155283,
315
+ "learning_rate": 0.0007166409067490984,
316
+ "loss": 0.8986,
317
+ "step": 2200
318
+ },
319
+ {
320
+ "epoch": 0.8693972179289027,
321
+ "grad_norm": 0.0014463101979345083,
322
+ "learning_rate": 0.0007102009273570324,
323
+ "loss": 0.8691,
324
+ "step": 2250
325
+ },
326
+ {
327
+ "epoch": 0.8887171561051005,
328
+ "grad_norm": 0.0014440215891227126,
329
+ "learning_rate": 0.0007037609479649665,
330
+ "loss": 0.8827,
331
+ "step": 2300
332
+ },
333
+ {
334
+ "epoch": 0.9080370942812983,
335
+ "grad_norm": 0.0016820535529404879,
336
+ "learning_rate": 0.0006973209685729006,
337
+ "loss": 0.8973,
338
+ "step": 2350
339
+ },
340
+ {
341
+ "epoch": 0.9273570324574961,
342
+ "grad_norm": 0.0013854255666956306,
343
+ "learning_rate": 0.0006908809891808346,
344
+ "loss": 0.8859,
345
+ "step": 2400
346
+ },
347
+ {
348
+ "epoch": 0.9466769706336939,
349
+ "grad_norm": 0.002300039865076542,
350
+ "learning_rate": 0.0006844410097887687,
351
+ "loss": 0.8829,
352
+ "step": 2450
353
+ },
354
+ {
355
+ "epoch": 0.9659969088098919,
356
+ "grad_norm": 0.0015681901713833213,
357
+ "learning_rate": 0.0006780010303967026,
358
+ "loss": 0.8909,
359
+ "step": 2500
360
+ },
361
+ {
362
+ "epoch": 0.9853168469860897,
363
+ "grad_norm": 0.0021285219117999077,
364
+ "learning_rate": 0.0006715610510046368,
365
+ "loss": 0.8852,
366
+ "step": 2550
367
+ },
368
+ {
369
+ "epoch": 1.0,
370
+ "eval_loss": 0.8796888589859009,
371
+ "eval_runtime": 3690.6011,
372
+ "eval_samples_per_second": 2.805,
373
+ "eval_steps_per_second": 0.351,
374
+ "step": 2588
375
+ },
376
+ {
377
+ "epoch": 1.0046367851622875,
378
+ "grad_norm": 0.0018459343118593097,
379
+ "learning_rate": 0.0006651210716125709,
380
+ "loss": 0.9123,
381
+ "step": 2600
382
+ },
383
+ {
384
+ "epoch": 1.0239567233384854,
385
+ "grad_norm": 0.0020025072153657675,
386
+ "learning_rate": 0.0006586810922205048,
387
+ "loss": 0.8976,
388
+ "step": 2650
389
+ },
390
+ {
391
+ "epoch": 1.0432766615146831,
392
+ "grad_norm": 0.002290483098477125,
393
+ "learning_rate": 0.000652241112828439,
394
+ "loss": 0.8649,
395
+ "step": 2700
396
+ },
397
+ {
398
+ "epoch": 1.062596599690881,
399
+ "grad_norm": 0.0015446515753865242,
400
+ "learning_rate": 0.0006458011334363731,
401
+ "loss": 0.9013,
402
+ "step": 2750
403
+ },
404
+ {
405
+ "epoch": 1.0819165378670788,
406
+ "grad_norm": 0.0015362042468041182,
407
+ "learning_rate": 0.000639361154044307,
408
+ "loss": 0.881,
409
+ "step": 2800
410
+ },
411
+ {
412
+ "epoch": 1.1012364760432767,
413
+ "grad_norm": 0.0013964643003419042,
414
+ "learning_rate": 0.0006329211746522411,
415
+ "loss": 0.893,
416
+ "step": 2850
417
+ },
418
+ {
419
+ "epoch": 1.1205564142194744,
420
+ "grad_norm": 0.0014078960521146655,
421
+ "learning_rate": 0.0006264811952601752,
422
+ "loss": 0.8959,
423
+ "step": 2900
424
+ },
425
+ {
426
+ "epoch": 1.1398763523956723,
427
+ "grad_norm": 0.0012278002686798573,
428
+ "learning_rate": 0.0006200412158681092,
429
+ "loss": 0.8737,
430
+ "step": 2950
431
+ },
432
+ {
433
+ "epoch": 1.1591962905718702,
434
+ "grad_norm": 0.0016700943233445287,
435
+ "learning_rate": 0.0006136012364760433,
436
+ "loss": 0.8926,
437
+ "step": 3000
438
+ },
439
+ {
440
+ "epoch": 1.178516228748068,
441
+ "grad_norm": 0.0013488128315657377,
442
+ "learning_rate": 0.0006071612570839773,
443
+ "loss": 0.8822,
444
+ "step": 3050
445
+ },
446
+ {
447
+ "epoch": 1.1978361669242659,
448
+ "grad_norm": 0.0013352310052141547,
449
+ "learning_rate": 0.0006007212776919114,
450
+ "loss": 0.9086,
451
+ "step": 3100
452
+ },
453
+ {
454
+ "epoch": 1.2171561051004636,
455
+ "grad_norm": 0.0015964731574058533,
456
+ "learning_rate": 0.0005942812982998454,
457
+ "loss": 0.8726,
458
+ "step": 3150
459
+ },
460
+ {
461
+ "epoch": 1.2364760432766615,
462
+ "grad_norm": 0.0015036857221275568,
463
+ "learning_rate": 0.0005878413189077795,
464
+ "loss": 0.8858,
465
+ "step": 3200
466
+ },
467
+ {
468
+ "epoch": 1.2557959814528594,
469
+ "grad_norm": 0.0014933788916096091,
470
+ "learning_rate": 0.0005814013395157137,
471
+ "loss": 0.8669,
472
+ "step": 3250
473
+ },
474
+ {
475
+ "epoch": 1.2751159196290571,
476
+ "grad_norm": 0.0014979959232732654,
477
+ "learning_rate": 0.0005749613601236476,
478
+ "loss": 0.8581,
479
+ "step": 3300
480
+ },
481
+ {
482
+ "epoch": 1.294435857805255,
483
+ "grad_norm": 0.001521074096672237,
484
+ "learning_rate": 0.0005685213807315817,
485
+ "loss": 0.8876,
486
+ "step": 3350
487
+ },
488
+ {
489
+ "epoch": 1.3137557959814528,
490
+ "grad_norm": 0.0013812105171382427,
491
+ "learning_rate": 0.0005620814013395156,
492
+ "loss": 0.8759,
493
+ "step": 3400
494
+ },
495
+ {
496
+ "epoch": 1.3330757341576507,
497
+ "grad_norm": 0.001809781650081277,
498
+ "learning_rate": 0.0005556414219474498,
499
+ "loss": 0.8637,
500
+ "step": 3450
501
+ },
502
+ {
503
+ "epoch": 1.3523956723338486,
504
+ "grad_norm": 0.001580111333169043,
505
+ "learning_rate": 0.0005492014425553839,
506
+ "loss": 0.8653,
507
+ "step": 3500
508
+ },
509
+ {
510
+ "epoch": 1.3717156105100463,
511
+ "grad_norm": 0.002373141935095191,
512
+ "learning_rate": 0.0005427614631633179,
513
+ "loss": 0.8998,
514
+ "step": 3550
515
+ },
516
+ {
517
+ "epoch": 1.3910355486862442,
518
+ "grad_norm": 0.0015024031745269895,
519
+ "learning_rate": 0.0005363214837712519,
520
+ "loss": 0.876,
521
+ "step": 3600
522
+ },
523
+ {
524
+ "epoch": 1.410355486862442,
525
+ "grad_norm": 0.0019381038146093488,
526
+ "learning_rate": 0.000529881504379186,
527
+ "loss": 0.8792,
528
+ "step": 3650
529
+ },
530
+ {
531
+ "epoch": 1.4296754250386399,
532
+ "grad_norm": 0.0015559702878817916,
533
+ "learning_rate": 0.0005234415249871201,
534
+ "loss": 0.8655,
535
+ "step": 3700
536
+ },
537
+ {
538
+ "epoch": 1.4489953632148378,
539
+ "grad_norm": 0.0015072495443746448,
540
+ "learning_rate": 0.0005170015455950541,
541
+ "loss": 0.8691,
542
+ "step": 3750
543
+ },
544
+ {
545
+ "epoch": 1.4683153013910355,
546
+ "grad_norm": 0.0016321117291226983,
547
+ "learning_rate": 0.0005105615662029882,
548
+ "loss": 0.8899,
549
+ "step": 3800
550
+ },
551
+ {
552
+ "epoch": 1.4876352395672334,
553
+ "grad_norm": 0.001877523958683014,
554
+ "learning_rate": 0.0005041215868109223,
555
+ "loss": 0.8605,
556
+ "step": 3850
557
+ },
558
+ {
559
+ "epoch": 1.5069551777434311,
560
+ "grad_norm": 0.0015456199180334806,
561
+ "learning_rate": 0.0004976816074188562,
562
+ "loss": 0.8619,
563
+ "step": 3900
564
+ },
565
+ {
566
+ "epoch": 1.526275115919629,
567
+ "grad_norm": 0.001545245642773807,
568
+ "learning_rate": 0.0004912416280267903,
569
+ "loss": 0.8807,
570
+ "step": 3950
571
+ },
572
+ {
573
+ "epoch": 1.545595054095827,
574
+ "grad_norm": 0.0014278549933806062,
575
+ "learning_rate": 0.00048480164863472436,
576
+ "loss": 0.8929,
577
+ "step": 4000
578
+ },
579
+ {
580
+ "epoch": 1.5649149922720247,
581
+ "grad_norm": 0.0016315317479893565,
582
+ "learning_rate": 0.0004783616692426584,
583
+ "loss": 0.8707,
584
+ "step": 4050
585
+ },
586
+ {
587
+ "epoch": 1.5842349304482226,
588
+ "grad_norm": 0.0015835491940379143,
589
+ "learning_rate": 0.0004719216898505925,
590
+ "loss": 0.8852,
591
+ "step": 4100
592
+ },
593
+ {
594
+ "epoch": 1.6035548686244203,
595
+ "grad_norm": 0.001612671185284853,
596
+ "learning_rate": 0.00046548171045852655,
597
+ "loss": 0.8796,
598
+ "step": 4150
599
+ },
600
+ {
601
+ "epoch": 1.6228748068006182,
602
+ "grad_norm": 0.0013706408208236098,
603
+ "learning_rate": 0.00045904173106646055,
604
+ "loss": 0.8708,
605
+ "step": 4200
606
+ },
607
+ {
608
+ "epoch": 1.6421947449768162,
609
+ "grad_norm": 0.0015082815662026405,
610
+ "learning_rate": 0.00045260175167439467,
611
+ "loss": 0.8963,
612
+ "step": 4250
613
+ },
614
+ {
615
+ "epoch": 1.6615146831530139,
616
+ "grad_norm": 0.0013959509087726474,
617
+ "learning_rate": 0.0004461617722823287,
618
+ "loss": 0.8721,
619
+ "step": 4300
620
+ },
621
+ {
622
+ "epoch": 1.6808346213292118,
623
+ "grad_norm": 0.0012363146524876356,
624
+ "learning_rate": 0.00043972179289026274,
625
+ "loss": 0.8722,
626
+ "step": 4350
627
+ },
628
+ {
629
+ "epoch": 1.7001545595054095,
630
+ "grad_norm": 0.0014632450183853507,
631
+ "learning_rate": 0.00043328181349819685,
632
+ "loss": 0.8818,
633
+ "step": 4400
634
+ },
635
+ {
636
+ "epoch": 1.7194744976816074,
637
+ "grad_norm": 0.0014374173479154706,
638
+ "learning_rate": 0.00042684183410613086,
639
+ "loss": 0.8841,
640
+ "step": 4450
641
+ },
642
+ {
643
+ "epoch": 1.7387944358578054,
644
+ "grad_norm": 0.0019015870057046413,
645
+ "learning_rate": 0.0004204018547140649,
646
+ "loss": 0.8885,
647
+ "step": 4500
648
+ },
649
+ {
650
+ "epoch": 1.758114374034003,
651
+ "grad_norm": 0.0018965965136885643,
652
+ "learning_rate": 0.000413961875321999,
653
+ "loss": 0.875,
654
+ "step": 4550
655
+ },
656
+ {
657
+ "epoch": 1.7774343122102008,
658
+ "grad_norm": 0.001437367289327085,
659
+ "learning_rate": 0.00040752189592993304,
660
+ "loss": 0.8618,
661
+ "step": 4600
662
+ },
663
+ {
664
+ "epoch": 1.7967542503863987,
665
+ "grad_norm": 0.0015649065608158708,
666
+ "learning_rate": 0.00040108191653786705,
667
+ "loss": 0.8914,
668
+ "step": 4650
669
+ },
670
+ {
671
+ "epoch": 1.8160741885625966,
672
+ "grad_norm": 0.001809162669815123,
673
+ "learning_rate": 0.00039464193714580116,
674
+ "loss": 0.8769,
675
+ "step": 4700
676
+ },
677
+ {
678
+ "epoch": 1.8353941267387945,
679
+ "grad_norm": 0.001575763919390738,
680
+ "learning_rate": 0.00038820195775373517,
681
+ "loss": 0.8873,
682
+ "step": 4750
683
+ },
684
+ {
685
+ "epoch": 1.8547140649149922,
686
+ "grad_norm": 0.0015713346656411886,
687
+ "learning_rate": 0.00038176197836166923,
688
+ "loss": 0.8689,
689
+ "step": 4800
690
+ },
691
+ {
692
+ "epoch": 1.87403400309119,
693
+ "grad_norm": 0.001345846801996231,
694
+ "learning_rate": 0.0003753219989696033,
695
+ "loss": 0.8759,
696
+ "step": 4850
697
+ },
698
+ {
699
+ "epoch": 1.8933539412673879,
700
+ "grad_norm": 0.001627634628675878,
701
+ "learning_rate": 0.00036888201957753735,
702
+ "loss": 0.8908,
703
+ "step": 4900
704
+ },
705
+ {
706
+ "epoch": 1.9126738794435858,
707
+ "grad_norm": 0.001344445045106113,
708
+ "learning_rate": 0.00036244204018547136,
709
+ "loss": 0.8815,
710
+ "step": 4950
711
+ },
712
+ {
713
+ "epoch": 1.9319938176197837,
714
+ "grad_norm": 0.0016686639282852411,
715
+ "learning_rate": 0.0003560020607934055,
716
+ "loss": 0.8668,
717
+ "step": 5000
718
+ },
719
+ {
720
+ "epoch": 1.9513137557959814,
721
+ "grad_norm": 0.0016382921021431684,
722
+ "learning_rate": 0.00034956208140133954,
723
+ "loss": 0.8905,
724
+ "step": 5050
725
+ },
726
+ {
727
+ "epoch": 1.9706336939721791,
728
+ "grad_norm": 0.001502548111602664,
729
+ "learning_rate": 0.0003431221020092736,
730
+ "loss": 0.8867,
731
+ "step": 5100
732
+ },
733
+ {
734
+ "epoch": 1.989953632148377,
735
+ "grad_norm": 0.001673164777457714,
736
+ "learning_rate": 0.00033668212261720766,
737
+ "loss": 0.8597,
738
+ "step": 5150
739
+ },
740
+ {
741
+ "epoch": 2.0,
742
+ "eval_loss": 0.8747333288192749,
743
+ "eval_runtime": 3876.6431,
744
+ "eval_samples_per_second": 2.67,
745
+ "eval_steps_per_second": 0.334,
746
+ "step": 5176
747
+ },
748
+ {
749
+ "epoch": 2.009273570324575,
750
+ "grad_norm": 0.0014372485456988215,
751
+ "learning_rate": 0.00033024214322514167,
752
+ "loss": 0.8778,
753
+ "step": 5200
754
+ },
755
+ {
756
+ "epoch": 2.028593508500773,
757
+ "grad_norm": 0.0013220058754086494,
758
+ "learning_rate": 0.0003238021638330758,
759
+ "loss": 0.8707,
760
+ "step": 5250
761
+ },
762
+ {
763
+ "epoch": 2.047913446676971,
764
+ "grad_norm": 0.0018379129469394684,
765
+ "learning_rate": 0.0003173621844410098,
766
+ "loss": 0.8866,
767
+ "step": 5300
768
+ },
769
+ {
770
+ "epoch": 2.0672333848531683,
771
+ "grad_norm": 0.0014872249448671937,
772
+ "learning_rate": 0.00031092220504894385,
773
+ "loss": 0.8757,
774
+ "step": 5350
775
+ },
776
+ {
777
+ "epoch": 2.0865533230293662,
778
+ "grad_norm": 0.0017716609872877598,
779
+ "learning_rate": 0.0003044822256568779,
780
+ "loss": 0.8647,
781
+ "step": 5400
782
+ },
783
+ {
784
+ "epoch": 2.105873261205564,
785
+ "grad_norm": 0.0014926039148122072,
786
+ "learning_rate": 0.00029804224626481197,
787
+ "loss": 0.8656,
788
+ "step": 5450
789
+ },
790
+ {
791
+ "epoch": 2.125193199381762,
792
+ "grad_norm": 0.0015732580795884132,
793
+ "learning_rate": 0.000291602266872746,
794
+ "loss": 0.8848,
795
+ "step": 5500
796
+ },
797
+ {
798
+ "epoch": 2.1445131375579596,
799
+ "grad_norm": 0.0016066147945821285,
800
+ "learning_rate": 0.0002851622874806801,
801
+ "loss": 0.8922,
802
+ "step": 5550
803
+ },
804
+ {
805
+ "epoch": 2.1638330757341575,
806
+ "grad_norm": 0.0017319379840046167,
807
+ "learning_rate": 0.0002787223080886141,
808
+ "loss": 0.8686,
809
+ "step": 5600
810
+ },
811
+ {
812
+ "epoch": 2.1831530139103554,
813
+ "grad_norm": 0.001960605848580599,
814
+ "learning_rate": 0.00027228232869654816,
815
+ "loss": 0.8896,
816
+ "step": 5650
817
+ },
818
+ {
819
+ "epoch": 2.2024729520865534,
820
+ "grad_norm": 0.0017535175429657102,
821
+ "learning_rate": 0.0002658423493044823,
822
+ "loss": 0.8898,
823
+ "step": 5700
824
+ },
825
+ {
826
+ "epoch": 2.2217928902627513,
827
+ "grad_norm": 0.0013692132197320461,
828
+ "learning_rate": 0.0002594023699124163,
829
+ "loss": 0.87,
830
+ "step": 5750
831
+ },
832
+ {
833
+ "epoch": 2.2411128284389488,
834
+ "grad_norm": 0.0012175439624115825,
835
+ "learning_rate": 0.00025296239052035035,
836
+ "loss": 0.8833,
837
+ "step": 5800
838
+ },
839
+ {
840
+ "epoch": 2.2604327666151467,
841
+ "grad_norm": 0.0014353194274008274,
842
+ "learning_rate": 0.0002465224111282844,
843
+ "loss": 0.8785,
844
+ "step": 5850
845
+ },
846
+ {
847
+ "epoch": 2.2797527047913446,
848
+ "grad_norm": 0.0014904884155839682,
849
+ "learning_rate": 0.00024008243173621844,
850
+ "loss": 0.8827,
851
+ "step": 5900
852
+ },
853
+ {
854
+ "epoch": 2.2990726429675425,
855
+ "grad_norm": 0.0015136194415390491,
856
+ "learning_rate": 0.00023364245234415253,
857
+ "loss": 0.8813,
858
+ "step": 5950
859
+ },
860
+ {
861
+ "epoch": 2.3183925811437405,
862
+ "grad_norm": 0.0021759923547506332,
863
+ "learning_rate": 0.00022720247295208656,
864
+ "loss": 0.8712,
865
+ "step": 6000
866
+ },
867
+ {
868
+ "epoch": 2.337712519319938,
869
+ "grad_norm": 0.0012809019535779953,
870
+ "learning_rate": 0.00022076249356002062,
871
+ "loss": 0.8643,
872
+ "step": 6050
873
+ },
874
+ {
875
+ "epoch": 2.357032457496136,
876
+ "grad_norm": 0.0013298860285431147,
877
+ "learning_rate": 0.00021432251416795469,
878
+ "loss": 0.8854,
879
+ "step": 6100
880
+ },
881
+ {
882
+ "epoch": 2.376352395672334,
883
+ "grad_norm": 0.0017576662357896566,
884
+ "learning_rate": 0.00020788253477588872,
885
+ "loss": 0.8719,
886
+ "step": 6150
887
+ },
888
+ {
889
+ "epoch": 2.3956723338485317,
890
+ "grad_norm": 0.0016253705834969878,
891
+ "learning_rate": 0.00020144255538382278,
892
+ "loss": 0.8708,
893
+ "step": 6200
894
+ },
895
+ {
896
+ "epoch": 2.4149922720247297,
897
+ "grad_norm": 0.0015435911482200027,
898
+ "learning_rate": 0.00019500257599175684,
899
+ "loss": 0.854,
900
+ "step": 6250
901
+ },
902
+ {
903
+ "epoch": 2.434312210200927,
904
+ "grad_norm": 0.0013563215034082532,
905
+ "learning_rate": 0.00018856259659969088,
906
+ "loss": 0.8565,
907
+ "step": 6300
908
+ },
909
+ {
910
+ "epoch": 2.453632148377125,
911
+ "grad_norm": 0.0019695733208209276,
912
+ "learning_rate": 0.00018212261720762494,
913
+ "loss": 0.8587,
914
+ "step": 6350
915
+ },
916
+ {
917
+ "epoch": 2.472952086553323,
918
+ "grad_norm": 0.001503878622315824,
919
+ "learning_rate": 0.000175682637815559,
920
+ "loss": 0.8847,
921
+ "step": 6400
922
+ },
923
+ {
924
+ "epoch": 2.492272024729521,
925
+ "grad_norm": 0.0018463142914697528,
926
+ "learning_rate": 0.00016924265842349303,
927
+ "loss": 0.8865,
928
+ "step": 6450
929
+ },
930
+ {
931
+ "epoch": 2.511591962905719,
932
+ "grad_norm": 0.0013368335785344243,
933
+ "learning_rate": 0.0001628026790314271,
934
+ "loss": 0.8658,
935
+ "step": 6500
936
+ },
937
+ {
938
+ "epoch": 2.5309119010819163,
939
+ "grad_norm": 0.001461292733438313,
940
+ "learning_rate": 0.00015636269963936115,
941
+ "loss": 0.8719,
942
+ "step": 6550
943
+ },
944
+ {
945
+ "epoch": 2.5502318392581143,
946
+ "grad_norm": 0.0017376808682456613,
947
+ "learning_rate": 0.00014992272024729521,
948
+ "loss": 0.8645,
949
+ "step": 6600
950
+ },
951
+ {
952
+ "epoch": 2.569551777434312,
953
+ "grad_norm": 0.001536277704872191,
954
+ "learning_rate": 0.00014348274085522928,
955
+ "loss": 0.8839,
956
+ "step": 6650
957
+ },
958
+ {
959
+ "epoch": 2.58887171561051,
960
+ "grad_norm": 0.0020096334628760815,
961
+ "learning_rate": 0.00013704276146316334,
962
+ "loss": 0.8809,
963
+ "step": 6700
964
+ },
965
+ {
966
+ "epoch": 2.608191653786708,
967
+ "grad_norm": 0.002817530184984207,
968
+ "learning_rate": 0.00013060278207109737,
969
+ "loss": 0.8643,
970
+ "step": 6750
971
+ },
972
+ {
973
+ "epoch": 2.6275115919629055,
974
+ "grad_norm": 0.002030319534242153,
975
+ "learning_rate": 0.00012416280267903143,
976
+ "loss": 0.8713,
977
+ "step": 6800
978
+ },
979
+ {
980
+ "epoch": 2.6468315301391034,
981
+ "grad_norm": 0.0015792109770700336,
982
+ "learning_rate": 0.00011772282328696548,
983
+ "loss": 0.874,
984
+ "step": 6850
985
+ },
986
+ {
987
+ "epoch": 2.6661514683153014,
988
+ "grad_norm": 0.0016106871189549565,
989
+ "learning_rate": 0.00011128284389489954,
990
+ "loss": 0.8904,
991
+ "step": 6900
992
+ },
993
+ {
994
+ "epoch": 2.6854714064914993,
995
+ "grad_norm": 0.001518415636382997,
996
+ "learning_rate": 0.00010484286450283359,
997
+ "loss": 0.8889,
998
+ "step": 6950
999
+ },
1000
+ {
1001
+ "epoch": 2.704791344667697,
1002
+ "grad_norm": 0.0014989189803600311,
1003
+ "learning_rate": 9.840288511076765e-05,
1004
+ "loss": 0.8765,
1005
+ "step": 7000
1006
+ },
1007
+ {
1008
+ "epoch": 2.7241112828438947,
1009
+ "grad_norm": 0.0017810104181990027,
1010
+ "learning_rate": 9.196290571870171e-05,
1011
+ "loss": 0.8768,
1012
+ "step": 7050
1013
+ },
1014
+ {
1015
+ "epoch": 2.7434312210200926,
1016
+ "grad_norm": 0.0017641150625422597,
1017
+ "learning_rate": 8.552292632663576e-05,
1018
+ "loss": 0.8839,
1019
+ "step": 7100
1020
+ },
1021
+ {
1022
+ "epoch": 2.7627511591962906,
1023
+ "grad_norm": 0.001232657814398408,
1024
+ "learning_rate": 7.908294693456982e-05,
1025
+ "loss": 0.8833,
1026
+ "step": 7150
1027
+ },
1028
+ {
1029
+ "epoch": 2.7820710973724885,
1030
+ "grad_norm": 0.0017207327764481306,
1031
+ "learning_rate": 7.264296754250387e-05,
1032
+ "loss": 0.869,
1033
+ "step": 7200
1034
+ },
1035
+ {
1036
+ "epoch": 2.8013910355486864,
1037
+ "grad_norm": 0.0015914075775071979,
1038
+ "learning_rate": 6.620298815043791e-05,
1039
+ "loss": 0.8774,
1040
+ "step": 7250
1041
+ },
1042
+ {
1043
+ "epoch": 2.820710973724884,
1044
+ "grad_norm": 0.0015985453501343727,
1045
+ "learning_rate": 5.9763008758371975e-05,
1046
+ "loss": 0.8753,
1047
+ "step": 7300
1048
+ },
1049
+ {
1050
+ "epoch": 2.840030911901082,
1051
+ "grad_norm": 0.0014812527224421501,
1052
+ "learning_rate": 5.332302936630603e-05,
1053
+ "loss": 0.8882,
1054
+ "step": 7350
1055
+ },
1056
+ {
1057
+ "epoch": 2.8593508500772797,
1058
+ "grad_norm": 0.0014649786753579974,
1059
+ "learning_rate": 4.688304997424008e-05,
1060
+ "loss": 0.8787,
1061
+ "step": 7400
1062
+ },
1063
+ {
1064
+ "epoch": 2.8786707882534777,
1065
+ "grad_norm": 0.0014871322782710195,
1066
+ "learning_rate": 4.044307058217414e-05,
1067
+ "loss": 0.8795,
1068
+ "step": 7450
1069
+ },
1070
+ {
1071
+ "epoch": 2.8979907264296756,
1072
+ "grad_norm": 0.0015188547549769282,
1073
+ "learning_rate": 3.400309119010819e-05,
1074
+ "loss": 0.88,
1075
+ "step": 7500
1076
+ },
1077
+ {
1078
+ "epoch": 2.917310664605873,
1079
+ "grad_norm": 0.001243911450728774,
1080
+ "learning_rate": 2.7563111798042246e-05,
1081
+ "loss": 0.8755,
1082
+ "step": 7550
1083
+ },
1084
+ {
1085
+ "epoch": 2.936630602782071,
1086
+ "grad_norm": 0.0015768723096698523,
1087
+ "learning_rate": 2.11231324059763e-05,
1088
+ "loss": 0.864,
1089
+ "step": 7600
1090
+ },
1091
+ {
1092
+ "epoch": 2.955950540958269,
1093
+ "grad_norm": 0.0015496944542974234,
1094
+ "learning_rate": 1.4683153013910355e-05,
1095
+ "loss": 0.8697,
1096
+ "step": 7650
1097
+ },
1098
+ {
1099
+ "epoch": 2.975270479134467,
1100
+ "grad_norm": 0.0015128754312172532,
1101
+ "learning_rate": 8.24317362184441e-06,
1102
+ "loss": 0.8702,
1103
+ "step": 7700
1104
+ },
1105
+ {
1106
+ "epoch": 2.9945904173106648,
1107
+ "grad_norm": 0.0014746102970093489,
1108
+ "learning_rate": 1.8031942297784647e-06,
1109
+ "loss": 0.8747,
1110
+ "step": 7750
1111
+ },
1112
+ {
1113
+ "epoch": 3.0,
1114
+ "eval_loss": 0.8734365105628967,
1115
+ "eval_runtime": 3266.5567,
1116
+ "eval_samples_per_second": 3.169,
1117
+ "eval_steps_per_second": 0.396,
1118
+ "step": 7764
1119
+ }
1120
+ ],
1121
+ "logging_steps": 50,
1122
+ "max_steps": 7764,
1123
+ "num_input_tokens_seen": 0,
1124
+ "num_train_epochs": 3,
1125
+ "save_steps": 50,
1126
+ "stateful_callbacks": {
1127
+ "TrainerControl": {
1128
+ "args": {
1129
+ "should_epoch_stop": false,
1130
+ "should_evaluate": false,
1131
+ "should_log": false,
1132
+ "should_save": true,
1133
+ "should_training_stop": true
1134
+ },
1135
+ "attributes": {}
1136
+ }
1137
+ },
1138
+ "total_flos": 1.570388270726652e+18,
1139
+ "train_batch_size": 2,
1140
+ "trial_name": null,
1141
+ "trial_params": null
1142
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fc87615fe03c7c0b66573781d8506f56007b5374b1204db84fc867a42df5272
3
+ size 5560