lksy commited on
Commit
cae2208
1 Parent(s): e776373

First push 30b

Browse files
Files changed (35) hide show
  1. lora-alpaca/adapter_config.json +21 -0
  2. lora-alpaca/adapter_model.bin +3 -0
  3. lora-alpaca/checkpoint-1000/optimizer.pt +3 -0
  4. lora-alpaca/checkpoint-1000/pytorch_model.bin +3 -0
  5. lora-alpaca/checkpoint-1000/rng_state_0.pth +3 -0
  6. lora-alpaca/checkpoint-1000/rng_state_1.pth +3 -0
  7. lora-alpaca/checkpoint-1000/rng_state_2.pth +3 -0
  8. lora-alpaca/checkpoint-1000/rng_state_3.pth +3 -0
  9. lora-alpaca/checkpoint-1000/rng_state_4.pth +3 -0
  10. lora-alpaca/checkpoint-1000/scaler.pt +3 -0
  11. lora-alpaca/checkpoint-1000/scheduler.pt +3 -0
  12. lora-alpaca/checkpoint-1000/trainer_state.json +656 -0
  13. lora-alpaca/checkpoint-1000/training_args.bin +3 -0
  14. lora-alpaca/checkpoint-1200/optimizer.pt +3 -0
  15. lora-alpaca/checkpoint-1200/pytorch_model.bin +3 -0
  16. lora-alpaca/checkpoint-1200/rng_state_0.pth +3 -0
  17. lora-alpaca/checkpoint-1200/rng_state_1.pth +3 -0
  18. lora-alpaca/checkpoint-1200/rng_state_2.pth +3 -0
  19. lora-alpaca/checkpoint-1200/rng_state_3.pth +3 -0
  20. lora-alpaca/checkpoint-1200/rng_state_4.pth +3 -0
  21. lora-alpaca/checkpoint-1200/scaler.pt +3 -0
  22. lora-alpaca/checkpoint-1200/scheduler.pt +3 -0
  23. lora-alpaca/checkpoint-1200/trainer_state.json +784 -0
  24. lora-alpaca/checkpoint-1200/training_args.bin +3 -0
  25. lora-alpaca/checkpoint-1400/optimizer.pt +3 -0
  26. lora-alpaca/checkpoint-1400/pytorch_model.bin +3 -0
  27. lora-alpaca/checkpoint-1400/rng_state_0.pth +3 -0
  28. lora-alpaca/checkpoint-1400/rng_state_1.pth +3 -0
  29. lora-alpaca/checkpoint-1400/rng_state_2.pth +3 -0
  30. lora-alpaca/checkpoint-1400/rng_state_3.pth +3 -0
  31. lora-alpaca/checkpoint-1400/rng_state_4.pth +3 -0
  32. lora-alpaca/checkpoint-1400/scaler.pt +3 -0
  33. lora-alpaca/checkpoint-1400/scheduler.pt +3 -0
  34. lora-alpaca/checkpoint-1400/trainer_state.json +912 -0
  35. lora-alpaca/checkpoint-1400/training_args.bin +3 -0
lora-alpaca/adapter_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "huggyllama/llama-30b",
3
+ "bias": "none",
4
+ "enable_lora": null,
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "lora_alpha": 16,
9
+ "lora_dropout": 0.05,
10
+ "merge_weights": false,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 16,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj"
19
+ ],
20
+ "task_type": "CAUSAL_LM"
21
+ }
lora-alpaca/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f12d89309b9642d84c4006291181a2401947aed96fcd67a0dffa606276831c9
3
+ size 204646285
lora-alpaca/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:461636a4c6faa8a6e577827898da57f0cb117d22ded7e570f0d296c3f96816a2
3
+ size 409356413
lora-alpaca/checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ee7d415c6af903a4cde772c259509406c79c1a7dc0e653bd7bc46120b36171
3
+ size 204646285
lora-alpaca/checkpoint-1000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1906cd0100a0a9192c8810cf377b02f5c358bef24cb8b3a729893bba13eb531
3
+ size 14583
lora-alpaca/checkpoint-1000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:539818edb3008f3edb4f6ebb645d0b4a3db0366c2c6e273d9bc01d504ba5bf51
3
+ size 14583
lora-alpaca/checkpoint-1000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6995703a4e00fb17bcd5fe96aa5a29b5a9f4ce6af05fc0397700661d06a13b9f
3
+ size 14583
lora-alpaca/checkpoint-1000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a0bbf72f1c31ca2aec1153eceabc1979287feaf11570c9149d58d74b82ff2c1
3
+ size 14583
lora-alpaca/checkpoint-1000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13159167d4c835b3ee4d31aca01c55ede1b2a199c15e6d5c717f3c1554bdee0b
3
+ size 14583
lora-alpaca/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:616f782e508bf93abe9de82fb1a8777069847068afdada0050c6f94df6e0661f
3
+ size 557
lora-alpaca/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd886b500988540c113c9a39735337784eedc4116b36987879db01d78cb54435
3
+ size 627
lora-alpaca/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8538553714752197,
3
+ "best_model_checkpoint": "./lora-alpaca/checkpoint-1000",
4
+ "epoch": 1.9282684149633629,
5
+ "global_step": 1000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2.6999999999999996e-05,
13
+ "loss": 1.554,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 5.6999999999999996e-05,
19
+ "loss": 1.4793,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 8.4e-05,
25
+ "loss": 1.2861,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 0.00011099999999999999,
31
+ "loss": 1.1305,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.00014099999999999998,
37
+ "loss": 1.0653,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.12,
42
+ "learning_rate": 0.00017099999999999998,
43
+ "loss": 1.0233,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.13,
48
+ "learning_rate": 0.000201,
49
+ "loss": 0.9835,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.15,
54
+ "learning_rate": 0.00023099999999999998,
55
+ "loss": 0.9385,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "learning_rate": 0.000261,
61
+ "loss": 0.9179,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.19,
66
+ "learning_rate": 0.00029099999999999997,
67
+ "loss": 0.9107,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.21,
72
+ "learning_rate": 0.0002985557083906465,
73
+ "loss": 0.914,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.23,
78
+ "learning_rate": 0.0002964924346629986,
79
+ "loss": 0.8971,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.25,
84
+ "learning_rate": 0.00029442916093535074,
85
+ "loss": 0.9021,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.27,
90
+ "learning_rate": 0.00029236588720770286,
91
+ "loss": 0.8939,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.29,
96
+ "learning_rate": 0.000290302613480055,
97
+ "loss": 0.8917,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.31,
102
+ "learning_rate": 0.0002882393397524071,
103
+ "loss": 0.8834,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.33,
108
+ "learning_rate": 0.00028617606602475925,
109
+ "loss": 0.8927,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.35,
114
+ "learning_rate": 0.0002841127922971114,
115
+ "loss": 0.8979,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.37,
120
+ "learning_rate": 0.0002820495185694635,
121
+ "loss": 0.8812,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.39,
126
+ "learning_rate": 0.00027998624484181563,
127
+ "loss": 0.8852,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.39,
132
+ "eval_loss": 0.8834338188171387,
133
+ "eval_runtime": 172.9592,
134
+ "eval_samples_per_second": 11.563,
135
+ "eval_steps_per_second": 0.289,
136
+ "step": 200
137
+ },
138
+ {
139
+ "epoch": 0.4,
140
+ "learning_rate": 0.0002779229711141678,
141
+ "loss": 0.8837,
142
+ "step": 210
143
+ },
144
+ {
145
+ "epoch": 0.42,
146
+ "learning_rate": 0.00027585969738651994,
147
+ "loss": 0.8837,
148
+ "step": 220
149
+ },
150
+ {
151
+ "epoch": 0.44,
152
+ "learning_rate": 0.00027379642365887207,
153
+ "loss": 0.8703,
154
+ "step": 230
155
+ },
156
+ {
157
+ "epoch": 0.46,
158
+ "learning_rate": 0.0002717331499312242,
159
+ "loss": 0.885,
160
+ "step": 240
161
+ },
162
+ {
163
+ "epoch": 0.48,
164
+ "learning_rate": 0.0002696698762035763,
165
+ "loss": 0.877,
166
+ "step": 250
167
+ },
168
+ {
169
+ "epoch": 0.5,
170
+ "learning_rate": 0.00026760660247592845,
171
+ "loss": 0.8698,
172
+ "step": 260
173
+ },
174
+ {
175
+ "epoch": 0.52,
176
+ "learning_rate": 0.0002655433287482806,
177
+ "loss": 0.8698,
178
+ "step": 270
179
+ },
180
+ {
181
+ "epoch": 0.54,
182
+ "learning_rate": 0.0002634800550206327,
183
+ "loss": 0.8732,
184
+ "step": 280
185
+ },
186
+ {
187
+ "epoch": 0.56,
188
+ "learning_rate": 0.00026141678129298484,
189
+ "loss": 0.8571,
190
+ "step": 290
191
+ },
192
+ {
193
+ "epoch": 0.58,
194
+ "learning_rate": 0.00025935350756533696,
195
+ "loss": 0.8597,
196
+ "step": 300
197
+ },
198
+ {
199
+ "epoch": 0.6,
200
+ "learning_rate": 0.0002572902338376891,
201
+ "loss": 0.8719,
202
+ "step": 310
203
+ },
204
+ {
205
+ "epoch": 0.62,
206
+ "learning_rate": 0.0002552269601100413,
207
+ "loss": 0.8624,
208
+ "step": 320
209
+ },
210
+ {
211
+ "epoch": 0.64,
212
+ "learning_rate": 0.0002531636863823934,
213
+ "loss": 0.8577,
214
+ "step": 330
215
+ },
216
+ {
217
+ "epoch": 0.66,
218
+ "learning_rate": 0.00025110041265474553,
219
+ "loss": 0.8624,
220
+ "step": 340
221
+ },
222
+ {
223
+ "epoch": 0.67,
224
+ "learning_rate": 0.00024903713892709766,
225
+ "loss": 0.8765,
226
+ "step": 350
227
+ },
228
+ {
229
+ "epoch": 0.69,
230
+ "learning_rate": 0.0002469738651994498,
231
+ "loss": 0.8595,
232
+ "step": 360
233
+ },
234
+ {
235
+ "epoch": 0.71,
236
+ "learning_rate": 0.0002449105914718019,
237
+ "loss": 0.8646,
238
+ "step": 370
239
+ },
240
+ {
241
+ "epoch": 0.73,
242
+ "learning_rate": 0.00024284731774415404,
243
+ "loss": 0.87,
244
+ "step": 380
245
+ },
246
+ {
247
+ "epoch": 0.75,
248
+ "learning_rate": 0.00024078404401650617,
249
+ "loss": 0.8568,
250
+ "step": 390
251
+ },
252
+ {
253
+ "epoch": 0.77,
254
+ "learning_rate": 0.0002387207702888583,
255
+ "loss": 0.8571,
256
+ "step": 400
257
+ },
258
+ {
259
+ "epoch": 0.77,
260
+ "eval_loss": 0.8650650382041931,
261
+ "eval_runtime": 175.1777,
262
+ "eval_samples_per_second": 11.417,
263
+ "eval_steps_per_second": 0.285,
264
+ "step": 400
265
+ },
266
+ {
267
+ "epoch": 0.79,
268
+ "learning_rate": 0.00023665749656121043,
269
+ "loss": 0.8495,
270
+ "step": 410
271
+ },
272
+ {
273
+ "epoch": 0.81,
274
+ "learning_rate": 0.00023459422283356255,
275
+ "loss": 0.8572,
276
+ "step": 420
277
+ },
278
+ {
279
+ "epoch": 0.83,
280
+ "learning_rate": 0.0002325309491059147,
281
+ "loss": 0.8629,
282
+ "step": 430
283
+ },
284
+ {
285
+ "epoch": 0.85,
286
+ "learning_rate": 0.00023046767537826684,
287
+ "loss": 0.8413,
288
+ "step": 440
289
+ },
290
+ {
291
+ "epoch": 0.87,
292
+ "learning_rate": 0.00022840440165061896,
293
+ "loss": 0.853,
294
+ "step": 450
295
+ },
296
+ {
297
+ "epoch": 0.89,
298
+ "learning_rate": 0.0002263411279229711,
299
+ "loss": 0.8662,
300
+ "step": 460
301
+ },
302
+ {
303
+ "epoch": 0.91,
304
+ "learning_rate": 0.00022427785419532322,
305
+ "loss": 0.8495,
306
+ "step": 470
307
+ },
308
+ {
309
+ "epoch": 0.93,
310
+ "learning_rate": 0.00022221458046767537,
311
+ "loss": 0.8502,
312
+ "step": 480
313
+ },
314
+ {
315
+ "epoch": 0.94,
316
+ "learning_rate": 0.0002201513067400275,
317
+ "loss": 0.8601,
318
+ "step": 490
319
+ },
320
+ {
321
+ "epoch": 0.96,
322
+ "learning_rate": 0.00021808803301237963,
323
+ "loss": 0.8503,
324
+ "step": 500
325
+ },
326
+ {
327
+ "epoch": 0.98,
328
+ "learning_rate": 0.00021602475928473176,
329
+ "loss": 0.8517,
330
+ "step": 510
331
+ },
332
+ {
333
+ "epoch": 1.0,
334
+ "learning_rate": 0.00021396148555708389,
335
+ "loss": 0.8633,
336
+ "step": 520
337
+ },
338
+ {
339
+ "epoch": 1.02,
340
+ "learning_rate": 0.00021189821182943601,
341
+ "loss": 0.853,
342
+ "step": 530
343
+ },
344
+ {
345
+ "epoch": 1.04,
346
+ "learning_rate": 0.00020983493810178817,
347
+ "loss": 0.8379,
348
+ "step": 540
349
+ },
350
+ {
351
+ "epoch": 1.06,
352
+ "learning_rate": 0.0002077716643741403,
353
+ "loss": 0.8396,
354
+ "step": 550
355
+ },
356
+ {
357
+ "epoch": 1.08,
358
+ "learning_rate": 0.00020570839064649242,
359
+ "loss": 0.8569,
360
+ "step": 560
361
+ },
362
+ {
363
+ "epoch": 1.1,
364
+ "learning_rate": 0.00020364511691884455,
365
+ "loss": 0.8594,
366
+ "step": 570
367
+ },
368
+ {
369
+ "epoch": 1.12,
370
+ "learning_rate": 0.00020158184319119668,
371
+ "loss": 0.8461,
372
+ "step": 580
373
+ },
374
+ {
375
+ "epoch": 1.14,
376
+ "learning_rate": 0.00019951856946354884,
377
+ "loss": 0.8429,
378
+ "step": 590
379
+ },
380
+ {
381
+ "epoch": 1.16,
382
+ "learning_rate": 0.00019745529573590096,
383
+ "loss": 0.8308,
384
+ "step": 600
385
+ },
386
+ {
387
+ "epoch": 1.16,
388
+ "eval_loss": 0.8592662215232849,
389
+ "eval_runtime": 175.4031,
390
+ "eval_samples_per_second": 11.402,
391
+ "eval_steps_per_second": 0.285,
392
+ "step": 600
393
+ },
394
+ {
395
+ "epoch": 1.18,
396
+ "learning_rate": 0.0001953920220082531,
397
+ "loss": 0.8355,
398
+ "step": 610
399
+ },
400
+ {
401
+ "epoch": 1.2,
402
+ "learning_rate": 0.0001933287482806052,
403
+ "loss": 0.834,
404
+ "step": 620
405
+ },
406
+ {
407
+ "epoch": 1.21,
408
+ "learning_rate": 0.00019126547455295732,
409
+ "loss": 0.8299,
410
+ "step": 630
411
+ },
412
+ {
413
+ "epoch": 1.23,
414
+ "learning_rate": 0.00018920220082530945,
415
+ "loss": 0.8431,
416
+ "step": 640
417
+ },
418
+ {
419
+ "epoch": 1.25,
420
+ "learning_rate": 0.00018713892709766163,
421
+ "loss": 0.841,
422
+ "step": 650
423
+ },
424
+ {
425
+ "epoch": 1.27,
426
+ "learning_rate": 0.00018507565337001373,
427
+ "loss": 0.8374,
428
+ "step": 660
429
+ },
430
+ {
431
+ "epoch": 1.29,
432
+ "learning_rate": 0.00018301237964236586,
433
+ "loss": 0.84,
434
+ "step": 670
435
+ },
436
+ {
437
+ "epoch": 1.31,
438
+ "learning_rate": 0.000180949105914718,
439
+ "loss": 0.8365,
440
+ "step": 680
441
+ },
442
+ {
443
+ "epoch": 1.33,
444
+ "learning_rate": 0.00017888583218707011,
445
+ "loss": 0.8287,
446
+ "step": 690
447
+ },
448
+ {
449
+ "epoch": 1.35,
450
+ "learning_rate": 0.00017682255845942227,
451
+ "loss": 0.8334,
452
+ "step": 700
453
+ },
454
+ {
455
+ "epoch": 1.37,
456
+ "learning_rate": 0.0001747592847317744,
457
+ "loss": 0.8445,
458
+ "step": 710
459
+ },
460
+ {
461
+ "epoch": 1.39,
462
+ "learning_rate": 0.00017269601100412653,
463
+ "loss": 0.8286,
464
+ "step": 720
465
+ },
466
+ {
467
+ "epoch": 1.41,
468
+ "learning_rate": 0.00017063273727647865,
469
+ "loss": 0.8266,
470
+ "step": 730
471
+ },
472
+ {
473
+ "epoch": 1.43,
474
+ "learning_rate": 0.00016856946354883078,
475
+ "loss": 0.8276,
476
+ "step": 740
477
+ },
478
+ {
479
+ "epoch": 1.45,
480
+ "learning_rate": 0.00016650618982118294,
481
+ "loss": 0.8386,
482
+ "step": 750
483
+ },
484
+ {
485
+ "epoch": 1.47,
486
+ "learning_rate": 0.00016444291609353506,
487
+ "loss": 0.8349,
488
+ "step": 760
489
+ },
490
+ {
491
+ "epoch": 1.48,
492
+ "learning_rate": 0.0001623796423658872,
493
+ "loss": 0.8292,
494
+ "step": 770
495
+ },
496
+ {
497
+ "epoch": 1.5,
498
+ "learning_rate": 0.00016031636863823932,
499
+ "loss": 0.8354,
500
+ "step": 780
501
+ },
502
+ {
503
+ "epoch": 1.52,
504
+ "learning_rate": 0.00015825309491059145,
505
+ "loss": 0.8306,
506
+ "step": 790
507
+ },
508
+ {
509
+ "epoch": 1.54,
510
+ "learning_rate": 0.00015618982118294358,
511
+ "loss": 0.8346,
512
+ "step": 800
513
+ },
514
+ {
515
+ "epoch": 1.54,
516
+ "eval_loss": 0.856368899345398,
517
+ "eval_runtime": 175.5494,
518
+ "eval_samples_per_second": 11.393,
519
+ "eval_steps_per_second": 0.285,
520
+ "step": 800
521
+ },
522
+ {
523
+ "epoch": 1.56,
524
+ "learning_rate": 0.00015412654745529573,
525
+ "loss": 0.8244,
526
+ "step": 810
527
+ },
528
+ {
529
+ "epoch": 1.58,
530
+ "learning_rate": 0.00015206327372764786,
531
+ "loss": 0.8129,
532
+ "step": 820
533
+ },
534
+ {
535
+ "epoch": 1.6,
536
+ "learning_rate": 0.00015,
537
+ "loss": 0.8216,
538
+ "step": 830
539
+ },
540
+ {
541
+ "epoch": 1.62,
542
+ "learning_rate": 0.00014793672627235211,
543
+ "loss": 0.8205,
544
+ "step": 840
545
+ },
546
+ {
547
+ "epoch": 1.64,
548
+ "learning_rate": 0.00014587345254470424,
549
+ "loss": 0.8215,
550
+ "step": 850
551
+ },
552
+ {
553
+ "epoch": 1.66,
554
+ "learning_rate": 0.00014381017881705637,
555
+ "loss": 0.8467,
556
+ "step": 860
557
+ },
558
+ {
559
+ "epoch": 1.68,
560
+ "learning_rate": 0.0001417469050894085,
561
+ "loss": 0.8258,
562
+ "step": 870
563
+ },
564
+ {
565
+ "epoch": 1.7,
566
+ "learning_rate": 0.00013968363136176065,
567
+ "loss": 0.8277,
568
+ "step": 880
569
+ },
570
+ {
571
+ "epoch": 1.72,
572
+ "learning_rate": 0.00013762035763411278,
573
+ "loss": 0.8249,
574
+ "step": 890
575
+ },
576
+ {
577
+ "epoch": 1.74,
578
+ "learning_rate": 0.0001355570839064649,
579
+ "loss": 0.8226,
580
+ "step": 900
581
+ },
582
+ {
583
+ "epoch": 1.75,
584
+ "learning_rate": 0.00013349381017881704,
585
+ "loss": 0.8268,
586
+ "step": 910
587
+ },
588
+ {
589
+ "epoch": 1.77,
590
+ "learning_rate": 0.00013143053645116917,
591
+ "loss": 0.8321,
592
+ "step": 920
593
+ },
594
+ {
595
+ "epoch": 1.79,
596
+ "learning_rate": 0.00012936726272352132,
597
+ "loss": 0.8163,
598
+ "step": 930
599
+ },
600
+ {
601
+ "epoch": 1.81,
602
+ "learning_rate": 0.00012730398899587345,
603
+ "loss": 0.8352,
604
+ "step": 940
605
+ },
606
+ {
607
+ "epoch": 1.83,
608
+ "learning_rate": 0.00012524071526822558,
609
+ "loss": 0.8106,
610
+ "step": 950
611
+ },
612
+ {
613
+ "epoch": 1.85,
614
+ "learning_rate": 0.0001231774415405777,
615
+ "loss": 0.8017,
616
+ "step": 960
617
+ },
618
+ {
619
+ "epoch": 1.87,
620
+ "learning_rate": 0.00012111416781292983,
621
+ "loss": 0.8244,
622
+ "step": 970
623
+ },
624
+ {
625
+ "epoch": 1.89,
626
+ "learning_rate": 0.00011905089408528197,
627
+ "loss": 0.8363,
628
+ "step": 980
629
+ },
630
+ {
631
+ "epoch": 1.91,
632
+ "learning_rate": 0.0001169876203576341,
633
+ "loss": 0.8307,
634
+ "step": 990
635
+ },
636
+ {
637
+ "epoch": 1.93,
638
+ "learning_rate": 0.00011492434662998623,
639
+ "loss": 0.8239,
640
+ "step": 1000
641
+ },
642
+ {
643
+ "epoch": 1.93,
644
+ "eval_loss": 0.8538553714752197,
645
+ "eval_runtime": 175.4751,
646
+ "eval_samples_per_second": 11.398,
647
+ "eval_steps_per_second": 0.285,
648
+ "step": 1000
649
+ }
650
+ ],
651
+ "max_steps": 1554,
652
+ "num_train_epochs": 3,
653
+ "total_flos": 8.910668799611503e+18,
654
+ "trial_name": null,
655
+ "trial_params": null
656
+ }
lora-alpaca/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2a2253014a1304b4da9f1ad32740c85d716b2f80c4fa4cafd7302357e61b5d
3
+ size 3579
lora-alpaca/checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a818b62cce7a219573ba930c61bbaaa87742bf49843b0cb0ba0db07d92d9ee6
3
+ size 409356413
lora-alpaca/checkpoint-1200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4712d9c0c18bc10c32480020bc3bfe082a1794c4a7d8653f36b1441459c9e85
3
+ size 204646285
lora-alpaca/checkpoint-1200/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:414b67a6884d6f06869f8371c9a3d42872114b60d86c33aa25a0f5965a442140
3
+ size 14583
lora-alpaca/checkpoint-1200/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a5a88a0aa401a3e2414ce9b676bd8bcb14a7bd31b87bc361ce970557db19674
3
+ size 14583
lora-alpaca/checkpoint-1200/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd6748143896033aae8d36b898efdbca78ba14692252af9649ca9f79ba5df911
3
+ size 14583
lora-alpaca/checkpoint-1200/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a7471b7a3dd9ed8eccf806e60fc77bb373473345e2bd887389e658556381788
3
+ size 14583
lora-alpaca/checkpoint-1200/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dca39181f6314fcbdd6377339d3d49d4c1627f8f987cacf061d3dfe1e4fd513
3
+ size 14583
lora-alpaca/checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4355ee941ef90008ee218ef7e450db011f5be2bdb4b1f55daa599bf63b4ac9ba
3
+ size 557
lora-alpaca/checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5900efbb1fedf60bede6b5c46188dd9c04f8935b5b319e79f1d127299e64beef
3
+ size 627
lora-alpaca/checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8536396622657776,
3
+ "best_model_checkpoint": "./lora-alpaca/checkpoint-1200",
4
+ "epoch": 2.3139220979560355,
5
+ "global_step": 1200,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2.6999999999999996e-05,
13
+ "loss": 1.554,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 5.6999999999999996e-05,
19
+ "loss": 1.4793,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 8.4e-05,
25
+ "loss": 1.2861,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 0.00011099999999999999,
31
+ "loss": 1.1305,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.00014099999999999998,
37
+ "loss": 1.0653,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.12,
42
+ "learning_rate": 0.00017099999999999998,
43
+ "loss": 1.0233,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.13,
48
+ "learning_rate": 0.000201,
49
+ "loss": 0.9835,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.15,
54
+ "learning_rate": 0.00023099999999999998,
55
+ "loss": 0.9385,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "learning_rate": 0.000261,
61
+ "loss": 0.9179,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.19,
66
+ "learning_rate": 0.00029099999999999997,
67
+ "loss": 0.9107,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.21,
72
+ "learning_rate": 0.0002985557083906465,
73
+ "loss": 0.914,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.23,
78
+ "learning_rate": 0.0002964924346629986,
79
+ "loss": 0.8971,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.25,
84
+ "learning_rate": 0.00029442916093535074,
85
+ "loss": 0.9021,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.27,
90
+ "learning_rate": 0.00029236588720770286,
91
+ "loss": 0.8939,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.29,
96
+ "learning_rate": 0.000290302613480055,
97
+ "loss": 0.8917,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.31,
102
+ "learning_rate": 0.0002882393397524071,
103
+ "loss": 0.8834,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.33,
108
+ "learning_rate": 0.00028617606602475925,
109
+ "loss": 0.8927,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.35,
114
+ "learning_rate": 0.0002841127922971114,
115
+ "loss": 0.8979,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.37,
120
+ "learning_rate": 0.0002820495185694635,
121
+ "loss": 0.8812,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.39,
126
+ "learning_rate": 0.00027998624484181563,
127
+ "loss": 0.8852,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.39,
132
+ "eval_loss": 0.8834338188171387,
133
+ "eval_runtime": 172.9592,
134
+ "eval_samples_per_second": 11.563,
135
+ "eval_steps_per_second": 0.289,
136
+ "step": 200
137
+ },
138
+ {
139
+ "epoch": 0.4,
140
+ "learning_rate": 0.0002779229711141678,
141
+ "loss": 0.8837,
142
+ "step": 210
143
+ },
144
+ {
145
+ "epoch": 0.42,
146
+ "learning_rate": 0.00027585969738651994,
147
+ "loss": 0.8837,
148
+ "step": 220
149
+ },
150
+ {
151
+ "epoch": 0.44,
152
+ "learning_rate": 0.00027379642365887207,
153
+ "loss": 0.8703,
154
+ "step": 230
155
+ },
156
+ {
157
+ "epoch": 0.46,
158
+ "learning_rate": 0.0002717331499312242,
159
+ "loss": 0.885,
160
+ "step": 240
161
+ },
162
+ {
163
+ "epoch": 0.48,
164
+ "learning_rate": 0.0002696698762035763,
165
+ "loss": 0.877,
166
+ "step": 250
167
+ },
168
+ {
169
+ "epoch": 0.5,
170
+ "learning_rate": 0.00026760660247592845,
171
+ "loss": 0.8698,
172
+ "step": 260
173
+ },
174
+ {
175
+ "epoch": 0.52,
176
+ "learning_rate": 0.0002655433287482806,
177
+ "loss": 0.8698,
178
+ "step": 270
179
+ },
180
+ {
181
+ "epoch": 0.54,
182
+ "learning_rate": 0.0002634800550206327,
183
+ "loss": 0.8732,
184
+ "step": 280
185
+ },
186
+ {
187
+ "epoch": 0.56,
188
+ "learning_rate": 0.00026141678129298484,
189
+ "loss": 0.8571,
190
+ "step": 290
191
+ },
192
+ {
193
+ "epoch": 0.58,
194
+ "learning_rate": 0.00025935350756533696,
195
+ "loss": 0.8597,
196
+ "step": 300
197
+ },
198
+ {
199
+ "epoch": 0.6,
200
+ "learning_rate": 0.0002572902338376891,
201
+ "loss": 0.8719,
202
+ "step": 310
203
+ },
204
+ {
205
+ "epoch": 0.62,
206
+ "learning_rate": 0.0002552269601100413,
207
+ "loss": 0.8624,
208
+ "step": 320
209
+ },
210
+ {
211
+ "epoch": 0.64,
212
+ "learning_rate": 0.0002531636863823934,
213
+ "loss": 0.8577,
214
+ "step": 330
215
+ },
216
+ {
217
+ "epoch": 0.66,
218
+ "learning_rate": 0.00025110041265474553,
219
+ "loss": 0.8624,
220
+ "step": 340
221
+ },
222
+ {
223
+ "epoch": 0.67,
224
+ "learning_rate": 0.00024903713892709766,
225
+ "loss": 0.8765,
226
+ "step": 350
227
+ },
228
+ {
229
+ "epoch": 0.69,
230
+ "learning_rate": 0.0002469738651994498,
231
+ "loss": 0.8595,
232
+ "step": 360
233
+ },
234
+ {
235
+ "epoch": 0.71,
236
+ "learning_rate": 0.0002449105914718019,
237
+ "loss": 0.8646,
238
+ "step": 370
239
+ },
240
+ {
241
+ "epoch": 0.73,
242
+ "learning_rate": 0.00024284731774415404,
243
+ "loss": 0.87,
244
+ "step": 380
245
+ },
246
+ {
247
+ "epoch": 0.75,
248
+ "learning_rate": 0.00024078404401650617,
249
+ "loss": 0.8568,
250
+ "step": 390
251
+ },
252
+ {
253
+ "epoch": 0.77,
254
+ "learning_rate": 0.0002387207702888583,
255
+ "loss": 0.8571,
256
+ "step": 400
257
+ },
258
+ {
259
+ "epoch": 0.77,
260
+ "eval_loss": 0.8650650382041931,
261
+ "eval_runtime": 175.1777,
262
+ "eval_samples_per_second": 11.417,
263
+ "eval_steps_per_second": 0.285,
264
+ "step": 400
265
+ },
266
+ {
267
+ "epoch": 0.79,
268
+ "learning_rate": 0.00023665749656121043,
269
+ "loss": 0.8495,
270
+ "step": 410
271
+ },
272
+ {
273
+ "epoch": 0.81,
274
+ "learning_rate": 0.00023459422283356255,
275
+ "loss": 0.8572,
276
+ "step": 420
277
+ },
278
+ {
279
+ "epoch": 0.83,
280
+ "learning_rate": 0.0002325309491059147,
281
+ "loss": 0.8629,
282
+ "step": 430
283
+ },
284
+ {
285
+ "epoch": 0.85,
286
+ "learning_rate": 0.00023046767537826684,
287
+ "loss": 0.8413,
288
+ "step": 440
289
+ },
290
+ {
291
+ "epoch": 0.87,
292
+ "learning_rate": 0.00022840440165061896,
293
+ "loss": 0.853,
294
+ "step": 450
295
+ },
296
+ {
297
+ "epoch": 0.89,
298
+ "learning_rate": 0.0002263411279229711,
299
+ "loss": 0.8662,
300
+ "step": 460
301
+ },
302
+ {
303
+ "epoch": 0.91,
304
+ "learning_rate": 0.00022427785419532322,
305
+ "loss": 0.8495,
306
+ "step": 470
307
+ },
308
+ {
309
+ "epoch": 0.93,
310
+ "learning_rate": 0.00022221458046767537,
311
+ "loss": 0.8502,
312
+ "step": 480
313
+ },
314
+ {
315
+ "epoch": 0.94,
316
+ "learning_rate": 0.0002201513067400275,
317
+ "loss": 0.8601,
318
+ "step": 490
319
+ },
320
+ {
321
+ "epoch": 0.96,
322
+ "learning_rate": 0.00021808803301237963,
323
+ "loss": 0.8503,
324
+ "step": 500
325
+ },
326
+ {
327
+ "epoch": 0.98,
328
+ "learning_rate": 0.00021602475928473176,
329
+ "loss": 0.8517,
330
+ "step": 510
331
+ },
332
+ {
333
+ "epoch": 1.0,
334
+ "learning_rate": 0.00021396148555708389,
335
+ "loss": 0.8633,
336
+ "step": 520
337
+ },
338
+ {
339
+ "epoch": 1.02,
340
+ "learning_rate": 0.00021189821182943601,
341
+ "loss": 0.853,
342
+ "step": 530
343
+ },
344
+ {
345
+ "epoch": 1.04,
346
+ "learning_rate": 0.00020983493810178817,
347
+ "loss": 0.8379,
348
+ "step": 540
349
+ },
350
+ {
351
+ "epoch": 1.06,
352
+ "learning_rate": 0.0002077716643741403,
353
+ "loss": 0.8396,
354
+ "step": 550
355
+ },
356
+ {
357
+ "epoch": 1.08,
358
+ "learning_rate": 0.00020570839064649242,
359
+ "loss": 0.8569,
360
+ "step": 560
361
+ },
362
+ {
363
+ "epoch": 1.1,
364
+ "learning_rate": 0.00020364511691884455,
365
+ "loss": 0.8594,
366
+ "step": 570
367
+ },
368
+ {
369
+ "epoch": 1.12,
370
+ "learning_rate": 0.00020158184319119668,
371
+ "loss": 0.8461,
372
+ "step": 580
373
+ },
374
+ {
375
+ "epoch": 1.14,
376
+ "learning_rate": 0.00019951856946354884,
377
+ "loss": 0.8429,
378
+ "step": 590
379
+ },
380
+ {
381
+ "epoch": 1.16,
382
+ "learning_rate": 0.00019745529573590096,
383
+ "loss": 0.8308,
384
+ "step": 600
385
+ },
386
+ {
387
+ "epoch": 1.16,
388
+ "eval_loss": 0.8592662215232849,
389
+ "eval_runtime": 175.4031,
390
+ "eval_samples_per_second": 11.402,
391
+ "eval_steps_per_second": 0.285,
392
+ "step": 600
393
+ },
394
+ {
395
+ "epoch": 1.18,
396
+ "learning_rate": 0.0001953920220082531,
397
+ "loss": 0.8355,
398
+ "step": 610
399
+ },
400
+ {
401
+ "epoch": 1.2,
402
+ "learning_rate": 0.0001933287482806052,
403
+ "loss": 0.834,
404
+ "step": 620
405
+ },
406
+ {
407
+ "epoch": 1.21,
408
+ "learning_rate": 0.00019126547455295732,
409
+ "loss": 0.8299,
410
+ "step": 630
411
+ },
412
+ {
413
+ "epoch": 1.23,
414
+ "learning_rate": 0.00018920220082530945,
415
+ "loss": 0.8431,
416
+ "step": 640
417
+ },
418
+ {
419
+ "epoch": 1.25,
420
+ "learning_rate": 0.00018713892709766163,
421
+ "loss": 0.841,
422
+ "step": 650
423
+ },
424
+ {
425
+ "epoch": 1.27,
426
+ "learning_rate": 0.00018507565337001373,
427
+ "loss": 0.8374,
428
+ "step": 660
429
+ },
430
+ {
431
+ "epoch": 1.29,
432
+ "learning_rate": 0.00018301237964236586,
433
+ "loss": 0.84,
434
+ "step": 670
435
+ },
436
+ {
437
+ "epoch": 1.31,
438
+ "learning_rate": 0.000180949105914718,
439
+ "loss": 0.8365,
440
+ "step": 680
441
+ },
442
+ {
443
+ "epoch": 1.33,
444
+ "learning_rate": 0.00017888583218707011,
445
+ "loss": 0.8287,
446
+ "step": 690
447
+ },
448
+ {
449
+ "epoch": 1.35,
450
+ "learning_rate": 0.00017682255845942227,
451
+ "loss": 0.8334,
452
+ "step": 700
453
+ },
454
+ {
455
+ "epoch": 1.37,
456
+ "learning_rate": 0.0001747592847317744,
457
+ "loss": 0.8445,
458
+ "step": 710
459
+ },
460
+ {
461
+ "epoch": 1.39,
462
+ "learning_rate": 0.00017269601100412653,
463
+ "loss": 0.8286,
464
+ "step": 720
465
+ },
466
+ {
467
+ "epoch": 1.41,
468
+ "learning_rate": 0.00017063273727647865,
469
+ "loss": 0.8266,
470
+ "step": 730
471
+ },
472
+ {
473
+ "epoch": 1.43,
474
+ "learning_rate": 0.00016856946354883078,
475
+ "loss": 0.8276,
476
+ "step": 740
477
+ },
478
+ {
479
+ "epoch": 1.45,
480
+ "learning_rate": 0.00016650618982118294,
481
+ "loss": 0.8386,
482
+ "step": 750
483
+ },
484
+ {
485
+ "epoch": 1.47,
486
+ "learning_rate": 0.00016444291609353506,
487
+ "loss": 0.8349,
488
+ "step": 760
489
+ },
490
+ {
491
+ "epoch": 1.48,
492
+ "learning_rate": 0.0001623796423658872,
493
+ "loss": 0.8292,
494
+ "step": 770
495
+ },
496
+ {
497
+ "epoch": 1.5,
498
+ "learning_rate": 0.00016031636863823932,
499
+ "loss": 0.8354,
500
+ "step": 780
501
+ },
502
+ {
503
+ "epoch": 1.52,
504
+ "learning_rate": 0.00015825309491059145,
505
+ "loss": 0.8306,
506
+ "step": 790
507
+ },
508
+ {
509
+ "epoch": 1.54,
510
+ "learning_rate": 0.00015618982118294358,
511
+ "loss": 0.8346,
512
+ "step": 800
513
+ },
514
+ {
515
+ "epoch": 1.54,
516
+ "eval_loss": 0.856368899345398,
517
+ "eval_runtime": 175.5494,
518
+ "eval_samples_per_second": 11.393,
519
+ "eval_steps_per_second": 0.285,
520
+ "step": 800
521
+ },
522
+ {
523
+ "epoch": 1.56,
524
+ "learning_rate": 0.00015412654745529573,
525
+ "loss": 0.8244,
526
+ "step": 810
527
+ },
528
+ {
529
+ "epoch": 1.58,
530
+ "learning_rate": 0.00015206327372764786,
531
+ "loss": 0.8129,
532
+ "step": 820
533
+ },
534
+ {
535
+ "epoch": 1.6,
536
+ "learning_rate": 0.00015,
537
+ "loss": 0.8216,
538
+ "step": 830
539
+ },
540
+ {
541
+ "epoch": 1.62,
542
+ "learning_rate": 0.00014793672627235211,
543
+ "loss": 0.8205,
544
+ "step": 840
545
+ },
546
+ {
547
+ "epoch": 1.64,
548
+ "learning_rate": 0.00014587345254470424,
549
+ "loss": 0.8215,
550
+ "step": 850
551
+ },
552
+ {
553
+ "epoch": 1.66,
554
+ "learning_rate": 0.00014381017881705637,
555
+ "loss": 0.8467,
556
+ "step": 860
557
+ },
558
+ {
559
+ "epoch": 1.68,
560
+ "learning_rate": 0.0001417469050894085,
561
+ "loss": 0.8258,
562
+ "step": 870
563
+ },
564
+ {
565
+ "epoch": 1.7,
566
+ "learning_rate": 0.00013968363136176065,
567
+ "loss": 0.8277,
568
+ "step": 880
569
+ },
570
+ {
571
+ "epoch": 1.72,
572
+ "learning_rate": 0.00013762035763411278,
573
+ "loss": 0.8249,
574
+ "step": 890
575
+ },
576
+ {
577
+ "epoch": 1.74,
578
+ "learning_rate": 0.0001355570839064649,
579
+ "loss": 0.8226,
580
+ "step": 900
581
+ },
582
+ {
583
+ "epoch": 1.75,
584
+ "learning_rate": 0.00013349381017881704,
585
+ "loss": 0.8268,
586
+ "step": 910
587
+ },
588
+ {
589
+ "epoch": 1.77,
590
+ "learning_rate": 0.00013143053645116917,
591
+ "loss": 0.8321,
592
+ "step": 920
593
+ },
594
+ {
595
+ "epoch": 1.79,
596
+ "learning_rate": 0.00012936726272352132,
597
+ "loss": 0.8163,
598
+ "step": 930
599
+ },
600
+ {
601
+ "epoch": 1.81,
602
+ "learning_rate": 0.00012730398899587345,
603
+ "loss": 0.8352,
604
+ "step": 940
605
+ },
606
+ {
607
+ "epoch": 1.83,
608
+ "learning_rate": 0.00012524071526822558,
609
+ "loss": 0.8106,
610
+ "step": 950
611
+ },
612
+ {
613
+ "epoch": 1.85,
614
+ "learning_rate": 0.0001231774415405777,
615
+ "loss": 0.8017,
616
+ "step": 960
617
+ },
618
+ {
619
+ "epoch": 1.87,
620
+ "learning_rate": 0.00012111416781292983,
621
+ "loss": 0.8244,
622
+ "step": 970
623
+ },
624
+ {
625
+ "epoch": 1.89,
626
+ "learning_rate": 0.00011905089408528197,
627
+ "loss": 0.8363,
628
+ "step": 980
629
+ },
630
+ {
631
+ "epoch": 1.91,
632
+ "learning_rate": 0.0001169876203576341,
633
+ "loss": 0.8307,
634
+ "step": 990
635
+ },
636
+ {
637
+ "epoch": 1.93,
638
+ "learning_rate": 0.00011492434662998623,
639
+ "loss": 0.8239,
640
+ "step": 1000
641
+ },
642
+ {
643
+ "epoch": 1.93,
644
+ "eval_loss": 0.8538553714752197,
645
+ "eval_runtime": 175.4751,
646
+ "eval_samples_per_second": 11.398,
647
+ "eval_steps_per_second": 0.285,
648
+ "step": 1000
649
+ },
650
+ {
651
+ "epoch": 1.95,
652
+ "learning_rate": 0.00011286107290233837,
653
+ "loss": 0.8269,
654
+ "step": 1010
655
+ },
656
+ {
657
+ "epoch": 1.97,
658
+ "learning_rate": 0.0001107977991746905,
659
+ "loss": 0.8238,
660
+ "step": 1020
661
+ },
662
+ {
663
+ "epoch": 1.99,
664
+ "learning_rate": 0.00010873452544704263,
665
+ "loss": 0.8175,
666
+ "step": 1030
667
+ },
668
+ {
669
+ "epoch": 2.01,
670
+ "learning_rate": 0.00010667125171939477,
671
+ "loss": 0.8139,
672
+ "step": 1040
673
+ },
674
+ {
675
+ "epoch": 2.02,
676
+ "learning_rate": 0.0001046079779917469,
677
+ "loss": 0.8025,
678
+ "step": 1050
679
+ },
680
+ {
681
+ "epoch": 2.04,
682
+ "learning_rate": 0.00010254470426409902,
683
+ "loss": 0.8056,
684
+ "step": 1060
685
+ },
686
+ {
687
+ "epoch": 2.06,
688
+ "learning_rate": 0.00010048143053645117,
689
+ "loss": 0.8111,
690
+ "step": 1070
691
+ },
692
+ {
693
+ "epoch": 2.08,
694
+ "learning_rate": 9.841815680880329e-05,
695
+ "loss": 0.8163,
696
+ "step": 1080
697
+ },
698
+ {
699
+ "epoch": 2.1,
700
+ "learning_rate": 9.635488308115543e-05,
701
+ "loss": 0.8198,
702
+ "step": 1090
703
+ },
704
+ {
705
+ "epoch": 2.12,
706
+ "learning_rate": 9.429160935350756e-05,
707
+ "loss": 0.8019,
708
+ "step": 1100
709
+ },
710
+ {
711
+ "epoch": 2.14,
712
+ "learning_rate": 9.222833562585969e-05,
713
+ "loss": 0.8032,
714
+ "step": 1110
715
+ },
716
+ {
717
+ "epoch": 2.16,
718
+ "learning_rate": 9.016506189821183e-05,
719
+ "loss": 0.8105,
720
+ "step": 1120
721
+ },
722
+ {
723
+ "epoch": 2.18,
724
+ "learning_rate": 8.810178817056396e-05,
725
+ "loss": 0.8038,
726
+ "step": 1130
727
+ },
728
+ {
729
+ "epoch": 2.2,
730
+ "learning_rate": 8.603851444291607e-05,
731
+ "loss": 0.8089,
732
+ "step": 1140
733
+ },
734
+ {
735
+ "epoch": 2.22,
736
+ "learning_rate": 8.397524071526822e-05,
737
+ "loss": 0.7989,
738
+ "step": 1150
739
+ },
740
+ {
741
+ "epoch": 2.24,
742
+ "learning_rate": 8.191196698762034e-05,
743
+ "loss": 0.8091,
744
+ "step": 1160
745
+ },
746
+ {
747
+ "epoch": 2.26,
748
+ "learning_rate": 7.984869325997248e-05,
749
+ "loss": 0.8178,
750
+ "step": 1170
751
+ },
752
+ {
753
+ "epoch": 2.28,
754
+ "learning_rate": 7.778541953232461e-05,
755
+ "loss": 0.8027,
756
+ "step": 1180
757
+ },
758
+ {
759
+ "epoch": 2.29,
760
+ "learning_rate": 7.572214580467674e-05,
761
+ "loss": 0.8074,
762
+ "step": 1190
763
+ },
764
+ {
765
+ "epoch": 2.31,
766
+ "learning_rate": 7.365887207702888e-05,
767
+ "loss": 0.8106,
768
+ "step": 1200
769
+ },
770
+ {
771
+ "epoch": 2.31,
772
+ "eval_loss": 0.8536396622657776,
773
+ "eval_runtime": 175.6344,
774
+ "eval_samples_per_second": 11.387,
775
+ "eval_steps_per_second": 0.285,
776
+ "step": 1200
777
+ }
778
+ ],
779
+ "max_steps": 1554,
780
+ "num_train_epochs": 3,
781
+ "total_flos": 1.0681954887765328e+19,
782
+ "trial_name": null,
783
+ "trial_params": null
784
+ }
lora-alpaca/checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2a2253014a1304b4da9f1ad32740c85d716b2f80c4fa4cafd7302357e61b5d
3
+ size 3579
lora-alpaca/checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac446fec77e9d4ee5aa51cd382179afa490761ca791148f4bbd9b59c2cbddd14
3
+ size 409356413
lora-alpaca/checkpoint-1400/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faaf6ef20061bfc08a4b51d86ebf694ba542f6557dbe6213dd4e13e4034a4592
3
+ size 204646285
lora-alpaca/checkpoint-1400/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce3cbba1287ffb89ed45576bf524b75d2e88312d62e0d5855d120300df4ad8a6
3
+ size 14583
lora-alpaca/checkpoint-1400/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbe47f489f3c12eaafb37e24a399690d537b7535d72448bbdede3f4b061ef33
3
+ size 14583
lora-alpaca/checkpoint-1400/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0341d52e59ad2e4c6bc78603be87f2328a7d2ef67058b348c9bb41e09f19176
3
+ size 14583
lora-alpaca/checkpoint-1400/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42cc4099e89b0a58518f2c30a3eb86252a9fbcedf3642c49111d4a8545b1e2d1
3
+ size 14583
lora-alpaca/checkpoint-1400/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:721be9068410a21b87596bb9d3ca042e78fb8cee812b82642abedfe4c2e4366f
3
+ size 14583
lora-alpaca/checkpoint-1400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57785e78a54b2f1a6cef554d692ad7b1a86718fd255b0663e59d69aabfb85535
3
+ size 557
lora-alpaca/checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73d491f5fbc2ddf0b773c4f9f82b8a35e6571e139a979e8eba6315fd9524b6ef
3
+ size 627
lora-alpaca/checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,912 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8533167839050293,
3
+ "best_model_checkpoint": "./lora-alpaca/checkpoint-1400",
4
+ "epoch": 2.699575780948708,
5
+ "global_step": 1400,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2.6999999999999996e-05,
13
+ "loss": 1.554,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 5.6999999999999996e-05,
19
+ "loss": 1.4793,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 8.4e-05,
25
+ "loss": 1.2861,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 0.00011099999999999999,
31
+ "loss": 1.1305,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.00014099999999999998,
37
+ "loss": 1.0653,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.12,
42
+ "learning_rate": 0.00017099999999999998,
43
+ "loss": 1.0233,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.13,
48
+ "learning_rate": 0.000201,
49
+ "loss": 0.9835,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.15,
54
+ "learning_rate": 0.00023099999999999998,
55
+ "loss": 0.9385,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "learning_rate": 0.000261,
61
+ "loss": 0.9179,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.19,
66
+ "learning_rate": 0.00029099999999999997,
67
+ "loss": 0.9107,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.21,
72
+ "learning_rate": 0.0002985557083906465,
73
+ "loss": 0.914,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.23,
78
+ "learning_rate": 0.0002964924346629986,
79
+ "loss": 0.8971,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.25,
84
+ "learning_rate": 0.00029442916093535074,
85
+ "loss": 0.9021,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.27,
90
+ "learning_rate": 0.00029236588720770286,
91
+ "loss": 0.8939,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.29,
96
+ "learning_rate": 0.000290302613480055,
97
+ "loss": 0.8917,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.31,
102
+ "learning_rate": 0.0002882393397524071,
103
+ "loss": 0.8834,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.33,
108
+ "learning_rate": 0.00028617606602475925,
109
+ "loss": 0.8927,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.35,
114
+ "learning_rate": 0.0002841127922971114,
115
+ "loss": 0.8979,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.37,
120
+ "learning_rate": 0.0002820495185694635,
121
+ "loss": 0.8812,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.39,
126
+ "learning_rate": 0.00027998624484181563,
127
+ "loss": 0.8852,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.39,
132
+ "eval_loss": 0.8834338188171387,
133
+ "eval_runtime": 172.9592,
134
+ "eval_samples_per_second": 11.563,
135
+ "eval_steps_per_second": 0.289,
136
+ "step": 200
137
+ },
138
+ {
139
+ "epoch": 0.4,
140
+ "learning_rate": 0.0002779229711141678,
141
+ "loss": 0.8837,
142
+ "step": 210
143
+ },
144
+ {
145
+ "epoch": 0.42,
146
+ "learning_rate": 0.00027585969738651994,
147
+ "loss": 0.8837,
148
+ "step": 220
149
+ },
150
+ {
151
+ "epoch": 0.44,
152
+ "learning_rate": 0.00027379642365887207,
153
+ "loss": 0.8703,
154
+ "step": 230
155
+ },
156
+ {
157
+ "epoch": 0.46,
158
+ "learning_rate": 0.0002717331499312242,
159
+ "loss": 0.885,
160
+ "step": 240
161
+ },
162
+ {
163
+ "epoch": 0.48,
164
+ "learning_rate": 0.0002696698762035763,
165
+ "loss": 0.877,
166
+ "step": 250
167
+ },
168
+ {
169
+ "epoch": 0.5,
170
+ "learning_rate": 0.00026760660247592845,
171
+ "loss": 0.8698,
172
+ "step": 260
173
+ },
174
+ {
175
+ "epoch": 0.52,
176
+ "learning_rate": 0.0002655433287482806,
177
+ "loss": 0.8698,
178
+ "step": 270
179
+ },
180
+ {
181
+ "epoch": 0.54,
182
+ "learning_rate": 0.0002634800550206327,
183
+ "loss": 0.8732,
184
+ "step": 280
185
+ },
186
+ {
187
+ "epoch": 0.56,
188
+ "learning_rate": 0.00026141678129298484,
189
+ "loss": 0.8571,
190
+ "step": 290
191
+ },
192
+ {
193
+ "epoch": 0.58,
194
+ "learning_rate": 0.00025935350756533696,
195
+ "loss": 0.8597,
196
+ "step": 300
197
+ },
198
+ {
199
+ "epoch": 0.6,
200
+ "learning_rate": 0.0002572902338376891,
201
+ "loss": 0.8719,
202
+ "step": 310
203
+ },
204
+ {
205
+ "epoch": 0.62,
206
+ "learning_rate": 0.0002552269601100413,
207
+ "loss": 0.8624,
208
+ "step": 320
209
+ },
210
+ {
211
+ "epoch": 0.64,
212
+ "learning_rate": 0.0002531636863823934,
213
+ "loss": 0.8577,
214
+ "step": 330
215
+ },
216
+ {
217
+ "epoch": 0.66,
218
+ "learning_rate": 0.00025110041265474553,
219
+ "loss": 0.8624,
220
+ "step": 340
221
+ },
222
+ {
223
+ "epoch": 0.67,
224
+ "learning_rate": 0.00024903713892709766,
225
+ "loss": 0.8765,
226
+ "step": 350
227
+ },
228
+ {
229
+ "epoch": 0.69,
230
+ "learning_rate": 0.0002469738651994498,
231
+ "loss": 0.8595,
232
+ "step": 360
233
+ },
234
+ {
235
+ "epoch": 0.71,
236
+ "learning_rate": 0.0002449105914718019,
237
+ "loss": 0.8646,
238
+ "step": 370
239
+ },
240
+ {
241
+ "epoch": 0.73,
242
+ "learning_rate": 0.00024284731774415404,
243
+ "loss": 0.87,
244
+ "step": 380
245
+ },
246
+ {
247
+ "epoch": 0.75,
248
+ "learning_rate": 0.00024078404401650617,
249
+ "loss": 0.8568,
250
+ "step": 390
251
+ },
252
+ {
253
+ "epoch": 0.77,
254
+ "learning_rate": 0.0002387207702888583,
255
+ "loss": 0.8571,
256
+ "step": 400
257
+ },
258
+ {
259
+ "epoch": 0.77,
260
+ "eval_loss": 0.8650650382041931,
261
+ "eval_runtime": 175.1777,
262
+ "eval_samples_per_second": 11.417,
263
+ "eval_steps_per_second": 0.285,
264
+ "step": 400
265
+ },
266
+ {
267
+ "epoch": 0.79,
268
+ "learning_rate": 0.00023665749656121043,
269
+ "loss": 0.8495,
270
+ "step": 410
271
+ },
272
+ {
273
+ "epoch": 0.81,
274
+ "learning_rate": 0.00023459422283356255,
275
+ "loss": 0.8572,
276
+ "step": 420
277
+ },
278
+ {
279
+ "epoch": 0.83,
280
+ "learning_rate": 0.0002325309491059147,
281
+ "loss": 0.8629,
282
+ "step": 430
283
+ },
284
+ {
285
+ "epoch": 0.85,
286
+ "learning_rate": 0.00023046767537826684,
287
+ "loss": 0.8413,
288
+ "step": 440
289
+ },
290
+ {
291
+ "epoch": 0.87,
292
+ "learning_rate": 0.00022840440165061896,
293
+ "loss": 0.853,
294
+ "step": 450
295
+ },
296
+ {
297
+ "epoch": 0.89,
298
+ "learning_rate": 0.0002263411279229711,
299
+ "loss": 0.8662,
300
+ "step": 460
301
+ },
302
+ {
303
+ "epoch": 0.91,
304
+ "learning_rate": 0.00022427785419532322,
305
+ "loss": 0.8495,
306
+ "step": 470
307
+ },
308
+ {
309
+ "epoch": 0.93,
310
+ "learning_rate": 0.00022221458046767537,
311
+ "loss": 0.8502,
312
+ "step": 480
313
+ },
314
+ {
315
+ "epoch": 0.94,
316
+ "learning_rate": 0.0002201513067400275,
317
+ "loss": 0.8601,
318
+ "step": 490
319
+ },
320
+ {
321
+ "epoch": 0.96,
322
+ "learning_rate": 0.00021808803301237963,
323
+ "loss": 0.8503,
324
+ "step": 500
325
+ },
326
+ {
327
+ "epoch": 0.98,
328
+ "learning_rate": 0.00021602475928473176,
329
+ "loss": 0.8517,
330
+ "step": 510
331
+ },
332
+ {
333
+ "epoch": 1.0,
334
+ "learning_rate": 0.00021396148555708389,
335
+ "loss": 0.8633,
336
+ "step": 520
337
+ },
338
+ {
339
+ "epoch": 1.02,
340
+ "learning_rate": 0.00021189821182943601,
341
+ "loss": 0.853,
342
+ "step": 530
343
+ },
344
+ {
345
+ "epoch": 1.04,
346
+ "learning_rate": 0.00020983493810178817,
347
+ "loss": 0.8379,
348
+ "step": 540
349
+ },
350
+ {
351
+ "epoch": 1.06,
352
+ "learning_rate": 0.0002077716643741403,
353
+ "loss": 0.8396,
354
+ "step": 550
355
+ },
356
+ {
357
+ "epoch": 1.08,
358
+ "learning_rate": 0.00020570839064649242,
359
+ "loss": 0.8569,
360
+ "step": 560
361
+ },
362
+ {
363
+ "epoch": 1.1,
364
+ "learning_rate": 0.00020364511691884455,
365
+ "loss": 0.8594,
366
+ "step": 570
367
+ },
368
+ {
369
+ "epoch": 1.12,
370
+ "learning_rate": 0.00020158184319119668,
371
+ "loss": 0.8461,
372
+ "step": 580
373
+ },
374
+ {
375
+ "epoch": 1.14,
376
+ "learning_rate": 0.00019951856946354884,
377
+ "loss": 0.8429,
378
+ "step": 590
379
+ },
380
+ {
381
+ "epoch": 1.16,
382
+ "learning_rate": 0.00019745529573590096,
383
+ "loss": 0.8308,
384
+ "step": 600
385
+ },
386
+ {
387
+ "epoch": 1.16,
388
+ "eval_loss": 0.8592662215232849,
389
+ "eval_runtime": 175.4031,
390
+ "eval_samples_per_second": 11.402,
391
+ "eval_steps_per_second": 0.285,
392
+ "step": 600
393
+ },
394
+ {
395
+ "epoch": 1.18,
396
+ "learning_rate": 0.0001953920220082531,
397
+ "loss": 0.8355,
398
+ "step": 610
399
+ },
400
+ {
401
+ "epoch": 1.2,
402
+ "learning_rate": 0.0001933287482806052,
403
+ "loss": 0.834,
404
+ "step": 620
405
+ },
406
+ {
407
+ "epoch": 1.21,
408
+ "learning_rate": 0.00019126547455295732,
409
+ "loss": 0.8299,
410
+ "step": 630
411
+ },
412
+ {
413
+ "epoch": 1.23,
414
+ "learning_rate": 0.00018920220082530945,
415
+ "loss": 0.8431,
416
+ "step": 640
417
+ },
418
+ {
419
+ "epoch": 1.25,
420
+ "learning_rate": 0.00018713892709766163,
421
+ "loss": 0.841,
422
+ "step": 650
423
+ },
424
+ {
425
+ "epoch": 1.27,
426
+ "learning_rate": 0.00018507565337001373,
427
+ "loss": 0.8374,
428
+ "step": 660
429
+ },
430
+ {
431
+ "epoch": 1.29,
432
+ "learning_rate": 0.00018301237964236586,
433
+ "loss": 0.84,
434
+ "step": 670
435
+ },
436
+ {
437
+ "epoch": 1.31,
438
+ "learning_rate": 0.000180949105914718,
439
+ "loss": 0.8365,
440
+ "step": 680
441
+ },
442
+ {
443
+ "epoch": 1.33,
444
+ "learning_rate": 0.00017888583218707011,
445
+ "loss": 0.8287,
446
+ "step": 690
447
+ },
448
+ {
449
+ "epoch": 1.35,
450
+ "learning_rate": 0.00017682255845942227,
451
+ "loss": 0.8334,
452
+ "step": 700
453
+ },
454
+ {
455
+ "epoch": 1.37,
456
+ "learning_rate": 0.0001747592847317744,
457
+ "loss": 0.8445,
458
+ "step": 710
459
+ },
460
+ {
461
+ "epoch": 1.39,
462
+ "learning_rate": 0.00017269601100412653,
463
+ "loss": 0.8286,
464
+ "step": 720
465
+ },
466
+ {
467
+ "epoch": 1.41,
468
+ "learning_rate": 0.00017063273727647865,
469
+ "loss": 0.8266,
470
+ "step": 730
471
+ },
472
+ {
473
+ "epoch": 1.43,
474
+ "learning_rate": 0.00016856946354883078,
475
+ "loss": 0.8276,
476
+ "step": 740
477
+ },
478
+ {
479
+ "epoch": 1.45,
480
+ "learning_rate": 0.00016650618982118294,
481
+ "loss": 0.8386,
482
+ "step": 750
483
+ },
484
+ {
485
+ "epoch": 1.47,
486
+ "learning_rate": 0.00016444291609353506,
487
+ "loss": 0.8349,
488
+ "step": 760
489
+ },
490
+ {
491
+ "epoch": 1.48,
492
+ "learning_rate": 0.0001623796423658872,
493
+ "loss": 0.8292,
494
+ "step": 770
495
+ },
496
+ {
497
+ "epoch": 1.5,
498
+ "learning_rate": 0.00016031636863823932,
499
+ "loss": 0.8354,
500
+ "step": 780
501
+ },
502
+ {
503
+ "epoch": 1.52,
504
+ "learning_rate": 0.00015825309491059145,
505
+ "loss": 0.8306,
506
+ "step": 790
507
+ },
508
+ {
509
+ "epoch": 1.54,
510
+ "learning_rate": 0.00015618982118294358,
511
+ "loss": 0.8346,
512
+ "step": 800
513
+ },
514
+ {
515
+ "epoch": 1.54,
516
+ "eval_loss": 0.856368899345398,
517
+ "eval_runtime": 175.5494,
518
+ "eval_samples_per_second": 11.393,
519
+ "eval_steps_per_second": 0.285,
520
+ "step": 800
521
+ },
522
+ {
523
+ "epoch": 1.56,
524
+ "learning_rate": 0.00015412654745529573,
525
+ "loss": 0.8244,
526
+ "step": 810
527
+ },
528
+ {
529
+ "epoch": 1.58,
530
+ "learning_rate": 0.00015206327372764786,
531
+ "loss": 0.8129,
532
+ "step": 820
533
+ },
534
+ {
535
+ "epoch": 1.6,
536
+ "learning_rate": 0.00015,
537
+ "loss": 0.8216,
538
+ "step": 830
539
+ },
540
+ {
541
+ "epoch": 1.62,
542
+ "learning_rate": 0.00014793672627235211,
543
+ "loss": 0.8205,
544
+ "step": 840
545
+ },
546
+ {
547
+ "epoch": 1.64,
548
+ "learning_rate": 0.00014587345254470424,
549
+ "loss": 0.8215,
550
+ "step": 850
551
+ },
552
+ {
553
+ "epoch": 1.66,
554
+ "learning_rate": 0.00014381017881705637,
555
+ "loss": 0.8467,
556
+ "step": 860
557
+ },
558
+ {
559
+ "epoch": 1.68,
560
+ "learning_rate": 0.0001417469050894085,
561
+ "loss": 0.8258,
562
+ "step": 870
563
+ },
564
+ {
565
+ "epoch": 1.7,
566
+ "learning_rate": 0.00013968363136176065,
567
+ "loss": 0.8277,
568
+ "step": 880
569
+ },
570
+ {
571
+ "epoch": 1.72,
572
+ "learning_rate": 0.00013762035763411278,
573
+ "loss": 0.8249,
574
+ "step": 890
575
+ },
576
+ {
577
+ "epoch": 1.74,
578
+ "learning_rate": 0.0001355570839064649,
579
+ "loss": 0.8226,
580
+ "step": 900
581
+ },
582
+ {
583
+ "epoch": 1.75,
584
+ "learning_rate": 0.00013349381017881704,
585
+ "loss": 0.8268,
586
+ "step": 910
587
+ },
588
+ {
589
+ "epoch": 1.77,
590
+ "learning_rate": 0.00013143053645116917,
591
+ "loss": 0.8321,
592
+ "step": 920
593
+ },
594
+ {
595
+ "epoch": 1.79,
596
+ "learning_rate": 0.00012936726272352132,
597
+ "loss": 0.8163,
598
+ "step": 930
599
+ },
600
+ {
601
+ "epoch": 1.81,
602
+ "learning_rate": 0.00012730398899587345,
603
+ "loss": 0.8352,
604
+ "step": 940
605
+ },
606
+ {
607
+ "epoch": 1.83,
608
+ "learning_rate": 0.00012524071526822558,
609
+ "loss": 0.8106,
610
+ "step": 950
611
+ },
612
+ {
613
+ "epoch": 1.85,
614
+ "learning_rate": 0.0001231774415405777,
615
+ "loss": 0.8017,
616
+ "step": 960
617
+ },
618
+ {
619
+ "epoch": 1.87,
620
+ "learning_rate": 0.00012111416781292983,
621
+ "loss": 0.8244,
622
+ "step": 970
623
+ },
624
+ {
625
+ "epoch": 1.89,
626
+ "learning_rate": 0.00011905089408528197,
627
+ "loss": 0.8363,
628
+ "step": 980
629
+ },
630
+ {
631
+ "epoch": 1.91,
632
+ "learning_rate": 0.0001169876203576341,
633
+ "loss": 0.8307,
634
+ "step": 990
635
+ },
636
+ {
637
+ "epoch": 1.93,
638
+ "learning_rate": 0.00011492434662998623,
639
+ "loss": 0.8239,
640
+ "step": 1000
641
+ },
642
+ {
643
+ "epoch": 1.93,
644
+ "eval_loss": 0.8538553714752197,
645
+ "eval_runtime": 175.4751,
646
+ "eval_samples_per_second": 11.398,
647
+ "eval_steps_per_second": 0.285,
648
+ "step": 1000
649
+ },
650
+ {
651
+ "epoch": 1.95,
652
+ "learning_rate": 0.00011286107290233837,
653
+ "loss": 0.8269,
654
+ "step": 1010
655
+ },
656
+ {
657
+ "epoch": 1.97,
658
+ "learning_rate": 0.0001107977991746905,
659
+ "loss": 0.8238,
660
+ "step": 1020
661
+ },
662
+ {
663
+ "epoch": 1.99,
664
+ "learning_rate": 0.00010873452544704263,
665
+ "loss": 0.8175,
666
+ "step": 1030
667
+ },
668
+ {
669
+ "epoch": 2.01,
670
+ "learning_rate": 0.00010667125171939477,
671
+ "loss": 0.8139,
672
+ "step": 1040
673
+ },
674
+ {
675
+ "epoch": 2.02,
676
+ "learning_rate": 0.0001046079779917469,
677
+ "loss": 0.8025,
678
+ "step": 1050
679
+ },
680
+ {
681
+ "epoch": 2.04,
682
+ "learning_rate": 0.00010254470426409902,
683
+ "loss": 0.8056,
684
+ "step": 1060
685
+ },
686
+ {
687
+ "epoch": 2.06,
688
+ "learning_rate": 0.00010048143053645117,
689
+ "loss": 0.8111,
690
+ "step": 1070
691
+ },
692
+ {
693
+ "epoch": 2.08,
694
+ "learning_rate": 9.841815680880329e-05,
695
+ "loss": 0.8163,
696
+ "step": 1080
697
+ },
698
+ {
699
+ "epoch": 2.1,
700
+ "learning_rate": 9.635488308115543e-05,
701
+ "loss": 0.8198,
702
+ "step": 1090
703
+ },
704
+ {
705
+ "epoch": 2.12,
706
+ "learning_rate": 9.429160935350756e-05,
707
+ "loss": 0.8019,
708
+ "step": 1100
709
+ },
710
+ {
711
+ "epoch": 2.14,
712
+ "learning_rate": 9.222833562585969e-05,
713
+ "loss": 0.8032,
714
+ "step": 1110
715
+ },
716
+ {
717
+ "epoch": 2.16,
718
+ "learning_rate": 9.016506189821183e-05,
719
+ "loss": 0.8105,
720
+ "step": 1120
721
+ },
722
+ {
723
+ "epoch": 2.18,
724
+ "learning_rate": 8.810178817056396e-05,
725
+ "loss": 0.8038,
726
+ "step": 1130
727
+ },
728
+ {
729
+ "epoch": 2.2,
730
+ "learning_rate": 8.603851444291607e-05,
731
+ "loss": 0.8089,
732
+ "step": 1140
733
+ },
734
+ {
735
+ "epoch": 2.22,
736
+ "learning_rate": 8.397524071526822e-05,
737
+ "loss": 0.7989,
738
+ "step": 1150
739
+ },
740
+ {
741
+ "epoch": 2.24,
742
+ "learning_rate": 8.191196698762034e-05,
743
+ "loss": 0.8091,
744
+ "step": 1160
745
+ },
746
+ {
747
+ "epoch": 2.26,
748
+ "learning_rate": 7.984869325997248e-05,
749
+ "loss": 0.8178,
750
+ "step": 1170
751
+ },
752
+ {
753
+ "epoch": 2.28,
754
+ "learning_rate": 7.778541953232461e-05,
755
+ "loss": 0.8027,
756
+ "step": 1180
757
+ },
758
+ {
759
+ "epoch": 2.29,
760
+ "learning_rate": 7.572214580467674e-05,
761
+ "loss": 0.8074,
762
+ "step": 1190
763
+ },
764
+ {
765
+ "epoch": 2.31,
766
+ "learning_rate": 7.365887207702888e-05,
767
+ "loss": 0.8106,
768
+ "step": 1200
769
+ },
770
+ {
771
+ "epoch": 2.31,
772
+ "eval_loss": 0.8536396622657776,
773
+ "eval_runtime": 175.6344,
774
+ "eval_samples_per_second": 11.387,
775
+ "eval_steps_per_second": 0.285,
776
+ "step": 1200
777
+ },
778
+ {
779
+ "epoch": 2.33,
780
+ "learning_rate": 7.159559834938101e-05,
781
+ "loss": 0.7945,
782
+ "step": 1210
783
+ },
784
+ {
785
+ "epoch": 2.35,
786
+ "learning_rate": 6.953232462173315e-05,
787
+ "loss": 0.7974,
788
+ "step": 1220
789
+ },
790
+ {
791
+ "epoch": 2.37,
792
+ "learning_rate": 6.746905089408527e-05,
793
+ "loss": 0.8091,
794
+ "step": 1230
795
+ },
796
+ {
797
+ "epoch": 2.39,
798
+ "learning_rate": 6.540577716643741e-05,
799
+ "loss": 0.7932,
800
+ "step": 1240
801
+ },
802
+ {
803
+ "epoch": 2.41,
804
+ "learning_rate": 6.334250343878954e-05,
805
+ "loss": 0.808,
806
+ "step": 1250
807
+ },
808
+ {
809
+ "epoch": 2.43,
810
+ "learning_rate": 6.127922971114168e-05,
811
+ "loss": 0.8068,
812
+ "step": 1260
813
+ },
814
+ {
815
+ "epoch": 2.45,
816
+ "learning_rate": 5.9215955983493804e-05,
817
+ "loss": 0.802,
818
+ "step": 1270
819
+ },
820
+ {
821
+ "epoch": 2.47,
822
+ "learning_rate": 5.715268225584594e-05,
823
+ "loss": 0.8063,
824
+ "step": 1280
825
+ },
826
+ {
827
+ "epoch": 2.49,
828
+ "learning_rate": 5.508940852819807e-05,
829
+ "loss": 0.7924,
830
+ "step": 1290
831
+ },
832
+ {
833
+ "epoch": 2.51,
834
+ "learning_rate": 5.30261348005502e-05,
835
+ "loss": 0.792,
836
+ "step": 1300
837
+ },
838
+ {
839
+ "epoch": 2.53,
840
+ "learning_rate": 5.096286107290233e-05,
841
+ "loss": 0.8072,
842
+ "step": 1310
843
+ },
844
+ {
845
+ "epoch": 2.55,
846
+ "learning_rate": 4.8899587345254464e-05,
847
+ "loss": 0.7928,
848
+ "step": 1320
849
+ },
850
+ {
851
+ "epoch": 2.56,
852
+ "learning_rate": 4.68363136176066e-05,
853
+ "loss": 0.7966,
854
+ "step": 1330
855
+ },
856
+ {
857
+ "epoch": 2.58,
858
+ "learning_rate": 4.4773039889958734e-05,
859
+ "loss": 0.809,
860
+ "step": 1340
861
+ },
862
+ {
863
+ "epoch": 2.6,
864
+ "learning_rate": 4.270976616231086e-05,
865
+ "loss": 0.795,
866
+ "step": 1350
867
+ },
868
+ {
869
+ "epoch": 2.62,
870
+ "learning_rate": 4.0646492434662996e-05,
871
+ "loss": 0.7908,
872
+ "step": 1360
873
+ },
874
+ {
875
+ "epoch": 2.64,
876
+ "learning_rate": 3.858321870701513e-05,
877
+ "loss": 0.7942,
878
+ "step": 1370
879
+ },
880
+ {
881
+ "epoch": 2.66,
882
+ "learning_rate": 3.651994497936726e-05,
883
+ "loss": 0.797,
884
+ "step": 1380
885
+ },
886
+ {
887
+ "epoch": 2.68,
888
+ "learning_rate": 3.4456671251719394e-05,
889
+ "loss": 0.7892,
890
+ "step": 1390
891
+ },
892
+ {
893
+ "epoch": 2.7,
894
+ "learning_rate": 3.239339752407152e-05,
895
+ "loss": 0.8089,
896
+ "step": 1400
897
+ },
898
+ {
899
+ "epoch": 2.7,
900
+ "eval_loss": 0.8533167839050293,
901
+ "eval_runtime": 175.364,
902
+ "eval_samples_per_second": 11.405,
903
+ "eval_steps_per_second": 0.285,
904
+ "step": 1400
905
+ }
906
+ ],
907
+ "max_steps": 1554,
908
+ "num_train_epochs": 3,
909
+ "total_flos": 1.2465067108239147e+19,
910
+ "trial_name": null,
911
+ "trial_params": null
912
+ }
lora-alpaca/checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2a2253014a1304b4da9f1ad32740c85d716b2f80c4fa4cafd7302357e61b5d
3
+ size 3579