Training in progress, step 5, checkpoint
Browse files
last-checkpoint/adapter_config.json
CHANGED
@@ -20,13 +20,13 @@
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
-
"
|
24 |
-
"k_proj",
|
25 |
-
"o_proj",
|
26 |
"v_proj",
|
27 |
"down_proj",
|
28 |
-
"
|
29 |
-
"
|
|
|
|
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
|
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"q_proj",
|
|
|
|
|
24 |
"v_proj",
|
25 |
"down_proj",
|
26 |
+
"gate_proj",
|
27 |
+
"k_proj",
|
28 |
+
"up_proj",
|
29 |
+
"o_proj"
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 45118424
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d847474f09d76040ea9a945e5aa072ca85a0825970adc2b5fdcc4e94a928b950
|
3 |
size 45118424
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 23159290
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2bc832e1d6d58a296221fd7d502797f542b8623e1ae088266b38b7a5c1d67b17
|
3 |
size 23159290
|
last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14512
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69dcafec2603b1c1ced4bbd4d60d2848e0b6db84973c4da004925986c6bce1d9
|
3 |
size 14512
|
last-checkpoint/trainer_state.json
CHANGED
@@ -10,7 +10,7 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.010309278350515464,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 2.0000000000000003e-06,
|
15 |
"loss": 0.9943,
|
16 |
"step": 1
|
@@ -18,69 +18,69 @@
|
|
18 |
{
|
19 |
"epoch": 0.010309278350515464,
|
20 |
"eval_loss": 0.9863536357879639,
|
21 |
-
"eval_runtime": 11.
|
22 |
-
"eval_samples_per_second": 13.
|
23 |
-
"eval_steps_per_second": 1.
|
24 |
"step": 1
|
25 |
},
|
26 |
{
|
27 |
"epoch": 0.020618556701030927,
|
28 |
-
"grad_norm": 0.
|
29 |
"learning_rate": 4.000000000000001e-06,
|
30 |
"loss": 0.9017,
|
31 |
"step": 2
|
32 |
},
|
33 |
{
|
34 |
"epoch": 0.020618556701030927,
|
35 |
-
"eval_loss": 0.
|
36 |
-
"eval_runtime": 11.
|
37 |
-
"eval_samples_per_second": 13.
|
38 |
-
"eval_steps_per_second": 1.
|
39 |
"step": 2
|
40 |
},
|
41 |
{
|
42 |
"epoch": 0.030927835051546393,
|
43 |
-
"grad_norm": 1.
|
44 |
"learning_rate": 6e-06,
|
45 |
-
"loss": 1.
|
46 |
"step": 3
|
47 |
},
|
48 |
{
|
49 |
"epoch": 0.030927835051546393,
|
50 |
-
"eval_loss": 0.
|
51 |
-
"eval_runtime": 11.
|
52 |
-
"eval_samples_per_second": 13.
|
53 |
-
"eval_steps_per_second": 1.
|
54 |
"step": 3
|
55 |
},
|
56 |
{
|
57 |
"epoch": 0.041237113402061855,
|
58 |
-
"grad_norm": 0.
|
59 |
"learning_rate": 8.000000000000001e-06,
|
60 |
-
"loss": 0.
|
61 |
"step": 4
|
62 |
},
|
63 |
{
|
64 |
"epoch": 0.041237113402061855,
|
65 |
-
"eval_loss": 0.
|
66 |
-
"eval_runtime": 11.
|
67 |
-
"eval_samples_per_second": 13.
|
68 |
-
"eval_steps_per_second": 1.
|
69 |
"step": 4
|
70 |
},
|
71 |
{
|
72 |
"epoch": 0.05154639175257732,
|
73 |
-
"grad_norm": 0.
|
74 |
"learning_rate": 1e-05,
|
75 |
-
"loss": 0.
|
76 |
"step": 5
|
77 |
},
|
78 |
{
|
79 |
"epoch": 0.05154639175257732,
|
80 |
-
"eval_loss": 0.
|
81 |
-
"eval_runtime": 11.
|
82 |
-
"eval_samples_per_second": 13.
|
83 |
-
"eval_steps_per_second": 1.
|
84 |
"step": 5
|
85 |
}
|
86 |
],
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.010309278350515464,
|
13 |
+
"grad_norm": 0.7860156893730164,
|
14 |
"learning_rate": 2.0000000000000003e-06,
|
15 |
"loss": 0.9943,
|
16 |
"step": 1
|
|
|
18 |
{
|
19 |
"epoch": 0.010309278350515464,
|
20 |
"eval_loss": 0.9863536357879639,
|
21 |
+
"eval_runtime": 11.8025,
|
22 |
+
"eval_samples_per_second": 13.895,
|
23 |
+
"eval_steps_per_second": 1.779,
|
24 |
"step": 1
|
25 |
},
|
26 |
{
|
27 |
"epoch": 0.020618556701030927,
|
28 |
+
"grad_norm": 0.7550910115242004,
|
29 |
"learning_rate": 4.000000000000001e-06,
|
30 |
"loss": 0.9017,
|
31 |
"step": 2
|
32 |
},
|
33 |
{
|
34 |
"epoch": 0.020618556701030927,
|
35 |
+
"eval_loss": 0.9886725544929504,
|
36 |
+
"eval_runtime": 11.9125,
|
37 |
+
"eval_samples_per_second": 13.767,
|
38 |
+
"eval_steps_per_second": 1.763,
|
39 |
"step": 2
|
40 |
},
|
41 |
{
|
42 |
"epoch": 0.030927835051546393,
|
43 |
+
"grad_norm": 1.1944996118545532,
|
44 |
"learning_rate": 6e-06,
|
45 |
+
"loss": 1.1019,
|
46 |
"step": 3
|
47 |
},
|
48 |
{
|
49 |
"epoch": 0.030927835051546393,
|
50 |
+
"eval_loss": 0.9872242212295532,
|
51 |
+
"eval_runtime": 11.8973,
|
52 |
+
"eval_samples_per_second": 13.785,
|
53 |
+
"eval_steps_per_second": 1.765,
|
54 |
"step": 3
|
55 |
},
|
56 |
{
|
57 |
"epoch": 0.041237113402061855,
|
58 |
+
"grad_norm": 0.7197827696800232,
|
59 |
"learning_rate": 8.000000000000001e-06,
|
60 |
+
"loss": 0.8137,
|
61 |
"step": 4
|
62 |
},
|
63 |
{
|
64 |
"epoch": 0.041237113402061855,
|
65 |
+
"eval_loss": 0.9864019751548767,
|
66 |
+
"eval_runtime": 11.966,
|
67 |
+
"eval_samples_per_second": 13.705,
|
68 |
+
"eval_steps_per_second": 1.755,
|
69 |
"step": 4
|
70 |
},
|
71 |
{
|
72 |
"epoch": 0.05154639175257732,
|
73 |
+
"grad_norm": 0.9193218946456909,
|
74 |
"learning_rate": 1e-05,
|
75 |
+
"loss": 0.9198,
|
76 |
"step": 5
|
77 |
},
|
78 |
{
|
79 |
"epoch": 0.05154639175257732,
|
80 |
+
"eval_loss": 0.984832227230072,
|
81 |
+
"eval_runtime": 11.8502,
|
82 |
+
"eval_samples_per_second": 13.839,
|
83 |
+
"eval_steps_per_second": 1.772,
|
84 |
"step": 5
|
85 |
}
|
86 |
],
|
last-checkpoint/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 6712
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d97c4fa40f12b62671931897af867d0478f2095eee9e2b8b99df811f0daadbfa
|
3 |
size 6712
|