KevinKibe commited on
Commit
b16e4fe
1 Parent(s): 7499115

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a2971e3fa441c6195173934959b060bb37d1918f45fcb772cd683b88b14b691
3
- size 2751039840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e803bd49f3930468bb25dec6f3b145be3563fa0473cf89fe849fd156ba829d6
3
+ size 2751040864
last-checkpoint/pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e997daa058d1d742d0043f65fc86671ea2099fdf09fb77765feeb444f4f447b
3
  size 5000078781
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b938fee26d72af327f488b7a6d3dd3965f17be62270e5fa901cc06715df86f83
3
  size 5000078781
last-checkpoint/pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0acbc24840ee77a560ac012b96b3ffc8f3e7d2bebdb9ceee0b3779a62612f5d
3
  size 482838574
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:207e1eb9852e0014a1e5967f14829387ae512bf72200a7bd8dbe6cd2b9744fa1
3
  size 482838574
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3300103406756fad9a1f1f8a21d36f1372b8c21da5142037d90f074362090e43
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfe664d6ceb782278e808b1191d468184a14f4c1487f92773562a2a538616d86
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e18c3ed3aeaa20b55f954231cfa9b3d71b919b964de3ed9a5d9be2d054fa1c6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0483b2fb5f8063a86ad62f9b7bae0bbe612492a917afee7ab754e22d70616737
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.4362771511077881,
3
- "best_model_checkpoint": "../KevinKibe/nllb-200-distilled-1.3B-finetuned-finetuned-finetuned-finetuned/checkpoint-200",
4
- "epoch": 200.0,
5
  "eval_steps": 200,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -24,6 +24,159 @@
24
  "eval_samples_per_second": 1.065,
25
  "eval_steps_per_second": 0.533,
26
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
28
  ],
29
  "logging_steps": 200,
@@ -31,7 +184,7 @@
31
  "num_input_tokens_seen": 0,
32
  "num_train_epochs": 2000,
33
  "save_steps": 200,
34
- "total_flos": 500057466470400.0,
35
  "train_batch_size": 16,
36
  "trial_name": null,
37
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3573495149612427,
3
+ "best_model_checkpoint": "../KevinKibe/nllb-200-distilled-1.3B-finetuned-finetuned-finetuned-finetuned/checkpoint-400",
4
+ "epoch": 2000.0,
5
  "eval_steps": 200,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
24
  "eval_samples_per_second": 1.065,
25
  "eval_steps_per_second": 0.533,
26
  "step": 200
27
+ },
28
+ {
29
+ "epoch": 400.0,
30
+ "grad_norm": 0.0009787885937839746,
31
+ "learning_rate": 4.524848217064997e-05,
32
+ "loss": 0.0003,
33
+ "step": 400
34
+ },
35
+ {
36
+ "epoch": 400.0,
37
+ "eval_gen_len": 22.5,
38
+ "eval_loss": 0.3573495149612427,
39
+ "eval_rouge": 0.2713,
40
+ "eval_runtime": 1.0507,
41
+ "eval_samples_per_second": 1.903,
42
+ "eval_steps_per_second": 0.952,
43
+ "step": 400
44
+ },
45
+ {
46
+ "epoch": 600.0,
47
+ "grad_norm": 0.00037960291956551373,
48
+ "learning_rate": 3.972638318855291e-05,
49
+ "loss": 0.0,
50
+ "step": 600
51
+ },
52
+ {
53
+ "epoch": 600.0,
54
+ "eval_gen_len": 19.5,
55
+ "eval_loss": 0.37564775347709656,
56
+ "eval_rouge": 0.2762,
57
+ "eval_runtime": 0.9,
58
+ "eval_samples_per_second": 2.222,
59
+ "eval_steps_per_second": 1.111,
60
+ "step": 600
61
+ },
62
+ {
63
+ "epoch": 800.0,
64
+ "grad_norm": 0.00027565439813770354,
65
+ "learning_rate": 3.2762763215215306e-05,
66
+ "loss": 0.0,
67
+ "step": 800
68
+ },
69
+ {
70
+ "epoch": 800.0,
71
+ "eval_gen_len": 19.5,
72
+ "eval_loss": 0.3804549276828766,
73
+ "eval_rouge": 0.2762,
74
+ "eval_runtime": 0.8354,
75
+ "eval_samples_per_second": 2.394,
76
+ "eval_steps_per_second": 1.197,
77
+ "step": 800
78
+ },
79
+ {
80
+ "epoch": 1000.0,
81
+ "grad_norm": 0.00021920226572547108,
82
+ "learning_rate": 2.5039269892020772e-05,
83
+ "loss": 0.0,
84
+ "step": 1000
85
+ },
86
+ {
87
+ "epoch": 1000.0,
88
+ "eval_gen_len": 19.5,
89
+ "eval_loss": 0.38089361786842346,
90
+ "eval_rouge": 0.2762,
91
+ "eval_runtime": 0.9583,
92
+ "eval_samples_per_second": 2.087,
93
+ "eval_steps_per_second": 1.044,
94
+ "step": 1000
95
+ },
96
+ {
97
+ "epoch": 1200.0,
98
+ "grad_norm": 0.00018822183483280241,
99
+ "learning_rate": 1.731193255818582e-05,
100
+ "loss": 0.0,
101
+ "step": 1200
102
+ },
103
+ {
104
+ "epoch": 1200.0,
105
+ "eval_gen_len": 19.5,
106
+ "eval_loss": 0.38475799560546875,
107
+ "eval_rouge": 0.2762,
108
+ "eval_runtime": 0.8748,
109
+ "eval_samples_per_second": 2.286,
110
+ "eval_steps_per_second": 1.143,
111
+ "step": 1200
112
+ },
113
+ {
114
+ "epoch": 1400.0,
115
+ "grad_norm": 0.00018423172878101468,
116
+ "learning_rate": 1.0337156831471246e-05,
117
+ "loss": 0.0,
118
+ "step": 1400
119
+ },
120
+ {
121
+ "epoch": 1400.0,
122
+ "eval_gen_len": 22.5,
123
+ "eval_loss": 0.3964395225048065,
124
+ "eval_rouge": 0.2713,
125
+ "eval_runtime": 0.8684,
126
+ "eval_samples_per_second": 2.303,
127
+ "eval_steps_per_second": 1.152,
128
+ "step": 1400
129
+ },
130
+ {
131
+ "epoch": 1600.0,
132
+ "grad_norm": 0.00017020950326696038,
133
+ "learning_rate": 4.7976823561278865e-06,
134
+ "loss": 0.0,
135
+ "step": 1600
136
+ },
137
+ {
138
+ "epoch": 1600.0,
139
+ "eval_gen_len": 22.5,
140
+ "eval_loss": 0.39614760875701904,
141
+ "eval_rouge": 0.2713,
142
+ "eval_runtime": 1.0804,
143
+ "eval_samples_per_second": 1.851,
144
+ "eval_steps_per_second": 0.926,
145
+ "step": 1600
146
+ },
147
+ {
148
+ "epoch": 1800.0,
149
+ "grad_norm": 0.00016995037731248885,
150
+ "learning_rate": 1.2357514895905003e-06,
151
+ "loss": 0.0,
152
+ "step": 1800
153
+ },
154
+ {
155
+ "epoch": 1800.0,
156
+ "eval_gen_len": 22.5,
157
+ "eval_loss": 0.3964083194732666,
158
+ "eval_rouge": 0.2713,
159
+ "eval_runtime": 0.9109,
160
+ "eval_samples_per_second": 2.196,
161
+ "eval_steps_per_second": 1.098,
162
+ "step": 1800
163
+ },
164
+ {
165
+ "epoch": 2000.0,
166
+ "grad_norm": 0.00016411281831096858,
167
+ "learning_rate": 3.0842507411921185e-11,
168
+ "loss": 0.0,
169
+ "step": 2000
170
+ },
171
+ {
172
+ "epoch": 2000.0,
173
+ "eval_gen_len": 22.5,
174
+ "eval_loss": 0.3965863287448883,
175
+ "eval_rouge": 0.2713,
176
+ "eval_runtime": 0.8715,
177
+ "eval_samples_per_second": 2.295,
178
+ "eval_steps_per_second": 1.147,
179
+ "step": 2000
180
  }
181
  ],
182
  "logging_steps": 200,
 
184
  "num_input_tokens_seen": 0,
185
  "num_train_epochs": 2000,
186
  "save_steps": 200,
187
+ "total_flos": 5000574664704000.0,
188
  "train_batch_size": 16,
189
  "trial_name": null,
190
  "trial_params": null