AlekseyKorshuk commited on
Commit
9c0c911
1 Parent(s): 4c401fc

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +8 -0
  2. train_results.json +8 -0
  3. trainer_state.json +287 -0
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 15.0,
3
+ "train_loss": 1.0260957291357629,
4
+ "train_runtime": 32997.9021,
5
+ "train_samples": 313,
6
+ "train_samples_per_second": 0.142,
7
+ "train_steps_per_second": 0.036
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 15.0,
3
+ "train_loss": 1.0260957291357629,
4
+ "train_runtime": 32997.9021,
5
+ "train_samples": 313,
6
+ "train_samples_per_second": 0.142,
7
+ "train_steps_per_second": 0.036
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 15.0,
5
+ "global_step": 1185,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 9e-07,
13
+ "loss": 2.5593,
14
+ "perplexity": 12.926765411958069,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 1.0,
19
+ "learning_rate": 9e-07,
20
+ "loss": 2.5731,
21
+ "perplexity": 13.1063913429122,
22
+ "step": 79
23
+ },
24
+ {
25
+ "epoch": 1.0,
26
+ "eval_accuracy": 0.03165881737310309,
27
+ "eval_loss": 2.611328125,
28
+ "eval_perplexity": 13.617124090346826,
29
+ "eval_runtime": 4.9176,
30
+ "eval_samples_per_second": 15.861,
31
+ "eval_steps_per_second": 0.61,
32
+ "step": 79
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "learning_rate": 9e-07,
37
+ "loss": 2.206,
38
+ "perplexity": 9.079326356062138,
39
+ "step": 158
40
+ },
41
+ {
42
+ "epoch": 2.0,
43
+ "eval_accuracy": 0.032757718472004184,
44
+ "eval_loss": 2.48046875,
45
+ "eval_perplexity": 11.946863197656471,
46
+ "eval_runtime": 5.4813,
47
+ "eval_samples_per_second": 14.23,
48
+ "eval_steps_per_second": 0.547,
49
+ "step": 158
50
+ },
51
+ {
52
+ "epoch": 3.0,
53
+ "learning_rate": 9e-07,
54
+ "loss": 1.9105,
55
+ "perplexity": 6.7564661872073595,
56
+ "step": 237
57
+ },
58
+ {
59
+ "epoch": 3.0,
60
+ "eval_accuracy": 0.03333333333333333,
61
+ "eval_loss": 2.451171875,
62
+ "eval_perplexity": 11.601934773230344,
63
+ "eval_runtime": 5.243,
64
+ "eval_samples_per_second": 14.877,
65
+ "eval_steps_per_second": 0.572,
66
+ "step": 237
67
+ },
68
+ {
69
+ "epoch": 4.0,
70
+ "learning_rate": 9e-07,
71
+ "loss": 1.6301,
72
+ "perplexity": 5.1043851315288045,
73
+ "step": 316
74
+ },
75
+ {
76
+ "epoch": 4.0,
77
+ "eval_accuracy": 0.03453689167974882,
78
+ "eval_loss": 2.5078125,
79
+ "eval_perplexity": 12.278042445054236,
80
+ "eval_runtime": 5.4311,
81
+ "eval_samples_per_second": 14.362,
82
+ "eval_steps_per_second": 0.552,
83
+ "step": 316
84
+ },
85
+ {
86
+ "epoch": 5.0,
87
+ "learning_rate": 9e-07,
88
+ "loss": 1.3733,
89
+ "perplexity": 3.948358804340304,
90
+ "step": 395
91
+ },
92
+ {
93
+ "epoch": 5.0,
94
+ "eval_accuracy": 0.034170591313448455,
95
+ "eval_loss": 2.681640625,
96
+ "eval_perplexity": 14.609041604294383,
97
+ "eval_runtime": 5.246,
98
+ "eval_samples_per_second": 14.868,
99
+ "eval_steps_per_second": 0.572,
100
+ "step": 395
101
+ },
102
+ {
103
+ "epoch": 6.0,
104
+ "learning_rate": 9e-07,
105
+ "loss": 1.1337,
106
+ "perplexity": 3.107131645102156,
107
+ "step": 474
108
+ },
109
+ {
110
+ "epoch": 6.0,
111
+ "eval_accuracy": 0.033019361590790164,
112
+ "eval_loss": 3.0078125,
113
+ "eval_perplexity": 20.24306974171993,
114
+ "eval_runtime": 5.3369,
115
+ "eval_samples_per_second": 14.615,
116
+ "eval_steps_per_second": 0.562,
117
+ "step": 474
118
+ },
119
+ {
120
+ "epoch": 7.0,
121
+ "learning_rate": 9e-07,
122
+ "loss": 0.9619,
123
+ "perplexity": 2.616663413821779,
124
+ "step": 553
125
+ },
126
+ {
127
+ "epoch": 7.0,
128
+ "eval_accuracy": 0.033019361590790164,
129
+ "eval_loss": 3.177734375,
130
+ "eval_perplexity": 23.992334296139543,
131
+ "eval_runtime": 5.4368,
132
+ "eval_samples_per_second": 14.347,
133
+ "eval_steps_per_second": 0.552,
134
+ "step": 553
135
+ },
136
+ {
137
+ "epoch": 8.0,
138
+ "learning_rate": 9e-07,
139
+ "loss": 0.798,
140
+ "perplexity": 2.221094294751435,
141
+ "step": 632
142
+ },
143
+ {
144
+ "epoch": 8.0,
145
+ "eval_accuracy": 0.033019361590790164,
146
+ "eval_loss": 3.255859375,
147
+ "eval_perplexity": 25.94189877662824,
148
+ "eval_runtime": 5.543,
149
+ "eval_samples_per_second": 14.072,
150
+ "eval_steps_per_second": 0.541,
151
+ "step": 632
152
+ },
153
+ {
154
+ "epoch": 9.0,
155
+ "learning_rate": 9e-07,
156
+ "loss": 0.6653,
157
+ "perplexity": 1.9450739560040564,
158
+ "step": 711
159
+ },
160
+ {
161
+ "epoch": 9.0,
162
+ "eval_accuracy": 0.033124018838304556,
163
+ "eval_loss": 3.427734375,
164
+ "eval_perplexity": 30.806767041912167,
165
+ "eval_runtime": 5.6032,
166
+ "eval_samples_per_second": 13.921,
167
+ "eval_steps_per_second": 0.535,
168
+ "step": 711
169
+ },
170
+ {
171
+ "epoch": 10.0,
172
+ "learning_rate": 9e-07,
173
+ "loss": 0.552,
174
+ "perplexity": 1.736722992721326,
175
+ "step": 790
176
+ },
177
+ {
178
+ "epoch": 10.0,
179
+ "eval_accuracy": 0.03333333333333333,
180
+ "eval_loss": 3.556640625,
181
+ "eval_perplexity": 35.04526897307925,
182
+ "eval_runtime": 5.5592,
183
+ "eval_samples_per_second": 14.031,
184
+ "eval_steps_per_second": 0.54,
185
+ "step": 790
186
+ },
187
+ {
188
+ "epoch": 11.0,
189
+ "learning_rate": 9e-07,
190
+ "loss": 0.4568,
191
+ "perplexity": 1.579013050057063,
192
+ "step": 869
193
+ },
194
+ {
195
+ "epoch": 11.0,
196
+ "eval_accuracy": 0.032443746729461015,
197
+ "eval_loss": 3.732421875,
198
+ "eval_perplexity": 41.780172086809024,
199
+ "eval_runtime": 5.2356,
200
+ "eval_samples_per_second": 14.898,
201
+ "eval_steps_per_second": 0.573,
202
+ "step": 869
203
+ },
204
+ {
205
+ "epoch": 12.0,
206
+ "learning_rate": 9e-07,
207
+ "loss": 0.3756,
208
+ "perplexity": 1.4558646714178145,
209
+ "step": 948
210
+ },
211
+ {
212
+ "epoch": 12.0,
213
+ "eval_accuracy": 0.032757718472004184,
214
+ "eval_loss": 3.818359375,
215
+ "eval_perplexity": 45.529450258314064,
216
+ "eval_runtime": 5.3544,
217
+ "eval_samples_per_second": 14.568,
218
+ "eval_steps_per_second": 0.56,
219
+ "step": 948
220
+ },
221
+ {
222
+ "epoch": 13.0,
223
+ "learning_rate": 9e-07,
224
+ "loss": 0.3119,
225
+ "perplexity": 1.3660180843907228,
226
+ "step": 1027
227
+ },
228
+ {
229
+ "epoch": 13.0,
230
+ "eval_accuracy": 0.033124018838304556,
231
+ "eval_loss": 3.84765625,
232
+ "eval_perplexity": 46.88305220915621,
233
+ "eval_runtime": 5.595,
234
+ "eval_samples_per_second": 13.941,
235
+ "eval_steps_per_second": 0.536,
236
+ "step": 1027
237
+ },
238
+ {
239
+ "epoch": 14.0,
240
+ "learning_rate": 9e-07,
241
+ "loss": 0.2448,
242
+ "perplexity": 1.2773658144929685,
243
+ "step": 1106
244
+ },
245
+ {
246
+ "epoch": 14.0,
247
+ "eval_accuracy": 0.03291470434327577,
248
+ "eval_loss": 3.90625,
249
+ "eval_perplexity": 49.71218131735948,
250
+ "eval_runtime": 5.2298,
251
+ "eval_samples_per_second": 14.915,
252
+ "eval_steps_per_second": 0.574,
253
+ "step": 1106
254
+ },
255
+ {
256
+ "epoch": 15.0,
257
+ "learning_rate": 9e-07,
258
+ "loss": 0.1986,
259
+ "perplexity": 1.219693990715056,
260
+ "step": 1185
261
+ },
262
+ {
263
+ "epoch": 15.0,
264
+ "eval_accuracy": 0.032862375719518576,
265
+ "eval_loss": 3.943359375,
266
+ "eval_perplexity": 51.591626129191596,
267
+ "eval_runtime": 5.4333,
268
+ "eval_samples_per_second": 14.356,
269
+ "eval_steps_per_second": 0.552,
270
+ "step": 1185
271
+ },
272
+ {
273
+ "epoch": 15.0,
274
+ "step": 1185,
275
+ "total_flos": 39797194752000.0,
276
+ "train_loss": 1.0260957291357629,
277
+ "train_runtime": 32997.9021,
278
+ "train_samples_per_second": 0.142,
279
+ "train_steps_per_second": 0.036
280
+ }
281
+ ],
282
+ "max_steps": 1185,
283
+ "num_train_epochs": 15,
284
+ "total_flos": 39797194752000.0,
285
+ "trial_name": null,
286
+ "trial_params": null
287
+ }