nkasmanoff commited on
Commit
70744ea
·
verified ·
1 Parent(s): 11b4e08

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +166 -89
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 2990143010382336.0,
4
- "train_loss": 0.5055735324683629,
5
- "train_runtime": 417.7958,
6
- "train_samples_per_second": 38.344,
7
- "train_steps_per_second": 4.799
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 6581450258638848.0,
4
+ "train_loss": 0.5889826508769839,
5
+ "train_runtime": 806.0376,
6
+ "train_samples_per_second": 31.5,
7
+ "train_steps_per_second": 3.939
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 2990143010382336.0,
4
- "train_loss": 0.5055735324683629,
5
- "train_runtime": 417.7958,
6
- "train_samples_per_second": 38.344,
7
- "train_steps_per_second": 4.799
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 6581450258638848.0,
4
+ "train_loss": 0.5889826508769839,
5
+ "train_runtime": 806.0376,
6
+ "train_samples_per_second": 31.5,
7
+ "train_steps_per_second": 3.939
8
  }
trainer_state.json CHANGED
@@ -3,163 +3,240 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 2005,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.24937655860349128,
13
- "grad_norm": 1.2734375,
14
- "learning_rate": 0.0004950495049504951,
15
- "loss": 1.4705,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.49875311720698257,
20
- "grad_norm": 1.265625,
21
- "learning_rate": 0.0004966720227318671,
22
- "loss": 0.9437,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.7481296758104738,
27
- "grad_norm": 1.078125,
28
- "learning_rate": 0.0004866439935228541,
29
- "loss": 0.8591,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.9975062344139651,
34
- "grad_norm": 1.03125,
35
- "learning_rate": 0.00047018796504965955,
36
- "loss": 0.8111,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 1.2468827930174564,
41
- "grad_norm": 0.8359375,
42
- "learning_rate": 0.0004477509350887423,
43
- "loss": 0.574,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 1.4962593516209477,
48
- "grad_norm": 1.0234375,
49
- "learning_rate": 0.0004199423643322254,
50
- "loss": 0.5806,
51
  "step": 600
52
  },
53
  {
54
- "epoch": 1.745635910224439,
55
- "grad_norm": 1.0078125,
56
- "learning_rate": 0.00038751762150804385,
57
- "loss": 0.5699,
58
  "step": 700
59
  },
60
  {
61
- "epoch": 1.9950124688279303,
62
- "grad_norm": 0.8046875,
63
- "learning_rate": 0.00035135746517639757,
64
- "loss": 0.5553,
65
  "step": 800
66
  },
67
  {
68
- "epoch": 2.2443890274314215,
69
- "grad_norm": 0.74609375,
70
- "learning_rate": 0.00031244411954180673,
71
- "loss": 0.3753,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 2.493765586034913,
76
- "grad_norm": 0.91796875,
77
- "learning_rate": 0.00027183459413737774,
78
- "loss": 0.374,
79
  "step": 1000
80
  },
81
  {
82
- "epoch": 2.743142144638404,
83
- "grad_norm": 1.0546875,
84
- "learning_rate": 0.00023063197210303058,
85
- "loss": 0.3746,
86
  "step": 1100
87
  },
88
  {
89
- "epoch": 2.9925187032418954,
90
- "grad_norm": 1.0625,
91
- "learning_rate": 0.00018995544695885592,
92
- "loss": 0.3789,
93
  "step": 1200
94
  },
95
  {
96
- "epoch": 3.2418952618453867,
97
- "grad_norm": 0.94140625,
98
- "learning_rate": 0.0001509099217695958,
99
- "loss": 0.2943,
100
  "step": 1300
101
  },
102
  {
103
- "epoch": 3.491271820448878,
104
- "grad_norm": 0.81640625,
105
- "learning_rate": 0.00011455599648308674,
106
- "loss": 0.288,
107
  "step": 1400
108
  },
109
  {
110
- "epoch": 3.7406483790523692,
111
- "grad_norm": 0.78515625,
112
- "learning_rate": 8.18811586814684e-05,
113
- "loss": 0.2878,
114
  "step": 1500
115
  },
116
  {
117
- "epoch": 3.9900249376558605,
118
- "grad_norm": 0.82421875,
119
- "learning_rate": 5.377296029546741e-05,
120
- "loss": 0.2929,
121
  "step": 1600
122
  },
123
  {
124
- "epoch": 4.239401496259352,
125
- "grad_norm": 0.8203125,
126
- "learning_rate": 3.099490888702508e-05,
127
- "loss": 0.2837,
128
  "step": 1700
129
  },
130
  {
131
- "epoch": 4.488778054862843,
132
- "grad_norm": 0.8125,
133
- "learning_rate": 1.4165728369278874e-05,
134
- "loss": 0.2715,
135
  "step": 1800
136
  },
137
  {
138
- "epoch": 4.738154613466334,
139
- "grad_norm": 0.85546875,
140
- "learning_rate": 3.7425525083322754e-06,
141
- "loss": 0.268,
142
  "step": 1900
143
  },
144
  {
145
- "epoch": 4.987531172069826,
146
- "grad_norm": 0.88671875,
147
- "learning_rate": 8.507724455514287e-09,
148
- "loss": 0.2712,
149
  "step": 2000
150
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  {
152
  "epoch": 5.0,
153
- "step": 2005,
154
- "total_flos": 2990143010382336.0,
155
- "train_loss": 0.5055735324683629,
156
- "train_runtime": 417.7958,
157
- "train_samples_per_second": 38.344,
158
- "train_steps_per_second": 4.799
159
  }
160
  ],
161
  "logging_steps": 100,
162
- "max_steps": 2005,
163
  "num_input_tokens_seen": 0,
164
  "num_train_epochs": 5,
165
  "save_steps": 500,
@@ -175,7 +252,7 @@
175
  "attributes": {}
176
  }
177
  },
178
- "total_flos": 2990143010382336.0,
179
  "train_batch_size": 8,
180
  "trial_name": null,
181
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 3175,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.15748031496062992,
13
+ "grad_norm": 1.953125,
14
+ "learning_rate": 0.00031446540880503143,
15
+ "loss": 1.7301,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.31496062992125984,
20
+ "grad_norm": 1.1328125,
21
+ "learning_rate": 0.0004997720451762572,
22
+ "loss": 1.1137,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.47244094488188976,
27
+ "grad_norm": 1.28125,
28
+ "learning_rate": 0.0004973084374349976,
29
+ "loss": 1.0141,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.6299212598425197,
34
+ "grad_norm": 1.078125,
35
+ "learning_rate": 0.0004921639131931859,
36
+ "loss": 0.9538,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.7874015748031497,
41
+ "grad_norm": 0.9453125,
42
+ "learning_rate": 0.00048439424102900066,
43
+ "loss": 0.9061,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.9448818897637795,
48
+ "grad_norm": 1.0859375,
49
+ "learning_rate": 0.00047408364711169396,
50
+ "loss": 0.8785,
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 1.1023622047244095,
55
+ "grad_norm": 0.83984375,
56
+ "learning_rate": 0.00046134390215823,
57
+ "loss": 0.7189,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 1.2598425196850394,
62
+ "grad_norm": 0.953125,
63
+ "learning_rate": 0.00044631310979666443,
64
+ "loss": 0.6703,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 1.4173228346456692,
69
+ "grad_norm": 0.9296875,
70
+ "learning_rate": 0.0004291542094708612,
71
+ "loss": 0.6648,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 1.574803149606299,
76
+ "grad_norm": 1.2890625,
77
+ "learning_rate": 0.000410053210115622,
78
+ "loss": 0.6524,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 1.7322834645669292,
83
+ "grad_norm": 1.03125,
84
+ "learning_rate": 0.00038921717374985584,
85
+ "loss": 0.6622,
86
  "step": 1100
87
  },
88
  {
89
+ "epoch": 1.889763779527559,
90
+ "grad_norm": 0.96484375,
91
+ "learning_rate": 0.0003668719708463959,
92
+ "loss": 0.6379,
93
  "step": 1200
94
  },
95
  {
96
+ "epoch": 2.047244094488189,
97
+ "grad_norm": 1.0234375,
98
+ "learning_rate": 0.00034325983181110047,
99
+ "loss": 0.5745,
100
  "step": 1300
101
  },
102
  {
103
+ "epoch": 2.204724409448819,
104
+ "grad_norm": 1.0546875,
105
+ "learning_rate": 0.00031863672111412524,
106
+ "loss": 0.4619,
107
  "step": 1400
108
  },
109
  {
110
+ "epoch": 2.362204724409449,
111
+ "grad_norm": 1.203125,
112
+ "learning_rate": 0.00029326956253877123,
113
+ "loss": 0.4808,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 2.5196850393700787,
118
+ "grad_norm": 0.9921875,
119
+ "learning_rate": 0.00026743334562725617,
120
+ "loss": 0.4598,
121
  "step": 1600
122
  },
123
  {
124
+ "epoch": 2.677165354330709,
125
+ "grad_norm": 1.109375,
126
+ "learning_rate": 0.00024140814469062377,
127
+ "loss": 0.4687,
128
  "step": 1700
129
  },
130
  {
131
+ "epoch": 2.8346456692913384,
132
+ "grad_norm": 0.98046875,
133
+ "learning_rate": 0.0002154760826978469,
134
+ "loss": 0.4703,
135
  "step": 1800
136
  },
137
  {
138
+ "epoch": 2.9921259842519685,
139
+ "grad_norm": 1.0625,
140
+ "learning_rate": 0.00018991827295670777,
141
+ "loss": 0.4558,
142
  "step": 1900
143
  },
144
  {
145
+ "epoch": 3.1496062992125986,
146
+ "grad_norm": 1.046875,
147
+ "learning_rate": 0.00016501177173978493,
148
+ "loss": 0.3817,
149
  "step": 2000
150
  },
151
+ {
152
+ "epoch": 3.3070866141732282,
153
+ "grad_norm": 1.0859375,
154
+ "learning_rate": 0.00014102657489022886,
155
+ "loss": 0.3786,
156
+ "step": 2100
157
+ },
158
+ {
159
+ "epoch": 3.4645669291338583,
160
+ "grad_norm": 0.8671875,
161
+ "learning_rate": 0.00011822269096524812,
162
+ "loss": 0.3779,
163
+ "step": 2200
164
+ },
165
+ {
166
+ "epoch": 3.622047244094488,
167
+ "grad_norm": 1.328125,
168
+ "learning_rate": 9.684732264553247e-05,
169
+ "loss": 0.3815,
170
+ "step": 2300
171
+ },
172
+ {
173
+ "epoch": 3.779527559055118,
174
+ "grad_norm": 0.85546875,
175
+ "learning_rate": 7.713218696519558e-05,
176
+ "loss": 0.3859,
177
+ "step": 2400
178
+ },
179
+ {
180
+ "epoch": 3.937007874015748,
181
+ "grad_norm": 1.0234375,
182
+ "learning_rate": 5.929100341195187e-05,
183
+ "loss": 0.3779,
184
+ "step": 2500
185
+ },
186
+ {
187
+ "epoch": 4.094488188976378,
188
+ "grad_norm": 0.765625,
189
+ "learning_rate": 4.351717712746703e-05,
190
+ "loss": 0.3608,
191
+ "step": 2600
192
+ },
193
+ {
194
+ "epoch": 4.251968503937007,
195
+ "grad_norm": 1.0546875,
196
+ "learning_rate": 2.9981702322862735e-05,
197
+ "loss": 0.3582,
198
+ "step": 2700
199
+ },
200
+ {
201
+ "epoch": 4.409448818897638,
202
+ "grad_norm": 0.9375,
203
+ "learning_rate": 1.8831308637139e-05,
204
+ "loss": 0.3731,
205
+ "step": 2800
206
+ },
207
+ {
208
+ "epoch": 4.566929133858268,
209
+ "grad_norm": 0.9296875,
210
+ "learning_rate": 1.0186870532686742e-05,
211
+ "loss": 0.3588,
212
+ "step": 2900
213
+ },
214
+ {
215
+ "epoch": 4.724409448818898,
216
+ "grad_norm": 0.71875,
217
+ "learning_rate": 4.1420969706420505e-06,
218
+ "loss": 0.3537,
219
+ "step": 3000
220
+ },
221
+ {
222
+ "epoch": 4.881889763779528,
223
+ "grad_norm": 0.83203125,
224
+ "learning_rate": 7.625155704936715e-07,
225
+ "loss": 0.3614,
226
+ "step": 3100
227
+ },
228
  {
229
  "epoch": 5.0,
230
+ "step": 3175,
231
+ "total_flos": 6581450258638848.0,
232
+ "train_loss": 0.5889826508769839,
233
+ "train_runtime": 806.0376,
234
+ "train_samples_per_second": 31.5,
235
+ "train_steps_per_second": 3.939
236
  }
237
  ],
238
  "logging_steps": 100,
239
+ "max_steps": 3175,
240
  "num_input_tokens_seen": 0,
241
  "num_train_epochs": 5,
242
  "save_steps": 500,
 
252
  "attributes": {}
253
  }
254
  },
255
+ "total_flos": 6581450258638848.0,
256
  "train_batch_size": 8,
257
  "trial_name": null,
258
  "trial_params": null