nkasmanoff committed on
Commit
989ea98
1 Parent(s): 3b0d3af

End of training

Browse files
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
- base_model: HuggingFaceTB/SmolLM-360M
5
  tags:
6
  - trl
7
  - sft
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # nature-buddy
18
 
19
- This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) on the None dataset.
20
 
21
  ## Model description
22
 
 
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
+ base_model: Qwen/Qwen2.5-0.5B
5
  tags:
6
  - trl
7
  - sft
 
16
 
17
  # nature-buddy
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) on the None dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 1.94980087881216e+16,
4
- "train_loss": 0.42690239373154526,
5
- "train_runtime": 1498.3039,
6
- "train_samples_per_second": 16.946,
7
- "train_steps_per_second": 2.119
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 2.254602634660147e+16,
4
+ "train_loss": 0.7783217168792965,
5
+ "train_runtime": 2040.3082,
6
+ "train_samples_per_second": 12.444,
7
+ "train_steps_per_second": 1.556
8
  }
generation_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "_from_model_config": true,
3
- "bos_token_id": 0,
4
- "eos_token_id": 0,
5
  "transformers_version": "4.44.2"
6
  }
 
1
  {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
  "transformers_version": "4.44.2"
6
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "total_flos": 1.94980087881216e+16,
4
- "train_loss": 0.42690239373154526,
5
- "train_runtime": 1498.3039,
6
- "train_samples_per_second": 16.946,
7
- "train_steps_per_second": 2.119
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "total_flos": 2.254602634660147e+16,
4
+ "train_loss": 0.7783217168792965,
5
+ "train_runtime": 2040.3082,
6
+ "train_samples_per_second": 12.444,
7
+ "train_steps_per_second": 1.556
8
  }
trainer_state.json CHANGED
@@ -10,229 +10,229 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.15748031496062992,
13
- "grad_norm": 1.2890625,
14
  "learning_rate": 0.00031446540880503143,
15
- "loss": 1.5497,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.31496062992125984,
20
- "grad_norm": 0.92578125,
21
  "learning_rate": 0.0004997720451762572,
22
- "loss": 0.9254,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.47244094488188976,
27
- "grad_norm": 0.90625,
28
  "learning_rate": 0.0004973084374349976,
29
- "loss": 0.8457,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.6299212598425197,
34
- "grad_norm": 0.734375,
35
  "learning_rate": 0.0004921639131931859,
36
- "loss": 0.7922,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.7874015748031497,
41
- "grad_norm": 0.80078125,
42
  "learning_rate": 0.00048439424102900066,
43
- "loss": 0.7571,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.9448818897637795,
48
- "grad_norm": 0.73828125,
49
  "learning_rate": 0.00047408364711169396,
50
- "loss": 0.7314,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 1.1023622047244095,
55
- "grad_norm": 0.6484375,
56
  "learning_rate": 0.00046134390215823,
57
- "loss": 0.5663,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 1.2598425196850394,
62
- "grad_norm": 0.70703125,
63
  "learning_rate": 0.00044631310979666443,
64
- "loss": 0.5111,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 1.4173228346456692,
69
- "grad_norm": 0.78125,
70
  "learning_rate": 0.0004291542094708612,
71
- "loss": 0.5099,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 1.574803149606299,
76
- "grad_norm": 1.0078125,
77
  "learning_rate": 0.000410053210115622,
78
- "loss": 0.5007,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 1.7322834645669292,
83
- "grad_norm": 0.828125,
84
  "learning_rate": 0.00038921717374985584,
85
- "loss": 0.5068,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 1.889763779527559,
90
- "grad_norm": 0.74609375,
91
  "learning_rate": 0.0003668719708463959,
92
- "loss": 0.4938,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 2.047244094488189,
97
- "grad_norm": 0.78515625,
98
  "learning_rate": 0.00034325983181110047,
99
- "loss": 0.4232,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 2.204724409448819,
104
- "grad_norm": 0.8515625,
105
  "learning_rate": 0.00031863672111412524,
106
- "loss": 0.2999,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 2.362204724409449,
111
- "grad_norm": 0.921875,
112
  "learning_rate": 0.00029326956253877123,
113
- "loss": 0.3159,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 2.5196850393700787,
118
- "grad_norm": 0.79296875,
119
  "learning_rate": 0.00026743334562725617,
120
- "loss": 0.3034,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 2.677165354330709,
125
- "grad_norm": 0.86328125,
126
  "learning_rate": 0.00024140814469062377,
127
- "loss": 0.3046,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 2.8346456692913384,
132
- "grad_norm": 0.84375,
133
  "learning_rate": 0.0002154760826978469,
134
- "loss": 0.3078,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 2.9921259842519685,
139
- "grad_norm": 0.765625,
140
  "learning_rate": 0.00018991827295670777,
141
- "loss": 0.2941,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 3.1496062992125986,
146
- "grad_norm": 0.7265625,
147
  "learning_rate": 0.00016501177173978493,
148
- "loss": 0.2171,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 3.3070866141732282,
153
- "grad_norm": 0.765625,
154
  "learning_rate": 0.00014102657489022886,
155
- "loss": 0.2132,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 3.4645669291338583,
160
- "grad_norm": 0.65234375,
161
  "learning_rate": 0.00011822269096524812,
162
- "loss": 0.2121,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 3.622047244094488,
167
- "grad_norm": 0.84375,
168
  "learning_rate": 9.684732264553247e-05,
169
- "loss": 0.2142,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 3.779527559055118,
174
- "grad_norm": 0.66796875,
175
  "learning_rate": 7.713218696519558e-05,
176
- "loss": 0.2175,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 3.937007874015748,
181
- "grad_norm": 0.68359375,
182
  "learning_rate": 5.929100341195187e-05,
183
- "loss": 0.2124,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 4.094488188976378,
188
- "grad_norm": 0.5390625,
189
  "learning_rate": 4.351717712746703e-05,
190
- "loss": 0.1979,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 4.251968503937007,
195
- "grad_norm": 0.67578125,
196
  "learning_rate": 2.9981702322862735e-05,
197
- "loss": 0.1953,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 4.409448818897638,
202
- "grad_norm": 0.625,
203
  "learning_rate": 1.8831308637139e-05,
204
- "loss": 0.2015,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 4.566929133858268,
209
- "grad_norm": 0.58203125,
210
  "learning_rate": 1.0186870532686742e-05,
211
- "loss": 0.1981,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 4.724409448818898,
216
- "grad_norm": 0.5,
217
  "learning_rate": 4.1420969706420505e-06,
218
- "loss": 0.19,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 4.881889763779528,
223
- "grad_norm": 0.55859375,
224
  "learning_rate": 7.625155704936715e-07,
225
- "loss": 0.1945,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 5.0,
230
  "step": 3175,
231
- "total_flos": 1.94980087881216e+16,
232
- "train_loss": 0.42690239373154526,
233
- "train_runtime": 1498.3039,
234
- "train_samples_per_second": 16.946,
235
- "train_steps_per_second": 2.119
236
  }
237
  ],
238
  "logging_steps": 100,
@@ -252,7 +252,7 @@
252
  "attributes": {}
253
  }
254
  },
255
- "total_flos": 1.94980087881216e+16,
256
  "train_batch_size": 8,
257
  "trial_name": null,
258
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.15748031496062992,
13
+ "grad_norm": 5.84375,
14
  "learning_rate": 0.00031446540880503143,
15
+ "loss": 1.6016,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.31496062992125984,
20
+ "grad_norm": 2.90625,
21
  "learning_rate": 0.0004997720451762572,
22
+ "loss": 2.0478,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.47244094488188976,
27
+ "grad_norm": 3.0,
28
  "learning_rate": 0.0004973084374349976,
29
+ "loss": 2.0072,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.6299212598425197,
34
+ "grad_norm": 2.578125,
35
  "learning_rate": 0.0004921639131931859,
36
+ "loss": 1.8507,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.7874015748031497,
41
+ "grad_norm": 1.6953125,
42
  "learning_rate": 0.00048439424102900066,
43
+ "loss": 1.7019,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.9448818897637795,
48
+ "grad_norm": 2.6875,
49
  "learning_rate": 0.00047408364711169396,
50
+ "loss": 1.5988,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 1.1023622047244095,
55
+ "grad_norm": 1.8203125,
56
  "learning_rate": 0.00046134390215823,
57
+ "loss": 1.3103,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 1.2598425196850394,
62
+ "grad_norm": 1.671875,
63
  "learning_rate": 0.00044631310979666443,
64
+ "loss": 1.237,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 1.4173228346456692,
69
+ "grad_norm": 1.7421875,
70
  "learning_rate": 0.0004291542094708612,
71
+ "loss": 1.2112,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 1.574803149606299,
76
+ "grad_norm": 2.28125,
77
  "learning_rate": 0.000410053210115622,
78
+ "loss": 1.1948,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 1.7322834645669292,
83
+ "grad_norm": 1.5859375,
84
  "learning_rate": 0.00038921717374985584,
85
+ "loss": 1.1666,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 1.889763779527559,
90
+ "grad_norm": 1.59375,
91
  "learning_rate": 0.0003668719708463959,
92
+ "loss": 1.1001,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 2.047244094488189,
97
+ "grad_norm": 1.6171875,
98
  "learning_rate": 0.00034325983181110047,
99
+ "loss": 0.9377,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 2.204724409448819,
104
+ "grad_norm": 1.640625,
105
  "learning_rate": 0.00031863672111412524,
106
+ "loss": 0.6561,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 2.362204724409449,
111
+ "grad_norm": 1.9296875,
112
  "learning_rate": 0.00029326956253877123,
113
+ "loss": 0.6872,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 2.5196850393700787,
118
+ "grad_norm": 1.5859375,
119
  "learning_rate": 0.00026743334562725617,
120
+ "loss": 0.6448,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 2.677165354330709,
125
+ "grad_norm": 1.6484375,
126
  "learning_rate": 0.00024140814469062377,
127
+ "loss": 0.643,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 2.8346456692913384,
132
+ "grad_norm": 1.28125,
133
  "learning_rate": 0.0002154760826978469,
134
+ "loss": 0.6216,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 2.9921259842519685,
139
+ "grad_norm": 1.625,
140
  "learning_rate": 0.00018991827295670777,
141
+ "loss": 0.5788,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 3.1496062992125986,
146
+ "grad_norm": 1.2890625,
147
  "learning_rate": 0.00016501177173978493,
148
+ "loss": 0.2589,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 3.3070866141732282,
153
+ "grad_norm": 1.7734375,
154
  "learning_rate": 0.00014102657489022886,
155
+ "loss": 0.2383,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 3.4645669291338583,
160
+ "grad_norm": 1.1875,
161
  "learning_rate": 0.00011822269096524812,
162
+ "loss": 0.2227,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 3.622047244094488,
167
+ "grad_norm": 1.46875,
168
  "learning_rate": 9.684732264553247e-05,
169
+ "loss": 0.2168,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 3.779527559055118,
174
+ "grad_norm": 1.0390625,
175
  "learning_rate": 7.713218696519558e-05,
176
+ "loss": 0.2162,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 3.937007874015748,
181
+ "grad_norm": 1.609375,
182
  "learning_rate": 5.929100341195187e-05,
183
+ "loss": 0.1944,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 4.094488188976378,
188
+ "grad_norm": 0.45703125,
189
  "learning_rate": 4.351717712746703e-05,
190
+ "loss": 0.123,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 4.251968503937007,
195
+ "grad_norm": 0.4609375,
196
  "learning_rate": 2.9981702322862735e-05,
197
+ "loss": 0.0768,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 4.409448818897638,
202
+ "grad_norm": 0.59375,
203
  "learning_rate": 1.8831308637139e-05,
204
+ "loss": 0.0761,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 4.566929133858268,
209
+ "grad_norm": 0.53515625,
210
  "learning_rate": 1.0186870532686742e-05,
211
+ "loss": 0.0786,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 4.724409448818898,
216
+ "grad_norm": 0.640625,
217
  "learning_rate": 4.1420969706420505e-06,
218
+ "loss": 0.0758,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 4.881889763779528,
223
+ "grad_norm": 0.515625,
224
  "learning_rate": 7.625155704936715e-07,
225
+ "loss": 0.0792,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 5.0,
230
  "step": 3175,
231
+ "total_flos": 2.254602634660147e+16,
232
+ "train_loss": 0.7783217168792965,
233
+ "train_runtime": 2040.3082,
234
+ "train_samples_per_second": 12.444,
235
+ "train_steps_per_second": 1.556
236
  }
237
  ],
238
  "logging_steps": 100,
 
252
  "attributes": {}
253
  }
254
  },
255
+ "total_flos": 2.254602634660147e+16,
256
  "train_batch_size": 8,
257
  "trial_name": null,
258
  "trial_params": null