yiran-wang3 committed on
Commit
b505451
1 Parent(s): 6b42668

End of training

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: yiran-wang3/qwen1_chat_adamw_iter3
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/qw1_sppo_hard_new_cn_mining_oj_iter3-binarized
12
+ model-index:
13
+ - name: qwen1_chat_adamw_iter4
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # qwen1_chat_adamw_iter4
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/qwen1_chat_adamw_iter3](https://huggingface.co/yiran-wang3/qwen1_chat_adamw_iter3) on the self-generate/qw1_sppo_hard_new_cn_mining_oj_iter3-binarized dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4958818086556026,
5
+ "train_runtime": 153.9035,
6
+ "train_samples": 2688,
7
+ "train_samples_per_second": 17.465,
8
+ "train_steps_per_second": 0.273
9
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.45.0"
14
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.4958818086556026,
5
+ "train_runtime": 153.9035,
6
+ "train_samples": 2688,
7
+ "train_samples_per_second": 17.465,
8
+ "train_steps_per_second": 0.273
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,924 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 42,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -0.6439481973648071,
13
+ "debug/policy_chosen_logps": -163.79736328125,
14
+ "debug/policy_rejected_logits": -0.6659940481185913,
15
+ "debug/policy_rejected_logps": -166.54815673828125,
16
+ "debug/reference_chosen_logps": -163.79736328125,
17
+ "debug/reference_rejected_logps": -166.54815673828125,
18
+ "epoch": 0.023809523809523808,
19
+ "grad_norm": 8.358603090602669,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -0.6439481973648071,
22
+ "logits/rejected": -0.6659940481185913,
23
+ "logps/chosen": -163.79736328125,
24
+ "logps/rejected": -166.54815673828125,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -0.7713825702667236,
34
+ "debug/policy_chosen_logps": -184.54640197753906,
35
+ "debug/policy_rejected_logits": -0.652373731136322,
36
+ "debug/policy_rejected_logps": -165.83177185058594,
37
+ "debug/reference_chosen_logps": -184.75267028808594,
38
+ "debug/reference_rejected_logps": -166.013916015625,
39
+ "epoch": 0.047619047619047616,
40
+ "grad_norm": 7.3640129793355005,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -0.7713825702667236,
43
+ "logits/rejected": -0.652373731136322,
44
+ "logps/chosen": -184.54640197753906,
45
+ "logps/rejected": -165.83177185058594,
46
+ "loss": 0.5002,
47
+ "rewards/accuracies": 0.5,
48
+ "rewards/chosen": 0.002062625717371702,
49
+ "rewards/margins": 0.00024105067132040858,
50
+ "rewards/rejected": 0.0018215751042589545,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -0.6304819583892822,
55
+ "debug/policy_chosen_logps": -217.33445739746094,
56
+ "debug/policy_rejected_logits": -0.6844841837882996,
57
+ "debug/policy_rejected_logps": -181.04327392578125,
58
+ "debug/reference_chosen_logps": -217.23004150390625,
59
+ "debug/reference_rejected_logps": -180.46067810058594,
60
+ "epoch": 0.07142857142857142,
61
+ "grad_norm": 7.75582359273493,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -0.6304819583892822,
64
+ "logits/rejected": -0.6844841837882996,
65
+ "logps/chosen": -217.33445739746094,
66
+ "logps/rejected": -181.04327392578125,
67
+ "loss": 0.499,
68
+ "rewards/accuracies": 0.625,
69
+ "rewards/chosen": -0.0010441780323162675,
70
+ "rewards/margins": 0.0047818757593631744,
71
+ "rewards/rejected": -0.005826053209602833,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -0.7617719769477844,
76
+ "debug/policy_chosen_logps": -149.78887939453125,
77
+ "debug/policy_rejected_logits": -0.7908269762992859,
78
+ "debug/policy_rejected_logps": -156.19158935546875,
79
+ "debug/reference_chosen_logps": -150.406494140625,
80
+ "debug/reference_rejected_logps": -156.66110229492188,
81
+ "epoch": 0.09523809523809523,
82
+ "grad_norm": 7.458999772215554,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -0.7617719769477844,
85
+ "logits/rejected": -0.7908269762992859,
86
+ "logps/chosen": -149.78887939453125,
87
+ "logps/rejected": -156.19158935546875,
88
+ "loss": 0.5011,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": 0.006176195107400417,
91
+ "rewards/margins": 0.0014810848515480757,
92
+ "rewards/rejected": 0.004695110023021698,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -0.8336195945739746,
97
+ "debug/policy_chosen_logps": -167.17535400390625,
98
+ "debug/policy_rejected_logits": -0.815830409526825,
99
+ "debug/policy_rejected_logps": -167.67922973632812,
100
+ "debug/reference_chosen_logps": -166.9479217529297,
101
+ "debug/reference_rejected_logps": -167.18917846679688,
102
+ "epoch": 0.11904761904761904,
103
+ "grad_norm": 7.115986789851628,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -0.8336195945739746,
106
+ "logits/rejected": -0.815830409526825,
107
+ "logps/chosen": -167.17535400390625,
108
+ "logps/rejected": -167.67922973632812,
109
+ "loss": 0.4994,
110
+ "rewards/accuracies": 0.625,
111
+ "rewards/chosen": -0.0022743605077266693,
112
+ "rewards/margins": 0.002626247238367796,
113
+ "rewards/rejected": -0.004900607746094465,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -0.7617685198783875,
118
+ "debug/policy_chosen_logps": -184.65655517578125,
119
+ "debug/policy_rejected_logits": -0.8746406435966492,
120
+ "debug/policy_rejected_logps": -182.16636657714844,
121
+ "debug/reference_chosen_logps": -185.37466430664062,
122
+ "debug/reference_rejected_logps": -182.4168243408203,
123
+ "epoch": 0.14285714285714285,
124
+ "grad_norm": 8.182698444266666,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -0.7617685198783875,
127
+ "logits/rejected": -0.8746406435966492,
128
+ "logps/chosen": -184.65655517578125,
129
+ "logps/rejected": -182.16636657714844,
130
+ "loss": 0.4999,
131
+ "rewards/accuracies": 0.625,
132
+ "rewards/chosen": 0.007181110326200724,
133
+ "rewards/margins": 0.004676399286836386,
134
+ "rewards/rejected": 0.002504711039364338,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -0.6539808511734009,
139
+ "debug/policy_chosen_logps": -176.93988037109375,
140
+ "debug/policy_rejected_logits": -0.8548093438148499,
141
+ "debug/policy_rejected_logps": -169.08441162109375,
142
+ "debug/reference_chosen_logps": -178.0904541015625,
143
+ "debug/reference_rejected_logps": -169.7830810546875,
144
+ "epoch": 0.16666666666666666,
145
+ "grad_norm": 7.369445143000824,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -0.6539808511734009,
148
+ "logits/rejected": -0.8548093438148499,
149
+ "logps/chosen": -176.93988037109375,
150
+ "logps/rejected": -169.08441162109375,
151
+ "loss": 0.499,
152
+ "rewards/accuracies": 0.625,
153
+ "rewards/chosen": 0.011505775153636932,
154
+ "rewards/margins": 0.004519080743193626,
155
+ "rewards/rejected": 0.006986694410443306,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -0.7742728590965271,
160
+ "debug/policy_chosen_logps": -182.88198852539062,
161
+ "debug/policy_rejected_logits": -0.9402408599853516,
162
+ "debug/policy_rejected_logps": -163.03701782226562,
163
+ "debug/reference_chosen_logps": -182.63662719726562,
164
+ "debug/reference_rejected_logps": -162.864013671875,
165
+ "epoch": 0.19047619047619047,
166
+ "grad_norm": 7.553395553689711,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -0.7742728590965271,
169
+ "logits/rejected": -0.9402408599853516,
170
+ "logps/chosen": -182.88198852539062,
171
+ "logps/rejected": -163.03701782226562,
172
+ "loss": 0.4921,
173
+ "rewards/accuracies": 0.5,
174
+ "rewards/chosen": -0.002453470602631569,
175
+ "rewards/margins": -0.0007233805954456329,
176
+ "rewards/rejected": -0.0017300891922786832,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -0.8858636617660522,
181
+ "debug/policy_chosen_logps": -185.8077850341797,
182
+ "debug/policy_rejected_logits": -0.7941207885742188,
183
+ "debug/policy_rejected_logps": -175.6250762939453,
184
+ "debug/reference_chosen_logps": -186.4120330810547,
185
+ "debug/reference_rejected_logps": -175.30789184570312,
186
+ "epoch": 0.21428571428571427,
187
+ "grad_norm": 7.467636712052708,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -0.8858636617660522,
190
+ "logits/rejected": -0.7941207885742188,
191
+ "logps/chosen": -185.8077850341797,
192
+ "logps/rejected": -175.6250762939453,
193
+ "loss": 0.4997,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": 0.006042527966201305,
196
+ "rewards/margins": 0.009214296005666256,
197
+ "rewards/rejected": -0.003171768505126238,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -0.7334473729133606,
202
+ "debug/policy_chosen_logps": -190.0528564453125,
203
+ "debug/policy_rejected_logits": -0.7853918075561523,
204
+ "debug/policy_rejected_logps": -195.84039306640625,
205
+ "debug/reference_chosen_logps": -189.301025390625,
206
+ "debug/reference_rejected_logps": -194.78436279296875,
207
+ "epoch": 0.23809523809523808,
208
+ "grad_norm": 7.964770440555194,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -0.7334473729133606,
211
+ "logits/rejected": -0.7853918075561523,
212
+ "logps/chosen": -190.0528564453125,
213
+ "logps/rejected": -195.84039306640625,
214
+ "loss": 0.4948,
215
+ "rewards/accuracies": 0.75,
216
+ "rewards/chosen": -0.007518520578742027,
217
+ "rewards/margins": 0.0030419151298701763,
218
+ "rewards/rejected": -0.010560436174273491,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -0.8003349304199219,
223
+ "debug/policy_chosen_logps": -168.6732177734375,
224
+ "debug/policy_rejected_logits": -0.9006801247596741,
225
+ "debug/policy_rejected_logps": -167.43930053710938,
226
+ "debug/reference_chosen_logps": -169.00479125976562,
227
+ "debug/reference_rejected_logps": -167.300537109375,
228
+ "epoch": 0.2619047619047619,
229
+ "grad_norm": 7.30274943780698,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -0.8003349304199219,
232
+ "logits/rejected": -0.9006801247596741,
233
+ "logps/chosen": -168.6732177734375,
234
+ "logps/rejected": -167.43930053710938,
235
+ "loss": 0.4969,
236
+ "rewards/accuracies": 0.625,
237
+ "rewards/chosen": 0.0033157537691295147,
238
+ "rewards/margins": 0.004703282844275236,
239
+ "rewards/rejected": -0.0013875290751457214,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -0.6824413537979126,
244
+ "debug/policy_chosen_logps": -184.35902404785156,
245
+ "debug/policy_rejected_logits": -0.789950430393219,
246
+ "debug/policy_rejected_logps": -166.5255889892578,
247
+ "debug/reference_chosen_logps": -183.53871154785156,
248
+ "debug/reference_rejected_logps": -165.9300537109375,
249
+ "epoch": 0.2857142857142857,
250
+ "grad_norm": 8.173133805318917,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -0.6824413537979126,
253
+ "logits/rejected": -0.789950430393219,
254
+ "logps/chosen": -184.35902404785156,
255
+ "logps/rejected": -166.5255889892578,
256
+ "loss": 0.5052,
257
+ "rewards/accuracies": 0.5,
258
+ "rewards/chosen": -0.008203163743019104,
259
+ "rewards/margins": -0.002247829921543598,
260
+ "rewards/rejected": -0.005955332890152931,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -0.6157864928245544,
265
+ "debug/policy_chosen_logps": -211.65432739257812,
266
+ "debug/policy_rejected_logits": -0.6641220450401306,
267
+ "debug/policy_rejected_logps": -185.09515380859375,
268
+ "debug/reference_chosen_logps": -211.2996826171875,
269
+ "debug/reference_rejected_logps": -183.47618103027344,
270
+ "epoch": 0.30952380952380953,
271
+ "grad_norm": 8.444033138598066,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -0.6157864928245544,
274
+ "logits/rejected": -0.6641220450401306,
275
+ "logps/chosen": -211.65432739257812,
276
+ "logps/rejected": -185.09515380859375,
277
+ "loss": 0.5043,
278
+ "rewards/accuracies": 0.5,
279
+ "rewards/chosen": -0.003546419320628047,
280
+ "rewards/margins": 0.012643384747207165,
281
+ "rewards/rejected": -0.016189804300665855,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -0.7403644919395447,
286
+ "debug/policy_chosen_logps": -195.4632110595703,
287
+ "debug/policy_rejected_logits": -0.7375439405441284,
288
+ "debug/policy_rejected_logps": -183.90927124023438,
289
+ "debug/reference_chosen_logps": -195.64161682128906,
290
+ "debug/reference_rejected_logps": -182.89498901367188,
291
+ "epoch": 0.3333333333333333,
292
+ "grad_norm": 7.408605476058083,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -0.7403644919395447,
295
+ "logits/rejected": -0.7375439405441284,
296
+ "logps/chosen": -195.4632110595703,
297
+ "logps/rejected": -183.90927124023438,
298
+ "loss": 0.5004,
299
+ "rewards/accuracies": 0.75,
300
+ "rewards/chosen": 0.0017841914668679237,
301
+ "rewards/margins": 0.01192712876945734,
302
+ "rewards/rejected": -0.010142937302589417,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -0.7125188112258911,
307
+ "debug/policy_chosen_logps": -185.01406860351562,
308
+ "debug/policy_rejected_logits": -0.6736459732055664,
309
+ "debug/policy_rejected_logps": -198.6136474609375,
310
+ "debug/reference_chosen_logps": -184.66270446777344,
311
+ "debug/reference_rejected_logps": -199.4155731201172,
312
+ "epoch": 0.35714285714285715,
313
+ "grad_norm": 7.276791758228749,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -0.7125188112258911,
316
+ "logits/rejected": -0.6736459732055664,
317
+ "logps/chosen": -185.01406860351562,
318
+ "logps/rejected": -198.6136474609375,
319
+ "loss": 0.5004,
320
+ "rewards/accuracies": 0.25,
321
+ "rewards/chosen": -0.0035135941579937935,
322
+ "rewards/margins": -0.011532812379300594,
323
+ "rewards/rejected": 0.0080192182213068,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -0.5169116854667664,
328
+ "debug/policy_chosen_logps": -190.42169189453125,
329
+ "debug/policy_rejected_logits": -0.6472907662391663,
330
+ "debug/policy_rejected_logps": -190.09664916992188,
331
+ "debug/reference_chosen_logps": -189.55734252929688,
332
+ "debug/reference_rejected_logps": -190.24142456054688,
333
+ "epoch": 0.38095238095238093,
334
+ "grad_norm": 8.20069010191667,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -0.5169116854667664,
337
+ "logits/rejected": -0.6472907662391663,
338
+ "logps/chosen": -190.42169189453125,
339
+ "logps/rejected": -190.09664916992188,
340
+ "loss": 0.499,
341
+ "rewards/accuracies": 0.25,
342
+ "rewards/chosen": -0.008643608540296555,
343
+ "rewards/margins": -0.01009130384773016,
344
+ "rewards/rejected": 0.0014476971700787544,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -0.7377853393554688,
349
+ "debug/policy_chosen_logps": -200.92160034179688,
350
+ "debug/policy_rejected_logits": -0.7474555969238281,
351
+ "debug/policy_rejected_logps": -171.23770141601562,
352
+ "debug/reference_chosen_logps": -203.17816162109375,
353
+ "debug/reference_rejected_logps": -173.13076782226562,
354
+ "epoch": 0.40476190476190477,
355
+ "grad_norm": 7.891928207816402,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -0.7377853393554688,
358
+ "logits/rejected": -0.7474555969238281,
359
+ "logps/chosen": -200.92160034179688,
360
+ "logps/rejected": -171.23770141601562,
361
+ "loss": 0.4943,
362
+ "rewards/accuracies": 0.625,
363
+ "rewards/chosen": 0.022565487772226334,
364
+ "rewards/margins": 0.0036347098648548126,
365
+ "rewards/rejected": 0.01893077790737152,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -0.7308653593063354,
370
+ "debug/policy_chosen_logps": -167.10345458984375,
371
+ "debug/policy_rejected_logits": -0.6757017970085144,
372
+ "debug/policy_rejected_logps": -181.9193115234375,
373
+ "debug/reference_chosen_logps": -167.61911010742188,
374
+ "debug/reference_rejected_logps": -181.3040313720703,
375
+ "epoch": 0.42857142857142855,
376
+ "grad_norm": 7.194541349513648,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -0.7308653593063354,
379
+ "logits/rejected": -0.6757017970085144,
380
+ "logps/chosen": -167.10345458984375,
381
+ "logps/rejected": -181.9193115234375,
382
+ "loss": 0.4943,
383
+ "rewards/accuracies": 0.625,
384
+ "rewards/chosen": 0.005156440194696188,
385
+ "rewards/margins": 0.011309223249554634,
386
+ "rewards/rejected": -0.0061527821235358715,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -1.0581732988357544,
391
+ "debug/policy_chosen_logps": -179.4306640625,
392
+ "debug/policy_rejected_logits": -0.7043694853782654,
393
+ "debug/policy_rejected_logps": -193.04718017578125,
394
+ "debug/reference_chosen_logps": -180.15823364257812,
395
+ "debug/reference_rejected_logps": -190.65826416015625,
396
+ "epoch": 0.4523809523809524,
397
+ "grad_norm": 7.6448718281675925,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -1.0581732988357544,
400
+ "logits/rejected": -0.7043694853782654,
401
+ "logps/chosen": -179.4306640625,
402
+ "logps/rejected": -193.04718017578125,
403
+ "loss": 0.4931,
404
+ "rewards/accuracies": 0.875,
405
+ "rewards/chosen": 0.007275867275893688,
406
+ "rewards/margins": 0.031165312975645065,
407
+ "rewards/rejected": -0.02388944663107395,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -0.9795810580253601,
412
+ "debug/policy_chosen_logps": -144.11380004882812,
413
+ "debug/policy_rejected_logits": -0.7538442611694336,
414
+ "debug/policy_rejected_logps": -214.71441650390625,
415
+ "debug/reference_chosen_logps": -144.12933349609375,
416
+ "debug/reference_rejected_logps": -213.61111450195312,
417
+ "epoch": 0.47619047619047616,
418
+ "grad_norm": 7.786576857481396,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -0.9795810580253601,
421
+ "logits/rejected": -0.7538442611694336,
422
+ "logps/chosen": -144.11380004882812,
423
+ "logps/rejected": -214.71441650390625,
424
+ "loss": 0.4928,
425
+ "rewards/accuracies": 0.625,
426
+ "rewards/chosen": 0.00015540141612291336,
427
+ "rewards/margins": 0.011188210919499397,
428
+ "rewards/rejected": -0.011032810434699059,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -0.7604963183403015,
433
+ "debug/policy_chosen_logps": -157.40818786621094,
434
+ "debug/policy_rejected_logits": -0.6597434282302856,
435
+ "debug/policy_rejected_logps": -185.41986083984375,
436
+ "debug/reference_chosen_logps": -160.44407653808594,
437
+ "debug/reference_rejected_logps": -182.03868103027344,
438
+ "epoch": 0.5,
439
+ "grad_norm": 7.858114281838474,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -0.7604963183403015,
442
+ "logits/rejected": -0.6597434282302856,
443
+ "logps/chosen": -157.40818786621094,
444
+ "logps/rejected": -185.41986083984375,
445
+ "loss": 0.4914,
446
+ "rewards/accuracies": 0.875,
447
+ "rewards/chosen": 0.03035891428589821,
448
+ "rewards/margins": 0.06417068839073181,
449
+ "rewards/rejected": -0.0338117778301239,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -0.8006449341773987,
454
+ "debug/policy_chosen_logps": -162.42440795898438,
455
+ "debug/policy_rejected_logits": -0.8503648638725281,
456
+ "debug/policy_rejected_logps": -174.94754028320312,
457
+ "debug/reference_chosen_logps": -162.67764282226562,
458
+ "debug/reference_rejected_logps": -174.77188110351562,
459
+ "epoch": 0.5238095238095238,
460
+ "grad_norm": 7.589809681723597,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -0.8006449341773987,
463
+ "logits/rejected": -0.8503648638725281,
464
+ "logps/chosen": -162.42440795898438,
465
+ "logps/rejected": -174.94754028320312,
466
+ "loss": 0.4952,
467
+ "rewards/accuracies": 0.625,
468
+ "rewards/chosen": 0.0025324064772576094,
469
+ "rewards/margins": 0.00428897887468338,
470
+ "rewards/rejected": -0.00175657297950238,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -0.6231905221939087,
475
+ "debug/policy_chosen_logps": -183.37933349609375,
476
+ "debug/policy_rejected_logits": -0.7476862668991089,
477
+ "debug/policy_rejected_logps": -187.78033447265625,
478
+ "debug/reference_chosen_logps": -184.22604370117188,
479
+ "debug/reference_rejected_logps": -189.98977661132812,
480
+ "epoch": 0.5476190476190477,
481
+ "grad_norm": 7.69965081121822,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -0.6231905221939087,
484
+ "logits/rejected": -0.7476862668991089,
485
+ "logps/chosen": -183.37933349609375,
486
+ "logps/rejected": -187.78033447265625,
487
+ "loss": 0.4966,
488
+ "rewards/accuracies": 0.375,
489
+ "rewards/chosen": 0.008467159233987331,
490
+ "rewards/margins": -0.013627204112708569,
491
+ "rewards/rejected": 0.0220943633466959,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -0.7983882427215576,
496
+ "debug/policy_chosen_logps": -185.93341064453125,
497
+ "debug/policy_rejected_logits": -0.6273135542869568,
498
+ "debug/policy_rejected_logps": -189.53146362304688,
499
+ "debug/reference_chosen_logps": -187.12313842773438,
500
+ "debug/reference_rejected_logps": -185.13299560546875,
501
+ "epoch": 0.5714285714285714,
502
+ "grad_norm": 7.265726875759624,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -0.7983882427215576,
505
+ "logits/rejected": -0.6273135542869568,
506
+ "logps/chosen": -185.93341064453125,
507
+ "logps/rejected": -189.53146362304688,
508
+ "loss": 0.4943,
509
+ "rewards/accuracies": 0.625,
510
+ "rewards/chosen": 0.011897353455424309,
511
+ "rewards/margins": 0.05588197708129883,
512
+ "rewards/rejected": -0.04398462548851967,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -0.7363254427909851,
517
+ "debug/policy_chosen_logps": -190.56884765625,
518
+ "debug/policy_rejected_logits": -0.8710167407989502,
519
+ "debug/policy_rejected_logps": -178.10833740234375,
520
+ "debug/reference_chosen_logps": -191.86727905273438,
521
+ "debug/reference_rejected_logps": -178.6790771484375,
522
+ "epoch": 0.5952380952380952,
523
+ "grad_norm": 8.03325556485008,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -0.7363254427909851,
526
+ "logits/rejected": -0.8710167407989502,
527
+ "logps/chosen": -190.56884765625,
528
+ "logps/rejected": -178.10833740234375,
529
+ "loss": 0.4987,
530
+ "rewards/accuracies": 0.5,
531
+ "rewards/chosen": 0.01298445649445057,
532
+ "rewards/margins": 0.007277126424014568,
533
+ "rewards/rejected": 0.005707330536097288,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -0.7094403505325317,
538
+ "debug/policy_chosen_logps": -192.48045349121094,
539
+ "debug/policy_rejected_logits": -0.8579303622245789,
540
+ "debug/policy_rejected_logps": -160.70278930664062,
541
+ "debug/reference_chosen_logps": -192.60931396484375,
542
+ "debug/reference_rejected_logps": -160.4152069091797,
543
+ "epoch": 0.6190476190476191,
544
+ "grad_norm": 8.392052457610536,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -0.7094403505325317,
547
+ "logits/rejected": -0.8579303622245789,
548
+ "logps/chosen": -192.48045349121094,
549
+ "logps/rejected": -160.70278930664062,
550
+ "loss": 0.4925,
551
+ "rewards/accuracies": 0.625,
552
+ "rewards/chosen": 0.0012884328607469797,
553
+ "rewards/margins": 0.004164162091910839,
554
+ "rewards/rejected": -0.002875728067010641,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -0.7367204427719116,
559
+ "debug/policy_chosen_logps": -184.6185302734375,
560
+ "debug/policy_rejected_logits": -0.9200541377067566,
561
+ "debug/policy_rejected_logps": -154.06069946289062,
562
+ "debug/reference_chosen_logps": -183.95535278320312,
563
+ "debug/reference_rejected_logps": -151.5093994140625,
564
+ "epoch": 0.6428571428571429,
565
+ "grad_norm": 7.914288836516155,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -0.7367204427719116,
568
+ "logits/rejected": -0.9200541377067566,
569
+ "logps/chosen": -184.6185302734375,
570
+ "logps/rejected": -154.06069946289062,
571
+ "loss": 0.4969,
572
+ "rewards/accuracies": 0.875,
573
+ "rewards/chosen": -0.006631812080740929,
574
+ "rewards/margins": 0.018881112337112427,
575
+ "rewards/rejected": -0.025512924417853355,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -0.7553640604019165,
580
+ "debug/policy_chosen_logps": -168.9253387451172,
581
+ "debug/policy_rejected_logits": -0.7094727754592896,
582
+ "debug/policy_rejected_logps": -187.59722900390625,
583
+ "debug/reference_chosen_logps": -169.41290283203125,
584
+ "debug/reference_rejected_logps": -185.31549072265625,
585
+ "epoch": 0.6666666666666666,
586
+ "grad_norm": 7.711585954287691,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -0.7553640604019165,
589
+ "logits/rejected": -0.7094727754592896,
590
+ "logps/chosen": -168.9253387451172,
591
+ "logps/rejected": -187.59722900390625,
592
+ "loss": 0.4958,
593
+ "rewards/accuracies": 0.375,
594
+ "rewards/chosen": 0.004875545389950275,
595
+ "rewards/margins": 0.027692819014191628,
596
+ "rewards/rejected": -0.022817274555563927,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -0.7795163989067078,
601
+ "debug/policy_chosen_logps": -174.99765014648438,
602
+ "debug/policy_rejected_logits": -0.6498023867607117,
603
+ "debug/policy_rejected_logps": -176.65753173828125,
604
+ "debug/reference_chosen_logps": -177.16432189941406,
605
+ "debug/reference_rejected_logps": -169.98782348632812,
606
+ "epoch": 0.6904761904761905,
607
+ "grad_norm": 8.24116900154138,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -0.7795163989067078,
610
+ "logits/rejected": -0.6498023867607117,
611
+ "logps/chosen": -174.99765014648438,
612
+ "logps/rejected": -176.65753173828125,
613
+ "loss": 0.4927,
614
+ "rewards/accuracies": 0.625,
615
+ "rewards/chosen": 0.02166653797030449,
616
+ "rewards/margins": 0.0883636474609375,
617
+ "rewards/rejected": -0.06669710576534271,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -0.765297532081604,
622
+ "debug/policy_chosen_logps": -166.05271911621094,
623
+ "debug/policy_rejected_logits": -0.7573273181915283,
624
+ "debug/policy_rejected_logps": -172.04452514648438,
625
+ "debug/reference_chosen_logps": -168.9293212890625,
626
+ "debug/reference_rejected_logps": -166.12124633789062,
627
+ "epoch": 0.7142857142857143,
628
+ "grad_norm": 7.684221348146213,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -0.765297532081604,
631
+ "logits/rejected": -0.7573273181915283,
632
+ "logps/chosen": -166.05271911621094,
633
+ "logps/rejected": -172.04452514648438,
634
+ "loss": 0.4765,
635
+ "rewards/accuracies": 0.5,
636
+ "rewards/chosen": 0.02876604162156582,
637
+ "rewards/margins": 0.0879988819360733,
638
+ "rewards/rejected": -0.05923284590244293,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -0.8140920400619507,
643
+ "debug/policy_chosen_logps": -157.61114501953125,
644
+ "debug/policy_rejected_logits": -0.8540012240409851,
645
+ "debug/policy_rejected_logps": -149.83985900878906,
646
+ "debug/reference_chosen_logps": -157.62954711914062,
647
+ "debug/reference_rejected_logps": -150.52252197265625,
648
+ "epoch": 0.7380952380952381,
649
+ "grad_norm": 7.312129798518852,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -0.8140920400619507,
652
+ "logits/rejected": -0.8540012240409851,
653
+ "logps/chosen": -157.61114501953125,
654
+ "logps/rejected": -149.83985900878906,
655
+ "loss": 0.4846,
656
+ "rewards/accuracies": 0.25,
657
+ "rewards/chosen": 0.00018399255350232124,
658
+ "rewards/margins": -0.006642608437687159,
659
+ "rewards/rejected": 0.0068266005255281925,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -0.828736424446106,
664
+ "debug/policy_chosen_logps": -199.23670959472656,
665
+ "debug/policy_rejected_logits": -0.8586711883544922,
666
+ "debug/policy_rejected_logps": -195.8037109375,
667
+ "debug/reference_chosen_logps": -202.66885375976562,
668
+ "debug/reference_rejected_logps": -196.2105712890625,
669
+ "epoch": 0.7619047619047619,
670
+ "grad_norm": 8.499188978992954,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -0.828736424446106,
673
+ "logits/rejected": -0.8586711883544922,
674
+ "logps/chosen": -199.23670959472656,
675
+ "logps/rejected": -195.8037109375,
676
+ "loss": 0.4947,
677
+ "rewards/accuracies": 0.875,
678
+ "rewards/chosen": 0.03432141989469528,
679
+ "rewards/margins": 0.030252739787101746,
680
+ "rewards/rejected": 0.004068680107593536,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -0.999685525894165,
685
+ "debug/policy_chosen_logps": -152.76031494140625,
686
+ "debug/policy_rejected_logits": -0.9262170195579529,
687
+ "debug/policy_rejected_logps": -162.0545654296875,
688
+ "debug/reference_chosen_logps": -154.5694580078125,
689
+ "debug/reference_rejected_logps": -163.3520050048828,
690
+ "epoch": 0.7857142857142857,
691
+ "grad_norm": 9.443819086095726,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -0.999685525894165,
694
+ "logits/rejected": -0.9262170195579529,
695
+ "logps/chosen": -152.76031494140625,
696
+ "logps/rejected": -162.0545654296875,
697
+ "loss": 0.4979,
698
+ "rewards/accuracies": 0.5,
699
+ "rewards/chosen": 0.018091343343257904,
700
+ "rewards/margins": 0.005116909742355347,
701
+ "rewards/rejected": 0.012974433600902557,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -0.8131186366081238,
706
+ "debug/policy_chosen_logps": -176.6094970703125,
707
+ "debug/policy_rejected_logits": -0.8697344660758972,
708
+ "debug/policy_rejected_logps": -163.28733825683594,
709
+ "debug/reference_chosen_logps": -178.95578002929688,
710
+ "debug/reference_rejected_logps": -164.67742919921875,
711
+ "epoch": 0.8095238095238095,
712
+ "grad_norm": 7.712830918193006,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -0.8131186366081238,
715
+ "logits/rejected": -0.8697344660758972,
716
+ "logps/chosen": -176.6094970703125,
717
+ "logps/rejected": -163.28733825683594,
718
+ "loss": 0.4802,
719
+ "rewards/accuracies": 0.625,
720
+ "rewards/chosen": 0.023462962359189987,
721
+ "rewards/margins": 0.009562014602124691,
722
+ "rewards/rejected": 0.013900947757065296,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -0.8859131336212158,
727
+ "debug/policy_chosen_logps": -151.08670043945312,
728
+ "debug/policy_rejected_logits": -0.7367634177207947,
729
+ "debug/policy_rejected_logps": -187.77529907226562,
730
+ "debug/reference_chosen_logps": -151.78994750976562,
731
+ "debug/reference_rejected_logps": -188.71148681640625,
732
+ "epoch": 0.8333333333333334,
733
+ "grad_norm": 7.772010316317829,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -0.8859131336212158,
736
+ "logits/rejected": -0.7367634177207947,
737
+ "logps/chosen": -151.08670043945312,
738
+ "logps/rejected": -187.77529907226562,
739
+ "loss": 0.4987,
740
+ "rewards/accuracies": 0.625,
741
+ "rewards/chosen": 0.007032603491097689,
742
+ "rewards/margins": -0.0023292540572583675,
743
+ "rewards/rejected": 0.009361859411001205,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -1.021881103515625,
748
+ "debug/policy_chosen_logps": -152.13558959960938,
749
+ "debug/policy_rejected_logits": -0.9204192757606506,
750
+ "debug/policy_rejected_logps": -156.73532104492188,
751
+ "debug/reference_chosen_logps": -153.53016662597656,
752
+ "debug/reference_rejected_logps": -155.900390625,
753
+ "epoch": 0.8571428571428571,
754
+ "grad_norm": 7.79269703298962,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -1.021881103515625,
757
+ "logits/rejected": -0.9204192757606506,
758
+ "logps/chosen": -152.13558959960938,
759
+ "logps/rejected": -156.73532104492188,
760
+ "loss": 0.4992,
761
+ "rewards/accuracies": 0.875,
762
+ "rewards/chosen": 0.013945741578936577,
763
+ "rewards/margins": 0.022295091301202774,
764
+ "rewards/rejected": -0.008349351584911346,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -0.9045673608779907,
769
+ "debug/policy_chosen_logps": -162.5475616455078,
770
+ "debug/policy_rejected_logits": -0.7598194479942322,
771
+ "debug/policy_rejected_logps": -182.2893524169922,
772
+ "debug/reference_chosen_logps": -160.34432983398438,
773
+ "debug/reference_rejected_logps": -181.31771850585938,
774
+ "epoch": 0.8809523809523809,
775
+ "grad_norm": 7.302553625861432,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -0.9045673608779907,
778
+ "logits/rejected": -0.7598194479942322,
779
+ "logps/chosen": -162.5475616455078,
780
+ "logps/rejected": -182.2893524169922,
781
+ "loss": 0.4948,
782
+ "rewards/accuracies": 0.375,
783
+ "rewards/chosen": -0.022032355889678,
784
+ "rewards/margins": -0.012315940111875534,
785
+ "rewards/rejected": -0.009716415777802467,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -0.8838567137718201,
790
+ "debug/policy_chosen_logps": -159.49447631835938,
791
+ "debug/policy_rejected_logits": -0.7277163863182068,
792
+ "debug/policy_rejected_logps": -173.29983520507812,
793
+ "debug/reference_chosen_logps": -157.34768676757812,
794
+ "debug/reference_rejected_logps": -174.5061798095703,
795
+ "epoch": 0.9047619047619048,
796
+ "grad_norm": 8.009966690297112,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -0.8838567137718201,
799
+ "logits/rejected": -0.7277163863182068,
800
+ "logps/chosen": -159.49447631835938,
801
+ "logps/rejected": -173.29983520507812,
802
+ "loss": 0.4969,
803
+ "rewards/accuracies": 0.5,
804
+ "rewards/chosen": -0.021467961370944977,
805
+ "rewards/margins": -0.033531542867422104,
806
+ "rewards/rejected": 0.012063578702509403,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -0.8647378087043762,
811
+ "debug/policy_chosen_logps": -168.72879028320312,
812
+ "debug/policy_rejected_logits": -0.8511701226234436,
813
+ "debug/policy_rejected_logps": -167.3757781982422,
814
+ "debug/reference_chosen_logps": -170.26791381835938,
815
+ "debug/reference_rejected_logps": -156.91049194335938,
816
+ "epoch": 0.9285714285714286,
817
+ "grad_norm": 10.230464779726292,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": -0.8647378087043762,
820
+ "logits/rejected": -0.8511701226234436,
821
+ "logps/chosen": -168.72879028320312,
822
+ "logps/rejected": -167.3757781982422,
823
+ "loss": 0.4957,
824
+ "rewards/accuracies": 0.625,
825
+ "rewards/chosen": 0.015391308814287186,
826
+ "rewards/margins": 0.12004411220550537,
827
+ "rewards/rejected": -0.10465279966592789,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": -0.8634677529335022,
832
+ "debug/policy_chosen_logps": -165.60440063476562,
833
+ "debug/policy_rejected_logits": -0.8949252963066101,
834
+ "debug/policy_rejected_logps": -175.58946228027344,
835
+ "debug/reference_chosen_logps": -165.80642700195312,
836
+ "debug/reference_rejected_logps": -175.4860076904297,
837
+ "epoch": 0.9523809523809523,
838
+ "grad_norm": 7.9621712252904775,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": -0.8634677529335022,
841
+ "logits/rejected": -0.8949252963066101,
842
+ "logps/chosen": -165.60440063476562,
843
+ "logps/rejected": -175.58946228027344,
844
+ "loss": 0.5005,
845
+ "rewards/accuracies": 0.5,
846
+ "rewards/chosen": 0.002020292915403843,
847
+ "rewards/margins": 0.003054857486858964,
848
+ "rewards/rejected": -0.0010345648042857647,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": -0.7188435792922974,
853
+ "debug/policy_chosen_logps": -172.37435913085938,
854
+ "debug/policy_rejected_logits": -0.7236220240592957,
855
+ "debug/policy_rejected_logps": -183.69638061523438,
856
+ "debug/reference_chosen_logps": -175.7699432373047,
857
+ "debug/reference_rejected_logps": -184.23777770996094,
858
+ "epoch": 0.9761904761904762,
859
+ "grad_norm": 7.838370371563211,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": -0.7188435792922974,
862
+ "logits/rejected": -0.7236220240592957,
863
+ "logps/chosen": -172.37435913085938,
864
+ "logps/rejected": -183.69638061523438,
865
+ "loss": 0.4862,
866
+ "rewards/accuracies": 0.875,
867
+ "rewards/chosen": 0.033955782651901245,
868
+ "rewards/margins": 0.028541620820760727,
869
+ "rewards/rejected": 0.005414160899817944,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": -0.5989887118339539,
874
+ "debug/policy_chosen_logps": -197.09739685058594,
875
+ "debug/policy_rejected_logits": -0.6930462718009949,
876
+ "debug/policy_rejected_logps": -169.6961212158203,
877
+ "debug/reference_chosen_logps": -200.82525634765625,
878
+ "debug/reference_rejected_logps": -174.00146484375,
879
+ "epoch": 1.0,
880
+ "grad_norm": 9.25027887070611,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": -0.5989887118339539,
883
+ "logits/rejected": -0.6930462718009949,
884
+ "logps/chosen": -197.09739685058594,
885
+ "logps/rejected": -169.6961212158203,
886
+ "loss": 0.5012,
887
+ "rewards/accuracies": 0.25,
888
+ "rewards/chosen": 0.037278518080711365,
889
+ "rewards/margins": -0.005774736870080233,
890
+ "rewards/rejected": 0.04305325448513031,
891
+ "step": 42
892
+ },
893
+ {
894
+ "epoch": 1.0,
895
+ "step": 42,
896
+ "total_flos": 0.0,
897
+ "train_loss": 0.4958818086556026,
898
+ "train_runtime": 153.9035,
899
+ "train_samples_per_second": 17.465,
900
+ "train_steps_per_second": 0.273
901
+ }
902
+ ],
903
+ "logging_steps": 1,
904
+ "max_steps": 42,
905
+ "num_input_tokens_seen": 0,
906
+ "num_train_epochs": 1,
907
+ "save_steps": 500,
908
+ "stateful_callbacks": {
909
+ "TrainerControl": {
910
+ "args": {
911
+ "should_epoch_stop": false,
912
+ "should_evaluate": false,
913
+ "should_log": false,
914
+ "should_save": true,
915
+ "should_training_stop": true
916
+ },
917
+ "attributes": {}
918
+ }
919
+ },
920
+ "total_flos": 0.0,
921
+ "train_batch_size": 8,
922
+ "trial_name": null,
923
+ "trial_params": null
924
+ }