lewtun HF staff committed on
Commit
1461795
1 Parent(s): c1a1250

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - dpo
9
  - generated_from_trainer
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  model-index:
13
  - name: zephyr-7b-dpo-full
14
  results: []
@@ -19,17 +15,17 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # zephyr-7b-dpo-full
21
 
22
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.5028
25
- - Rewards/chosen: -0.9469
26
- - Rewards/rejected: -1.8932
27
- - Rewards/accuracies: 0.7656
28
- - Rewards/margins: 0.9463
29
- - Logps/rejected: -451.4661
30
- - Logps/chosen: -357.2325
31
- - Logits/rejected: 1.5731
32
- - Logits/chosen: 0.6530
33
 
34
  ## Model description
35
 
@@ -66,10 +62,10 @@ The following hyperparameters were used during training:
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
68
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
69
- | 0.5545 | 0.21 | 100 | 0.5658 | -0.4953 | -1.1217 | 0.7188 | 0.6264 | -374.3159 | -312.0799 | -1.0287 | -1.3212 |
70
- | 0.5026 | 0.42 | 200 | 0.5202 | -0.8995 | -1.7718 | 0.7461 | 0.8723 | -439.3264 | -352.4985 | 0.5190 | -0.1773 |
71
- | 0.5106 | 0.63 | 300 | 0.5104 | -0.7946 | -1.6285 | 0.7656 | 0.8339 | -424.9976 | -342.0043 | 0.9099 | 0.0862 |
72
- | 0.4859 | 0.84 | 400 | 0.5031 | -0.9777 | -1.9580 | 0.7578 | 0.9803 | -457.9452 | -360.3139 | 1.7438 | 0.7818 |
73
 
74
 
75
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
 
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
 
 
8
  model-index:
9
  - name: zephyr-7b-dpo-full
10
  results: []
 
15
 
16
  # zephyr-7b-dpo-full
17
 
18
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.5042
21
+ - Rewards/chosen: -1.0500
22
+ - Rewards/rejected: -2.0480
23
+ - Rewards/accuracies: 0.7539
24
+ - Rewards/margins: 0.9980
25
+ - Logps/rejected: -468.1450
26
+ - Logps/chosen: -368.4135
27
+ - Logits/rejected: 2.3821
28
+ - Logits/chosen: 1.6141
29
 
30
  ## Model description
31
 
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.5723 | 0.21 | 100 | 0.5851 | -0.4097 | -0.8752 | 0.7031 | 0.4655 | -350.8695 | -304.3812 | -2.3494 | -2.4070 |
66
+ | 0.5084 | 0.42 | 200 | 0.5251 | -0.9116 | -1.7472 | 0.7422 | 0.8355 | -438.0663 | -354.5790 | 1.3918 | 0.9248 |
67
+ | 0.5059 | 0.63 | 300 | 0.5130 | -0.8646 | -1.7542 | 0.75 | 0.8896 | -438.7735 | -349.8758 | 2.0331 | 1.2558 |
68
+ | 0.4853 | 0.84 | 400 | 0.5050 | -1.0929 | -2.1085 | 0.7539 | 1.0156 | -474.1963 | -372.7067 | 2.5922 | 1.8194 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 0.6529867053031921,
4
- "eval_logits/rejected": 1.5730761289596558,
5
- "eval_logps/chosen": -357.2324523925781,
6
- "eval_logps/rejected": -451.466064453125,
7
- "eval_loss": 0.5028161406517029,
8
- "eval_rewards/accuracies": 0.765625,
9
- "eval_rewards/chosen": -0.9468507170677185,
10
- "eval_rewards/margins": 0.946345865726471,
11
- "eval_rewards/rejected": -1.8931965827941895,
12
- "eval_runtime": 86.4354,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 23.139,
15
- "eval_steps_per_second": 0.37,
16
- "train_loss": 0.5366686437918052,
17
- "train_runtime": 5196.7487,
18
  "train_samples": 61135,
19
- "train_samples_per_second": 11.764,
20
- "train_steps_per_second": 0.092
21
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": 1.6140714883804321,
4
+ "eval_logits/rejected": 2.3821487426757812,
5
+ "eval_logps/chosen": -368.41351318359375,
6
+ "eval_logps/rejected": -468.14495849609375,
7
+ "eval_loss": 0.5042223334312439,
8
+ "eval_rewards/accuracies": 0.75390625,
9
+ "eval_rewards/chosen": -1.049981713294983,
10
+ "eval_rewards/margins": 0.9979785680770874,
11
+ "eval_rewards/rejected": -2.0479602813720703,
12
+ "eval_runtime": 91.1991,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 21.93,
15
+ "eval_steps_per_second": 0.351,
16
+ "train_loss": 0.5379065808890754,
17
+ "train_runtime": 5396.8094,
18
  "train_samples": 61135,
19
+ "train_samples_per_second": 11.328,
20
+ "train_steps_per_second": 0.089
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 0.6529867053031921,
4
- "eval_logits/rejected": 1.5730761289596558,
5
- "eval_logps/chosen": -357.2324523925781,
6
- "eval_logps/rejected": -451.466064453125,
7
- "eval_loss": 0.5028161406517029,
8
- "eval_rewards/accuracies": 0.765625,
9
- "eval_rewards/chosen": -0.9468507170677185,
10
- "eval_rewards/margins": 0.946345865726471,
11
- "eval_rewards/rejected": -1.8931965827941895,
12
- "eval_runtime": 86.4354,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 23.139,
15
- "eval_steps_per_second": 0.37
16
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": 1.6140714883804321,
4
+ "eval_logits/rejected": 2.3821487426757812,
5
+ "eval_logps/chosen": -368.41351318359375,
6
+ "eval_logps/rejected": -468.14495849609375,
7
+ "eval_loss": 0.5042223334312439,
8
+ "eval_rewards/accuracies": 0.75390625,
9
+ "eval_rewards/chosen": -1.049981713294983,
10
+ "eval_rewards/margins": 0.9979785680770874,
11
+ "eval_rewards/rejected": -2.0479602813720703,
12
+ "eval_runtime": 91.1991,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 21.93,
15
+ "eval_steps_per_second": 0.351
16
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17698e797fee795be264ff6fceb1ce5de5e2d3504ccebb6cc762eccf36863396
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74afc0675c24c08eae25d9a1b5d0d849aba3015f65871f579181d4f822dd5216
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d57e247aa8357fee9c5771c9956ae96eea45624a113b40fdba0fcd2b848476f7
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d7a278fc93c6f6c9f8c55bfe6421851acb60fb90b311cdbd8edf9e9f44858b7
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daa924d9453a024a65583a098420d94aa3946268cadc893d3a672cde300d7731
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35c86540b037e38a7b9b55a77f9551c811ebda4b3b99b4c31acdf75bffdf9917
3
  size 4540516344
runs/Jan10_03-25-54_ip-26-0-168-34/events.out.tfevents.1704857214.ip-26-0-168-34.1146299.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf725a56b70c28826538ef708644d9fb6baeb7dd379efca2b4e66857bdb5aa34
3
- size 33329
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d16ef4aaa5aa22c0a866161c2f9c5b8fcc1f479c9f913439e27dfe74dd18608d
3
+ size 38121
runs/Jan10_03-25-54_ip-26-0-168-34/events.out.tfevents.1704862702.ip-26-0-168-34.1146299.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3fbb23ad68116119e144362585f232e6defa4b4d47e223c62e5f3615b2d54ba
3
+ size 828
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.5366686437918052,
4
- "train_runtime": 5196.7487,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 11.764,
7
- "train_steps_per_second": 0.092
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.5379065808890754,
4
+ "train_runtime": 5396.8094,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 11.328,
7
+ "train_steps_per_second": 0.089
8
  }
trainer_state.json CHANGED
@@ -11,10 +11,10 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.0416666666666666e-08,
14
- "logits/chosen": -2.5017967224121094,
15
- "logits/rejected": -2.3871021270751953,
16
- "logps/chosen": -332.3011474609375,
17
- "logps/rejected": -277.1512756347656,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
@@ -25,733 +25,733 @@
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-07,
28
- "logits/chosen": -2.540015459060669,
29
- "logits/rejected": -2.500929832458496,
30
- "logps/chosen": -247.11024475097656,
31
- "logps/rejected": -213.2850341796875,
32
  "loss": 0.6932,
33
- "rewards/accuracies": 0.4097222089767456,
34
- "rewards/chosen": -0.000517037755344063,
35
- "rewards/margins": -0.00017402732919435948,
36
- "rewards/rejected": -0.0003430104407016188,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333333e-07,
42
- "logits/chosen": -2.5355143547058105,
43
- "logits/rejected": -2.484562873840332,
44
- "logps/chosen": -272.35711669921875,
45
- "logps/rejected": -249.6931915283203,
46
- "loss": 0.6924,
47
- "rewards/accuracies": 0.581250011920929,
48
- "rewards/chosen": 0.0008343200897797942,
49
- "rewards/margins": 0.001956597436219454,
50
- "rewards/rejected": -0.001122277113609016,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.1249999999999997e-07,
56
- "logits/chosen": -2.6150834560394287,
57
- "logits/rejected": -2.542232036590576,
58
- "logps/chosen": -283.331298828125,
59
- "logps/rejected": -276.0316467285156,
60
- "loss": 0.6885,
61
- "rewards/accuracies": 0.612500011920929,
62
- "rewards/chosen": 0.008794735185801983,
63
- "rewards/margins": 0.006988237611949444,
64
- "rewards/rejected": 0.0018064973410218954,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
- "logits/chosen": -2.519700527191162,
71
- "logits/rejected": -2.4544477462768555,
72
- "logps/chosen": -288.609130859375,
73
- "logps/rejected": -265.47576904296875,
74
  "loss": 0.6756,
75
- "rewards/accuracies": 0.668749988079071,
76
- "rewards/chosen": 0.04671400785446167,
77
- "rewards/margins": 0.03756200894713402,
78
- "rewards/rejected": 0.009151997044682503,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999733114418725e-07,
84
- "logits/chosen": -2.4968419075012207,
85
- "logits/rejected": -2.4539003372192383,
86
- "logps/chosen": -312.46380615234375,
87
- "logps/rejected": -309.41632080078125,
88
  "loss": 0.6601,
89
- "rewards/accuracies": 0.7250000238418579,
90
- "rewards/chosen": -0.006021331064403057,
91
- "rewards/margins": 0.07272790372371674,
92
- "rewards/rejected": -0.07874923944473267,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.990398100856366e-07,
98
- "logits/chosen": -2.461306095123291,
99
- "logits/rejected": -2.4000632762908936,
100
- "logps/chosen": -267.9833679199219,
101
- "logps/rejected": -251.8109130859375,
102
- "loss": 0.6381,
103
  "rewards/accuracies": 0.668749988079071,
104
- "rewards/chosen": -0.10297150909900665,
105
- "rewards/margins": 0.12468205392360687,
106
- "rewards/rejected": -0.22765357792377472,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967775735898179e-07,
112
- "logits/chosen": -2.427084445953369,
113
- "logits/rejected": -2.366863250732422,
114
- "logps/chosen": -298.81085205078125,
115
- "logps/rejected": -287.7050476074219,
116
- "loss": 0.6177,
117
- "rewards/accuracies": 0.6499999761581421,
118
- "rewards/chosen": -0.1814979612827301,
119
- "rewards/margins": 0.2026226818561554,
120
- "rewards/rejected": -0.3841206133365631,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.931986719649298e-07,
126
- "logits/chosen": -1.6477082967758179,
127
- "logits/rejected": -1.5141746997833252,
128
- "logps/chosen": -328.52178955078125,
129
- "logps/rejected": -357.9725646972656,
130
- "loss": 0.609,
131
- "rewards/accuracies": 0.699999988079071,
132
- "rewards/chosen": -0.5934440493583679,
133
- "rewards/margins": 0.24087591469287872,
134
- "rewards/rejected": -0.8343199491500854,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.883222001996351e-07,
140
- "logits/chosen": -1.456400990486145,
141
- "logits/rejected": -1.1816024780273438,
142
- "logps/chosen": -324.58734130859375,
143
- "logps/rejected": -347.7338562011719,
144
- "loss": 0.5717,
145
- "rewards/accuracies": 0.7124999761581421,
146
- "rewards/chosen": -0.5771389603614807,
147
- "rewards/margins": 0.38918066024780273,
148
- "rewards/rejected": -0.966319739818573,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.821741763807186e-07,
154
- "logits/chosen": -1.2872960567474365,
155
- "logits/rejected": -0.9513088464736938,
156
- "logps/chosen": -339.1795959472656,
157
- "logps/rejected": -366.32769775390625,
158
- "loss": 0.5545,
159
- "rewards/accuracies": 0.699999988079071,
160
- "rewards/chosen": -0.5583249926567078,
161
- "rewards/margins": 0.47431764006614685,
162
- "rewards/rejected": -1.0326426029205322,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
- "eval_logits/chosen": -1.3212240934371948,
168
- "eval_logits/rejected": -1.0286760330200195,
169
- "eval_logps/chosen": -312.0798645019531,
170
- "eval_logps/rejected": -374.31585693359375,
171
- "eval_loss": 0.5658453106880188,
172
- "eval_rewards/accuracies": 0.71875,
173
- "eval_rewards/chosen": -0.49532508850097656,
174
- "eval_rewards/margins": 0.6263692378997803,
175
- "eval_rewards/rejected": -1.1216944456100464,
176
- "eval_runtime": 85.4149,
177
- "eval_samples_per_second": 23.415,
178
- "eval_steps_per_second": 0.375,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.747874028753375e-07,
184
- "logits/chosen": -1.1495229005813599,
185
- "logits/rejected": -0.6862327456474304,
186
- "logps/chosen": -370.4473571777344,
187
- "logps/rejected": -381.8430480957031,
188
- "loss": 0.5534,
189
- "rewards/accuracies": 0.75,
190
- "rewards/chosen": -0.503128170967102,
191
- "rewards/margins": 0.6164419054985046,
192
- "rewards/rejected": -1.1195701360702515,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.662012913161997e-07,
198
- "logits/chosen": -0.21524420380592346,
199
- "logits/rejected": 0.5560011863708496,
200
- "logps/chosen": -363.287353515625,
201
- "logps/rejected": -385.227783203125,
202
- "loss": 0.5308,
203
- "rewards/accuracies": 0.71875,
204
- "rewards/chosen": -0.8372129201889038,
205
- "rewards/margins": 0.710771918296814,
206
- "rewards/rejected": -1.5479847192764282,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5646165232345103e-07,
212
- "logits/chosen": -0.9693559408187866,
213
- "logits/rejected": -0.585214376449585,
214
- "logps/chosen": -310.8514404296875,
215
- "logps/rejected": -407.11956787109375,
216
- "loss": 0.5387,
217
- "rewards/accuracies": 0.7437499761581421,
218
- "rewards/chosen": -0.4881950318813324,
219
- "rewards/margins": 0.6856316328048706,
220
- "rewards/rejected": -1.1738265752792358,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.456204510851956e-07,
226
- "logits/chosen": -1.0192829370498657,
227
- "logits/rejected": -0.2164110690355301,
228
- "logps/chosen": -359.496826171875,
229
- "logps/rejected": -383.21826171875,
230
- "loss": 0.5381,
231
- "rewards/accuracies": 0.7562500238418579,
232
- "rewards/chosen": -0.6586028337478638,
233
- "rewards/margins": 0.6349435448646545,
234
- "rewards/rejected": -1.2935463190078735,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.337355301007335e-07,
240
- "logits/chosen": -0.4600129723548889,
241
- "logits/rejected": 0.2966030240058899,
242
- "logps/chosen": -379.4418640136719,
243
- "logps/rejected": -396.3006286621094,
244
- "loss": 0.5288,
245
- "rewards/accuracies": 0.7124999761581421,
246
- "rewards/chosen": -0.9255654215812683,
247
- "rewards/margins": 0.6836373805999756,
248
- "rewards/rejected": -1.6092027425765991,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.2087030056579986e-07,
254
- "logits/chosen": -0.17871518433094025,
255
- "logits/rejected": 0.4601938724517822,
256
- "logps/chosen": -382.32659912109375,
257
- "logps/rejected": -447.52716064453125,
258
- "loss": 0.5063,
259
- "rewards/accuracies": 0.7875000238418579,
260
- "rewards/chosen": -0.9520798921585083,
261
- "rewards/margins": 0.810064435005188,
262
- "rewards/rejected": -1.7621443271636963,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.070934040463998e-07,
268
- "logits/chosen": -0.7312272191047668,
269
- "logits/rejected": -0.21112406253814697,
270
- "logps/chosen": -320.51849365234375,
271
- "logps/rejected": -360.88427734375,
272
- "loss": 0.5618,
273
- "rewards/accuracies": 0.6499999761581421,
274
- "rewards/chosen": -0.7297524213790894,
275
- "rewards/margins": 0.565701425075531,
276
- "rewards/rejected": -1.2954537868499756,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.9247834624635404e-07,
282
- "logits/chosen": -0.5727821588516235,
283
- "logits/rejected": -0.19889096915721893,
284
- "logps/chosen": -369.8377990722656,
285
- "logps/rejected": -436.2527770996094,
286
- "loss": 0.5238,
287
- "rewards/accuracies": 0.71875,
288
- "rewards/chosen": -0.7431085705757141,
289
- "rewards/margins": 0.8097877502441406,
290
- "rewards/rejected": -1.55289626121521,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7710310482256523e-07,
296
- "logits/chosen": -0.8184123039245605,
297
- "logits/rejected": -0.09880775213241577,
298
- "logps/chosen": -360.60174560546875,
299
- "logps/rejected": -434.35540771484375,
300
- "loss": 0.5039,
301
- "rewards/accuracies": 0.762499988079071,
302
- "rewards/chosen": -0.7761167287826538,
303
- "rewards/margins": 0.8615070581436157,
304
- "rewards/rejected": -1.6376237869262695,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.610497133404795e-07,
310
- "logits/chosen": -0.597638726234436,
311
- "logits/rejected": 0.14091506600379944,
312
- "logps/chosen": -373.0635681152344,
313
- "logps/rejected": -402.6120910644531,
314
- "loss": 0.5026,
315
- "rewards/accuracies": 0.65625,
316
- "rewards/chosen": -0.908000648021698,
317
- "rewards/margins": 0.6652692556381226,
318
- "rewards/rejected": -1.5732697248458862,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
- "eval_logits/chosen": -0.17727772891521454,
324
- "eval_logits/rejected": 0.5189895629882812,
325
- "eval_logps/chosen": -352.49847412109375,
326
- "eval_logps/rejected": -439.3264465332031,
327
- "eval_loss": 0.5201631188392639,
328
- "eval_rewards/accuracies": 0.74609375,
329
- "eval_rewards/chosen": -0.899510383605957,
330
- "eval_rewards/margins": 0.8722902536392212,
331
- "eval_rewards/rejected": -1.7718006372451782,
332
- "eval_runtime": 85.7368,
333
- "eval_samples_per_second": 23.327,
334
- "eval_steps_per_second": 0.373,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.4440382358952115e-07,
340
- "logits/chosen": -0.04346243292093277,
341
- "logits/rejected": 0.8316682577133179,
342
- "logps/chosen": -377.2864990234375,
343
- "logps/rejected": -403.12933349609375,
344
- "loss": 0.5357,
345
  "rewards/accuracies": 0.6937500238418579,
346
- "rewards/chosen": -1.005774736404419,
347
- "rewards/margins": 0.6370893716812134,
348
- "rewards/rejected": -1.6428638696670532,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.272542485937368e-07,
354
- "logits/chosen": 0.27445563673973083,
355
- "logits/rejected": 0.8739471435546875,
356
- "logps/chosen": -359.21343994140625,
357
- "logps/rejected": -410.8561096191406,
358
  "loss": 0.5269,
359
- "rewards/accuracies": 0.793749988079071,
360
- "rewards/chosen": -0.8274116516113281,
361
- "rewards/margins": 0.8387683033943176,
362
- "rewards/rejected": -1.6661800146102905,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.096924887558854e-07,
368
- "logits/chosen": 0.12390404939651489,
369
- "logits/rejected": 0.7874934673309326,
370
- "logps/chosen": -343.6993713378906,
371
- "logps/rejected": -443.02716064453125,
372
- "loss": 0.5192,
373
- "rewards/accuracies": 0.7875000238418579,
374
- "rewards/chosen": -0.7115140557289124,
375
- "rewards/margins": 0.9087392091751099,
376
- "rewards/rejected": -1.6202532052993774,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9181224366319943e-07,
382
- "logits/chosen": 0.47393637895584106,
383
- "logits/rejected": 1.2820873260498047,
384
- "logps/chosen": -341.52032470703125,
385
- "logps/rejected": -402.3468322753906,
386
- "loss": 0.5251,
387
- "rewards/accuracies": 0.699999988079071,
388
- "rewards/chosen": -0.9017995595932007,
389
- "rewards/margins": 0.7208075523376465,
390
- "rewards/rejected": -1.6226072311401367,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.7370891215954565e-07,
396
- "logits/chosen": 0.6556827425956726,
397
- "logits/rejected": 1.5371648073196411,
398
- "logps/chosen": -363.3061218261719,
399
- "logps/rejected": -423.22216796875,
400
- "loss": 0.5168,
401
- "rewards/accuracies": 0.6937500238418579,
402
- "rewards/chosen": -0.9660114049911499,
403
- "rewards/margins": 0.6768711805343628,
404
- "rewards/rejected": -1.6428825855255127,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.55479083351317e-07,
410
- "logits/chosen": 1.074449062347412,
411
- "logits/rejected": 2.2258691787719727,
412
- "logps/chosen": -393.25042724609375,
413
- "logps/rejected": -435.5625915527344,
414
- "loss": 0.5007,
415
- "rewards/accuracies": 0.793749988079071,
416
- "rewards/chosen": -1.1183383464813232,
417
- "rewards/margins": 0.817892849445343,
418
- "rewards/rejected": -1.936231255531311,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.56,
423
  "learning_rate": 2.3722002126275822e-07,
424
- "logits/chosen": 1.035612940788269,
425
- "logits/rejected": 1.9490232467651367,
426
- "logps/chosen": -391.06488037109375,
427
- "logps/rejected": -470.64599609375,
428
- "loss": 0.5064,
429
- "rewards/accuracies": 0.7749999761581421,
430
- "rewards/chosen": -1.1720463037490845,
431
- "rewards/margins": 0.9149090051651001,
432
- "rewards/rejected": -2.0869553089141846,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.19029145890313e-07,
438
- "logits/chosen": 0.5210274457931519,
439
- "logits/rejected": 1.496964693069458,
440
- "logps/chosen": -369.4839782714844,
441
- "logps/rejected": -415.17974853515625,
442
- "loss": 0.5189,
443
- "rewards/accuracies": 0.762499988079071,
444
- "rewards/chosen": -0.9482043981552124,
445
- "rewards/margins": 0.8109124898910522,
446
- "rewards/rejected": -1.759116768836975,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 2.0100351342479216e-07,
452
- "logits/chosen": 0.4483126103878021,
453
- "logits/rejected": 1.3368585109710693,
454
- "logps/chosen": -373.81475830078125,
455
- "logps/rejected": -435.36151123046875,
456
- "loss": 0.519,
457
- "rewards/accuracies": 0.7437499761581421,
458
- "rewards/chosen": -0.8730584979057312,
459
- "rewards/margins": 0.7535277009010315,
460
- "rewards/rejected": -1.6265861988067627,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8323929841460178e-07,
466
- "logits/chosen": 0.27887973189353943,
467
- "logits/rejected": 1.0783421993255615,
468
- "logps/chosen": -381.5480651855469,
469
- "logps/rejected": -453.80242919921875,
470
- "loss": 0.5106,
471
- "rewards/accuracies": 0.7250000238418579,
472
- "rewards/chosen": -0.8568315505981445,
473
- "rewards/margins": 0.6526933312416077,
474
- "rewards/rejected": -1.5095248222351074,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
- "eval_logits/chosen": 0.08615332096815109,
480
- "eval_logits/rejected": 0.9099248647689819,
481
- "eval_logps/chosen": -342.00433349609375,
482
- "eval_logps/rejected": -424.99755859375,
483
- "eval_loss": 0.5103623270988464,
484
- "eval_rewards/accuracies": 0.765625,
485
- "eval_rewards/chosen": -0.7945692539215088,
486
- "eval_rewards/margins": 0.8339425921440125,
487
- "eval_rewards/rejected": -1.628511905670166,
488
- "eval_runtime": 86.1066,
489
- "eval_samples_per_second": 23.227,
490
- "eval_steps_per_second": 0.372,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.6583128063291573e-07,
496
- "logits/chosen": 0.13822266459465027,
497
- "logits/rejected": 1.3017876148223877,
498
- "logps/chosen": -365.63958740234375,
499
- "logps/rejected": -384.9111328125,
500
- "loss": 0.4988,
501
  "rewards/accuracies": 0.75,
502
- "rewards/chosen": -0.9199568629264832,
503
- "rewards/margins": 0.7403375506401062,
504
- "rewards/rejected": -1.6602942943572998,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.488723393865766e-07,
510
- "logits/chosen": 0.672313392162323,
511
- "logits/rejected": 1.8797632455825806,
512
- "logps/chosen": -326.7810974121094,
513
- "logps/rejected": -421.84765625,
514
- "loss": 0.5073,
515
  "rewards/accuracies": 0.7749999761581421,
516
- "rewards/chosen": -0.7888548374176025,
517
- "rewards/margins": 1.0594618320465088,
518
- "rewards/rejected": -1.8483164310455322,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3245295796480788e-07,
524
- "logits/chosen": 0.46894121170043945,
525
- "logits/rejected": 1.3748019933700562,
526
- "logps/chosen": -373.7689208984375,
527
- "logps/rejected": -419.54364013671875,
528
- "loss": 0.5143,
529
- "rewards/accuracies": 0.7562500238418579,
530
- "rewards/chosen": -0.8813614845275879,
531
- "rewards/margins": 0.7780431509017944,
532
- "rewards/rejected": -1.6594045162200928,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1666074087171627e-07,
538
- "logits/chosen": 0.8704012036323547,
539
- "logits/rejected": 1.6405874490737915,
540
- "logps/chosen": -351.2308654785156,
541
- "logps/rejected": -407.41064453125,
542
- "loss": 0.5135,
543
  "rewards/accuracies": 0.7250000238418579,
544
- "rewards/chosen": -0.8790988922119141,
545
- "rewards/margins": 0.7077994346618652,
546
- "rewards/rejected": -1.5868984460830688,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0157994641835734e-07,
552
- "logits/chosen": 0.8922025561332703,
553
- "logits/rejected": 2.06630277633667,
554
- "logps/chosen": -348.2925720214844,
555
- "logps/rejected": -419.91363525390625,
556
- "loss": 0.4771,
557
- "rewards/accuracies": 0.8125,
558
- "rewards/chosen": -0.9759310483932495,
559
- "rewards/margins": 0.898769736289978,
560
- "rewards/rejected": -1.8747007846832275,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.729103716819111e-08,
566
- "logits/chosen": 0.996601402759552,
567
- "logits/rejected": 2.303062915802002,
568
- "logps/chosen": -422.4617614746094,
569
- "logps/rejected": -457.77691650390625,
570
- "loss": 0.4833,
571
- "rewards/accuracies": 0.7749999761581421,
572
- "rewards/chosen": -1.044933557510376,
573
- "rewards/margins": 0.9208766222000122,
574
- "rewards/rejected": -1.9658104181289673,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.387025063449081e-08,
580
- "logits/chosen": 1.0122365951538086,
581
- "logits/rejected": 2.279644727706909,
582
- "logps/chosen": -411.8594665527344,
583
- "logps/rejected": -457.96807861328125,
584
- "loss": 0.4969,
585
- "rewards/accuracies": 0.8062499761581421,
586
- "rewards/chosen": -1.0128066539764404,
587
- "rewards/margins": 1.0020781755447388,
588
- "rewards/rejected": -2.0148847103118896,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.79,
593
  "learning_rate": 6.138919252022435e-08,
594
- "logits/chosen": 0.8450608253479004,
595
- "logits/rejected": 1.820166826248169,
596
- "logps/chosen": -383.67559814453125,
597
- "logps/rejected": -475.4779357910156,
598
- "loss": 0.4768,
599
- "rewards/accuracies": 0.768750011920929,
600
- "rewards/chosen": -1.0418307781219482,
601
- "rewards/margins": 0.9196658134460449,
602
- "rewards/rejected": -1.9614967107772827,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.991445467064689e-08,
608
- "logits/chosen": 1.0426132678985596,
609
- "logits/rejected": 1.9015953540802002,
610
- "logps/chosen": -384.1719970703125,
611
- "logps/rejected": -459.9111328125,
612
- "loss": 0.5073,
613
- "rewards/accuracies": 0.6937500238418579,
614
- "rewards/chosen": -1.1213358640670776,
615
- "rewards/margins": 0.7478152513504028,
616
- "rewards/rejected": -1.8691511154174805,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.9507259776993954e-08,
622
- "logits/chosen": 0.8149229288101196,
623
- "logits/rejected": 1.6555734872817993,
624
- "logps/chosen": -430.68231201171875,
625
- "logps/rejected": -496.66900634765625,
626
- "loss": 0.4859,
627
- "rewards/accuracies": 0.7749999761581421,
628
- "rewards/chosen": -1.098642110824585,
629
- "rewards/margins": 0.9415690302848816,
630
- "rewards/rejected": -2.0402112007141113,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
- "eval_logits/chosen": 0.7817753553390503,
636
- "eval_logits/rejected": 1.7438044548034668,
637
- "eval_logps/chosen": -360.3138732910156,
638
- "eval_logps/rejected": -457.9451599121094,
639
- "eval_loss": 0.5031262040138245,
640
- "eval_rewards/accuracies": 0.7578125,
641
- "eval_rewards/chosen": -0.9776647090911865,
642
- "eval_rewards/margins": 0.9803228974342346,
643
- "eval_rewards/rejected": -1.9579875469207764,
644
- "eval_runtime": 86.2141,
645
- "eval_samples_per_second": 23.198,
646
- "eval_steps_per_second": 0.371,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 3.022313472693447e-08,
652
- "logits/chosen": 1.0873724222183228,
653
- "logits/rejected": 2.1809914112091064,
654
- "logps/chosen": -358.1825256347656,
655
- "logps/rejected": -440.631591796875,
656
- "loss": 0.5114,
657
- "rewards/accuracies": 0.762499988079071,
658
- "rewards/chosen": -0.9868854284286499,
659
- "rewards/margins": 0.9311205744743347,
660
- "rewards/rejected": -1.918006181716919,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.2111614344599684e-08,
666
- "logits/chosen": 0.8678689002990723,
667
- "logits/rejected": 1.581181526184082,
668
- "logps/chosen": -370.5611877441406,
669
- "logps/rejected": -477.7791442871094,
670
- "loss": 0.5041,
671
- "rewards/accuracies": 0.75,
672
- "rewards/chosen": -0.9568517804145813,
673
- "rewards/margins": 0.967370331287384,
674
- "rewards/rejected": -1.9242219924926758,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.521597710086439e-08,
680
- "logits/chosen": 0.4731677174568176,
681
- "logits/rejected": 1.4058442115783691,
682
- "logps/chosen": -396.15325927734375,
683
- "logps/rejected": -442.45404052734375,
684
- "loss": 0.5063,
685
- "rewards/accuracies": 0.762499988079071,
686
- "rewards/chosen": -0.994260311126709,
687
- "rewards/margins": 0.8869916796684265,
688
- "rewards/rejected": -1.8812516927719116,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.57301420397924e-09,
694
- "logits/chosen": 0.8329499959945679,
695
- "logits/rejected": 1.7063286304473877,
696
- "logps/chosen": -360.3570861816406,
697
- "logps/rejected": -442.45574951171875,
698
- "loss": 0.4884,
699
- "rewards/accuracies": 0.762499988079071,
700
- "rewards/chosen": -1.0168806314468384,
701
- "rewards/margins": 0.8515257835388184,
702
- "rewards/rejected": -1.8684062957763672,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 5.212833302556258e-09,
708
- "logits/chosen": 0.8873745799064636,
709
- "logits/rejected": 1.8809913396835327,
710
- "logps/chosen": -344.71197509765625,
711
- "logps/rejected": -413.39471435546875,
712
- "loss": 0.5068,
713
- "rewards/accuracies": 0.7437499761581421,
714
- "rewards/chosen": -0.99580317735672,
715
- "rewards/margins": 0.8256435394287109,
716
- "rewards/rejected": -1.8214466571807861,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 2.158697848236607e-09,
722
- "logits/chosen": 1.0402039289474487,
723
- "logits/rejected": 2.0626494884490967,
724
- "logps/chosen": -350.25115966796875,
725
- "logps/rejected": -416.1396484375,
726
- "loss": 0.4859,
727
- "rewards/accuracies": 0.737500011920929,
728
- "rewards/chosen": -1.070770502090454,
729
- "rewards/margins": 0.807015597820282,
730
- "rewards/rejected": -1.8777860403060913,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 4.269029751107489e-10,
736
- "logits/chosen": 0.4914283752441406,
737
- "logits/rejected": 1.8748416900634766,
738
- "logps/chosen": -398.6361083984375,
739
- "logps/rejected": -465.1482849121094,
740
- "loss": 0.4872,
741
- "rewards/accuracies": 0.800000011920929,
742
- "rewards/chosen": -1.004058599472046,
743
- "rewards/margins": 1.027204155921936,
744
- "rewards/rejected": -2.0312628746032715,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 478,
750
  "total_flos": 0.0,
751
- "train_loss": 0.5366686437918052,
752
- "train_runtime": 5196.7487,
753
- "train_samples_per_second": 11.764,
754
- "train_steps_per_second": 0.092
755
  }
756
  ],
757
  "logging_steps": 10,
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.0416666666666666e-08,
14
+ "logits/chosen": -2.6023898124694824,
15
+ "logits/rejected": -2.49088191986084,
16
+ "logps/chosen": -330.5306396484375,
17
+ "logps/rejected": -275.0410461425781,
18
  "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
 
25
  {
26
  "epoch": 0.02,
27
  "learning_rate": 1.0416666666666667e-07,
28
+ "logits/chosen": -2.624011516571045,
29
+ "logits/rejected": -2.59273624420166,
30
+ "logps/chosen": -247.91769409179688,
31
+ "logps/rejected": -215.07041931152344,
32
  "loss": 0.6932,
33
+ "rewards/accuracies": 0.3541666567325592,
34
+ "rewards/chosen": -0.00047609664034098387,
35
+ "rewards/margins": -0.0011458636727184057,
36
+ "rewards/rejected": 0.0006697670323774219,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.04,
41
  "learning_rate": 2.0833333333333333e-07,
42
+ "logits/chosen": -2.614908218383789,
43
+ "logits/rejected": -2.573396682739258,
44
+ "logps/chosen": -273.2959289550781,
45
+ "logps/rejected": -251.2639617919922,
46
+ "loss": 0.6925,
47
+ "rewards/accuracies": 0.643750011920929,
48
+ "rewards/chosen": 0.0009359431569464505,
49
+ "rewards/margins": 0.002007069531828165,
50
+ "rewards/rejected": -0.0010711264330893755,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.06,
55
  "learning_rate": 3.1249999999999997e-07,
56
+ "logits/chosen": -2.6856637001037598,
57
+ "logits/rejected": -2.6220130920410156,
58
+ "logps/chosen": -284.86114501953125,
59
+ "logps/rejected": -277.53057861328125,
60
+ "loss": 0.6886,
61
+ "rewards/accuracies": 0.625,
62
+ "rewards/chosen": 0.00352325732819736,
63
+ "rewards/margins": 0.007650823798030615,
64
+ "rewards/rejected": -0.0041275653056800365,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.08,
69
  "learning_rate": 4.1666666666666667e-07,
70
+ "logits/chosen": -2.579878807067871,
71
+ "logits/rejected": -2.5135815143585205,
72
+ "logps/chosen": -292.1109619140625,
73
+ "logps/rejected": -274.44683837890625,
74
  "loss": 0.6756,
75
+ "rewards/accuracies": 0.65625,
76
+ "rewards/chosen": 0.02378256432712078,
77
+ "rewards/margins": 0.03553395718336105,
78
+ "rewards/rejected": -0.011751385405659676,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.1,
83
  "learning_rate": 4.999733114418725e-07,
84
+ "logits/chosen": -2.5302300453186035,
85
+ "logits/rejected": -2.4865477085113525,
86
+ "logps/chosen": -315.3640441894531,
87
+ "logps/rejected": -310.5618591308594,
88
  "loss": 0.6601,
89
+ "rewards/accuracies": 0.731249988079071,
90
+ "rewards/chosen": -0.014850592240691185,
91
+ "rewards/margins": 0.06933780014514923,
92
+ "rewards/rejected": -0.08418838679790497,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.13,
97
  "learning_rate": 4.990398100856366e-07,
98
+ "logits/chosen": -2.461594820022583,
99
+ "logits/rejected": -2.393406867980957,
100
+ "logps/chosen": -264.4418640136719,
101
+ "logps/rejected": -252.02163696289062,
102
+ "loss": 0.6391,
103
  "rewards/accuracies": 0.668749988079071,
104
+ "rewards/chosen": -0.06258662045001984,
105
+ "rewards/margins": 0.1386002004146576,
106
+ "rewards/rejected": -0.20118682086467743,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.15,
111
  "learning_rate": 4.967775735898179e-07,
112
+ "logits/chosen": -2.5176403522491455,
113
+ "logits/rejected": -2.444599151611328,
114
+ "logps/chosen": -308.10845947265625,
115
+ "logps/rejected": -298.1520690917969,
116
+ "loss": 0.6219,
117
+ "rewards/accuracies": 0.65625,
118
+ "rewards/chosen": -0.26666340231895447,
119
+ "rewards/margins": 0.21313416957855225,
120
+ "rewards/rejected": -0.4797976016998291,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.17,
125
  "learning_rate": 4.931986719649298e-07,
126
+ "logits/chosen": -2.4516353607177734,
127
+ "logits/rejected": -2.4085216522216797,
128
+ "logps/chosen": -298.8356018066406,
129
+ "logps/rejected": -325.5304260253906,
130
+ "loss": 0.611,
131
+ "rewards/accuracies": 0.6875,
132
+ "rewards/chosen": -0.2911642789840698,
133
+ "rewards/margins": 0.20117318630218506,
134
+ "rewards/rejected": -0.49233752489089966,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.19,
139
  "learning_rate": 4.883222001996351e-07,
140
+ "logits/chosen": -2.426361560821533,
141
+ "logits/rejected": -2.3368563652038574,
142
+ "logps/chosen": -293.616943359375,
143
+ "logps/rejected": -308.7396545410156,
144
+ "loss": 0.5867,
145
+ "rewards/accuracies": 0.6187499761581421,
146
+ "rewards/chosen": -0.2579975724220276,
147
+ "rewards/margins": 0.30983540415763855,
148
+ "rewards/rejected": -0.5678330063819885,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.21,
153
  "learning_rate": 4.821741763807186e-07,
154
+ "logits/chosen": -2.488579034805298,
155
+ "logits/rejected": -2.3800113201141357,
156
+ "logps/chosen": -328.0105285644531,
157
+ "logps/rejected": -337.8644104003906,
158
+ "loss": 0.5723,
159
+ "rewards/accuracies": 0.65625,
160
+ "rewards/chosen": -0.4366111755371094,
161
+ "rewards/margins": 0.3044855296611786,
162
+ "rewards/rejected": -0.7410967350006104,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.21,
167
+ "eval_logits/chosen": -2.4070217609405518,
168
+ "eval_logits/rejected": -2.3494362831115723,
169
+ "eval_logps/chosen": -304.3812255859375,
170
+ "eval_logps/rejected": -350.8694763183594,
171
+ "eval_loss": 0.5851432681083679,
172
+ "eval_rewards/accuracies": 0.703125,
173
+ "eval_rewards/chosen": -0.4096587896347046,
174
+ "eval_rewards/margins": 0.46554654836654663,
175
+ "eval_rewards/rejected": -0.8752052783966064,
176
+ "eval_runtime": 91.1907,
177
+ "eval_samples_per_second": 21.932,
178
+ "eval_steps_per_second": 0.351,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.23,
183
  "learning_rate": 4.747874028753375e-07,
184
+ "logits/chosen": -2.0290980339050293,
185
+ "logits/rejected": -1.8976500034332275,
186
+ "logps/chosen": -374.5489807128906,
187
+ "logps/rejected": -375.1778869628906,
188
+ "loss": 0.5723,
189
+ "rewards/accuracies": 0.7437499761581421,
190
+ "rewards/chosen": -0.5513430833816528,
191
+ "rewards/margins": 0.49042654037475586,
192
+ "rewards/rejected": -1.0417697429656982,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.25,
197
  "learning_rate": 4.662012913161997e-07,
198
+ "logits/chosen": -0.8261772990226746,
199
+ "logits/rejected": -0.4543725550174713,
200
+ "logps/chosen": -370.54437255859375,
201
+ "logps/rejected": -376.8744201660156,
202
+ "loss": 0.546,
203
+ "rewards/accuracies": 0.706250011920929,
204
+ "rewards/chosen": -0.893993079662323,
205
+ "rewards/margins": 0.5693421363830566,
206
+ "rewards/rejected": -1.4633351564407349,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.27,
211
  "learning_rate": 4.5646165232345103e-07,
212
+ "logits/chosen": -0.5733903050422668,
213
+ "logits/rejected": -0.41144052147865295,
214
+ "logps/chosen": -331.88458251953125,
215
+ "logps/rejected": -418.39404296875,
216
+ "loss": 0.5492,
217
+ "rewards/accuracies": 0.7124999761581421,
218
+ "rewards/chosen": -0.6849642395973206,
219
+ "rewards/margins": 0.5858219265937805,
220
+ "rewards/rejected": -1.2707862854003906,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.29,
225
  "learning_rate": 4.456204510851956e-07,
226
+ "logits/chosen": -0.7106949687004089,
227
+ "logits/rejected": -0.2236645519733429,
228
+ "logps/chosen": -367.40484619140625,
229
+ "logps/rejected": -390.296142578125,
230
+ "loss": 0.5335,
231
+ "rewards/accuracies": 0.762499988079071,
232
+ "rewards/chosen": -0.7277344465255737,
233
+ "rewards/margins": 0.6220408082008362,
234
+ "rewards/rejected": -1.3497753143310547,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.31,
239
  "learning_rate": 4.337355301007335e-07,
240
+ "logits/chosen": -0.2654598355293274,
241
+ "logits/rejected": 0.43950486183166504,
242
+ "logps/chosen": -385.2984924316406,
243
+ "logps/rejected": -397.6144714355469,
244
+ "loss": 0.5356,
245
+ "rewards/accuracies": 0.699999988079071,
246
+ "rewards/chosen": -0.9714946746826172,
247
+ "rewards/margins": 0.61899733543396,
248
+ "rewards/rejected": -1.5904920101165771,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.33,
253
  "learning_rate": 4.2087030056579986e-07,
254
+ "logits/chosen": 0.1484789103269577,
255
+ "logits/rejected": 0.8263363838195801,
256
+ "logps/chosen": -369.7867736816406,
257
+ "logps/rejected": -436.39373779296875,
258
+ "loss": 0.5065,
259
+ "rewards/accuracies": 0.7749999761581421,
260
+ "rewards/chosen": -0.8267679214477539,
261
+ "rewards/margins": 0.8252193331718445,
262
+ "rewards/rejected": -1.6519873142242432,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.36,
267
  "learning_rate": 4.070934040463998e-07,
268
+ "logits/chosen": 0.2387746274471283,
269
+ "logits/rejected": 0.7541650533676147,
270
+ "logps/chosen": -330.07525634765625,
271
+ "logps/rejected": -366.41204833984375,
272
+ "loss": 0.5659,
273
+ "rewards/accuracies": 0.6812499761581421,
274
+ "rewards/chosen": -0.8212235569953918,
275
+ "rewards/margins": 0.529572606086731,
276
+ "rewards/rejected": -1.3507962226867676,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.38,
281
  "learning_rate": 3.9247834624635404e-07,
282
+ "logits/chosen": 0.45646604895591736,
283
+ "logits/rejected": 0.8084599375724792,
284
+ "logps/chosen": -366.8728942871094,
285
+ "logps/rejected": -432.2496032714844,
286
+ "loss": 0.5249,
287
+ "rewards/accuracies": 0.737500011920929,
288
+ "rewards/chosen": -0.6927820444107056,
289
+ "rewards/margins": 0.8015207052230835,
290
+ "rewards/rejected": -1.4943029880523682,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.4,
295
  "learning_rate": 3.7710310482256523e-07,
296
+ "logits/chosen": 1.0517617464065552,
297
+ "logits/rejected": 1.6709725856781006,
298
+ "logps/chosen": -378.12396240234375,
299
+ "logps/rejected": -458.1866149902344,
300
+ "loss": 0.5056,
301
+ "rewards/accuracies": 0.7875000238418579,
302
+ "rewards/chosen": -0.9326898455619812,
303
+ "rewards/margins": 0.9154269099235535,
304
+ "rewards/rejected": -1.8481168746948242,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.42,
309
  "learning_rate": 3.610497133404795e-07,
310
+ "logits/chosen": 0.9935806393623352,
311
+ "logits/rejected": 1.650398850440979,
312
+ "logps/chosen": -391.5450744628906,
313
+ "logps/rejected": -418.3558654785156,
314
+ "loss": 0.5084,
315
+ "rewards/accuracies": 0.643750011920929,
316
+ "rewards/chosen": -1.0861790180206299,
317
+ "rewards/margins": 0.634604275226593,
318
+ "rewards/rejected": -1.7207832336425781,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.42,
323
+ "eval_logits/chosen": 0.9247687458992004,
324
+ "eval_logits/rejected": 1.3918358087539673,
325
+ "eval_logps/chosen": -354.5789794921875,
326
+ "eval_logps/rejected": -438.0662536621094,
327
+ "eval_loss": 0.5251370072364807,
328
+ "eval_rewards/accuracies": 0.7421875,
329
+ "eval_rewards/chosen": -0.9116362929344177,
330
+ "eval_rewards/margins": 0.8355368375778198,
331
+ "eval_rewards/rejected": -1.7471731901168823,
332
+ "eval_runtime": 91.7577,
333
+ "eval_samples_per_second": 21.797,
334
+ "eval_steps_per_second": 0.349,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.44,
339
  "learning_rate": 3.4440382358952115e-07,
340
+ "logits/chosen": 1.0475047826766968,
341
+ "logits/rejected": 1.849473237991333,
342
+ "logps/chosen": -367.184814453125,
343
+ "logps/rejected": -398.2117614746094,
344
+ "loss": 0.5251,
345
  "rewards/accuracies": 0.6937500238418579,
346
+ "rewards/chosen": -0.8909347653388977,
347
+ "rewards/margins": 0.6959229707717896,
348
+ "rewards/rejected": -1.586857557296753,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.46,
353
  "learning_rate": 3.272542485937368e-07,
354
+ "logits/chosen": 1.6884968280792236,
355
+ "logits/rejected": 2.2008445262908936,
356
+ "logps/chosen": -353.2514343261719,
357
+ "logps/rejected": -404.71221923828125,
358
  "loss": 0.5269,
359
+ "rewards/accuracies": 0.75,
360
+ "rewards/chosen": -0.7567670345306396,
361
+ "rewards/margins": 0.8415945768356323,
362
+ "rewards/rejected": -1.5983617305755615,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.48,
367
  "learning_rate": 3.096924887558854e-07,
368
+ "logits/chosen": 1.460933804512024,
369
+ "logits/rejected": 1.9314343929290771,
370
+ "logps/chosen": -351.2489318847656,
371
+ "logps/rejected": -453.9790954589844,
372
+ "loss": 0.519,
373
+ "rewards/accuracies": 0.768750011920929,
374
+ "rewards/chosen": -0.7599745988845825,
375
+ "rewards/margins": 0.8532025218009949,
376
+ "rewards/rejected": -1.6131770610809326,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.5,
381
  "learning_rate": 2.9181224366319943e-07,
382
+ "logits/chosen": 1.796936273574829,
383
+ "logits/rejected": 2.389878988265991,
384
+ "logps/chosen": -351.67498779296875,
385
+ "logps/rejected": -421.3821716308594,
386
+ "loss": 0.5261,
387
+ "rewards/accuracies": 0.7437499761581421,
388
+ "rewards/chosen": -0.9927783012390137,
389
+ "rewards/margins": 0.786289632320404,
390
+ "rewards/rejected": -1.7790677547454834,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.52,
395
  "learning_rate": 2.7370891215954565e-07,
396
+ "logits/chosen": 1.5744327306747437,
397
+ "logits/rejected": 2.3407230377197266,
398
+ "logps/chosen": -358.4691467285156,
399
+ "logps/rejected": -418.01031494140625,
400
+ "loss": 0.5134,
401
+ "rewards/accuracies": 0.7124999761581421,
402
+ "rewards/chosen": -0.903947651386261,
403
+ "rewards/margins": 0.6940609216690063,
404
+ "rewards/rejected": -1.5980085134506226,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.54,
409
  "learning_rate": 2.55479083351317e-07,
410
+ "logits/chosen": 1.873732566833496,
411
+ "logits/rejected": 2.9474740028381348,
412
+ "logps/chosen": -371.85552978515625,
413
+ "logps/rejected": -420.95904541015625,
414
+ "loss": 0.4922,
415
+ "rewards/accuracies": 0.8062499761581421,
416
+ "rewards/chosen": -0.9079627990722656,
417
+ "rewards/margins": 0.8738547563552856,
418
+ "rewards/rejected": -1.7818174362182617,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.56,
423
  "learning_rate": 2.3722002126275822e-07,
424
+ "logits/chosen": 2.415181875228882,
425
+ "logits/rejected": 3.162013530731201,
426
+ "logps/chosen": -388.0815734863281,
427
+ "logps/rejected": -478.11785888671875,
428
+ "loss": 0.498,
429
+ "rewards/accuracies": 0.762499988079071,
430
+ "rewards/chosen": -1.128756046295166,
431
+ "rewards/margins": 1.0180633068084717,
432
+ "rewards/rejected": -2.146819591522217,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.59,
437
  "learning_rate": 2.19029145890313e-07,
438
+ "logits/chosen": 1.9844467639923096,
439
+ "logits/rejected": 2.9561781883239746,
440
+ "logps/chosen": -369.2903747558594,
441
+ "logps/rejected": -419.6259765625,
442
+ "loss": 0.5207,
443
+ "rewards/accuracies": 0.7437499761581421,
444
+ "rewards/chosen": -0.9253425598144531,
445
+ "rewards/margins": 0.8587535619735718,
446
+ "rewards/rejected": -1.784096121788025,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.61,
451
  "learning_rate": 2.0100351342479216e-07,
452
+ "logits/chosen": 1.8705106973648071,
453
+ "logits/rejected": 2.6589739322662354,
454
+ "logps/chosen": -380.0862731933594,
455
+ "logps/rejected": -439.79168701171875,
456
+ "loss": 0.515,
457
+ "rewards/accuracies": 0.71875,
458
+ "rewards/chosen": -0.9231119155883789,
459
+ "rewards/margins": 0.735679030418396,
460
+ "rewards/rejected": -1.6587913036346436,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.63,
465
  "learning_rate": 1.8323929841460178e-07,
466
+ "logits/chosen": 1.3944432735443115,
467
+ "logits/rejected": 2.3618969917297363,
468
+ "logps/chosen": -389.6896057128906,
469
+ "logps/rejected": -470.2090759277344,
470
+ "loss": 0.5059,
471
+ "rewards/accuracies": 0.71875,
472
+ "rewards/chosen": -0.8919968605041504,
473
+ "rewards/margins": 0.6746976971626282,
474
+ "rewards/rejected": -1.5666944980621338,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.63,
479
+ "eval_logits/chosen": 1.2558308839797974,
480
+ "eval_logits/rejected": 2.033073902130127,
481
+ "eval_logps/chosen": -349.8758239746094,
482
+ "eval_logps/rejected": -438.77349853515625,
483
+ "eval_loss": 0.5130496621131897,
484
+ "eval_rewards/accuracies": 0.75,
485
+ "eval_rewards/chosen": -0.8646047711372375,
486
+ "eval_rewards/margins": 0.8896409273147583,
487
+ "eval_rewards/rejected": -1.7542455196380615,
488
+ "eval_runtime": 92.0798,
489
+ "eval_samples_per_second": 21.72,
490
+ "eval_steps_per_second": 0.348,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.65,
495
  "learning_rate": 1.6583128063291573e-07,
496
+ "logits/chosen": 1.2974698543548584,
497
+ "logits/rejected": 2.6388087272644043,
498
+ "logps/chosen": -382.4002990722656,
499
+ "logps/rejected": -406.01153564453125,
500
+ "loss": 0.4978,
501
  "rewards/accuracies": 0.75,
502
+ "rewards/chosen": -1.0794718265533447,
503
+ "rewards/margins": 0.7805131673812866,
504
+ "rewards/rejected": -1.8599849939346313,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.67,
509
  "learning_rate": 1.488723393865766e-07,
510
+ "logits/chosen": 1.9306262731552124,
511
+ "logits/rejected": 2.9958901405334473,
512
+ "logps/chosen": -357.4389953613281,
513
+ "logps/rejected": -452.7220764160156,
514
+ "loss": 0.5064,
515
  "rewards/accuracies": 0.7749999761581421,
516
+ "rewards/chosen": -1.0856704711914062,
517
+ "rewards/margins": 1.057279109954834,
518
+ "rewards/rejected": -2.1429495811462402,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.69,
523
  "learning_rate": 1.3245295796480788e-07,
524
+ "logits/chosen": 1.4244121313095093,
525
+ "logits/rejected": 2.2654335498809814,
526
+ "logps/chosen": -404.91082763671875,
527
+ "logps/rejected": -450.8277893066406,
528
+ "loss": 0.5096,
529
+ "rewards/accuracies": 0.75,
530
+ "rewards/chosen": -1.1859899759292603,
531
+ "rewards/margins": 0.7777279019355774,
532
+ "rewards/rejected": -1.9637176990509033,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.71,
537
  "learning_rate": 1.1666074087171627e-07,
538
+ "logits/chosen": 1.5507278442382812,
539
+ "logits/rejected": 2.3268961906433105,
540
+ "logps/chosen": -363.16473388671875,
541
+ "logps/rejected": -420.6800231933594,
542
+ "loss": 0.5173,
543
  "rewards/accuracies": 0.7250000238418579,
544
+ "rewards/chosen": -1.0014616250991821,
545
+ "rewards/margins": 0.7089160680770874,
546
+ "rewards/rejected": -1.7103776931762695,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.73,
551
  "learning_rate": 1.0157994641835734e-07,
552
+ "logits/chosen": 1.5202906131744385,
553
+ "logits/rejected": 2.6713767051696777,
554
+ "logps/chosen": -359.4294128417969,
555
+ "logps/rejected": -433.394287109375,
556
+ "loss": 0.4787,
557
+ "rewards/accuracies": 0.8062499761581421,
558
+ "rewards/chosen": -1.0805784463882446,
559
+ "rewards/margins": 0.9193571209907532,
560
+ "rewards/rejected": -1.999935507774353,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.75,
565
  "learning_rate": 8.729103716819111e-08,
566
+ "logits/chosen": 1.5974103212356567,
567
+ "logits/rejected": 3.016284942626953,
568
+ "logps/chosen": -435.1712951660156,
569
+ "logps/rejected": -469.9830017089844,
570
+ "loss": 0.4902,
571
+ "rewards/accuracies": 0.75,
572
+ "rewards/chosen": -1.1632494926452637,
573
+ "rewards/margins": 0.9136824607849121,
574
+ "rewards/rejected": -2.0769317150115967,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.77,
579
  "learning_rate": 7.387025063449081e-08,
580
+ "logits/chosen": 1.7092777490615845,
581
+ "logits/rejected": 2.965677261352539,
582
+ "logps/chosen": -423.5621643066406,
583
+ "logps/rejected": -466.57196044921875,
584
+ "loss": 0.5002,
585
+ "rewards/accuracies": 0.8187500238418579,
586
+ "rewards/chosen": -1.1220273971557617,
587
+ "rewards/margins": 0.9678171277046204,
588
+ "rewards/rejected": -2.0898444652557373,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.79,
593
  "learning_rate": 6.138919252022435e-08,
594
+ "logits/chosen": 1.7860336303710938,
595
+ "logits/rejected": 2.569241523742676,
596
+ "logps/chosen": -395.4902648925781,
597
+ "logps/rejected": -483.0901794433594,
598
+ "loss": 0.4772,
599
+ "rewards/accuracies": 0.7749999761581421,
600
+ "rewards/chosen": -1.1515331268310547,
601
+ "rewards/margins": 0.8898499608039856,
602
+ "rewards/rejected": -2.0413832664489746,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.82,
607
  "learning_rate": 4.991445467064689e-08,
608
+ "logits/chosen": 2.0826852321624756,
609
+ "logits/rejected": 2.8060660362243652,
610
+ "logps/chosen": -398.78375244140625,
611
+ "logps/rejected": -471.2264099121094,
612
+ "loss": 0.5066,
613
+ "rewards/accuracies": 0.675000011920929,
614
+ "rewards/chosen": -1.2567694187164307,
615
+ "rewards/margins": 0.7216086983680725,
616
+ "rewards/rejected": -1.9783780574798584,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.84,
621
  "learning_rate": 3.9507259776993954e-08,
622
+ "logits/chosen": 1.802354097366333,
623
+ "logits/rejected": 2.5923492908477783,
624
+ "logps/chosen": -446.500244140625,
625
+ "logps/rejected": -510.20269775390625,
626
+ "loss": 0.4853,
627
+ "rewards/accuracies": 0.7562500238418579,
628
+ "rewards/chosen": -1.2238214015960693,
629
+ "rewards/margins": 0.9289990663528442,
630
+ "rewards/rejected": -2.152820587158203,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.84,
635
+ "eval_logits/chosen": 1.8194458484649658,
636
+ "eval_logits/rejected": 2.592175245285034,
637
+ "eval_logps/chosen": -372.7066650390625,
638
+ "eval_logps/rejected": -474.1963195800781,
639
+ "eval_loss": 0.5050143003463745,
640
+ "eval_rewards/accuracies": 0.75390625,
641
+ "eval_rewards/chosen": -1.0929131507873535,
642
+ "eval_rewards/margins": 1.0155609846115112,
643
+ "eval_rewards/rejected": -2.108474016189575,
644
+ "eval_runtime": 90.5801,
645
+ "eval_samples_per_second": 22.08,
646
+ "eval_steps_per_second": 0.353,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.86,
651
  "learning_rate": 3.022313472693447e-08,
652
+ "logits/chosen": 2.2372403144836426,
653
+ "logits/rejected": 3.196664333343506,
654
+ "logps/chosen": -370.81719970703125,
655
+ "logps/rejected": -452.06549072265625,
656
+ "loss": 0.5086,
657
+ "rewards/accuracies": 0.793749988079071,
658
+ "rewards/chosen": -1.1016533374786377,
659
+ "rewards/margins": 0.9261430501937866,
660
+ "rewards/rejected": -2.0277962684631348,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.88,
665
  "learning_rate": 2.2111614344599684e-08,
666
+ "logits/chosen": 1.831080675125122,
667
+ "logits/rejected": 2.4410791397094727,
668
+ "logps/chosen": -385.7922058105469,
669
+ "logps/rejected": -492.590576171875,
670
+ "loss": 0.5061,
671
+ "rewards/accuracies": 0.762499988079071,
672
+ "rewards/chosen": -1.084149956703186,
673
+ "rewards/margins": 0.9615718722343445,
674
+ "rewards/rejected": -2.0457215309143066,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.9,
679
  "learning_rate": 1.521597710086439e-08,
680
+ "logits/chosen": 1.4260971546173096,
681
+ "logits/rejected": 2.3162856101989746,
682
+ "logps/chosen": -407.1165466308594,
683
+ "logps/rejected": -454.90374755859375,
684
+ "loss": 0.5059,
685
+ "rewards/accuracies": 0.75,
686
+ "rewards/chosen": -1.0966671705245972,
687
+ "rewards/margins": 0.9018322229385376,
688
+ "rewards/rejected": -1.9984995126724243,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.92,
693
  "learning_rate": 9.57301420397924e-09,
694
+ "logits/chosen": 1.783463716506958,
695
+ "logits/rejected": 2.5885117053985596,
696
+ "logps/chosen": -373.5993347167969,
697
+ "logps/rejected": -458.12091064453125,
698
+ "loss": 0.487,
699
+ "rewards/accuracies": 0.7562500238418579,
700
+ "rewards/chosen": -1.1429402828216553,
701
+ "rewards/margins": 0.8700854182243347,
702
+ "rewards/rejected": -2.0130257606506348,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.94,
707
  "learning_rate": 5.212833302556258e-09,
708
+ "logits/chosen": 1.8070141077041626,
709
+ "logits/rejected": 2.747885227203369,
710
+ "logps/chosen": -355.58221435546875,
711
+ "logps/rejected": -426.42584228515625,
712
+ "loss": 0.5082,
713
+ "rewards/accuracies": 0.731249988079071,
714
+ "rewards/chosen": -1.0844265222549438,
715
+ "rewards/margins": 0.8474240303039551,
716
+ "rewards/rejected": -1.9318506717681885,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.96,
721
  "learning_rate": 2.158697848236607e-09,
722
+ "logits/chosen": 2.0278899669647217,
723
+ "logits/rejected": 3.022653818130493,
724
+ "logps/chosen": -362.0993347167969,
725
+ "logps/rejected": -428.6521911621094,
726
+ "loss": 0.4861,
727
+ "rewards/accuracies": 0.75,
728
+ "rewards/chosen": -1.1751190423965454,
729
+ "rewards/margins": 0.813240647315979,
730
+ "rewards/rejected": -1.9883596897125244,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.98,
735
  "learning_rate": 4.269029751107489e-10,
736
+ "logits/chosen": 1.3355131149291992,
737
+ "logits/rejected": 2.729475736618042,
738
+ "logps/chosen": -406.28033447265625,
739
+ "logps/rejected": -480.8604431152344,
740
+ "loss": 0.4807,
741
+ "rewards/accuracies": 0.8125,
742
+ "rewards/chosen": -1.0706168413162231,
743
+ "rewards/margins": 1.0933626890182495,
744
+ "rewards/rejected": -2.1639795303344727,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 1.0,
749
  "step": 478,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.5379065808890754,
752
+ "train_runtime": 5396.8094,
753
+ "train_samples_per_second": 11.328,
754
+ "train_steps_per_second": 0.089
755
  }
756
  ],
757
  "logging_steps": 10,