tanliboy committed on
Commit 90d8fed
1 Parent(s): a22b788

Model save

README.md CHANGED
@@ -3,15 +3,9 @@ library_name: transformers
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - dpo
9
  - generated_from_trainer
10
- - trl
11
- - dpo
12
- - generated_from_trainer
13
- datasets:
14
- - tanliboy/orca_dpo_pairs
15
  model-index:
16
  - name: lambda-llama-3-8b-dpo-test-orca
17
  results: []
@@ -22,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # lambda-llama-3-8b-dpo-test-orca
24
 
25
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the tanliboy/orca_dpo_pairs dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.1235
28
- - Rewards/chosen: -2.8028
29
- - Rewards/rejected: -6.6852
30
- - Rewards/accuracies: 0.9643
31
- - Rewards/margins: 3.8824
32
- - Logps/rejected: -970.0546
33
- - Logps/chosen: -562.0943
34
- - Logits/rejected: -1.9611
35
- - Logits/chosen: -2.4346
36
 
37
  ## Model description
38
 
@@ -67,6 +61,13 @@ The following hyperparameters were used during training:
67
 
68
  ### Training results
69
 
70
 
71
 
72
  ### Framework versions
 
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
6
  - trl
7
  - dpo
8
  - generated_from_trainer
9
  model-index:
10
  - name: lambda-llama-3-8b-dpo-test-orca
11
  results: []
 
16
 
17
  # lambda-llama-3-8b-dpo-test-orca
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4798
22
+ - Rewards/chosen: -1.7085
23
+ - Rewards/rejected: -2.8440
24
+ - Rewards/accuracies: 0.7259
25
+ - Rewards/margins: 1.1355
26
+ - Logps/rejected: -648.5815
27
+ - Logps/chosen: -551.6072
28
+ - Logits/rejected: -2.6442
29
+ - Logits/chosen: -2.5812
30
 
31
  ## Model description
32
 
 
61
 
62
  ### Training results
63
 
64
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 0.6011 | 0.1744 | 100 | 0.5738 | -0.8770 | -1.2808 | 0.6988 | 0.4038 | -492.2603 | -468.4565 | -2.4544 | -2.4042 |
67
+ | 0.5447 | 0.3489 | 200 | 0.5242 | -1.3236 | -2.0879 | 0.7289 | 0.7644 | -572.9752 | -513.1177 | -2.6319 | -2.5732 |
68
+ | 0.5173 | 0.5233 | 300 | 0.5003 | -1.6828 | -2.6810 | 0.7259 | 0.9982 | -632.2809 | -549.0404 | -2.6140 | -2.5556 |
69
+ | 0.5144 | 0.6978 | 400 | 0.4851 | -1.7107 | -2.8135 | 0.7319 | 1.1028 | -645.5279 | -551.8306 | -2.7027 | -2.6365 |
70
+ | 0.5162 | 0.8722 | 500 | 0.4798 | -1.7085 | -2.8440 | 0.7259 | 1.1355 | -648.5815 | -551.6072 | -2.6442 | -2.5812 |
71
 
72
 
73
  ### Framework versions
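
For context on the metric columns added above (Rewards/chosen, Rewards/rejected, Rewards/margins, Rewards/accuracies): these are the implicit DPO rewards that TRL's `DPOTrainer` logs, i.e. beta-scaled log-probability ratios between the policy and the frozen reference model. The sketch below shows only the standard sigmoid-DPO formulation; it is not the exact TRL implementation, and `beta = 0.1` is an illustrative default rather than a value taken from this commit.

```python
import torch
import torch.nn.functional as F

def dpo_rewards_and_loss(policy_chosen_logps: torch.Tensor,
                         policy_rejected_logps: torch.Tensor,
                         ref_chosen_logps: torch.Tensor,
                         ref_rejected_logps: torch.Tensor,
                         beta: float = 0.1):
    """Sigmoid DPO loss plus the implicit reward statistics reported above."""
    # Implicit rewards: beta-scaled log-prob ratio of policy vs. reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)        # "Rewards/chosen"
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)  # "Rewards/rejected"
    margins = chosen_rewards - rejected_rewards                             # "Rewards/margins"
    loss = -F.logsigmoid(margins).mean()                                    # reported Loss
    accuracy = (margins > 0).float().mean()                                 # "Rewards/accuracies"
    return loss, chosen_rewards.mean(), rejected_rewards.mean(), margins.mean(), accuracy
```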
all_results.json CHANGED
@@ -1,22 +1,9 @@
1
  {
2
- "epoch": 0.9947643979057592,
3
- "eval_logits/chosen": -2.4346187114715576,
4
- "eval_logits/rejected": -1.9611175060272217,
5
- "eval_logps/chosen": -562.0942993164062,
6
- "eval_logps/rejected": -970.0545654296875,
7
- "eval_loss": 0.12353485077619553,
8
- "eval_rewards/accuracies": 0.9642857313156128,
9
- "eval_rewards/chosen": -2.802823305130005,
10
- "eval_rewards/margins": 3.8823659420013428,
11
- "eval_rewards/rejected": -6.685189247131348,
12
- "eval_runtime": 28.3026,
13
- "eval_samples": 643,
14
- "eval_samples_per_second": 22.719,
15
- "eval_steps_per_second": 0.742,
16
  "total_flos": 0.0,
17
- "train_loss": 0.38092811358602424,
18
- "train_runtime": 1269.7396,
19
- "train_samples": 12216,
20
- "train_samples_per_second": 9.621,
21
- "train_steps_per_second": 0.075
22
  }
 
1
  {
2
+ "epoch": 0.9995638901003053,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5544013454860002,
5
+ "train_runtime": 8395.7281,
6
+ "train_samples": 73350,
7
+ "train_samples_per_second": 8.737,
8
+ "train_steps_per_second": 0.068
9
  }
config.json CHANGED
@@ -34,6 +34,6 @@
34
  "tie_word_embeddings": false,
35
  "torch_dtype": "bfloat16",
36
  "transformers_version": "4.44.2",
37
- "use_cache": true,
38
  "vocab_size": 128256
39
  }
 
34
  "tie_word_embeddings": false,
35
  "torch_dtype": "bfloat16",
36
  "transformers_version": "4.44.2",
37
+ "use_cache": false,
38
  "vocab_size": 128256
39
  }
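
A note on the `use_cache: true → false` change above: the KV cache is commonly disabled during training (e.g. because it is incompatible with gradient checkpointing) and can simply be re-enabled for inference. A minimal sketch under that assumption; the repository id below is inferred from the model name in this card and may differ:

```python
from transformers import AutoModelForCausalLM

# Hypothetical repo id, inferred from the model name in this commit.
model = AutoModelForCausalLM.from_pretrained("tanliboy/lambda-llama-3-8b-dpo-test-orca")
model.config.use_cache = True  # restore KV caching for faster generation
```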
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcdc7e21bc0f1f8df53a8a8da73557968cf0fb2c6a7660ee5cbd351e9bde31fd
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cce40c6f951a85596a41140c106a04b5ba4cb8af6849f8c482601d170fd181da
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3302769ff8d4bd9ced9e5328eb79bea2294a35cc9ead16d4e065024c981dc310
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fddeae186d1262531acdf2336cc970cb7141706bccb8e760b66b57fae2d26de2
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7528b1f5a5e3f953a12f7a8bb0ba400007c20be0649e670836f11050637624fa
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2184c75a3771112041911faa922b6743368141106130052ae7cd136b2aee9aea
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dac63a749558e3413c1ffade9de85e166b9698562195b56ea62c5efced0546bb
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba04b276331c76dd9e1ddace66c519b440f6615a3701cbd31e93be43bdd61c37
3
  size 1168138808
runs/Sep21_04-47-22_action-graph-trainer/events.out.tfevents.1726894898.action-graph-trainer.3148605.0 ADDED
@@ -0,0 +1,3 @@
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ee78f712207d8c958c1dd542736962f6095e8437503b729a61de5f5b01d0e0
3
+ size 50265
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9947643979057592,
3
  "total_flos": 0.0,
4
- "train_loss": 0.38092811358602424,
5
- "train_runtime": 1269.7396,
6
- "train_samples": 12216,
7
- "train_samples_per_second": 9.621,
8
- "train_steps_per_second": 0.075
9
  }
 
1
  {
2
+ "epoch": 0.9995638901003053,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5544013454860002,
5
+ "train_runtime": 8395.7281,
6
+ "train_samples": 73350,
7
+ "train_samples_per_second": 8.737,
8
+ "train_steps_per_second": 0.068
9
  }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9947643979057592,
5
  "eval_steps": 100,
6
- "global_step": 95,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.010471204188481676,
13
- "grad_norm": 14.42849432669548,
14
- "learning_rate": 2e-08,
15
- "logits/chosen": -2.705627918243408,
16
- "logits/rejected": -1.8209420442581177,
17
- "logps/chosen": -315.2232666015625,
18
- "logps/rejected": -333.2189025878906,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,152 +24,952 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.10471204188481675,
28
- "grad_norm": 13.44691822523609,
29
- "learning_rate": 2e-07,
30
- "logits/chosen": -2.7048144340515137,
31
- "logits/rejected": -2.1536295413970947,
32
- "logps/chosen": -277.1604309082031,
33
- "logps/rejected": -290.7293701171875,
34
- "loss": 0.6923,
35
- "rewards/accuracies": 0.5347222089767456,
36
- "rewards/chosen": 0.00019832928956020623,
37
- "rewards/margins": 0.0016432523261755705,
38
- "rewards/rejected": -0.0014449231093749404,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.2094240837696335,
43
- "grad_norm": 16.21569144475015,
44
- "learning_rate": 1.9324722294043556e-07,
45
- "logits/chosen": -2.492572546005249,
46
- "logits/rejected": -2.0814006328582764,
47
- "logps/chosen": -309.6625061035156,
48
- "logps/rejected": -296.83868408203125,
49
- "loss": 0.6658,
50
- "rewards/accuracies": 0.949999988079071,
51
- "rewards/chosen": -0.003994358237832785,
52
- "rewards/margins": 0.057490717619657516,
53
- "rewards/rejected": -0.061485081911087036,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.31413612565445026,
58
- "grad_norm": 14.182589244162711,
59
- "learning_rate": 1.739008917220659e-07,
60
- "logits/chosen": -2.387019395828247,
61
- "logits/rejected": -1.9367955923080444,
62
- "logps/chosen": -299.33404541015625,
63
- "logps/rejected": -322.9083251953125,
64
- "loss": 0.5937,
65
- "rewards/accuracies": 0.9937499761581421,
66
- "rewards/chosen": -0.022330567240715027,
67
- "rewards/margins": 0.21069249510765076,
68
- "rewards/rejected": -0.23302307724952698,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.418848167539267,
73
- "grad_norm": 16.048018340744715,
74
- "learning_rate": 1.4457383557765383e-07,
75
- "logits/chosen": -2.4496796131134033,
76
- "logits/rejected": -2.079987049102783,
77
- "logps/chosen": -294.8586730957031,
78
- "logps/rejected": -345.1488037109375,
79
- "loss": 0.5035,
80
- "rewards/accuracies": 0.9750000238418579,
81
- "rewards/chosen": -0.1000712662935257,
82
- "rewards/margins": 0.45323365926742554,
83
- "rewards/rejected": -0.55330491065979,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.5235602094240838,
88
- "grad_norm": 22.958491853819712,
89
- "learning_rate": 1.092268359463302e-07,
90
- "logits/chosen": -2.2879467010498047,
91
- "logits/rejected": -1.8292083740234375,
92
- "logps/chosen": -306.67706298828125,
93
- "logps/rejected": -418.6455993652344,
94
- "loss": 0.4123,
95
- "rewards/accuracies": 0.987500011920929,
96
- "rewards/chosen": -0.4231874942779541,
97
- "rewards/margins": 0.8138816952705383,
98
- "rewards/rejected": -1.2370691299438477,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.6282722513089005,
103
- "grad_norm": 19.370046611297532,
104
- "learning_rate": 7.263370099279171e-08,
105
- "logits/chosen": -2.491612672805786,
106
- "logits/rejected": -2.067852735519409,
107
- "logps/chosen": -442.7276916503906,
108
- "logps/rejected": -641.3619384765625,
109
- "loss": 0.2439,
110
- "rewards/accuracies": 0.9312499761581421,
111
- "rewards/chosen": -1.5304675102233887,
112
- "rewards/margins": 1.8317034244537354,
113
- "rewards/rejected": -3.362171173095703,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.7329842931937173,
118
- "grad_norm": 17.60064306931795,
119
- "learning_rate": 3.973653636207437e-08,
120
- "logits/chosen": -2.3535571098327637,
121
- "logits/rejected": -1.9946672916412354,
122
- "logps/chosen": -591.0353393554688,
123
- "logps/rejected": -910.4153442382812,
124
- "loss": 0.1698,
125
- "rewards/accuracies": 0.9624999761581421,
126
- "rewards/chosen": -2.834195613861084,
127
- "rewards/margins": 3.2335205078125,
128
- "rewards/rejected": -6.067716598510742,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.837696335078534,
133
- "grad_norm": 20.39945478972367,
134
- "learning_rate": 1.49782864270386e-08,
135
- "logits/chosen": -2.500075101852417,
136
- "logits/rejected": -2.16581654548645,
137
- "logps/chosen": -597.4744262695312,
138
- "logps/rejected": -964.6823120117188,
139
- "loss": 0.1431,
140
- "rewards/accuracies": 0.956250011920929,
141
- "rewards/chosen": -3.152522563934326,
142
- "rewards/margins": 3.5699126720428467,
143
- "rewards/rejected": -6.722434997558594,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.9424083769633508,
148
- "grad_norm": 27.851116517075948,
149
- "learning_rate": 1.7026900316098214e-09,
150
- "logits/chosen": -2.4192073345184326,
151
- "logits/rejected": -2.111260175704956,
152
- "logps/chosen": -618.6544799804688,
153
- "logps/rejected": -911.25146484375,
154
- "loss": 0.137,
155
- "rewards/accuracies": 0.949999988079071,
156
- "rewards/chosen": -2.717777967453003,
157
- "rewards/margins": 3.354006290435791,
158
- "rewards/rejected": -6.071784019470215,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.9947643979057592,
163
- "step": 95,
164
  "total_flos": 0.0,
165
- "train_loss": 0.38092811358602424,
166
- "train_runtime": 1269.7396,
167
- "train_samples_per_second": 9.621,
168
- "train_steps_per_second": 0.075
169
  }
170
  ],
171
  "logging_steps": 10,
172
- "max_steps": 95,
173
  "num_input_tokens_seen": 0,
174
  "num_train_epochs": 1,
175
  "save_steps": 500,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9995638901003053,
5
  "eval_steps": 100,
6
+ "global_step": 573,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0017444395987788923,
13
+ "grad_norm": 5.784088092395373,
14
+ "learning_rate": 3.4482758620689654e-09,
15
+ "logits/chosen": -2.946424722671509,
16
+ "logits/rejected": -2.985557794570923,
17
+ "logps/chosen": -261.0296630859375,
18
+ "logps/rejected": -338.7343444824219,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.01744439598778892,
28
+ "grad_norm": 6.628089918841342,
29
+ "learning_rate": 3.448275862068966e-08,
30
+ "logits/chosen": -2.282945156097412,
31
+ "logits/rejected": -2.351466178894043,
32
+ "logps/chosen": -388.3266906738281,
33
+ "logps/rejected": -407.86871337890625,
34
+ "loss": 0.6931,
35
+ "rewards/accuracies": 0.4097222089767456,
36
+ "rewards/chosen": 0.0005646743229590356,
37
+ "rewards/margins": -0.0001476690813433379,
38
+ "rewards/rejected": 0.0007123433169908822,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.03488879197557784,
43
+ "grad_norm": 5.358446945552647,
44
+ "learning_rate": 6.896551724137931e-08,
45
+ "logits/chosen": -2.507432222366333,
46
+ "logits/rejected": -2.6201975345611572,
47
+ "logps/chosen": -354.7762145996094,
48
+ "logps/rejected": -351.57476806640625,
49
+ "loss": 0.693,
50
+ "rewards/accuracies": 0.5375000238418579,
51
+ "rewards/chosen": 0.00036403987905941904,
52
+ "rewards/margins": 0.00059556431369856,
53
+ "rewards/rejected": -0.00023152439098339528,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.05233318796336677,
58
+ "grad_norm": 6.200484723185969,
59
+ "learning_rate": 1.0344827586206897e-07,
60
+ "logits/chosen": -2.2877583503723145,
61
+ "logits/rejected": -2.38248348236084,
62
+ "logps/chosen": -374.0684509277344,
63
+ "logps/rejected": -363.9664001464844,
64
+ "loss": 0.6921,
65
+ "rewards/accuracies": 0.6187499761581421,
66
+ "rewards/chosen": -0.003327027428895235,
67
+ "rewards/margins": 0.0030093365348875523,
68
+ "rewards/rejected": -0.0063363634981215,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.06977758395115569,
73
+ "grad_norm": 6.01035041441692,
74
+ "learning_rate": 1.3793103448275863e-07,
75
+ "logits/chosen": -2.264878988265991,
76
+ "logits/rejected": -2.4066078662872314,
77
+ "logps/chosen": -391.4086608886719,
78
+ "logps/rejected": -380.48248291015625,
79
+ "loss": 0.6893,
80
+ "rewards/accuracies": 0.668749988079071,
81
+ "rewards/chosen": -0.01258127111941576,
82
+ "rewards/margins": 0.0077340505085885525,
83
+ "rewards/rejected": -0.020315321162343025,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.08722197993894461,
88
+ "grad_norm": 5.071749818122234,
89
+ "learning_rate": 1.7241379310344825e-07,
90
+ "logits/chosen": -2.1940102577209473,
91
+ "logits/rejected": -2.205831289291382,
92
+ "logps/chosen": -352.8985290527344,
93
+ "logps/rejected": -375.4139099121094,
94
+ "loss": 0.6837,
95
+ "rewards/accuracies": 0.6499999761581421,
96
+ "rewards/chosen": -0.03738312050700188,
97
+ "rewards/margins": 0.01618872582912445,
98
+ "rewards/rejected": -0.053571850061416626,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.10466637592673354,
103
+ "grad_norm": 5.2912013186214075,
104
+ "learning_rate": 1.9999255765332945e-07,
105
+ "logits/chosen": -2.174111843109131,
106
+ "logits/rejected": -2.182332992553711,
107
+ "logps/chosen": -343.0633850097656,
108
+ "logps/rejected": -346.3739318847656,
109
+ "loss": 0.67,
110
+ "rewards/accuracies": 0.675000011920929,
111
+ "rewards/chosen": -0.07623813301324844,
112
+ "rewards/margins": 0.036114297807216644,
113
+ "rewards/rejected": -0.11235243082046509,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.12211077191452246,
118
+ "grad_norm": 6.100737724237624,
119
+ "learning_rate": 1.9973219181729437e-07,
120
+ "logits/chosen": -2.14943790435791,
121
+ "logits/rejected": -2.206282138824463,
122
+ "logps/chosen": -377.2199401855469,
123
+ "logps/rejected": -405.7254943847656,
124
+ "loss": 0.6554,
125
+ "rewards/accuracies": 0.6312500238418579,
126
+ "rewards/chosen": -0.15880194306373596,
127
+ "rewards/margins": 0.06730278581380844,
128
+ "rewards/rejected": -0.2261047065258026,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.13955516790231137,
133
+ "grad_norm": 6.8468151533929635,
134
+ "learning_rate": 1.9910081567726745e-07,
135
+ "logits/chosen": -2.1643874645233154,
136
+ "logits/rejected": -2.2666897773742676,
137
+ "logps/chosen": -394.8481140136719,
138
+ "logps/rejected": -411.7466735839844,
139
+ "loss": 0.6296,
140
+ "rewards/accuracies": 0.675000011920929,
141
+ "rewards/chosen": -0.32392174005508423,
142
+ "rewards/margins": 0.18173113465309143,
143
+ "rewards/rejected": -0.5056527853012085,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.1569995638901003,
148
+ "grad_norm": 9.734003752103645,
149
+ "learning_rate": 1.9810077799395846e-07,
150
+ "logits/chosen": -2.2468771934509277,
151
+ "logits/rejected": -2.21939754486084,
152
+ "logps/chosen": -451.7411193847656,
153
+ "logps/rejected": -489.8981018066406,
154
+ "loss": 0.6041,
155
+ "rewards/accuracies": 0.675000011920929,
156
+ "rewards/chosen": -0.6296826004981995,
157
+ "rewards/margins": 0.2538929581642151,
158
+ "rewards/rejected": -0.8835756182670593,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.17444395987788922,
163
+ "grad_norm": 9.198990938758325,
164
+ "learning_rate": 1.9673579897323202e-07,
165
+ "logits/chosen": -2.2583627700805664,
166
+ "logits/rejected": -2.319267749786377,
167
+ "logps/chosen": -449.26287841796875,
168
+ "logps/rejected": -485.2509765625,
169
+ "loss": 0.6011,
170
+ "rewards/accuracies": 0.6812499761581421,
171
+ "rewards/chosen": -0.8181831240653992,
172
+ "rewards/margins": 0.3021746277809143,
173
+ "rewards/rejected": -1.1203577518463135,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.17444395987788922,
178
+ "eval_logits/chosen": -2.4041965007781982,
179
+ "eval_logits/rejected": -2.4543516635894775,
180
+ "eval_logps/chosen": -468.4564514160156,
181
+ "eval_logps/rejected": -492.26025390625,
182
+ "eval_loss": 0.5738404989242554,
183
+ "eval_rewards/accuracies": 0.6987951993942261,
184
+ "eval_rewards/chosen": -0.8769527673721313,
185
+ "eval_rewards/margins": 0.403839111328125,
186
+ "eval_rewards/rejected": -1.2807917594909668,
187
+ "eval_runtime": 115.5904,
188
+ "eval_samples_per_second": 22.865,
189
+ "eval_steps_per_second": 0.718,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.19188835586567815,
194
+ "grad_norm": 8.908627437983846,
195
+ "learning_rate": 1.9501095642669734e-07,
196
+ "logits/chosen": -2.5096914768218994,
197
+ "logits/rejected": -2.509361982345581,
198
+ "logps/chosen": -456.6226501464844,
199
+ "logps/rejected": -513.7633056640625,
200
+ "loss": 0.5838,
201
+ "rewards/accuracies": 0.731249988079071,
202
+ "rewards/chosen": -0.8924768567085266,
203
+ "rewards/margins": 0.5146722793579102,
204
+ "rewards/rejected": -1.407149076461792,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.2093327518534671,
209
+ "grad_norm": 10.31116892436808,
210
+ "learning_rate": 1.9293266688191557e-07,
211
+ "logits/chosen": -2.586188793182373,
212
+ "logits/rejected": -2.6450071334838867,
213
+ "logps/chosen": -438.7850646972656,
214
+ "logps/rejected": -467.4371032714844,
215
+ "loss": 0.5873,
216
+ "rewards/accuracies": 0.6875,
217
+ "rewards/chosen": -0.8641183972358704,
218
+ "rewards/margins": 0.36442071199417114,
219
+ "rewards/rejected": -1.2285391092300415,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.226777147841256,
224
+ "grad_norm": 13.457335930423358,
225
+ "learning_rate": 1.9050866171249576e-07,
226
+ "logits/chosen": -2.546307325363159,
227
+ "logits/rejected": -2.5156311988830566,
228
+ "logps/chosen": -459.08306884765625,
229
+ "logps/rejected": -529.2793579101562,
230
+ "loss": 0.5508,
231
+ "rewards/accuracies": 0.7124999761581421,
232
+ "rewards/chosen": -0.9206414222717285,
233
+ "rewards/margins": 0.5198907256126404,
234
+ "rewards/rejected": -1.4405320882797241,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.24422154382904493,
239
+ "grad_norm": 13.212896873797849,
240
+ "learning_rate": 1.8774795837687736e-07,
241
+ "logits/chosen": -2.499363422393799,
242
+ "logits/rejected": -2.6360957622528076,
243
+ "logps/chosen": -512.037353515625,
244
+ "logps/rejected": -548.9447631835938,
245
+ "loss": 0.5728,
246
+ "rewards/accuracies": 0.6625000238418579,
247
+ "rewards/chosen": -1.1992404460906982,
248
+ "rewards/margins": 0.5107727646827698,
249
+ "rewards/rejected": -1.7100131511688232,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.26166593981683384,
254
+ "grad_norm": 13.642312982739075,
255
+ "learning_rate": 1.8466082687279243e-07,
256
+ "logits/chosen": -2.574205160140991,
257
+ "logits/rejected": -2.595555305480957,
258
+ "logps/chosen": -434.45709228515625,
259
+ "logps/rejected": -513.8720703125,
260
+ "loss": 0.572,
261
+ "rewards/accuracies": 0.6937500238418579,
262
+ "rewards/chosen": -0.979955792427063,
263
+ "rewards/margins": 0.5753864645957947,
264
+ "rewards/rejected": -1.5553423166275024,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.27911033580462274,
269
+ "grad_norm": 12.825272909657283,
270
+ "learning_rate": 1.8125875153219963e-07,
271
+ "logits/chosen": -2.561600923538208,
272
+ "logits/rejected": -2.64467716217041,
273
+ "logps/chosen": -466.68121337890625,
274
+ "logps/rejected": -522.5623168945312,
275
+ "loss": 0.5687,
276
+ "rewards/accuracies": 0.7124999761581421,
277
+ "rewards/chosen": -0.9761050939559937,
278
+ "rewards/margins": 0.5850359797477722,
279
+ "rewards/rejected": -1.561141014099121,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.2965547317924117,
284
+ "grad_norm": 10.830977614727727,
285
+ "learning_rate": 1.77554388298815e-07,
286
+ "logits/chosen": -2.541180372238159,
287
+ "logits/rejected": -2.5329298973083496,
288
+ "logps/chosen": -469.1494140625,
289
+ "logps/rejected": -527.3177490234375,
290
+ "loss": 0.5516,
291
+ "rewards/accuracies": 0.762499988079071,
292
+ "rewards/chosen": -0.9942892789840698,
293
+ "rewards/margins": 0.6251423954963684,
294
+ "rewards/rejected": -1.6194318532943726,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.3139991277802006,
299
+ "grad_norm": 11.736716858310073,
300
+ "learning_rate": 1.735615176471701e-07,
301
+ "logits/chosen": -2.4211761951446533,
302
+ "logits/rejected": -2.5089921951293945,
303
+ "logps/chosen": -524.66259765625,
304
+ "logps/rejected": -578.3092651367188,
305
+ "loss": 0.5538,
306
+ "rewards/accuracies": 0.699999988079071,
307
+ "rewards/chosen": -1.192502737045288,
308
+ "rewards/margins": 0.5517352819442749,
309
+ "rewards/rejected": -1.744238257408142,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.3314435237679895,
314
+ "grad_norm": 12.055419698459572,
315
+ "learning_rate": 1.692949933183416e-07,
316
+ "logits/chosen": -2.556756019592285,
317
+ "logits/rejected": -2.691668748855591,
318
+ "logps/chosen": -500.4410095214844,
319
+ "logps/rejected": -543.4578857421875,
320
+ "loss": 0.5649,
321
+ "rewards/accuracies": 0.731249988079071,
322
+ "rewards/chosen": -1.0238206386566162,
323
+ "rewards/margins": 0.6150747537612915,
324
+ "rewards/rejected": -1.6388953924179077,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.34888791975577843,
329
+ "grad_norm": 18.710020531498376,
330
+ "learning_rate": 1.64770687063059e-07,
331
+ "logits/chosen": -2.658360242843628,
332
+ "logits/rejected": -2.7501015663146973,
333
+ "logps/chosen": -490.89263916015625,
334
+ "logps/rejected": -532.71142578125,
335
+ "loss": 0.5447,
336
+ "rewards/accuracies": 0.699999988079071,
337
+ "rewards/chosen": -1.049081563949585,
338
+ "rewards/margins": 0.5787625312805176,
339
+ "rewards/rejected": -1.627844214439392,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.34888791975577843,
344
+ "eval_logits/chosen": -2.5731849670410156,
345
+ "eval_logits/rejected": -2.631882905960083,
346
+ "eval_logps/chosen": -513.11767578125,
347
+ "eval_logps/rejected": -572.9752197265625,
348
+ "eval_loss": 0.524207353591919,
349
+ "eval_rewards/accuracies": 0.7289156913757324,
350
+ "eval_rewards/chosen": -1.3235652446746826,
351
+ "eval_rewards/margins": 0.7643768787384033,
352
+ "eval_rewards/rejected": -2.087942123413086,
353
+ "eval_runtime": 115.5218,
354
+ "eval_samples_per_second": 22.879,
355
+ "eval_steps_per_second": 0.718,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.3663323157435674,
360
+ "grad_norm": 14.782374340335828,
361
+ "learning_rate": 1.6000542959774935e-07,
362
+ "logits/chosen": -2.4421656131744385,
363
+ "logits/rejected": -2.49908709526062,
364
+ "logps/chosen": -515.1295166015625,
365
+ "logps/rejected": -585.6087036132812,
366
+ "loss": 0.5478,
367
+ "rewards/accuracies": 0.6937500238418579,
368
+ "rewards/chosen": -1.2250968217849731,
369
+ "rewards/margins": 0.6607337594032288,
370
+ "rewards/rejected": -1.8858305215835571,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.3837767117313563,
375
+ "grad_norm": 16.586610273944082,
376
+ "learning_rate": 1.550169479931667e-07,
377
+ "logits/chosen": -2.4158313274383545,
378
+ "logits/rejected": -2.621553897857666,
379
+ "logps/chosen": -513.0745849609375,
380
+ "logps/rejected": -560.3120727539062,
381
+ "loss": 0.5455,
382
+ "rewards/accuracies": 0.7562500238418579,
383
+ "rewards/chosen": -1.242750883102417,
384
+ "rewards/margins": 0.648566722869873,
385
+ "rewards/rejected": -1.89131760597229,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.4012211077191452,
390
+ "grad_norm": 17.097117314278307,
391
+ "learning_rate": 1.498237997285247e-07,
392
+ "logits/chosen": -2.6082608699798584,
393
+ "logits/rejected": -2.528676748275757,
394
+ "logps/chosen": -483.05859375,
395
+ "logps/rejected": -599.164794921875,
396
+ "loss": 0.5383,
397
+ "rewards/accuracies": 0.7875000238418579,
398
+ "rewards/chosen": -1.363515019416809,
399
+ "rewards/margins": 0.8434240221977234,
400
+ "rewards/rejected": -2.2069389820098877,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.4186655037069342,
405
+ "grad_norm": 13.962480844795232,
406
+ "learning_rate": 1.4444530365645478e-07,
407
+ "logits/chosen": -2.5863888263702393,
408
+ "logits/rejected": -2.6373305320739746,
409
+ "logps/chosen": -480.4344787597656,
410
+ "logps/rejected": -544.6641235351562,
411
+ "loss": 0.5372,
412
+ "rewards/accuracies": 0.7124999761581421,
413
+ "rewards/chosen": -1.089019536972046,
414
+ "rewards/margins": 0.6869910955429077,
415
+ "rewards/rejected": -1.7760107517242432,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.4361098996947231,
420
+ "grad_norm": 17.51761421234639,
421
+ "learning_rate": 1.389014681356059e-07,
422
+ "logits/chosen": -2.37886905670166,
423
+ "logits/rejected": -2.5640721321105957,
424
+ "logps/chosen": -522.6568603515625,
425
+ "logps/rejected": -547.7442626953125,
426
+ "loss": 0.5414,
427
+ "rewards/accuracies": 0.7562500238418579,
428
+ "rewards/chosen": -1.2930206060409546,
429
+ "rewards/margins": 0.7195638418197632,
430
+ "rewards/rejected": -2.012584686279297,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.453554295682512,
435
+ "grad_norm": 12.985606778494319,
436
+ "learning_rate": 1.3321291659823587e-07,
437
+ "logits/chosen": -2.4836788177490234,
438
+ "logits/rejected": -2.5293915271759033,
439
+ "logps/chosen": -514.5162353515625,
440
+ "logps/rejected": -592.8468627929688,
441
+ "loss": 0.5395,
442
+ "rewards/accuracies": 0.768750011920929,
443
+ "rewards/chosen": -1.1346120834350586,
444
+ "rewards/margins": 0.8249943852424622,
445
+ "rewards/rejected": -1.959606409072876,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.4709986916703009,
450
+ "grad_norm": 13.86783402737886,
451
+ "learning_rate": 1.2740081082968898e-07,
452
+ "logits/chosen": -2.4625072479248047,
453
+ "logits/rejected": -2.4806463718414307,
454
+ "logps/chosen": -491.41607666015625,
455
+ "logps/rejected": -535.6668701171875,
456
+ "loss": 0.5356,
457
+ "rewards/accuracies": 0.706250011920929,
458
+ "rewards/chosen": -1.145365595817566,
459
+ "rewards/margins": 0.6424422264099121,
460
+ "rewards/rejected": -1.787807822227478,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.48844308765808986,
465
+ "grad_norm": 17.896568729720236,
466
+ "learning_rate": 1.2148677224516458e-07,
467
+ "logits/chosen": -2.426962375640869,
468
+ "logits/rejected": -2.6068427562713623,
469
+ "logps/chosen": -513.377197265625,
470
+ "logps/rejected": -570.4599609375,
471
+ "loss": 0.5408,
472
+ "rewards/accuracies": 0.699999988079071,
473
+ "rewards/chosen": -1.4353644847869873,
474
+ "rewards/margins": 0.6926987767219543,
475
+ "rewards/rejected": -2.128063201904297,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.5058874836458788,
480
+ "grad_norm": 12.317018338997508,
481
+ "learning_rate": 1.1549280145663242e-07,
482
+ "logits/chosen": -2.395157814025879,
483
+ "logits/rejected": -2.5112946033477783,
484
+ "logps/chosen": -489.17291259765625,
485
+ "logps/rejected": -561.0538940429688,
486
+ "loss": 0.5286,
487
+ "rewards/accuracies": 0.731249988079071,
488
+ "rewards/chosen": -1.206729531288147,
489
+ "rewards/margins": 0.7861859798431396,
490
+ "rewards/rejected": -1.9929155111312866,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.5233318796336677,
495
+ "grad_norm": 18.19143218783634,
496
+ "learning_rate": 1.0944119642911108e-07,
497
+ "logits/chosen": -2.4938693046569824,
498
+ "logits/rejected": -2.5243020057678223,
499
+ "logps/chosen": -499.1390686035156,
500
+ "logps/rejected": -593.7322998046875,
501
+ "loss": 0.5173,
502
+ "rewards/accuracies": 0.7562500238418579,
503
+ "rewards/chosen": -1.3326854705810547,
504
+ "rewards/margins": 0.7864323854446411,
505
+ "rewards/rejected": -2.1191182136535645,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.5233318796336677,
510
+ "eval_logits/chosen": -2.5555672645568848,
511
+ "eval_logits/rejected": -2.614039659500122,
512
+ "eval_logps/chosen": -549.0404052734375,
513
+ "eval_logps/rejected": -632.2808837890625,
514
+ "eval_loss": 0.5003006458282471,
515
+ "eval_rewards/accuracies": 0.7259036302566528,
516
+ "eval_rewards/chosen": -1.682791829109192,
517
+ "eval_rewards/margins": 0.998206377029419,
518
+ "eval_rewards/rejected": -2.6809980869293213,
519
+ "eval_runtime": 115.4293,
520
+ "eval_samples_per_second": 22.897,
521
+ "eval_steps_per_second": 0.719,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.5407762756214566,
526
+ "grad_norm": 12.275061942289778,
527
+ "learning_rate": 1.0335446953077364e-07,
528
+ "logits/chosen": -2.607405424118042,
529
+ "logits/rejected": -2.575361728668213,
530
+ "logps/chosen": -478.61065673828125,
531
+ "logps/rejected": -597.8718872070312,
532
+ "loss": 0.5464,
533
+ "rewards/accuracies": 0.737500011920929,
534
+ "rewards/chosen": -1.3932697772979736,
535
+ "rewards/margins": 0.8137472867965698,
536
+ "rewards/rejected": -2.207017421722412,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.5582206716092455,
541
+ "grad_norm": 15.17777342496625,
542
+ "learning_rate": 9.725526378545951e-08,
543
+ "logits/chosen": -2.447172164916992,
544
+ "logits/rejected": -2.569131374359131,
545
+ "logps/chosen": -548.3880615234375,
546
+ "logps/rejected": -593.2091064453125,
547
+ "loss": 0.5209,
548
+ "rewards/accuracies": 0.7250000238418579,
549
+ "rewards/chosen": -1.2796152830123901,
550
+ "rewards/margins": 0.6179531812667847,
551
+ "rewards/rejected": -1.8975684642791748,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.5756650675970345,
556
+ "grad_norm": 18.51656864343838,
557
+ "learning_rate": 9.116626863913827e-08,
558
+ "logits/chosen": -2.427771806716919,
559
+ "logits/rejected": -2.4281983375549316,
560
+ "logps/chosen": -509.34130859375,
561
+ "logps/rejected": -607.3682861328125,
562
+ "loss": 0.535,
563
+ "rewards/accuracies": 0.75,
564
+ "rewards/chosen": -1.440760612487793,
565
+ "rewards/margins": 0.8197328448295593,
566
+ "rewards/rejected": -2.260493755340576,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.5931094635848234,
571
+ "grad_norm": 15.020047645081487,
572
+ "learning_rate": 8.511013555368081e-08,
573
+ "logits/chosen": -2.5503029823303223,
574
+ "logits/rejected": -2.5775723457336426,
575
+ "logps/chosen": -479.23614501953125,
576
+ "logps/rejected": -573.2176513671875,
577
+ "loss": 0.533,
578
+ "rewards/accuracies": 0.7562500238418579,
579
+ "rewards/chosen": -1.3124325275421143,
580
+ "rewards/margins": 0.8196013569831848,
581
+ "rewards/rejected": -2.1320338249206543,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.6105538595726123,
586
+ "grad_norm": 18.51590550961604,
587
+ "learning_rate": 7.910939374193313e-08,
588
+ "logits/chosen": -2.548419952392578,
589
+ "logits/rejected": -2.51499605178833,
590
+ "logps/chosen": -466.15008544921875,
591
+ "logps/rejected": -586.0462646484375,
592
+ "loss": 0.5176,
593
+ "rewards/accuracies": 0.768750011920929,
594
+ "rewards/chosen": -1.3407198190689087,
595
+ "rewards/margins": 0.9716306924819946,
596
+ "rewards/rejected": -2.3123507499694824,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.6279982555604012,
601
+ "grad_norm": 15.425853441749453,
602
+ "learning_rate": 7.31863663575649e-08,
603
+ "logits/chosen": -2.620063543319702,
604
+ "logits/rejected": -2.692065715789795,
605
+ "logps/chosen": -528.7435913085938,
606
+ "logps/rejected": -594.0607299804688,
607
+ "loss": 0.5143,
608
+ "rewards/accuracies": 0.731249988079071,
609
+ "rewards/chosen": -1.457281231880188,
610
+ "rewards/margins": 0.8597862124443054,
611
+ "rewards/rejected": -2.3170676231384277,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.6454426515481901,
616
+ "grad_norm": 22.465982391087056,
617
+ "learning_rate": 6.736308745147167e-08,
618
+ "logits/chosen": -2.482309103012085,
619
+ "logits/rejected": -2.63562273979187,
620
+ "logps/chosen": -524.6629638671875,
621
+ "logps/rejected": -600.9884033203125,
622
+ "loss": 0.5375,
623
+ "rewards/accuracies": 0.731249988079071,
624
+ "rewards/chosen": -1.3087918758392334,
625
+ "rewards/margins": 0.8371159434318542,
626
+ "rewards/rejected": -2.1459078788757324,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.662887047535979,
631
+ "grad_norm": 16.00565580949092,
632
+ "learning_rate": 6.166122000365834e-08,
633
+ "logits/chosen": -2.5903308391571045,
634
+ "logits/rejected": -2.7240214347839355,
635
+ "logps/chosen": -514.4639282226562,
636
+ "logps/rejected": -595.83154296875,
637
+ "loss": 0.5013,
638
+ "rewards/accuracies": 0.7875000238418579,
639
+ "rewards/chosen": -1.2856967449188232,
640
+ "rewards/margins": 0.8659652471542358,
641
+ "rewards/rejected": -2.1516618728637695,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.680331443523768,
646
+ "grad_norm": 13.854127143964314,
647
+ "learning_rate": 5.610197533553057e-08,
648
+ "logits/chosen": -2.4627175331115723,
649
+ "logits/rejected": -2.506941795349121,
650
+ "logps/chosen": -571.271484375,
651
+ "logps/rejected": -634.978271484375,
652
+ "loss": 0.5238,
653
+ "rewards/accuracies": 0.699999988079071,
654
+ "rewards/chosen": -1.5051517486572266,
655
+ "rewards/margins": 0.8115229606628418,
656
+ "rewards/rejected": -2.3166747093200684,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.6977758395115569,
661
+ "grad_norm": 14.662169677675843,
662
+ "learning_rate": 5.0706034202386236e-08,
663
+ "logits/chosen": -2.586434841156006,
664
+ "logits/rejected": -2.635715961456299,
665
+ "logps/chosen": -530.5022583007812,
666
+ "logps/rejected": -592.48681640625,
667
+ "loss": 0.5144,
668
+ "rewards/accuracies": 0.762499988079071,
669
+ "rewards/chosen": -1.4007527828216553,
670
+ "rewards/margins": 0.9174124002456665,
671
+ "rewards/rejected": -2.3181653022766113,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.6977758395115569,
676
+ "eval_logits/chosen": -2.636509895324707,
677
+ "eval_logits/rejected": -2.7027053833007812,
678
+ "eval_logps/chosen": -551.8306274414062,
679
+ "eval_logps/rejected": -645.5278930664062,
680
+ "eval_loss": 0.4851160943508148,
681
+ "eval_rewards/accuracies": 0.7319276928901672,
682
+ "eval_rewards/chosen": -1.7106943130493164,
683
+ "eval_rewards/margins": 1.1027746200561523,
684
+ "eval_rewards/rejected": -2.8134689331054688,
685
+ "eval_runtime": 115.3965,
686
+ "eval_samples_per_second": 22.904,
687
+ "eval_steps_per_second": 0.719,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.7152202354993459,
692
+ "grad_norm": 15.725842795128791,
693
+ "learning_rate": 4.5493469859647183e-08,
694
+ "logits/chosen": -2.5586659908294678,
695
+ "logits/rejected": -2.570362091064453,
696
+ "logps/chosen": -540.7760009765625,
697
+ "logps/rejected": -640.4522705078125,
698
+ "loss": 0.5189,
699
+ "rewards/accuracies": 0.706250011920929,
700
+ "rewards/chosen": -1.5797544717788696,
701
+ "rewards/margins": 0.9039770364761353,
702
+ "rewards/rejected": -2.483731269836426,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.7326646314871348,
707
+ "grad_norm": 16.117914868185306,
708
+ "learning_rate": 4.048367338903067e-08,
709
+ "logits/chosen": -2.653681516647339,
710
+ "logits/rejected": -2.6537811756134033,
711
+ "logps/chosen": -497.29254150390625,
712
+ "logps/rejected": -593.9075927734375,
713
+ "loss": 0.5409,
714
+ "rewards/accuracies": 0.78125,
715
+ "rewards/chosen": -1.431205153465271,
716
+ "rewards/margins": 0.8377032279968262,
717
+ "rewards/rejected": -2.2689082622528076,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.7501090274749237,
722
+ "grad_norm": 17.441237136205846,
723
+ "learning_rate": 3.569528156245196e-08,
724
+ "logits/chosen": -2.567112684249878,
725
+ "logits/rejected": -2.5458970069885254,
726
+ "logps/chosen": -497.869140625,
727
+ "logps/rejected": -593.7197875976562,
728
+ "loss": 0.5176,
729
+ "rewards/accuracies": 0.7562500238418579,
730
+ "rewards/chosen": -1.315579891204834,
731
+ "rewards/margins": 0.8977547883987427,
732
+ "rewards/rejected": -2.213334798812866,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.7675534234627126,
737
+ "grad_norm": 15.596090656894537,
738
+ "learning_rate": 3.1146107512008505e-08,
739
+ "logits/chosen": -2.474587917327881,
740
+ "logits/rejected": -2.5896928310394287,
741
+ "logps/chosen": -536.7967529296875,
742
+ "logps/rejected": -597.7022705078125,
743
+ "loss": 0.5158,
744
+ "rewards/accuracies": 0.737500011920929,
745
+ "rewards/chosen": -1.424076795578003,
746
+ "rewards/margins": 0.8131541013717651,
747
+ "rewards/rejected": -2.2372307777404785,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.7849978194505015,
752
+ "grad_norm": 16.944860501421836,
753
+ "learning_rate": 2.6853074463958614e-08,
754
+ "logits/chosen": -2.64913010597229,
755
+ "logits/rejected": -2.6915199756622314,
756
+ "logps/chosen": -557.8106079101562,
757
+ "logps/rejected": -629.3753662109375,
758
+ "loss": 0.5297,
759
+ "rewards/accuracies": 0.737500011920929,
760
+ "rewards/chosen": -1.6522800922393799,
761
+ "rewards/margins": 1.0145955085754395,
762
+ "rewards/rejected": -2.6668756008148193,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.8024422154382904,
767
+ "grad_norm": 16.014043499865636,
768
+ "learning_rate": 2.283215278320839e-08,
769
+ "logits/chosen": -2.3578901290893555,
770
+ "logits/rejected": -2.526782989501953,
771
+ "logps/chosen": -549.39306640625,
772
+ "logps/rejected": -602.7447509765625,
773
+ "loss": 0.5112,
774
+ "rewards/accuracies": 0.768750011920929,
775
+ "rewards/chosen": -1.3445526361465454,
776
+ "rewards/margins": 0.8740399479866028,
777
+ "rewards/rejected": -2.218592405319214,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.8198866114260793,
782
+ "grad_norm": 16.802924841873157,
783
+ "learning_rate": 1.9098300562505266e-08,
784
+ "logits/chosen": -2.4426217079162598,
785
+ "logits/rejected": -2.56174898147583,
786
+ "logps/chosen": -512.5400390625,
787
+ "logps/rejected": -600.3233642578125,
788
+ "loss": 0.5097,
789
+ "rewards/accuracies": 0.7749999761581421,
790
+ "rewards/chosen": -1.3759263753890991,
791
+ "rewards/margins": 0.9335910081863403,
792
+ "rewards/rejected": -2.3095173835754395,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.8373310074138683,
797
+ "grad_norm": 21.49561746024287,
798
+ "learning_rate": 1.5665407977350388e-08,
799
+ "logits/chosen": -2.4791998863220215,
800
+ "logits/rejected": -2.603175640106201,
801
+ "logps/chosen": -527.2852783203125,
802
+ "logps/rejected": -596.7613525390625,
803
+ "loss": 0.4766,
804
+ "rewards/accuracies": 0.699999988079071,
805
+ "rewards/chosen": -1.4009100198745728,
806
+ "rewards/margins": 0.7795251607894897,
807
+ "rewards/rejected": -2.1804349422454834,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.8547754034016573,
812
+ "grad_norm": 16.63581899808769,
813
+ "learning_rate": 1.2546245613633688e-08,
814
+ "logits/chosen": -2.5286920070648193,
815
+ "logits/rejected": -2.70131254196167,
816
+ "logps/chosen": -574.4547119140625,
817
+ "logps/rejected": -626.7061767578125,
818
+ "loss": 0.5147,
819
+ "rewards/accuracies": 0.75,
820
+ "rewards/chosen": -1.7204509973526,
821
+ "rewards/margins": 0.9493522644042969,
822
+ "rewards/rejected": -2.6698031425476074,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.8722197993894462,
827
+ "grad_norm": 19.877142016669573,
828
+ "learning_rate": 9.752416960215437e-09,
829
+ "logits/chosen": -2.546855926513672,
830
+ "logits/rejected": -2.697226047515869,
831
+ "logps/chosen": -550.6837158203125,
832
+ "logps/rejected": -651.0521850585938,
833
+ "loss": 0.5162,
834
+ "rewards/accuracies": 0.762499988079071,
835
+ "rewards/chosen": -1.7263736724853516,
836
+ "rewards/margins": 0.9590435028076172,
837
+ "rewards/rejected": -2.6854171752929688,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.8722197993894462,
842
+ "eval_logits/chosen": -2.581234931945801,
843
+ "eval_logits/rejected": -2.644242525100708,
844
+ "eval_logps/chosen": -551.6072387695312,
845
+ "eval_logps/rejected": -648.58154296875,
846
+ "eval_loss": 0.4797753691673279,
847
+ "eval_rewards/accuracies": 0.7259036302566528,
848
+ "eval_rewards/chosen": -1.7084604501724243,
849
+ "eval_rewards/margins": 1.135544776916504,
850
+ "eval_rewards/rejected": -2.8440048694610596,
851
+ "eval_runtime": 115.2944,
852
+ "eval_samples_per_second": 22.924,
853
+ "eval_steps_per_second": 0.72,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.8896641953772351,
858
+ "grad_norm": 15.90276225770206,
859
+ "learning_rate": 7.294315243185578e-09,
860
+ "logits/chosen": -2.5750200748443604,
861
+ "logits/rejected": -2.5045828819274902,
862
+ "logps/chosen": -503.51995849609375,
863
+ "logps/rejected": -624.311279296875,
864
+ "loss": 0.4957,
865
+ "rewards/accuracies": 0.7562500238418579,
866
+ "rewards/chosen": -1.5083858966827393,
867
+ "rewards/margins": 0.9982180595397949,
868
+ "rewards/rejected": -2.506603956222534,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.907108591365024,
873
+ "grad_norm": 13.947597611873226,
874
+ "learning_rate": 5.18108476238015e-09,
875
+ "logits/chosen": -2.6210732460021973,
876
+ "logits/rejected": -2.634568452835083,
877
+ "logps/chosen": -557.1134643554688,
878
+ "logps/rejected": -666.3057861328125,
879
+ "loss": 0.5147,
880
+ "rewards/accuracies": 0.6937500238418579,
881
+ "rewards/chosen": -1.5826480388641357,
882
+ "rewards/margins": 0.9864925146102905,
883
+ "rewards/rejected": -2.569140672683716,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.9245529873528129,
888
+ "grad_norm": 15.796674513066053,
889
+ "learning_rate": 3.4205868739851316e-09,
890
+ "logits/chosen": -2.608445167541504,
891
+ "logits/rejected": -2.5723965167999268,
892
+ "logps/chosen": -519.9942626953125,
893
+ "logps/rejected": -620.4813842773438,
894
+ "loss": 0.5135,
895
+ "rewards/accuracies": 0.699999988079071,
896
+ "rewards/chosen": -1.6632301807403564,
897
+ "rewards/margins": 0.8547458648681641,
898
+ "rewards/rejected": -2.5179758071899414,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.9419973833406018,
903
+ "grad_norm": 19.347436900380448,
904
+ "learning_rate": 2.0193707457752727e-09,
905
+ "logits/chosen": -2.589115858078003,
906
+ "logits/rejected": -2.6705126762390137,
907
+ "logps/chosen": -543.7213134765625,
908
+ "logps/rejected": -621.6940307617188,
909
+ "loss": 0.5146,
910
+ "rewards/accuracies": 0.706250011920929,
911
+ "rewards/chosen": -1.5609385967254639,
912
+ "rewards/margins": 0.875388503074646,
913
+ "rewards/rejected": -2.4363272190093994,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.9594417793283908,
918
+ "grad_norm": 17.51566703914577,
919
+ "learning_rate": 9.826489937796556e-10,
920
+ "logits/chosen": -2.6343085765838623,
921
+ "logits/rejected": -2.683014392852783,
922
+ "logps/chosen": -527.5286254882812,
923
+ "logps/rejected": -622.0753173828125,
924
+ "loss": 0.5309,
925
+ "rewards/accuracies": 0.71875,
926
+ "rewards/chosen": -1.4900976419448853,
927
+ "rewards/margins": 0.976272702217102,
928
+ "rewards/rejected": -2.466370105743408,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.9768861753161797,
933
+ "grad_norm": 23.275500057845633,
934
+ "learning_rate": 3.142782910077968e-10,
935
+ "logits/chosen": -2.542066812515259,
936
+ "logits/rejected": -2.601771831512451,
937
+ "logps/chosen": -528.61328125,
938
+ "logps/rejected": -610.769287109375,
939
+ "loss": 0.5156,
940
+ "rewards/accuracies": 0.6875,
941
+ "rewards/chosen": -1.7321722507476807,
942
+ "rewards/margins": 0.8784612417221069,
943
+ "rewards/rejected": -2.610633373260498,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.9943305713039686,
948
+ "grad_norm": 18.734942592809794,
949
+ "learning_rate": 1.674502037277703e-11,
950
+ "logits/chosen": -2.5349037647247314,
951
+ "logits/rejected": -2.5301239490509033,
952
+ "logps/chosen": -525.6041259765625,
953
+ "logps/rejected": -626.7215576171875,
954
+ "loss": 0.5109,
955
+ "rewards/accuracies": 0.7749999761581421,
956
+ "rewards/chosen": -1.4237686395645142,
957
+ "rewards/margins": 0.9176391363143921,
958
+ "rewards/rejected": -2.3414077758789062,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.9995638901003053,
963
+ "step": 573,
964
  "total_flos": 0.0,
965
+ "train_loss": 0.5544013454860002,
966
+ "train_runtime": 8395.7281,
967
+ "train_samples_per_second": 8.737,
968
+ "train_steps_per_second": 0.068
969
  }
970
  ],
971
  "logging_steps": 10,
972
+ "max_steps": 573,
973
  "num_input_tokens_seen": 0,
974
  "num_train_epochs": 1,
975
  "save_steps": 500,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ce8b2421921474318011c5f15c249adb5088f7fc2bc277edc3bb5ecf5ff4157
3
  size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca9ebd23674d256f63b967bc2d04821d10f0cc4060b957f60ff0155883808065
3
  size 7544