just1nseo committed on
Commit
bf9d83a
1 Parent(s): 9d147e3

Model save

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: alignment-handbook/zephyr-7b-sft-full
9
+ model-index:
10
+ - name: zephyr-dpop-qlora-gpt4-5e-7-epoch3
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-dpop-qlora-gpt4-5e-7-epoch3
18
+
19
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 1.5707
22
+ - Positive Losses: 8.9081
23
+ - Dpo Losses: 0.6660
24
+ - Rewards/chosen: -0.0431
25
+ - Rewards/rejected: -0.1110
26
+ - Rewards/accuracies: 0.6151
27
+ - Rewards/margins: 0.0679
28
+ - Rewards/margins Max: 0.3167
29
+ - Rewards/margins Min: -0.1568
30
+ - Rewards/margins Std: 0.2111
31
+ - Logps/rejected: -270.2825
32
+ - Logps/chosen: -289.5273
33
+ - Logits/rejected: -2.6606
34
+ - Logits/chosen: -2.7037
35
+
36
+ ## Model description
37
+
38
+ More information needed
39
+
40
+ ## Intended uses & limitations
41
+
42
+ More information needed
43
+
44
+ ## Training and evaluation data
45
+
46
+ More information needed
47
+
48
+ ## Training procedure
49
+
50
+ ### Training hyperparameters
51
+
52
+ The following hyperparameters were used during training:
53
+ - learning_rate: 5e-07
54
+ - train_batch_size: 2
55
+ - eval_batch_size: 4
56
+ - seed: 42
57
+ - distributed_type: multi-GPU
58
+ - num_devices: 8
59
+ - total_train_batch_size: 16
60
+ - total_eval_batch_size: 32
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: cosine
63
+ - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 3
65
+
66
+ ### Training results
67
+
68
+ | Training Loss | Epoch | Step | Validation Loss | Positive Losses | Dpo Losses | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Rewards/margins Max | Rewards/margins Min | Rewards/margins Std | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
69
+ |:-------------:|:-----:|:----:|:---------------:|:---------------:|:----------:|:--------------:|:----------------:|:------------------:|:---------------:|:-------------------:|:-------------------:|:-------------------:|:--------------:|:------------:|:---------------:|:-------------:|
70
+ | 0.6835 | 0.28 | 100 | 0.6965 | 0.0436 | 0.6917 | 0.0092 | 0.0061 | 0.5833 | 0.0030 | 0.0155 | -0.0076 | 0.0103 | -258.5689 | -284.3059 | -2.8089 | -2.8541 |
71
+ | 0.6367 | 0.56 | 200 | 0.7633 | 0.6990 | 0.6863 | 0.0215 | 0.0070 | 0.5873 | 0.0145 | 0.0761 | -0.0391 | 0.0511 | -258.4836 | -283.0695 | -2.7779 | -2.8224 |
72
+ | 0.5913 | 0.85 | 300 | 0.9198 | 2.2041 | 0.6810 | 0.0123 | -0.0144 | 0.5714 | 0.0267 | 0.1358 | -0.0683 | 0.0899 | -260.6202 | -283.9922 | -2.7412 | -2.7853 |
73
+ | 0.5502 | 1.13 | 400 | 1.0826 | 3.7846 | 0.6770 | 0.0010 | -0.0361 | 0.5754 | 0.0370 | 0.1861 | -0.0963 | 0.1243 | -262.7899 | -285.1261 | -2.7113 | -2.7545 |
74
+ | 0.5398 | 1.41 | 500 | 1.1571 | 4.6567 | 0.6734 | 0.0027 | -0.0441 | 0.5833 | 0.0468 | 0.2338 | -0.1166 | 0.1549 | -263.5918 | -284.9548 | -2.6935 | -2.7368 |
75
+ | 0.5293 | 1.69 | 600 | 1.2245 | 5.3740 | 0.6703 | 0.0016 | -0.0536 | 0.5913 | 0.0552 | 0.2655 | -0.1284 | 0.1752 | -264.5410 | -285.0616 | -2.6767 | -2.7201 |
76
+ | 0.5238 | 1.97 | 700 | 1.3783 | 6.9387 | 0.6683 | -0.0190 | -0.0800 | 0.6032 | 0.0610 | 0.2891 | -0.1425 | 0.1922 | -267.1869 | -287.1237 | -2.6726 | -2.7154 |
77
+ | 0.4880 | 2.25 | 800 | 1.4896 | 8.0964 | 0.6670 | -0.0328 | -0.0978 | 0.6111 | 0.0650 | 0.3063 | -0.1511 | 0.2037 | -268.9666 | -288.5044 | -2.6644 | -2.7076 |
78
+ | 0.5027 | 2.54 | 900 | 1.5575 | 8.7828 | 0.6661 | -0.0416 | -0.1091 | 0.6190 | 0.0675 | 0.3151 | -0.1563 | 0.2099 | -270.0926 | -289.3809 | -2.6629 | -2.7059 |
79
+ | 0.4962 | 2.82 | 1000 | 1.5707 | 8.9081 | 0.6660 | -0.0431 | -0.1110 | 0.6151 | 0.0679 | 0.3167 | -0.1568 | 0.2111 | -270.2825 | -289.5273 | -2.6606 | -2.7037 |
80
+
81
+
82
+ ### Framework versions
83
+
84
+ - PEFT 0.7.1
85
+ - Transformers 4.39.0.dev0
86
+ - PyTorch 2.1.2+cu121
87
+ - Datasets 2.14.6
88
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e4666d605bd9256d7d5efe7089b301ca167db77762667386e0fd8dfb71f52d1
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aa6a63c1f04b9bcd9909ce2443b9d4dbb12f0f073d7307c4703a7f6f5dde8f3
3
  size 671150064
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.5596254680078354,
4
+ "train_runtime": 11165.5936,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 1.526,
7
+ "train_steps_per_second": 0.095
8
+ }
runs/Jul29_12-00-30_node02/events.out.tfevents.1722222423.node02.3000485.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4be0e7b1e1426a45da27ed0eecf4983d2ffd90f7b04320c4221e39c9a9e7296a
3
- size 116014
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01529344d77ce278f9518776ae2bab343c21a98b40d20713ce3884eba1ae2d49
3
+ size 122338
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.5596254680078354,
4
+ "train_runtime": 11165.5936,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 1.526,
7
+ "train_steps_per_second": 0.095
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1065,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "dpo_losses": 0.6931471824645996,
13
+ "epoch": 0.0,
14
+ "grad_norm": 2.1187342100202096,
15
+ "learning_rate": 4.6728971962616815e-09,
16
+ "logits/chosen": -2.8477635383605957,
17
+ "logits/rejected": -2.8469698429107666,
18
+ "logps/chosen": -522.6112670898438,
19
+ "logps/rejected": -359.48583984375,
20
+ "loss": 0.6931,
21
+ "positive_losses": 0.0,
22
+ "rewards/accuracies": 0.0,
23
+ "rewards/chosen": 0.0,
24
+ "rewards/margins": 0.0,
25
+ "rewards/margins_max": 0.0,
26
+ "rewards/margins_min": 0.0,
27
+ "rewards/margins_std": 0.0,
28
+ "rewards/rejected": 0.0,
29
+ "step": 1
30
+ },
31
+ {
32
+ "dpo_losses": 0.6926888227462769,
33
+ "epoch": 0.03,
34
+ "grad_norm": 24.895678335901746,
35
+ "learning_rate": 4.672897196261682e-08,
36
+ "logits/chosen": -2.9204907417297363,
37
+ "logits/rejected": -2.7960145473480225,
38
+ "logps/chosen": -313.4462890625,
39
+ "logps/rejected": -170.4320068359375,
40
+ "loss": 0.6989,
41
+ "positive_losses": 0.036266010254621506,
42
+ "rewards/accuracies": 0.3888888955116272,
43
+ "rewards/chosen": 0.00024180197215173393,
44
+ "rewards/margins": 0.0009183046640828252,
45
+ "rewards/margins_max": 0.0020684306509792805,
46
+ "rewards/margins_min": -0.0002318212646059692,
47
+ "rewards/margins_std": 0.0016265236772596836,
48
+ "rewards/rejected": -0.0006765025900676847,
49
+ "step": 10
50
+ },
51
+ {
52
+ "dpo_losses": 0.6935218572616577,
53
+ "epoch": 0.06,
54
+ "grad_norm": 26.261976434571714,
55
+ "learning_rate": 9.345794392523364e-08,
56
+ "logits/chosen": -2.764099597930908,
57
+ "logits/rejected": -2.7112996578216553,
58
+ "logps/chosen": -381.15252685546875,
59
+ "logps/rejected": -244.30020141601562,
60
+ "loss": 0.702,
61
+ "positive_losses": 0.120574951171875,
62
+ "rewards/accuracies": 0.4000000059604645,
63
+ "rewards/chosen": -0.00042650121031329036,
64
+ "rewards/margins": -0.0007463769870810211,
65
+ "rewards/margins_max": 0.0008523863507434726,
66
+ "rewards/margins_min": -0.0023451403249055147,
67
+ "rewards/margins_std": 0.0022609930019825697,
68
+ "rewards/rejected": 0.0003198757185600698,
69
+ "step": 20
70
+ },
71
+ {
72
+ "dpo_losses": 0.6929606795310974,
73
+ "epoch": 0.08,
74
+ "grad_norm": 26.568307039870714,
75
+ "learning_rate": 1.4018691588785045e-07,
76
+ "logits/chosen": -2.87646484375,
77
+ "logits/rejected": -2.8245913982391357,
78
+ "logps/chosen": -375.969970703125,
79
+ "logps/rejected": -252.5666961669922,
80
+ "loss": 0.6964,
81
+ "positive_losses": 0.026204681023955345,
82
+ "rewards/accuracies": 0.3499999940395355,
83
+ "rewards/chosen": 0.0013370837550610304,
84
+ "rewards/margins": 0.0003754205536097288,
85
+ "rewards/margins_max": 0.001665195683017373,
86
+ "rewards/margins_min": -0.00091435422655195,
87
+ "rewards/margins_std": 0.0018240170320495963,
88
+ "rewards/rejected": 0.0009616632014513016,
89
+ "step": 30
90
+ },
91
+ {
92
+ "dpo_losses": 0.6924411058425903,
93
+ "epoch": 0.11,
94
+ "grad_norm": 10.124655258526282,
95
+ "learning_rate": 1.8691588785046729e-07,
96
+ "logits/chosen": -2.7380053997039795,
97
+ "logits/rejected": -2.7716264724731445,
98
+ "logps/chosen": -306.62396240234375,
99
+ "logps/rejected": -317.3713684082031,
100
+ "loss": 0.6942,
101
+ "positive_losses": 0.01987609826028347,
102
+ "rewards/accuracies": 0.800000011920929,
103
+ "rewards/chosen": 0.003017005743458867,
104
+ "rewards/margins": 0.0014187573688104749,
105
+ "rewards/margins_max": 0.0038917693309485912,
106
+ "rewards/margins_min": -0.0010542543604969978,
107
+ "rewards/margins_std": 0.003497367026284337,
108
+ "rewards/rejected": 0.001598248491063714,
109
+ "step": 40
110
+ },
111
+ {
112
+ "dpo_losses": 0.6920477151870728,
113
+ "epoch": 0.14,
114
+ "grad_norm": 2.3619576775282343,
115
+ "learning_rate": 2.336448598130841e-07,
116
+ "logits/chosen": -2.7991483211517334,
117
+ "logits/rejected": -2.7094950675964355,
118
+ "logps/chosen": -244.6391143798828,
119
+ "logps/rejected": -173.8690643310547,
120
+ "loss": 0.6924,
121
+ "positive_losses": 0.0,
122
+ "rewards/accuracies": 0.699999988079071,
123
+ "rewards/chosen": 0.005839168094098568,
124
+ "rewards/margins": 0.0022048726677894592,
125
+ "rewards/margins_max": 0.004601255524903536,
126
+ "rewards/margins_min": -0.00019151013111695647,
127
+ "rewards/margins_std": 0.0033889967016875744,
128
+ "rewards/rejected": 0.0036342956591397524,
129
+ "step": 50
130
+ },
131
+ {
132
+ "dpo_losses": 0.6909143328666687,
133
+ "epoch": 0.17,
134
+ "grad_norm": 2.3567546921207154,
135
+ "learning_rate": 2.803738317757009e-07,
136
+ "logits/chosen": -2.7648746967315674,
137
+ "logits/rejected": -2.6973958015441895,
138
+ "logps/chosen": -266.65625,
139
+ "logps/rejected": -235.16543579101562,
140
+ "loss": 0.6914,
141
+ "positive_losses": 0.0,
142
+ "rewards/accuracies": 0.800000011920929,
143
+ "rewards/chosen": 0.0077917128801345825,
144
+ "rewards/margins": 0.004477448761463165,
145
+ "rewards/margins_max": 0.006284839008003473,
146
+ "rewards/margins_min": 0.0026700585149228573,
147
+ "rewards/margins_std": 0.0025560357607901096,
148
+ "rewards/rejected": 0.0033142641186714172,
149
+ "step": 60
150
+ },
151
+ {
152
+ "dpo_losses": 0.6901382207870483,
153
+ "epoch": 0.2,
154
+ "grad_norm": 1.8400876800751589,
155
+ "learning_rate": 3.271028037383177e-07,
156
+ "logits/chosen": -2.6974568367004395,
157
+ "logits/rejected": -2.699871063232422,
158
+ "logps/chosen": -332.7781677246094,
159
+ "logps/rejected": -214.24301147460938,
160
+ "loss": 0.6902,
161
+ "positive_losses": 0.0,
162
+ "rewards/accuracies": 0.8999999761581421,
163
+ "rewards/chosen": 0.01162528432905674,
164
+ "rewards/margins": 0.006035626865923405,
165
+ "rewards/margins_max": 0.010061298497021198,
166
+ "rewards/margins_min": 0.0020099543035030365,
167
+ "rewards/margins_std": 0.005693159066140652,
168
+ "rewards/rejected": 0.005589658860117197,
169
+ "step": 70
170
+ },
171
+ {
172
+ "dpo_losses": 0.688714861869812,
173
+ "epoch": 0.23,
174
+ "grad_norm": 4.087198387473911,
175
+ "learning_rate": 3.7383177570093457e-07,
176
+ "logits/chosen": -2.944587469100952,
177
+ "logits/rejected": -2.859727382659912,
178
+ "logps/chosen": -335.75775146484375,
179
+ "logps/rejected": -286.72412109375,
180
+ "loss": 0.6886,
181
+ "positive_losses": 0.0,
182
+ "rewards/accuracies": 0.8999999761581421,
183
+ "rewards/chosen": 0.012916642241179943,
184
+ "rewards/margins": 0.00890201423317194,
185
+ "rewards/margins_max": 0.013774129562079906,
186
+ "rewards/margins_min": 0.004029898438602686,
187
+ "rewards/margins_std": 0.006890212185680866,
188
+ "rewards/rejected": 0.004014627076685429,
189
+ "step": 80
190
+ },
191
+ {
192
+ "dpo_losses": 0.6876496076583862,
193
+ "epoch": 0.25,
194
+ "grad_norm": 2.4851076976994455,
195
+ "learning_rate": 4.205607476635514e-07,
196
+ "logits/chosen": -2.777012586593628,
197
+ "logits/rejected": -2.767564296722412,
198
+ "logps/chosen": -275.3215637207031,
199
+ "logps/rejected": -202.66746520996094,
200
+ "loss": 0.6869,
201
+ "positive_losses": 0.0,
202
+ "rewards/accuracies": 0.8500000238418579,
203
+ "rewards/chosen": 0.01653674617409706,
204
+ "rewards/margins": 0.011065873317420483,
205
+ "rewards/margins_max": 0.019997073337435722,
206
+ "rewards/margins_min": 0.002134673995897174,
207
+ "rewards/margins_std": 0.012630623765289783,
208
+ "rewards/rejected": 0.005470870994031429,
209
+ "step": 90
210
+ },
211
+ {
212
+ "dpo_losses": 0.6823551058769226,
213
+ "epoch": 0.28,
214
+ "grad_norm": 2.301867398682679,
215
+ "learning_rate": 4.672897196261682e-07,
216
+ "logits/chosen": -2.96421480178833,
217
+ "logits/rejected": -2.8904025554656982,
218
+ "logps/chosen": -455.4552307128906,
219
+ "logps/rejected": -341.5201416015625,
220
+ "loss": 0.6835,
221
+ "positive_losses": 0.0,
222
+ "rewards/accuracies": 0.949999988079071,
223
+ "rewards/chosen": 0.028032690286636353,
224
+ "rewards/margins": 0.02175181731581688,
225
+ "rewards/margins_max": 0.03047388233244419,
226
+ "rewards/margins_min": 0.013029751367866993,
227
+ "rewards/margins_std": 0.01233486458659172,
228
+ "rewards/rejected": 0.00628087529912591,
229
+ "step": 100
230
+ },
231
+ {
232
+ "epoch": 0.28,
233
+ "eval_dpo_losses": 0.6916564702987671,
234
+ "eval_logits/chosen": -2.854139804840088,
235
+ "eval_logits/rejected": -2.808910608291626,
236
+ "eval_logps/chosen": -284.305908203125,
237
+ "eval_logps/rejected": -258.5689392089844,
238
+ "eval_loss": 0.6964598298072815,
239
+ "eval_positive_losses": 0.0435669906437397,
240
+ "eval_rewards/accuracies": 0.5833333134651184,
241
+ "eval_rewards/chosen": 0.009152961894869804,
242
+ "eval_rewards/margins": 0.0030159971211105585,
243
+ "eval_rewards/margins_max": 0.015517139807343483,
244
+ "eval_rewards/margins_min": -0.007612254936248064,
245
+ "eval_rewards/margins_std": 0.01032496802508831,
246
+ "eval_rewards/rejected": 0.006136965472251177,
247
+ "eval_runtime": 380.9273,
248
+ "eval_samples_per_second": 5.25,
249
+ "eval_steps_per_second": 0.165,
250
+ "step": 100
251
+ },
252
+ {
253
+ "dpo_losses": 0.6818417310714722,
254
+ "epoch": 0.31,
255
+ "grad_norm": 2.49767805990243,
256
+ "learning_rate": 4.999879018839287e-07,
257
+ "logits/chosen": -2.851139545440674,
258
+ "logits/rejected": -2.726069688796997,
259
+ "logps/chosen": -324.7695007324219,
260
+ "logps/rejected": -245.7756805419922,
261
+ "loss": 0.6789,
262
+ "positive_losses": 0.0,
263
+ "rewards/accuracies": 0.8999999761581421,
264
+ "rewards/chosen": 0.029296237975358963,
265
+ "rewards/margins": 0.022873710840940475,
266
+ "rewards/margins_max": 0.03740672022104263,
267
+ "rewards/margins_min": 0.008340701460838318,
268
+ "rewards/margins_std": 0.020552778616547585,
269
+ "rewards/rejected": 0.006422528065741062,
270
+ "step": 110
271
+ },
272
+ {
273
+ "dpo_losses": 0.6763116717338562,
274
+ "epoch": 0.34,
275
+ "grad_norm": 2.0754455271947125,
276
+ "learning_rate": 4.997728568369408e-07,
277
+ "logits/chosen": -2.9743714332580566,
278
+ "logits/rejected": -2.9403293132781982,
279
+ "logps/chosen": -367.7530212402344,
280
+ "logps/rejected": -330.84222412109375,
281
+ "loss": 0.6754,
282
+ "positive_losses": 0.0,
283
+ "rewards/accuracies": 0.949999988079071,
284
+ "rewards/chosen": 0.041877757757902145,
285
+ "rewards/margins": 0.03419335186481476,
286
+ "rewards/margins_max": 0.04843282699584961,
287
+ "rewards/margins_min": 0.019953874871134758,
288
+ "rewards/margins_std": 0.02013765648007393,
289
+ "rewards/rejected": 0.0076844110153615475,
290
+ "step": 120
291
+ },
292
+ {
293
+ "dpo_losses": 0.6634218692779541,
294
+ "epoch": 0.37,
295
+ "grad_norm": 2.0479204842061978,
296
+ "learning_rate": 4.992892309373227e-07,
297
+ "logits/chosen": -2.8403239250183105,
298
+ "logits/rejected": -2.7551536560058594,
299
+ "logps/chosen": -384.84814453125,
300
+ "logps/rejected": -258.2791442871094,
301
+ "loss": 0.6672,
302
+ "positive_losses": 0.0,
303
+ "rewards/accuracies": 1.0,
304
+ "rewards/chosen": 0.061241261661052704,
305
+ "rewards/margins": 0.06080120801925659,
306
+ "rewards/margins_max": 0.08230480551719666,
307
+ "rewards/margins_min": 0.039297617971897125,
308
+ "rewards/margins_std": 0.03041067160665989,
309
+ "rewards/rejected": 0.00044005707604810596,
310
+ "step": 130
311
+ },
312
+ {
313
+ "dpo_losses": 0.6647445559501648,
314
+ "epoch": 0.39,
315
+ "grad_norm": 1.7367015733096829,
316
+ "learning_rate": 4.985375442281968e-07,
317
+ "logits/chosen": -2.8128199577331543,
318
+ "logits/rejected": -2.783970355987549,
319
+ "logps/chosen": -345.33953857421875,
320
+ "logps/rejected": -231.666259765625,
321
+ "loss": 0.666,
322
+ "positive_losses": 0.0,
323
+ "rewards/accuracies": 0.949999988079071,
324
+ "rewards/chosen": 0.05871708318591118,
325
+ "rewards/margins": 0.058249205350875854,
326
+ "rewards/margins_max": 0.0843496099114418,
327
+ "rewards/margins_min": 0.03214879333972931,
328
+ "rewards/margins_std": 0.03691155090928078,
329
+ "rewards/rejected": 0.0004678791738115251,
330
+ "step": 140
331
+ },
332
+ {
333
+ "dpo_losses": 0.6681698560714722,
334
+ "epoch": 0.42,
335
+ "grad_norm": 2.1268042590236584,
336
+ "learning_rate": 4.975186049985817e-07,
337
+ "logits/chosen": -2.8787732124328613,
338
+ "logits/rejected": -2.80892276763916,
339
+ "logps/chosen": -291.04962158203125,
340
+ "logps/rejected": -247.11825561523438,
341
+ "loss": 0.6614,
342
+ "positive_losses": 0.0,
343
+ "rewards/accuracies": 0.949999988079071,
344
+ "rewards/chosen": 0.05772637203335762,
345
+ "rewards/margins": 0.051212601363658905,
346
+ "rewards/margins_max": 0.08002050220966339,
347
+ "rewards/margins_min": 0.022404693067073822,
348
+ "rewards/margins_std": 0.04074053093791008,
349
+ "rewards/rejected": 0.006513768341392279,
350
+ "step": 150
351
+ },
352
+ {
353
+ "dpo_losses": 0.6561421751976013,
354
+ "epoch": 0.45,
355
+ "grad_norm": 1.7754914073644856,
356
+ "learning_rate": 4.962335089142375e-07,
357
+ "logits/chosen": -2.888826608657837,
358
+ "logits/rejected": -2.7711613178253174,
359
+ "logps/chosen": -323.6219787597656,
360
+ "logps/rejected": -242.70925903320312,
361
+ "loss": 0.6537,
362
+ "positive_losses": 0.020649338141083717,
363
+ "rewards/accuracies": 0.8999999761581421,
364
+ "rewards/chosen": 0.08097714930772781,
365
+ "rewards/margins": 0.07674388587474823,
366
+ "rewards/margins_max": 0.12844815850257874,
367
+ "rewards/margins_min": 0.02503962442278862,
368
+ "rewards/margins_std": 0.07312087714672089,
369
+ "rewards/rejected": 0.004233261104673147,
370
+ "step": 160
371
+ },
372
+ {
373
+ "dpo_losses": 0.6377928853034973,
374
+ "epoch": 0.48,
375
+ "grad_norm": 1.8314743659672637,
376
+ "learning_rate": 4.946836378394966e-07,
377
+ "logits/chosen": -2.927858591079712,
378
+ "logits/rejected": -2.771623134613037,
379
+ "logps/chosen": -388.8592224121094,
380
+ "logps/rejected": -233.54714965820312,
381
+ "loss": 0.6505,
382
+ "positive_losses": 0.0,
383
+ "rewards/accuracies": 0.949999988079071,
384
+ "rewards/chosen": 0.11622031778097153,
385
+ "rewards/margins": 0.11623603105545044,
386
+ "rewards/margins_max": 0.1615394800901413,
387
+ "rewards/margins_min": 0.07093258202075958,
388
+ "rewards/margins_std": 0.0640687644481659,
389
+ "rewards/rejected": -1.5713460015831515e-05,
390
+ "step": 170
391
+ },
392
+ {
393
+ "dpo_losses": 0.6507928371429443,
394
+ "epoch": 0.51,
395
+ "grad_norm": 2.090628503360293,
396
+ "learning_rate": 4.92870658351344e-07,
397
+ "logits/chosen": -2.827677011489868,
398
+ "logits/rejected": -2.7344062328338623,
399
+ "logps/chosen": -314.31219482421875,
400
+ "logps/rejected": -378.66693115234375,
401
+ "loss": 0.6443,
402
+ "positive_losses": 0.0,
403
+ "rewards/accuracies": 0.949999988079071,
404
+ "rewards/chosen": 0.09823790937662125,
405
+ "rewards/margins": 0.08783474564552307,
406
+ "rewards/margins_max": 0.11842919886112213,
407
+ "rewards/margins_min": 0.05724028870463371,
408
+ "rewards/margins_std": 0.043267086148262024,
409
+ "rewards/rejected": 0.010403157211840153,
410
+ "step": 180
411
+ },
412
+ {
413
+ "dpo_losses": 0.6270108222961426,
414
+ "epoch": 0.54,
415
+ "grad_norm": 2.5880477915128974,
416
+ "learning_rate": 4.90796519947347e-07,
417
+ "logits/chosen": -2.7683603763580322,
418
+ "logits/rejected": -2.600294589996338,
419
+ "logps/chosen": -441.5703125,
420
+ "logps/rejected": -221.9513702392578,
421
+ "loss": 0.6377,
422
+ "positive_losses": 0.0,
423
+ "rewards/accuracies": 1.0,
424
+ "rewards/chosen": 0.14541961252689362,
425
+ "rewards/margins": 0.13966473937034607,
426
+ "rewards/margins_max": 0.2020079791545868,
427
+ "rewards/margins_min": 0.07732154428958893,
428
+ "rewards/margins_std": 0.08816662430763245,
429
+ "rewards/rejected": 0.005754842888563871,
430
+ "step": 190
431
+ },
432
+ {
433
+ "dpo_losses": 0.6278845071792603,
434
+ "epoch": 0.56,
435
+ "grad_norm": 6.409126045529177,
436
+ "learning_rate": 4.88463452949359e-07,
437
+ "logits/chosen": -2.9043941497802734,
438
+ "logits/rejected": -2.8018863201141357,
439
+ "logps/chosen": -317.3318786621094,
440
+ "logps/rejected": -204.9638671875,
441
+ "loss": 0.6367,
442
+ "positive_losses": 0.0,
443
+ "rewards/accuracies": 1.0,
444
+ "rewards/chosen": 0.13463780283927917,
445
+ "rewards/margins": 0.13884270191192627,
446
+ "rewards/margins_max": 0.20423416793346405,
447
+ "rewards/margins_min": 0.07345118373632431,
448
+ "rewards/margins_std": 0.09247754514217377,
449
+ "rewards/rejected": -0.004204885568469763,
450
+ "step": 200
451
+ },
452
+ {
453
+ "epoch": 0.56,
454
+ "eval_dpo_losses": 0.6862879395484924,
455
+ "eval_logits/chosen": -2.8224480152130127,
456
+ "eval_logits/rejected": -2.777904510498047,
457
+ "eval_logps/chosen": -283.0695495605469,
458
+ "eval_logps/rejected": -258.4836120605469,
459
+ "eval_loss": 0.7632620930671692,
460
+ "eval_positive_losses": 0.6990463733673096,
461
+ "eval_rewards/accuracies": 0.5873016119003296,
462
+ "eval_rewards/chosen": 0.02151678316295147,
463
+ "eval_rewards/margins": 0.014526319690048695,
464
+ "eval_rewards/margins_max": 0.07610397040843964,
465
+ "eval_rewards/margins_min": -0.039124276489019394,
466
+ "eval_rewards/margins_std": 0.051066700369119644,
467
+ "eval_rewards/rejected": 0.006990462075918913,
468
+ "eval_runtime": 353.0866,
469
+ "eval_samples_per_second": 5.664,
470
+ "eval_steps_per_second": 0.178,
471
+ "step": 200
472
+ },
473
+ {
474
+ "dpo_losses": 0.6355398893356323,
475
+ "epoch": 0.59,
476
+ "grad_norm": 2.0528134694636475,
477
+ "learning_rate": 4.858739661052539e-07,
478
+ "logits/chosen": -2.6923115253448486,
479
+ "logits/rejected": -2.6051807403564453,
480
+ "logps/chosen": -343.03765869140625,
481
+ "logps/rejected": -267.228271484375,
482
+ "loss": 0.6285,
483
+ "positive_losses": 0.02857360802590847,
484
+ "rewards/accuracies": 0.8999999761581421,
485
+ "rewards/chosen": 0.1379883587360382,
486
+ "rewards/margins": 0.12217812240123749,
487
+ "rewards/margins_max": 0.2092159241437912,
488
+ "rewards/margins_min": 0.035140346735715866,
489
+ "rewards/margins_std": 0.12309001386165619,
490
+ "rewards/rejected": 0.01581023633480072,
491
+ "step": 210
492
+ },
493
+ {
494
+ "dpo_losses": 0.6001917719841003,
495
+ "epoch": 0.62,
496
+ "grad_norm": 2.0452189261623355,
497
+ "learning_rate": 4.830308438912687e-07,
498
+ "logits/chosen": -2.9291024208068848,
499
+ "logits/rejected": -2.7718849182128906,
500
+ "logps/chosen": -389.2041931152344,
501
+ "logps/rejected": -278.2353515625,
502
+ "loss": 0.6208,
503
+ "positive_losses": 0.0,
504
+ "rewards/accuracies": 1.0,
505
+ "rewards/chosen": 0.20065200328826904,
506
+ "rewards/margins": 0.2000540941953659,
507
+ "rewards/margins_max": 0.27747657895088196,
508
+ "rewards/margins_min": 0.12263162434101105,
509
+ "rewards/margins_std": 0.10949190706014633,
510
+ "rewards/rejected": 0.0005978975677862763,
511
+ "step": 220
512
+ },
513
+ {
514
+ "dpo_losses": 0.6307692527770996,
515
+ "epoch": 0.65,
516
+ "grad_norm": 2.2006814811498345,
517
+ "learning_rate": 4.799371435178545e-07,
518
+ "logits/chosen": -2.8787121772766113,
519
+ "logits/rejected": -2.7966089248657227,
520
+ "logps/chosen": -366.04168701171875,
521
+ "logps/rejected": -335.1198425292969,
522
+ "loss": 0.611,
523
+ "positive_losses": 0.0,
524
+ "rewards/accuracies": 1.0,
525
+ "rewards/chosen": 0.14716704189777374,
526
+ "rewards/margins": 0.13250373303890228,
527
+ "rewards/margins_max": 0.20927949249744415,
528
+ "rewards/margins_min": 0.05572795867919922,
529
+ "rewards/margins_std": 0.10857733339071274,
530
+ "rewards/rejected": 0.01466330885887146,
531
+ "step": 230
532
+ },
533
+ {
534
+ "dpo_losses": 0.6340683102607727,
535
+ "epoch": 0.68,
536
+ "grad_norm": 1.8268647373784295,
537
+ "learning_rate": 4.765961916422574e-07,
538
+ "logits/chosen": -2.884403944015503,
539
+ "logits/rejected": -2.725059986114502,
540
+ "logps/chosen": -327.7989196777344,
541
+ "logps/rejected": -292.8590087890625,
542
+ "loss": 0.6248,
543
+ "positive_losses": 0.0,
544
+ "rewards/accuracies": 0.8999999761581421,
545
+ "rewards/chosen": 0.13348861038684845,
546
+ "rewards/margins": 0.12396250665187836,
547
+ "rewards/margins_max": 0.17216315865516663,
548
+ "rewards/margins_min": 0.0757618397474289,
549
+ "rewards/margins_std": 0.06816603988409042,
550
+ "rewards/rejected": 0.009526104666292667,
551
+ "step": 240
552
+ },
553
+ {
554
+ "dpo_losses": 0.6013151407241821,
555
+ "epoch": 0.7,
556
+ "grad_norm": 1.9729455716823008,
557
+ "learning_rate": 4.730115807913626e-07,
558
+ "logits/chosen": -2.900844097137451,
559
+ "logits/rejected": -2.716480016708374,
560
+ "logps/chosen": -394.9379577636719,
561
+ "logps/rejected": -255.08218383789062,
562
+ "loss": 0.6072,
563
+ "positive_losses": 0.0,
564
+ "rewards/accuracies": 1.0,
565
+ "rewards/chosen": 0.20289082825183868,
566
+ "rewards/margins": 0.19841626286506653,
567
+ "rewards/margins_max": 0.2661207318305969,
568
+ "rewards/margins_min": 0.13071177899837494,
569
+ "rewards/margins_std": 0.09574858844280243,
570
+ "rewards/rejected": 0.004474560730159283,
571
+ "step": 250
572
+ },
573
+ {
574
+ "dpo_losses": 0.6106809973716736,
575
+ "epoch": 0.73,
576
+ "grad_norm": 5.819181150531001,
577
+ "learning_rate": 4.691871654986485e-07,
578
+ "logits/chosen": -2.8338541984558105,
579
+ "logits/rejected": -2.799828052520752,
580
+ "logps/chosen": -303.6441345214844,
581
+ "logps/rejected": -224.24734497070312,
582
+ "loss": 0.6119,
583
+ "positive_losses": 0.0,
584
+ "rewards/accuracies": 0.8999999761581421,
585
+ "rewards/chosen": 0.18178164958953857,
586
+ "rewards/margins": 0.18184307217597961,
587
+ "rewards/margins_max": 0.31147363781929016,
588
+ "rewards/margins_min": 0.05221250653266907,
589
+ "rewards/margins_std": 0.18332532048225403,
590
+ "rewards/rejected": -6.143822974991053e-05,
591
+ "step": 260
592
+ },
593
+ {
594
+ "dpo_losses": 0.5980932116508484,
595
+ "epoch": 0.76,
596
+ "grad_norm": 2.0922326810258784,
597
+ "learning_rate": 4.6512705815940536e-07,
598
+ "logits/chosen": -2.8425519466400146,
599
+ "logits/rejected": -2.702369451522827,
600
+ "logps/chosen": -409.5565490722656,
601
+ "logps/rejected": -233.59228515625,
602
+ "loss": 0.6087,
603
+ "positive_losses": 0.0,
604
+ "rewards/accuracies": 1.0,
605
+ "rewards/chosen": 0.19440071284770966,
606
+ "rewards/margins": 0.20493817329406738,
607
+ "rewards/margins_max": 0.2935718894004822,
608
+ "rewards/margins_min": 0.11630449444055557,
609
+ "rewards/margins_std": 0.12534697353839874,
610
+ "rewards/rejected": -0.010537461377680302,
611
+ "step": 270
612
+ },
613
+ {
614
+ "dpo_losses": 0.628231406211853,
615
+ "epoch": 0.79,
616
+ "grad_norm": 3.7503530971865384,
617
+ "learning_rate": 4.6083562460867544e-07,
618
+ "logits/chosen": -2.710557460784912,
619
+ "logits/rejected": -2.6718039512634277,
620
+ "logps/chosen": -314.34271240234375,
621
+ "logps/rejected": -264.9143981933594,
622
+ "loss": 0.608,
623
+ "positive_losses": 0.09001044929027557,
624
+ "rewards/accuracies": 0.949999988079071,
625
+ "rewards/chosen": 0.1473403126001358,
626
+ "rewards/margins": 0.13957121968269348,
627
+ "rewards/margins_max": 0.22123010456562042,
628
+ "rewards/margins_min": 0.05791233107447624,
629
+ "rewards/margins_std": 0.1154831051826477,
630
+ "rewards/rejected": 0.0077690863981842995,
631
+ "step": 280
632
+ },
633
+ {
634
+ "dpo_losses": 0.5966510772705078,
635
+ "epoch": 0.82,
636
+ "grad_norm": 2.147411420458553,
637
+ "learning_rate": 4.563174794266683e-07,
638
+ "logits/chosen": -2.9081060886383057,
639
+ "logits/rejected": -2.7455201148986816,
640
+ "logps/chosen": -339.8738708496094,
641
+ "logps/rejected": -270.620849609375,
642
+ "loss": 0.6098,
643
+ "positive_losses": 0.0,
644
+ "rewards/accuracies": 1.0,
645
+ "rewards/chosen": 0.21971745789051056,
646
+ "rewards/margins": 0.21189472079277039,
647
+ "rewards/margins_max": 0.33527523279190063,
648
+ "rewards/margins_min": 0.08851419389247894,
649
+ "rewards/margins_std": 0.17448639869689941,
650
+ "rewards/rejected": 0.00782275851815939,
651
+ "step": 290
652
+ },
653
+ {
654
+ "dpo_losses": 0.6003170013427734,
655
+ "epoch": 0.85,
656
+ "grad_norm": 6.556261857915724,
657
+ "learning_rate": 4.515774809767012e-07,
658
+ "logits/chosen": -2.829227924346924,
659
+ "logits/rejected": -2.7758102416992188,
660
+ "logps/chosen": -317.3044128417969,
661
+ "logps/rejected": -312.4596862792969,
662
+ "loss": 0.5913,
663
+ "positive_losses": 0.0,
664
+ "rewards/accuracies": 0.949999988079071,
665
+ "rewards/chosen": 0.20438948273658752,
666
+ "rewards/margins": 0.20183344185352325,
667
+ "rewards/margins_max": 0.29987072944641113,
668
+ "rewards/margins_min": 0.10379616171121597,
669
+ "rewards/margins_std": 0.13864566385746002,
670
+ "rewards/rejected": 0.0025560318026691675,
671
+ "step": 300
672
+ },
673
+ {
674
+ "epoch": 0.85,
675
+ "eval_dpo_losses": 0.6810439825057983,
676
+ "eval_logits/chosen": -2.7852697372436523,
677
+ "eval_logits/rejected": -2.7412352561950684,
678
+ "eval_logps/chosen": -283.9922180175781,
679
+ "eval_logps/rejected": -260.6202087402344,
680
+ "eval_loss": 0.919795036315918,
681
+ "eval_positive_losses": 2.2040839195251465,
682
+ "eval_rewards/accuracies": 0.5714285969734192,
683
+ "eval_rewards/chosen": 0.012289770878851414,
684
+ "eval_rewards/margins": 0.02666497975587845,
685
+ "eval_rewards/margins_max": 0.13577593863010406,
686
+ "eval_rewards/margins_min": -0.06833065301179886,
687
+ "eval_rewards/margins_std": 0.08987120538949966,
688
+ "eval_rewards/rejected": -0.01437520980834961,
689
+ "eval_runtime": 376.3495,
690
+ "eval_samples_per_second": 5.314,
691
+ "eval_steps_per_second": 0.167,
692
+ "step": 300
693
+ },
694
+ {
695
+ "dpo_losses": 0.5683969259262085,
696
+ "epoch": 0.87,
697
+ "grad_norm": 1.838064317269046,
698
+ "learning_rate": 4.4662072618099887e-07,
699
+ "logits/chosen": -2.98117995262146,
700
+ "logits/rejected": -2.7507948875427246,
701
+ "logps/chosen": -399.4131164550781,
702
+ "logps/rejected": -263.9547424316406,
703
+ "loss": 0.593,
704
+ "positive_losses": 0.0,
705
+ "rewards/accuracies": 0.949999988079071,
706
+ "rewards/chosen": 0.2483425885438919,
707
+ "rewards/margins": 0.2780510485172272,
708
+ "rewards/margins_max": 0.41263166069984436,
709
+ "rewards/margins_min": 0.14347048103809357,
710
+ "rewards/margins_std": 0.19032566249370575,
711
+ "rewards/rejected": -0.029708484187722206,
712
+ "step": 310
713
+ },
714
+ {
715
+ "dpo_losses": 0.5936921834945679,
716
+ "epoch": 0.9,
717
+ "grad_norm": 1.773863081765084,
718
+ "learning_rate": 4.414525450399712e-07,
719
+ "logits/chosen": -2.838667631149292,
720
+ "logits/rejected": -2.7466790676116943,
721
+ "logps/chosen": -326.25115966796875,
722
+ "logps/rejected": -239.88632202148438,
723
+ "loss": 0.5855,
724
+ "positive_losses": 0.0,
725
+ "rewards/accuracies": 0.949999988079071,
726
+ "rewards/chosen": 0.2000167816877365,
727
+ "rewards/margins": 0.21792948246002197,
728
+ "rewards/margins_max": 0.3168962597846985,
729
+ "rewards/margins_min": 0.11896270513534546,
730
+ "rewards/margins_std": 0.1399601548910141,
731
+ "rewards/rejected": -0.01791267842054367,
732
+ "step": 320
733
+ },
734
+ {
735
+ "dpo_losses": 0.5716263651847839,
736
+ "epoch": 0.93,
737
+ "grad_norm": 1.998569334452282,
738
+ "learning_rate": 4.360784949008615e-07,
739
+ "logits/chosen": -2.938047409057617,
740
+ "logits/rejected": -2.8133039474487305,
741
+ "logps/chosen": -355.54949951171875,
742
+ "logps/rejected": -251.0133056640625,
743
+ "loss": 0.5905,
744
+ "positive_losses": 0.12526169419288635,
745
+ "rewards/accuracies": 0.949999988079071,
746
+ "rewards/chosen": 0.24137118458747864,
747
+ "rewards/margins": 0.2771782875061035,
748
+ "rewards/margins_max": 0.4031391143798828,
749
+ "rewards/margins_min": 0.1512174755334854,
750
+ "rewards/margins_std": 0.17813549935817719,
751
+ "rewards/rejected": -0.03580709546804428,
752
+ "step": 330
753
+ },
754
+ {
755
+ "dpo_losses": 0.6020129919052124,
756
+ "epoch": 0.96,
757
+ "grad_norm": 1.9680439727569183,
758
+ "learning_rate": 4.305043544819289e-07,
759
+ "logits/chosen": -2.8385097980499268,
760
+ "logits/rejected": -2.6945478916168213,
761
+ "logps/chosen": -348.60601806640625,
762
+ "logps/rejected": -204.32705688476562,
763
+ "loss": 0.5827,
764
+ "positive_losses": 0.0,
765
+ "rewards/accuracies": 0.8999999761581421,
766
+ "rewards/chosen": 0.19889523088932037,
767
+ "rewards/margins": 0.19913819432258606,
768
+ "rewards/margins_max": 0.2865811586380005,
769
+ "rewards/margins_min": 0.11169523000717163,
770
+ "rewards/margins_std": 0.123663030564785,
771
+ "rewards/rejected": -0.00024295822368003428,
772
+ "step": 340
773
+ },
774
+ {
775
+ "dpo_losses": 0.5496091842651367,
776
+ "epoch": 0.99,
777
+ "grad_norm": 4.932583836163482,
778
+ "learning_rate": 4.247361176585903e-07,
779
+ "logits/chosen": -2.8020033836364746,
780
+ "logits/rejected": -2.7020020484924316,
781
+ "logps/chosen": -401.37969970703125,
782
+ "logps/rejected": -303.43603515625,
783
+ "loss": 0.5792,
784
+ "positive_losses": 0.0,
785
+ "rewards/accuracies": 1.0,
786
+ "rewards/chosen": 0.27023789286613464,
787
+ "rewards/margins": 0.32071441411972046,
788
+ "rewards/margins_max": 0.4361411929130554,
789
+ "rewards/margins_min": 0.2052876502275467,
790
+ "rewards/margins_std": 0.16323810815811157,
791
+ "rewards/rejected": -0.05047653242945671,
792
+ "step": 350
793
+ },
794
+ {
795
+ "dpo_losses": 0.5841382741928101,
796
+ "epoch": 1.01,
797
+ "grad_norm": 1.907764958469603,
798
+ "learning_rate": 4.187799870182038e-07,
799
+ "logits/chosen": -2.7761244773864746,
800
+ "logits/rejected": -2.671306610107422,
801
+ "logps/chosen": -325.5156555175781,
802
+ "logps/rejected": -205.7819061279297,
803
+ "loss": 0.5713,
804
+ "positive_losses": 0.0,
805
+ "rewards/accuracies": 0.949999988079071,
806
+ "rewards/chosen": 0.20690234005451202,
807
+ "rewards/margins": 0.2369709461927414,
808
+ "rewards/margins_max": 0.33839744329452515,
809
+ "rewards/margins_min": 0.13554444909095764,
810
+ "rewards/margins_std": 0.14343872666358948,
811
+ "rewards/rejected": -0.03006860613822937,
812
+ "step": 360
813
+ },
814
+ {
815
+ "dpo_losses": 0.571262776851654,
816
+ "epoch": 1.04,
817
+ "grad_norm": 7.1801497317882195,
818
+ "learning_rate": 4.126423671904236e-07,
819
+ "logits/chosen": -2.667179822921753,
820
+ "logits/rejected": -2.659519672393799,
821
+ "logps/chosen": -327.2443542480469,
822
+ "logps/rejected": -267.2635192871094,
823
+ "loss": 0.566,
824
+ "positive_losses": 0.06003761291503906,
825
+ "rewards/accuracies": 0.949999988079071,
826
+ "rewards/chosen": 0.2441311627626419,
827
+ "rewards/margins": 0.2789308428764343,
828
+ "rewards/margins_max": 0.43240299820899963,
829
+ "rewards/margins_min": 0.12545865774154663,
830
+ "rewards/margins_std": 0.217042475938797,
831
+ "rewards/rejected": -0.03479967638850212,
832
+ "step": 370
833
+ },
834
+ {
835
+ "dpo_losses": 0.5604450106620789,
836
+ "epoch": 1.07,
837
+ "grad_norm": 3.976395938884955,
838
+ "learning_rate": 4.0632985796030007e-07,
839
+ "logits/chosen": -2.775261640548706,
840
+ "logits/rejected": -2.5520777702331543,
841
+ "logps/chosen": -365.10052490234375,
842
+ "logps/rejected": -188.36569213867188,
843
+ "loss": 0.5666,
844
+ "positive_losses": 0.1160304993391037,
845
+ "rewards/accuracies": 0.949999988079071,
846
+ "rewards/chosen": 0.2588001787662506,
847
+ "rewards/margins": 0.29767391085624695,
848
+ "rewards/margins_max": 0.4550997316837311,
849
+ "rewards/margins_min": 0.1402481645345688,
850
+ "rewards/margins_std": 0.22263364493846893,
851
+ "rewards/rejected": -0.03887376934289932,
852
+ "step": 380
853
+ },
854
+ {
855
+ "dpo_losses": 0.5191501379013062,
856
+ "epoch": 1.1,
857
+ "grad_norm": 2.0256079149805704,
858
+ "learning_rate": 3.9984924717152713e-07,
859
+ "logits/chosen": -2.7934913635253906,
860
+ "logits/rejected": -2.7538201808929443,
861
+ "logps/chosen": -368.53741455078125,
862
+ "logps/rejected": -343.0877380371094,
863
+ "loss": 0.568,
864
+ "positive_losses": 0.0,
865
+ "rewards/accuracies": 0.949999988079071,
866
+ "rewards/chosen": 0.2897542417049408,
867
+ "rewards/margins": 0.4042133390903473,
868
+ "rewards/margins_max": 0.5862180590629578,
869
+ "rewards/margins_min": 0.22220861911773682,
870
+ "rewards/margins_std": 0.2573935389518738,
871
+ "rewards/rejected": -0.11445906013250351,
872
+ "step": 390
873
+ },
874
+ {
875
+ "dpo_losses": 0.5777878761291504,
876
+ "epoch": 1.13,
877
+ "grad_norm": 1.913623993136942,
878
+ "learning_rate": 3.932075034274723e-07,
879
+ "logits/chosen": -2.720954418182373,
880
+ "logits/rejected": -2.731393337249756,
881
+ "logps/chosen": -278.46453857421875,
882
+ "logps/rejected": -233.5457000732422,
883
+ "loss": 0.5502,
884
+ "positive_losses": 0.0,
885
+ "rewards/accuracies": 0.949999988079071,
886
+ "rewards/chosen": 0.2027483880519867,
887
+ "rewards/margins": 0.26584392786026,
888
+ "rewards/margins_max": 0.36782872676849365,
889
+ "rewards/margins_min": 0.16385909914970398,
890
+ "rewards/margins_std": 0.14422830939292908,
891
+ "rewards/rejected": -0.06309551745653152,
892
+ "step": 400
893
+ },
894
+ {
895
+ "epoch": 1.13,
896
+ "eval_dpo_losses": 0.6769555807113647,
897
+ "eval_logits/chosen": -2.7545130252838135,
898
+ "eval_logits/rejected": -2.7112648487091064,
899
+ "eval_logps/chosen": -285.1260681152344,
900
+ "eval_logps/rejected": -262.7898864746094,
901
+ "eval_loss": 1.0825976133346558,
902
+ "eval_positive_losses": 3.7846007347106934,
903
+ "eval_rewards/accuracies": 0.5753968358039856,
904
+ "eval_rewards/chosen": 0.0009513738332316279,
905
+ "eval_rewards/margins": 0.037023574113845825,
906
+ "eval_rewards/margins_max": 0.18605393171310425,
907
+ "eval_rewards/margins_min": -0.09629133343696594,
908
+ "eval_rewards/margins_std": 0.12429077178239822,
909
+ "eval_rewards/rejected": -0.036072202026844025,
910
+ "eval_runtime": 389.163,
911
+ "eval_samples_per_second": 5.139,
912
+ "eval_steps_per_second": 0.162,
913
+ "step": 400
914
+ },
915
+ {
916
+ "dpo_losses": 0.5712305307388306,
917
+ "epoch": 1.15,
918
+ "grad_norm": 2.3137171866935726,
919
+ "learning_rate": 3.8641176859783383e-07,
920
+ "logits/chosen": -2.801839590072632,
921
+ "logits/rejected": -2.7598752975463867,
922
+ "logps/chosen": -269.1628723144531,
923
+ "logps/rejected": -211.1710662841797,
924
+ "loss": 0.5608,
925
+ "positive_losses": 0.19508972764015198,
926
+ "rewards/accuracies": 0.8500000238418579,
927
+ "rewards/chosen": 0.2552756071090698,
928
+ "rewards/margins": 0.2850131392478943,
929
+ "rewards/margins_max": 0.5092954635620117,
930
+ "rewards/margins_min": 0.06073073670268059,
931
+ "rewards/margins_std": 0.31718316674232483,
932
+ "rewards/rejected": -0.029737496748566628,
933
+ "step": 410
934
+ },
935
+ {
936
+ "dpo_losses": 0.5262824296951294,
937
+ "epoch": 1.18,
938
+ "grad_norm": 2.2394985330776582,
939
+ "learning_rate": 3.7946935013898606e-07,
940
+ "logits/chosen": -2.8580517768859863,
941
+ "logits/rejected": -2.7411797046661377,
942
+ "logps/chosen": -366.37841796875,
943
+ "logps/rejected": -270.374755859375,
944
+ "loss": 0.5707,
945
+ "positive_losses": 0.0,
946
+ "rewards/accuracies": 1.0,
947
+ "rewards/chosen": 0.33375903964042664,
948
+ "rewards/margins": 0.3817841410636902,
949
+ "rewards/margins_max": 0.5622913241386414,
950
+ "rewards/margins_min": 0.20127694308757782,
951
+ "rewards/margins_std": 0.255275696516037,
952
+ "rewards/rejected": -0.048025064170360565,
953
+ "step": 420
954
+ },
955
+ {
956
+ "dpo_losses": 0.543947160243988,
957
+ "epoch": 1.21,
958
+ "grad_norm": 5.549999635713583,
959
+ "learning_rate": 3.7238771323626817e-07,
960
+ "logits/chosen": -2.782466173171997,
961
+ "logits/rejected": -2.675513744354248,
962
+ "logps/chosen": -357.595458984375,
963
+ "logps/rejected": -268.3279724121094,
964
+ "loss": 0.5395,
965
+ "positive_losses": 0.051211167126894,
966
+ "rewards/accuracies": 0.8999999761581421,
967
+ "rewards/chosen": 0.2949376702308655,
968
+ "rewards/margins": 0.3435710072517395,
969
+ "rewards/margins_max": 0.5000001192092896,
970
+ "rewards/margins_min": 0.18714189529418945,
971
+ "rewards/margins_std": 0.22122418880462646,
972
+ "rewards/rejected": -0.04863337427377701,
973
+ "step": 430
974
+ },
975
+ {
976
+ "dpo_losses": 0.5732239484786987,
977
+ "epoch": 1.24,
978
+ "grad_norm": 1.8704082850098978,
979
+ "learning_rate": 3.651744727766676e-07,
980
+ "logits/chosen": -2.7718758583068848,
981
+ "logits/rejected": -2.657120704650879,
982
+ "logps/chosen": -292.4672546386719,
983
+ "logps/rejected": -199.0224609375,
984
+ "loss": 0.5501,
985
+ "positive_losses": 0.02040863037109375,
986
+ "rewards/accuracies": 1.0,
987
+ "rewards/chosen": 0.22786328196525574,
988
+ "rewards/margins": 0.2696700394153595,
989
+ "rewards/margins_max": 0.39789730310440063,
990
+ "rewards/margins_min": 0.14144271612167358,
991
+ "rewards/margins_std": 0.1813407838344574,
992
+ "rewards/rejected": -0.04180673882365227,
993
+ "step": 440
994
+ },
995
+ {
996
+ "dpo_losses": 0.5517903566360474,
997
+ "epoch": 1.27,
998
+ "grad_norm": 1.988829111938975,
999
+ "learning_rate": 3.5783738516052897e-07,
1000
+ "logits/chosen": -2.73368501663208,
1001
+ "logits/rejected": -2.664750576019287,
1002
+ "logps/chosen": -314.8210144042969,
1003
+ "logps/rejected": -276.9835205078125,
1004
+ "loss": 0.555,
1005
+ "positive_losses": 0.0,
1006
+ "rewards/accuracies": 1.0,
1007
+ "rewards/chosen": 0.23353490233421326,
1008
+ "rewards/margins": 0.32702386379241943,
1009
+ "rewards/margins_max": 0.47540155053138733,
1010
+ "rewards/margins_min": 0.17864616215229034,
1011
+ "rewards/margins_std": 0.20983779430389404,
1012
+ "rewards/rejected": -0.09348895400762558,
1013
+ "step": 450
1014
+ },
1015
+ {
1016
+ "dpo_losses": 0.5432690382003784,
1017
+ "epoch": 1.3,
1018
+ "grad_norm": 6.009256851534543,
1019
+ "learning_rate": 3.5038433996109404e-07,
1020
+ "logits/chosen": -2.723193645477295,
1021
+ "logits/rejected": -2.7120513916015625,
1022
+ "logps/chosen": -370.0165710449219,
1023
+ "logps/rejected": -410.7750549316406,
1024
+ "loss": 0.5516,
1025
+ "positive_losses": 0.0,
1026
+ "rewards/accuracies": 1.0,
1027
+ "rewards/chosen": 0.25261393189430237,
1028
+ "rewards/margins": 0.33443617820739746,
1029
+ "rewards/margins_max": 0.4311138689517975,
1030
+ "rewards/margins_min": 0.23775847256183624,
1031
+ "rewards/margins_std": 0.1367228925228119,
1032
+ "rewards/rejected": -0.0818222239613533,
1033
+ "step": 460
1034
+ },
1035
+ {
1036
+ "dpo_losses": 0.5220767259597778,
1037
+ "epoch": 1.32,
1038
+ "grad_norm": 9.755904928259515,
1039
+ "learning_rate": 3.428233514408398e-07,
1040
+ "logits/chosen": -2.768638849258423,
1041
+ "logits/rejected": -2.6755402088165283,
1042
+ "logps/chosen": -328.5848388671875,
1043
+ "logps/rejected": -230.9825439453125,
1044
+ "loss": 0.538,
1045
+ "positive_losses": 0.0,
1046
+ "rewards/accuracies": 1.0,
1047
+ "rewards/chosen": 0.28665998578071594,
1048
+ "rewards/margins": 0.38968348503112793,
1049
+ "rewards/margins_max": 0.48602980375289917,
1050
+ "rewards/margins_min": 0.2933371067047119,
1051
+ "rewards/margins_std": 0.13625434041023254,
1052
+ "rewards/rejected": -0.10302351415157318,
1053
+ "step": 470
1054
+ },
1055
+ {
1056
+ "dpo_losses": 0.4982558786869049,
1057
+ "epoch": 1.35,
1058
+ "grad_norm": 1.8535108656666373,
1059
+ "learning_rate": 3.3516254993373945e-07,
1060
+ "logits/chosen": -2.855764150619507,
1061
+ "logits/rejected": -2.696035861968994,
1062
+ "logps/chosen": -384.2411193847656,
1063
+ "logps/rejected": -291.2297668457031,
1064
+ "loss": 0.542,
1065
+ "positive_losses": 0.18444347381591797,
1066
+ "rewards/accuracies": 0.949999988079071,
1067
+ "rewards/chosen": 0.3161051273345947,
1068
+ "rewards/margins": 0.46426716446876526,
1069
+ "rewards/margins_max": 0.6109346747398376,
1070
+ "rewards/margins_min": 0.3175995349884033,
1071
+ "rewards/margins_std": 0.2074192762374878,
1072
+ "rewards/rejected": -0.14816200733184814,
1073
+ "step": 480
1074
+ },
1075
+ {
1076
+ "dpo_losses": 0.5826688408851624,
1077
+ "epoch": 1.38,
1078
+ "grad_norm": 2.1891083390571136,
1079
+ "learning_rate": 3.274101731027105e-07,
1080
+ "logits/chosen": -2.7111635208129883,
1081
+ "logits/rejected": -2.5885300636291504,
1082
+ "logps/chosen": -197.87130737304688,
1083
+ "logps/rejected": -222.28286743164062,
1084
+ "loss": 0.5455,
1085
+ "positive_losses": 0.21304932236671448,
1086
+ "rewards/accuracies": 0.8999999761581421,
1087
+ "rewards/chosen": 0.19410201907157898,
1088
+ "rewards/margins": 0.2511723041534424,
1089
+ "rewards/margins_max": 0.4196406304836273,
1090
+ "rewards/margins_min": 0.08270399272441864,
1091
+ "rewards/margins_std": 0.23825016617774963,
1092
+ "rewards/rejected": -0.057070292532444,
1093
+ "step": 490
1094
+ },
1095
+ {
1096
+ "dpo_losses": 0.512485682964325,
1097
+ "epoch": 1.41,
1098
+ "grad_norm": 7.948297320675137,
1099
+ "learning_rate": 3.1957455708165314e-07,
1100
+ "logits/chosen": -2.642686367034912,
1101
+ "logits/rejected": -2.5784401893615723,
1102
+ "logps/chosen": -350.71331787109375,
1103
+ "logps/rejected": -246.94711303710938,
1104
+ "loss": 0.5398,
1105
+ "positive_losses": 0.1300731599330902,
1106
+ "rewards/accuracies": 1.0,
1107
+ "rewards/chosen": 0.33491355180740356,
1108
+ "rewards/margins": 0.42528876662254333,
1109
+ "rewards/margins_max": 0.566541850566864,
1110
+ "rewards/margins_min": 0.28403571248054504,
1111
+ "rewards/margins_std": 0.1997620314359665,
1112
+ "rewards/rejected": -0.09037523716688156,
1113
+ "step": 500
1114
+ },
1115
+ {
1116
+ "epoch": 1.41,
1117
+ "eval_dpo_losses": 0.6733708381652832,
1118
+ "eval_logits/chosen": -2.736778974533081,
1119
+ "eval_logits/rejected": -2.6934895515441895,
1120
+ "eval_logps/chosen": -284.954833984375,
1121
+ "eval_logps/rejected": -263.591796875,
1122
+ "eval_loss": 1.1571382284164429,
1123
+ "eval_positive_losses": 4.656679630279541,
1124
+ "eval_rewards/accuracies": 0.5833333134651184,
1125
+ "eval_rewards/chosen": 0.0026637099217623472,
1126
+ "eval_rewards/margins": 0.04675525426864624,
1127
+ "eval_rewards/margins_max": 0.23377186059951782,
1128
+ "eval_rewards/margins_min": -0.11662713438272476,
1129
+ "eval_rewards/margins_std": 0.15486779808998108,
1130
+ "eval_rewards/rejected": -0.04409153386950493,
1131
+ "eval_runtime": 348.5206,
1132
+ "eval_samples_per_second": 5.739,
1133
+ "eval_steps_per_second": 0.181,
1134
+ "step": 500
1135
+ },
1136
+ {
1137
+ "dpo_losses": 0.558113694190979,
1138
+ "epoch": 1.44,
1139
+ "grad_norm": 1.9204678229765098,
1140
+ "learning_rate": 3.116641275116018e-07,
1141
+ "logits/chosen": -2.4792568683624268,
1142
+ "logits/rejected": -2.477318286895752,
1143
+ "logps/chosen": -267.8980407714844,
1144
+ "logps/rejected": -321.2754821777344,
1145
+ "loss": 0.5399,
1146
+ "positive_losses": 0.0,
1147
+ "rewards/accuracies": 1.0,
1148
+ "rewards/chosen": 0.24782295525074005,
1149
+ "rewards/margins": 0.30803701281547546,
1150
+ "rewards/margins_max": 0.4859069287776947,
1151
+ "rewards/margins_min": 0.13016708195209503,
1152
+ "rewards/margins_std": 0.25154608488082886,
1153
+ "rewards/rejected": -0.06021404266357422,
1154
+ "step": 510
1155
+ },
1156
+ {
1157
+ "dpo_losses": 0.5207396149635315,
1158
+ "epoch": 1.46,
1159
+ "grad_norm": 1.8920245074404747,
1160
+ "learning_rate": 3.036873904806295e-07,
1161
+ "logits/chosen": -2.7549643516540527,
1162
+ "logits/rejected": -2.6787781715393066,
1163
+ "logps/chosen": -313.6898498535156,
1164
+ "logps/rejected": -246.35202026367188,
1165
+ "loss": 0.5479,
1166
+ "positive_losses": 0.4201123118400574,
1167
+ "rewards/accuracies": 0.949999988079071,
1168
+ "rewards/chosen": 0.2792617380619049,
1169
+ "rewards/margins": 0.4105965197086334,
1170
+ "rewards/margins_max": 0.6103520393371582,
1171
+ "rewards/margins_min": 0.21084094047546387,
1172
+ "rewards/margins_std": 0.28249698877334595,
1173
+ "rewards/rejected": -0.13133473694324493,
1174
+ "step": 520
1175
+ },
1176
+ {
1177
+ "dpo_losses": 0.5030455589294434,
1178
+ "epoch": 1.49,
1179
+ "grad_norm": 2.1161532532804963,
1180
+ "learning_rate": 2.956529233772492e-07,
1181
+ "logits/chosen": -2.721666097640991,
1182
+ "logits/rejected": -2.7209858894348145,
1183
+ "logps/chosen": -350.1697998046875,
1184
+ "logps/rejected": -284.701416015625,
1185
+ "loss": 0.5383,
1186
+ "positive_losses": 0.0,
1187
+ "rewards/accuracies": 1.0,
1188
+ "rewards/chosen": 0.2883579134941101,
1189
+ "rewards/margins": 0.4384874701499939,
1190
+ "rewards/margins_max": 0.5584603548049927,
1191
+ "rewards/margins_min": 0.31851455569267273,
1192
+ "rewards/margins_std": 0.16966724395751953,
1193
+ "rewards/rejected": -0.15012958645820618,
1194
+ "step": 530
1195
+ },
1196
+ {
1197
+ "dpo_losses": 0.4899216592311859,
1198
+ "epoch": 1.52,
1199
+ "grad_norm": 8.076099370517953,
1200
+ "learning_rate": 2.875693656671431e-07,
1201
+ "logits/chosen": -2.8387975692749023,
1202
+ "logits/rejected": -2.7006285190582275,
1203
+ "logps/chosen": -364.0811462402344,
1204
+ "logps/rejected": -250.73196411132812,
1205
+ "loss": 0.5309,
1206
+ "positive_losses": 0.0,
1207
+ "rewards/accuracies": 1.0,
1208
+ "rewards/chosen": 0.3547210097312927,
1209
+ "rewards/margins": 0.4873575270175934,
1210
+ "rewards/margins_max": 0.6359044313430786,
1211
+ "rewards/margins_min": 0.3388107419013977,
1212
+ "rewards/margins_std": 0.2100769281387329,
1213
+ "rewards/rejected": -0.13263657689094543,
1214
+ "step": 540
1215
+ },
1216
+ {
1217
+ "dpo_losses": 0.5343499183654785,
1218
+ "epoch": 1.55,
1219
+ "grad_norm": 2.5715523833788465,
1220
+ "learning_rate": 2.794454096031429e-07,
1221
+ "logits/chosen": -2.7994258403778076,
1222
+ "logits/rejected": -2.7424252033233643,
1223
+ "logps/chosen": -283.88751220703125,
1224
+ "logps/rejected": -277.2805480957031,
1225
+ "loss": 0.5163,
1226
+ "positive_losses": 0.1207679733633995,
1227
+ "rewards/accuracies": 1.0,
1228
+ "rewards/chosen": 0.2706112265586853,
1229
+ "rewards/margins": 0.3728974759578705,
1230
+ "rewards/margins_max": 0.5763125419616699,
1231
+ "rewards/margins_min": 0.16948243975639343,
1232
+ "rewards/margins_std": 0.2876723110675812,
1233
+ "rewards/rejected": -0.10228625684976578,
1234
+ "step": 550
1235
+ },
1236
+ {
1237
+ "dpo_losses": 0.5260319709777832,
1238
+ "epoch": 1.58,
1239
+ "grad_norm": 2.100863996516755,
1240
+ "learning_rate": 2.7128979087844593e-07,
1241
+ "logits/chosen": -2.7390799522399902,
1242
+ "logits/rejected": -2.6754238605499268,
1243
+ "logps/chosen": -291.78973388671875,
1244
+ "logps/rejected": -352.7436218261719,
1245
+ "loss": 0.5334,
1246
+ "positive_losses": 0.1672992706298828,
1247
+ "rewards/accuracies": 0.949999988079071,
1248
+ "rewards/chosen": 0.24893280863761902,
1249
+ "rewards/margins": 0.388366162776947,
1250
+ "rewards/margins_max": 0.5333099961280823,
1251
+ "rewards/margins_min": 0.24342235922813416,
1252
+ "rewards/margins_std": 0.2049814909696579,
1253
+ "rewards/rejected": -0.139433354139328,
1254
+ "step": 560
1255
+ },
1256
+ {
1257
+ "dpo_losses": 0.49240389466285706,
1258
+ "epoch": 1.61,
1259
+ "grad_norm": 8.972280412241831,
1260
+ "learning_rate": 2.6311127923312153e-07,
1261
+ "logits/chosen": -2.757660388946533,
1262
+ "logits/rejected": -2.627717971801758,
1263
+ "logps/chosen": -397.9381103515625,
1264
+ "logps/rejected": -329.2608947753906,
1265
+ "loss": 0.5165,
1266
+ "positive_losses": 0.0,
1267
+ "rewards/accuracies": 1.0,
1268
+ "rewards/chosen": 0.3426300883293152,
1269
+ "rewards/margins": 0.47666874527931213,
1270
+ "rewards/margins_max": 0.6256591081619263,
1271
+ "rewards/margins_min": 0.32767823338508606,
1272
+ "rewards/margins_std": 0.21070432662963867,
1273
+ "rewards/rejected": -0.13403865694999695,
1274
+ "step": 570
1275
+ },
1276
+ {
1277
+ "dpo_losses": 0.5157249569892883,
1278
+ "epoch": 1.63,
1279
+ "grad_norm": 1.8838498119092633,
1280
+ "learning_rate": 2.5491866902400565e-07,
1281
+ "logits/chosen": -2.7878096103668213,
1282
+ "logits/rejected": -2.70076060295105,
1283
+ "logps/chosen": -270.0621337890625,
1284
+ "logps/rejected": -235.17105102539062,
1285
+ "loss": 0.5244,
1286
+ "positive_losses": 0.14246292412281036,
1287
+ "rewards/accuracies": 1.0,
1288
+ "rewards/chosen": 0.28403371572494507,
1289
+ "rewards/margins": 0.4212573170661926,
1290
+ "rewards/margins_max": 0.6702337861061096,
1291
+ "rewards/margins_min": 0.17228081822395325,
1292
+ "rewards/margins_std": 0.35210588574409485,
1293
+ "rewards/rejected": -0.13722361624240875,
1294
+ "step": 580
1295
+ },
1296
+ {
1297
+ "dpo_losses": 0.524344801902771,
1298
+ "epoch": 1.66,
1299
+ "grad_norm": 7.342743240499319,
1300
+ "learning_rate": 2.4672076976812543e-07,
1301
+ "logits/chosen": -2.6465201377868652,
1302
+ "logits/rejected": -2.559436082839966,
1303
+ "logps/chosen": -339.4754943847656,
1304
+ "logps/rejected": -302.15557861328125,
1305
+ "loss": 0.5432,
1306
+ "positive_losses": 0.1482059508562088,
1307
+ "rewards/accuracies": 0.949999988079071,
1308
+ "rewards/chosen": 0.2978511452674866,
1309
+ "rewards/margins": 0.3972291350364685,
1310
+ "rewards/margins_max": 0.5903550982475281,
1311
+ "rewards/margins_min": 0.2041032314300537,
1312
+ "rewards/margins_std": 0.27312135696411133,
1313
+ "rewards/rejected": -0.09937803447246552,
1314
+ "step": 590
1315
+ },
1316
+ {
1317
+ "dpo_losses": 0.5494340658187866,
1318
+ "epoch": 1.69,
1319
+ "grad_norm": 8.369464468282295,
1320
+ "learning_rate": 2.385263966698222e-07,
1321
+ "logits/chosen": -2.7804884910583496,
1322
+ "logits/rejected": -2.7123289108276367,
1323
+ "logps/chosen": -253.48031616210938,
1324
+ "logps/rejected": -268.9811096191406,
1325
+ "loss": 0.5293,
1326
+ "positive_losses": 0.0,
1327
+ "rewards/accuracies": 1.0,
1328
+ "rewards/chosen": 0.2517229914665222,
1329
+ "rewards/margins": 0.3363966941833496,
1330
+ "rewards/margins_max": 0.48714661598205566,
1331
+ "rewards/margins_min": 0.18564683198928833,
1332
+ "rewards/margins_std": 0.21319253742694855,
1333
+ "rewards/rejected": -0.08467370271682739,
1334
+ "step": 600
1335
+ },
1336
+ {
1337
+ "epoch": 1.69,
1338
+ "eval_dpo_losses": 0.6702548265457153,
1339
+ "eval_logits/chosen": -2.7200992107391357,
1340
+ "eval_logits/rejected": -2.6766622066497803,
1341
+ "eval_logps/chosen": -285.0616149902344,
1342
+ "eval_logps/rejected": -264.5409851074219,
1343
+ "eval_loss": 1.2245166301727295,
1344
+ "eval_positive_losses": 5.373989582061768,
1345
+ "eval_rewards/accuracies": 0.591269850730896,
1346
+ "eval_rewards/chosen": 0.0015958804870024323,
1347
+ "eval_rewards/margins": 0.05517909303307533,
1348
+ "eval_rewards/margins_max": 0.2654685378074646,
1349
+ "eval_rewards/margins_min": -0.128387451171875,
1350
+ "eval_rewards/margins_std": 0.1752447932958603,
1351
+ "eval_rewards/rejected": -0.053583212196826935,
1352
+ "eval_runtime": 377.5968,
1353
+ "eval_samples_per_second": 5.297,
1354
+ "eval_steps_per_second": 0.167,
1355
+ "step": 600
1356
+ },
1357
+ {
1358
+ "dpo_losses": 0.5357104539871216,
1359
+ "epoch": 1.72,
1360
+ "grad_norm": 8.256557748215014,
1361
+ "learning_rate": 2.3034436114175838e-07,
1362
+ "logits/chosen": -2.6488845348358154,
1363
+ "logits/rejected": -2.538515567779541,
1364
+ "logps/chosen": -336.7628479003906,
1365
+ "logps/rejected": -277.02264404296875,
1366
+ "loss": 0.5238,
1367
+ "positive_losses": 0.0,
1368
+ "rewards/accuracies": 0.949999988079071,
1369
+ "rewards/chosen": 0.28726598620414734,
1370
+ "rewards/margins": 0.37427616119384766,
1371
+ "rewards/margins_max": 0.564933717250824,
1372
+ "rewards/margins_min": 0.18361851572990417,
1373
+ "rewards/margins_std": 0.2696306109428406,
1374
+ "rewards/rejected": -0.08701014518737793,
1375
+ "step": 610
1376
+ },
1377
+ {
1378
+ "dpo_losses": 0.5276176929473877,
1379
+ "epoch": 1.75,
1380
+ "grad_norm": 2.4842449610043764,
1381
+ "learning_rate": 2.2218346133000264e-07,
1382
+ "logits/chosen": -2.6663379669189453,
1383
+ "logits/rejected": -2.5575757026672363,
1384
+ "logps/chosen": -281.8042297363281,
1385
+ "logps/rejected": -206.41580200195312,
1386
+ "loss": 0.5162,
1387
+ "positive_losses": 0.15401744842529297,
1388
+ "rewards/accuracies": 0.949999988079071,
1389
+ "rewards/chosen": 0.2947912812232971,
1390
+ "rewards/margins": 0.38691216707229614,
1391
+ "rewards/margins_max": 0.5189955234527588,
1392
+ "rewards/margins_min": 0.2548287510871887,
1393
+ "rewards/margins_std": 0.18679411709308624,
1394
+ "rewards/rejected": -0.09212087094783783,
1395
+ "step": 620
1396
+ },
1397
+ {
1398
+ "dpo_losses": 0.5108110308647156,
1399
+ "epoch": 1.77,
1400
+ "grad_norm": 9.294890068434233,
1401
+ "learning_rate": 2.1405247265337917e-07,
1402
+ "logits/chosen": -2.6686511039733887,
1403
+ "logits/rejected": -2.5705299377441406,
1404
+ "logps/chosen": -360.513671875,
1405
+ "logps/rejected": -233.205322265625,
1406
+ "loss": 0.5195,
1407
+ "positive_losses": 0.0,
1408
+ "rewards/accuracies": 1.0,
1409
+ "rewards/chosen": 0.3424997925758362,
1410
+ "rewards/margins": 0.4331623613834381,
1411
+ "rewards/margins_max": 0.5366414785385132,
1412
+ "rewards/margins_min": 0.3296832740306854,
1413
+ "rewards/margins_std": 0.14634151756763458,
1414
+ "rewards/rejected": -0.09066257625818253,
1415
+ "step": 630
1416
+ },
1417
+ {
1418
+ "dpo_losses": 0.5333245396614075,
1419
+ "epoch": 1.8,
1420
+ "grad_norm": 1.9615557786341105,
1421
+ "learning_rate": 2.0596013836725657e-07,
1422
+ "logits/chosen": -2.793367862701416,
1423
+ "logits/rejected": -2.739849090576172,
1424
+ "logps/chosen": -275.52984619140625,
1425
+ "logps/rejected": -210.9861297607422,
1426
+ "loss": 0.542,
1427
+ "positive_losses": 0.4148605465888977,
1428
+ "rewards/accuracies": 1.0,
1429
+ "rewards/chosen": 0.26945605874061584,
1430
+ "rewards/margins": 0.38035932183265686,
1431
+ "rewards/margins_max": 0.5337552428245544,
1432
+ "rewards/margins_min": 0.2269633710384369,
1433
+ "rewards/margins_std": 0.21693463623523712,
1434
+ "rewards/rejected": -0.11090326309204102,
1435
+ "step": 640
1436
+ },
1437
+ {
1438
+ "dpo_losses": 0.5089551210403442,
1439
+ "epoch": 1.83,
1440
+ "grad_norm": 11.059235344334038,
1441
+ "learning_rate": 1.9791516016192213e-07,
1442
+ "logits/chosen": -2.844893217086792,
1443
+ "logits/rejected": -2.7059712409973145,
1444
+ "logps/chosen": -298.34588623046875,
1445
+ "logps/rejected": -231.402099609375,
1446
+ "loss": 0.5102,
1447
+ "positive_losses": 0.0,
1448
+ "rewards/accuracies": 0.949999988079071,
1449
+ "rewards/chosen": 0.3605387806892395,
1450
+ "rewards/margins": 0.446432501077652,
1451
+ "rewards/margins_max": 0.6981122493743896,
1452
+ "rewards/margins_min": 0.1947527527809143,
1453
+ "rewards/margins_std": 0.35592886805534363,
1454
+ "rewards/rejected": -0.0858936682343483,
1455
+ "step": 650
1456
+ },
1457
+ {
1458
+ "dpo_losses": 0.5299183130264282,
1459
+ "epoch": 1.86,
1460
+ "grad_norm": 16.447544756456054,
1461
+ "learning_rate": 1.8992618880565036e-07,
1462
+ "logits/chosen": -2.5619029998779297,
1463
+ "logits/rejected": -2.5244908332824707,
1464
+ "logps/chosen": -294.64801025390625,
1465
+ "logps/rejected": -199.9069061279297,
1466
+ "loss": 0.54,
1467
+ "positive_losses": 0.0,
1468
+ "rewards/accuracies": 0.949999988079071,
1469
+ "rewards/chosen": 0.293754518032074,
1470
+ "rewards/margins": 0.39233601093292236,
1471
+ "rewards/margins_max": 0.5898939371109009,
1472
+ "rewards/margins_min": 0.19477804005146027,
1473
+ "rewards/margins_std": 0.2793891131877899,
1474
+ "rewards/rejected": -0.0985814779996872,
1475
+ "step": 660
1476
+ },
1477
+ {
1478
+ "dpo_losses": 0.4770924150943756,
1479
+ "epoch": 1.89,
1480
+ "grad_norm": 14.950375677893724,
1481
+ "learning_rate": 1.8200181484252885e-07,
1482
+ "logits/chosen": -2.7151541709899902,
1483
+ "logits/rejected": -2.7282559871673584,
1484
+ "logps/chosen": -347.63665771484375,
1485
+ "logps/rejected": -335.4634094238281,
1486
+ "loss": 0.5076,
1487
+ "positive_losses": 0.011021423153579235,
1488
+ "rewards/accuracies": 1.0,
1489
+ "rewards/chosen": 0.3512026071548462,
1490
+ "rewards/margins": 0.532378077507019,
1491
+ "rewards/margins_max": 0.7549656629562378,
1492
+ "rewards/margins_min": 0.3097904622554779,
1493
+ "rewards/margins_std": 0.314786434173584,
1494
+ "rewards/rejected": -0.18117551505565643,
1495
+ "step": 670
1496
+ },
1497
+ {
1498
+ "dpo_losses": 0.48109301924705505,
1499
+ "epoch": 1.92,
1500
+ "grad_norm": 1.8077778304327392,
1501
+ "learning_rate": 1.7415055935504233e-07,
1502
+ "logits/chosen": -2.796618938446045,
1503
+ "logits/rejected": -2.6364896297454834,
1504
+ "logps/chosen": -357.8368835449219,
1505
+ "logps/rejected": -312.83050537109375,
1506
+ "loss": 0.5045,
1507
+ "positive_losses": 0.0,
1508
+ "rewards/accuracies": 1.0,
1509
+ "rewards/chosen": 0.3552249073982239,
1510
+ "rewards/margins": 0.5131222009658813,
1511
+ "rewards/margins_max": 0.6571098566055298,
1512
+ "rewards/margins_min": 0.3691345155239105,
1513
+ "rewards/margins_std": 0.2036292850971222,
1514
+ "rewards/rejected": -0.15789727866649628,
1515
+ "step": 680
1516
+ },
1517
+ {
1518
+ "dpo_losses": 0.5480272769927979,
1519
+ "epoch": 1.94,
1520
+ "grad_norm": 7.710262187848809,
1521
+ "learning_rate": 1.6638086480134952e-07,
1522
+ "logits/chosen": -2.6526730060577393,
1523
+ "logits/rejected": -2.5982117652893066,
1524
+ "logps/chosen": -211.858154296875,
1525
+ "logps/rejected": -139.07669067382812,
1526
+ "loss": 0.519,
1527
+ "positive_losses": 0.0,
1528
+ "rewards/accuracies": 0.8999999761581421,
1529
+ "rewards/chosen": 0.25107190012931824,
1530
+ "rewards/margins": 0.34679529070854187,
1531
+ "rewards/margins_max": 0.5567124485969543,
1532
+ "rewards/margins_min": 0.1368781328201294,
1533
+ "rewards/margins_std": 0.2968676686286926,
1534
+ "rewards/rejected": -0.09572339057922363,
1535
+ "step": 690
1536
+ },
1537
+ {
1538
+ "dpo_losses": 0.4729584753513336,
1539
+ "epoch": 1.97,
1540
+ "grad_norm": 13.004502323752208,
1541
+ "learning_rate": 1.5870108593710471e-07,
1542
+ "logits/chosen": -2.577094793319702,
1543
+ "logits/rejected": -2.474807024002075,
1544
+ "logps/chosen": -388.23529052734375,
1545
+ "logps/rejected": -229.945556640625,
1546
+ "loss": 0.5238,
1547
+ "positive_losses": 0.0,
1548
+ "rewards/accuracies": 0.949999988079071,
1549
+ "rewards/chosen": 0.3872971832752228,
1550
+ "rewards/margins": 0.5365989804267883,
1551
+ "rewards/margins_max": 0.6989376544952393,
1552
+ "rewards/margins_min": 0.3742601275444031,
1553
+ "rewards/margins_std": 0.22958168387413025,
1554
+ "rewards/rejected": -0.14930173754692078,
1555
+ "step": 700
1556
+ },
1557
+ {
1558
+ "epoch": 1.97,
1559
+ "eval_dpo_losses": 0.6682811975479126,
1560
+ "eval_logits/chosen": -2.7153635025024414,
1561
+ "eval_logits/rejected": -2.6725597381591797,
1562
+ "eval_logps/chosen": -287.1236877441406,
1563
+ "eval_logps/rejected": -267.1868896484375,
1564
+ "eval_loss": 1.3783056735992432,
1565
+ "eval_positive_losses": 6.938729763031006,
1566
+ "eval_rewards/accuracies": 0.60317462682724,
1567
+ "eval_rewards/chosen": -0.019024791195988655,
1568
+ "eval_rewards/margins": 0.061017535626888275,
1569
+ "eval_rewards/margins_max": 0.28910568356513977,
1570
+ "eval_rewards/margins_min": -0.14251713454723358,
1571
+ "eval_rewards/margins_std": 0.1922098994255066,
1572
+ "eval_rewards/rejected": -0.08004232496023178,
1573
+ "eval_runtime": 374.9449,
1574
+ "eval_samples_per_second": 5.334,
1575
+ "eval_steps_per_second": 0.168,
1576
+ "step": 700
1577
+ },
1578
+ {
1579
+ "dpo_losses": 0.5287314057350159,
1580
+ "epoch": 2.0,
1581
+ "grad_norm": 2.5225442066988153,
1582
+ "learning_rate": 1.5111948083158528e-07,
1583
+ "logits/chosen": -2.645501136779785,
1584
+ "logits/rejected": -2.5416979789733887,
1585
+ "logps/chosen": -277.56146240234375,
1586
+ "logps/rejected": -194.89093017578125,
1587
+ "loss": 0.509,
1588
+ "positive_losses": 0.014240646734833717,
1589
+ "rewards/accuracies": 1.0,
1590
+ "rewards/chosen": 0.2871246635913849,
1591
+ "rewards/margins": 0.39854830503463745,
1592
+ "rewards/margins_max": 0.6037707328796387,
1593
+ "rewards/margins_min": 0.19332581758499146,
1594
+ "rewards/margins_std": 0.2902284264564514,
1595
+ "rewards/rejected": -0.11142361164093018,
1596
+ "step": 710
1597
+ },
1598
+ {
1599
+ "dpo_losses": 0.4969090521335602,
1600
+ "epoch": 2.03,
1601
+ "grad_norm": 6.7685343277404435,
1602
+ "learning_rate": 1.4364420198778658e-07,
1603
+ "logits/chosen": -2.711702823638916,
1604
+ "logits/rejected": -2.667511224746704,
1605
+ "logps/chosen": -318.25323486328125,
1606
+ "logps/rejected": -349.77325439453125,
1607
+ "loss": 0.4995,
1608
+ "positive_losses": 0.2103443145751953,
1609
+ "rewards/accuracies": 1.0,
1610
+ "rewards/chosen": 0.29361721873283386,
1611
+ "rewards/margins": 0.48742538690567017,
1612
+ "rewards/margins_max": 0.7825571298599243,
1613
+ "rewards/margins_min": 0.19229364395141602,
1614
+ "rewards/margins_std": 0.417379230260849,
1615
+ "rewards/rejected": -0.1938081681728363,
1616
+ "step": 720
1617
+ },
1618
+ {
1619
+ "dpo_losses": 0.46872028708457947,
1620
+ "epoch": 2.06,
1621
+ "grad_norm": 1.9152175221021666,
1622
+ "learning_rate": 1.3628328757603242e-07,
1623
+ "logits/chosen": -2.715259552001953,
1624
+ "logits/rejected": -2.639958143234253,
1625
+ "logps/chosen": -387.1348571777344,
1626
+ "logps/rejected": -270.98504638671875,
1627
+ "loss": 0.5078,
1628
+ "positive_losses": 0.07048721611499786,
1629
+ "rewards/accuracies": 0.949999988079071,
1630
+ "rewards/chosen": 0.3561549186706543,
1631
+ "rewards/margins": 0.5383674502372742,
1632
+ "rewards/margins_max": 0.6862602829933167,
1633
+ "rewards/margins_min": 0.39047467708587646,
1634
+ "rewards/margins_std": 0.20915205776691437,
1635
+ "rewards/rejected": -0.18221257627010345,
1636
+ "step": 730
1637
+ },
1638
+ {
1639
+ "dpo_losses": 0.48394322395324707,
1640
+ "epoch": 2.08,
1641
+ "grad_norm": 1.9934270854366338,
1642
+ "learning_rate": 1.2904465279052723e-07,
1643
+ "logits/chosen": -2.743479013442993,
1644
+ "logits/rejected": -2.6603636741638184,
1645
+ "logps/chosen": -314.8957824707031,
1646
+ "logps/rejected": -239.50637817382812,
1647
+ "loss": 0.5064,
1648
+ "positive_losses": 0.24646854400634766,
1649
+ "rewards/accuracies": 0.949999988079071,
1650
+ "rewards/chosen": 0.37057873606681824,
1651
+ "rewards/margins": 0.5130593776702881,
1652
+ "rewards/margins_max": 0.7771711945533752,
1653
+ "rewards/margins_min": 0.2489476501941681,
1654
+ "rewards/margins_std": 0.373510479927063,
1655
+ "rewards/rejected": -0.14248065650463104,
1656
+ "step": 740
1657
+ },
1658
+ {
1659
+ "dpo_losses": 0.5582190752029419,
1660
+ "epoch": 2.11,
1661
+ "grad_norm": 9.439233885890266,
1662
+ "learning_rate": 1.219360813381446e-07,
1663
+ "logits/chosen": -2.647359848022461,
1664
+ "logits/rejected": -2.597470760345459,
1665
+ "logps/chosen": -167.36611938476562,
1666
+ "logps/rejected": -147.13504028320312,
1667
+ "loss": 0.4945,
1668
+ "positive_losses": 0.07765503227710724,
1669
+ "rewards/accuracies": 0.949999988079071,
1670
+ "rewards/chosen": 0.21209602057933807,
1671
+ "rewards/margins": 0.31703463196754456,
1672
+ "rewards/margins_max": 0.4738802909851074,
1673
+ "rewards/margins_min": 0.1601889729499817,
1674
+ "rewards/margins_std": 0.22181324660778046,
1675
+ "rewards/rejected": -0.10493861138820648,
1676
+ "step": 750
1677
+ },
1678
+ {
1679
+ "dpo_losses": 0.4905478060245514,
1680
+ "epoch": 2.14,
1681
+ "grad_norm": 11.006048100367254,
1682
+ "learning_rate": 1.149652170686039e-07,
1683
+ "logits/chosen": -2.7402448654174805,
1684
+ "logits/rejected": -2.6278908252716064,
1685
+ "logps/chosen": -300.5604553222656,
1686
+ "logps/rejected": -279.05877685546875,
1687
+ "loss": 0.5065,
1688
+ "positive_losses": 0.7673536539077759,
1689
+ "rewards/accuracies": 0.949999988079071,
1690
+ "rewards/chosen": 0.3250073492527008,
1691
+ "rewards/margins": 0.48759451508522034,
1692
+ "rewards/margins_max": 0.6997831463813782,
1693
+ "rewards/margins_min": 0.2754059433937073,
1694
+ "rewards/margins_std": 0.30007994174957275,
1695
+ "rewards/rejected": -0.1625872105360031,
1696
+ "step": 760
1697
+ },
1698
+ {
1699
+ "dpo_losses": 0.4878745675086975,
1700
+ "epoch": 2.17,
1701
+ "grad_norm": 2.041360663493708,
1702
+ "learning_rate": 1.0813955575503587e-07,
1703
+ "logits/chosen": -2.6781890392303467,
1704
+ "logits/rejected": -2.673537254333496,
1705
+ "logps/chosen": -303.26922607421875,
1706
+ "logps/rejected": -278.1116638183594,
1707
+ "loss": 0.4955,
1708
+ "positive_losses": 0.0,
1709
+ "rewards/accuracies": 1.0,
1710
+ "rewards/chosen": 0.303671658039093,
1711
+ "rewards/margins": 0.5081648230552673,
1712
+ "rewards/margins_max": 0.7443748712539673,
1713
+ "rewards/margins_min": 0.27195480465888977,
1714
+ "rewards/margins_std": 0.3340514898300171,
1715
+ "rewards/rejected": -0.2044931948184967,
1716
+ "step": 770
1717
+ },
1718
+ {
1719
+ "dpo_losses": 0.5306052565574646,
1720
+ "epoch": 2.2,
1721
+ "grad_norm": 12.846225026696498,
1722
+ "learning_rate": 1.0146643703377486e-07,
1723
+ "logits/chosen": -2.753620147705078,
1724
+ "logits/rejected": -2.623445749282837,
1725
+ "logps/chosen": -295.22698974609375,
1726
+ "logps/rejected": -235.02810668945312,
1727
+ "loss": 0.5182,
1728
+ "positive_losses": 0.0,
1729
+ "rewards/accuracies": 0.949999988079071,
1730
+ "rewards/chosen": 0.31814926862716675,
1731
+ "rewards/margins": 0.3883030414581299,
1732
+ "rewards/margins_max": 0.6206892728805542,
1733
+ "rewards/margins_min": 0.15591678023338318,
1734
+ "rewards/margins_std": 0.3286438286304474,
1735
+ "rewards/rejected": -0.07015376538038254,
1736
+ "step": 780
1737
+ },
1738
+ {
1739
+ "dpo_losses": 0.4863054156303406,
1740
+ "epoch": 2.23,
1741
+ "grad_norm": 3.0491196409319192,
1742
+ "learning_rate": 9.495303651204494e-08,
1743
+ "logits/chosen": -2.697542667388916,
1744
+ "logits/rejected": -2.644935131072998,
1745
+ "logps/chosen": -334.7068786621094,
1746
+ "logps/rejected": -294.2601318359375,
1747
+ "loss": 0.4865,
1748
+ "positive_losses": 0.0,
1749
+ "rewards/accuracies": 1.0,
1750
+ "rewards/chosen": 0.3127191960811615,
1751
+ "rewards/margins": 0.4877268671989441,
1752
+ "rewards/margins_max": 0.660971999168396,
1753
+ "rewards/margins_min": 0.3144817650318146,
1754
+ "rewards/margins_std": 0.24500557780265808,
1755
+ "rewards/rejected": -0.17500770092010498,
1756
+ "step": 790
1757
+ },
1758
+ {
1759
+ "dpo_losses": 0.4887468218803406,
1760
+ "epoch": 2.25,
1761
+ "grad_norm": 10.205242036662113,
1762
+ "learning_rate": 8.860635805202615e-08,
1763
+ "logits/chosen": -2.723806381225586,
1764
+ "logits/rejected": -2.6369194984436035,
1765
+ "logps/chosen": -329.61749267578125,
1766
+ "logps/rejected": -261.149169921875,
1767
+ "loss": 0.488,
1768
+ "positive_losses": 0.0,
1769
+ "rewards/accuracies": 1.0,
1770
+ "rewards/chosen": 0.33780962228775024,
1771
+ "rewards/margins": 0.49595513939857483,
1772
+ "rewards/margins_max": 0.7063379287719727,
1773
+ "rewards/margins_min": 0.2855724096298218,
1774
+ "rewards/margins_std": 0.29752615094184875,
1775
+ "rewards/rejected": -0.15814556181430817,
1776
+ "step": 800
1777
+ },
1778
+ {
1779
+ "epoch": 2.25,
1780
+ "eval_dpo_losses": 0.6669806838035583,
1781
+ "eval_logits/chosen": -2.707551956176758,
1782
+ "eval_logits/rejected": -2.664377450942993,
1783
+ "eval_logps/chosen": -288.50439453125,
1784
+ "eval_logps/rejected": -268.966552734375,
1785
+ "eval_loss": 1.4895577430725098,
1786
+ "eval_positive_losses": 8.096439361572266,
1787
+ "eval_rewards/accuracies": 0.6111111044883728,
1788
+ "eval_rewards/chosen": -0.03283155709505081,
1789
+ "eval_rewards/margins": 0.06500754505395889,
1790
+ "eval_rewards/margins_max": 0.30630454421043396,
1791
+ "eval_rewards/margins_min": -0.15107154846191406,
1792
+ "eval_rewards/margins_std": 0.20374149084091187,
1793
+ "eval_rewards/rejected": -0.09783907979726791,
1794
+ "eval_runtime": 342.0756,
1795
+ "eval_samples_per_second": 5.847,
1796
+ "eval_steps_per_second": 0.184,
1797
+ "step": 800
1798
+ },
1799
+ {
1800
+ "dpo_losses": 0.5151875019073486,
1801
+ "epoch": 2.28,
1802
+ "grad_norm": 8.219325520486382,
1803
+ "learning_rate": 8.24332262395994e-08,
1804
+ "logits/chosen": -2.7791354656219482,
1805
+ "logits/rejected": -2.720329999923706,
1806
+ "logps/chosen": -268.9266052246094,
1807
+ "logps/rejected": -261.6560363769531,
1808
+ "loss": 0.4995,
1809
+ "positive_losses": 0.07818031311035156,
1810
+ "rewards/accuracies": 1.0,
1811
+ "rewards/chosen": 0.25640755891799927,
1812
+ "rewards/margins": 0.4340863823890686,
1813
+ "rewards/margins_max": 0.7111722230911255,
1814
+ "rewards/margins_min": 0.15700046718120575,
1815
+ "rewards/margins_std": 0.39185863733291626,
1816
+ "rewards/rejected": -0.17767879366874695,
1817
+ "step": 810
1818
+ },
1819
+ {
1820
+ "dpo_losses": 0.5199899673461914,
1821
+ "epoch": 2.31,
1822
+ "grad_norm": 11.3286779229284,
1823
+ "learning_rate": 7.644027904586586e-08,
1824
+ "logits/chosen": -2.7511487007141113,
1825
+ "logits/rejected": -2.6695072650909424,
1826
+ "logps/chosen": -276.63726806640625,
1827
+ "logps/rejected": -221.2215118408203,
1828
+ "loss": 0.5014,
1829
+ "positive_losses": 0.24863624572753906,
1830
+ "rewards/accuracies": 0.949999988079071,
1831
+ "rewards/chosen": 0.2888849079608917,
1832
+ "rewards/margins": 0.42505329847335815,
1833
+ "rewards/margins_max": 0.6548727750778198,
1834
+ "rewards/margins_min": 0.1952337622642517,
1835
+ "rewards/margins_std": 0.3250138759613037,
1836
+ "rewards/rejected": -0.13616837561130524,
1837
+ "step": 820
1838
+ },
1839
+ {
1840
+ "dpo_losses": 0.4646497666835785,
1841
+ "epoch": 2.34,
1842
+ "grad_norm": 8.714610877659624,
1843
+ "learning_rate": 7.063396068933469e-08,
1844
+ "logits/chosen": -2.73659086227417,
1845
+ "logits/rejected": -2.634260654449463,
1846
+ "logps/chosen": -415.6689453125,
1847
+ "logps/rejected": -272.5089111328125,
1848
+ "loss": 0.4872,
1849
+ "positive_losses": 0.20364531874656677,
1850
+ "rewards/accuracies": 1.0,
1851
+ "rewards/chosen": 0.38227415084838867,
1852
+ "rewards/margins": 0.5651294589042664,
1853
+ "rewards/margins_max": 0.7014433145523071,
1854
+ "rewards/margins_min": 0.4288156032562256,
1855
+ "rewards/margins_std": 0.19277691841125488,
1856
+ "rewards/rejected": -0.1828552931547165,
1857
+ "step": 830
1858
+ },
1859
+ {
1860
+ "dpo_losses": 0.49954962730407715,
1861
+ "epoch": 2.37,
1862
+ "grad_norm": 6.5716343841546685,
1863
+ "learning_rate": 6.502051470645148e-08,
1864
+ "logits/chosen": -2.883086919784546,
1865
+ "logits/rejected": -2.6989049911499023,
1866
+ "logps/chosen": -359.88653564453125,
1867
+ "logps/rejected": -300.28302001953125,
1868
+ "loss": 0.5152,
1869
+ "positive_losses": 0.010610580444335938,
1870
+ "rewards/accuracies": 0.8999999761581421,
1871
+ "rewards/chosen": 0.3157649636268616,
1872
+ "rewards/margins": 0.4664032459259033,
1873
+ "rewards/margins_max": 0.6633520126342773,
1874
+ "rewards/margins_min": 0.2694544792175293,
1875
+ "rewards/margins_std": 0.2785276472568512,
1876
+ "rewards/rejected": -0.15063826739788055,
1877
+ "step": 840
1878
+ },
1879
+ {
1880
+ "dpo_losses": 0.4566032290458679,
1881
+ "epoch": 2.39,
1882
+ "grad_norm": 8.455170840404687,
1883
+ "learning_rate": 5.960597723792194e-08,
1884
+ "logits/chosen": -2.742910623550415,
1885
+ "logits/rejected": -2.619158983230591,
1886
+ "logps/chosen": -344.3529052734375,
1887
+ "logps/rejected": -276.9927673339844,
1888
+ "loss": 0.4916,
1889
+ "positive_losses": 0.32693710923194885,
1890
+ "rewards/accuracies": 1.0,
1891
+ "rewards/chosen": 0.40712490677833557,
1892
+ "rewards/margins": 0.5964423418045044,
1893
+ "rewards/margins_max": 0.88237464427948,
1894
+ "rewards/margins_min": 0.31051012873649597,
1895
+ "rewards/margins_std": 0.4043692648410797,
1896
+ "rewards/rejected": -0.1893174648284912,
1897
+ "step": 850
1898
+ },
1899
+ {
1900
+ "dpo_losses": 0.4746529459953308,
1901
+ "epoch": 2.42,
1902
+ "grad_norm": 11.331305453316807,
1903
+ "learning_rate": 5.4396170538046486e-08,
1904
+ "logits/chosen": -2.788321018218994,
1905
+ "logits/rejected": -2.7099854946136475,
1906
+ "logps/chosen": -321.00872802734375,
1907
+ "logps/rejected": -288.5102844238281,
1908
+ "loss": 0.4835,
1909
+ "positive_losses": 0.03827323764562607,
1910
+ "rewards/accuracies": 0.949999988079071,
1911
+ "rewards/chosen": 0.34411686658859253,
1912
+ "rewards/margins": 0.5461706519126892,
1913
+ "rewards/margins_max": 0.8038791418075562,
1914
+ "rewards/margins_min": 0.28846222162246704,
1915
+ "rewards/margins_std": 0.36445480585098267,
1916
+ "rewards/rejected": -0.20205385982990265,
1917
+ "step": 860
1918
+ },
1919
+ {
1920
+ "dpo_losses": 0.515887439250946,
1921
+ "epoch": 2.45,
1922
+ "grad_norm": 2.8323654503779396,
1923
+ "learning_rate": 4.93966967140487e-08,
1924
+ "logits/chosen": -2.714071750640869,
1925
+ "logits/rejected": -2.624173641204834,
1926
+ "logps/chosen": -314.1236267089844,
1927
+ "logps/rejected": -330.68817138671875,
1928
+ "loss": 0.4782,
1929
+ "positive_losses": 0.0,
1930
+ "rewards/accuracies": 1.0,
1931
+ "rewards/chosen": 0.2643023431301117,
1932
+ "rewards/margins": 0.4125981330871582,
1933
+ "rewards/margins_max": 0.6127610802650452,
1934
+ "rewards/margins_min": 0.21243515610694885,
1935
+ "rewards/margins_std": 0.28307315707206726,
1936
+ "rewards/rejected": -0.14829573035240173,
1937
+ "step": 870
1938
+ },
1939
+ {
1940
+ "dpo_losses": 0.48759278655052185,
1941
+ "epoch": 2.48,
1942
+ "grad_norm": 2.371240249959621,
1943
+ "learning_rate": 4.4612931702126433e-08,
1944
+ "logits/chosen": -2.7775819301605225,
1945
+ "logits/rejected": -2.6954903602600098,
1946
+ "logps/chosen": -286.12994384765625,
1947
+ "logps/rejected": -263.48614501953125,
1948
+ "loss": 0.4882,
1949
+ "positive_losses": 0.3580247759819031,
1950
+ "rewards/accuracies": 0.949999988079071,
1951
+ "rewards/chosen": 0.31085458397865295,
1952
+ "rewards/margins": 0.4919183850288391,
1953
+ "rewards/margins_max": 0.6966783404350281,
1954
+ "rewards/margins_min": 0.2871583104133606,
1955
+ "rewards/margins_std": 0.28957444429397583,
1956
+ "rewards/rejected": -0.181063711643219,
1957
+ "step": 880
1958
+ },
1959
+ {
1960
+ "dpo_losses": 0.4207460284233093,
1961
+ "epoch": 2.51,
1962
+ "grad_norm": 15.410592912064324,
1963
+ "learning_rate": 4.005001948670605e-08,
1964
+ "logits/chosen": -2.787081003189087,
1965
+ "logits/rejected": -2.7058584690093994,
1966
+ "logps/chosen": -433.3082580566406,
1967
+ "logps/rejected": -349.69342041015625,
1968
+ "loss": 0.4854,
1969
+ "positive_losses": 0.38000088930130005,
1970
+ "rewards/accuracies": 1.0,
1971
+ "rewards/chosen": 0.43636050820350647,
1972
+ "rewards/margins": 0.6726460456848145,
1973
+ "rewards/margins_max": 0.818884015083313,
1974
+ "rewards/margins_min": 0.5264080762863159,
1975
+ "rewards/margins_std": 0.20681175589561462,
1976
+ "rewards/rejected": -0.23628559708595276,
1977
+ "step": 890
1978
+ },
1979
+ {
1980
+ "dpo_losses": 0.44164472818374634,
1981
+ "epoch": 2.54,
1982
+ "grad_norm": 9.649985247190592,
1983
+ "learning_rate": 3.571286656911376e-08,
1984
+ "logits/chosen": -2.757315158843994,
1985
+ "logits/rejected": -2.582063674926758,
1986
+ "logps/chosen": -362.41473388671875,
1987
+ "logps/rejected": -289.0292663574219,
1988
+ "loss": 0.5027,
1989
+ "positive_losses": 0.0,
1990
+ "rewards/accuracies": 1.0,
1991
+ "rewards/chosen": 0.3792465031147003,
1992
+ "rewards/margins": 0.6161192655563354,
1993
+ "rewards/margins_max": 0.8344296216964722,
1994
+ "rewards/margins_min": 0.3978089690208435,
1995
+ "rewards/margins_std": 0.30873745679855347,
1996
+ "rewards/rejected": -0.2368728220462799,
1997
+ "step": 900
1998
+ },
1999
+ {
2000
+ "epoch": 2.54,
2001
+ "eval_dpo_losses": 0.6661122441291809,
2002
+ "eval_logits/chosen": -2.7059361934661865,
2003
+ "eval_logits/rejected": -2.662884473800659,
2004
+ "eval_logps/chosen": -289.38092041015625,
2005
+ "eval_logps/rejected": -270.0925598144531,
2006
+ "eval_loss": 1.5575090646743774,
2007
+ "eval_positive_losses": 8.782768249511719,
2008
+ "eval_rewards/accuracies": 0.6190476417541504,
2009
+ "eval_rewards/chosen": -0.04159707948565483,
2010
+ "eval_rewards/margins": 0.06750191748142242,
2011
+ "eval_rewards/margins_max": 0.31507408618927,
2012
+ "eval_rewards/margins_min": -0.1562977135181427,
2013
+ "eval_rewards/margins_std": 0.2099103033542633,
2014
+ "eval_rewards/rejected": -0.10909900069236755,
2015
+ "eval_runtime": 373.4993,
2016
+ "eval_samples_per_second": 5.355,
2017
+ "eval_steps_per_second": 0.169,
2018
+ "step": 900
2019
+ },
2020
+ {
2021
+ "dpo_losses": 0.45572906732559204,
2022
+ "epoch": 2.56,
2023
+ "grad_norm": 2.7671497689189013,
2024
+ "learning_rate": 3.160613669161255e-08,
2025
+ "logits/chosen": -2.8252711296081543,
2026
+ "logits/rejected": -2.692148208618164,
2027
+ "logps/chosen": -382.2273254394531,
2028
+ "logps/rejected": -260.42022705078125,
2029
+ "loss": 0.4747,
2030
+ "positive_losses": 0.24142150580883026,
2031
+ "rewards/accuracies": 1.0,
2032
+ "rewards/chosen": 0.36954694986343384,
2033
+ "rewards/margins": 0.5896551012992859,
2034
+ "rewards/margins_max": 0.8211862444877625,
2035
+ "rewards/margins_min": 0.35812389850616455,
2036
+ "rewards/margins_std": 0.32743456959724426,
2037
+ "rewards/rejected": -0.22010818123817444,
2038
+ "step": 910
2039
+ },
2040
+ {
2041
+ "dpo_losses": 0.4458266794681549,
2042
+ "epoch": 2.59,
2043
+ "grad_norm": 2.5946275076947987,
2044
+ "learning_rate": 2.7734245822478436e-08,
2045
+ "logits/chosen": -2.7092032432556152,
2046
+ "logits/rejected": -2.5538458824157715,
2047
+ "logps/chosen": -320.76727294921875,
2048
+ "logps/rejected": -214.5659942626953,
2049
+ "loss": 0.4918,
2050
+ "positive_losses": 0.05587196350097656,
2051
+ "rewards/accuracies": 0.949999988079071,
2052
+ "rewards/chosen": 0.4010286331176758,
2053
+ "rewards/margins": 0.6215327382087708,
2054
+ "rewards/margins_max": 0.8581794500350952,
2055
+ "rewards/margins_min": 0.3848857879638672,
2056
+ "rewards/margins_std": 0.3346691429615021,
2057
+ "rewards/rejected": -0.2205040454864502,
2058
+ "step": 920
2059
+ },
2060
+ {
2061
+ "dpo_losses": 0.48411306738853455,
2062
+ "epoch": 2.62,
2063
+ "grad_norm": 9.968121643352859,
2064
+ "learning_rate": 2.410135740750821e-08,
2065
+ "logits/chosen": -2.7608590126037598,
2066
+ "logits/rejected": -2.713438034057617,
2067
+ "logps/chosen": -317.3198547363281,
2068
+ "logps/rejected": -291.6824645996094,
2069
+ "loss": 0.4877,
2070
+ "positive_losses": 0.46479111909866333,
2071
+ "rewards/accuracies": 1.0,
2072
+ "rewards/chosen": 0.2928626835346222,
2073
+ "rewards/margins": 0.5131780505180359,
2074
+ "rewards/margins_max": 0.6953937411308289,
2075
+ "rewards/margins_min": 0.33096247911453247,
2076
+ "rewards/margins_std": 0.25769174098968506,
2077
+ "rewards/rejected": -0.22031545639038086,
2078
+ "step": 930
2079
+ },
2080
+ {
2081
+ "dpo_losses": 0.5361374616622925,
2082
+ "epoch": 2.65,
2083
+ "grad_norm": 13.584990105876331,
2084
+ "learning_rate": 2.071137789306418e-08,
2085
+ "logits/chosen": -2.739955425262451,
2086
+ "logits/rejected": -2.62253999710083,
2087
+ "logps/chosen": -313.5608825683594,
2088
+ "logps/rejected": -221.56198120117188,
2089
+ "loss": 0.499,
2090
+ "positive_losses": 0.22314663231372833,
2091
+ "rewards/accuracies": 0.949999988079071,
2092
+ "rewards/chosen": 0.2565317451953888,
2093
+ "rewards/margins": 0.3702241778373718,
2094
+ "rewards/margins_max": 0.5549638271331787,
2095
+ "rewards/margins_min": 0.18548452854156494,
2096
+ "rewards/margins_std": 0.26126131415367126,
2097
+ "rewards/rejected": -0.11369242519140244,
2098
+ "step": 940
2099
+ },
2100
+ {
2101
+ "dpo_losses": 0.4967280328273773,
2102
+ "epoch": 2.68,
2103
+ "grad_norm": 8.2589469715046,
2104
+ "learning_rate": 1.7567952525471107e-08,
2105
+ "logits/chosen": -2.7256572246551514,
2106
+ "logits/rejected": -2.615950584411621,
2107
+ "logps/chosen": -271.1017150878906,
2108
+ "logps/rejected": -220.56100463867188,
2109
+ "loss": 0.5302,
2110
+ "positive_losses": 1.0882877111434937,
2111
+ "rewards/accuracies": 1.0,
2112
+ "rewards/chosen": 0.31043368577957153,
2113
+ "rewards/margins": 0.48788315057754517,
2114
+ "rewards/margins_max": 0.704791784286499,
2115
+ "rewards/margins_min": 0.2709745466709137,
2116
+ "rewards/margins_std": 0.30675509572029114,
2117
+ "rewards/rejected": -0.17744943499565125,
2118
+ "step": 950
2119
+ },
2120
+ {
2121
+ "dpo_losses": 0.4955645501613617,
2122
+ "epoch": 2.7,
2123
+ "grad_norm": 18.114348971792364,
2124
+ "learning_rate": 1.467446143128101e-08,
2125
+ "logits/chosen": -2.8821053504943848,
2126
+ "logits/rejected": -2.793936252593994,
2127
+ "logps/chosen": -297.2878112792969,
2128
+ "logps/rejected": -261.58392333984375,
2129
+ "loss": 0.5139,
2130
+ "positive_losses": 0.06438522040843964,
2131
+ "rewards/accuracies": 0.949999988079071,
2132
+ "rewards/chosen": 0.29606151580810547,
2133
+ "rewards/margins": 0.4726499915122986,
2134
+ "rewards/margins_max": 0.6590073704719543,
2135
+ "rewards/margins_min": 0.2862926125526428,
2136
+ "rewards/margins_std": 0.26354914903640747,
2137
+ "rewards/rejected": -0.1765884906053543,
2138
+ "step": 960
2139
+ },
2140
+ {
2141
+ "dpo_losses": 0.47625431418418884,
2142
+ "epoch": 2.73,
2143
+ "grad_norm": 2.2745052596169986,
2144
+ "learning_rate": 1.2034015982622243e-08,
2145
+ "logits/chosen": -2.7958290576934814,
2146
+ "logits/rejected": -2.691840887069702,
2147
+ "logps/chosen": -341.50140380859375,
2148
+ "logps/rejected": -322.9205627441406,
2149
+ "loss": 0.4943,
2150
+ "positive_losses": 0.23634567856788635,
2151
+ "rewards/accuracies": 1.0,
2152
+ "rewards/chosen": 0.33635348081588745,
2153
+ "rewards/margins": 0.5307850241661072,
2154
+ "rewards/margins_max": 0.7724507451057434,
2155
+ "rewards/margins_min": 0.2891193926334381,
2156
+ "rewards/margins_std": 0.34176692366600037,
2157
+ "rewards/rejected": -0.1944316029548645,
2158
+ "step": 970
2159
+ },
2160
+ {
2161
+ "dpo_losses": 0.514926552772522,
2162
+ "epoch": 2.76,
2163
+ "grad_norm": 9.098303215376248,
2164
+ "learning_rate": 9.649455451539418e-09,
2165
+ "logits/chosen": -2.6313929557800293,
2166
+ "logits/rejected": -2.6137583255767822,
2167
+ "logps/chosen": -211.71536254882812,
2168
+ "logps/rejected": -190.9324493408203,
2169
+ "loss": 0.4996,
2170
+ "positive_losses": 0.12087974697351456,
2171
+ "rewards/accuracies": 1.0,
2172
+ "rewards/chosen": 0.2592369318008423,
2173
+ "rewards/margins": 0.4249873757362366,
2174
+ "rewards/margins_max": 0.6153701543807983,
2175
+ "rewards/margins_min": 0.2346045970916748,
2176
+ "rewards/margins_std": 0.26924189925193787,
2177
+ "rewards/rejected": -0.1657504439353943,
2178
+ "step": 980
2179
+ },
2180
+ {
2181
+ "dpo_losses": 0.471360445022583,
2182
+ "epoch": 2.79,
2183
+ "grad_norm": 7.946144983120647,
2184
+ "learning_rate": 7.523343956923194e-09,
2185
+ "logits/chosen": -2.797616481781006,
2186
+ "logits/rejected": -2.7332680225372314,
2187
+ "logps/chosen": -315.5093688964844,
2188
+ "logps/rejected": -304.4640808105469,
2189
+ "loss": 0.4919,
2190
+ "positive_losses": 0.0,
2191
+ "rewards/accuracies": 1.0,
2192
+ "rewards/chosen": 0.36568814516067505,
2193
+ "rewards/margins": 0.5568908452987671,
2194
+ "rewards/margins_max": 0.824783980846405,
2195
+ "rewards/margins_min": 0.28899770975112915,
2196
+ "rewards/margins_std": 0.37885811924934387,
2197
+ "rewards/rejected": -0.19120268523693085,
2198
+ "step": 990
2199
+ },
2200
+ {
2201
+ "dpo_losses": 0.513391375541687,
2202
+ "epoch": 2.82,
2203
+ "grad_norm": 7.57336855401101,
2204
+ "learning_rate": 5.6579677073121945e-09,
2205
+ "logits/chosen": -2.6762197017669678,
2206
+ "logits/rejected": -2.677922487258911,
2207
+ "logps/chosen": -255.44204711914062,
2208
+ "logps/rejected": -303.0498962402344,
2209
+ "loss": 0.4962,
2210
+ "positive_losses": 0.01267166156321764,
2211
+ "rewards/accuracies": 0.949999988079071,
2212
+ "rewards/chosen": 0.2648389935493469,
2213
+ "rewards/margins": 0.43319040536880493,
2214
+ "rewards/margins_max": 0.6835809946060181,
2215
+ "rewards/margins_min": 0.18279966711997986,
2216
+ "rewards/margins_std": 0.3541058897972107,
2217
+ "rewards/rejected": -0.168351411819458,
2218
+ "step": 1000
2219
+ },
2220
+ {
2221
+ "epoch": 2.82,
2222
+ "eval_dpo_losses": 0.6659845113754272,
2223
+ "eval_logits/chosen": -2.7036855220794678,
2224
+ "eval_logits/rejected": -2.6605641841888428,
2225
+ "eval_logps/chosen": -289.52734375,
2226
+ "eval_logps/rejected": -270.2825012207031,
2227
+ "eval_loss": 1.5707319974899292,
2228
+ "eval_positive_losses": 8.908056259155273,
2229
+ "eval_rewards/accuracies": 0.6150793433189392,
2230
+ "eval_rewards/chosen": -0.04306148737668991,
2231
+ "eval_rewards/margins": 0.06793692708015442,
2232
+ "eval_rewards/margins_max": 0.3166824281215668,
2233
+ "eval_rewards/margins_min": -0.15675365924835205,
2234
+ "eval_rewards/margins_std": 0.2111140936613083,
2235
+ "eval_rewards/rejected": -0.11099842935800552,
2236
+ "eval_runtime": 385.4819,
2237
+ "eval_samples_per_second": 5.188,
2238
+ "eval_steps_per_second": 0.163,
2239
+ "step": 1000
2240
+ },
2241
+ {
2242
+ "dpo_losses": 0.4899842143058777,
2243
+ "epoch": 2.85,
2244
+ "grad_norm": 11.596233256209617,
2245
+ "learning_rate": 4.0553325425319585e-09,
2246
+ "logits/chosen": -2.7825233936309814,
2247
+ "logits/rejected": -2.710404872894287,
2248
+ "logps/chosen": -283.56024169921875,
2249
+ "logps/rejected": -277.7485656738281,
2250
+ "loss": 0.5045,
2251
+ "positive_losses": 0.0,
2252
+ "rewards/accuracies": 1.0,
2253
+ "rewards/chosen": 0.32106488943099976,
2254
+ "rewards/margins": 0.503676176071167,
2255
+ "rewards/margins_max": 0.7390400171279907,
2256
+ "rewards/margins_min": 0.2683122456073761,
2257
+ "rewards/margins_std": 0.3328548073768616,
2258
+ "rewards/rejected": -0.18261122703552246,
2259
+ "step": 1010
2260
+ },
2261
+ {
2262
+ "dpo_losses": 0.4729464650154114,
2263
+ "epoch": 2.87,
2264
+ "grad_norm": 2.4410647222056254,
2265
+ "learning_rate": 2.717161776814747e-09,
2266
+ "logits/chosen": -2.742475986480713,
2267
+ "logits/rejected": -2.64007568359375,
2268
+ "logps/chosen": -277.25311279296875,
2269
+ "logps/rejected": -253.4837188720703,
2270
+ "loss": 0.4843,
2271
+ "positive_losses": 0.0,
2272
+ "rewards/accuracies": 0.949999988079071,
2273
+ "rewards/chosen": 0.3422829210758209,
2274
+ "rewards/margins": 0.5467172861099243,
2275
+ "rewards/margins_max": 0.7624825835227966,
2276
+ "rewards/margins_min": 0.3309520483016968,
2277
+ "rewards/margins_std": 0.30513814091682434,
2278
+ "rewards/rejected": -0.20443439483642578,
2279
+ "step": 1020
2280
+ },
2281
+ {
2282
+ "dpo_losses": 0.48559775948524475,
2283
+ "epoch": 2.9,
2284
+ "grad_norm": 8.212943342949115,
2285
+ "learning_rate": 1.6448943457189613e-09,
2286
+ "logits/chosen": -2.765453815460205,
2287
+ "logits/rejected": -2.705345630645752,
2288
+ "logps/chosen": -331.9368896484375,
2289
+ "logps/rejected": -294.2420349121094,
2290
+ "loss": 0.4995,
2291
+ "positive_losses": 0.18167057633399963,
2292
+ "rewards/accuracies": 0.949999988079071,
2293
+ "rewards/chosen": 0.33231136202812195,
2294
+ "rewards/margins": 0.5095351934432983,
2295
+ "rewards/margins_max": 0.6868753433227539,
2296
+ "rewards/margins_min": 0.332194983959198,
2297
+ "rewards/margins_std": 0.2507968842983246,
2298
+ "rewards/rejected": -0.17722377181053162,
2299
+ "step": 1030
2300
+ },
2301
+ {
2302
+ "dpo_losses": 0.4621458053588867,
2303
+ "epoch": 2.93,
2304
+ "grad_norm": 8.550282024171548,
2305
+ "learning_rate": 8.396832588411229e-10,
2306
+ "logits/chosen": -2.6322901248931885,
2307
+ "logits/rejected": -2.517449378967285,
2308
+ "logps/chosen": -312.41729736328125,
2309
+ "logps/rejected": -251.61849975585938,
2310
+ "loss": 0.5147,
2311
+ "positive_losses": 0.4247921109199524,
2312
+ "rewards/accuracies": 1.0,
2313
+ "rewards/chosen": 0.33802804350852966,
2314
+ "rewards/margins": 0.5674580335617065,
2315
+ "rewards/margins_max": 0.8604093790054321,
2316
+ "rewards/margins_min": 0.2745068669319153,
2317
+ "rewards/margins_std": 0.41429558396339417,
2318
+ "rewards/rejected": -0.22943000495433807,
2319
+ "step": 1040
2320
+ },
2321
+ {
2322
+ "dpo_losses": 0.49349918961524963,
2323
+ "epoch": 2.96,
2324
+ "grad_norm": 2.0362116151192713,
2325
+ "learning_rate": 3.0239435998430374e-10,
2326
+ "logits/chosen": -2.7547783851623535,
2327
+ "logits/rejected": -2.6397135257720947,
2328
+ "logps/chosen": -298.6687927246094,
2329
+ "logps/rejected": -274.22381591796875,
2330
+ "loss": 0.5095,
2331
+ "positive_losses": 0.5305103063583374,
2332
+ "rewards/accuracies": 0.949999988079071,
2333
+ "rewards/chosen": 0.32915833592414856,
2334
+ "rewards/margins": 0.4934717118740082,
2335
+ "rewards/margins_max": 0.7064648866653442,
2336
+ "rewards/margins_min": 0.28047865629196167,
2337
+ "rewards/margins_std": 0.3012176752090454,
2338
+ "rewards/rejected": -0.1643133908510208,
2339
+ "step": 1050
2340
+ },
2341
+ {
2342
+ "dpo_losses": 0.5019891858100891,
2343
+ "epoch": 2.99,
2344
+ "grad_norm": 2.0527644568071164,
2345
+ "learning_rate": 3.360539611582669e-11,
2346
+ "logits/chosen": -2.6836140155792236,
2347
+ "logits/rejected": -2.597043514251709,
2348
+ "logps/chosen": -294.3243103027344,
2349
+ "logps/rejected": -225.4201202392578,
2350
+ "loss": 0.5343,
2351
+ "positive_losses": 0.3622688353061676,
2352
+ "rewards/accuracies": 0.8500000238418579,
2353
+ "rewards/chosen": 0.3153269588947296,
2354
+ "rewards/margins": 0.4751824736595154,
2355
+ "rewards/margins_max": 0.7748254537582397,
2356
+ "rewards/margins_min": 0.1755395084619522,
2357
+ "rewards/margins_std": 0.4237591624259949,
2358
+ "rewards/rejected": -0.15985555946826935,
2359
+ "step": 1060
2360
+ },
2361
+ {
2362
+ "epoch": 3.0,
2363
+ "step": 1065,
2364
+ "total_flos": 0.0,
2365
+ "train_loss": 0.5596254680078354,
2366
+ "train_runtime": 11165.5936,
2367
+ "train_samples_per_second": 1.526,
2368
+ "train_steps_per_second": 0.095
2369
+ }
2370
+ ],
2371
+ "logging_steps": 10,
2372
+ "max_steps": 1065,
2373
+ "num_input_tokens_seen": 0,
2374
+ "num_train_epochs": 3,
2375
+ "save_steps": 100,
2376
+ "total_flos": 0.0,
2377
+ "train_batch_size": 2,
2378
+ "trial_name": null,
2379
+ "trial_params": null
2380
+ }