Update README.md
Browse files
README.md
CHANGED
@@ -27,14 +27,19 @@ tags:
|
|
27 |
|
28 |
## Training Details
|
29 |
|
|
|
|
|
|
|
|
|
30 |
### Training Hyperparameters
|
31 |
-
|
32 |
-
`
|
|
|
33 |
`bf16`: True \
|
34 |
-
`learning_rate`: 1e-
|
35 |
`lr_scheduler_type`: cosine \
|
36 |
-
`per_device_train_batch_size`:
|
37 |
-
`gradient_accumulation_steps`:
|
38 |
`torch_dtype`: bfloat16 \
|
39 |
`num_train_epochs`: 1 \
|
40 |
`max_prompt_length`: 512 \
|
@@ -43,10 +48,10 @@ tags:
|
|
43 |
|
44 |
### Results
|
45 |
|
46 |
-
`init_train_loss`: 0.
|
47 |
-
`final_train_loss`: 0.
|
48 |
`accuracy`: 0.7188 \
|
49 |
-
`reward_margin`: 0.
|
50 |
|
51 |
### Training script
|
52 |
|
|
|
27 |
|
28 |
## Training Details
|
29 |
|
30 |
+
devices: 4 × Ascend 910B NPU (64 GB each) \
|
31 |
+
precision: bf16 mixed precision \
|
32 |
+
global_batch_size: 64 (4 devices × 8 per-device batch × 2 gradient-accumulation steps)
|
33 |
+
|
34 |
### Training Hyperparameters
|
35 |
+
|
36 |
+
`attn_implementation`: None \
|
37 |
+
`beta`: 0.1 \
|
38 |
`bf16`: True \
|
39 |
+
`learning_rate`: 1e-6 \
|
40 |
`lr_scheduler_type`: cosine \
|
41 |
+
`per_device_train_batch_size`: 8 \
|
42 |
+
`gradient_accumulation_steps`: 2 \
|
43 |
`torch_dtype`: bfloat16 \
|
44 |
`num_train_epochs`: 1 \
|
45 |
`max_prompt_length`: 512 \
|
|
|
48 |
|
49 |
### Results
|
50 |
|
51 |
+
`init_train_loss`: 0.6958 \
|
52 |
+
`final_train_loss`: 0.5375 \
|
53 |
`accuracy`: 0.7188 \
|
54 |
+
`reward_margin`: 0.7227
|
55 |
|
56 |
### Training script
|
57 |
|