Qin Liu committed on
Commit
5fe4b0e
1 Parent(s): 093e233

Model save

README.md CHANGED
@@ -1,13 +1,11 @@
 ---
 base_model: meta-llama/Meta-Llama-3-8B
-datasets:
-- HuggingFaceH4/ultrachat_200k
 library_name: peft
 license: llama3
 tags:
-- alignment-handbook
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 model-index:
 - name: llama3-sudo
@@ -19,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # llama3-sudo
 
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the HuggingFaceH4/ultrachat_200k dataset.
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.0709
+- Loss: 1.0100
 
 ## Model description
 
@@ -41,12 +39,12 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
-- train_batch_size: 8
+- train_batch_size: 16
 - eval_batch_size: 4
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
-- gradient_accumulation_steps: 8
+- gradient_accumulation_steps: 4
 - total_train_batch_size: 256
 - total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
@@ -60,6 +58,8 @@ The following hyperparameters were used during training:
 |:-------------:|:------:|:----:|:---------------:|
 | 1.3252        | 0.9697 | 24   | 1.1693          |
 | 1.1352        | 1.9798 | 49   | 1.0709          |
+| 1.1265        | 1.9899 | 98   | 1.0308          |
+| 1.1113        | 2.9798 | 147  | 1.0100          |
 
 
 ### Framework versions
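Note that the batch-size change in this diff is shape-preserving: the per-device batch size doubles while gradient accumulation halves, so the effective batch stays at 256 on 4 GPUs. A minimal sketch of that arithmetic, assuming the usual effective-batch formula used by the HF Trainer (the helper function is illustrative, not part of the repo):

```python
# Effective batch size = per-device batch * gradient accumulation steps * device count.
# Values are taken from the README diff above.
def effective_batch_size(per_device: int, grad_accum: int, num_devices: int) -> int:
    return per_device * grad_accum * num_devices

old = effective_batch_size(per_device=8, grad_accum=8, num_devices=4)   # 256
new = effective_batch_size(per_device=16, grad_accum=4, num_devices=4)  # 256
assert old == new == 256  # matches total_train_batch_size in both README versions
```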
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 2.909090909090909,
-    "total_flos": 644347544469504.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0111,
+    "epoch": 2.9797979797979797,
+    "total_flos": 981865656745984.0,
+    "train_loss": 0.5657350935903537,
+    "train_runtime": 914.4964,
     "train_samples": 6321,
-    "train_samples_per_second": 1702737.829,
-    "train_steps_per_second": 6465.07
+    "train_samples_per_second": 20.736,
+    "train_steps_per_second": 0.161
 }
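The pre-commit numbers (runtime 0.0111 s, loss 0.0, ~1.7M samples/s) look like leftovers from an aborted run; the replacement values are internally consistent. A quick sanity check, assuming throughput is computed over all three epochs:

```python
# Throughput should be roughly train_samples * num_train_epochs / train_runtime,
# using the updated values from the diff above.
train_samples, num_epochs, train_runtime = 6321, 3, 914.4964

samples_per_second = train_samples * num_epochs / train_runtime
print(round(samples_per_second, 3))  # ~20.736, matching train_samples_per_second
```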
runs/Aug13_19-05-34_ip-172-31-10-237/events.out.tfevents.1723575945.ip-172-31-10-237.662759.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2a9fbe250686fd5af471baa634830930f33838aea00a8eb3dfcb0d7751deb67
-size 8938
+oid sha256:8c20b797bd8224221ce2960fdc3e46fdeae386ed3cafad64f825e32718371658
+size 9563
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 2.909090909090909,
-    "total_flos": 644347544469504.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0111,
+    "epoch": 2.9797979797979797,
+    "total_flos": 981865656745984.0,
+    "train_loss": 0.5657350935903537,
+    "train_runtime": 914.4964,
     "train_samples": 6321,
-    "train_samples_per_second": 1702737.829,
-    "train_steps_per_second": 6465.07
+    "train_samples_per_second": 20.736,
+    "train_steps_per_second": 0.161
 }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.909090909090909,
+  "epoch": 2.9797979797979797,
   "eval_steps": 500,
-  "global_step": 72,
+  "global_step": 147,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -130,17 +130,138 @@
       "step": 70
     },
     {
-      "epoch": 2.909090909090909,
-      "step": 72,
-      "total_flos": 644347544469504.0,
-      "train_loss": 0.0,
-      "train_runtime": 0.0111,
-      "train_samples_per_second": 1702737.829,
-      "train_steps_per_second": 6465.07
+      "epoch": 1.5252525252525253,
+      "grad_norm": 1.5131939349650319,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.4021,
+      "step": 75
+    },
+    {
+      "epoch": 1.6262626262626263,
+      "grad_norm": 0.4578537596053918,
+      "learning_rate": 0.00010237976975461075,
+      "loss": 1.2517,
+      "step": 80
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 0.33225716851066756,
+      "learning_rate": 9.049439566958175e-05,
+      "loss": 1.139,
+      "step": 85
+    },
+    {
+      "epoch": 1.8282828282828283,
+      "grad_norm": 0.27743266981716935,
+      "learning_rate": 7.874347104470234e-05,
+      "loss": 1.0842,
+      "step": 90
+    },
+    {
+      "epoch": 1.9292929292929293,
+      "grad_norm": 0.3335911253392335,
+      "learning_rate": 6.729320366825784e-05,
+      "loss": 1.1265,
+      "step": 95
+    },
+    {
+      "epoch": 1.98989898989899,
+      "eval_loss": 1.0307520627975464,
+      "eval_runtime": 165.0307,
+      "eval_samples_per_second": 38.302,
+      "eval_steps_per_second": 2.4,
+      "step": 98
+    },
+    {
+      "epoch": 2.0303030303030303,
+      "grad_norm": 0.2767977674400862,
+      "learning_rate": 5.630554876306407e-05,
+      "loss": 1.0492,
+      "step": 100
+    },
+    {
+      "epoch": 2.1313131313131315,
+      "grad_norm": 0.2747976722762822,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 1.0602,
+      "step": 105
+    },
+    {
+      "epoch": 2.2323232323232323,
+      "grad_norm": 0.2987036396560577,
+      "learning_rate": 3.6330982588091186e-05,
+      "loss": 1.0785,
+      "step": 110
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 0.27812388202445604,
+      "learning_rate": 2.7626596189492983e-05,
+      "loss": 1.0563,
+      "step": 115
+    },
+    {
+      "epoch": 2.4343434343434343,
+      "grad_norm": 0.24657888130613284,
+      "learning_rate": 1.994587590756397e-05,
+      "loss": 1.0835,
+      "step": 120
+    },
+    {
+      "epoch": 2.5353535353535355,
+      "grad_norm": 0.24337497990605553,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 1.071,
+      "step": 125
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 0.2979117602370218,
+      "learning_rate": 8.073969641833445e-06,
+      "loss": 1.036,
+      "step": 130
+    },
+    {
+      "epoch": 2.7373737373737375,
+      "grad_norm": 0.25304418891814984,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 1.1085,
+      "step": 135
+    },
+    {
+      "epoch": 2.8383838383838382,
+      "grad_norm": 0.26313547369333573,
+      "learning_rate": 1.3845646281813507e-06,
+      "loss": 1.0692,
+      "step": 140
+    },
+    {
+      "epoch": 2.9393939393939394,
+      "grad_norm": 0.2581657699360705,
+      "learning_rate": 1.1326608169920372e-07,
+      "loss": 1.1113,
+      "step": 145
+    },
+    {
+      "epoch": 2.9797979797979797,
+      "eval_loss": 1.0100449323654175,
+      "eval_runtime": 162.6209,
+      "eval_samples_per_second": 38.87,
+      "eval_steps_per_second": 2.435,
+      "step": 147
+    },
+    {
+      "epoch": 2.9797979797979797,
+      "step": 147,
+      "total_flos": 981865656745984.0,
+      "train_loss": 0.5657350935903537,
+      "train_runtime": 914.4964,
+      "train_samples_per_second": 20.736,
+      "train_steps_per_second": 0.161
     }
   ],
   "logging_steps": 5,
-  "max_steps": 72,
+  "max_steps": 147,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 25,
@@ -156,7 +277,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 644347544469504.0,
+  "total_flos": 981865656745984.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null