End of training
README.md
CHANGED
@@ -54,16 +54,17 @@ LlamaForCausalLM(
           (o_proj): Linear(in_features=576, out_features=576, bias=False)
           (rotary_emb): LlamaRotaryEmbedding()
         )
-        (mlp): LigerSwiGLUMLP(
+        (mlp): LlamaMLP(
           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
+          (act_fn): SiLU()
         )
-        (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-        (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
+        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
       )
     )
-    (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
+    (norm): LlamaRMSNorm((576,), eps=1e-05)
     (rotary_emb): LlamaRotaryEmbedding()
   )
   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
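With Liger kernels disabled for this run, the student's module tree reverts to the stock `LlamaMLP` (with an explicit `SiLU` activation) and `LlamaRMSNorm` layers shown on the `+` side above. A minimal sketch of how this printout can be reproduced, assuming the student is simply the `HuggingFaceTB/SmolLM-135M` config truncated to 15 layers (as `student_model_config: {'num_hidden_layers': 15}` later in this diff suggests):

```python
# Sketch: build the 15-layer student from the teacher's config and print the
# module tree shown in the hunk above. Assumes the student is the SmolLM-135M
# config with num_hidden_layers reduced to 15.
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM-135M")
config.num_hidden_layers = 15  # from student_model_config in this card

student = AutoModelForCausalLM.from_config(config)
print(student)  # LlamaForCausalLM( ... (mlp): LlamaMLP( ... (act_fn): SiLU() ... )
```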
@@ -77,7 +78,7 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use: 12.
+- Max Train VRAM Use: 12.7946 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
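The peak-VRAM figure now reads 12.7946 GB against 23.4329 GB available on a single RTX 4090. A small sketch (not taken from the training script) of how such numbers can be collected from PyTorch's CUDA allocator statistics:

```python
# Sketch: report peak allocated VRAM and total device VRAM after training,
# using PyTorch's CUDA memory statistics. Illustrative only.
import torch

def report_vram(device: int = 0) -> None:
    peak_gb = torch.cuda.max_memory_allocated(device) / 1024**3
    total_gb = torch.cuda.get_device_properties(device).total_memory / 1024**3
    print(f"- Max Train VRAM Use: {peak_gb:.4f} GB")
    print(f"- Available VRAM: {total_gb:.4f} GB")
    print(f"- GPUs:\n  - 1x {torch.cuda.get_device_name(device)}")

# Call report_vram(0) once the final training step has completed.
```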
@@ -107,28 +108,6 @@ LlamaForCausalLM(
         (self_attn): LlamaSdpaAttention(
           (q_proj): Linear(in_features=576, out_features=576, bias=False)
           (k_proj): Linear(in_features=576, out_features=192, bias=False)
-@@ -10,17 +10,16 @@
-           (o_proj): Linear(in_features=576, out_features=576, bias=False)
-           (rotary_emb): LlamaRotaryEmbedding()
-         )
--        (mlp): LlamaMLP(
-+        (mlp): LigerSwiGLUMLP(
-           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
-           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
-           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
--          (act_fn): SiLU()
-         )
--        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
--        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
-+        (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-+        (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-         )
-       )
--    (norm): LlamaRMSNorm((576,), eps=1e-05)
-+    (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-     (rotary_emb): LlamaRotaryEmbedding()
-   )
-   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
 
 ```
 
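The block removed above is the teacher-to-student architecture difference carried over from the previous Liger-enabled run, rendered as a nested unified diff (`LlamaMLP`/`LlamaRMSNorm` on one side, `LigerSwiGLUMLP`/`LigerRMSNorm` on the other). With Liger disabled, the two module trees match and the block collapses. A sketch of how such a diff block can be generated with `difflib` (illustrative; not necessarily how this card was produced):

```python
# Sketch: render the difference between two module trees as a unified diff,
# similar to the block removed in the hunk above. Illustrative only.
import difflib

def architecture_diff(teacher, student) -> str:
    diff = difflib.unified_diff(
        repr(teacher).splitlines(),
        repr(student).splitlines(),
        lineterm="",
    )
    return "\n".join(diff)

# print(architecture_diff(teacher_model, student_model))
```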
@@ -136,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,
+Trained on 84,871,894 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
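The dataset line now records 84,871,894 tokens drawn from 99,800 articles of the `20231101.en` Wikipedia dump. A rough sketch of reproducing that count, assuming the teacher's tokenizer and the raw `text` column with no extra formatting:

```python
# Sketch: count tokens over the Wikipedia subset listed in this card.
# The exact preprocessing used during training is an assumption.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
dataset = dataset.select(range(99_800))  # Num Samples: 99,800

total_tokens = sum(
    len(tokenizer(example["text"])["input_ids"]) for example in dataset
)
print(f"Trained on {total_tokens:,} tokens")
```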
@@ -185,14 +164,14 @@ The following hyperparameters were used during training:
     weight=0
   )
   )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7eb253ff9660>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
 - reinitialize_weights: `None`
 - copy_teacher_modules: `[('lm_head', False)]`
 - student_model_as_bitnet: `False`
-- student_model_use_liger: `
+- student_model_use_liger: `False`
 - teacher_model_name_or_path: `HuggingFaceTB/SmolLM-135M`
 - teacher_load_in_8bit: `False`
 - teacher_load_in_4bit: `False`
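The scheduler is logged only as its `LambdaLR` repr, so the exact schedule is not recoverable from this card. As one plausible construction, a linear warmup/decay schedule built with `transformers.get_linear_schedule_with_warmup` (which returns a `LambdaLR`) would look like the sketch below; the warmup and step counts are placeholders, not values from this run:

```python
# Sketch: a LambdaLR schedule of the kind logged above. The actual schedule,
# warmup, and total step count for this run are not recorded in the card.
import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in for the student model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,       # placeholder
    num_training_steps=25_000,  # placeholder
)
print(scheduler)  # <torch.optim.lr_scheduler.LambdaLR object at 0x...>
```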
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/events.out.tfevents.1726148253.1c1a426a2fee
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf4731dea58e40db4e1b8a523b91f2b8e9b403d55da8ebebf39d902946255bab
+size 253
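The added file is a 253-byte Git LFS pointer standing in for the TensorBoard event log of the `bs4_NO_liger_baseline` run. After fetching the real file (e.g. with `git lfs pull`), its scalars can be read back with TensorBoard's event accumulator; a minimal sketch using the path from this diff:

```python
# Sketch: read scalar metrics from the TensorBoard event file added above.
# Requires the actual file pulled from LFS, not the 253-byte pointer.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

path = (
    "logs/attn_weight=0.0, per_device_train_batch_size=4, "
    "run_name=bs4_NO_liger_baseline, student_model_use_liger=False/"
    "events.out.tfevents.1726148253.1c1a426a2fee"
)

acc = EventAccumulator(path)
acc.Reload()
for tag in acc.Tags()["scalars"]:
    first = acc.Scalars(tag)[0]
    print(tag, first.step, first.value)
```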