lapp0 committed
Commit 2be2990
1 Parent(s): 257d257

End of training

README.md CHANGED
@@ -54,16 +54,17 @@ LlamaForCausalLM(
           (o_proj): Linear(in_features=576, out_features=576, bias=False)
           (rotary_emb): LlamaRotaryEmbedding()
         )
-        (mlp): LigerSwiGLUMLP(
+        (mlp): LlamaMLP(
           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
+          (act_fn): SiLU()
         )
-        (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
-        (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
+        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
       )
     )
-    (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
+    (norm): LlamaRMSNorm((576,), eps=1e-05)
     (rotary_emb): LlamaRotaryEmbedding()
   )
   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
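The hunk above swaps the student's fused Liger modules (LigerSwiGLUMLP, LigerRMSNorm) back to the stock transformers modules (LlamaMLP with an explicit SiLU activation, LlamaRMSNorm), consistent with the `student_model_use_liger: False` setting further down. A minimal sketch of how such a toggle is typically wired up, assuming the `liger-kernel` package's `apply_liger_kernel_to_llama` entry point (the flag name comes from this repo's config, not the library):

```python
from transformers import AutoModelForCausalLM

student_model_use_liger = False  # this commit's baseline setting

if student_model_use_liger:
    # Monkey-patches the transformers Llama implementation in place:
    # LlamaMLP -> LigerSwiGLUMLP, LlamaRMSNorm -> LigerRMSNorm, etc.
    # Must run before the model is instantiated.
    from liger_kernel.transformers import apply_liger_kernel_to_llama
    apply_liger_kernel_to_llama(rope=True, swiglu=True, rms_norm=True)

model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M")
print(model)  # the module names show whether Liger kernels are active
```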
@@ -77,7 +78,7 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use: 12.7772 GB
+- Max Train VRAM Use: 12.7946 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
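Peak VRAM rises slightly with Liger disabled (12.7772 GB → 12.7946 GB on the same 24 GB RTX 4090). A minimal sketch of how such figures are commonly collected with PyTorch's built-in CUDA memory stats (the trainer's actual instrumentation is not shown in this diff):

```python
import torch

torch.cuda.reset_peak_memory_stats()

# ... run the training steps to be measured here ...

peak_gb = torch.cuda.max_memory_allocated() / 1024**3
avail_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"Max Train VRAM Use: {peak_gb:.4f} GB")
print(f"Available VRAM: {avail_gb:.4f} GB")
```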
@@ -107,28 +108,6 @@ LlamaForCausalLM(
         (self_attn): LlamaSdpaAttention(
           (q_proj): Linear(in_features=576, out_features=576, bias=False)
           (k_proj): Linear(in_features=576, out_features=192, bias=False)
- @@ -10,17 +10,16 @@
- (o_proj): Linear(in_features=576, out_features=576, bias=False)
- (rotary_emb): LlamaRotaryEmbedding()
- )
- - (mlp): LlamaMLP(
- + (mlp): LigerSwiGLUMLP(
- (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
- (up_proj): Linear(in_features=576, out_features=1536, bias=False)
- (down_proj): Linear(in_features=1536, out_features=576, bias=False)
- - (act_fn): SiLU()
- )
- - (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
- - (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
- + (input_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
- + (post_attention_layernorm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
- )
- )
- - (norm): LlamaRMSNorm((576,), eps=1e-05)
- + (norm): LigerRMSNorm((576,), eps=1e-05, offset=0.0)
- (rotary_emb): LlamaRotaryEmbedding()
- )
- (lm_head): Linear(in_features=576, out_features=49152, bias=False)
 
 ```
 
@@ -136,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,857,838 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
+Trained on 84,871,894 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
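Both runs draw the same 99,800-sample slice of the English Wikipedia dump; the small change in token count presumably comes from differences in how the corpus was tokenized and packed between runs. A sketch of loading that subset with the standard `datasets` API (streaming the dump and taking the first 99,800 articles is an assumption about how the slice was drawn):

```python
from datasets import load_dataset

# Subset name and sample count come from the README above.
ds = load_dataset("wikimedia/wikipedia", "20231101.en",
                  split="train", streaming=True)
subset = ds.take(99_800)  # hypothetical way of drawing the 99,800 samples
print(next(iter(subset))["text"][:200])
```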
@@ -185,14 +164,14 @@ The following hyperparameters were used during training:
     weight=0
   )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x70f69e83a110>`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7eb253ff9660>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
 - reinitialize_weights: `None`
 - copy_teacher_modules: `[('lm_head', False)]`
 - student_model_as_bitnet: `False`
-- student_model_use_liger: `True`
+- student_model_use_liger: `False`
 - teacher_model_name_or_path: `HuggingFaceTB/SmolLM-135M`
 - teacher_load_in_8bit: `False`
 - teacher_load_in_4bit: `False`
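The only substantive change in this block is the `student_model_use_liger` flag, which marks this run as the non-Liger baseline; the `lr_scheduler` entry differs only because it is logged as an object repr, which embeds a memory address that changes run to run. For reference, a `LambdaLR` of the kind logged here can be built as follows (the warmup/decay shape and step counts are placeholders, not values from this run):

```python
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

model = nn.Linear(8, 8)
optimizer = SGD(model.parameters(), lr=1e-3)

warmup_steps, total_steps = 100, 1_000  # placeholder values

def lr_lambda(step: int) -> float:
    # Linear warmup, then linear decay to zero.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

scheduler = LambdaLR(optimizer, lr_lambda)
print(scheduler)  # e.g. <torch.optim.lr_scheduler.LambdaLR object at 0x...>
```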
 
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/events.out.tfevents.1726148253.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf4731dea58e40db4e1b8a523b91f2b8e9b403d55da8ebebf39d902946255bab
+size 253
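The added file is not the log itself but a 253-byte Git LFS pointer to the TensorBoard event file for the `bs4_NO_liger_baseline` run. After a `git lfs pull`, it can be inspected with the standard TensorBoard event reader, roughly like so (assuming the `tensorboard` package is installed):

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

path = ("logs/attn_weight=0.0, per_device_train_batch_size=4, "
        "run_name=bs4_NO_liger_baseline, student_model_use_liger=False/"
        "events.out.tfevents.1726148253.1c1a426a2fee")

acc = EventAccumulator(path)
acc.Reload()       # parse the event file from disk
print(acc.Tags())  # scalar/tensor tags recorded for this run
```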