lapp0 committed

Commit 1553098
1 parent: 2be2990

End of training
README.md CHANGED
@@ -78,12 +78,12 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use: 12.7946 GB
+- Max Train VRAM Use: 20.9417 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
 - CPUs: 64
-- CPU Memory: 251.7299 GB
+- CPU Memory: 251.7190 GB
 - CPU Memory Bandwidth: 1600 GB/s
 
 # Distillation (Teacher -> Student) Architecture Difference:
@@ -115,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,871,894 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
+Trained on 84,851,671 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
@@ -134,7 +134,11 @@ DistillationObjective(
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=0
+        weight=5.0,
+        loss_fn='raw_mse',
+        layer_mapper='layer-2',
+        norm='layernorm_teacher_only_affine',
+        projector='mlp'
     )
 )
 ```
@@ -146,7 +150,7 @@ The following hyperparameters were used during training:
 <summary>Expand</summary>
 
 - learning_rate: `0.0002`
-- train_batch_size: `4`
+- train_batch_size: `2`
 - eval_batch_size: `2`
 - seed: `42`
 - optimizer: `Adam with betas=(0.9,0.999) and epsilon=1e-08`
@@ -161,10 +165,14 @@ The following hyperparameters were used during training:
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=0
+        weight=5.0,
+        loss_fn='raw_mse',
+        layer_mapper='layer-2',
+        norm='layernorm_teacher_only_affine',
+        projector='mlp'
     )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7eb253ff9660>`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6ad1b142e0>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
@@ -197,6 +205,6 @@ The following hyperparameters were used during training:
 
 # Framework Versions
 - Distily 0.5.0
-- Transformers 4.44.2
+- Transformers 4.45.0.dev0
 - Pytorch 2.5.0.dev20240911+cu121
-- Datasets 2.21.0
+- Datasets 3.0.0
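The substantive change in this commit is the `attn_loss_component`: attention distillation is switched on with weight `5.0`, a raw MSE loss, a `layer-2` layer mapper, a teacher-side LayerNorm, and an `mlp` projector. Below is a minimal sketch of what such a component could compute; the layer mapping, normalization placement, and projector shape are assumptions about what these option names mean, not Distily's actual implementation.

```python
import torch.nn.functional as F
from torch import nn

class MLPProjector(nn.Module):
    """Maps student attention states into the teacher's hidden size."""
    def __init__(self, student_dim: int, teacher_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(student_dim, teacher_dim),
            nn.GELU(),
            nn.Linear(teacher_dim, teacher_dim),
        )

    def forward(self, x):
        return self.net(x)

def raw_mse_attn_loss(student_states, teacher_states, projector, weight=5.0):
    """MSE between projected student and teacher per-layer attention outputs.

    'layer-2' is read here as pairing student layer i with teacher layer 2*i
    (the student has 15 layers); LayerNorm is applied to both sides before
    the comparison. Both readings are guesses at the option semantics.
    """
    total = 0.0
    for i, s in enumerate(student_states):
        t = teacher_states[2 * i]
        p = projector(s)                      # (batch, seq, teacher_dim)
        p = F.layer_norm(p, p.shape[-1:])
        t = F.layer_norm(t, t.shape[-1:])
        total = total + F.mse_loss(p, t)
    return weight * total / len(student_states)
```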
config.json CHANGED
@@ -7,6 +7,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 0,
+  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 576,
   "initializer_range": 0.02,
@@ -23,7 +24,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.2",
+  "transformers_version": "4.45.0.dev0",
   "use_cache": false,
   "vocab_size": 49152
 }
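The new `head_dim` key is serialized explicitly by the newer Transformers version; with `hidden_size` 576 and `head_dim` 64 it implies 9 attention heads under the usual `head_dim = hidden_size / num_attention_heads` relation. A quick consistency check, assuming that relation holds and using a placeholder repo id:

```python
from transformers import AutoConfig

# "user/model" is a placeholder; substitute this repository's actual Hub id.
config = AutoConfig.from_pretrained("user/model")
# 576 // 9 == 64 under the usual derivation of head_dim.
assert config.head_dim == config.hidden_size // config.num_attention_heads
```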
generation_config.json CHANGED
@@ -2,6 +2,6 @@
   "_from_model_config": true,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "transformers_version": "4.44.2",
+  "transformers_version": "4.45.0.dev0",
   "use_cache": false
 }
logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726128095.46d00238c241 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:05049f5d87e4a7ddb650af801d9766520fbc61a7d71713dd3954ec73157c8e85
-size 1242037
+oid sha256:f78bc4a2160f151a0d8f9bdf66792688f7bc2240e5b7bd32ad62225dc932b183
+size 1378148
logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726148603.46d00238c241 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7ca17bc388a40ebfeee488d1e80d7a08e2f9ca04365e57be9d8a1de4a8cf834
+size 253
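The two `events.out.tfevents.*` files are TensorBoard event logs (the second, at 253 bytes, is likely just the file header from a short follow-up run). They can be inspected with TensorBoard's event reader; a sketch, assuming `tensorboard` is installed:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point at the run directory containing the events.out.tfevents.* files.
acc = EventAccumulator(
    "logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2"
)
acc.Reload()
print(acc.Tags())  # available scalar/tensor tags recorded during training
```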
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4138576611832b1bbe77dc349189271d76106c052e215808da84cf40e98bd034
+oid sha256:f4d35e5221c1a9340eef663c0674f7b544ed09e4f95457f0c98eb1c255a2acce
 size 162842416
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:555ceb45b35190b72c7d1a0f76611e172bacea5f42f7a7ad17b8bf6a95802281
-size 5432
+oid sha256:738e9326fbb30717baf30f64578731beac353bb8abd9108ce8783db4565fa56e
+size 5368
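The binary artifacts above are stored as Git LFS pointers: each three-line file records the pointer spec version, the `sha256` digest (`oid`) of the real content, and its size in bytes. After downloading the actual files, the digests can be verified; a small sketch:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex-encoded sha256 digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Expected value taken from the training_args.bin pointer above.
expected = "738e9326fbb30717baf30f64578731beac353bb8abd9108ce8783db4565fa56e"
assert sha256_of("training_args.bin") == expected
```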