End of training
- README.md +17 -9
- config.json +2 -1
- generation_config.json +1 -1
- logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726128095.46d00238c241 +2 -2
- logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726148603.46d00238c241 +3 -0
- model.safetensors +1 -1
- training_args.bin +2 -2
README.md CHANGED
@@ -78,12 +78,12 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use:
+- Max Train VRAM Use: 20.9417 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
 - CPUs: 64
-- CPU Memory: 251.
+- CPU Memory: 251.7190 GB
 - CPU Memory Bandwidth: 1600 GB/s
 
 # Distillation (Teacher -> Student) Architecture Difference:
@@ -115,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,
+Trained on 84,851,671 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
@@ -134,7 +134,11 @@ DistillationObjective(
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=0
+        weight=5.0,
+        loss_fn='raw_mse',
+        layer_mapper='layer-2',
+        norm='layernorm_teacher_only_affine',
+        projector='mlp'
     )
 )
 ```
@@ -146,7 +150,7 @@ The following hyperparameters were used during training:
 <summary>Expand</summary>
 
 - learning_rate: `0.0002`
-- train_batch_size: `
+- train_batch_size: `2`
 - eval_batch_size: `2`
 - seed: `42`
 - optimizer: `Adam with betas=(0.9,0.999) and epsilon=1e-08`
@@ -161,10 +165,14 @@ The following hyperparameters were used during training:
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=0
+        weight=5.0,
+        loss_fn='raw_mse',
+        layer_mapper='layer-2',
+        norm='layernorm_teacher_only_affine',
+        projector='mlp'
     )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6ad1b142e0>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
@@ -197,6 +205,6 @@ The following hyperparameters were used during training:
 
 # Framework Versions
 - Distily 0.5.0
-- Transformers 4.
+- Transformers 4.45.0.dev0
 - Pytorch 2.5.0.dev20240911+cu121
-- Datasets
+- Datasets 3.0.0
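The substance of this commit is the attn_loss_component change: the attention-distillation term goes from disabled (weight=0) to an active MSE loss fed through an MLP projector. Distily's implementation of `raw_mse`, `layer-2`, and `layernorm_teacher_only_affine` is not part of this diff, so the sketch below is only one plausible reading of those names; the layer mapping, the normalization, and the projector shape are all assumptions.

```python
# Hypothetical sketch of the newly enabled attention loss. Distily's actual
# code is not shown in this diff; the layer mapping, normalization, and
# projector architecture below are guesses based on the config names.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLPProjector(nn.Module):
    """projector='mlp': lifts student attention features into the teacher's width."""
    def __init__(self, student_dim: int, teacher_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(student_dim, teacher_dim),
            nn.GELU(),
            nn.Linear(teacher_dim, teacher_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

def attn_loss(student_feats, teacher_feats, projector, weight=5.0):
    # 'layer-2' read here as a 2:1 student->teacher layer mapping (a 15-layer
    # student against a deeper teacher); this interpretation is a guess.
    total = torch.zeros(())
    for i, s in enumerate(student_feats):
        t = teacher_feats[min(2 * i, len(teacher_feats) - 1)]
        # Stand-in for 'layernorm_teacher_only_affine': normalize only the
        # teacher-side target before comparing raw (unsoftmaxed) features.
        t = F.layer_norm(t, t.shape[-1:])
        total = total + F.mse_loss(projector(s), t)
    return weight * total / len(student_feats)
```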
config.json CHANGED
@@ -7,6 +7,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 0,
+  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 576,
   "initializer_range": 0.02,
@@ -23,7 +24,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.45.0.dev0",
   "use_cache": false,
   "vocab_size": 49152
 }
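Besides the version bump, the config now states `head_dim: 64` explicitly (consistent with a 576-wide model split into 9 heads, though the head count sits outside this hunk, so that arithmetic is an inference). A quick sanity check, sketched with a placeholder repo id:

```python
from transformers import AutoConfig

# The repo id below is a placeholder, not the repository this commit lives in.
config = AutoConfig.from_pretrained("user/distilled-smollm")
print(config.head_dim)      # 64, now stated explicitly in config.json
print(config.hidden_size)   # 576
# If the usual derivation holds, hidden_size // num_attention_heads == head_dim.
print(config.hidden_size // config.num_attention_heads)
```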
generation_config.json CHANGED
@@ -2,6 +2,6 @@
   "_from_model_config": true,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "transformers_version": "4.
+  "transformers_version": "4.45.0.dev0",
   "use_cache": false
 }
logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726128095.46d00238c241 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f78bc4a2160f151a0d8f9bdf66792688f7bc2240e5b7bd32ad62225dc932b183
+size 1378148
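This file, like the other binary artifacts below, is tracked with Git LFS, so the repo diff only touches the three-line pointer: spec version, sha256 oid, and byte size. A minimal sketch for checking a downloaded copy against its pointer; the local path is a placeholder:

```python
import hashlib
import os

def verify_lfs_pointer(local_path: str, expected_oid: str, expected_size: int) -> bool:
    """Compare a downloaded file against the oid/size from its LFS pointer."""
    if os.path.getsize(local_path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid

# oid and size taken from the updated pointer above; the filename assumes
# the event file has been downloaded into the working directory.
print(verify_lfs_pointer(
    "events.out.tfevents.1726128095.46d00238c241",
    "f78bc4a2160f151a0d8f9bdf66792688f7bc2240e5b7bd32ad62225dc932b183",
    1378148,
))
```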
logs/attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2/events.out.tfevents.1726148603.46d00238c241 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7ca17bc388a40ebfeee488d1e80d7a08e2f9ca04365e57be9d8a1de4a8cf834
+size 253
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f4d35e5221c1a9340eef663c0674f7b544ed09e4f95457f0c98eb1c255a2acce
 size 162842416
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:738e9326fbb30717baf30f64578731beac353bb8abd9108ce8783db4565fa56e
+size 5368
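`training_args.bin` is a pickled `TrainingArguments` object, so the hyperparameters listed in the README diff can be read back out of it. A minimal sketch, assuming the file has been downloaded locally (recent PyTorch needs `weights_only=False` to unpickle arbitrary objects):

```python
import torch

# training_args.bin holds a pickled transformers TrainingArguments object.
args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size)  # expected: 2
print(args.learning_rate)                # expected: 0.0002
print(args.seed)                         # expected: 42
```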