Training in progress, step 5000
- README.md +9 -17
- config.json +1 -2
- generation_config.json +1 -1
- logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/completed.flag +0 -0
- logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/events.out.tfevents.1726148253.1c1a426a2fee +2 -2
- logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155371.1c1a426a2fee +3 -0
- logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155767.1c1a426a2fee +3 -0
- logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_torch_compile, student_model_use_liger=False, torch_compile=True/events.out.tfevents.1726156182.1c1a426a2fee +3 -0
- model.safetensors +1 -1
- training_args.bin +2 -2
README.md
CHANGED
@@ -78,12 +78,12 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use:
+- Max Train VRAM Use: 12.7946 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
 - CPUs: 64
-- CPU Memory: 251.
+- CPU Memory: 251.7299 GB
 - CPU Memory Bandwidth: 1600 GB/s
 
 # Distillation (Teacher -> Student) Architecture Difference:
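The "Max Train VRAM Use" statistic above is the kind of peak-memory figure PyTorch exposes directly. A minimal sketch of reading it, illustrative only (Distily's own measurement code is not shown here):

```python
import torch

# Reset the counter, run training, then read the peak tensor
# allocation in GB. Illustrative sketch; not necessarily how
# Distily measures it.
torch.cuda.reset_peak_memory_stats()
# ... run training steps here ...
peak_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"Max Train VRAM Use: {peak_gb:.4f} GB")
```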
@@ -115,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,
+Trained on 84,871,894 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
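The dataset referenced above is on the Hugging Face Hub and loads with the `datasets` library; a minimal sketch (the split handling and sample selection are assumptions, not taken from Distily):

```python
from datasets import load_dataset

# wikimedia/wikipedia, subset 20231101.en, as listed in the README.
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
subset = ds.select(range(99_800))  # "Num Samples: 99,800" from above
```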
@@ -134,11 +134,7 @@ DistillationObjective(
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=
-        loss_fn='raw_mse',
-        layer_mapper='layer-2',
-        norm='layernorm_teacher_only_affine',
-        projector='mlp'
+        weight=0
     )
 )
 ```
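The diff above collapses the attention-loss component to `weight=0`, dropping its `loss_fn`, `layer_mapper`, `norm`, and `projector` settings. A minimal sketch of how such weighted loss components are typically combined, assuming a plain weighted sum (illustrative; this is not Distily's actual `DistillationObjective` implementation):

```python
import torch

def combine_losses(logits_loss: torch.Tensor,
                   attn_loss: torch.Tensor,
                   logits_weight: float = 1.0,
                   attn_weight: float = 0.0) -> torch.Tensor:
    # With attn_weight=0, as in this run, the attention term drops out,
    # so its loss_fn/layer_mapper/norm/projector settings are moot.
    total = logits_weight * logits_loss
    if attn_weight != 0.0:
        total = total + attn_weight * attn_loss
    return total
```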
@@ -150,7 +146,7 @@ The following hyperparameters were used during training:
 <summary>Expand</summary>
 
 - learning_rate: `0.0002`
-- train_batch_size: `
+- train_batch_size: `4`
 - eval_batch_size: `2`
 - seed: `42`
 - optimizer: `Adam with betas=(0.9,0.999) and epsilon=1e-08`
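The optimizer entry above maps onto standard PyTorch constructors; a minimal sketch with the listed values (the stand-in model and the constant `lr_lambda` are assumptions, since the actual schedule behind the `LambdaLR` repr is not shown):

```python
import torch

model = torch.nn.Linear(8, 8)  # stand-in; the real student model goes here
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0002,           # learning_rate from the list above
    betas=(0.9, 0.999),
    eps=1e-08,
)
# Only a LambdaLR repr appears in the README; a constant schedule
# stands in for the unknown lambda.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
```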
@@ -165,14 +161,10 @@ The following hyperparameters were used during training:
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=
-        loss_fn='raw_mse',
-        layer_mapper='layer-2',
-        norm='layernorm_teacher_only_affine',
-        projector='mlp'
+        weight=0
     )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7eb253ff9660>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
@@ -205,6 +197,6 @@ The following hyperparameters were used during training:
 
 # Framework Versions
 - Distily 0.5.0
-- Transformers 4.
+- Transformers 4.44.2
 - Pytorch 2.5.0.dev20240911+cu121
-- Datasets
+- Datasets 2.21.0
config.json
CHANGED
@@ -7,7 +7,6 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 576,
   "initializer_range": 0.02,
@@ -24,7 +23,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.44.2",
   "use_cache": false,
   "vocab_size": 49152
 }
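The removed `"head_dim": 64` is recoverable from the remaining fields, since Llama-style configs derive it as `hidden_size // num_attention_heads` when it is not set explicitly. A one-line check (the head count of 9 is an assumption; it is not visible in this diff excerpt):

```python
hidden_size = 576        # from config.json above
num_attention_heads = 9  # assumed; not shown in this excerpt
assert hidden_size // num_attention_heads == 64  # the removed head_dim
```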
generation_config.json
CHANGED
@@ -2,6 +2,6 @@
   "_from_model_config": true,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "transformers_version": "4.
+  "transformers_version": "4.44.2",
   "use_cache": false
 }
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/completed.flag
ADDED
File without changes
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/events.out.tfevents.1726148253.1c1a426a2fee
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aae803ff6dd328699919493928d13e384e3295111e1bbc7544b79d04a10f8b4c
+size 529
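The three-line files in these diffs are Git LFS pointers: a version line, the `oid sha256:` digest of the real file, and its byte `size`. A minimal sketch of verifying a downloaded file against its pointer (a hypothetical helper, not part of Distily or git-lfs):

```python
import hashlib
import os

def matches_lfs_pointer(path: str, oid: str, size: int) -> bool:
    """Compare a local file's size and sha256 to a Git LFS pointer."""
    if os.path.getsize(path) != size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid
```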
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155371.1c1a426a2fee
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78a4c0b3458c4104b6611e0e2f78d4bd512b7d55a41ef1d8d2b882d2159db624
+size 5616
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155767.1c1a426a2fee
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3dfda7795f8dca11e9c2853b47db79f8d5e4c59a5bc1a0bc9c83322f37addd8
+size 5616
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_torch_compile, student_model_use_liger=False, torch_compile=True/events.out.tfevents.1726156182.1c1a426a2fee
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:644939ffd31e8ce5fa6b45b761c3446ed209e14a56b40929a54cb540fd206e39
+size 140550
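The new run names encode the variables under comparison: `student_model_use_liger` and `torch_compile`. A minimal sketch of how those two switches are commonly applied to a Llama-family student model (illustrative; this is not Distily's code, though `apply_liger_kernel_to_llama` is the patch entry point published by the `liger-kernel` package, and the checkpoint name is an assumed stand-in):

```python
import torch
from liger_kernel.transformers import apply_liger_kernel_to_llama
from transformers import AutoModelForCausalLM

# student_model_use_liger=True: patch transformers' Llama modules with
# Liger kernels before instantiating the model.
apply_liger_kernel_to_llama()

# Assumed checkpoint, for illustration only.
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M")

# torch_compile=True: compile the model's forward pass.
model = torch.compile(model)
```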
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6c1db03f19d5282f8261330094f26b6cf7d4178ec35cb3b13b390167637a7fdc
 size 162842416
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3127321d43c3138dea3921e8b46cf74e63fb22ba80f0338e4425f98d3dd416cd
+size 5432