lapp0 committed
Commit 0ad7340
1 Parent(s): 1553098

Training in progress, step 5000

README.md CHANGED
@@ -78,12 +78,12 @@ LlamaForCausalLM(
 
 # Resource Usage
 
-- Max Train VRAM Use: 20.9417 GB
+- Max Train VRAM Use: 12.7946 GB
 - Available VRAM: 23.4329 GB
 - GPUs:
   - 1x NVIDIA GeForce RTX 4090
 - CPUs: 64
-- CPU Memory: 251.7190 GB
+- CPU Memory: 251.7299 GB
 - CPU Memory Bandwidth: 1600 GB/s
 
 # Distillation (Teacher -> Student) Architecture Difference:
@@ -115,7 +115,7 @@ LlamaForCausalLM(
 <br/>
 
 # Train Dataset
-Trained on 84,851,671 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
+Trained on 84,871,894 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 
 - Num Samples: `99,800`
 - Subset: `20231101.en`
@@ -134,11 +134,7 @@ DistillationObjective(
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=5.0,
-        loss_fn='raw_mse',
-        layer_mapper='layer-2',
-        norm='layernorm_teacher_only_affine',
-        projector='mlp'
+        weight=0
     )
 )
 ```
@@ -150,7 +146,7 @@ The following hyperparameters were used during training:
 <summary>Expand</summary>
 
 - learning_rate: `0.0002`
-- train_batch_size: `2`
+- train_batch_size: `4`
 - eval_batch_size: `2`
 - seed: `42`
 - optimizer: `Adam with betas=(0.9,0.999) and epsilon=1e-08`
@@ -165,14 +161,10 @@ The following hyperparameters were used during training:
         weight=0
     ),
     attn_loss_component=LossComponent(
-        weight=5.0,
-        loss_fn='raw_mse',
-        layer_mapper='layer-2',
-        norm='layernorm_teacher_only_affine',
-        projector='mlp'
+        weight=0
     )
 )`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6ad1b142e0>`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7eb253ff9660>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `{'num_hidden_layers': 15}`
@@ -205,6 +197,6 @@ The following hyperparameters were used during training:
 
 # Framework Versions
 - Distily 0.5.0
-- Transformers 4.45.0.dev0
+- Transformers 4.44.2
 - Pytorch 2.5.0.dev20240911+cu121
-- Datasets 3.0.0
+- Datasets 2.21.0
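
Net effect of the README changes: the attention-distillation term was switched off between these runs. attn_loss_component went from weight=5.0 (with loss_fn='raw_mse', layer_mapper='layer-2', norm='layernorm_teacher_only_affine', projector='mlp') down to weight=0, leaving the logits term as the only active objective, while train_batch_size doubled from 2 to 4; max train VRAM still dropped from 20.9 GB to 12.8 GB, consistent with the attention-loss projector and its activations no longer being in memory. Below is a minimal sketch of what such a weighted multi-component objective computes. The helper is illustrative rather than Distily's actual implementation; the KL choice for the logits term and the plain per-layer zip are assumptions.

```python
import torch
import torch.nn.functional as F

def distillation_objective(student_out, teacher_out,
                           logits_weight=1.0, attn_weight=0.0):
    """Illustrative weighted-sum distillation objective.

    With attn_weight=0 (this commit), only the logits term contributes;
    attn_weight=5.0 reproduces the parent commit's active attention term.
    """
    # Logits component: KL divergence between teacher and student token
    # distributions (a common choice; the actual loss_fn Distily uses for
    # the logits component is not visible in this diff).
    loss = logits_weight * F.kl_div(
        F.log_softmax(student_out.logits, dim=-1),
        F.softmax(teacher_out.logits, dim=-1),
        reduction="batchmean",
    )

    if attn_weight > 0:
        # The disabled component matched attention maps with raw MSE after a
        # 'layer-2' layer mapping, teacher-only-affine layernorm, and an MLP
        # projector (per the deleted README lines). A plain zip over layers
        # stands in for that machinery here.
        attn_losses = [
            F.mse_loss(s, t)
            for s, t in zip(student_out.attentions, teacher_out.attentions)
        ]
        loss = loss + attn_weight * torch.stack(attn_losses).mean()

    return loss
```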
config.json CHANGED
@@ -7,7 +7,6 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 576,
   "initializer_range": 0.02,
@@ -24,7 +23,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.45.0.dev0",
+  "transformers_version": "4.44.2",
   "use_cache": false,
   "vocab_size": 49152
 }
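
The "head_dim" key disappearing is consistent with the Transformers downgrade recorded in the same commit: the 4.45 development branch serializes an explicit head_dim for Llama configs, while 4.44.2 derives the head size from hidden_size and num_attention_heads instead. A quick sanity check under that assumption (num_attention_heads is outside the visible hunks and is inferred here):

```python
# Llama-style attention without an explicit "head_dim" derives it as
# hidden_size // num_attention_heads. num_attention_heads is not shown in
# this diff; 9 is inferred from the deleted "head_dim": 64 and hidden_size 576.
hidden_size = 576
num_attention_heads = 9  # assumed value
assert hidden_size // num_attention_heads == 64  # matches the removed key
```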
generation_config.json CHANGED
@@ -2,6 +2,6 @@
   "_from_model_config": true,
   "bos_token_id": 0,
   "eos_token_id": 0,
-  "transformers_version": "4.45.0.dev0",
+  "transformers_version": "4.44.2",
   "use_cache": false
 }
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/completed.flag ADDED
File without changes
logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False/events.out.tfevents.1726148253.1c1a426a2fee CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf4731dea58e40db4e1b8a523b91f2b8e9b403d55da8ebebf39d902946255bab
-size 253
+oid sha256:aae803ff6dd328699919493928d13e384e3295111e1bbc7544b79d04a10f8b4c
+size 529

logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155371.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78a4c0b3458c4104b6611e0e2f78d4bd512b7d55a41ef1d8d2b882d2159db624
+size 5616

logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True/events.out.tfevents.1726155767.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3dfda7795f8dca11e9c2853b47db79f8d5e4c59a5bc1a0bc9c83322f37addd8
+size 5616

logs/attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_torch_compile, student_model_use_liger=False, torch_compile=True/events.out.tfevents.1726156182.1c1a426a2fee ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:644939ffd31e8ce5fa6b45b761c3446ed209e14a56b40929a54cb540fd206e39
+size 140550
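
The new log directories name the ablation grid this commit runs at batch size 4: a no-Liger baseline plus logits-only runs with Liger kernels and/or torch.compile enabled. A rough sketch of how such student variants might be constructed, assuming the liger-kernel package's Llama patch and a placeholder base model (the actual Distily wiring is not shown in this commit):

```python
import torch
from transformers import AutoModelForCausalLM

# Assumption: the liger-kernel package is installed. Its Llama patch swaps in
# fused RMSNorm/SwiGLU/RoPE/cross-entropy kernels on the transformers module.
from liger_kernel.transformers import apply_liger_kernel_to_llama

def build_student(use_liger: bool, use_compile: bool):
    if use_liger:
        apply_liger_kernel_to_llama()  # patch before model construction
    model = AutoModelForCausalLM.from_pretrained(
        "HuggingFaceTB/SmolLM-135M",  # hypothetical base; not named in the diff
        torch_dtype=torch.bfloat16,
    )
    if use_compile:
        model = torch.compile(model)
    return model

# The log directories correspond to combinations such as:
# build_student(use_liger=False, use_compile=False)  # bs4_NO_liger_baseline
# build_student(use_liger=True,  use_compile=True)   # logits_bs4_liger_torch_compile
# build_student(use_liger=False, use_compile=True)   # logits_bs4_torch_compile
```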
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f4d35e5221c1a9340eef663c0674f7b544ed09e4f95457f0c98eb1c255a2acce
+oid sha256:6c1db03f19d5282f8261330094f26b6cf7d4178ec35cb3b13b390167637a7fdc
 size 162842416
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:738e9326fbb30717baf30f64578731beac353bb8abd9108ce8783db4565fa56e
-size 5368
+oid sha256:3127321d43c3138dea3921e8b46cf74e63fb22ba80f0338e4425f98d3dd416cd
+size 5432
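
The binary files above are stored via Git LFS, so the diff shows only pointer text (spec version, content sha256, byte size) rather than payloads; the new model.safetensors hash is the step-5000 checkpoint, and training_args.bin grew slightly with the changed arguments. A sketch of fetching and inspecting them with huggingface_hub, using a placeholder repo id since the repository name does not appear in this diff:

```python
import torch
from huggingface_hub import hf_hub_download
from safetensors import safe_open

REPO_ID = "lapp0/distily-checkpoint"  # placeholder; real repo id not in the diff

# hf_hub_download resolves the LFS pointer and returns the actual payload,
# pinned here to this commit via its revision hash.
weights_path = hf_hub_download(REPO_ID, "model.safetensors", revision="0ad7340")
args_path = hf_hub_download(REPO_ID, "training_args.bin", revision="0ad7340")

# Inspect tensor names and shapes without loading everything into memory.
with safe_open(weights_path, framework="pt") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())

# training_args.bin is a torch-serialized TrainingArguments object.
training_args = torch.load(args_path, weights_only=False)
print(training_args.per_device_train_batch_size)  # 4 in this commit
```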