lapp0 committed
Commit 58277b2 (parent: c4b616f)

Training in progress, step 24750

Files changed (10)
  1. README.md +34 -35
  2. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_cosine_distance_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss())/events.out.tfevents.1723624186.93d6cbb3ad53 +2 -2
  3. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723624352.93d6cbb3ad53 +3 -0
  4. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723624893.93d6cbb3ad53 +3 -0
  5. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723625572.93d6cbb3ad53 +3 -0
  6. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723625795.93d6cbb3ad53 +3 -0
  7. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723634352.93d6cbb3ad53 +3 -0
  8. logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_kl_divergence_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723634546.93d6cbb3ad53 +3 -0
  9. model.safetensors +1 -1
  10. training_args.bin +1 -1
README.md CHANGED
@@ -1,7 +1,6 @@
  ---
  base_model: gpt2
  library_name: Distily
- license: mit
  tags:
  - generated_from_trainer
  model-index:
@@ -16,13 +15,13 @@ This student model is distilled from the teacher model [gpt2](https://huggingfac
  The [Distily](https://github.com/lapp0/distily) library was used for this distillation.

  It achieves the following results on the evaluation set:
- - eval_enwikippl: 218.4545
- - eval_frwikippl: 1236.6089
- - eval_zhwikippl: 614.7109
- - eval_loss: 1.3631
- - eval_runtime: 34.6994
- - eval_samples_per_second: 57.638
- - eval_steps_per_second: 7.205
+ - eval_enwikippl: 300.6556
+ - eval_frwikippl: 1956.3597
+ - eval_zhwikippl: 1863.9412
+ - eval_loss: 1.5080
+ - eval_runtime: 36.2633
+ - eval_samples_per_second: 55.152
+ - eval_steps_per_second: 6.894

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment.
@@ -45,7 +44,7 @@ More information needed
  ### Training hyperparameters

  The following hyperparameters were used during training:
- - distillation_objective: MultiObjective(logits_weight=1, logits_loss_fn=(fn:kl_divergence_loss()), hs_weight=0.2, hs_loss_fn=(fn:cosine_distance_loss()), attn_weight=0, attn_loss_fn=(fn:soft_mse_loss()))
+ - distillation_objective: MultiObjective(logits_weight=1, logits_loss_fn=(fn:kl_divergence_loss()), hs_weight=0.2, hs_loss_fn=(fn:jsd_loss()), attn_weight=0, attn_loss_fn=(fn:soft_mse_loss()))
  - train_embeddings: True
  - learning_rate: 4e-05
  - train_batch_size: 8
@@ -62,32 +61,32 @@ Peak GPU Memory: 8.0903 GB
  | step | epoch | enwikippl | frwikippl | loss | runtime | samples_per_second | steps_per_second | zhwikippl |
  | --- | --- | --- | --- | --- | --- | --- | --- | --- |
  | **teacher eval** | | 30.2086 | 57.2728 | | | | | 18.1784 |
- | 0 | 0 | 56797.875 | 58468.6992 | 6.1585 | 34.5637 | 57.864 | 7.233 | 59002.2891 |
- | 1000 | 0.0404 | 708.5766 | 4694.9590 | 2.0551 | 34.6903 | 57.653 | 7.207 | 18355.4062 |
- | 2000 | 0.0808 | 503.3047 | 3185.3564 | 1.8542 | 34.8131 | 57.45 | 7.181 | 2047.6730 |
- | 3000 | 0.1212 | 420.6891 | 2758.6240 | 1.7367 | 34.8612 | 57.37 | 7.171 | 1283.1558 |
- | 4000 | 0.1616 | 367.8625 | 2401.8950 | 1.6477 | 34.7075 | 57.624 | 7.203 | 973.7838 |
- | 5000 | 0.2020 | 320.5472 | 1856.9045 | 1.5766 | 34.6934 | 57.648 | 7.206 | 916.7459 |
- | 6000 | 0.2424 | 282.7001 | 1619.0619 | 1.5071 | 34.9019 | 57.303 | 7.163 | 924.1207 |
- | 7000 | 0.2828 | 255.9720 | 1427.6995 | 1.4560 | 34.8182 | 57.441 | 7.18 | 1441.6195 |
- | 8000 | 0.3232 | 237.3258 | 1286.0482 | 1.4053 | 34.7475 | 57.558 | 7.195 | 814.6700 |
- | 9000 | 0.3636 | 218.4545 | 1236.6089 | 1.3631 | 34.6994 | 57.638 | 7.205 | 614.7109 |
- | 10000 | 0.4040 | 203.9934 | 1223.5991 | 1.3192 | 34.7173 | 57.608 | 7.201 | 549.0444 |
- | 11000 | 0.4444 | 187.8893 | 1096.1538 | 1.2761 | 34.5898 | 57.821 | 7.228 | 637.4493 |
- | 12000 | 0.4848 | 172.7610 | 1002.2672 | 1.2335 | 34.6239 | 57.764 | 7.22 | 559.7791 |
- | 13000 | 0.5253 | 159.2338 | 890.8124 | 1.1992 | 34.7167 | 57.609 | 7.201 | 579.3239 |
- | 14000 | 0.5657 | 153.4912 | 829.4653 | 1.1654 | 34.7669 | 57.526 | 7.191 | 670.8995 |
- | 15000 | 0.6061 | 145.7548 | 784.8597 | 1.1471 | 34.7193 | 57.605 | 7.201 | 650.9545 |
- | 16000 | 0.6465 | 144.2797 | 727.8268 | 1.1271 | 34.8277 | 57.426 | 7.178 | 545.4637 |
- | 17000 | 0.6869 | 137.5085 | 734.4252 | 1.1099 | 34.6222 | 57.766 | 7.221 | 613.1535 |
- | 18000 | 0.7273 | 137.3485 | 726.0842 | 1.1051 | 34.7716 | 57.518 | 7.19 | 584.0622 |
- | 19000 | 0.7677 | 133.5828 | 694.8320 | 1.0903 | 34.5719 | 57.851 | 7.231 | 406.3925 |
- | 20000 | 0.8081 | 130.7299 | 687.1347 | 1.0779 | 34.6669 | 57.692 | 7.211 | 398.8122 |
- | 21000 | 0.8485 | 128.6154 | 632.9050 | 1.0672 | 34.6196 | 57.771 | 7.221 | 464.8858 |
- | 22000 | 0.8889 | 127.9977 | 623.7347 | 1.0636 | 34.6887 | 57.656 | 7.207 | 495.0645 |
- | 23000 | 0.9293 | 127.1556 | 660.9055 | 1.0580 | 34.7945 | 57.48 | 7.185 | 299.3615 |
- | 24000 | 0.9697 | 124.0927 | 611.2403 | 1.0467 | 34.6542 | 57.713 | 7.214 | 312.8069 |
- | 24750 | 1.0 | 123.3816 | 660.7193 | 1.0458 | 34.7015 | 57.634 | 7.204 | 324.2909 |
+ | 0 | 0 | 58489.7812 | 56515.125 | 23.3780 | 36.5332 | 54.745 | 6.843 | 57894.0352 |
+ | 1000 | 0.0404 | 1043.9308 | 5520.7402 | 2.3618 | 36.284 | 55.121 | 6.89 | 18612.1113 |
+ | 2000 | 0.0808 | 741.2775 | 3967.9961 | 2.0659 | 36.0061 | 55.546 | 6.943 | 5552.3940 |
+ | 3000 | 0.1212 | 605.6614 | 3446.1201 | 1.9370 | 36.1801 | 55.279 | 6.91 | 3337.375 |
+ | 4000 | 0.1616 | 522.5402 | 3105.9614 | 1.8315 | 36.1309 | 55.354 | 6.919 | 2158.2932 |
+ | 5000 | 0.2020 | 457.7764 | 2678.5098 | 1.7492 | 36.1083 | 55.389 | 6.924 | 2813.0127 |
+ | 6000 | 0.2424 | 401.6627 | 2722.6836 | 1.6794 | 36.1519 | 55.322 | 6.915 | 1611.4553 |
+ | 7000 | 0.2828 | 354.7337 | 2500.7419 | 1.6117 | 36.2282 | 55.206 | 6.901 | 2297.4802 |
+ | 8000 | 0.3232 | 326.1971 | 2137.8101 | 1.5573 | 36.0207 | 55.524 | 6.94 | 1784.0629 |
+ | 9000 | 0.3636 | 300.6556 | 1956.3597 | 1.5080 | 36.2633 | 55.152 | 6.894 | 1863.9412 |
+ | 10000 | 0.4040 | 277.5233 | 1719.5438 | 1.4690 | 36.6308 | 54.599 | 6.825 | 1311.5671 |
+ | 11000 | 0.4444 | 259.9179 | 1497.8167 | 1.4308 | 36.3181 | 55.069 | 6.884 | 792.9883 |
+ | 12000 | 0.4848 | 245.9564 | 1497.3939 | 1.3959 | 36.4046 | 54.938 | 6.867 | 1003.8881 |
+ | 13000 | 0.5253 | 229.0869 | 1452.4731 | 1.3629 | 36.6598 | 54.556 | 6.819 | 1079.2407 |
+ | 14000 | 0.5657 | 217.4221 | 1300.0883 | 1.3298 | 36.152 | 55.322 | 6.915 | 1190.3823 |
+ | 15000 | 0.6061 | 205.3283 | 1161.2318 | 1.3009 | 36.1689 | 55.296 | 6.912 | 949.3873 |
+ | 16000 | 0.6465 | 198.7860 | 1095.5355 | 1.2807 | 36.2562 | 55.163 | 6.895 | 1139.3685 |
+ | 17000 | 0.6869 | 192.0938 | 1026.8737 | 1.2628 | 36.298 | 55.099 | 6.887 | 827.6089 |
+ | 18000 | 0.7273 | 182.0580 | 986.5624 | 1.2441 | 36.1201 | 55.371 | 6.921 | 1010.3434 |
+ | 19000 | 0.7677 | 178.0731 | 975.4955 | 1.2283 | 36.1677 | 55.298 | 6.912 | 872.3183 |
+ | 20000 | 0.8081 | 175.3561 | 970.5560 | 1.2150 | 36.1615 | 55.307 | 6.913 | 865.3570 |
+ | 21000 | 0.8485 | 171.6644 | 930.0918 | 1.2089 | 35.9764 | 55.592 | 6.949 | 832.4859 |
+ | 22000 | 0.8889 | 168.4032 | 871.8605 | 1.1983 | 35.8999 | 55.71 | 6.964 | 733.7902 |
+ | 23000 | 0.9293 | 167.6074 | 855.1790 | 1.1917 | 35.9055 | 55.702 | 6.963 | 772.9152 |
+ | 24000 | 0.9697 | 166.0399 | 822.7090 | 1.1858 | 35.9815 | 55.584 | 6.948 | 620.6498 |
+ | 24750 | 1.0 | 162.6707 | 952.2545 | 1.1803 | 36.0128 | 55.536 | 6.942 | 582.5044 |

  ### Framework versions
  - Distily 0.2.0
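The substantive change in this commit is the hidden-state objective: `hs_loss_fn` moves from `cosine_distance_loss` to `jsd_loss`, and the README metrics now describe the JSD run (the `eval_*wikippl` values are perplexities on English, French, and Chinese Wikipedia text). As a rough illustration of what a `MultiObjective` with these weights computes, here is a minimal PyTorch sketch. It is not Distily's implementation: the function names only mirror the config strings above, and applying JSD to softmax-normalized hidden states is an assumption.

```python
# Illustrative sketch only, not Distily's actual MultiObjective. Weights
# mirror the config above (logits_weight=1, hs_weight=0.2, attn_weight=0);
# how jsd_loss treats hidden states is an assumption.
import torch
import torch.nn.functional as F

def kl_divergence_loss(student_logits, teacher_logits, T=1.0):
    # KL(teacher || student) over the vocabulary distribution.
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * T ** 2

def jsd_loss(student_hs, teacher_hs):
    # Jensen-Shannon divergence between softmax-normalized hidden states,
    # the term this commit swaps in for cosine_distance_loss.
    p = F.softmax(student_hs, dim=-1)
    q = F.softmax(teacher_hs, dim=-1)
    log_m = (0.5 * (p + q)).log()
    return 0.5 * (F.kl_div(log_m, p, reduction="batchmean")
                  + F.kl_div(log_m, q, reduction="batchmean"))

def multi_objective(student_out, teacher_out,
                    logits_weight=1.0, hs_weight=0.2, attn_weight=0.0):
    # student_out/teacher_out are assumed to carry .logits and
    # .hidden_states, as Hugging Face model outputs do.
    loss = logits_weight * kl_divergence_loss(student_out.logits,
                                              teacher_out.logits)
    # Hidden-state matching, averaged over layers.
    hs_terms = [jsd_loss(s, t) for s, t in zip(student_out.hidden_states,
                                               teacher_out.hidden_states)]
    loss = loss + hs_weight * torch.stack(hs_terms).mean()
    # attn_weight is 0 in this run, so the attention term drops out.
    return loss
```

With `hs_weight=0.2`, the logit KL term stays dominant; the hidden-state term only nudges intermediate representations toward the teacher's.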
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_cosine_distance_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss())/events.out.tfevents.1723624186.93d6cbb3ad53 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e543bf9f554af69b4bdb8f902684ee393e811039db60244832ff488ea8f96ff9
- size 253
+ oid sha256:8c812f40574081af2f911c00e1d803eb544b5aaf4d7e1239a9e5d18fe275cae5
+ size 529
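Every remaining file in this commit is a Git LFS pointer: the repository tracks a three-line stub (spec version, SHA-256 object id, byte size) while the binary payload lives in LFS storage, which is why these diffs stay tiny even for multi-hundred-megabyte files. A minimal sketch of parsing such a pointer; `read_lfs_pointer` is a hypothetical helper, not part of any library:

```python
# Sketch: parse a Git LFS pointer file like the ones diffed in this commit.
# read_lfs_pointer is a hypothetical helper, not a real library function.
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

# On a clone without `git lfs pull`, model.safetensors is still a pointer:
# read_lfs_pointer("model.safetensors")
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': '6c9495ea...', 'size': 248894656}
```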
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723624352.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2cf389b22849320df350f590391862f3d81d18451b853952e1e20d795f8f33ba
+ size 140444
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723624893.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bbc003d568954661e2cce676e951baf363ce661581526d58416a8846b8c697ea
+ size 392505
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723625572.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9df6014e1d1cc837e716249f37cc93fccf6453718df7662544cd3918bc80763
+ size 6138
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723625795.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ccfdd9cc7f0da84d8d4b099b13f89c534ce996276af1ca2465efd6ab8965d651
+ size 6729486
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_jsd_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723634352.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78a518ecdf536f91b864699aec9f3269302a513666b06b219f49af1ab80e0af7
+ size 529
logs/distillation_objective=MultiObjective(logits_weight_1__logits_loss_fn_(fn_kl_divergence_loss())__hs_weight_0.2__hs_loss_fn_(fn_kl_divergence_loss())__attn_weight_0__attn_loss_fn_(fn_soft_mse_loss()))/events.out.tfevents.1723634546.93d6cbb3ad53 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac26eff556cbd575454d912b8c245925c1d9e3dbafb1b444ed7475ab9e06f57c
+ size 6729496
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db0b0e8b780774b77126560ee93d90c6756c36c085e52f55c47a4755704d2d50
+ oid sha256:6c9495ea14e6c2ebedf6ebe73240d587d782b4105889f51455a8e7e4254f6475
  size 248894656
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:75f2fc1365ecfa56fcff8f5f9cfb87cc90efd424689a7b7bfc87e7965be55020
+ oid sha256:ed6853059e8b211ba0f61a8a79b94418b13983053a35e5585d2a82479479ca65
  size 907106756
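The `events.out.tfevents.*` files added above are TensorBoard event logs, one per training run or restart. After `git lfs pull` replaces the pointers with the real binaries, they can be inspected with TensorBoard's `EventAccumulator`. The run directory below is a placeholder, and the scalar tag names depend on what Distily logged, so list them rather than assuming:

```python
# Sketch: read scalars out of one of the event files committed above.
# Run `git lfs pull` first so the pointer files become real binaries.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

run_dir = "logs/..."  # one of the run directories listed in this commit
ea = EventAccumulator(run_dir)
ea.Reload()                       # parse all events in the directory
for tag in ea.Tags()["scalars"]:  # tag names depend on what Distily logged
    for event in ea.Scalars(tag):
        print(tag, event.step, event.value)
```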