---
language: All languages
datasets: ISML datasets (80 thousand hours of unlabeled data) + Babel datasets (2 thousand hours of unlabeled data)
---

# Chinese W2v-conformer

## Model description

This is a set of speech W2v-conformer models pre-trained with UER-py. You can download the model from the [UER-py GitHub page](https://github.com/dbiir/UER-py/).

## How to use

You can use the model directly for speech recognition:
```python
>>> import yaml
>>> from wenet.transformer.asr_model import ASRModel
>>> from wenet.transformer.encoder import ConformerEncoder
>>> from wenet.transformer.decoder import TransformerDecoder
>>> from wenet.transformer.ctc import CTC
>>> from wenet.utils.checkpoint import load_checkpoint
>>> # args.config / args.checkpoint, input_dim and vocab_size come from
>>> # your own setup. Load the configuration first: the encoder, decoder
>>> # and CTC modules below are all built from it.
>>> with open(args.config, 'r') as fin:
...     configs = yaml.load(fin, Loader=yaml.FullLoader)
>>> encoder = ConformerEncoder(input_dim, **configs['encoder_conf'])
>>> decoder = TransformerDecoder(vocab_size, encoder.output_size(), **configs['decoder_conf'])
>>> ctc = CTC(vocab_size, encoder.output_size())
>>> model = ASRModel(
...     vocab_size=vocab_size,
...     encoder=encoder,
...     decoder=decoder,
...     ctc=ctc,
...     **configs['model_conf'],
... )
>>> # Load the pre-trained weights into the assembled model.
>>> infos = load_checkpoint(model, args.checkpoint)
```
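
After loading the checkpoint, you can run recognition on an utterance. The following is a minimal sketch of CTC greedy decoding, assuming 80-dimensional fbank features, a hypothetical input file `example.wav`, and a WeNet version that exposes `ASRModel.ctc_greedy_search`; check the decoding options of your WeNet release:

```python
>>> import torch
>>> import torchaudio
>>> import torchaudio.compliance.kaldi as kaldi
>>> # 'example.wav' is a placeholder; use a 16 kHz mono recording.
>>> waveform, sample_rate = torchaudio.load('example.wav')
>>> # 80-dim log-mel fbank features, the usual WeNet front end.
>>> feats = kaldi.fbank(waveform, num_mel_bins=80, frame_length=25,
...                     frame_shift=10, sample_frequency=sample_rate)
>>> feats = feats.unsqueeze(0)                     # (batch=1, frames, 80)
>>> feats_lengths = torch.tensor([feats.size(1)])  # valid frames per utterance
>>> model.eval()
>>> with torch.no_grad():
...     hyps = model.ctc_greedy_search(feats, feats_lengths)
>>> # hyps is a list of token-id sequences; map the ids back to units
>>> # with the dictionary used during fine-tuning.
```

WeNet also ships a decoding script (`wenet/bin/recognize.py`) that wraps these steps and supports other decoding modes such as attention rescoring.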

## Training data

The ISML datasets (80 thousand hours of unlabeled data) and the Babel datasets (2 thousand hours of unlabeled data) are used as training data.

## Training procedure

The model is pre-trained with the wav2vec 2.0 objective using [UER-py](https://github.com/dbiir/UER-py/) on [Tencent Cloud](https://cloud.tencent.com/). We pre-train for 70 epochs with a batch size of 128, and use the same hyper-parameters across the different model sizes.

The downstream models are fine-tuned as follows:

Stage 1:

```
python wenet/bin/train.py --gpu 0,1,2,3,4,5,6,7 \
    --config $train_config \
    --train_data train.data \
    --cv_data dev.data \
    ${checkpoint:+--checkpoint $checkpoint} \
    --model_dir $dir \
    --ddp.init_method $init_method \
    --ddp.world_size 7 \
    --ddp.dist_backend nccl \
    --num_workers 2
```
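
The `$train_config` YAML referenced above is expected to contain the `encoder_conf`, `decoder_conf`, and `model_conf` sections that the Python snippet reads. Below is a hypothetical minimal excerpt with illustrative values only, not the actual configuration used for this model:

```
encoder_conf:
    output_size: 512        # illustrative; must match the pre-trained encoder
    attention_heads: 8
    num_blocks: 12
decoder_conf:
    attention_heads: 8
    num_blocks: 6
model_conf:
    ctc_weight: 0.3         # weight of the CTC loss vs. the attention loss
    lsm_weight: 0.1         # label smoothing
```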

### BibTeX entry and citation info

```
@article{baevski2020wav2vec,
  title={wav2vec 2.0: A framework for self-supervised learning of speech representations},
  author={Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael},
  journal={arXiv preprint arXiv:2006.11477},
  year={2020}
}

@article{zhang2020pushing,
  title={Pushing the limits of semi-supervised learning for automatic speech recognition},
  author={Zhang, Yu and Qin, James and Park, Daniel S and Han, Wei and Chiu, Chung-Cheng and Pang, Ruoming and Le, Quoc V and Wu, Yonghui},
  journal={arXiv preprint arXiv:2010.10504},
  year={2020}
}

@article{zhang2021wenet,
  title={WeNet: Production First and Production Ready End-to-End Speech Recognition Toolkit},
  author={Zhang, Binbin and Wu, Di and Yang, Chao and Chen, Xiaoyu and Peng, Zhendong and Wang, Xiangming and Yao, Zhuoyuan and Wang, Xiong and Yu, Fan and Xie, Lei and others},
  journal={arXiv preprint arXiv:2102.01547},
  year={2021}
}
```