aluminumbox
committed on
update model
Browse files- README.md +36 -26
- cosyvoice.yaml +8 -3
- flow.decoder.estimator.fp32.onnx +3 -0
- flow.encoder.fp32.zip +3 -0
- llm.llm.fp16.zip +3 -0
- llm.text_encoder.fp16.zip +3 -0
README.md
CHANGED
@@ -22,6 +22,8 @@ git submodule update --init --recursive
|
|
22 |
``` sh
|
23 |
conda create -n cosyvoice python=3.8
|
24 |
conda activate cosyvoice
|
|
|
|
|
25 |
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
26 |
|
27 |
# If you encounter sox compatibility issues
|
@@ -33,7 +35,7 @@ sudo yum install sox sox-devel
|
|
33 |
|
34 |
**Model download**
|
35 |
|
36 |
-
We strongly
|
37 |
|
38 |
If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step.
|
39 |
|
@@ -43,7 +45,7 @@ from modelscope import snapshot_download
|
|
43 |
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
44 |
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
45 |
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
|
46 |
-
snapshot_download('iic/
|
47 |
```
|
48 |
|
49 |
``` sh
|
@@ -52,12 +54,15 @@ mkdir -p pretrained_models
|
|
52 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
53 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
54 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
55 |
-
git clone https://www.modelscope.cn/iic/
|
56 |
```
|
57 |
|
58 |
-
|
|
|
|
|
|
|
59 |
``` sh
|
60 |
-
cd pretrained_models/
|
61 |
unzip resource.zip -d .
|
62 |
pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
63 |
```
|
@@ -67,10 +72,10 @@ pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
|
67 |
For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model.
|
68 |
For sft inference, please use `CosyVoice-300M-SFT` model.
|
69 |
For instruct inference, please use `CosyVoice-300M-Instruct` model.
|
70 |
-
First, add `third_party/
|
71 |
|
72 |
``` sh
|
73 |
-
export PYTHONPATH=third_party/
|
74 |
```
|
75 |
|
76 |
``` python
|
@@ -78,26 +83,27 @@ from cosyvoice.cli.cosyvoice import CosyVoice
|
|
78 |
from cosyvoice.utils.file_utils import load_wav
|
79 |
import torchaudio
|
80 |
|
81 |
-
cosyvoice = CosyVoice('
|
82 |
# sft usage
|
83 |
print(cosyvoice.list_avaliable_spks())
|
84 |
-
|
85 |
-
|
|
|
86 |
|
87 |
-
cosyvoice = CosyVoice('
|
88 |
-
# zero_shot usage
|
89 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
90 |
-
|
91 |
-
torchaudio.save('
|
92 |
# cross_lingual usage
|
93 |
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
94 |
-
|
95 |
-
torchaudio.save('
|
96 |
|
97 |
-
cosyvoice = CosyVoice('
|
98 |
-
# instruct usage
|
99 |
-
|
100 |
-
torchaudio.save('
|
101 |
```
|
102 |
|
103 |
**Start web demo**
|
@@ -108,8 +114,8 @@ We support sft/zero_shot/cross_lingual/instruct inference in web demo.
|
|
108 |
Please see the demo website for details.
|
109 |
|
110 |
``` python
|
111 |
-
# change
|
112 |
-
python3 webui.py --port 50000 --model_dir
|
113 |
```
|
114 |
|
115 |
**Advanced Usage**
|
@@ -125,16 +131,20 @@ you can run following steps. Otherwise, you can just ignore this step.
|
|
125 |
``` sh
|
126 |
cd runtime/python
|
127 |
docker build -t cosyvoice:v1.0 .
|
128 |
-
# change
|
129 |
-
|
130 |
-
python3
|
|
|
|
|
|
|
|
|
131 |
```
|
132 |
|
133 |
## Discussion & Communication
|
134 |
|
135 |
You can directly discuss on [GitHub Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
|
136 |
|
137 |
-
You can also scan the QR code to join our
|
138 |
|
139 |
<img src="./asset/dingding.png" width="250px">
|
140 |
|
|
|
22 |
``` sh
|
23 |
conda create -n cosyvoice python=3.8
|
24 |
conda activate cosyvoice
|
25 |
+
# pynini is required by WeTextProcessing; use conda to install it, as this works on all platforms.
|
26 |
+
conda install -y -c conda-forge pynini==2.1.5
|
27 |
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
28 |
|
29 |
# If you encounter sox compatibility issues
|
|
|
35 |
|
36 |
**Model download**
|
37 |
|
38 |
+
We strongly recommend that you download our pretrained `CosyVoice-300M`, `CosyVoice-300M-SFT`, and `CosyVoice-300M-Instruct` models and the `CosyVoice-ttsfrd` resource.
|
39 |
|
40 |
If you are an expert in this field and are only interested in training your own CosyVoice model from scratch, you can skip this step.
|
41 |
|
|
|
45 |
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
46 |
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
47 |
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
|
48 |
+
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
|
49 |
```
|
50 |
|
51 |
``` sh
|
|
|
54 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
55 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
56 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
57 |
+
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
58 |
```
|
59 |
|
60 |
+
Optionally, you can unzip the `ttsfrd` resource and install the `ttsfrd` package for better text normalization performance.
|
61 |
+
|
62 |
+
Note that this step is not necessary. If you do not install the `ttsfrd` package, we will use WeTextProcessing by default.
|
63 |
+
|
64 |
``` sh
|
65 |
+
cd pretrained_models/CosyVoice-ttsfrd/
|
66 |
unzip resource.zip -d .
|
67 |
pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
|
68 |
```
|
|
|
72 |
For zero_shot/cross_lingual inference, please use `CosyVoice-300M` model.
|
73 |
For sft inference, please use `CosyVoice-300M-SFT` model.
|
74 |
For instruct inference, please use `CosyVoice-300M-Instruct` model.
|
75 |
+
First, add `third_party/Matcha-TTS` to your `PYTHONPATH`.
|
76 |
|
77 |
``` sh
|
78 |
+
export PYTHONPATH=third_party/Matcha-TTS
|
79 |
```
|
80 |
|
81 |
``` python
|
|
|
83 |
from cosyvoice.utils.file_utils import load_wav
|
84 |
import torchaudio
|
85 |
|
86 |
+
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
|
87 |
# sft usage
|
88 |
print(cosyvoice.list_avaliable_spks())
|
89 |
+
# set stream=True for chunk stream inference
|
90 |
+
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
91 |
+
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
|
92 |
|
93 |
+
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
|
94 |
+
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
95 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
96 |
+
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
97 |
+
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050)
|
98 |
# cross_lingual usage
|
99 |
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
100 |
+
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
101 |
+
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
|
102 |
|
103 |
+
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
104 |
+
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
105 |
+
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
106 |
+
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
107 |
```
|
108 |
|
109 |
**Start web demo**
|
|
|
114 |
Please see the demo website for details.
|
115 |
|
116 |
``` python
|
117 |
+
# change to iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
|
118 |
+
python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
|
119 |
```
|
120 |
|
121 |
**Advanced Usage**
|
|
|
131 |
``` sh
|
132 |
cd runtime/python
|
133 |
docker build -t cosyvoice:v1.0 .
|
134 |
+
# change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
|
135 |
+
# for grpc usage
|
136 |
+
docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
|
137 |
+
cd grpc && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
|
138 |
+
# for fastapi usage
|
139 |
+
docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/fastapi && MODEL_DIR=iic/CosyVoice-300M fastapi dev --port 50000 server.py && sleep infinity"
|
140 |
+
cd fastapi && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
|
141 |
```
|
142 |
|
143 |
## Discussion & Communication
|
144 |
|
145 |
You can directly discuss on [GitHub Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
|
146 |
|
147 |
+
You can also scan the QR code to join our official Dingding chat group.
|
148 |
|
149 |
<img src="./asset/dingding.png" width="250px">
|
150 |
|
cosyvoice.yaml
CHANGED
@@ -31,7 +31,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
31 |
num_blocks: 6
|
32 |
dropout_rate: 0.1
|
33 |
positional_dropout_rate: 0.1
|
34 |
-
attention_dropout_rate: 0
|
35 |
normalize_before: True
|
36 |
input_layer: 'linear'
|
37 |
pos_enc_layer_type: 'rel_pos_espnet'
|
@@ -49,11 +49,16 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
49 |
num_blocks: 14
|
50 |
dropout_rate: 0.1
|
51 |
positional_dropout_rate: 0.1
|
52 |
-
attention_dropout_rate: 0
|
53 |
input_layer: 'linear_legacy'
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
59 |
input_size: 512
|
@@ -97,7 +102,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
97 |
in_channels: 320
|
98 |
out_channels: 80
|
99 |
channels: [256, 256]
|
100 |
-
dropout: 0
|
101 |
attention_head_dim: 64
|
102 |
n_blocks: 4
|
103 |
num_mid_blocks: 12
|
|
|
31 |
num_blocks: 6
|
32 |
dropout_rate: 0.1
|
33 |
positional_dropout_rate: 0.1
|
34 |
+
attention_dropout_rate: 0.0
|
35 |
normalize_before: True
|
36 |
input_layer: 'linear'
|
37 |
pos_enc_layer_type: 'rel_pos_espnet'
|
|
|
49 |
num_blocks: 14
|
50 |
dropout_rate: 0.1
|
51 |
positional_dropout_rate: 0.1
|
52 |
+
attention_dropout_rate: 0.0
|
53 |
input_layer: 'linear_legacy'
|
54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
55 |
selfattention_layer_type: 'rel_selfattn'
|
56 |
static_chunk_size: 1
|
57 |
+
sampling: !name:cosyvoice.utils.common.ras_sampling
|
58 |
+
top_p: 0.8
|
59 |
+
top_k: 25
|
60 |
+
win_size: 10
|
61 |
+
tau_r: 0.1
|
62 |
|
63 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
64 |
input_size: 512
|
|
|
102 |
in_channels: 320
|
103 |
out_channels: 80
|
104 |
channels: [256, 256]
|
105 |
+
dropout: 0.0
|
106 |
attention_head_dim: 64
|
107 |
n_blocks: 4
|
108 |
num_mid_blocks: 12
|
flow.decoder.estimator.fp32.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:482e27304e8242dc3d7bc9989bad84ec7835394ddf9e78826337d9484a4ee3ee
|
3 |
+
size 328627300
|
flow.encoder.fp32.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:637f9ef66ba7ecd677b4e1d3d0b1af1e0c6d744485782553ca5bc1ecfa4cf0f7
|
3 |
+
size 103558803
|
llm.llm.fp16.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16d1fe9dd3ebcd15ed47337fe1bc84a81e4f6b1eee0b41e689123dbe9b8eb471
|
3 |
+
size 809092215
|
llm.text_encoder.fp16.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:20e2db67ef0e856fbc573461d30f397cb93d4b45c8ef76133840ee1c94395f2a
|
3 |
+
size 205829251
|