Add colab notebook
Browse files- tortoise_tts.ipynb +248 -0
tortoise_tts.ipynb
ADDED
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"nbformat": 4,
|
3 |
+
"nbformat_minor": 0,
|
4 |
+
"metadata": {
|
5 |
+
"colab": {
|
6 |
+
"name": "tortoise-tts.ipynb",
|
7 |
+
"provenance": [],
|
8 |
+
"collapsed_sections": []
|
9 |
+
},
|
10 |
+
"kernelspec": {
|
11 |
+
"name": "python3",
|
12 |
+
"display_name": "Python 3"
|
13 |
+
},
|
14 |
+
"language_info": {
|
15 |
+
"name": "python"
|
16 |
+
},
|
17 |
+
"accelerator": "GPU"
|
18 |
+
},
|
19 |
+
"cells": [
|
20 |
+
{
|
21 |
+
"cell_type": "code",
|
22 |
+
"execution_count": null,
|
23 |
+
"metadata": {
|
24 |
+
"id": "JrK20I32grP6"
|
25 |
+
},
|
26 |
+
"outputs": [],
|
27 |
+
"source": [
|
28 |
+
"!git clone https://github.com/neonbjb/tortoise-tts.git\n",
|
29 |
+
"%cd tortoise-tts\n",
|
30 |
+
"%pip install -r requirements.txt"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"source": [
|
36 |
+
"# Imports used through the rest of the notebook.\n",
|
37 |
+
"import torch\n",
|
38 |
+
"import torchaudio\n",
|
39 |
+
"import torch.nn as nn\n",
|
40 |
+
"import torch.nn.functional as F\n",
|
41 |
+
"from tqdm import tqdm\n",
|
42 |
+
"\n",
|
43 |
+
"from utils.tokenizer import VoiceBpeTokenizer\n",
|
44 |
+
"from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder\n",
|
45 |
+
"from models.text_voice_clip import VoiceCLIP\n",
|
46 |
+
"from models.dvae import DiscreteVAE\n",
|
47 |
+
"from models.autoregressive import UnifiedVoice\n",
|
48 |
+
"\n",
|
49 |
+
"# These have some fairly interesting code that is hidden in the colab. Consider checking it out.\n",
|
50 |
+
"from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion"
|
51 |
+
],
|
52 |
+
"metadata": {
|
53 |
+
"id": "Gen09NM4hONQ"
|
54 |
+
},
|
55 |
+
"execution_count": null,
|
56 |
+
"outputs": []
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"source": [
|
61 |
+
"# Download pretrained models and set up pretrained voice bank. Feel free to upload and add your own voices here.\n",
|
62 |
+
"# To do so, upload two WAV files cropped to 5-10 seconds of someone speaking.\n",
|
63 |
+
"download_models()\n",
|
64 |
+
"preselected_cond_voices = {\n",
|
65 |
+
" # Male voices\n",
|
66 |
+
" 'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],\n",
|
67 |
+
" 'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],\n",
|
68 |
+
" 'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],\n",
|
69 |
+
" 'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],\n",
|
70 |
+
" # Female voices\n",
|
71 |
+
" 'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],\n",
|
72 |
+
" 'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],\n",
|
73 |
+
" 'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],\n",
|
74 |
+
" 'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],\n",
|
75 |
+
" }"
|
76 |
+
],
|
77 |
+
"metadata": {
|
78 |
+
"id": "SSleVnRAiEE2"
|
79 |
+
},
|
80 |
+
"execution_count": null,
|
81 |
+
"outputs": []
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"cell_type": "code",
|
85 |
+
"source": [
|
86 |
+
"# This is the text that will be spoken.\n",
|
87 |
+
"text = \"And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear.\"\n",
|
88 |
+
"# This is the voice that will speak it.\n",
|
89 |
+
"voice = 'atkins'\n",
|
90 |
+
"# This is the number of samples we will generate from the DALLE-style model. More will produce better results, but will take longer to produce.\n",
|
91 |
+
"# I don't recommend going lower than 128. Samples are generated in batches of 16, so pick a multiple of 16.\n",
|
92 |
+
"num_autoregressive_samples = 128"
|
93 |
+
],
|
94 |
+
"metadata": {
|
95 |
+
"id": "bt_aoxONjfL2"
|
96 |
+
},
|
97 |
+
"execution_count": null,
|
98 |
+
"outputs": []
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"source": [
|
103 |
+
"# Prepare data.\n",
|
104 |
+
"tokenizer = VoiceBpeTokenizer()\n",
|
105 |
+
"if isinstance(text, str):  # Tokenize only once: re-running this cell must not re-encode or re-pad an already-encoded tensor.\n",
|
106 |
+
"    text = F.pad(torch.IntTensor(tokenizer.encode(text)).unsqueeze(0).cuda(), (0,1)) # The trailing pad may not be necessary.\n",
|
107 |
+
"cond_paths = preselected_cond_voices[voice]\n",
|
108 |
+
"conds = []\n",
|
109 |
+
"for cond_path in cond_paths:\n",
|
110 |
+
" c, cond_wav = load_conditioning(cond_path)\n",
|
111 |
+
" conds.append(c)\n",
|
112 |
+
"conds = torch.stack(conds, dim=1) # And just use the last cond_wav for the diffusion model."
|
113 |
+
],
|
114 |
+
"metadata": {
|
115 |
+
"id": "KEXOKjIvn6NW"
|
116 |
+
},
|
117 |
+
"execution_count": null,
|
118 |
+
"outputs": []
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"source": [
|
123 |
+
"# Load the autoregressive model.\n",
|
124 |
+
"autoregressive = UnifiedVoice(max_mel_tokens=300, max_text_tokens=200, max_conditioning_inputs=2, layers=30, model_dim=1024,\n",
|
125 |
+
" heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False, train_solo_embeddings=False).cuda().eval()\n",
|
126 |
+
"autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))\n",
|
127 |
+
"stop_mel_token = autoregressive.stop_mel_token"
|
128 |
+
],
|
129 |
+
"metadata": {
|
130 |
+
"id": "Z15xFT_uhP8v"
|
131 |
+
},
|
132 |
+
"execution_count": null,
|
133 |
+
"outputs": []
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"source": [
|
138 |
+
"# Perform inference with the autoregressive model, generating num_autoregressive_samples\n",
|
139 |
+
"with torch.no_grad():\n",
|
140 |
+
" samples = []\n",
|
141 |
+
"    for b in tqdm(range(max(1, (num_autoregressive_samples + 15) // 16))):  # Round up (at least one batch of 16) so small or non-multiple-of-16 requests still produce samples.\n",
|
142 |
+
" codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,\n",
|
143 |
+
" temperature=.9, num_return_sequences=16, length_penalty=1)\n",
|
144 |
+
" padding_needed = 250 - codes.shape[1]\n",
|
145 |
+
" codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)\n",
|
146 |
+
" samples.append(codes)\n",
|
147 |
+
"\n",
|
148 |
+
"# Delete model weights to conserve memory.\n",
|
149 |
+
"del autoregressive"
|
150 |
+
],
|
151 |
+
"metadata": {
|
152 |
+
"id": "xajqWiEik-j0"
|
153 |
+
},
|
154 |
+
"execution_count": null,
|
155 |
+
"outputs": []
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"source": [
|
160 |
+
"# Load the CLIP model.\n",
|
161 |
+
"clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=8, text_seq_len=120, text_heads=8,\n",
|
162 |
+
" num_speech_tokens=8192, speech_enc_depth=10, speech_heads=8, speech_seq_len=250).cuda().eval()\n",
|
163 |
+
"clip.load_state_dict(torch.load('.models/clip.pth'))"
|
164 |
+
],
|
165 |
+
"metadata": {
|
166 |
+
"id": "KNgYSyuyliMs"
|
167 |
+
},
|
168 |
+
"execution_count": null,
|
169 |
+
"outputs": []
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"source": [
|
174 |
+
"# Use the CLIP model to select the best autoregressive output to match the given text.\n",
|
175 |
+
"clip_results = []\n",
|
176 |
+
"with torch.no_grad():\n",
|
177 |
+
" for batch in samples:\n",
|
178 |
+
" for i in range(batch.shape[0]):\n",
|
179 |
+
" batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)\n",
|
180 |
+
" text = text[:, :120] # Ugly hack to fix the fact that I didn't train CLIP to handle long enough text.\n",
|
181 |
+
" clip_results.append(clip(text.repeat(batch.shape[0], 1),\n",
|
182 |
+
" torch.full((batch.shape[0],), fill_value=text.shape[1]-1, dtype=torch.long, device='cuda'),\n",
|
183 |
+
" batch, torch.full((batch.shape[0],), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda'),\n",
|
184 |
+
" return_loss=False))\n",
|
185 |
+
" clip_results = torch.cat(clip_results, dim=0)\n",
|
186 |
+
" samples = torch.cat(samples, dim=0)\n",
|
187 |
+
" best_results = samples[torch.topk(clip_results, k=1).indices]\n",
|
188 |
+
"\n",
|
189 |
+
"# Save samples to CPU memory, delete clip to conserve memory.\n",
|
190 |
+
"samples = samples.cpu()\n",
|
191 |
+
"del clip"
|
192 |
+
],
|
193 |
+
"metadata": {
|
194 |
+
"id": "DDXkM0lclp4U"
|
195 |
+
},
|
196 |
+
"execution_count": null,
|
197 |
+
"outputs": []
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"cell_type": "code",
|
201 |
+
"source": [
|
202 |
+
"# Load the DVAE and diffusion model.\n",
|
203 |
+
"dvae = DiscreteVAE(positional_dims=1, channels=80, hidden_dim=512, num_resnet_blocks=3, codebook_dim=512, num_tokens=8192, num_layers=2,\n",
|
204 |
+
" record_codes=True, kernel_size=3, use_transposed_convs=False).cuda().eval()\n",
|
205 |
+
"dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)\n",
|
206 |
+
"diffusion = DiscreteDiffusionVocoder(model_channels=128, dvae_dim=80, channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8], num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],\n",
|
207 |
+
" spectrogram_conditioning_resolutions=[2,512], attention_resolutions=[512,1024], num_heads=4, kernel_size=3, scale_factor=2,\n",
|
208 |
+
" conditioning_inputs_provided=True, time_embed_dim_multiplier=4).cuda().eval()\n",
|
209 |
+
"diffusion.load_state_dict(torch.load('.models/diffusion.pth'))\n",
|
210 |
+
"diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)"
|
211 |
+
],
|
212 |
+
"metadata": {
|
213 |
+
"id": "97acSnBal8Q2"
|
214 |
+
},
|
215 |
+
"execution_count": null,
|
216 |
+
"outputs": []
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"cell_type": "code",
|
220 |
+
"source": [
|
221 |
+
"# Decode the (best) discrete sequence created by the autoregressive model.\n",
|
222 |
+
"with torch.no_grad():\n",
|
223 |
+
" for b in range(best_results.shape[0]):\n",
|
224 |
+
" code = best_results[b].unsqueeze(0)\n",
|
225 |
+
" wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav, spectrogram_compression_factor=256, mean=True)\n",
|
226 |
+
" torchaudio.save(f'{voice}_{b}.wav', wav.squeeze(0).cpu(), 22050)"
|
227 |
+
],
|
228 |
+
"metadata": {
|
229 |
+
"id": "HEDABTrdl_kM"
|
230 |
+
},
|
231 |
+
"execution_count": null,
|
232 |
+
"outputs": []
|
233 |
+
},
|
234 |
+
{
|
235 |
+
"cell_type": "code",
|
236 |
+
"source": [
|
237 |
+
"# Listen to your text! (told you that'd take a long time..)\n",
|
238 |
+
"from IPython.display import Audio\n",
|
239 |
+
"Audio(data=wav.squeeze(0).cpu().numpy(), rate=22050)"
|
240 |
+
],
|
241 |
+
"metadata": {
|
242 |
+
"id": "EyHmcdqBmSvf"
|
243 |
+
},
|
244 |
+
"execution_count": null,
|
245 |
+
"outputs": []
|
246 |
+
}
|
247 |
+
]
|
248 |
+
}
|