add option to specify model directory to API
Browse files
api.py
CHANGED
@@ -170,35 +170,40 @@ class TextToSpeech:
|
|
170 |
:param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
|
171 |
GPU OOM errors. Larger numbers generates slightly faster.
|
172 |
"""
|
173 |
-
def __init__(self, autoregressive_batch_size=16):
|
174 |
self.autoregressive_batch_size = autoregressive_batch_size
|
175 |
self.tokenizer = VoiceBpeTokenizer()
|
176 |
download_models()
|
177 |
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
186 |
text_seq_len=350, text_heads=8,
|
187 |
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
188 |
use_xformers=True).cpu().eval()
|
189 |
-
self.clvp.load_state_dict(torch.load('
|
190 |
|
191 |
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
192 |
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
193 |
-
self.cvvp.load_state_dict(torch.load('
|
194 |
-
|
195 |
-
self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
|
196 |
-
in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
|
197 |
-
layer_drop=0, unconditioned_percentage=0).cpu().eval()
|
198 |
-
self.diffusion.load_state_dict(torch.load('.models/diffusion_decoder.pth'))
|
199 |
|
200 |
self.vocoder = UnivNetGenerator().cpu()
|
201 |
-
self.vocoder.load_state_dict(torch.load('
|
202 |
self.vocoder.eval(inference=True)
|
203 |
|
204 |
def tts_with_preset(self, text, voice_samples, preset='fast', **kwargs):
|
@@ -216,7 +221,7 @@ class TextToSpeech:
|
|
216 |
'cond_free_k': 2.0, 'diffusion_temperature': 1.0})
|
217 |
# Presets are defined here.
|
218 |
presets = {
|
219 |
-
'ultra_fast': {'num_autoregressive_samples':
|
220 |
'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
|
221 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
|
222 |
'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
|
|
|
170 |
:param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
|
171 |
GPU OOM errors. Larger numbers generates slightly faster.
|
172 |
"""
|
173 |
+
def __init__(self, autoregressive_batch_size=16, models_dir='.models'):
|
174 |
self.autoregressive_batch_size = autoregressive_batch_size
|
175 |
self.tokenizer = VoiceBpeTokenizer()
|
176 |
download_models()
|
177 |
|
178 |
+
if os.path.exists(f'{models_dir}/autoregressive.ptt'):
|
179 |
+
# Assume this is a traced directory.
|
180 |
+
self.autoregressive = torch.jit.load(f'{models_dir}/autoregressive.ptt')
|
181 |
+
self.diffusion = torch.jit.load(f'{models_dir}/diffusion_decoder.ptt')
|
182 |
+
else:
|
183 |
+
self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
184 |
+
model_dim=1024,
|
185 |
+
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
186 |
+
train_solo_embeddings=False,
|
187 |
+
average_conditioning_embeddings=True).cpu().eval()
|
188 |
+
self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth'))
|
189 |
+
|
190 |
+
self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
|
191 |
+
in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
|
192 |
+
layer_drop=0, unconditioned_percentage=0).cpu().eval()
|
193 |
+
self.diffusion.load_state_dict(torch.load(f'{models_dir}/diffusion_decoder.pth'))
|
194 |
|
195 |
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
196 |
text_seq_len=350, text_heads=8,
|
197 |
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
198 |
use_xformers=True).cpu().eval()
|
199 |
+
self.clvp.load_state_dict(torch.load(f'{models_dir}/clvp.pth'))
|
200 |
|
201 |
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
202 |
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
203 |
+
self.cvvp.load_state_dict(torch.load(f'{models_dir}/cvvp.pth'))
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
self.vocoder = UnivNetGenerator().cpu()
|
206 |
+
self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
|
207 |
self.vocoder.eval(inference=True)
|
208 |
|
209 |
def tts_with_preset(self, text, voice_samples, preset='fast', **kwargs):
|
|
|
221 |
'cond_free_k': 2.0, 'diffusion_temperature': 1.0})
|
222 |
# Presets are defined here.
|
223 |
presets = {
|
224 |
+
'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 32, 'cond_free': False},
|
225 |
'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
|
226 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
|
227 |
'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
|