import argparse
from ctypes import alignment
import os

# Hide CUDA devices from the libraries imported below (forces CPU execution).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys

# Add rtvc/ to the module search path (presumably so that the rtvc package's
# own absolute imports resolve -- confirm against the package layout).
sys.path.append('rtvc/')
from pathlib import Path
import time
import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr
import io
from scipy.io.wavfile import write
import base64
import streamlit as st

from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
from rtvc.speed_changer.fixSpeed import *


def _split_sentences(text, nlp):
    """Clean *text* and split it into stripped sentence strings using *nlp*."""
    cleaned = english_cleaners_predict(text)
    return [sentence.text.strip() for sentence in nlp(cleaned).sents]


def _insert_pauses(wav, frame_counts, hop_size, sample_rate, pause_seconds=0.15):
    """Cut *wav* at the per-sentence boundaries and add silence after each piece.

    Boundaries are the cumulative sums of ``frame_counts * hop_size``; a block
    of ``pause_seconds * sample_rate`` zero samples follows every piece,
    including the last one.
    """
    ends = np.cumsum(np.array(frame_counts) * hop_size)
    starts = np.concatenate(([0], ends[:-1]))
    pause = np.zeros(int(pause_seconds * sample_rate))
    pieces = []
    for start, end in zip(starts, ends):
        pieces.append(wav[start:end])
        pieces.append(pause)
    return np.concatenate(pieces)


def _wav_to_base64(wav, sample_rate):
    """Serialize *wav* as an in-memory float32 WAV file and base64-encode it."""
    buffer = io.BytesIO()
    write(buffer, sample_rate, wav.astype(np.float32))
    # getvalue() returns the whole buffer regardless of the stream position,
    # so this does not depend on the writer rewinding the stream.
    return base64.b64encode(buffer.getvalue()).decode()


def tts(text, embed_name, nlp, autoplay=True):
    """Synthesize speech for *text* and return it as a base64-encoded WAV.

    Args:
        text: Input text; it is cleaned and split into sentences, and each
            sentence is synthesized separately.
        embed_name: Name of the embedding to use; the array is loaded from
            ``embeds/<embed_name>.npy``.
        nlp: Callable returning an object with a ``sents`` iterable of spans
            (the script entry point passes a spaCy pipeline).
        autoplay: When true, also play the audio through ``sounddevice`` and
            wait for roughly its duration before returning.

    Returns:
        The WAV file contents (float32 samples) as a base64 ``str``.

    Raises:
        ValueError: If no sentences can be extracted from *text*.
    """
    run_id = "default"
    models_dir = Path("rtvc/saved_models")
    embed_path = f"embeds/{embed_name}.npy"

    ensure_default_models(run_id, models_dir)
    synthesizer = Synthesizer_infer(list(models_dir.glob(f"{run_id}/synthesizer.pt"))[0])

    ## Generating the spectrogram
    texts = _split_sentences(text, nlp)
    print(f"the list of inputs texts:\n{texts}")
    if not texts:
        # Without this, the failure surfaces later as an opaque
        # "need at least one array to concatenate" error.
        raise ValueError("text contains no sentences to synthesize")

    embed = np.load(embed_path)

    # The synthesizer works in batch, so each sentence and the embedding are
    # wrapped in single-element lists.
    specs = []
    alignments = []
    stop_tokens = []
    for sentence in texts:
        spec, align, stop_token = synthesizer.synthesize_spectrograms(
            [sentence], [embed], require_visualization=True
        )
        specs.append(spec[0])
        alignments.append(align[0])
        stop_tokens.append(stop_token[0])

    # Length of each spectrogram along axis 1, used below to locate the
    # sentence boundaries in the waveform.
    frame_counts = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    ## Save synthesizer visualization results
    os.makedirs("syn_results", exist_ok=True)
    # Disabled debugging aids:
    # save_attention_multiple(alignments, "syn_results/attention")
    # save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
    # save_spectrogram(spec, "syn_results/mel")
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")
    # Griffin-Lim is used here instead of the neural vocoder
    # (vocoder.infer_waveform), which is not loaded by this function.
    wav = synthesizer.griffin_lim(spec)
    wav = vocoder.waveform_denoising(wav)

    # Add breaks between sentences.
    wav = _insert_pauses(
        wav,
        frame_counts,
        Synthesizer_infer.hparams.hop_size,
        Synthesizer_infer.sample_rate,
    )

    # Peak-normalize, then apply the gain. A silent waveform is left untouched
    # to avoid dividing by zero and filling the output with NaNs.
    # NOTE(review): a gain of 10 pushes float samples well outside [-1, 1];
    # confirm this is intended rather than a value such as 0.97.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 10

    if autoplay:
        # Play the audio; sleep for about the clip's duration so playback can
        # finish before the function returns.
        import sounddevice as sd

        try:
            sd.stop()
            sd.play(wav, synthesizer.sample_rate)
            time_span = len(wav) // synthesizer.sample_rate + 1
            time.sleep(time_span)
        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")

    return _wav_to_base64(wav, synthesizer.sample_rate)


if __name__ == "__main__":
    text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London."
    embed_name = "Helen_Mirren"
    nlp = spacy.load('en_core_web_lg')
    b64 = tts(text, embed_name, nlp, autoplay=False)
    # Embed the encoded WAV as a data URI so the browser can play it; without
    # the <audio> element the synthesized audio would never reach the page.
    md = (
        "<audio controls autoplay>"
        f'<source src="data:audio/wav;base64,{b64}" type="audio/wav">'
        "</audio>"
    )
    st.markdown(md, unsafe_allow_html=True)