import argparse
import os

# Hide all CUDA devices so inference runs on the CPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
sys.path.append('rtvc/')
from pathlib import Path
import time
import spacy
import matplotlib.pyplot as plt

import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr
import io
from scipy.io.wavfile import write
import base64
import streamlit as st


from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
from rtvc.speed_changer.fixSpeed import *


def tts(text, embed_name, nlp, autoplay=True):
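    """Synthesize `text` in the voice of a saved speaker embedding.

    The input is cleaned, split into sentences with the given spaCy pipeline,
    synthesized sentence by sentence, and joined with short silences.

    Args:
        text: input paragraph to synthesize.
        embed_name: basename of a speaker embedding stored as embeds/<name>.npy.
        nlp: a loaded spaCy language pipeline used for sentence segmentation.
        autoplay: if True, also play the result locally through sounddevice.

    Returns:
        The generated WAV file as a base64-encoded string, ready to embed in
        an HTML <audio> element.
    """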
    run_id = "default"
    models_dir = Path("rtvc/saved_models")
    embed_path = f"embeds/{embed_name}.npy"
    # Informational only: CUDA_VISIBLE_DEVICES="-1" above forces CPU execution,
    # so this branch is normally skipped.
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)

    ensure_default_models(run_id, models_dir)
    synthesizer = Synthesizer_infer(list(models_dir.glob(f"{run_id}/synthesizer.pt"))[0])
    # vocoder.load_model(list(models_dir.glob(f"{run_id}/vocoder.pt"))[0])
    ## Generating the spectrogram

    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    def split_text(text):
        text = english_cleaners_predict(text)
        texts = [i.text.strip() for i in nlp(text).sents]  # split paragraph to sentences
        return texts

    texts = split_text(text)
    print(f"the list of inputs texts:\n{texts}")

    embed = np.load(embed_path)
    specs = []
    alignments = []
    stop_tokens = []

    for text in texts:
        spec, align, stop_token = synthesizer.synthesize_spectrograms([text], [embed], require_visualization=True)
        specs.append(spec[0])
        alignments.append(align[0])
        stop_tokens.append(stop_token[0])

    # Record each sentence's frame count so the concatenated waveform can be
    # split back at sentence boundaries after vocoding.
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    ## Save synthesizer visualization results
    if not os.path.exists("syn_results"):
        os.mkdir("syn_results")
    # save_attention_multiple(alignments, "syn_results/attention")
    # save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
    # save_spectrogram(spec, "syn_results/mel")
    print("Created the mel spectrogram")


    ## Generating the waveform
    print("Synthesizing the waveform:")

    # Griffin-Lim is used here instead of the neural vocoder (that call is left
    # commented out below). Remember that the longer the spectrogram, the more
    # time-efficient a neural vocoder becomes.
    # wav = vocoder.infer_waveform(spec)
    wav = synthesizer.griffin_lim(spec)
    wav = vocoder.waveform_denoising(wav)

    # Add breaks
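    # Each sentence was synthesized separately; split the concatenated waveform
    # back at the sentence boundaries and insert 0.15 s of silence between them.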
    b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    # generated_wav = encoder.preprocess_wav(generated_wav)

    # Normalize the peak amplitude so the float32 WAV stays within [-1, 1]
    wav = wav / np.abs(wav).max() * 0.97

    if autoplay:
        # Play the audio (non-blocking)
        import sounddevice as sd
        try:
            sd.stop()
            sd.play(wav, synthesizer.sample_rate)
            time_span = len(wav)//synthesizer.sample_rate + 1
            time.sleep(time_span)

        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
        except:
            raise
    # Serialize the waveform to an in-memory WAV and return it base64-encoded
    # so it can be embedded in an HTML <audio> element.
    byte_io = io.BytesIO()
    write(byte_io, synthesizer.sample_rate, wav.astype(np.float32))
    result_bytes = byte_io.getvalue()
    return base64.b64encode(result_bytes).decode()

if __name__ == "__main__":
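    # Standalone example: synthesize one sentence with a bundled speaker
    # embedding (embeds/Helen_Mirren.npy is assumed to exist) and render the
    # result as an HTML <audio> tag via Streamlit.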
    text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London."
    embed_name = "Helen_Mirren"
    nlp = spacy.load('en_core_web_lg')
    b64 = tts(text, embed_name, nlp, autoplay=False)

    md = f"""
    <audio controls autoplay>
    <source src="data:audio/wav;base64,{b64}" type="audio/wav">
    Your browser does not support the audio element.
    </audio>
    """
    st.markdown(md, unsafe_allow_html=True)
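
# A minimal way to try this out (assuming the file is saved as app.py and the
# required models and embeds/Helen_Mirren.npy are in place):
#
#   streamlit run app.py
#
# Streamlit serves the page and the embedded base64 WAV plays in the browser.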