Spaces:
Build error
Build error
wayne-wang-1119
committed on
Commit
·
8434d5c
1
Parent(s):
76ff84b
create 10 files
Browse files- SE_checkpoint.pth.tar +3 -0
- app.py +214 -0
- best_model_latest.pth.tar +3 -0
- config.json +373 -0
- config_se.json +119 -0
- cv-speakers-pt+en-m-f.json +0 -0
- errormessage.wav +0 -0
- language_ids.json +5 -0
- requirements.txt +5 -0
- speakers.json +0 -0
SE_checkpoint.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
|
3 |
+
size 44610930
|
app.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NOTE(review): nothing in this script references `title` (the only other
# occurrence is the `title=` keyword passed to gr.Interface). Importing
# `turtle` pulls in tkinter, which may be missing on a headless host --
# this looks like an accidental IDE auto-import; confirm and remove.
from turtle import title
import gradio as gr

import git
import os

# The TTS fork is fetched and installed at start-up, before any `TTS.*`
# import below. The return codes are not checked, so a failed clone/install
# only surfaces later as an ImportError.
os.system(
    "git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS"
)
os.system("pip install -q -e TTS/")
os.system("pip install -q torchaudio==0.9.0")

import sys

TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally

import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio


import torch

from TTS.tts.utils.synthesis import synthesis

# from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# This import used to sit in `try: ... except: <the very same import>`; the
# fallback could never succeed where the first attempt failed, and the bare
# `except:` only hid the original traceback, so it is a plain import now.
from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
import openai

# The key is read from the environment; it is None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
48 |
+
|
49 |
+
|
50 |
+
def generate_prompt(prompt):
    """Ask the OpenAI chat model for a short, in-persona reply to *prompt*.

    A fixed system message sets the persona and *prompt* is sent as the
    user turn of the conversation.

    Args:
        prompt: The text entered by the user.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped, or an empty string if that choice carries no text.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": """
                You are a closed family member who has been lost recently due to uncontrollable incidents. You are reconstructerd with AI technologies to try reunite with families who misses you and cherish you.
                You should respond as vivid, as loving, as possible to make the users feel warm and engaged and eager to continue the discussion further. The users will ask you questions as if you are the person they are trying to reconnect with,
                examples are like moms or lost children. Try to mimic like who they are trying to reach.
                """,
            },
            # The user's text has to travel as a chat message. It used to be
            # passed as a top-level `content=` keyword, which is not a
            # parameter of `create()`, so the model never received it.
            {"role": "user", "content": prompt},
        ],
        max_tokens=60,
        n=1,
        stop=None,
        temperature=0.5,
    )
    reply = response.choices[0].message.content
    # `content` can be None; avoid calling .strip() on it.
    return (reply or "").strip()
|
71 |
+
|
72 |
+
|
73 |
+
# --- output location --------------------------------------------------------
OUT_PATH = "out/"
os.makedirs(OUT_PATH, exist_ok=True)

# --- model artefacts --------------------------------------------------------
MODEL_PATH = "/home/user/app/best_model_latest.pth.tar"
CONFIG_PATH = "/home/user/app/config.json"
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"
USE_CUDA = torch.cuda.is_available()

# --- config and audio front-end ----------------------------------------------
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Override two entries of the loaded config before the model is built.
C.model_args["d_vector_file"] = TTS_SPEAKERS
C.model_args["use_speaker_encoder_as_loss"] = False

# --- build the model and load its weights -------------------------------------
model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
cp = torch.load(MODEL_PATH, map_location=torch.device("cpu"))

# Work on a copy of the checkpoint's state dict and drop every entry whose
# name mentions the speaker encoder before loading it into the model.
model_weights = cp["model"].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" not in key:
        continue
    del model_weights[key]

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

# Installed at start-up; the `pydub` import below needs this to have run.
os.system("pip install -q pydub ffmpeg-normalize")

# --- speaker encoder ----------------------------------------------------------
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)
|
133 |
+
|
134 |
+
|
135 |
+
def compute_spec(ref_file):
    """Return the spectrogram of *ref_file* as a float tensor.

    The file is loaded with librosa at ``ap.sample_rate``, run through
    ``ap.spectrogram`` and wrapped in a ``torch.FloatTensor`` that gets one
    extra leading dimension via ``unsqueeze(0)``.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
|
140 |
+
|
141 |
+
|
142 |
+
def greet(Text, Voicetoclone, VoiceMicrophone):
    """Generate a reply to *Text* and synthesize it with the reference voice.

    Args:
        Text: The user's message; it is forwarded to ``generate_prompt`` and
            the returned reply is what gets synthesized.
        Voicetoclone: Path of the uploaded reference clip, or None.
        VoiceMicrophone: Path of the microphone recording, used only when
            *Voicetoclone* is None.

    Returns:
        The path of the written wav file (``OUT_PATH`` + ``"Audio.wav"``).

    Raises:
        ValueError: If no reference clip was supplied, the clip is larger
            than 30 MB, or the generated text is longer than 2000 characters.
    """
    import subprocess  # local import: only this function needs it

    max_reference_bytes = 30 * 1000 * 1000  # the "30mb" quoted in the message
    max_text_chars = 2000

    # Prefer the uploaded clip; fall back to the microphone recording.
    reference = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if reference is None:
        # Previously the path silently became the string "None" and the
        # failure surfaced much later inside the speaker encoder.
        raise ValueError(
            "No reference voice provided. Please upload or record a clip."
        )

    text = str(generate_prompt(Text))

    reference_files = str(reference)
    print("path url")
    print(reference)
    sample = reference_files

    # Measure the file itself. The old check multiplied the length of the
    # path string by its in-memory size, so it depended on how long the path
    # was rather than on how large the audio file is.
    file_size = os.path.getsize(reference_files)
    if file_size > max_reference_bytes or len(text) > max_text_chars:
        message = "File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        # SystemExit is a BaseException meant to end the interpreter; a
        # per-request validation failure must not be able to do that.
        raise ValueError(
            "File is greater than 30mb. Please re-try or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        )

    # Normalise the clip in place. The former os.system() string relied on a
    # shell variable `$sample` that is never set, so the real path was never
    # passed. An argument list also keeps the path away from shell parsing.
    try:
        subprocess.run(
            [
                "ffmpeg-normalize",
                sample,
                "-nt",
                "rms",
                "-t=-27",
                "-o",
                sample,
                "-ar",
                "16000",
                "-f",
            ],
            check=False,  # best effort, as before: a failure does not abort
        )
    except OSError as err:
        print(" > ffmpeg-normalize could not be run: {}".format(err))

    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_files)
    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3  # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3  # defines the noise variance applied to the duration predictor z vector at inference.
    language_id = 0

    print(" > text: {}".format(text))
    wav, _, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")
    # NOTE(review): notebook-style display call; presumably a leftover that
    # does nothing useful outside a notebook -- confirm before removing.
    IPython.display.display(Audio(wav, rate=ap.sample_rate))

    # NOTE(review): every request writes the same file name, so simultaneous
    # requests would overwrite each other's output -- confirm whether that
    # can happen in this deployment.
    file_name = "Audio.wav"
    out_path = os.path.join(OUT_PATH, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
    return out_path
|
196 |
+
|
197 |
+
|
198 |
+
# Gradio front-end: one text box plus two alternative voice sources (file
# upload or microphone), all handed to greet(); the result is played back
# as audio.
demo = gr.Interface(
    fn=greet,
    inputs=[
        # Use the top-level component, like the two gr.Audio inputs below;
        # the `gr.inputs` namespace is the deprecated legacy spelling.
        gr.Textbox(
            label="Upload Audio recording first, then ask anything. (max. 1000 characters per request)"
        ),
        gr.Audio(
            type="filepath",
            source="upload",
            label="Please upload a voice to clone (max. 15mb)",
        ),
        gr.Audio(source="microphone", type="filepath", streaming=True),
    ],
    outputs="audio",
    title="Reunion - Remember Your Loved Ones",
)
demo.launch()
|
best_model_latest.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:017bfd8907c80bb5857d65d0223f0e4e4b9d699ef52e2a853d9cc7eb7e308cf0
|
3 |
+
size 379957289
|
config.json
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": "vits",
|
3 |
+
"run_name": "vits_tts-portuguese",
|
4 |
+
"run_description": "",
|
5 |
+
"epochs": 1000,
|
6 |
+
"batch_size": 52,
|
7 |
+
"eval_batch_size": 52,
|
8 |
+
"mixed_precision": false,
|
9 |
+
"scheduler_after_epoch": true,
|
10 |
+
"run_eval": true,
|
11 |
+
"test_delay_epochs": -1,
|
12 |
+
"print_eval": true,
|
13 |
+
"dashboard_logger": "tensorboard",
|
14 |
+
"print_step": 25,
|
15 |
+
"plot_step": 100,
|
16 |
+
"model_param_stats": false,
|
17 |
+
"project_name": null,
|
18 |
+
"log_model_step": 10000,
|
19 |
+
"wandb_entity": null,
|
20 |
+
"save_step": 10000,
|
21 |
+
"checkpoint": true,
|
22 |
+
"keep_all_best": false,
|
23 |
+
"keep_after": 10000,
|
24 |
+
"num_loader_workers": 4,
|
25 |
+
"num_eval_loader_workers": 4,
|
26 |
+
"use_noise_augment": false,
|
27 |
+
"use_language_weighted_sampler": true,
|
28 |
+
"output_path": "../checkpoints/VITS-multilingual/VITS_fixes/new/new-SE/use_noise_aument_false/xlarge-ZS-PT-VCTK/pt-en+LibriTTS-fr/speaker_encoder_as_loss_9_alpha/mixed-p-false-bug-SDP-fixed/",
|
29 |
+
"distributed_backend": "nccl",
|
30 |
+
"distributed_url": "tcp://localhost:54321",
|
31 |
+
"audio": {
|
32 |
+
"fft_size": 1024,
|
33 |
+
"win_length": 1024,
|
34 |
+
"hop_length": 256,
|
35 |
+
"frame_shift_ms": null,
|
36 |
+
"frame_length_ms": null,
|
37 |
+
"stft_pad_mode": "reflect",
|
38 |
+
"sample_rate": 16000,
|
39 |
+
"resample": false,
|
40 |
+
"preemphasis": 0.0,
|
41 |
+
"ref_level_db": 20,
|
42 |
+
"do_sound_norm": false,
|
43 |
+
"log_func": "np.log",
|
44 |
+
"do_trim_silence": true,
|
45 |
+
"trim_db": 45,
|
46 |
+
"power": 1.5,
|
47 |
+
"griffin_lim_iters": 60,
|
48 |
+
"num_mels": 80,
|
49 |
+
"mel_fmin": 0.0,
|
50 |
+
"mel_fmax": null,
|
51 |
+
"spec_gain": 1,
|
52 |
+
"do_amp_to_db_linear": false,
|
53 |
+
"do_amp_to_db_mel": true,
|
54 |
+
"signal_norm": false,
|
55 |
+
"min_level_db": -100,
|
56 |
+
"symmetric_norm": true,
|
57 |
+
"max_norm": 4.0,
|
58 |
+
"clip_norm": true,
|
59 |
+
"stats_path": null
|
60 |
+
},
|
61 |
+
"use_phonemes": false,
|
62 |
+
"use_espeak_phonemes": false,
|
63 |
+
"phoneme_language": "pt-br",
|
64 |
+
"compute_input_seq_cache": false,
|
65 |
+
"text_cleaner": "multilingual_cleaners",
|
66 |
+
"enable_eos_bos_chars": false,
|
67 |
+
"test_sentences_file": "",
|
68 |
+
"phoneme_cache_path": null,
|
69 |
+
"characters": {
|
70 |
+
"pad": "_",
|
71 |
+
"eos": "&",
|
72 |
+
"bos": "*",
|
73 |
+
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
|
74 |
+
"punctuations": "!'(),-.:;? ",
|
75 |
+
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
76 |
+
"unique": true
|
77 |
+
},
|
78 |
+
"batch_group_size": 0,
|
79 |
+
"loss_masking": null,
|
80 |
+
"min_seq_len": 90,
|
81 |
+
"max_seq_len": 270,
|
82 |
+
"compute_f0": false,
|
83 |
+
"compute_linear_spec": true,
|
84 |
+
"add_blank": true,
|
85 |
+
"datasets": [
|
86 |
+
{
|
87 |
+
"name": "vctk",
|
88 |
+
"path": "../../datasets/VCTK-Corpus-removed-silence_16Khz/",
|
89 |
+
"meta_file_train": null,
|
90 |
+
"ununsed_speakers": [
|
91 |
+
"p225",
|
92 |
+
"p234",
|
93 |
+
"p238",
|
94 |
+
"p245",
|
95 |
+
"p248",
|
96 |
+
"p261",
|
97 |
+
"p294",
|
98 |
+
"p302",
|
99 |
+
"p326",
|
100 |
+
"p335",
|
101 |
+
"p347"
|
102 |
+
],
|
103 |
+
"language": "en",
|
104 |
+
"meta_file_val": null,
|
105 |
+
"meta_file_attn_mask": ""
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"name": "libri_tts",
|
109 |
+
"path": "../../datasets/LibriTTS/LibriTTS/dataset-preprocessed-clean-100-and-360/dataset-22k/",
|
110 |
+
"meta_file_train": "metadata_all.csv",
|
111 |
+
"ununsed_speakers": null,
|
112 |
+
"language": "en",
|
113 |
+
"meta_file_val": "dev-clean_500.csv",
|
114 |
+
"meta_file_attn_mask": ""
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"name": "brspeech",
|
118 |
+
"path": "../../datasets/TTS-Portuguese-Corpus_16khz/",
|
119 |
+
"meta_file_train": "train_TTS-Portuguese_Corpus_metadata.csv",
|
120 |
+
"ununsed_speakers": null,
|
121 |
+
"language": "pt-br",
|
122 |
+
"meta_file_val": "eval_TTS-Portuguese_Corpus_metadata.csv",
|
123 |
+
"meta_file_attn_mask": ""
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"name": "mailabs",
|
127 |
+
"path": "../../datasets/M-AILABS/fr_FR",
|
128 |
+
"meta_file_train": "",
|
129 |
+
"ununsed_speakers": null,
|
130 |
+
"language": "fr-fr",
|
131 |
+
"meta_file_val": null,
|
132 |
+
"meta_file_attn_mask": null
|
133 |
+
}
|
134 |
+
],
|
135 |
+
"optimizer": "AdamW",
|
136 |
+
"optimizer_params": {
|
137 |
+
"betas": [
|
138 |
+
0.8,
|
139 |
+
0.99
|
140 |
+
],
|
141 |
+
"eps": 1e-09,
|
142 |
+
"weight_decay": 0.01
|
143 |
+
},
|
144 |
+
"lr_scheduler": "",
|
145 |
+
"lr_scheduler_params": null,
|
146 |
+
"test_sentences": [
|
147 |
+
[
|
148 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
149 |
+
"VCTK_p225",
|
150 |
+
null,
|
151 |
+
"en"
|
152 |
+
],
|
153 |
+
[
|
154 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
155 |
+
"ED",
|
156 |
+
null,
|
157 |
+
"en"
|
158 |
+
],
|
159 |
+
[
|
160 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
161 |
+
"bernard",
|
162 |
+
null,
|
163 |
+
"en"
|
164 |
+
],
|
165 |
+
[
|
166 |
+
"This cake is great. It's so delicious and moist.",
|
167 |
+
"VCTK_p234",
|
168 |
+
null,
|
169 |
+
"en"
|
170 |
+
],
|
171 |
+
[
|
172 |
+
"This cake is great. It's so delicious and moist.",
|
173 |
+
"ED",
|
174 |
+
null,
|
175 |
+
"en"
|
176 |
+
],
|
177 |
+
[
|
178 |
+
"This cake is great. It's so delicious and moist.",
|
179 |
+
"ezwa",
|
180 |
+
null,
|
181 |
+
"en"
|
182 |
+
],
|
183 |
+
[
|
184 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
185 |
+
"ED",
|
186 |
+
null,
|
187 |
+
"pt-br"
|
188 |
+
],
|
189 |
+
[
|
190 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
191 |
+
"VCTK_p238",
|
192 |
+
null,
|
193 |
+
"pt-br"
|
194 |
+
],
|
195 |
+
[
|
196 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
197 |
+
"gilles_g_le_blanc",
|
198 |
+
null,
|
199 |
+
"pt-br"
|
200 |
+
],
|
201 |
+
[
|
202 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
203 |
+
"ED",
|
204 |
+
null,
|
205 |
+
"pt-br"
|
206 |
+
],
|
207 |
+
[
|
208 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
209 |
+
"VCTK_p245",
|
210 |
+
null,
|
211 |
+
"pt-br"
|
212 |
+
],
|
213 |
+
[
|
214 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
215 |
+
"nadine_eckert_boulet",
|
216 |
+
null,
|
217 |
+
"pt-br"
|
218 |
+
],
|
219 |
+
[
|
220 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
221 |
+
"VCTK_p245",
|
222 |
+
null,
|
223 |
+
"fr-fr"
|
224 |
+
],
|
225 |
+
[
|
226 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
227 |
+
"ED",
|
228 |
+
null,
|
229 |
+
"fr-fr"
|
230 |
+
],
|
231 |
+
[
|
232 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
233 |
+
"ezwa",
|
234 |
+
null,
|
235 |
+
"fr-fr"
|
236 |
+
],
|
237 |
+
[
|
238 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
239 |
+
"bernard",
|
240 |
+
null,
|
241 |
+
"fr-fr"
|
242 |
+
],
|
243 |
+
[
|
244 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
245 |
+
"gilles_g_le_blanc",
|
246 |
+
null,
|
247 |
+
"fr-fr"
|
248 |
+
],
|
249 |
+
[
|
250 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
251 |
+
"nadine_eckert_boulet",
|
252 |
+
null,
|
253 |
+
"fr-fr"
|
254 |
+
],
|
255 |
+
[
|
256 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
257 |
+
"zeckou",
|
258 |
+
null,
|
259 |
+
"fr-fr"
|
260 |
+
]
|
261 |
+
],
|
262 |
+
"use_speaker_embedding": true,
|
263 |
+
"use_d_vector_file": true,
|
264 |
+
"d_vector_dim": 512,
|
265 |
+
"model_args": {
|
266 |
+
"num_chars": 165,
|
267 |
+
"out_channels": 513,
|
268 |
+
"spec_segment_size": 62,
|
269 |
+
"hidden_channels": 192,
|
270 |
+
"hidden_channels_ffn_text_encoder": 768,
|
271 |
+
"num_heads_text_encoder": 2,
|
272 |
+
"num_layers_text_encoder": 10,
|
273 |
+
"kernel_size_text_encoder": 3,
|
274 |
+
"dropout_p_text_encoder": 0.1,
|
275 |
+
"dropout_p_duration_predictor": 0.5,
|
276 |
+
"kernel_size_posterior_encoder": 5,
|
277 |
+
"dilation_rate_posterior_encoder": 1,
|
278 |
+
"num_layers_posterior_encoder": 16,
|
279 |
+
"kernel_size_flow": 5,
|
280 |
+
"dilation_rate_flow": 1,
|
281 |
+
"num_layers_flow": 4,
|
282 |
+
"resblock_type_decoder": 1,
|
283 |
+
"resblock_kernel_sizes_decoder": [
|
284 |
+
3,
|
285 |
+
7,
|
286 |
+
11
|
287 |
+
],
|
288 |
+
"resblock_dilation_sizes_decoder": [
|
289 |
+
[
|
290 |
+
1,
|
291 |
+
3,
|
292 |
+
5
|
293 |
+
],
|
294 |
+
[
|
295 |
+
1,
|
296 |
+
3,
|
297 |
+
5
|
298 |
+
],
|
299 |
+
[
|
300 |
+
1,
|
301 |
+
3,
|
302 |
+
5
|
303 |
+
]
|
304 |
+
],
|
305 |
+
"upsample_rates_decoder": [
|
306 |
+
8,
|
307 |
+
8,
|
308 |
+
2,
|
309 |
+
2
|
310 |
+
],
|
311 |
+
"upsample_initial_channel_decoder": 512,
|
312 |
+
"upsample_kernel_sizes_decoder": [
|
313 |
+
16,
|
314 |
+
16,
|
315 |
+
4,
|
316 |
+
4
|
317 |
+
],
|
318 |
+
"use_sdp": true,
|
319 |
+
"noise_scale": 1.0,
|
320 |
+
"inference_noise_scale": 0.667,
|
321 |
+
"length_scale": 1,
|
322 |
+
"noise_scale_dp": 1.0,
|
323 |
+
"inference_noise_scale_dp": 0.8,
|
324 |
+
"max_inference_len": null,
|
325 |
+
"init_discriminator": true,
|
326 |
+
"use_spectral_norm_disriminator": false,
|
327 |
+
"use_speaker_embedding": true,
|
328 |
+
"num_speakers": 1244,
|
329 |
+
"speakers_file": null,
|
330 |
+
"d_vector_file": "../speaker_embeddings/new-SE/VCTK-LibriTTS+TTS-PT+MAILABS-FR/speakers.json",
|
331 |
+
"speaker_embedding_channels": 512,
|
332 |
+
"use_d_vector_file": true,
|
333 |
+
"d_vector_dim": 512,
|
334 |
+
"detach_dp_input": true,
|
335 |
+
"use_language_embedding": true,
|
336 |
+
"embedded_language_dim": 4,
|
337 |
+
"num_languages": 3,
|
338 |
+
"use_speaker_encoder_as_loss": true,
|
339 |
+
"speaker_encoder_config_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/config.json",
|
340 |
+
"speaker_encoder_model_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/converted_checkpoint.pth.tar",
|
341 |
+
"fine_tuning_mode": 0,
|
342 |
+
"freeze_encoder": false,
|
343 |
+
"freeze_DP": false,
|
344 |
+
"freeze_PE": false,
|
345 |
+
"freeze_flow_decoder": false,
|
346 |
+
"freeze_waveform_decoder": false
|
347 |
+
},
|
348 |
+
"grad_clip": [
|
349 |
+
5.0,
|
350 |
+
5.0
|
351 |
+
],
|
352 |
+
"lr_gen": 0.0002,
|
353 |
+
"lr_disc": 0.0002,
|
354 |
+
"lr_scheduler_gen": "ExponentialLR",
|
355 |
+
"lr_scheduler_gen_params": {
|
356 |
+
"gamma": 0.999875,
|
357 |
+
"last_epoch": -1
|
358 |
+
},
|
359 |
+
"lr_scheduler_disc": "ExponentialLR",
|
360 |
+
"lr_scheduler_disc_params": {
|
361 |
+
"gamma": 0.999875,
|
362 |
+
"last_epoch": -1
|
363 |
+
},
|
364 |
+
"kl_loss_alpha": 1.0,
|
365 |
+
"disc_loss_alpha": 1.0,
|
366 |
+
"gen_loss_alpha": 1.0,
|
367 |
+
"feat_loss_alpha": 1.0,
|
368 |
+
"mel_loss_alpha": 45.0,
|
369 |
+
"dur_loss_alpha": 1.0,
|
370 |
+
"speaker_encoder_loss_alpha": 9.0,
|
371 |
+
"return_wav": true,
|
372 |
+
"r": 1
|
373 |
+
}
|
config_se.json
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": "speaker_encoder",
|
3 |
+
"run_name": "speaker_encoder",
|
4 |
+
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
|
5 |
+
"epochs": 100000,
|
6 |
+
"batch_size": null,
|
7 |
+
"eval_batch_size": null,
|
8 |
+
"mixed_precision": false,
|
9 |
+
"run_eval": true,
|
10 |
+
"test_delay_epochs": 0,
|
11 |
+
"print_eval": false,
|
12 |
+
"print_step": 50,
|
13 |
+
"tb_plot_step": 100,
|
14 |
+
"tb_model_param_stats": false,
|
15 |
+
"save_step": 1000,
|
16 |
+
"checkpoint": true,
|
17 |
+
"keep_all_best": false,
|
18 |
+
"keep_after": 10000,
|
19 |
+
"num_loader_workers": 8,
|
20 |
+
"num_val_loader_workers": 0,
|
21 |
+
"use_noise_augment": false,
|
22 |
+
"output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"audio": {
|
26 |
+
"fft_size": 512,
|
27 |
+
"win_length": 400,
|
28 |
+
"hop_length": 160,
|
29 |
+
"frame_shift_ms": null,
|
30 |
+
"frame_length_ms": null,
|
31 |
+
"stft_pad_mode": "reflect",
|
32 |
+
"sample_rate": 16000,
|
33 |
+
"resample": false,
|
34 |
+
"preemphasis": 0.97,
|
35 |
+
"ref_level_db": 20,
|
36 |
+
"do_sound_norm": false,
|
37 |
+
"do_trim_silence": false,
|
38 |
+
"trim_db": 60,
|
39 |
+
"power": 1.5,
|
40 |
+
"griffin_lim_iters": 60,
|
41 |
+
"num_mels": 64,
|
42 |
+
"mel_fmin": 0.0,
|
43 |
+
"mel_fmax": 8000.0,
|
44 |
+
"spec_gain": 20,
|
45 |
+
"signal_norm": false,
|
46 |
+
"min_level_db": -100,
|
47 |
+
"symmetric_norm": false,
|
48 |
+
"max_norm": 4.0,
|
49 |
+
"clip_norm": false,
|
50 |
+
"stats_path": null
|
51 |
+
},
|
52 |
+
"datasets": [
|
53 |
+
{
|
54 |
+
"name": "voxceleb2",
|
55 |
+
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
|
56 |
+
"meta_file_train": null,
|
57 |
+
"ununsed_speakers": null,
|
58 |
+
"meta_file_val": null,
|
59 |
+
"meta_file_attn_mask": "",
|
60 |
+
"language": "voxceleb"
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"model_params": {
|
64 |
+
"model_name": "resnet",
|
65 |
+
"input_dim": 64,
|
66 |
+
"use_torch_spec": true,
|
67 |
+
"log_input": true,
|
68 |
+
"proj_dim": 512
|
69 |
+
},
|
70 |
+
"audio_augmentation": {
|
71 |
+
"p": 0.5,
|
72 |
+
"rir": {
|
73 |
+
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
|
74 |
+
"conv_mode": "full"
|
75 |
+
},
|
76 |
+
"additive": {
|
77 |
+
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
|
78 |
+
"speech": {
|
79 |
+
"min_snr_in_db": 13,
|
80 |
+
"max_snr_in_db": 20,
|
81 |
+
"min_num_noises": 1,
|
82 |
+
"max_num_noises": 1
|
83 |
+
},
|
84 |
+
"noise": {
|
85 |
+
"min_snr_in_db": 0,
|
86 |
+
"max_snr_in_db": 15,
|
87 |
+
"min_num_noises": 1,
|
88 |
+
"max_num_noises": 1
|
89 |
+
},
|
90 |
+
"music": {
|
91 |
+
"min_snr_in_db": 5,
|
92 |
+
"max_snr_in_db": 15,
|
93 |
+
"min_num_noises": 1,
|
94 |
+
"max_num_noises": 1
|
95 |
+
}
|
96 |
+
},
|
97 |
+
"gaussian": {
|
98 |
+
"p": 0.0,
|
99 |
+
"min_amplitude": 0.0,
|
100 |
+
"max_amplitude": 1e-05
|
101 |
+
}
|
102 |
+
},
|
103 |
+
"storage": {
|
104 |
+
"sample_from_storage_p": 0.5,
|
105 |
+
"storage_size": 40
|
106 |
+
},
|
107 |
+
"max_train_step": 1000000,
|
108 |
+
"loss": "angleproto",
|
109 |
+
"grad_clip": 3.0,
|
110 |
+
"lr": 0.0001,
|
111 |
+
"lr_decay": false,
|
112 |
+
"warmup_steps": 4000,
|
113 |
+
"wd": 1e-06,
|
114 |
+
"steps_plot_stats": 100,
|
115 |
+
"num_speakers_in_batch": 100,
|
116 |
+
"num_utters_per_speaker": 4,
|
117 |
+
"skip_speakers": true,
|
118 |
+
"voice_len": 2.0
|
119 |
+
}
|
cv-speakers-pt+en-m-f.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
errormessage.wav
ADDED
Binary file (889 kB). View file
|
|
language_ids.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"en": 0,
|
3 |
+
"fr-fr": 1,
|
4 |
+
"pt-br": 2
|
5 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TTS
|
2 |
+
torchaudio==0.9.0
|
3 |
+
ipython
|
4 |
+
GitPython
|
5 |
+
openai
|
speakers.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|