Ruslan Magana Vsevolodovna committed on
Commit ee9686a
1 Parent(s): e36bae7

fixing quality audio

Files changed (2)
  1. app.py +121 -38
  2. requirements.txt +0 -0
app.py CHANGED
@@ -1,26 +1,24 @@
  import gradio as gr
  import os
+ from utils.default_models import ensure_default_models
  import sys
- import os
- import string
- import numpy as np
- import IPython
- from IPython.display import Audio
- import torch
- import argparse
- import os
+ import traceback
  from pathlib import Path
- import librosa
+ from time import perf_counter as timer
  import numpy as np
- import soundfile as sf
  import torch
  from encoder import inference as encoder
- from encoder.params_model import model_embedding_size as speaker_embedding_size
  from synthesizer.inference import Synthesizer
- from utils.argutils import print_args
- from utils.default_models import ensure_default_models
+ #from toolbox.utterance import Utterance
  from vocoder import inference as vocoder
- #import sounddevice as sd
+ import time
+ import librosa
+ import numpy as np
+ import sounddevice as sd
+ import soundfile as sf
+ import argparse
+ from utils.argutils import print_args
+
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter
  )
@@ -43,6 +41,16 @@ args = parser.parse_args()
  arg_dict = vars(args)
  print_args(args, parser)

+ # Maximum number of generated wavs to keep in memory
+ MAX_WAVS = 15
+ utterances = set()
+ current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
+ synthesizer = None # type: Synthesizer
+ current_wav = None
+ waves_list = []
+ waves_count = 0
+ waves_namelist = []
+
  # Hide GPUs from Pytorch to force CPU processing
  if arg_dict.pop("cpu"):
      os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
@@ -67,65 +75,137 @@ else:
  ## Load the models one by one.
  print("Preparing the encoder, the synthesizer and the vocoder...")
  ensure_default_models(Path("saved_models"))
- encoder.load_model(args.enc_model_fpath)
- synthesizer = Synthesizer(args.syn_model_fpath)
- vocoder.load_model(args.voc_model_fpath)
+ #encoder.load_model(args.enc_model_fpath)
+ #synthesizer = Synthesizer(args.syn_model_fpath)
+ #vocoder.load_model(args.voc_model_fpath)

  def compute_embedding(in_fpath):
+     if not encoder.is_loaded():
+         model_fpath = args.enc_model_fpath
+         print("Loading the encoder %s... " % model_fpath)
+         start = time.time()
+         encoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
      ## Computing the embedding
      # First, we load the wav using the function that the speaker encoder provides. This is
      # important: there is preprocessing that must be applied.
+
+     # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
+     # playback, so as to have a fair comparison with the generated audio
+     wav = Synthesizer.load_preprocess_wav(in_fpath)

      # The following two methods are equivalent:
      # - Directly load from the filepath:
-     preprocessed_wav = encoder.preprocess_wav(in_fpath)
+     preprocessed_wav = encoder.preprocess_wav(wav)
      # - If the wav is already loaded:
-     original_wav, sampling_rate = librosa.load(str(in_fpath))
-     preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+     #original_wav, sampling_rate = librosa.load(str(in_fpath))
+     #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+
+     # Compute the embedding
+     embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
+
      print("Loaded file successfully")

      # Then we derive the embedding. There are many functions and parameters that the
      # speaker encoder interfaces. These are mostly for in-depth research. You will typically
      # only use this function (with its default parameters):
-     embed = encoder.embed_utterance(preprocessed_wav)
+     #embed = encoder.embed_utterance(preprocessed_wav)

      return embed

- def create_spectrogram(text, embed, synthesizer):
+ def create_spectrogram(text, embed):
      # If seed is specified, reset torch seed and force synthesizer reload
      if args.seed is not None:
          torch.manual_seed(args.seed)
          synthesizer = Synthesizer(args.syn_model_fpath)
+
+     # Synthesize the spectrogram
+     model_fpath = args.syn_model_fpath
+     print("Loading the synthesizer %s... " % model_fpath)
+     start = time.time()
+     synthesizer = Synthesizer(model_fpath)
+     print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
      # The synthesizer works in batch, so you need to put your data in a list or numpy array
      texts = [text]
      embeds = [embed]
      # If you know what the attention layer alignments are, you can retrieve them here by
      # passing return_alignments=True
      specs = synthesizer.synthesize_spectrograms(texts, embeds)
-     spec = specs[0]
-     return spec
+     breaks = [spec.shape[1] for spec in specs]
+     spec = np.concatenate(specs, axis=1)
+     sample_rate = synthesizer.sample_rate
+     return spec, breaks, sample_rate

- def generate_waveform(spec):
+ def generate_waveform(current_generated):
+     speaker_name, spec, breaks = current_generated
+     assert spec is not None
+
      ## Generating the waveform
      print("Synthesizing the waveform:")
      # If seed is specified, reset torch seed and reload vocoder
      if args.seed is not None:
          torch.manual_seed(args.seed)
          vocoder.load_model(args.voc_model_fpath)
+
+     model_fpath = args.voc_model_fpath
+     # Synthesize the waveform
+     if not vocoder.is_loaded():
+         print("Loading the vocoder %s... " % model_fpath)
+         start = time.time()
+         vocoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
+     current_vocoder_fpath = model_fpath
+     def vocoder_progress(i, seq_len, b_size, gen_rate):
+         real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
+         line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
+                % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
+         print(line, "overwrite")
+
      # Synthesizing the waveform is fairly straightforward. Remember that the longer the
      # spectrogram, the more time-efficient the vocoder.
-     generated_wav = vocoder.infer_waveform(spec)
+     if current_vocoder_fpath is not None:
+         print("")
+         generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
+     else:
+         print("Waveform generation with Griffin-Lim... ")
+         generated_wav = Synthesizer.griffin_lim(spec)
+     print(" Done!", "append")

      ## Post-generation
      # There's a bug with sounddevice that makes the audio cut one second earlier, so we
      # pad it.
-     generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
+     generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")
+
+     # Add breaks
+     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
+     b_starts = np.concatenate(([0], b_ends[:-1]))
+     wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
+     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
+     generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

      # Trim excess silences to compensate for gaps in spectrograms (issue #53)
      generated_wav = encoder.preprocess_wav(generated_wav)
      return generated_wav

- def save_on_disk(generated_wav, synthesizer):
+ def save_on_disk(generated_wav, sample_rate):
      # Save it on the disk
      filename = "cloned_voice.wav"
      print(generated_wav.dtype)
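The heart of the "fixing quality audio" change is the break handling above: create_spectrogram now concatenates all synthesized spectrograms and remembers their lengths, and generate_waveform re-splits the vocoder output at those boundaries and rejoins the chunks with 150 ms of silence. A standalone sketch of that stitching step, with the sample rate and hop size as assumed constants (the app reads them from Synthesizer):

import numpy as np

SAMPLE_RATE = 16000  # assumed; the app uses Synthesizer.sample_rate
HOP_SIZE = 200       # assumed; the app uses Synthesizer.hparams.hop_size

def stitch_with_breaks(generated_wav, breaks):
    # Each entry in `breaks` is a spectrogram length in frames; multiplying
    # by the hop size converts it to a chunk length in samples.
    b_ends = np.cumsum(np.array(breaks) * HOP_SIZE)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    silences = [np.zeros(int(0.15 * SAMPLE_RATE))] * len(breaks)
    # Interleave chunk, silence, chunk, silence, ...
    return np.concatenate([x for w, s in zip(wavs, silences) for x in (w, s)])

# Two "sentences" of 100 and 80 spectrogram frames:
wav = np.random.randn(180 * HOP_SIZE)
print(stitch_with_breaks(wav, [100, 80]).shape)  # two 0.15 s silences appended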
@@ -135,41 +215,43 @@ def save_on_disk(generated_wav,synthesizer):
      #result = os.path.join(OUT, filename)
      result = filename
      print(" > Saving output to {}".format(result))
-     sf.write(result, generated_wav.astype(np.float32), synthesizer.sample_rate)
+     sf.write(result, generated_wav.astype(np.float32), sample_rate)
      print("\nSaved output as %s\n\n" % result)

      return result

- def play_audio(generated_wav, synthesizer):
+ def play_audio(generated_wav, sample_rate):
      # Play the audio (non-blocking)
      if not args.no_sound:
          try:
              sd.stop()
-             sd.play(generated_wav, synthesizer.sample_rate)
+             sd.play(generated_wav, sample_rate)
          except sd.PortAudioError as e:
              print("\nCaught exception: %s" % repr(e))
              print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
          except:
              raise

- def clone_voice(in_fpath, text, synthesizer):
+ def clone_voice(in_fpath, text):
      try:
+         speaker_name = "output"
          # Compute embedding
          embed = compute_embedding(in_fpath)
          print("Created the embedding")
          # Generating the spectrogram
-         spec = create_spectrogram(text, embed, synthesizer)
+         spec, breaks, sample_rate = create_spectrogram(text, embed)
+         current_generated = (speaker_name, spec, breaks)
          print("Created the mel spectrogram")

          # Create waveform
-         generated_wav = generate_waveform(spec)
+         generated_wav = generate_waveform(current_generated)
          print("Created the waveform")

          # Save it on the disk
-         save_on_disk(generated_wav, synthesizer)
+         save_on_disk(generated_wav, sample_rate)

          # Play the audio
-         #play_audio(generated_wav, synthesizer)
+         #play_audio(generated_wav, sample_rate)

          return
      except Exception as e:
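Since the shared synthesizer argument is gone, the sample rate now travels with the data: create_spectrogram returns it and save_on_disk consumes it. A hedged sketch of the resulting call order inside app.py, after the definitions above (the input path and text are placeholders):

from pathlib import Path

in_fpath = Path("voice_sample.wav")  # hypothetical input recording
embed = compute_embedding(in_fpath)
spec, breaks, sample_rate = create_spectrogram("Hello there.", embed)
generated_wav = generate_waveform(("output", spec, breaks))
save_on_disk(generated_wav, sample_rate)  # writes cloned_voice.wav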
@@ -214,7 +296,7 @@ def greet(Text,Voicetoclone):
      in_fpath = Path(Voicetoclone)
      #in_fpath= in_fpath.replace("\"", "").replace("\'", "")

-     out_path = clone_voice(in_fpath, text, synthesizer)
+     out_path = clone_voice(in_fpath, text)

      print(" > text: {}".format(text))
@@ -228,6 +310,7 @@ demo = gr.Interface(
          type="filepath",
          source="upload",
          label='Please upload a voice to clone (max. 30mb)')
+
      ],
      outputs="audio",
@@ -242,7 +325,7 @@ demo = gr.Interface(
      </div>''',

      examples = [
-         ["I am the cloned version of Donald Trump.Well, I think what's happening to this country is unbelievably bad. We're no longer a respected country" ,"trump.mp3"]
+         ["I am the cloned version of Donald Trump. Well, I think what's happening to this country is unbelievably bad. We're no longer a respected country" ,"trump.mp3"]

      ]
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ