ruslanmv committed
Commit
819f1a5
1 Parent(s): 6eb84b2

Update app.py

Files changed (1)
  1. app.py +376 -376
app.py CHANGED
@@ -1,377 +1,377 @@
- import gradio as gr
- import os
- from utils.default_models import ensure_default_models
- import sys
- import traceback
- from pathlib import Path
- from time import perf_counter as timer
- import numpy as np
- import torch
- from encoder import inference as encoder
- from synthesizer.inference import Synthesizer
- #from toolbox.utterance import Utterance
- from vocoder import inference as vocoder
- import time
- import librosa
- import numpy as np
- #import sounddevice as sd
- import soundfile as sf
- import argparse
- from utils.argutils import print_args
-
- parser = argparse.ArgumentParser(
-     formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument("-e", "--enc_model_fpath", type=Path,
-                     default="saved_models/default/encoder.pt",
-                     help="Path to a saved encoder")
- parser.add_argument("-s", "--syn_model_fpath", type=Path,
-                     default="saved_models/default/synthesizer.pt",
-                     help="Path to a saved synthesizer")
- parser.add_argument("-v", "--voc_model_fpath", type=Path,
-                     default="saved_models/default/vocoder.pt",
-                     help="Path to a saved vocoder")
- parser.add_argument("--cpu", action="store_true", help=\
-     "If True, processing is done on CPU, even when a GPU is available.")
- parser.add_argument("--no_sound", action="store_true", help=\
-     "If True, audio won't be played.")
- parser.add_argument("--seed", type=int, default=None, help=\
-     "Optional random number seed value to make toolbox deterministic.")
- args = parser.parse_args()
- arg_dict = vars(args)
- print_args(args, parser)
-
- # Maximum number of generated wavs to keep in memory
- MAX_WAVS = 15
- utterances = set()
- current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
- synthesizer = None # type: Synthesizer
- current_wav = None
- waves_list = []
- waves_count = 0
- waves_namelist = []
-
- # Hide GPUs from Pytorch to force CPU processing
- if arg_dict.pop("cpu"):
-     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-
- print("Running a test of your configuration...\n")
-
- if torch.cuda.is_available():
-     device_id = torch.cuda.current_device()
-     gpu_properties = torch.cuda.get_device_properties(device_id)
-     ## Print some environment information (for debugging purposes)
-     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
-           "%.1fGb total memory.\n" %
-           (torch.cuda.device_count(),
-            device_id,
-            gpu_properties.name,
-            gpu_properties.major,
-            gpu_properties.minor,
-            gpu_properties.total_memory / 1e9))
- else:
-     print("Using CPU for inference.\n")
-
- ## Load the models one by one.
- print("Preparing the encoder, the synthesizer and the vocoder...")
- ensure_default_models(Path("saved_models"))
- #encoder.load_model(args.enc_model_fpath)
- #synthesizer = Synthesizer(args.syn_model_fpath)
- #vocoder.load_model(args.voc_model_fpath)
-
- def compute_embedding(in_fpath):
-
-     if not encoder.is_loaded():
-         model_fpath = args.enc_model_fpath
-         print("Loading the encoder %s... " % model_fpath)
-         start = time.time()
-         encoder.load_model(model_fpath)
-         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
-
-
-     ## Computing the embedding
-     # First, we load the wav using the function that the speaker encoder provides. This is
-
-     # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
-     # playback, so as to have a fair comparison with the generated audio
-     wav = Synthesizer.load_preprocess_wav(in_fpath)
-
-     # important: there is preprocessing that must be applied.
-
-     # The following two methods are equivalent:
-     # - Directly load from the filepath:
-     preprocessed_wav = encoder.preprocess_wav(wav)
-
-     # - If the wav is already loaded:
-     #original_wav, sampling_rate = librosa.load(str(in_fpath))
-     #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-
-     # Compute the embedding
-     embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
-
-
-     print("Loaded file successfully")
-
-     # Then we derive the embedding. There are many functions and parameters that the
-     # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-     # only use this function (with its default parameters):
-     #embed = encoder.embed_utterance(preprocessed_wav)
-
-     return embed
- def create_spectrogram(text, embed):
-     # If seed is specified, reset torch seed and force synthesizer reload
-     if args.seed is not None:
-         torch.manual_seed(args.seed)
-         synthesizer = Synthesizer(args.syn_model_fpath)
-
-
-     # Synthesize the spectrogram
-     model_fpath = args.syn_model_fpath
-     print("Loading the synthesizer %s... " % model_fpath)
-     start = time.time()
-     synthesizer = Synthesizer(model_fpath)
-     print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
-
-
-     # The synthesizer works in batch, so you need to put your data in a list or numpy array
-     texts = [text]
-     embeds = [embed]
-     # If you know what the attention layer alignments are, you can retrieve them here by
-     # passing return_alignments=True
-     specs = synthesizer.synthesize_spectrograms(texts, embeds)
-     breaks = [spec.shape[1] for spec in specs]
-     spec = np.concatenate(specs, axis=1)
-     sample_rate = synthesizer.sample_rate
-     return spec, breaks, sample_rate
-
-
- def generate_waveform(current_generated):
-
-     speaker_name, spec, breaks = current_generated
-     assert spec is not None
-
-     ## Generating the waveform
-     print("Synthesizing the waveform:")
-     # If seed is specified, reset torch seed and reload vocoder
-     if args.seed is not None:
-         torch.manual_seed(args.seed)
-         vocoder.load_model(args.voc_model_fpath)
-
-     model_fpath = args.voc_model_fpath
-     # Synthesize the waveform
-     if not vocoder.is_loaded():
-         print("Loading the vocoder %s... " % model_fpath)
-         start = time.time()
-         vocoder.load_model(model_fpath)
-         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
-
-     current_vocoder_fpath = model_fpath
-     def vocoder_progress(i, seq_len, b_size, gen_rate):
-         real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
-         line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
-                % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
-         print(line, "overwrite")
-
-
-     # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-     # spectrogram, the more time-efficient the vocoder.
-     if current_vocoder_fpath is not None:
-         print("")
-         generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
-     else:
-         print("Waveform generation with Griffin-Lim... ")
-         generated_wav = Synthesizer.griffin_lim(spec)
-
-     print(" Done!", "append")
-
-
-     ## Post-generation
-     # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-     # pad it.
-     generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")
-
-     # Add breaks
-     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
-     b_starts = np.concatenate(([0], b_ends[:-1]))
-     wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
-     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
-     generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
-
-
-     # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-     generated_wav = encoder.preprocess_wav(generated_wav)
-
-
-     return generated_wav
-
-
- def save_on_disk(generated_wav, sample_rate):
-     # Save it on the disk
-     filename = "cloned_voice.wav"
-     print(generated_wav.dtype)
-     #OUT=os.environ['OUT_PATH']
-     # Returns `None` if key doesn't exist
-     #OUT=os.environ.get('OUT_PATH')
-     #result = os.path.join(OUT, filename)
-     result = filename
-     print(" > Saving output to {}".format(result))
-     sf.write(result, generated_wav.astype(np.float32), sample_rate)
-     print("\nSaved output as %s\n\n" % result)
-
-     return result
- def play_audio(generated_wav, sample_rate):
-     # Play the audio (non-blocking)
-     if not args.no_sound:
-
-         try:
-             sd.stop()
-             sd.play(generated_wav, sample_rate)
-         except sd.PortAudioError as e:
-             print("\nCaught exception: %s" % repr(e))
-             print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-         except:
-             raise
-
-
- def clean_memory():
-     import gc
-     #import GPUtil
-     # To see memory usage
-     print('Before clean ')
-     #GPUtil.showUtilization()
-     #cleaning memory 1
-     gc.collect()
-     torch.cuda.empty_cache()
-     time.sleep(2)
-     print('After Clean GPU')
-     #GPUtil.showUtilization()
-
- def clone_voice(in_fpath, text):
-     try:
-         speaker_name = "output"
-         # Compute embedding
-         embed = compute_embedding(in_fpath)
-         print("Created the embedding")
-         # Generating the spectrogram
-         spec, breaks, sample_rate = create_spectrogram(text, embed)
-         current_generated = (speaker_name, spec, breaks)
-         print("Created the mel spectrogram")
-
-         # Create waveform
-         generated_wav = generate_waveform(current_generated)
-         print("Created the waveform")
-
-         # Save it on the disk
-         save_on_disk(generated_wav, sample_rate)
-
-         # Play the audio
-         #play_audio(generated_wav, sample_rate)
-
-         return
-     except Exception as e:
-         print("Caught exception: %s" % repr(e))
-         print("Restarting\n")
-
- # Set environment variables
- home_dir = os.getcwd()
- OUT_PATH = os.path.join(home_dir, "out/")
- os.environ['OUT_PATH'] = OUT_PATH
-
- # create output path
- os.makedirs(OUT_PATH, exist_ok=True)
-
- USE_CUDA = torch.cuda.is_available()
-
- os.system('pip install -q pydub ffmpeg-normalize')
- CONFIG_SE_PATH = "config_se.json"
- CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
- def greet(Text, Voicetoclone, input_mic=None):
-     text = "%s" % (Text)
-     #reference_files= "%s" % (Voicetoclone)
-
-     clean_memory()
-     print(text, len(text), type(text))
-     print(Voicetoclone, type(Voicetoclone))
-
-     if len(text) == 0:
-         print("Please add text to the program")
-         Text = "Please add text to the program, thank you."
-         is_no_text = True
-     else:
-         is_no_text = False
-
-
-     if Voicetoclone is None and input_mic is None:
-         print("There is no input audio")
-         Text = "Please add audio input to the program, thank you."
-         Voicetoclone = 'trump.mp3'
-         if is_no_text:
-             Text = "Please add text and audio to the program, thank you."
-
-     if input_mic != "" and input_mic is not None:
-         # Get the wav file from the microphone
-         print('The value of MIC IS :', input_mic, type(input_mic))
-         Voicetoclone = input_mic
-
-     text = "%s" % (Text)
-     reference_files = Voicetoclone
-     print("path url")
-     print(Voicetoclone)
-     sample = str(Voicetoclone)
-     os.environ['sample'] = sample
-     size = len(reference_files) * sys.getsizeof(reference_files)
-     size2 = size / 1000000
-     if (size2 > 0.012) or len(text) > 2000:
-         message = "File is greater than 30mb or the text is longer than 2000 characters. Please retry with smaller sizes."
-         print(message)
-         raise SystemExit("File is greater than 30mb or the text is longer than 2000 characters. Please retry with smaller sizes.")
-     else:
-
-         env_var = 'sample'
-         if env_var in os.environ:
-             print(f'{env_var} value is {os.environ[env_var]}')
-         else:
-             print(f'{env_var} does not exist')
-         #os.system(f'ffmpeg-normalize {os.environ[env_var]} -nt rms -t=-27 -o {os.environ[env_var]} -ar 16000 -f')
-         in_fpath = Path(Voicetoclone)
-         #in_fpath= in_fpath.replace("\"", "").replace("\'", "")
-
-         out_path = clone_voice(in_fpath, text)
-
-         print(" > text: {}".format(text))
-
-         print("Generated Audio")
-         return "cloned_voice.wav"
-
- demo = gr.Interface(
-     fn=greet,
-     inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
-             gr.Audio(
-                 type="filepath",
-                 source="upload",
-                 label='Please upload a voice to clone (max. 30mb)'),
-             gr.inputs.Audio(
-                 source="microphone",
-                 label='or record',
-                 type="filepath",
-                 optional=True)
-             ],
-     outputs="audio",
-
-     title='Clone Your Voice',
-     description='A simple application that clones your voice. Wait one minute for processing.',
-     article=
-     '''<div>
-     <p style="text-align: center">All you need to do is record your voice, type what you want it to say,
-     then wait for it to process. After that, click Play/Pause to listen to the audio. The audio is saved in WAV format.
-     For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
-     </p>
-     </div>''',
-
-     examples=[["I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country", "trump.mp3", "trump.mp3"],
-               ["I am the cloned version of Elon Musk. Persistence is very important. You should not give up unless you are forced to give up.", "musk.mp3", "musk.mp3"],
-               ["I am the cloned version of Elizabeth. It has always been easy to hate and destroy. To build and to cherish is much more difficult.", "queen.mp3", "queen.mp3"]
-               ]
-
- )
  demo.launch()
 
+ import gradio as gr
+ import os
+ from utils.default_models import ensure_default_models
+ import sys
+ import traceback
+ from pathlib import Path
+ from time import perf_counter as timer
+ import numpy as np
+ import torch
+ from encoder import inference as encoder
+ from synthesizer.inference import Synthesizer
+ #from toolbox.utterance import Utterance
+ from vocoder import inference as vocoder
+ import time
+ import librosa
+ import numpy as np
+ #import sounddevice as sd
+ import soundfile as sf
+ import argparse
+ from utils.argutils import print_args
+
+ parser = argparse.ArgumentParser(
+     formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ parser.add_argument("-e", "--enc_model_fpath", type=Path,
+                     default="saved_models/default/encoder.pt",
+                     help="Path to a saved encoder")
+ parser.add_argument("-s", "--syn_model_fpath", type=Path,
+                     default="saved_models/default/synthesizer.pt",
+                     help="Path to a saved synthesizer")
+ parser.add_argument("-v", "--voc_model_fpath", type=Path,
+                     default="saved_models/default/vocoder.pt",
+                     help="Path to a saved vocoder")
+ parser.add_argument("--cpu", action="store_true", help=\
+     "If True, processing is done on CPU, even when a GPU is available.")
+ parser.add_argument("--no_sound", action="store_true", help=\
+     "If True, audio won't be played.")
+ parser.add_argument("--seed", type=int, default=None, help=\
+     "Optional random number seed value to make toolbox deterministic.")
+ args = parser.parse_args()
+ arg_dict = vars(args)
+ print_args(args, parser)
+
+ # Maximum number of generated wavs to keep in memory
+ MAX_WAVS = 15
+ utterances = set()
+ current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
+ synthesizer = None # type: Synthesizer
+ current_wav = None
+ waves_list = []
+ waves_count = 0
+ waves_namelist = []
+
+ # Hide GPUs from Pytorch to force CPU processing
+ if arg_dict.pop("cpu"):
+     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+ print("Running a test of your configuration...\n")
+
+ if torch.cuda.is_available():
+     device_id = torch.cuda.current_device()
+     gpu_properties = torch.cuda.get_device_properties(device_id)
+     ## Print some environment information (for debugging purposes)
+     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+           "%.1fGb total memory.\n" %
+           (torch.cuda.device_count(),
+            device_id,
+            gpu_properties.name,
+            gpu_properties.major,
+            gpu_properties.minor,
+            gpu_properties.total_memory / 1e9))
+ else:
+     print("Using CPU for inference.\n")
+
+ ## Load the models one by one.
+ print("Preparing the encoder, the synthesizer and the vocoder...")
+ ensure_default_models(Path("saved_models"))
+ #encoder.load_model(args.enc_model_fpath)
+ #synthesizer = Synthesizer(args.syn_model_fpath)
+ #vocoder.load_model(args.voc_model_fpath)
+
+ def compute_embedding(in_fpath):
+
+     if not encoder.is_loaded():
+         model_fpath = args.enc_model_fpath
+         print("Loading the encoder %s... " % model_fpath)
+         start = time.time()
+         encoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
+
+     ## Computing the embedding
+     # First, we load the wav using the function that the speaker encoder provides. This is
+
+     # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
+     # playback, so as to have a fair comparison with the generated audio
+     wav = Synthesizer.load_preprocess_wav(in_fpath)
+
+     # important: there is preprocessing that must be applied.
+
+     # The following two methods are equivalent:
+     # - Directly load from the filepath:
+     preprocessed_wav = encoder.preprocess_wav(wav)
+
+     # - If the wav is already loaded:
+     #original_wav, sampling_rate = librosa.load(str(in_fpath))
+     #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+
+     # Compute the embedding
+     embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
+
+
+     print("Loaded file successfully")
+
+     # Then we derive the embedding. There are many functions and parameters that the
+     # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+     # only use this function (with its default parameters):
+     #embed = encoder.embed_utterance(preprocessed_wav)
+
+     return embed
+ def create_spectrogram(text, embed):
+     # If seed is specified, reset torch seed and force synthesizer reload
+     if args.seed is not None:
+         torch.manual_seed(args.seed)
+         synthesizer = Synthesizer(args.syn_model_fpath)
+
+
+     # Synthesize the spectrogram
+     model_fpath = args.syn_model_fpath
+     print("Loading the synthesizer %s... " % model_fpath)
+     start = time.time()
+     synthesizer = Synthesizer(model_fpath)
+     print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
+
+     # The synthesizer works in batch, so you need to put your data in a list or numpy array
+     texts = [text]
+     embeds = [embed]
+     # If you know what the attention layer alignments are, you can retrieve them here by
+     # passing return_alignments=True
+     specs = synthesizer.synthesize_spectrograms(texts, embeds)
+     breaks = [spec.shape[1] for spec in specs]
+     spec = np.concatenate(specs, axis=1)
+     sample_rate = synthesizer.sample_rate
+     return spec, breaks, sample_rate
+
+
+ def generate_waveform(current_generated):
+
+     speaker_name, spec, breaks = current_generated
+     assert spec is not None
+
+     ## Generating the waveform
+     print("Synthesizing the waveform:")
+     # If seed is specified, reset torch seed and reload vocoder
+     if args.seed is not None:
+         torch.manual_seed(args.seed)
+         vocoder.load_model(args.voc_model_fpath)
+
+     model_fpath = args.voc_model_fpath
+     # Synthesize the waveform
+     if not vocoder.is_loaded():
+         print("Loading the vocoder %s... " % model_fpath)
+         start = time.time()
+         vocoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
+     current_vocoder_fpath = model_fpath
+     def vocoder_progress(i, seq_len, b_size, gen_rate):
+         real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
+         line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
+                % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
+         print(line, "overwrite")
+
+
+     # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+     # spectrogram, the more time-efficient the vocoder.
+     if current_vocoder_fpath is not None:
+         print("")
+         generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
+     else:
+         print("Waveform generation with Griffin-Lim... ")
+         generated_wav = Synthesizer.griffin_lim(spec)
+
+     print(" Done!", "append")
+
+
+     ## Post-generation
+     # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+     # pad it.
+     generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")
+
+     # Add breaks
+     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
+     b_starts = np.concatenate(([0], b_ends[:-1]))
+     wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
+     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
+     generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
+
+
+     # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+     generated_wav = encoder.preprocess_wav(generated_wav)
+
+
+     return generated_wav
+
+
+ def save_on_disk(generated_wav, sample_rate):
+     # Save it on the disk
+     filename = "cloned_voice.wav"
+     print(generated_wav.dtype)
+     #OUT=os.environ['OUT_PATH']
+     # Returns `None` if key doesn't exist
+     #OUT=os.environ.get('OUT_PATH')
+     #result = os.path.join(OUT, filename)
+     result = filename
+     print(" > Saving output to {}".format(result))
+     sf.write(result, generated_wav.astype(np.float32), sample_rate)
+     print("\nSaved output as %s\n\n" % result)
+
+     return result
+ def play_audio(generated_wav, sample_rate):
+     # Play the audio (non-blocking)
+     if not args.no_sound:
+
+         try:
+             sd.stop()
+             sd.play(generated_wav, sample_rate)
+         except sd.PortAudioError as e:
+             print("\nCaught exception: %s" % repr(e))
+             print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
+         except:
+             raise
+
+
+ def clean_memory():
+     import gc
+     #import GPUtil
+     # To see memory usage
+     print('Before clean ')
+     #GPUtil.showUtilization()
+     #cleaning memory 1
+     gc.collect()
+     torch.cuda.empty_cache()
+     time.sleep(2)
+     print('After Clean GPU')
+     #GPUtil.showUtilization()
+
+ def clone_voice(in_fpath, text):
+     try:
+         speaker_name = "output"
+         # Compute embedding
+         embed = compute_embedding(in_fpath)
+         print("Created the embedding")
+         # Generating the spectrogram
+         spec, breaks, sample_rate = create_spectrogram(text, embed)
+         current_generated = (speaker_name, spec, breaks)
+         print("Created the mel spectrogram")
+
+         # Create waveform
+         generated_wav = generate_waveform(current_generated)
+         print("Created the waveform")
+
+         # Save it on the disk
+         save_on_disk(generated_wav, sample_rate)
+
+         # Play the audio
+         #play_audio(generated_wav, sample_rate)
+
+         return
+     except Exception as e:
+         print("Caught exception: %s" % repr(e))
+         print("Restarting\n")
+
+ # Set environment variables
+ home_dir = os.getcwd()
+ OUT_PATH = os.path.join(home_dir, "out/")
+ os.environ['OUT_PATH'] = OUT_PATH
+
+ # create output path
+ os.makedirs(OUT_PATH, exist_ok=True)
+
+ USE_CUDA = torch.cuda.is_available()
+
+ os.system('pip install -q pydub ffmpeg-normalize')
+ CONFIG_SE_PATH = "config_se.json"
+ CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
+ def greet(Text, Voicetoclone, input_mic=None):
+     text = "%s" % (Text)
+     #reference_files= "%s" % (Voicetoclone)
+
+     clean_memory()
+     print(text, len(text), type(text))
+     print(Voicetoclone, type(Voicetoclone))
+
+     if len(text) == 0:
+         print("Please add text to the program")
+         Text = "Please add text to the program, thank you."
+         is_no_text = True
+     else:
+         is_no_text = False
+
+
+     if Voicetoclone is None and input_mic is None:
+         print("There is no input audio")
+         Text = "Please add audio input to the program, thank you."
+         Voicetoclone = 'trump.mp3'
+         if is_no_text:
+             Text = "Please add text and audio to the program, thank you."
+
+     if input_mic != "" and input_mic is not None:
+         # Get the wav file from the microphone
+         print('The value of MIC IS :', input_mic, type(input_mic))
+         Voicetoclone = input_mic
+
+     text = "%s" % (Text)
+     reference_files = Voicetoclone
+     print("path url")
+     print(Voicetoclone)
+     sample = str(Voicetoclone)
+     os.environ['sample'] = sample
+     size = len(reference_files) * sys.getsizeof(reference_files)
+     size2 = size / 1000000
+     if (size2 > 0.012) or len(text) > 2000:
+         message = "File is greater than 30mb or the text is longer than 2000 characters. Please retry with smaller sizes."
+         print(message)
+         raise SystemExit("File is greater than 30mb or the text is longer than 2000 characters. Please retry with smaller sizes.")
+     else:
+
+         env_var = 'sample'
+         if env_var in os.environ:
+             print(f'{env_var} value is {os.environ[env_var]}')
+         else:
+             print(f'{env_var} does not exist')
+         #os.system(f'ffmpeg-normalize {os.environ[env_var]} -nt rms -t=-27 -o {os.environ[env_var]} -ar 16000 -f')
+         in_fpath = Path(Voicetoclone)
+         #in_fpath= in_fpath.replace("\"", "").replace("\'", "")
+
+         out_path = clone_voice(in_fpath, text)
+
+         print(" > text: {}".format(text))
+
+         print("Generated Audio")
+         return "cloned_voice.wav"
+
+ demo = gr.Interface(
+     fn=greet,
+     inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+             gr.Audio(
+                 type="filepath",
+                 source="upload",
+                 label='Please upload a voice to clone (max. 30mb)'),
+             gr.inputs.Audio(
+                 source="microphone",
+                 label='or record',
+                 type="filepath",
+                 optional=True)
+             ],
+     outputs="audio",
+
+     title='Clone Your Voice',
+     description='A simple application that clones your voice. Wait one minute for processing.',
+     article=
+     '''<div>
+     <p style="text-align: center">All you need to do is record your voice, type what you want it to say,
+     then wait for it to process. After that, click Play/Pause to listen to the audio. The audio is saved in WAV format.
+     For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
+     </p>
+     </div>''',
+
+     #examples=[["I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country", "trump.mp3", "trump.mp3"],
+     #          ["I am the cloned version of Elon Musk. Persistence is very important. You should not give up unless you are forced to give up.", "musk.mp3", "musk.mp3"],
+     #          ["I am the cloned version of Elizabeth. It has always been easy to hate and destroy. To build and to cherish is much more difficult.", "queen.mp3", "queen.mp3"]
+     #          ]
+
+ )
  demo.launch()
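
For quick local testing, the pipeline in app.py can also be exercised without the Gradio UI. The snippet below is a minimal, hypothetical sketch and is not part of this commit: it assumes the default models have already been downloaded to saved_models/default/ and that a reference clip such as trump.mp3 sits in the working directory, and it simply calls the clone_voice() helper defined above, which writes its result to cloned_voice.wav.

# Hypothetical usage sketch: run in place of demo.launch() at the end of app.py.
from pathlib import Path

# clone_voice() computes the speaker embedding, synthesizes the mel spectrogram,
# runs the vocoder, and saves the audio as cloned_voice.wav in the working directory.
clone_voice(Path("trump.mp3"), "This is a short test sentence for the cloned voice.")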