Staticaliza committed
Update app.py
app.py CHANGED
@@ -173,42 +173,37 @@ bigvgan_44k_model.remove_weight_norm()
 bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
 print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and moved to CPU.")

-#
-
-
-
-
-
-
-
-def crossfade(chunk1, chunk2, overlap):
-    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
-    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
-    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
-    return chunk2
+# CSS Styling
+css = '''
+.gradio-container{max-width: 560px !important}
+h1{text-align:center}
+footer {
+    visibility: hidden
+}
+'''

 # ----------------------------
-#
+# Functions
 # ----------------------------

 @torch.no_grad()
 @torch.inference_mode()
-def voice_conversion(
+def voice_conversion(input, reference, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch):
     print("[INFO] | Voice conversion started.")

-    inference_module = model if not
-    mel_fn = to_mel if not
-    bigvgan_fn = bigvgan_model if not
-    sr_current = 22050 if not
-    hop_length_current = 256 if not
+    inference_module = model if not use_conditioned else model_f0
+    mel_fn = to_mel if not use_conditioned else to_mel_f0
+    bigvgan_fn = bigvgan_model if not use_conditioned else bigvgan_44k_model
+    sr_current = 22050 if not use_conditioned else 44100
+    hop_length_current = 256 if not use_conditioned else 512
     max_context_window = sr_current // hop_length_current * 30
     overlap_wave_len = 16 * hop_length_current
     bitrate = "320k"

     # Load audio using librosa
     print("[INFO] | Loading source and reference audio.")
-    source_audio, _ = librosa.load(
-    ref_audio, _ = librosa.load(
+    source_audio, _ = librosa.load(input, sr=sr_current)
+    ref_audio, _ = librosa.load(reference, sr=sr_current)

     # Clip reference audio to 25 seconds
     ref_audio = ref_audio[:sr_current * 25]
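The hunk above removes the equal-power crossfade helper even though voice_conversion still computes overlap_wave_len for chunk stitching. For reference, the sketch below shows how a crossfade of this shape is typically used to join overlapping output chunks; the stitching loop, chunk sizes, and dummy audio are illustrative assumptions, not code from app.py.

import numpy as np

def crossfade(chunk1, chunk2, overlap):
    # Equal-power style blend: chunk1's tail fades out while chunk2's head fades in.
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

# Illustrative stitching of generated chunks that overlap by overlap_wave_len samples.
hop_length_current = 256
overlap_wave_len = 16 * hop_length_current
chunks = [np.random.randn(22050 + overlap_wave_len) for _ in range(3)]  # dummy audio chunks
stitched = chunks[0]
for nxt in chunks[1:]:
    blended = crossfade(stitched, nxt.copy(), overlap_wave_len)
    stitched = np.concatenate([stitched[:-overlap_wave_len], blended])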
@@ -325,7 +320,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")

     # Length adjustment
-    target_lengths = torch.LongTensor([int(mel.size(2) *
+    target_lengths = torch.LongTensor([int(mel.size(2) * speed)]).to(mel.device)
     target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
     print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")

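The new speed argument replaces the old Length Adjustment slider, whose description noted that values below 1.0 speed speech up and values above 1.0 slow it down: the requested mel length is simply the source frame count scaled by this factor. A small illustrative check, using a placeholder mel tensor:

import torch

mel = torch.zeros(1, 80, 400)  # placeholder mel spectrogram: (batch, n_mels, frames)
for speed in (0.8, 1.0, 1.2):
    target_lengths = torch.LongTensor([int(mel.size(2) * speed)])
    print(speed, target_lengths.item())  # 0.8 -> 320 frames (faster), 1.2 -> 480 frames (slower)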
@@ -342,7 +337,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     print(f"[INFO] | Style2 shape: {style2.shape}")

     # F0 Conditioning
-    if
+    if use_conditioned:
         print("[INFO] | Performing F0 conditioning.")
         F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
         F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
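When use_conditioned is enabled, the surrounding code (partly visible in the next hunk) aligns the source F0 track to the reference by matching median log-F0 over voiced frames (F0 > 1). The sketch below illustrates that alignment step with placeholder F0 tracks; the clamping constant and the exact median computation are assumptions, since those lines fall outside this diff.

import torch

# Placeholder F0 tracks in Hz; zeros mark unvoiced frames.
F0_ori = torch.tensor([0.0, 180.0, 190.0, 200.0, 0.0])
F0_alt = torch.tensor([0.0, 120.0, 130.0, 125.0, 0.0])

log_f0_ori = torch.log(F0_ori.clamp(min=1e-5))
log_f0_alt = torch.log(F0_alt.clamp(min=1e-5))
median_log_f0_ori = torch.median(log_f0_ori[F0_ori > 1])
median_log_f0_alt = torch.median(log_f0_alt[F0_alt > 1])

# Shift voiced frames of the source so its median log-F0 matches the reference.
shifted_log_f0_alt = log_f0_alt.clone()
shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
shifted_f0_alt = torch.exp(shifted_log_f0_alt)
print(shifted_f0_alt)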
@@ -367,8 +362,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
             log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
         )
         shifted_f0_alt = torch.exp(shifted_log_f0_alt)
-        if
-        shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1],
+        if pitch != 0:
+            shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch)
         print("[INFO] | F0 conditioning completed.")
     else:
         F0_ori = None
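The new pitch argument is forwarded to adjust_f0_semitones, whose definition is outside this diff. Assuming the conventional behaviour, where a shift of n semitones scales frequency by 2**(n/12), a minimal sketch:

import torch

def adjust_f0_semitones(f0: torch.Tensor, n_semitones: float) -> torch.Tensor:
    # Assumed conventional definition; the actual helper lives elsewhere in app.py.
    return f0 * (2.0 ** (n_semitones / 12.0))

f0 = torch.tensor([220.0, 440.0])
print(adjust_f0_semitones(f0, 12))   # tensor([440., 880.]) -> one octave up
print(adjust_f0_semitones(f0, -12))  # tensor([110., 220.]) -> one octave down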
@@ -401,8 +396,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
         mel2,
         style2,
         None,
-
-        inference_cfg_rate=
+        steps,
+        inference_cfg_rate=guidance
     )
     vc_target = vc_target[:, :, mel2.size(2):]
     print(f"[INFO] | vc_target shape: {vc_target.shape}")
@@ -438,9 +433,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c

     return temp_file_path

-
-
-# ----------------------------
+def cloud():
+    print("[CLOUD] | Space maintained.")

 @spaces.GPU(duration=15)
 def gpu():
@@ -450,32 +444,38 @@ def gpu():
 # Gradio Interface
 # ----------------------------

-
-
-
-
-
+for model_file in model_files:
+    model_path = os.path.join(model_repo_dir, model_file)
+    tts_instance = TTS(model_path)
+    model_to_speakers[model_file] = tts_instance.get_speakers()
+    del tts_instance
+
+with gr.Blocks(css=css) as main:
+    with gr.Column():
+        gr.Markdown("🪄 Add tone to audio.")
+
+    with gr.Column():
+        input = gr.Audio(label="Input Audio", type="filepath")
+        reference_input = gr.Audio(label="Reference Audio", type="filepath")
+
+    with gr.Column():
+        steps = gr.Slider(label="Steps", value=1, minimum=1, maximum=100, step=1)
+        guidance = gr.Slider(label="Guidance", value=0.7, minimum=0.0, maximum=1.0, step=0.1)
+        speed = gr.Slider(label="Speed", value=1.0, minimum=0.5, maximum=2.0, step=0.1)
+
+    with gr.Column():
+        use_conditioned = gr.Checkbox(label="Use 'F0 Conditioned Model'", value=False)
+        use_auto_adjustment = gr.Checkbox(label="Use 'Auto F0 Adjustment' with 'F0 Conditioned Model'", value=True)
+        pitch = gr.Slider(label="Pitch with 'F0 Conditioned Model'", value=0, minimum=-12, maximum=12, step=1)
+
+    with gr.Column():
+        submit = gr.Button("▶")
+        maintain = gr.Button("☁️")
+
+    with gr.Column():
+        output = gr.Audio(label="Output", type="filepath")
+
+    submit.click(voice_conversion, inputs=[input, reference_input, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch], outputs=output, queue=False)
+    maintain.click(cloud, inputs=[], outputs=[], queue=False)

-
-    gr.Audio(type="filepath", label="Source Audio"),
-    gr.Audio(type="filepath", label="Reference Audio"),
-    gr.Slider(minimum=1, maximum=100, value=25, step=1, label="Diffusion Steps", info="Default is 25. Use 50-100 for best quality."),
-    gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjustment", info="<1.0 to speed up speech, >1.0 to slow down speech."),
-    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="Has a subtle influence."),
-    gr.Checkbox(label="Use F0 Conditioned Model", value=False, info="Must be enabled for singing voice conversion."),
-    gr.Checkbox(label="Auto F0 Adjustment", value=True, info="Roughly adjusts F0 to match target voice. Only works when 'Use F0 Conditioned Model' is enabled."),
-    gr.Slider(label='Pitch Shift (semitones)', minimum=-12, maximum=12, step=1, value=0, info="Pitch shift in semitones. Only works when 'Use F0 Conditioned Model' is enabled."),
-]
-
-# Set outputs to a single gr.Audio component with type="filepath"
-outputs = gr.Audio(label="Full Output Audio", type="filepath")
-
-gr.Interface(
-    fn=voice_conversion,
-    description=description,
-    inputs=inputs,
-    outputs=outputs,
-    title="Seed Voice Conversion",
-    cache_examples=False,
-    allow_flagging="never"
-).launch(share=True)
+main.launch(show_api=True)
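With the gr.Interface wrapper replaced by gr.Blocks, voice_conversion now receives eight positional inputs in the order wired into submit.click. It can also be called directly with the same ordering; the file paths below are placeholders, and the numeric values mirror the defaults of the old interface (25 steps, 0.7 CFG rate, 1.0 length):

# Placeholder paths; argument order matches the submit.click inputs list above.
result_path = voice_conversion(
    "source.wav",     # input: audio to convert
    "reference.wav",  # reference: target voice sample
    25,               # steps: diffusion steps
    0.7,              # guidance: inference CFG rate
    1.0,              # speed: length adjustment factor
    False,            # use_conditioned: F0-conditioned model
    True,             # use_auto_adjustment: automatic F0 matching
    0,                # pitch: shift in semitones
)
print(result_path)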