Staticaliza committed
Commit 8bb8aaa · verified · 1 Parent(s): 4956f0c

Update app.py

Files changed (1):
  1. app.py (+59 -59)
app.py CHANGED
@@ -173,42 +173,37 @@ bigvgan_44k_model.remove_weight_norm()
bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and moved to CPU.")

- # ----------------------------
- # Helper Functions
- # ----------------------------
-
- def adjust_f0_semitones(f0_sequence, n_semitones):
-     factor = 2 ** (n_semitones / 12)
-     return f0_sequence * factor
-
- def crossfade(chunk1, chunk2, overlap):
-     fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
-     fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
-     chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
-     return chunk2

# ----------------------------
- # Voice Conversion Function
# ----------------------------

@torch.no_grad()
@torch.inference_mode()
- def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
    print("[INFO] | Voice conversion started.")

-     inference_module = model if not f0_condition else model_f0
-     mel_fn = to_mel if not f0_condition else to_mel_f0
-     bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
-     sr_current = 22050 if not f0_condition else 44100
-     hop_length_current = 256 if not f0_condition else 512
    max_context_window = sr_current // hop_length_current * 30
    overlap_wave_len = 16 * hop_length_current
    bitrate = "320k"

    # Load audio using librosa
    print("[INFO] | Loading source and reference audio.")
-     source_audio, _ = librosa.load(source, sr=sr_current)
-     ref_audio, _ = librosa.load(target, sr=sr_current)

    # Clip reference audio to 25 seconds
    ref_audio = ref_audio[:sr_current * 25]
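The two helpers deleted in this hunk are compact but easy to misread: adjust_f0_semitones scales an F0 track by 2**(n/12), so +12 semitones doubles the pitch, and crossfade blends two chunks with cosine-squared fades whose gains sum to 1 at every sample. A standalone restatement with a quick sanity check (NumPy only, illustrative rather than the app's exact call sites):

```python
import numpy as np

def adjust_f0_semitones(f0_sequence, n_semitones):
    # 2 ** (n / 12): +12 semitones doubles F0, -12 halves it.
    return f0_sequence * 2 ** (n_semitones / 12)

def crossfade(chunk1, chunk2, overlap):
    # Cosine-squared fades; fade_in + fade_out == 1 for every sample,
    # so the overlapped region keeps a constant gain.
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

print(adjust_f0_semitones(np.array([220.0]), 12))            # [440.]
a, b = np.ones(4096), np.ones(4096)
print(np.allclose(crossfade(a, b.copy(), 256)[:256], 1.0))   # True
```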
@@ -325,7 +320,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
    print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")

    # Length adjustment
-     target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
    target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
    print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")
@@ -342,7 +337,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
    print(f"[INFO] | Style2 shape: {style2.shape}")

    # F0 Conditioning
-     if f0_condition:
        print("[INFO] | Performing F0 conditioning.")
        F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
        F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
@@ -367,8 +362,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
            log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
        )
        shifted_f0_alt = torch.exp(shifted_log_f0_alt)
-         if pitch_shift != 0:
-             shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
        print("[INFO] | F0 conditioning completed.")
    else:
        F0_ori = None
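The surrounding context implements the auto F0 adjustment: voiced frames of the source F0 are shifted in log space by the difference between the reference's and the source's median log-F0, which transposes the whole contour by the ratio of median pitches before the optional semitone shift is applied. A small NumPy sketch of that arithmetic (illustrative values; the app itself works on torch tensors built from RMVPE output):

```python
import numpy as np

F0_alt = np.array([0.0, 110.0, 220.0, 440.0])   # source F0 track (0 = unvoiced)
F0_ori = np.array([0.0, 330.0, 330.0, 660.0])   # reference F0 track

log_f0_alt = np.log(F0_alt[F0_alt > 1])
log_f0_ori = np.log(F0_ori[F0_ori > 1])

# Shifting by the median difference in log space multiplies every voiced
# frame by the ratio of median pitches (here 330 / 220 = 1.5).
shifted_f0_alt = np.exp(log_f0_alt - np.median(log_f0_alt) + np.median(log_f0_ori))
print(shifted_f0_alt)   # [165. 330. 660.]
```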
@@ -401,8 +396,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
        mel2,
        style2,
        None,
-         diffusion_steps,
-         inference_cfg_rate=inference_cfg_rate
    )
    vc_target = vc_target[:, :, mel2.size(2):]
    print(f"[INFO] | vc_target shape: {vc_target.shape}")
@@ -438,9 +433,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c

    return temp_file_path

- # ----------------------------
- # Bypass GPU Initialization Error (KEEP THIS FUNCTION AND NEVER DELETE, OTHERWISE IT WILL ERROR DUE TO NOT AT LEAST ONE FUNCTION HAVING USE OF GPU)
- # ----------------------------

@spaces.GPU(duration=15)
def gpu():
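As the removed comment notes, a Space running on ZeroGPU errors out unless at least one function declares GPU use, which is why the otherwise unused gpu() stub above is kept. A commented sketch of that pattern, assuming the Hugging Face spaces package:

```python
import spaces  # Hugging Face Spaces ZeroGPU helper

@spaces.GPU(duration=15)   # request a GPU slot for up to ~15 seconds per call
def gpu():
    # Intentionally a no-op: its presence satisfies the ZeroGPU requirement
    # that at least one callable in the app declares GPU usage.
    pass
```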
@@ -450,32 +444,38 @@ def gpu():
# Gradio Interface
# ----------------------------

- description = (
-     "🪄 **Voice Conversion Tool**\n\n"
-     "Upload your **Source Audio** and **Reference Audio** files to perform voice conversion. "
-     "Adjust the sliders and checkboxes to customize the conversion process."
- )

- inputs = [
-     gr.Audio(type="filepath", label="Source Audio"),
-     gr.Audio(type="filepath", label="Reference Audio"),
-     gr.Slider(minimum=1, maximum=100, value=25, step=1, label="Diffusion Steps", info="Default is 25. Use 50-100 for best quality."),
-     gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjustment", info="<1.0 to speed up speech, >1.0 to slow down speech."),
-     gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="Has a subtle influence."),
-     gr.Checkbox(label="Use F0 Conditioned Model", value=False, info="Must be enabled for singing voice conversion."),
-     gr.Checkbox(label="Auto F0 Adjustment", value=True, info="Roughly adjusts F0 to match target voice. Only works when 'Use F0 Conditioned Model' is enabled."),
-     gr.Slider(label='Pitch Shift (semitones)', minimum=-12, maximum=12, step=1, value=0, info="Pitch shift in semitones. Only works when 'Use F0 Conditioned Model' is enabled."),
- ]
-
- # Set outputs to a single gr.Audio component with type="filepath"
- outputs = gr.Audio(label="Full Output Audio", type="filepath")
-
- gr.Interface(
-     fn=voice_conversion,
-     description=description,
-     inputs=inputs,
-     outputs=outputs,
-     title="Seed Voice Conversion",
-     cache_examples=False,
-     allow_flagging="never"
- ).launch(share=True)
 
bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and moved to CPU.")

+ # CSS Styling
+ css = '''
+ .gradio-container{max-width: 560px !important}
+ h1{text-align:center}
+ footer {
+     visibility: hidden
+ }
+ '''

# ----------------------------
+ # Functions
# ----------------------------

@torch.no_grad()
@torch.inference_mode()
+ def voice_conversion(input, reference, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch):
    print("[INFO] | Voice conversion started.")

+     inference_module = model if not use_conditioned else model_f0
+     mel_fn = to_mel if not use_conditioned else to_mel_f0
+     bigvgan_fn = bigvgan_model if not use_conditioned else bigvgan_44k_model
+     sr_current = 22050 if not use_conditioned else 44100
+     hop_length_current = 256 if not use_conditioned else 512
    max_context_window = sr_current // hop_length_current * 30
    overlap_wave_len = 16 * hop_length_current
    bitrate = "320k"

    # Load audio using librosa
    print("[INFO] | Loading source and reference audio.")
+     source_audio, _ = librosa.load(input, sr=sr_current)
+     ref_audio, _ = librosa.load(reference, sr=sr_current)

    # Clip reference audio to 25 seconds
    ref_audio = ref_audio[:sr_current * 25]
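For scale, the window arithmetic above gives the same frame budget in both branches: roughly 30 seconds of mel frames per chunk and about 0.19 seconds of crossfade overlap. A quick check in plain Python, with the constants copied from the two branches:

```python
for sr_current, hop_length_current in [(22050, 256), (44100, 512)]:
    max_context_window = sr_current // hop_length_current * 30   # mel frames (~30 s)
    overlap_wave_len = 16 * hop_length_current                   # overlap, in samples
    print(sr_current, max_context_window, overlap_wave_len, overlap_wave_len / sr_current)
# 22050 -> 2580 frames, 4096 samples (~0.186 s)
# 44100 -> 2580 frames, 8192 samples (~0.186 s)
```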
 
    print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")

    # Length adjustment
+     target_lengths = torch.LongTensor([int(mel.size(2) * speed)]).to(mel.device)
    target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
    print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")
 
    print(f"[INFO] | Style2 shape: {style2.shape}")

    # F0 Conditioning
+     if use_conditioned:
        print("[INFO] | Performing F0 conditioning.")
        F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
        F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
 
            log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
        )
        shifted_f0_alt = torch.exp(shifted_log_f0_alt)
+         if pitch != 0:
+             shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch)
        print("[INFO] | F0 conditioning completed.")
    else:
        F0_ori = None
 
        mel2,
        style2,
        None,
+         steps,
+         inference_cfg_rate=guidance
    )
    vc_target = vc_target[:, :, mel2.size(2):]
    print(f"[INFO] | vc_target shape: {vc_target.shape}")
 
    return temp_file_path

+ def cloud():
+     print("[CLOUD] | Space maintained.")

@spaces.GPU(duration=15)
def gpu():
 
# Gradio Interface
# ----------------------------

+ for model_file in model_files:
+     model_path = os.path.join(model_repo_dir, model_file)
+     tts_instance = TTS(model_path)
+     model_to_speakers[model_file] = tts_instance.get_speakers()
+     del tts_instance
+
+ with gr.Blocks(css=css) as main:
+     with gr.Column():
+         gr.Markdown("🪄 Add tone to audio.")
+
+     with gr.Column():
+         input = gr.Audio(label="Input Audio", type="filepath")
+         reference_input = gr.Audio(label="Reference Audio", type="filepath")
+
+     with gr.Column():
+         steps = gr.Slider(label="Steps", value=1, minimum=1, maximum=100, step=1)
+         guidance = gr.Slider(label="Guidance", value=0.7, minimum=0.0, maximum=1.0, step=0.1)
+         speed = gr.Slider(label="Speed", value=1.0, minimum=0.5, maximum=2.0, step=0.1)
+
+     with gr.Column():
+         use_conditioned = gr.Checkbox(label="Use 'F0 Conditioned Model'", value=False)
+         use_auto_adjustment = gr.Checkbox(label="Use 'Auto F0 Adjustment' with 'F0 Conditioned Model'", value=True)
+         pitch = gr.Slider(label="Pitch with 'F0 Conditioned Model'", value=0, minimum=-12, maximum=12, step=1)
+
+     with gr.Column():
+         submit = gr.Button("▶")
+         maintain = gr.Button("☁️")
+
+     with gr.Column():
+         output = gr.Audio(label="Output", type="filepath")
+
+     submit.click(voice_conversion, inputs=[input, reference_input, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch], outputs=output, queue=False)
+     maintain.click(cloud, inputs=[], outputs=[], queue=False)

+ main.launch(show_api=True)
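The replacement UI follows the standard Gradio Blocks pattern: components are created inside a with gr.Blocks() context, and a button's click() call binds the callback to concrete component objects for inputs and outputs. A minimal self-contained sketch of that wiring, with a hypothetical echo callback standing in for voice_conversion:

```python
import gradio as gr

def echo(audio_path):
    # Placeholder callback; app.py wires voice_conversion here instead.
    return audio_path

with gr.Blocks() as demo:
    inp = gr.Audio(label="Input Audio", type="filepath")
    out = gr.Audio(label="Output", type="filepath")
    run = gr.Button("Run")
    # inputs/outputs take the component objects themselves.
    run.click(echo, inputs=[inp], outputs=out)

demo.launch()
```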