Staticaliza committed
Update app.py
app.py CHANGED
@@ -173,42 +173,37 @@ bigvgan_44k_model.remove_weight_norm()
 bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
 print("[INFO] | BigVGAN 44kHz model loaded, weight norm removed, set to eval mode, and moved to CPU.")

-#
-
-
-
-
-
-
-
-def crossfade(chunk1, chunk2, overlap):
-    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
-    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
-    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
-    return chunk2
+# CSS Styling
+css = '''
+.gradio-container{max-width: 560px !important}
+h1{text-align:center}
+footer {
+    visibility: hidden
+}
+'''

 # ----------------------------
-#
+# Functions
 # ----------------------------

 @torch.no_grad()
 @torch.inference_mode()
-def voice_conversion(
+def voice_conversion(input, reference, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch):
     print("[INFO] | Voice conversion started.")

-    inference_module = model if not
-    mel_fn = to_mel if not
-    bigvgan_fn = bigvgan_model if not
-    sr_current = 22050 if not
-    hop_length_current = 256 if not
+    inference_module = model if not use_conditioned else model_f0
+    mel_fn = to_mel if not use_conditioned else to_mel_f0
+    bigvgan_fn = bigvgan_model if not use_conditioned else bigvgan_44k_model
+    sr_current = 22050 if not use_conditioned else 44100
+    hop_length_current = 256 if not use_conditioned else 512
     max_context_window = sr_current // hop_length_current * 30
     overlap_wave_len = 16 * hop_length_current
     bitrate = "320k"

     # Load audio using librosa
     print("[INFO] | Loading source and reference audio.")
-    source_audio, _ = librosa.load(
-    ref_audio, _ = librosa.load(
+    source_audio, _ = librosa.load(input, sr=sr_current)
+    ref_audio, _ = librosa.load(reference, sr=sr_current)

     # Clip reference audio to 25 seconds
     ref_audio = ref_audio[:sr_current * 25]
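The hunk above removes the equal-power crossfade helper even though voice_conversion still computes overlap_wave_len for chunk stitching. For reference, the sketch below shows how a crossfade of this shape is typically used to join overlapping output chunks; the stitching loop, chunk sizes, and dummy audio are illustrative assumptions, not code from app.py.

import numpy as np

def crossfade(chunk1, chunk2, overlap):
    # Equal-power style blend: chunk1's tail fades out while chunk2's head fades in.
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

# Illustrative stitching of generated chunks that overlap by overlap_wave_len samples.
hop_length_current = 256
overlap_wave_len = 16 * hop_length_current
chunks = [np.random.randn(22050 + overlap_wave_len) for _ in range(3)]  # dummy audio chunks
stitched = chunks[0]
for nxt in chunks[1:]:
    blended = crossfade(stitched, nxt.copy(), overlap_wave_len)
    stitched = np.concatenate([stitched[:-overlap_wave_len], blended])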
@@ -325,7 +320,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     print(f"[INFO] | Mel spectrogram shapes: mel={mel.shape}, mel2={mel2.shape}")

     # Length adjustment
-    target_lengths = torch.LongTensor([int(mel.size(2) *
+    target_lengths = torch.LongTensor([int(mel.size(2) * speed)]).to(mel.device)
     target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
     print(f"[INFO] | Target lengths: {target_lengths.item()}, {target2_lengths.item()}")

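The new speed argument replaces the old Length Adjustment slider, whose description noted that values below 1.0 speed speech up and values above 1.0 slow it down: the requested mel length is simply the source frame count scaled by this factor. A small illustrative check, using a placeholder mel tensor:

import torch

mel = torch.zeros(1, 80, 400)  # placeholder mel spectrogram: (batch, n_mels, frames)
for speed in (0.8, 1.0, 1.2):
    target_lengths = torch.LongTensor([int(mel.size(2) * speed)])
    print(speed, target_lengths.item())  # 0.8 -> 320 frames (faster), 1.2 -> 480 frames (slower)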
@@ -342,7 +337,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     print(f"[INFO] | Style2 shape: {style2.shape}")

     # F0 Conditioning
-    if
+    if use_conditioned:
         print("[INFO] | Performing F0 conditioning.")
         F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
         F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
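When use_conditioned is enabled, the surrounding code (partly visible in the next hunk) aligns the source F0 track to the reference by matching median log-F0 over voiced frames (F0 > 1). The sketch below illustrates that alignment step with placeholder F0 tracks; the clamping constant and the exact median computation are assumptions, since those lines fall outside this diff.

import torch

# Placeholder F0 tracks in Hz; zeros mark unvoiced frames.
F0_ori = torch.tensor([0.0, 180.0, 190.0, 200.0, 0.0])
F0_alt = torch.tensor([0.0, 120.0, 130.0, 125.0, 0.0])

log_f0_ori = torch.log(F0_ori.clamp(min=1e-5))
log_f0_alt = torch.log(F0_alt.clamp(min=1e-5))
median_log_f0_ori = torch.median(log_f0_ori[F0_ori > 1])
median_log_f0_alt = torch.median(log_f0_alt[F0_alt > 1])

# Shift voiced frames of the source so its median log-F0 matches the reference.
shifted_log_f0_alt = log_f0_alt.clone()
shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
shifted_f0_alt = torch.exp(shifted_log_f0_alt)
print(shifted_f0_alt)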
@@ -367,8 +362,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
             log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
         )
         shifted_f0_alt = torch.exp(shifted_log_f0_alt)
-        if
-        shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1],
+        if pitch != 0:
+            shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch)
         print("[INFO] | F0 conditioning completed.")
     else:
         F0_ori = None
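The new pitch argument is forwarded to adjust_f0_semitones, whose definition is outside this diff. Assuming the conventional behaviour, where a shift of n semitones scales frequency by 2**(n/12), a minimal sketch:

import torch

def adjust_f0_semitones(f0: torch.Tensor, n_semitones: float) -> torch.Tensor:
    # Assumed conventional definition; the actual helper lives elsewhere in app.py.
    return f0 * (2.0 ** (n_semitones / 12.0))

f0 = torch.tensor([220.0, 440.0])
print(adjust_f0_semitones(f0, 12))   # tensor([440., 880.]) -> one octave up
print(adjust_f0_semitones(f0, -12))  # tensor([110., 220.]) -> one octave down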
@@ -401,8 +396,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
         mel2,
         style2,
         None,
-
-        inference_cfg_rate=
+        steps,
+        inference_cfg_rate=guidance
     )
     vc_target = vc_target[:, :, mel2.size(2):]
     print(f"[INFO] | vc_target shape: {vc_target.shape}")
@@ -438,9 +433,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c

     return temp_file_path

-
-
-# ----------------------------
+def cloud():
+    print("[CLOUD] | Space maintained.")

 @spaces.GPU(duration=15)
 def gpu():
@@ -450,32 +444,38 @@ def gpu():
 # Gradio Interface
 # ----------------------------

-
-
-
-
-
+for model_file in model_files:
+    model_path = os.path.join(model_repo_dir, model_file)
+    tts_instance = TTS(model_path)
+    model_to_speakers[model_file] = tts_instance.get_speakers()
+    del tts_instance
+
+with gr.Blocks(css=css) as main:
+    with gr.Column():
+        gr.Markdown("🪄 Add tone to audio.")
+
+    with gr.Column():
+        input = gr.Audio(label="Input Audio", type="filepath")
+        reference_input = gr.Audio(label="Reference Audio", type="filepath")
+
+    with gr.Column():
+        steps = gr.Slider(label="Steps", value=1, minimum=1, maximum=100, step=1)
+        guidance = gr.Slider(label="Guidance", value=0.7, minimum=0.0, maximum=1.0, step=0.1)
+        speed = gr.Slider(label="Speed", value=1.0, minimum=0.5, maximum=2.0, step=0.1)
+
+    with gr.Column():
+        use_conditioned = gr.Checkbox(label="Use 'F0 Conditioned Model'", value=False)
+        use_auto_adjustment = gr.Checkbox(label="Use 'Auto F0 Adjustment' with 'F0 Conditioned Model'", value=True)
+        pitch = gr.Slider(label="Pitch with 'F0 Conditioned Model'", value=0, minimum=-12, maximum=12, step=1)
+
+    with gr.Column():
+        submit = gr.Button("▶")
+        maintain = gr.Button("☁️")
+
+    with gr.Column():
+        output = gr.Audio(label="Output", type="filepath")
+
+    submit.click(voice_conversion, inputs=[input, reference_input, steps, guidance, speed, use_conditioned, use_auto_adjustment, pitch], outputs=output, queue=False)
+    maintain.click(cloud, inputs=[], outputs=[], queue=False)

-
-    gr.Audio(type="filepath", label="Source Audio"),
-    gr.Audio(type="filepath", label="Reference Audio"),
-    gr.Slider(minimum=1, maximum=100, value=25, step=1, label="Diffusion Steps", info="Default is 25. Use 50-100 for best quality."),
-    gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjustment", info="<1.0 to speed up speech, >1.0 to slow down speech."),
-    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="Has a subtle influence."),
-    gr.Checkbox(label="Use F0 Conditioned Model", value=False, info="Must be enabled for singing voice conversion."),
-    gr.Checkbox(label="Auto F0 Adjustment", value=True, info="Roughly adjusts F0 to match target voice. Only works when 'Use F0 Conditioned Model' is enabled."),
-    gr.Slider(label='Pitch Shift (semitones)', minimum=-12, maximum=12, step=1, value=0, info="Pitch shift in semitones. Only works when 'Use F0 Conditioned Model' is enabled."),
-]
-
-# Set outputs to a single gr.Audio component with type="filepath"
-outputs = gr.Audio(label="Full Output Audio", type="filepath")
-
-gr.Interface(
-    fn=voice_conversion,
-    description=description,
-    inputs=inputs,
-    outputs=outputs,
-    title="Seed Voice Conversion",
-    cache_examples=False,
-    allow_flagging="never"
-).launch(share=True)
+main.launch(show_api=True)
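With the gr.Interface wrapper replaced by gr.Blocks, voice_conversion now receives eight positional inputs in the order wired into submit.click. It can also be called directly with the same ordering; the file paths below are placeholders, and the numeric values mirror the defaults of the old interface (25 steps, 0.7 CFG rate, 1.0 length):

# Placeholder paths; argument order matches the submit.click inputs list above.
result_path = voice_conversion(
    "source.wav",     # input: audio to convert
    "reference.wav",  # reference: target voice sample
    25,               # steps: diffusion steps
    0.7,              # guidance: inference CFG rate
    1.0,              # speed: length adjustment factor
    False,            # use_conditioned: F0-conditioned model
    True,             # use_auto_adjustment: automatic F0 matching
    0,                # pitch: shift in semitones
)
print(result_path)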