import os
import sys
import time
import torch
import librosa
import logging
import traceback
import numpy as np
import soundfile as sf
import noisereduce as nr
from pedalboard import (
Pedalboard,
Chorus,
Distortion,
Reverb,
PitchShift,
Limiter,
Gain,
Bitcrush,
Clipping,
Compressor,
Delay,
)
from scipy.io import wavfile
from audio_upscaler import upscale
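# Make the project root importable so the rvc.* packages below resolve when the
# script is run from the repository root.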
now_dir = os.getcwd()
sys.path.append(now_dir)
from rvc.infer.pipeline import Pipeline as VC
from rvc.lib.utils import load_audio_infer, load_embedding
from rvc.lib.tools.split_audio import process_audio, merge_audio
from rvc.lib.algorithm.synthesizers import Synthesizer
from rvc.configs.config import Config
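# Quiet noisy third-party loggers.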
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
logging.getLogger("faiss").setLevel(logging.WARNING)
logging.getLogger("faiss.loader").setLevel(logging.WARNING)
class VoiceConverter:
"""
A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
"""
def __init__(self):
"""
Initializes the VoiceConverter with the default configuration and sets up model placeholders.
"""
self.config = Config() # Load RVC configuration
        self.hubert_model = None  # HuBERT model for embedding extraction
self.last_embedder_model = None # Last used embedder model
self.tgt_sr = None # Target sampling rate for the output audio
self.net_g = None # Generator network for voice conversion
self.vc = None # Voice conversion pipeline instance
self.cpt = None # Checkpoint for loading model weights
self.version = None # Model version
self.n_spk = None # Number of speakers in the model
self.use_f0 = None # Whether the model uses F0
def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
"""
Loads the HuBERT model for speaker embedding extraction.
Args:
            embedder_model (str): Name of the pre-trained HuBERT embedder to load.
            embedder_model_custom (str, optional): Path to a custom embedder model.
"""
self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
self.hubert_model.to(self.config.device)
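        # Match the embedder precision to the runtime configuration.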
self.hubert_model = (
self.hubert_model.half()
if self.config.is_half
else self.hubert_model.float()
)
self.hubert_model.eval()
@staticmethod
def remove_audio_noise(input_audio_path, reduction_strength=0.7):
"""
Removes noise from an audio file using the NoiseReduce library.
Args:
input_audio_path (str): Path to the input audio file.
reduction_strength (float): Strength of the noise reduction. Default is 0.7.
"""
try:
rate, data = wavfile.read(input_audio_path)
reduced_noise = nr.reduce_noise(
y=data, sr=rate, prop_decrease=reduction_strength
)
return reduced_noise
except Exception as error:
print(f"An error occurred removing audio noise: {error}")
return None
@staticmethod
def convert_audio_format(input_path, output_path, output_format):
"""
Converts an audio file to a specified output format.
Args:
input_path (str): Path to the input audio file.
output_path (str): Path to the output audio file.
output_format (str): Desired audio format (e.g., "WAV", "MP3").
"""
        try:
            if output_format == "WAV":
                # Already WAV: nothing to convert, return the input unchanged.
                return input_path
            print(f"Converting audio to {output_format} format...")
            audio, sample_rate = librosa.load(input_path, sr=None)
            common_sample_rates = [
                8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000,
            ]
            # Snap to the closest standard rate so the target codec accepts it.
            target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
            sf.write(output_path, audio, target_sr, format=output_format.lower())
            return output_path
        except Exception as error:
            print(f"An error occurred converting the audio format: {error}")
            return None
@staticmethod
def post_process_audio(
audio_input,
sample_rate,
reverb: bool,
reverb_room_size: float,
reverb_damping: float,
reverb_wet_level: float,
reverb_dry_level: float,
reverb_width: float,
reverb_freeze_mode: float,
pitch_shift: bool,
pitch_shift_semitones: int,
limiter: bool,
limiter_threshold: float,
limiter_release: float,
gain: bool,
gain_db: float,
distortion: bool,
distortion_gain: float,
chorus: bool,
chorus_rate: float,
chorus_depth: float,
chorus_delay: float,
chorus_feedback: float,
chorus_mix: float,
bitcrush: bool,
bitcrush_bit_depth: int,
clipping: bool,
clipping_threshold: float,
compressor: bool,
compressor_threshold: float,
compressor_ratio: float,
compressor_attack: float,
compressor_release: float,
delay: bool,
delay_seconds: float,
delay_feedback: float,
delay_mix: float,
audio_output_path: str,
):
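        """
        Builds a Pedalboard chain from the enabled effects and applies it to the audio.

        The processed audio is written to `audio_output_path` as WAV and that path
        is returned.
        """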
board = Pedalboard()
if reverb:
reverb = Reverb(
room_size=reverb_room_size,
damping=reverb_damping,
wet_level=reverb_wet_level,
dry_level=reverb_dry_level,
width=reverb_width,
freeze_mode=reverb_freeze_mode,
)
board.append(reverb)
if pitch_shift:
pitch_shift = PitchShift(semitones=pitch_shift_semitones)
board.append(pitch_shift)
if limiter:
limiter = Limiter(
threshold_db=limiter_threshold, release_ms=limiter_release
)
board.append(limiter)
if gain:
gain = Gain(gain_db=gain_db)
board.append(gain)
if distortion:
distortion = Distortion(drive_db=distortion_gain)
board.append(distortion)
if chorus:
chorus = Chorus(
rate_hz=chorus_rate,
depth=chorus_depth,
centre_delay_ms=chorus_delay,
feedback=chorus_feedback,
mix=chorus_mix,
)
board.append(chorus)
if bitcrush:
bitcrush = Bitcrush(bit_depth=bitcrush_bit_depth)
board.append(bitcrush)
if clipping:
clipping = Clipping(threshold_db=clipping_threshold)
board.append(clipping)
if compressor:
compressor = Compressor(
threshold_db=compressor_threshold,
ratio=compressor_ratio,
attack_ms=compressor_attack,
release_ms=compressor_release,
)
board.append(compressor)
if delay:
delay = Delay(
delay_seconds=delay_seconds,
feedback=delay_feedback,
mix=delay_mix,
)
board.append(delay)
        # `audio_input` may be a file path or an already-loaded numpy array.
        if isinstance(audio_input, str):
            audio_input, sample_rate = librosa.load(audio_input, sr=sample_rate)
        output = board(audio_input, sample_rate)
        sf.write(audio_output_path, output, sample_rate, format="WAV")
        return audio_output_path
def convert_audio(
self,
audio_input_path: str,
audio_output_path: str,
model_path: str,
index_path: str,
embedder_model: str,
pitch: int,
f0_file: str,
f0_method: str,
index_rate: float,
volume_envelope: int,
protect: float,
hop_length: int,
split_audio: bool,
f0_autotune: bool,
filter_radius: int,
embedder_model_custom: str,
clean_audio: bool,
clean_strength: float,
export_format: str,
upscale_audio: bool,
formant_shifting: bool,
formant_qfrency: float,
formant_timbre: float,
post_process: bool,
reverb: bool,
pitch_shift: bool,
limiter: bool,
gain: bool,
distortion: bool,
chorus: bool,
bitcrush: bool,
clipping: bool,
compressor: bool,
delay: bool,
sliders: dict,
resample_sr: int = 0,
sid: int = 0,
):
"""
Performs voice conversion on the input audio.
Args:
audio_input_path (str): Path to the input audio file.
audio_output_path (str): Path to the output audio file.
model_path (str): Path to the voice conversion model.
index_path (str): Path to the index file.
sid (int, optional): Speaker ID. Default is 0.
            pitch (int, optional): Pitch shift in semitones applied during conversion. Default is None.
f0_file (str, optional): Path to the F0 file. Default is None.
f0_method (str, optional): Method for F0 extraction. Default is None.
index_rate (float, optional): Rate for index matching. Default is None.
resample_sr (int, optional): Resample sampling rate. Default is 0.
volume_envelope (float, optional): RMS mix rate. Default is None.
protect (float, optional): Protection rate for certain audio segments. Default is None.
hop_length (int, optional): Hop length for audio processing. Default is None.
split_audio (bool, optional): Whether to split the audio for processing. Default is False.
f0_autotune (bool, optional): Whether to use F0 autotune. Default is False.
filter_radius (int, optional): Radius for filtering. Default is None.
embedder_model (str, optional): Path to the embedder model. Default is None.
embedder_model_custom (str, optional): Path to the custom embedder model. Default is None.
clean_audio (bool, optional): Whether to clean the audio. Default is False.
clean_strength (float, optional): Strength of the audio cleaning. Default is 0.7.
export_format (str, optional): Format for exporting the audio. Default is "WAV".
upscale_audio (bool, optional): Whether to upscale the audio. Default is False.
            formant_shifting (bool, optional): Whether to apply formant shifting. Default is False.
            formant_qfrency (float, optional): Quefrency for formant shifting. Default is 1.0.
            formant_timbre (float, optional): Timbre for formant shifting. Default is 1.0.
            post_process (bool, optional): Whether to apply post-processing effects. Default is False.
reverb (bool, optional): Whether to apply reverb. Default is False.
pitch_shift (bool, optional): Whether to apply pitch shift. Default is False.
limiter (bool, optional): Whether to apply a limiter. Default is False.
gain (bool, optional): Whether to apply gain. Default is False.
distortion (bool, optional): Whether to apply distortion. Default is False.
chorus (bool, optional): Whether to apply chorus. Default is False.
bitcrush (bool, optional): Whether to apply bitcrush. Default is False.
clipping (bool, optional): Whether to apply clipping. Default is False.
compressor (bool, optional): Whether to apply a compressor. Default is False.
delay (bool, optional): Whether to apply delay. Default is False.
sliders (dict, optional): Dictionary of effect parameters. Default is None.
"""
self.get_vc(model_path, sid)
try:
start_time = time.time()
print(f"Converting audio '{audio_input_path}'...")
            if upscale_audio:
                upscale(audio_input_path, audio_input_path)
audio = load_audio_infer(
audio_input_path,
16000,
formant_shifting,
formant_qfrency,
formant_timbre,
)
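            # Peak-normalize so the signal never exceeds ~0.95 of full scale.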
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
audio /= audio_max
if not self.hubert_model or embedder_model != self.last_embedder_model:
self.load_hubert(embedder_model, embedder_model_custom)
self.last_embedder_model = embedder_model
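            # Clean up the index path and swap a "trained" index for its
            # "added" counterpart generated at training time.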
file_index = (
index_path.strip()
.strip('"')
.strip("\n")
.strip('"')
.strip()
.replace("trained", "added")
)
            # Honor a requested output resample rate of at least 16 kHz.
            if resample_sr >= 16000 and self.tgt_sr != resample_sr:
                self.tgt_sr = resample_sr
if split_audio:
result, new_dir_path = process_audio(audio_input_path)
if result == "Error":
return "Error with Split Audio", None
dir_path = (
new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
)
if dir_path:
paths = [
os.path.join(root, name)
for root, _, files in os.walk(dir_path, topdown=False)
for name in files
if name.endswith(".wav") and root == dir_path
]
try:
for path in paths:
self.convert_audio(
audio_input_path=path,
audio_output_path=path,
model_path=model_path,
index_path=index_path,
sid=sid,
pitch=pitch,
f0_file=None,
f0_method=f0_method,
index_rate=index_rate,
resample_sr=resample_sr,
volume_envelope=volume_envelope,
protect=protect,
hop_length=hop_length,
split_audio=False,
f0_autotune=f0_autotune,
filter_radius=filter_radius,
export_format=export_format,
upscale_audio=upscale_audio,
embedder_model=embedder_model,
embedder_model_custom=embedder_model_custom,
clean_audio=clean_audio,
clean_strength=clean_strength,
formant_shifting=formant_shifting,
formant_qfrency=formant_qfrency,
formant_timbre=formant_timbre,
post_process=post_process,
reverb=reverb,
pitch_shift=pitch_shift,
limiter=limiter,
gain=gain,
distortion=distortion,
chorus=chorus,
bitcrush=bitcrush,
clipping=clipping,
compressor=compressor,
delay=delay,
sliders=sliders,
)
except Exception as error:
print(f"An error occurred processing the segmented audio: {error}")
print(traceback.format_exc())
return f"Error {error}"
print("Finished processing segmented audio, now merging audio...")
merge_timestamps_file = os.path.join(
os.path.dirname(new_dir_path),
f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
)
self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
os.remove(merge_timestamps_file)
                # Write the merged audio first; post-processing then operates on
                # the written file, mirroring the unsplit branch below.
                sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
                if post_process:
                    # NOTE: slider values are read positionally here (sequence
                    # indices), unlike the key-based access in the unsplit branch.
                    audio_output_path = self.post_process_audio(
                        audio_input=audio_output_path,
                        sample_rate=self.tgt_sr,
                        reverb=reverb,
                        reverb_room_size=sliders[0],
                        reverb_damping=sliders[1],
                        reverb_wet_level=sliders[2],
                        reverb_dry_level=sliders[3],
                        reverb_width=sliders[4],
                        reverb_freeze_mode=sliders[5],
                        pitch_shift=pitch_shift,
                        pitch_shift_semitones=sliders[6],
                        limiter=limiter,
                        limiter_threshold=sliders[7],
                        limiter_release=sliders[8],
                        gain=gain,
                        gain_db=sliders[9],
                        distortion=distortion,
                        distortion_gain=sliders[10],
                        chorus=chorus,
                        chorus_rate=sliders[11],
                        chorus_depth=sliders[12],
                        chorus_delay=sliders[13],
                        chorus_feedback=sliders[14],
                        chorus_mix=sliders[15],
                        bitcrush=bitcrush,
                        bitcrush_bit_depth=sliders[16],
                        clipping=clipping,
                        clipping_threshold=sliders[17],
                        compressor=compressor,
                        compressor_threshold=sliders[18],
                        compressor_ratio=sliders[19],
                        compressor_attack=sliders[20],
                        compressor_release=sliders[21],
                        delay=delay,
                        delay_seconds=sliders[22],
                        delay_feedback=sliders[23],
                        delay_mix=sliders[24],
                        audio_output_path=audio_output_path,
                    )
else:
audio_opt = self.vc.pipeline(
model=self.hubert_model,
net_g=self.net_g,
sid=sid,
audio=audio,
input_audio_path=audio_input_path,
pitch=pitch,
f0_method=f0_method,
file_index=file_index,
index_rate=index_rate,
pitch_guidance=self.use_f0,
filter_radius=filter_radius,
tgt_sr=self.tgt_sr,
resample_sr=resample_sr,
volume_envelope=volume_envelope,
version=self.version,
protect=protect,
hop_length=hop_length,
f0_autotune=f0_autotune,
f0_file=f0_file,
)
if audio_output_path:
sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
if clean_audio:
cleaned_audio = self.remove_audio_noise(
audio_output_path, clean_strength
)
if cleaned_audio is not None:
sf.write(
audio_output_path, cleaned_audio, self.tgt_sr, format="WAV"
)
if post_process:
audio_output_path = self.post_process_audio(
audio_input=audio_output_path,
sample_rate=self.tgt_sr,
reverb=reverb,
reverb_room_size=sliders["reverb_room_size"],
reverb_damping=sliders["reverb_damping"],
reverb_wet_level=sliders["reverb_wet_level"],
reverb_dry_level=sliders["reverb_dry_level"],
reverb_width=sliders["reverb_width"],
reverb_freeze_mode=sliders["reverb_freeze_mode"],
pitch_shift=pitch_shift,
pitch_shift_semitones=sliders["pitch_shift_semitones"],
limiter=limiter,
limiter_threshold=sliders["limiter_threshold"],
limiter_release=sliders["limiter_release"],
gain=gain,
gain_db=sliders["gain_db"],
distortion=distortion,
distortion_gain=sliders["distortion_gain"],
chorus=chorus,
chorus_rate=sliders["chorus_rate"],
chorus_depth=sliders["chorus_depth"],
chorus_delay=sliders["chorus_delay"],
chorus_feedback=sliders["chorus_feedback"],
chorus_mix=sliders["chorus_mix"],
bitcrush=bitcrush,
bitcrush_bit_depth=sliders["bitcrush_bit_depth"],
clipping=clipping,
clipping_threshold=sliders["clipping_threshold"],
compressor=compressor,
compressor_threshold=sliders["compressor_threshold"],
compressor_ratio=sliders["compressor_ratio"],
compressor_attack=sliders["compressor_attack"],
compressor_release=sliders["compressor_release"],
delay=delay,
delay_seconds=sliders["delay_seconds"],
delay_feedback=sliders["delay_feedback"],
delay_mix=sliders["delay_mix"],
audio_output_path=audio_output_path,
)
output_path_format = audio_output_path.replace(
".wav", f".{export_format.lower()}"
)
audio_output_path = self.convert_audio_format(
audio_output_path, output_path_format, export_format
)
elapsed_time = time.time() - start_time
print(
f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
)
except Exception as error:
print(f"An error occurred during audio conversion: {error}")
print(traceback.format_exc())
def convert_audio_batch(
self,
audio_input_paths: str,
audio_output_path: str,
model_path: str,
index_path: str,
embedder_model: str,
pitch: int,
f0_file: str,
f0_method: str,
index_rate: float,
volume_envelope: int,
protect: float,
hop_length: int,
split_audio: bool,
f0_autotune: bool,
filter_radius: int,
embedder_model_custom: str,
clean_audio: bool,
clean_strength: float,
export_format: str,
upscale_audio: bool,
formant_shifting: bool,
formant_qfrency: float,
formant_timbre: float,
resample_sr: int = 0,
sid: int = 0,
pid_file_path: str = None,
post_process: bool = False,
reverb: bool = False,
pitch_shift: bool = False,
limiter: bool = False,
gain: bool = False,
distortion: bool = False,
chorus: bool = False,
bitcrush: bool = False,
clipping: bool = False,
compressor: bool = False,
delay: bool = False,
sliders: dict = None,
):
"""
Performs voice conversion on a batch of input audio files.
Args:
            audio_input_paths (str): Path to the directory containing the input audio files.
audio_output_path (str): Path to the output audio file.
model_path (str): Path to the voice conversion model.
index_path (str): Path to the index file.
sid (int, optional): Speaker ID. Default is 0.
            pitch (int, optional): Pitch shift in semitones applied during conversion. Default is None.
f0_file (str, optional): Path to the F0 file. Default is None.
f0_method (str, optional): Method for F0 extraction. Default is None.
index_rate (float, optional): Rate for index matching. Default is None.
resample_sr (int, optional): Resample sampling rate. Default is 0.
volume_envelope (float, optional): RMS mix rate. Default is None.
protect (float, optional): Protection rate for certain audio segments. Default is None.
hop_length (int, optional): Hop length for audio processing. Default is None.
split_audio (bool, optional): Whether to split the audio for processing. Default is False.
f0_autotune (bool, optional): Whether to use F0 autotune. Default is False.
filter_radius (int, optional): Radius for filtering. Default is None.
embedder_model (str, optional): Path to the embedder model. Default is None.
embedder_model_custom (str, optional): Path to the custom embedder model. Default is None.
clean_audio (bool, optional): Whether to clean the audio. Default is False.
clean_strength (float, optional): Strength of the audio cleaning. Default is 0.7.
export_format (str, optional): Format for exporting the audio. Default is "WAV".
upscale_audio (bool, optional): Whether to upscale the audio. Default is False.
            formant_shifting (bool, optional): Whether to apply formant shifting. Default is False.
            formant_qfrency (float, optional): Quefrency for formant shifting. Default is 1.0.
            formant_timbre (float, optional): Timbre for formant shifting. Default is 1.0.
pid_file_path (str, optional): Path to the PID file. Default is None.
post_process (bool, optional): Whether to apply post-processing effects. Default is False.
reverb (bool, optional): Whether to apply reverb. Default is False.
pitch_shift (bool, optional): Whether to apply pitch shift. Default is False.
limiter (bool, optional): Whether to apply a limiter. Default is False.
gain (bool, optional): Whether to apply gain. Default is False.
distortion (bool, optional): Whether to apply distortion. Default is False.
chorus (bool, optional): Whether to apply chorus. Default is False.
bitcrush (bool, optional): Whether to apply bitcrush. Default is False.
clipping (bool, optional): Whether to apply clipping. Default is False.
compressor (bool, optional): Whether to apply a compressor. Default is False.
delay (bool, optional): Whether to apply delay. Default is False.
sliders (dict, optional): Dictionary of effect parameters. Default is None.
"""
        # Record this process id so callers can track or cancel the batch job;
        # skip when no pid file path was given (it defaults to None).
        pid = os.getpid()
        if pid_file_path is not None:
            with open(pid_file_path, "w") as pid_file:
                pid_file.write(str(pid))
try:
if not self.hubert_model or embedder_model != self.last_embedder_model:
self.load_hubert(embedder_model, embedder_model_custom)
self.last_embedder_model = embedder_model
self.get_vc(model_path, sid)
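            # Clean up the index path and swap a "trained" index for its "added" counterpart.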
file_index = (
index_path.strip()
.strip('"')
.strip("\n")
.strip('"')
.strip()
.replace("trained", "added")
)
start_time = time.time()
print(f"Converting audio batch '{audio_input_paths}'...")
audio_files = [
f
for f in os.listdir(audio_input_paths)
if f.endswith((".mp3", ".wav", ".flac", ".m4a", ".ogg", ".opus"))
]
print(f"Detected {len(audio_files)} audio files for inference.")
            for audio_input_path in audio_files:
audio_output_paths = os.path.join(
audio_output_path,
f"{os.path.splitext(os.path.basename(audio_input_path))[0]}_output.{export_format.lower()}",
)
if os.path.exists(audio_output_paths):
continue
print(f"Converting audio '{audio_input_path}'...")
audio_input_path = os.path.join(audio_input_paths, audio_input_path)
                if upscale_audio:
                    upscale(audio_input_path, audio_input_path)
audio = load_audio_infer(
audio_input_path,
16000,
formant_shifting,
formant_qfrency,
formant_timbre,
)
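                # Peak-normalize so the signal never exceeds ~0.95 of full scale.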
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
audio /= audio_max
                # Honor a requested output resample rate of at least 16 kHz.
                if resample_sr >= 16000 and self.tgt_sr != resample_sr:
                    self.tgt_sr = resample_sr
if split_audio:
result, new_dir_path = process_audio(audio_input_path)
if result == "Error":
return "Error with Split Audio", None
dir_path = (
new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
)
if dir_path:
paths = [
os.path.join(root, name)
for root, _, files in os.walk(dir_path, topdown=False)
for name in files
if name.endswith(".wav") and root == dir_path
]
try:
for path in paths:
self.convert_audio(
audio_input_path=path,
audio_output_path=path,
model_path=model_path,
index_path=index_path,
sid=sid,
pitch=pitch,
f0_file=None,
f0_method=f0_method,
index_rate=index_rate,
resample_sr=resample_sr,
volume_envelope=volume_envelope,
protect=protect,
hop_length=hop_length,
split_audio=False,
f0_autotune=f0_autotune,
filter_radius=filter_radius,
export_format=export_format,
upscale_audio=upscale_audio,
embedder_model=embedder_model,
embedder_model_custom=embedder_model_custom,
clean_audio=clean_audio,
clean_strength=clean_strength,
formant_shifting=formant_shifting,
formant_qfrency=formant_qfrency,
formant_timbre=formant_timbre,
post_process=post_process,
reverb=reverb,
pitch_shift=pitch_shift,
limiter=limiter,
gain=gain,
distortion=distortion,
chorus=chorus,
bitcrush=bitcrush,
clipping=clipping,
compressor=compressor,
delay=delay,
sliders=sliders,
)
except Exception as error:
print(
f"An error occurred processing the segmented audio: {error}"
)
print(traceback.format_exc())
return f"Error {error}"
print("Finished processing segmented audio, now merging audio...")
merge_timestamps_file = os.path.join(
os.path.dirname(new_dir_path),
f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
)
self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
os.remove(merge_timestamps_file)
                    # Write the merged audio first; post-processing then operates
                    # on the written file, mirroring the unsplit branch below.
                    sf.write(audio_output_paths, audio_opt, self.tgt_sr, format="WAV")
                    if post_process:
                        # NOTE: slider values are read positionally here (sequence
                        # indices), unlike the key-based access in the unsplit branch.
                        audio_output_paths = self.post_process_audio(
                            audio_input=audio_output_paths,
                            sample_rate=self.tgt_sr,
                            reverb=reverb,
                            reverb_room_size=sliders[0],
                            reverb_damping=sliders[1],
                            reverb_wet_level=sliders[2],
                            reverb_dry_level=sliders[3],
                            reverb_width=sliders[4],
                            reverb_freeze_mode=sliders[5],
                            pitch_shift=pitch_shift,
                            pitch_shift_semitones=sliders[6],
                            limiter=limiter,
                            limiter_threshold=sliders[7],
                            limiter_release=sliders[8],
                            gain=gain,
                            gain_db=sliders[9],
                            distortion=distortion,
                            distortion_gain=sliders[10],
                            chorus=chorus,
                            chorus_rate=sliders[11],
                            chorus_depth=sliders[12],
                            chorus_delay=sliders[13],
                            chorus_feedback=sliders[14],
                            chorus_mix=sliders[15],
                            bitcrush=bitcrush,
                            bitcrush_bit_depth=sliders[16],
                            clipping=clipping,
                            clipping_threshold=sliders[17],
                            compressor=compressor,
                            compressor_threshold=sliders[18],
                            compressor_ratio=sliders[19],
                            compressor_attack=sliders[20],
                            compressor_release=sliders[21],
                            delay=delay,
                            delay_seconds=sliders[22],
                            delay_feedback=sliders[23],
                            delay_mix=sliders[24],
                            audio_output_path=audio_output_paths,
                        )
else:
audio_opt = self.vc.pipeline(
model=self.hubert_model,
net_g=self.net_g,
sid=sid,
audio=audio,
input_audio_path=audio_input_path,
pitch=pitch,
f0_method=f0_method,
file_index=file_index,
index_rate=index_rate,
pitch_guidance=self.use_f0,
filter_radius=filter_radius,
tgt_sr=self.tgt_sr,
resample_sr=resample_sr,
volume_envelope=volume_envelope,
version=self.version,
protect=protect,
hop_length=hop_length,
f0_autotune=f0_autotune,
f0_file=f0_file,
)
if audio_output_paths:
sf.write(audio_output_paths, audio_opt, self.tgt_sr, format="WAV")
if clean_audio:
cleaned_audio = self.remove_audio_noise(
audio_output_paths, clean_strength
)
if cleaned_audio is not None:
sf.write(
audio_output_paths, cleaned_audio, self.tgt_sr, format="WAV"
)
if post_process:
audio_output_paths = self.post_process_audio(
audio_input=audio_output_paths,
sample_rate=self.tgt_sr,
reverb=reverb,
reverb_room_size=sliders["reverb_room_size"],
reverb_damping=sliders["reverb_damping"],
reverb_wet_level=sliders["reverb_wet_level"],
reverb_dry_level=sliders["reverb_dry_level"],
reverb_width=sliders["reverb_width"],
reverb_freeze_mode=sliders["reverb_freeze_mode"],
pitch_shift=pitch_shift,
pitch_shift_semitones=sliders["pitch_shift_semitones"],
limiter=limiter,
limiter_threshold=sliders["limiter_threshold"],
limiter_release=sliders["limiter_release"],
gain=gain,
gain_db=sliders["gain_db"],
distortion=distortion,
distortion_gain=sliders["distortion_gain"],
chorus=chorus,
chorus_rate=sliders["chorus_rate"],
chorus_depth=sliders["chorus_depth"],
chorus_delay=sliders["chorus_delay"],
chorus_feedback=sliders["chorus_feedback"],
chorus_mix=sliders["chorus_mix"],
bitcrush=bitcrush,
bitcrush_bit_depth=sliders["bitcrush_bit_depth"],
clipping=clipping,
clipping_threshold=sliders["clipping_threshold"],
compressor=compressor,
compressor_threshold=sliders["compressor_threshold"],
compressor_ratio=sliders["compressor_ratio"],
compressor_attack=sliders["compressor_attack"],
compressor_release=sliders["compressor_release"],
delay=delay,
delay_seconds=sliders["delay_seconds"],
delay_feedback=sliders["delay_feedback"],
delay_mix=sliders["delay_mix"],
audio_output_path=audio_output_paths,
)
output_path_format = audio_output_paths.replace(
".wav", f".{export_format.lower()}"
)
audio_output_paths = self.convert_audio_format(
audio_output_paths, output_path_format, export_format
)
print(f"Conversion completed at '{audio_output_paths}'.")
elapsed_time = time.time() - start_time
print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
            if pid_file_path is not None and os.path.exists(pid_file_path):
                os.remove(pid_file_path)
except Exception as error:
print(f"An error occurred during audio conversion: {error}")
print(traceback.format_exc())
def get_vc(self, weight_root, sid):
"""
Loads the voice conversion model and sets up the pipeline.
Args:
weight_root (str): Path to the model weights.
sid (int): Speaker ID.
"""
if sid == "" or sid == []:
self.cleanup_model()
if torch.cuda.is_available():
torch.cuda.empty_cache()
self.load_model(weight_root)
if self.cpt is not None:
self.setup_network()
self.setup_vc_instance()
    def cleanup_model(self):
        """
        Cleans up the loaded models and releases resources.
        """
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
        del self.net_g, self.cpt
        # Re-bind the deleted attributes so a repeated cleanup cannot raise AttributeError.
        self.net_g = self.cpt = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
def load_model(self, weight_root):
"""
Loads the model weights from the specified path.
Args:
weight_root (str): Path to the model weights.
"""
self.cpt = (
torch.load(weight_root, map_location="cpu")
if os.path.isfile(weight_root)
else None
)
def setup_network(self):
"""
Sets up the network configuration based on the loaded checkpoint.
"""
if self.cpt is not None:
self.tgt_sr = self.cpt["config"][-1]
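            # Recover the speaker count from the speaker-embedding table.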
self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
self.use_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
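            # v2 checkpoints use 768-dim content features; v1 checkpoints use 256-dim.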
self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
self.net_g = Synthesizer(
*self.cpt["config"],
use_f0=self.use_f0,
text_enc_hidden_dim=self.text_enc_hidden_dim,
is_half=self.config.is_half,
)
del self.net_g.enc_q
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
self.net_g.eval().to(self.config.device)
self.net_g = (
self.net_g.half() if self.config.is_half else self.net_g.float()
)
def setup_vc_instance(self):
"""
Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
"""
if self.cpt is not None:
self.vc = VC(self.tgt_sr, self.config)
self.n_spk = self.cpt["config"][-3]
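

# Minimal usage sketch (not part of the original module): every path and
# parameter value below is an illustrative placeholder -- substitute your own
# model, index, and audio files before running.
if __name__ == "__main__":
    converter = VoiceConverter()
    converter.convert_audio(
        audio_input_path="input.wav",
        audio_output_path="output.wav",
        model_path="logs/my_model/my_model.pth",
        index_path="logs/my_model/added_my_model.index",
        embedder_model="contentvec",
        pitch=0,
        f0_file=None,
        f0_method="rmvpe",
        index_rate=0.5,
        volume_envelope=1,
        protect=0.33,
        hop_length=128,
        split_audio=False,
        f0_autotune=False,
        filter_radius=3,
        embedder_model_custom=None,
        clean_audio=False,
        clean_strength=0.7,
        export_format="WAV",
        upscale_audio=False,
        formant_shifting=False,
        formant_qfrency=1.0,
        formant_timbre=1.0,
        post_process=False,
        reverb=False,
        pitch_shift=False,
        limiter=False,
        gain=False,
        distortion=False,
        chorus=False,
        bitcrush=False,
        clipping=False,
        compressor=False,
        delay=False,
        sliders=None,
    )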