diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml deleted file mode 100644 index d4204761c085cc04792c6446ae5e7d55633719fc..0000000000000000000000000000000000000000 --- a/.github/workflows/sync.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Sync to Hugging Face Spaces - -on: - push: - branches: - - main - -jobs: - sync: - name: Sync - runs-on: ubuntu-latest - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - with: - lfs: true - - - name: Sync to Hugging Face Spaces - uses: JacobLinCool/huggingface-sync@v1 - with: - github: ${{ secrets.GITHUB_TOKEN }} - user: jacoblincool # Hugging Face username or organization name - space: ZeroRVC # Hugging Face space name - token: ${{ secrets.HF_TOKEN }} # Hugging Face token - configuration: headers.yaml diff --git a/.gitignore b/.gitignore index 2d30cb63522405061de0db9926bf9afa021e42c6..9ab76a12bc96eff64a46cc52cd9a22f7aa9ae58f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ .DS_Store *.pyc __pycache__ -dist/ -logs/ -separated/ diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 38bc5a7b8423e62742d0ac3f08527fb11ba20b2d..0000000000000000000000000000000000000000 --- a/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2024 Jacob Lin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md index 5cdec13ed806aa9e69bb6af2806c10796b235ace..bd0ea5ec405147f732262cd1d65ef33a204a8dd9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ --- title: ZeroRVC -emoji: 🎙️ +emoji: 🦀 colorFrom: gray colorTo: gray sdk: gradio @@ -11,47 +11,4 @@ pinned: false # ZeroRVC -Run Retrieval-based Voice Conversion training and inference with ease. - -## Features - -- [x] Dataset Preparation -- [x] Hugging Face Datasets Integration -- [x] Hugging Face Accelerate Integration -- [x] Trainer API -- [x] Inference API - - [ ] Index Support -- [x] Tensorboard Support -- [ ] FP16 Support - -## Dataset Preparation - -ZeroRVC provides a simple API to prepare your dataset for training. You only need to provide the path to your audio files. The feature extraction models will be downloaded automatically, or you can provide your own with the `hubert` and `rmvpe` arguments. - -```py -from datasets import load_dataset -from zerorvc import prepare, RVCTrainer - -dataset = load_dataset("my-audio-dataset") -dataset = prepare(dataset) - -trainer = RVCTrainer( - "my-rvc-model", - dataset_train=dataset["train"], - dataset_test=dataset["test"], -) -trainer.train(epochs=100, batch_size=8, upload="someone/rvc-test-1") -``` - -## Inference - -ZeroRVC provides an easy API to convert your voice with the trained model. 
- -```py -from zerorvc import RVC -import soundfile as sf - -rvc = RVC.from_pretrained("someone/rvc-test-1") -samples = rvc.convert("test.mp3") -sf.write("output.wav", samples, rvc.sr) -``` +Run Retrieval-based Voice Conversion training and inference on HuggingFace ZeroGPU. diff --git a/app.py b/app.py index faaea632750a2c695143b42f2ec7335937fe4cbf..c3518a88a9c4e3e0e3f900d5cec674ac5b0aed08 100644 --- a/app.py +++ b/app.py @@ -1,49 +1,53 @@ +from prelude import prelude + +prelude() + import gradio as gr -from app.settings import SettingsTab -from app.tutorial import TutotialTab -from app.dataset import DatasetTab +from app.setup import SetupTab +from app.extract import FeatureExtractionTab from app.train import TrainTab +from app.export import ExportTab from app.infer import InferenceTab -from app.zero import zero_is_available - -if zero_is_available: - import torch - - torch.backends.cuda.matmul.allow_tf32 = True with gr.Blocks() as app: gr.Markdown("# ZeroRVC") gr.Markdown( - "Run Retrieval-based Voice Conversion training and inference on Hugging Face ZeroGPU or locally." + "Run Retrieval-based Voice Conversion training and inference on HuggingFace ZeroGPU." 
) - settings = SettingsTab() - tutorial = TutotialTab() - dataset = DatasetTab() - training = TrainTab() - inference = InferenceTab() + exp_dir = gr.Textbox( + label="Experiment directory", + visible=True, + interactive=False, + ) - with gr.Accordion(label="Environment Settings"): - settings.ui() + setup = SetupTab() + feature_extraction = FeatureExtractionTab() + training = TrainTab() + export = ExportTab() + inferencing = InferenceTab() with gr.Tabs(): - with gr.Tab(label="Tutorial", id=0): - tutorial.ui() + with gr.Tab(label="Setup"): + setup.ui() - with gr.Tab(label="Dataset", id=1): - dataset.ui() + with gr.Tab(label="Feature Extraction"): + feature_extraction.ui() - with gr.Tab(label="Training", id=2): + with gr.Tab(label="Training"): training.ui() - with gr.Tab(label="Inference", id=3): - inference.ui() + with gr.Tab(label="Download"): + export.ui() + + with gr.Tab(label="Inference"): + inferencing.ui() - settings.build() - tutorial.build() - dataset.build(settings.exp_dir, settings.hf_token) - training.build(settings.exp_dir, settings.hf_token) - inference.build(settings.exp_dir) + setup.build(exp_dir) + feature_extraction.build(exp_dir) + training.build(exp_dir) + export.build(exp_dir) + inferencing.build(exp_dir) app.launch() diff --git a/app/constants.py b/app/constants.py deleted file mode 100644 index a5e1a1e424940db07c890b4827bebbcacac323a5..0000000000000000000000000000000000000000 --- a/app/constants.py +++ /dev/null @@ -1,13 +0,0 @@ -import os -from pathlib import Path - -HF_TOKEN = os.environ.get("HF_TOKEN") - -ROOT_EXP_DIR = Path( - os.environ.get("ROOT_EXP_DIR") - or os.path.join(os.path.dirname(os.path.abspath(__file__)), "../logs") -).resolve() -ROOT_EXP_DIR.mkdir(exist_ok=True, parents=True) - -BATCH_SIZE = int(os.environ.get("BATCH_SIZE") or 8) -TRAINING_EPOCHS = int(os.environ.get("TRAINING_EPOCHS") or 10) diff --git a/app/dataset.py b/app/dataset.py deleted file mode 100644 index 
d3afa4c204ff7cbc1b2d514da3a773f5a3d72241..0000000000000000000000000000000000000000 --- a/app/dataset.py +++ /dev/null @@ -1,225 +0,0 @@ -import os -import gradio as gr -import zipfile -import tempfile -from zerorvc import prepare -from datasets import load_dataset, load_from_disk -from .constants import ROOT_EXP_DIR, BATCH_SIZE -from .zero import zero -from .model import accelerator - - -def extract_audio_files(zip_file: str, target_dir: str) -> list[str]: - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(target_dir) - - audio_files = [ - os.path.join(target_dir, f) - for f in os.listdir(target_dir) - if f.endswith((".wav", ".mp3", ".ogg")) - ] - if not audio_files: - raise gr.Error("No audio files found at the top level of the zip file") - - return audio_files - - -def make_dataset_from_zip(exp_dir: str, zip_file: str): - if not exp_dir: - exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR) - print(f"Using exp dir: {exp_dir}") - - data_dir = os.path.join(exp_dir, "raw_data") - if not os.path.exists(data_dir): - os.makedirs(data_dir) - extract_audio_files(zip_file, data_dir) - - ds = prepare( - data_dir, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=1, - ) - - return exp_dir, str(ds) - - -@zero(duration=120) -def make_dataset_from_zip_stage_2(exp_dir: str): - data_dir = os.path.join(exp_dir, "raw_data") - ds = prepare( - data_dir, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=2, - ) - return exp_dir, str(ds) - - -def make_dataset_from_zip_stage_3(exp_dir: str): - data_dir = os.path.join(exp_dir, "raw_data") - ds = prepare( - data_dir, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=3, - ) - - dataset = os.path.join(exp_dir, "dataset") - ds.save_to_disk(dataset) - return exp_dir, str(ds) - - -def make_dataset_from_repo(repo: str, hf_token: str): - ds = load_dataset(repo, token=hf_token) - ds = prepare( - ds, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=1, - ) - return str(ds) - - 
-@zero(duration=120) -def make_dataset_from_repo_stage_2(repo: str, hf_token: str): - ds = load_dataset(repo, token=hf_token) - ds = prepare( - ds, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=2, - ) - return str(ds) - - -def make_dataset_from_repo_stage_3(exp_dir: str, repo: str, hf_token: str): - ds = load_dataset(repo, token=hf_token) - ds = prepare( - ds, - accelerator=accelerator, - batch_size=BATCH_SIZE, - stage=3, - ) - - if not exp_dir: - exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR) - print(f"Using exp dir: {exp_dir}") - - dataset = os.path.join(exp_dir, "dataset") - ds.save_to_disk(dataset) - return exp_dir, str(ds) - - -def use_dataset(exp_dir: str, repo: str, hf_token: str): - gr.Info("Fetching dataset") - ds = load_dataset(repo, token=hf_token) - - if not exp_dir: - exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR) - print(f"Using exp dir: {exp_dir}") - - dataset = os.path.join(exp_dir, "dataset") - ds.save_to_disk(dataset) - return exp_dir, str(ds) - - -def upload_dataset(exp_dir: str, repo: str, hf_token: str): - dataset = os.path.join(exp_dir, "dataset") - if not os.path.exists(dataset): - raise gr.Error("Dataset not found") - - gr.Info("Uploading dataset") - ds = load_from_disk(dataset) - ds.push_to_hub(repo, token=hf_token, private=True) - gr.Info("Dataset uploaded successfully") - - -class DatasetTab: - def __init__(self): - pass - - def ui(self): - gr.Markdown("# Dataset") - gr.Markdown("The suggested dataset size is > 5 minutes of audio.") - - gr.Markdown("## Create Dataset from ZIP") - gr.Markdown( - "Create a dataset by simply upload a zip file containing audio files. The audio files should be at the top level of the zip file." 
- ) - with gr.Row(): - self.zip_file = gr.File( - label="Upload a zip file containing audio files", - file_types=["zip"], - ) - self.make_ds_from_dir = gr.Button( - value="Create Dataset from ZIP", variant="primary" - ) - - gr.Markdown("## Create Dataset from Dataset Repository") - gr.Markdown( - "You can also create a dataset from any Hugging Face dataset repository that has 'audio' column." - ) - with gr.Row(): - self.repo = gr.Textbox( - label="Hugging Face Dataset Repository", - placeholder="username/dataset-name", - ) - self.make_ds_from_repo = gr.Button( - value="Create Dataset from Repo", variant="primary" - ) - - gr.Markdown("## Sync Preprocessed Dataset") - gr.Markdown( - "After you have preprocessed the dataset, you can upload the dataset to Hugging Face. And fetch it back later directly." - ) - with gr.Row(): - self.preprocessed_repo = gr.Textbox( - label="Hugging Face Dataset Repository", - placeholder="username/dataset-name", - ) - self.fetch_ds = gr.Button(value="Fetch Dataset", variant="primary") - self.upload_ds = gr.Button(value="Upload Dataset", variant="primary") - - self.ds_state = gr.Textbox(label="Dataset Info", lines=5) - - def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox): - self.make_ds_from_dir.click( - fn=make_dataset_from_zip, - inputs=[exp_dir, self.zip_file], - outputs=[exp_dir, self.ds_state], - ).success( - fn=make_dataset_from_zip_stage_2, - inputs=[exp_dir], - outputs=[exp_dir, self.ds_state], - ).success( - fn=make_dataset_from_zip_stage_3, - inputs=[exp_dir], - outputs=[exp_dir, self.ds_state], - ) - - self.make_ds_from_repo.click( - fn=make_dataset_from_repo, - inputs=[self.repo, hf_token], - outputs=[self.ds_state], - ).success( - fn=make_dataset_from_repo_stage_2, - inputs=[self.repo, hf_token], - outputs=[self.ds_state], - ).success( - fn=make_dataset_from_repo_stage_3, - inputs=[exp_dir, self.repo, hf_token], - outputs=[exp_dir, self.ds_state], - ) - - self.fetch_ds.click( - fn=use_dataset, - inputs=[exp_dir, 
self.preprocessed_repo, hf_token], - outputs=[exp_dir, self.ds_state], - ) - - self.upload_ds.click( - fn=upload_dataset, - inputs=[exp_dir, self.preprocessed_repo, hf_token], - outputs=[], - ) diff --git a/app/dataset_maker.py b/app/dataset_maker.py index fcf9a4610524475ed37ed6cf4911bb1ddee0fb35..9f3a3656a76da54fd99f2d015a57c1e1e4712ad7 100644 --- a/app/dataset_maker.py +++ b/app/dataset_maker.py @@ -9,19 +9,16 @@ import zipfile # Function to download audio from YouTube and save it as a WAV file def download_youtube_audio(url, audio_name): ydl_opts = { - "format": "bestaudio/best", - "postprocessors": [ - { - "key": "FFmpegExtractAudio", - "preferredcodec": "wav", - } - ], - "outtmpl": f"youtubeaudio/{audio_name}", # Output template + 'format': 'bestaudio/best', + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'wav', + }], + "outtmpl": f'youtubeaudio/{audio_name}', # Output template } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) - return f"youtubeaudio/{audio_name}.wav" - + return f'youtubeaudio/{audio_name}.wav' # Function to calculate RMS def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"): @@ -33,7 +30,9 @@ def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"): x_shape_trimmed = list(y.shape) x_shape_trimmed[axis] -= frame_length - 1 out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + xw = np.lib.stride_tricks.as_strided( + y, shape=out_shape, strides=out_strides + ) if axis < 0: target_axis = axis - 1 else: @@ -46,28 +45,15 @@ def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"): power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) return np.sqrt(power) - # Slicer class class Slicer: - def __init__( - self, - sr, - threshold=-40.0, - min_length=5000, - min_interval=300, - hop_size=20, - max_sil_kept=5000, - ): + def __init__(self, sr, threshold=-40., min_length=5000, 
min_interval=300, hop_size=20, max_sil_kept=5000): if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) + raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size') if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) + raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size') min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) + self.threshold = 10 ** (threshold / 20.) self.hop_size = round(sr * hop_size / 1000) self.win_size = min(round(min_interval), 4 * self.hop_size) self.min_length = round(sr * min_length / 1000 / self.hop_size) @@ -76,13 +62,9 @@ class Slicer: def _apply_slice(self, waveform, begin, end): if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] + return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)] else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] + return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)] def slice(self, waveform): if len(waveform.shape) > 1: @@ -91,9 +73,7 @@ class Slicer: samples = waveform if samples.shape[0] <= self.min_length: return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) + rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) sil_tags = [] silence_start = None clip_start = 0 @@ -105,36 +85,22 @@ class Slicer: if silence_start is None: continue is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) + need_slice_middle = i - 
silence_start >= self.min_interval and i - clip_start >= self.min_length if not is_leading_silence and not need_slice_middle: silence_start = None continue if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start + pos = rms_list[silence_start: i + 1].argmin() + silence_start if silence_start == 0: sil_tags.append((0, pos)) else: sil_tags.append((pos, pos)) clip_start = pos elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() + pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin() pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) clip_start = pos_r @@ -142,17 +108,8 @@ class Slicer: sil_tags.append((min(pos_l, pos), max(pos_r, pos))) clip_start = max(pos_r, pos) else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept if silence_start == 0: sil_tags.append((0, pos_r)) else: @@ -160,12 +117,9 @@ class Slicer: clip_start = pos_r silence_start = None total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): + if silence_start is not None and total_frames - silence_start >= 
self.min_interval: silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start sil_tags.append((pos, total_frames + 1)) if len(sil_tags) == 0: return [waveform] @@ -174,41 +128,28 @@ class Slicer: if sil_tags[0][0] > 0: chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) for i in range(len(sil_tags) - 1): - chunks.append( - self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) - ) + chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])) if sil_tags[-1][1] < total_frames: - chunks.append( - self._apply_slice(waveform, sil_tags[-1][1], total_frames) - ) + chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames)) return chunks - # Function to slice and save audio chunks def slice_audio(file_path, audio_name): audio, sr = librosa.load(file_path, sr=None, mono=False) - os.makedirs(f"dataset/{audio_name}", exist_ok=True) - slicer = Slicer( - sr=sr, - threshold=-40, - min_length=5000, - min_interval=500, - hop_size=10, - max_sil_kept=500, - ) + os.makedirs(f'dataset/{audio_name}', exist_ok=True) + slicer = Slicer(sr=sr, threshold=-40, min_length=5000, min_interval=500, hop_size=10, max_sil_kept=500) chunks = slicer.slice(audio) for i, chunk in enumerate(chunks): if len(chunk.shape) > 1: chunk = chunk.T - sf.write(f"dataset/{audio_name}/split_{i}.wav", chunk, sr) + sf.write(f'dataset/{audio_name}/split_{i}.wav', chunk, sr) return f"dataset/{audio_name}" - # Function to zip the dataset directory def zip_directory(directory_path, audio_name): zip_file = f"dataset/{audio_name}.zip" os.makedirs(os.path.dirname(zip_file), exist_ok=True) # Ensure the directory exists - with zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED) as zipf: + with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf: for root, dirs, files in os.walk(directory_path): for file in 
files: file_path = os.path.join(root, file) @@ -222,4 +163,4 @@ def process_audio(url, audio_name): file_path = download_youtube_audio(url, audio_name) dataset_path = slice_audio(file_path, audio_name) zip_file = zip_directory(dataset_path, audio_name) - return zip_file, print(f"{zip_file} successfully processed") + return zip_file, print(f"{zip_file} successfully processed") \ No newline at end of file diff --git a/app/export.py b/app/export.py new file mode 100644 index 0000000000000000000000000000000000000000..628d3b6da0ac772c072e7d20a14649647baeba60 --- /dev/null +++ b/app/export.py @@ -0,0 +1,178 @@ +from glob import glob +import os +import shutil +import gradio as gr +from infer.lib.train.process_ckpt import extract_small_model +from app.train import train_index +from huggingface_hub import upload_folder + + +def download_weight(exp_dir: str) -> str: + checkpoints = glob(f"{exp_dir}/G_*.pth") + if not checkpoints: + raise gr.Error("No checkpoint found") + + latest_checkpoint = max(checkpoints, key=os.path.getctime) + print(f"Latest checkpoint: {latest_checkpoint}") + + out = os.path.join(exp_dir, f"model.pth") + extract_small_model( + latest_checkpoint, out, "40k", True, "Model trained by ZeroGPU.", "v2" + ) + + return out + + +def download_inference_pack(exp_dir: str) -> str: + net_g = download_weight(exp_dir) + index = glob(f"{exp_dir}/added_*.index") + if not index: + train_index(exp_dir) + index = glob(f"{exp_dir}/added_*.index") + if not index: + raise gr.Error("Index not found") + + # make zip of those two files + tmp = os.path.join(exp_dir, "inference_pack") + if os.path.exists(tmp): + shutil.rmtree(tmp) + os.makedirs(tmp) + shutil.copy(net_g, tmp) + shutil.copy(index[0], tmp) + shutil.make_archive(tmp, "zip", tmp) + shutil.rmtree(tmp) + + return f"{tmp}.zip" + + +def download_expdir(exp_dir: str) -> str: + shutil.make_archive(exp_dir, "zip", exp_dir) + return f"{exp_dir}.zip" + + +def upload_to_huggingface(exp_dir: str, repo_id: str, token: str) -> 
str: + commit = upload_folder( + repo_id=repo_id, + folder_path=exp_dir, + ignore_patterns=["_data", "*.zip", "tmp.wav"], + token=token if token.startswith("hf_") else None, + ) + return commit.commit_url + + +def remove_legacy_checkpoints(exp_dir: str): + checkpoints = glob(f"{exp_dir}/G_*.pth") + if not checkpoints: + raise gr.Error("No checkpoint found") + + latest_checkpoint = max(checkpoints, key=os.path.getctime) + print(f"Latest checkpoint: {latest_checkpoint}") + for checkpoint in checkpoints: + if checkpoint != latest_checkpoint: + os.remove(checkpoint) + print(f"Removed: {checkpoint}") + + checkpoints = glob(f"{exp_dir}/D_*.pth") + if not checkpoints: + raise gr.Error("No checkpoint found") + + latest_checkpoint = max(checkpoints, key=os.path.getctime) + print(f"Latest checkpoint: {latest_checkpoint}") + for checkpoint in checkpoints: + if checkpoint != latest_checkpoint: + os.remove(checkpoint) + print(f"Removed: {checkpoint}") + + +def remove_expdir(exp_dir: str) -> str: + shutil.rmtree(exp_dir) + return "" + + +class ExportTab: + def __init__(self): + pass + + def ui(self): + gr.Markdown("# Download Model or Experiment Directory") + gr.Markdown( + "You can download the latest model or the entire experiment directory here." 
+ ) + + with gr.Row(): + self.download_weight_btn = gr.Button( + value="Latest model (for inferencing)", variant="primary" + ) + self.download_weight_output = gr.File(label="Prune latest model") + + with gr.Row(): + self.download_inference_pack_btn = gr.Button( + value="Download inference pack (model + index)", variant="primary" + ) + self.download_inference_pack_output = gr.File(label="Inference pack") + + with gr.Row(): + self.download_expdir_btn = gr.Button( + value="Download experiment directory", variant="primary" + ) + self.download_expdir_output = gr.File(label="Archive experiment directory") + + with gr.Row(): + with gr.Column(): + gr.Markdown("### Upload to Hugging Face") + gr.Markdown( + "You can upload the entire experiment directory to Hugging Face." + ) + self.commit_link = gr.Markdown("") + with gr.Column(): + self.repo_id = gr.Textbox(label="Repository ID") + self.token = gr.Textbox(label="Personal access token") + self.upload_to_huggingface_btn = gr.Button( + value="Upload to Hugging Face", variant="primary" + ) + + with gr.Row(): + self.remove_legacy_checkpoints_btn = gr.Button( + value="Remove legacy checkpoints" + ) + + with gr.Row(): + self.remove_expdir_btn = gr.Button( + value="REMOVE experiment directory", variant="stop" + ) + + def build(self, exp_dir: gr.Textbox): + self.download_weight_btn.click( + fn=download_weight, + inputs=[exp_dir], + outputs=[self.download_weight_output], + ) + + self.download_inference_pack_btn.click( + fn=download_inference_pack, + inputs=[exp_dir], + outputs=[self.download_inference_pack_output], + ) + + self.download_expdir_btn.click( + fn=download_expdir, + inputs=[exp_dir], + outputs=[self.download_expdir_output], + ) + + self.upload_to_huggingface_btn.click( + fn=upload_to_huggingface, + inputs=[exp_dir, self.repo_id, self.token], + outputs=[self.commit_link], + ) + + self.remove_legacy_checkpoints_btn.click( + fn=remove_legacy_checkpoints, + inputs=[exp_dir], + ) + + self.remove_expdir_btn.click( + 
fn=remove_expdir, + inputs=[exp_dir], + outputs=[exp_dir], + ) diff --git a/app/extract.py b/app/extract.py new file mode 100644 index 0000000000000000000000000000000000000000..7d1f095d75ac27b00606d142b1838c301d582e2d --- /dev/null +++ b/app/extract.py @@ -0,0 +1,64 @@ +import gradio as gr +from infer.modules.train.extract.extract_f0_rmvpe import FeatureInput +from infer.modules.train.extract_feature_print import HubertFeatureExtractor +from zero import zero + + +@zero(duration=300) +def extract_features(exp_dir: str) -> str: + err = None + fi = FeatureInput(exp_dir) + try: + fi.run() + except Exception as e: + err = e + + fi.logfile.seek(0) + log = fi.logfile.read() + + if err: + log = f"Error: {err}\n{log}" + return log + + hfe = HubertFeatureExtractor(exp_dir) + try: + hfe.run() + except Exception as e: + err = e + + hfe.logfile.seek(0) + log += hfe.logfile.read() + + if err: + log = f"Error: {err}\n{log}" + + return log + + +class FeatureExtractionTab: + def __init__(self): + pass + + def ui(self): + gr.Markdown("# Feature Extraction") + gr.Markdown( + "Before training, you need to extract features from the audio files. " + "This process may take a while, depending on the number of audio files. " + "Under the hood, this process extracts speech features using HuBERT and extracts F0 by RMVPE." 
+ ) + + with gr.Row(): + self.extract_features_btn = gr.Button( + value="Extract features", variant="primary" + ) + with gr.Row(): + self.extract_features_log = gr.Textbox( + label="Feature extraction log", lines=10 + ) + + def build(self, exp_dir: gr.Textbox): + self.extract_features_btn.click( + fn=extract_features, + inputs=[exp_dir], + outputs=[self.extract_features_log], + ) diff --git a/app/infer.py b/app/infer.py index bf23c29f38ca652be3133be5c6d547cb75cedf1f..34317e46aefdc009df094a8792338faf5aff178a 100644 --- a/app/infer.py +++ b/app/infer.py @@ -1,81 +1,65 @@ +from glob import glob import os -import shutil -import hashlib -from pathlib import Path from typing import Tuple from demucs.separate import main as demucs import gradio as gr import numpy as np import soundfile as sf -from zerorvc import RVC -from .zero import zero -from .model import device -import yt_dlp - - -def download_audio(url): - ydl_opts = { - "format": "bestaudio/best", - "outtmpl": "ytdl/%(title)s.%(ext)s", - "postprocessors": [ - { - "key": "FFmpegExtractAudio", - "preferredcodec": "wav", - "preferredquality": "192", - } - ], - } - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info_dict = ydl.extract_info(url, download=True) - file_path = ydl.prepare_filename(info_dict).rsplit(".", 1)[0] + ".wav" - sample_rate, audio_data = read(file_path) - audio_array = np.asarray(audio_data, dtype=np.int16) - - return sample_rate, audio_array +from configs.config import Config +from infer.modules.vc.modules import VC +from zero import zero +from model import device @zero(duration=120) def infer( - exp_dir: str, original_audio: str, pitch_mod: int, protect: float + exp_dir: str, original_audio: str, f0add: int, index_rate: float, protect: float ) -> Tuple[int, np.ndarray]: - checkpoint_dir = os.path.join(exp_dir, "checkpoints") - if not os.path.exists(checkpoint_dir): + model = os.path.join(exp_dir, "model.pth") + if not os.path.exists(model): raise gr.Error("Model not found") - # rename the original 
audio to the hash - with open(original_audio, "rb") as f: - original_audio_hash = hashlib.md5(f.read()).hexdigest() - ext = Path(original_audio).suffix - original_audio_hashed = os.path.join(exp_dir, f"{original_audio_hash}{ext}") - shutil.copy(original_audio, original_audio_hashed) - - out = os.path.join("separated", "htdemucs", original_audio_hash, "vocals.wav") - if not os.path.exists(out): - demucs( - [ - "--two-stems", - "vocals", - "-d", - str(device), - "-n", - "htdemucs", - original_audio_hashed, - ] - ) + index = glob(f"{exp_dir}/added_*.index") + if index: + index = index[0] + else: + index = None + + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + demucs( + ["--two-stems", "vocals", "-d", str(device), "-n", "htdemucs", original_audio] + ) + out = os.path.join("separated", "htdemucs", base, "vocals.wav") + + cfg = Config() + vc = VC(cfg) + vc.get_vc(model) + _, wav_opt = vc.vc_single( + 0, + out, + f0add, + None, + "rmvpe", + index, + None, + index_rate, + 3, # this only has effect when f0_method is "harvest" + 0, + 1, + protect, + ) - rvc = RVC.from_pretrained(checkpoint_dir) - samples = rvc.convert(out, pitch_modification=pitch_mod, protect=protect) - file = os.path.join(exp_dir, "infer.wav") - sf.write(file, samples, rvc.sr) + sr = wav_opt[0] + data = wav_opt[1] - return file + return sr, data def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str: - with open(original_audio, "rb") as f: - original_audio_hash = hashlib.md5(f.read()).hexdigest() - music = os.path.join("separated", "htdemucs", original_audio_hash, "no_vocals.wav") + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + music = os.path.join("separated", "htdemucs", base, "no_vocals.wav") tmp = os.path.join(exp_dir, "tmp.wav") sf.write(tmp, vocal[1], vocal[0]) @@ -104,35 +88,26 @@ class InferenceTab: type="filepath", show_download_button=True, ) - with gr.Accordion("inference by Link", open=False): - with 
gr.Row(): - youtube_link = gr.Textbox( - label="Link", - placeholder="Paste the link here", - interactive=True, - ) - with gr.Row(): - gr.Markdown( - "You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)" - ) - with gr.Row(): - download_button = gr.Button("Download!", variant="primary") - download_button.click( - download_audio, [youtube_link], [self.original_audio] - ) with gr.Column(): - self.pitch_mod = gr.Slider( - label="Pitch Modification +/-", + self.f0add = gr.Slider( + label="F0 +/-", minimum=-16, maximum=16, step=1, value=0, ) + self.index_rate = gr.Slider( + label="Index rate", + minimum=-0, + maximum=1, + step=0.01, + value=0.5, + ) self.protect = gr.Slider( label="Protect", minimum=0, - maximum=0.5, + maximum=1, step=0.01, value=0.33, ) @@ -153,7 +128,8 @@ class InferenceTab: inputs=[ exp_dir, self.original_audio, - self.pitch_mod, + self.f0add, + self.index_rate, self.protect, ], outputs=[self.infer_output], diff --git a/app/model.py b/app/model.py deleted file mode 100644 index 5e796adfbd690c9d1c4f10336563624ac68f4c90..0000000000000000000000000000000000000000 --- a/app/model.py +++ /dev/null @@ -1,17 +0,0 @@ -import logging -from accelerate import Accelerator -from zerorvc import load_hubert, load_rmvpe - -logger = logging.getLogger(__name__) - -accelerator = Accelerator() -device = accelerator.device - -logger.info(f"device: {device}") -logger.info(f"mixed_precision: {accelerator.mixed_precision}") - -rmvpe = load_rmvpe(device=device) -logger.info("RMVPE model loaded.") - -hubert = load_hubert(device=device) -logger.info("HuBERT model loaded.") diff --git a/app/settings.py b/app/settings.py deleted file mode 100644 index 042948688da318273aea2d6cb7101e32b55b5b86..0000000000000000000000000000000000000000 --- a/app/settings.py +++ /dev/null @@ -1,26 +0,0 @@ -import gradio as gr -from .constants import HF_TOKEN - - -class SettingsTab: - def 
__init__(self): - pass - - def ui(self): - self.exp_dir = gr.Textbox( - label="Temporary Experiment Directory (auto-managed)", - placeholder="It will be auto-generated after setup", - interactive=True, - ) - gr.Markdown( - "### Sync with Hugging Face 🤗\n\nThe access token will be use to upload/download the dataset and model." - ) - self.hf_token = gr.Textbox( - label="Hugging Face Access Token", - placeholder="Paste your Hugging Face access token here (hf_...)", - value=HF_TOKEN, - interactive=True, - ) - - def build(self): - pass diff --git a/app/setup.py b/app/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..22bc5f11c9e9cfeb67b58cd09c3f1217ae280c57 --- /dev/null +++ b/app/setup.py @@ -0,0 +1,138 @@ +import os +import shutil +import gradio as gr +import zipfile +import tempfile +from infer.modules.train.preprocess import PreProcess +from typing import Tuple +from huggingface_hub import snapshot_download + + +def extract_audio_files(zip_file: str, target_dir: str) -> list[str]: + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(target_dir) + + audio_files = [ + os.path.join(target_dir, f) + for f in os.listdir(target_dir) + if f.endswith((".wav", ".mp3", ".ogg")) + ] + if not audio_files: + raise gr.Error("No audio files found at the top level of the zip file") + + return audio_files + + +def create_new_expdir(zip_file: str) -> Tuple[str, str]: + temp_dir = tempfile.mkdtemp() + print(f"Using exp dir: {temp_dir}") + + data_dir = os.path.join(temp_dir, "_data") + os.makedirs(data_dir) + audio_files = extract_audio_files(zip_file, data_dir) + + pp = PreProcess(40000, temp_dir, 3.0, False) + pp.pipeline_mp_inp_dir(data_dir, 4) + + pp.logfile.seek(0) + log = pp.logfile.read() + + return temp_dir, f"Preprocessed {len(audio_files)} audio files.\n{log}" + + +def restore_expdir(zip: str) -> str: + exp_dir = tempfile.mkdtemp() + shutil.unpack_archive(zip, exp_dir) + return exp_dir + + +def restore_from_huggingface(repo: str, 
token: str) -> str: + exp_dir = os.path.join(tempfile.mkdtemp(), repo.lower()) + snapshot_download( + repo, local_dir=exp_dir, token=token if token.startswith("hf_") else None + ) + return exp_dir + + +def set_dir(dir_val: str) -> str: + if not dir_val.startswith("/tmp/"): + dir_val = os.path.join("/tmp", dir_val) + if not os.path.isdir(dir_val): + raise gr.Error("Directory does not exist") + + return dir_val + + +class SetupTab: + def __init__(self): + pass + + def ui(self): + gr.Markdown("# Setup Experiment") + gr.Markdown( + "You can upload a zip file containing audio files to start a new experiment, or upload an experiment directory zip file to restore an existing experiment.\n" + "The suggested dataset size is > 5 minutes of audio." + ) + + with gr.Row(): + with gr.Column(): + self.zip_file = gr.File( + label="Upload a zip file containing audio files for training", + file_types=["zip"], + ) + self.preprocess_log = gr.Textbox(label="Log", lines=5) + + self.preprocess_btn = gr.Button( + value="Start New Experiment", variant="primary" + ) + + with gr.Row(): + self.restore_zip_file = gr.File( + label="Upload the experiment directory zip file", + file_types=["zip"], + ) + self.restore_btn = gr.Button(value="Restore Experiment", variant="primary") + + gr.Markdown("You can also restore from a Hugging Face repo.") + with gr.Row(): + self.hf_repo = gr.Textbox( + label="Restore from Hugging Face repo", + placeholder="username/repo", + ) + self.hf_token = gr.Textbox( + label="Hugging Face token (optional)", + placeholder="hf_...", + ) + self.restore_hf_btn = gr.Button(value="Restore from Hugging Face") + + with gr.Row(): + self.dir_val = gr.Textbox( + label="Manually set the experiment directory (don't touch it unless you know what you are doing)", + placeholder="/tmp/...", + ) + self.set_dir_btn = gr.Button(value="Set Directory") + + def build(self, exp_dir: gr.Textbox): + self.preprocess_btn.click( + fn=create_new_expdir, + inputs=[self.zip_file], + outputs=[exp_dir, 
self.preprocess_log], + ) + + self.restore_btn.click( + fn=restore_expdir, + inputs=[self.restore_zip_file], + outputs=[exp_dir], + ) + + self.restore_hf_btn.click( + fn=restore_from_huggingface, + inputs=[self.hf_repo, self.hf_token], + outputs=[exp_dir], + ) + + self.set_dir_btn.click( + fn=set_dir, + inputs=[self.dir_val], + outputs=[exp_dir], + ) diff --git a/app/train.py b/app/train.py index 56c0de2392fe8529eca3110fe10e7fbeeed71f45..22f9b731e6b5ec41ad157a1841760fc6380fe9cb 100644 --- a/app/train.py +++ b/app/train.py @@ -1,110 +1,157 @@ import os -import tempfile +import shutil +import traceback +import faiss import gradio as gr -import torch -from zerorvc import RVCTrainer, pretrained_checkpoints, SynthesizerTrnMs768NSFsid -from zerorvc.trainer import TrainingCheckpoint -from datasets import load_from_disk -from huggingface_hub import snapshot_download -from .zero import zero -from .model import accelerator, device -from .constants import BATCH_SIZE, ROOT_EXP_DIR, TRAINING_EPOCHS +import numpy as np +from sklearn.cluster import MiniBatchKMeans +from random import shuffle +from glob import glob +from infer.modules.train.train import train +from zero import zero + + +def write_filelist(exp_dir: str) -> None: + if_f0_3 = True + spk_id5 = 0 + gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir) + feature_dir = "%s/3_feature768" % (exp_dir) + + if if_f0_3: + f0_dir = "%s/2a_f0" % (exp_dir) + f0nsf_dir = "%s/2b-f0nsf" % (exp_dir) + names = ( + set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) + & set([name.split(".")[0] for name in os.listdir(feature_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) + ) + else: + names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( + [name.split(".")[0] for name in os.listdir(feature_dir)] + ) + opt = [] + for name in names: + if if_f0_3: + opt.append( + "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" + % ( + 
gt_wavs_dir.replace("\\", "\\\\"), + name, + feature_dir.replace("\\", "\\\\"), + name, + f0_dir.replace("\\", "\\\\"), + name, + f0nsf_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + else: + opt.append( + "%s/%s.wav|%s/%s.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + feature_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + fea_dim = 768 + + now_dir = os.getcwd() + sr2 = "40k" + if if_f0_3: + for _ in range(2): + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" + % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) + ) + else: + for _ in range(2): + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" + % (now_dir, sr2, now_dir, fea_dim, spk_id5) + ) + shuffle(opt) + with open("%s/filelist.txt" % exp_dir, "w") as f: + f.write("\n".join(opt)) @zero(duration=240) -def train_model(exp_dir: str, progress=gr.Progress()): - dataset = os.path.join(exp_dir, "dataset") - if not os.path.exists(dataset): - raise gr.Error("Dataset not found. 
Please prepare the dataset first.") - - ds = load_from_disk(dataset) - checkpoint_dir = os.path.join(exp_dir, "checkpoints") - trainer = RVCTrainer(checkpoint_dir) - - resume_from = trainer.latest_checkpoint() - if resume_from is None: - resume_from = pretrained_checkpoints() - gr.Info(f"Starting training from pretrained checkpoints.") - else: - gr.Info(f"Resuming training from {resume_from}") - - tqdm = progress.tqdm( - trainer.train( - dataset=ds["train"], - resume_from=resume_from, - batch_size=BATCH_SIZE, - epochs=TRAINING_EPOCHS, - accelerator=accelerator, - ), - total=TRAINING_EPOCHS, - unit="epochs", - desc="Training", +def train_model(exp_dir: str) -> str: + shutil.copy("config.json", exp_dir) + write_filelist(exp_dir) + train(exp_dir) + + models = glob(f"{exp_dir}/G_*.pth") + print(models) + if not models: + raise gr.Error("No model found") + + latest_model = max(models, key=os.path.getctime) + return latest_model + + +def train_index(exp_dir: str) -> str: + feature_dir = "%s/3_feature768" % (exp_dir) + if not os.path.exists(feature_dir): + raise gr.Error("Please extract features first.") + listdir_res = list(os.listdir(feature_dir)) + if len(listdir_res) == 0: + raise gr.Error("Please extract features first.") + npys = [] + for name in sorted(listdir_res): + phone = np.load("%s/%s" % (feature_dir, name)) + npys.append(phone) + big_npy = np.concatenate(npys, 0) + big_npy_idx = np.arange(big_npy.shape[0]) + np.random.shuffle(big_npy_idx) + big_npy = big_npy[big_npy_idx] + if big_npy.shape[0] > 2e5: + print("Trying doing kmeans %s shape to 10k centers." 
% big_npy.shape[0]) + try: + big_npy = ( + MiniBatchKMeans( + n_clusters=10000, + verbose=True, + batch_size=256 * 8, + compute_labels=False, + init="random", + ) + .fit(big_npy) + .cluster_centers_ + ) + except: + info = traceback.format_exc() + print(info) + raise gr.Error(info) + + np.save("%s/total_fea.npy" % exp_dir, big_npy) + n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) + print("%s,%s" % (big_npy.shape, n_ivf)) + index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) + # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf) + print("training") + index_ivf = faiss.extract_index_ivf(index) # + index_ivf.nprobe = 1 + index.train(big_npy) + faiss.write_index( + index, + "%s/trained_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), ) - - for ckpt in tqdm: - info = f"Epoch: {ckpt.epoch} loss: (gen: {ckpt.loss_gen:.4f}, fm: {ckpt.loss_fm:.4f}, mel: {ckpt.loss_mel:.4f}, kl: {ckpt.loss_kl:.4f}, disc: {ckpt.loss_disc:.4f})" - print(info) - latest: TrainingCheckpoint = ckpt - - latest.save(trainer.checkpoint_dir) - latest.G.save_pretrained(trainer.checkpoint_dir) - - result = f"{TRAINING_EPOCHS} epochs trained. 
Latest loss: (gen: {latest.loss_gen:.4f}, fm: {latest.loss_fm:.4f}, mel: {latest.loss_mel:.4f}, kl: {latest.loss_kl:.4f}, disc: {latest.loss_disc:.4f})" - - del trainer - if device.type == "cuda": - torch.cuda.empty_cache() - - return result - - -def upload_model(exp_dir: str, repo: str, hf_token: str): - checkpoint_dir = os.path.join(exp_dir, "checkpoints") - if not os.path.exists(checkpoint_dir): - raise gr.Error("Model not found") - - gr.Info("Uploading model") - model = SynthesizerTrnMs768NSFsid.from_pretrained(checkpoint_dir) - model.push_to_hub(repo, token=hf_token, private=True) - gr.Info("Model uploaded successfully") - - -def upload_checkpoints(exp_dir: str, repo: str, hf_token: str): - checkpoint_dir = os.path.join(exp_dir, "checkpoints") - if not os.path.exists(checkpoint_dir): - raise gr.Error("Checkpoints not found") - - gr.Info("Uploading checkpoints") - trainer = RVCTrainer(checkpoint_dir) - trainer.push_to_hub(repo, token=hf_token, private=True) - gr.Info("Checkpoints uploaded successfully") - - -def fetch_model(exp_dir: str, repo: str, hf_token: str): - if not exp_dir: - exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR) - checkpoint_dir = os.path.join(exp_dir, "checkpoints") - - gr.Info("Fetching model") - files = ["README.md", "config.json", "model.safetensors"] - snapshot_download( - repo, token=hf_token, local_dir=checkpoint_dir, allow_patterns=files + print("adding") + batch_size_add = 8192 + for i in range(0, big_npy.shape[0], batch_size_add): + index.add(big_npy[i : i + batch_size_add]) + faiss.write_index( + index, + "%s/added_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), ) - gr.Info("Model fetched successfully") - - return exp_dir - - -def fetch_checkpoints(exp_dir: str, repo: str, hf_token: str): - if not exp_dir: - exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR) - checkpoint_dir = os.path.join(exp_dir, "checkpoints") + print("built added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe)) - gr.Info("Fetching checkpoints") 
- snapshot_download(repo, token=hf_token, local_dir=checkpoint_dir) - gr.Info("Checkpoints fetched successfully") - - return exp_dir + return "%s/added_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe) class TrainTab: @@ -115,55 +162,26 @@ class TrainTab: gr.Markdown("# Training") gr.Markdown( "You can start training the model by clicking the button below. " - f"Each time you click the button, the model will train for {TRAINING_EPOCHS} epochs, which takes about 3 minutes on ZeroGPU (A100). " + "Each time you click the button, the model will train for 10 epochs, which takes about 3 minutes on ZeroGPU (A100). " + "Tha latest *training checkpoint* will be avaible below." ) with gr.Row(): self.train_btn = gr.Button(value="Train", variant="primary") - self.result = gr.Textbox(label="Training Result", lines=3) - - gr.Markdown("## Sync Model and Checkpoints with Hugging Face") - gr.Markdown( - "You can upload the trained model and checkpoints to Hugging Face for sharing or further training." 
- ) - - self.repo = gr.Textbox(label="Repository ID", placeholder="username/repo") + self.latest_checkpoint = gr.File(label="Latest checkpoint") with gr.Row(): - self.upload_model_btn = gr.Button(value="Upload Model", variant="primary") - self.upload_checkpoints_btn = gr.Button( - value="Upload Checkpoints", variant="primary" - ) - with gr.Row(): - self.fetch_mode_btn = gr.Button(value="Fetch Model", variant="primary") - self.fetch_checkpoints_btn = gr.Button( - value="Fetch Checkpoints", variant="primary" - ) + self.train_index_btn = gr.Button(value="Train index", variant="primary") + self.trained_index = gr.File(label="Trained index") - def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox): + def build(self, exp_dir: gr.Textbox): self.train_btn.click( fn=train_model, inputs=[exp_dir], - outputs=[self.result], - ) - - self.upload_model_btn.click( - fn=upload_model, - inputs=[exp_dir, self.repo, hf_token], + outputs=[self.latest_checkpoint], ) - self.upload_checkpoints_btn.click( - fn=upload_checkpoints, - inputs=[exp_dir, self.repo, hf_token], - ) - - self.fetch_mode_btn.click( - fn=fetch_model, - inputs=[exp_dir, self.repo, hf_token], - outputs=[exp_dir], - ) - - self.fetch_checkpoints_btn.click( - fn=fetch_checkpoints, - inputs=[exp_dir, self.repo, hf_token], - outputs=[exp_dir], + self.train_index_btn.click( + fn=train_index, + inputs=[exp_dir], + outputs=[self.trained_index], ) diff --git a/app/tutorial.py b/app/tutorial.py deleted file mode 100644 index e4f2e638bb31ecb98a33a5dae25fdae5e09e43aa..0000000000000000000000000000000000000000 --- a/app/tutorial.py +++ /dev/null @@ -1,30 +0,0 @@ -import gradio as gr - - -class TutotialTab: - def __init__(self): - pass - - def ui(self): - gr.Markdown( - """ - # Welcome to ZeroRVC! - - > If you are more satisfied with Python code, you can also [use the Python API to run ZeroRVC](https://pypi.org/project/zerorvc/). - - ZeroRVC is a toolkit for training and inference of retrieval-based voice conversion models. 
- - By leveraging the power of Hugging Face ZeroGPU, you can train your model in minutes without setting up the environment. - - ## How to Use - - There are 3 main steps to use ZeroRVC: - - - **Make Dataset**: Prepare your dataset for training. You can upload a zip file containing audio files. - - **Model Training**: Train your model using the prepared dataset. - - **Model Inference**: Try your model. - """ - ) - - def build(self): - pass diff --git a/assets/Synthesizer_inputs.pth b/assets/Synthesizer_inputs.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc5271bd5b479fde761282ed02f834c09c041125 --- /dev/null +++ b/assets/Synthesizer_inputs.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5ae8cd034b02bbc325939e9b9debbedb43ee9d71a654daaff8804815bd957d +size 122495 diff --git a/assets/hubert/hubert_base.pt b/assets/hubert/hubert_base.pt new file mode 100644 index 0000000000000000000000000000000000000000..72f47ab58564f01d5cc8b05c63bdf96d944551ff --- /dev/null +++ b/assets/hubert/hubert_base.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96 +size 189507909 diff --git a/assets/hubert/hubert_inputs.pth b/assets/hubert/hubert_inputs.pth new file mode 100644 index 0000000000000000000000000000000000000000..62d38b2978e7c17040568db76765d82c59ead3eb --- /dev/null +++ b/assets/hubert/hubert_inputs.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbd4741d4be8a71333170c0df5320f605a9d210b96547b391555da078167861f +size 169434 diff --git a/assets/indices/.gitignore b/assets/indices/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/assets/indices/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/pretrained/.gitignore b/assets/pretrained/.gitignore new file mode 100644 index 
0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/assets/pretrained/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/pretrained_v2/f0D40k.pth b/assets/pretrained_v2/f0D40k.pth new file mode 100644 index 0000000000000000000000000000000000000000..68e29fb9debf8994f68cb4f31cc4d81d360fb73b --- /dev/null +++ b/assets/pretrained_v2/f0D40k.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b6ab091e70801b28e3f41f335f2fc5f3f35c75b39ae2628d419644ec2b0fa09 +size 142875703 diff --git a/assets/pretrained_v2/f0G40k.pth b/assets/pretrained_v2/f0G40k.pth new file mode 100644 index 0000000000000000000000000000000000000000..3333a970abc908b4e8afd45c1cc2120ce1b4b0b4 --- /dev/null +++ b/assets/pretrained_v2/f0G40k.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2c44035e782c4b14ddc0bede9e2f4a724d025cd073f736d4f43708453adfcb +size 73106273 diff --git a/assets/rmvpe/rmvpe.pt b/assets/rmvpe/rmvpe.pt new file mode 100644 index 0000000000000000000000000000000000000000..6362f060846875c3b5d7012adea5f97e47305e7e --- /dev/null +++ b/assets/rmvpe/rmvpe.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d62215f4306e3ca278246188607209f09af3dc77ed4232efdd069798c4ec193 +size 181184272 diff --git a/assets/rmvpe/rmvpe_inputs.pth b/assets/rmvpe/rmvpe_inputs.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd26a61e38fd5190b85c98169886437ac0db851d --- /dev/null +++ b/assets/rmvpe/rmvpe_inputs.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:339fcb7e1476b302e9aecef4a951e918c20852b2e871de5eea13b06e554e0a3a +size 33527 diff --git a/assets/uvr5_weights/.gitignore b/assets/uvr5_weights/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/assets/uvr5_weights/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git 
a/assets/weights/.gitignore b/assets/weights/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/assets/weights/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c93ee2f655a3dafe069a86c220debef124c6f11a --- /dev/null +++ b/config.json @@ -0,0 +1,79 @@ +{ + "data": { + "filter_length": 2048, + "hop_length": 400, + "max_wav_value": 32768.0, + "mel_fmax": null, + "mel_fmin": 0.0, + "n_mel_channels": 125, + "sampling_rate": 40000, + "win_length": 2048 + }, + "model": { + "filter_channels": 768, + "gin_channels": 256, + "hidden_channels": 192, + "inter_channels": 192, + "kernel_size": 3, + "n_heads": 2, + "n_layers": 6, + "p_dropout": 0, + "resblock": "1", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "spk_embed_dim": 109, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_rates": [ + 10, + 10, + 2, + 2 + ], + "use_spectral_norm": false + }, + "train": { + "batch_size": 64, + "betas": [ + 0.8, + 0.99 + ], + "c_kl": 1.0, + "c_mel": 45, + "epochs": 20000, + "eps": 1e-09, + "fp16_run": false, + "init_lr_ratio": 1, + "learning_rate": 0.0001, + "log_interval": 200, + "lr_decay": 0.999875, + "seed": 1234, + "segment_size": 12800, + "warmup_epochs": 0 + } +} \ No newline at end of file diff --git a/configs/config.py b/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..66851033cdcb4857fdd89a9395b242b9ffd1eb7d --- /dev/null +++ b/configs/config.py @@ -0,0 +1,245 @@ +import argparse +import os +import sys +import json +import shutil +from multiprocessing import cpu_count + +import torch +import logging +from model import device, fp16 + +logger = logging.getLogger(__name__) + + +version_config_list = [ 
+ "v1/32k.json", + "v1/40k.json", + "v1/48k.json", + "v2/48k.json", + "v2/32k.json", +] + + +def singleton_variable(func): + def wrapper(*args, **kwargs): + if not wrapper.instance: + wrapper.instance = func(*args, **kwargs) + return wrapper.instance + + wrapper.instance = None + return wrapper + + +@singleton_variable +class Config: + def __init__(self): + self.device = str(device) + self.is_half = fp16 + self.use_jit = False + self.n_cpu = 0 + self.gpu_name = None + self.json_config = self.load_config_json() + self.gpu_mem = None + ( + self.python_cmd, + self.listen_port, + self.iscolab, + self.noparallel, + self.noautoopen, + self.dml, + ) = self.arg_parse() + self.instead = "" + self.preprocess_per = 3.7 + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + @staticmethod + def load_config_json() -> dict: + d = {} + # for config_file in version_config_list: + # p = f"configs/inuse/{config_file}" + # if not os.path.exists(p): + # shutil.copy(f"configs/{config_file}", p) + # with open(f"configs/inuse/{config_file}", "r") as f: + # d[config_file] = json.load(f) + return d + + @staticmethod + def arg_parse() -> tuple: + exe = sys.executable or "python" + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=7865, help="Listen port") + parser.add_argument("--pycmd", type=str, default=exe, help="Python command") + parser.add_argument("--colab", action="store_true", help="Launch in colab") + parser.add_argument( + "--noparallel", action="store_true", help="Disable parallel processing" + ) + parser.add_argument( + "--noautoopen", + action="store_true", + help="Do not open in browser automatically", + ) + parser.add_argument( + "--dml", + action="store_true", + help="torch_dml", + ) + cmd_opts = parser.parse_args() + + cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 + + return ( + cmd_opts.pycmd, + cmd_opts.port, + cmd_opts.colab, + cmd_opts.noparallel, + cmd_opts.noautoopen, + cmd_opts.dml, 
+ ) + + # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. + # check `getattr` and try it for compatibility + @staticmethod + def has_mps() -> bool: + if not torch.backends.mps.is_available(): + return False + try: + torch.zeros(1).to(torch.device("mps")) + return True + except Exception: + return False + + @staticmethod + def has_xpu() -> bool: + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return True + else: + return False + + def use_fp32_config(self): + for config_file in version_config_list: + self.json_config[config_file]["train"]["fp16_run"] = False + with open(f"configs/inuse/{config_file}", "r") as f: + strr = f.read().replace("true", "false") + with open(f"configs/inuse/{config_file}", "w") as f: + f.write(strr) + logger.info("overwrite " + config_file) + self.preprocess_per = 3.0 + logger.info("overwrite preprocess_per to %d" % (self.preprocess_per)) + + def device_config(self) -> tuple: + if torch.cuda.is_available(): + if self.has_xpu(): + self.device = self.instead = "xpu:0" + self.is_half = True + i_device = int(0) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "P10" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + logger.info("Found GPU %s, force to fp32", self.gpu_name) + self.is_half = False + self.use_fp32_config() + else: + logger.info("Found GPU %s", self.gpu_name) + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + if self.gpu_mem <= 4: + self.preprocess_per = 3.0 + elif self.has_mps(): + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "mps" + self.is_half = False + self.use_fp32_config() + else: + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "cpu" + self.is_half = False + 
self.use_fp32_config() + + if self.n_cpu == 0: + self.n_cpu = cpu_count() + + if self.is_half: + # 6G显存配置 + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 + else: + # 5G显存配置 + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem is not None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + if self.dml: + logger.info("Use DirectML instead") + if ( + os.path.exists( + "runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll" + ) + == False + ): + try: + os.rename( + "runtime\Lib\site-packages\onnxruntime", + "runtime\Lib\site-packages\onnxruntime-cuda", + ) + except: + pass + try: + os.rename( + "runtime\Lib\site-packages\onnxruntime-dml", + "runtime\Lib\site-packages\onnxruntime", + ) + except: + pass + # if self.device != "cpu": + import torch_directml + + self.device = torch_directml.device(torch_directml.default_device()) + self.is_half = False + else: + if self.instead: + logger.info(f"Use {self.instead} instead") + if ( + os.path.exists( + "runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" + ) + == False + ): + try: + os.rename( + "runtime\Lib\site-packages\onnxruntime", + "runtime\Lib\site-packages\onnxruntime-dml", + ) + except: + pass + try: + os.rename( + "runtime\Lib\site-packages\onnxruntime-cuda", + "runtime\Lib\site-packages\onnxruntime", + ) + except: + pass + logger.info( + "Half-precision floating-point: %s, device: %s" + % (self.is_half, self.device) + ) + return x_pad, x_query, x_center, x_max diff --git a/example-dataset.py b/example-dataset.py deleted file mode 100644 index acd0083de930fcca2a50ca1f0af64c1b53598df1..0000000000000000000000000000000000000000 --- a/example-dataset.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -from zerorvc import prepare - -HF_TOKEN = os.environ.get("HF_TOKEN") - -dataset = prepare("./my-voices") -print(dataset) - -dataset.push_to_hub("my-rvc-dataset", token=HF_TOKEN, private=True) diff --git a/example-infer.py b/example-infer.py deleted 
file mode 100644 index e4584a3c74a7c49ea2ce38c646db76135cbdf931..0000000000000000000000000000000000000000 --- a/example-infer.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -from zerorvc import RVC -import soundfile as sf - -HF_TOKEN = os.environ.get("HF_TOKEN") -MODEL = "JacobLinCool/my-rvc-model3" - -rvc = RVC.from_pretrained(MODEL, token=HF_TOKEN) -samples = rvc.convert("test.mp3") -sf.write("output.wav", samples, rvc.sr) - -pitch_modifications = [-12, -8, -4, 4, 8, 12] -for pitch_modification in pitch_modifications: - samples = rvc.convert("test.mp3", pitch_modification=pitch_modification) - sf.write(f"output-{pitch_modification}.wav", samples, rvc.sr) diff --git a/example-train.py b/example-train.py deleted file mode 100644 index b58e97596aac9e5eac4bb90885bbe1982228071a..0000000000000000000000000000000000000000 --- a/example-train.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -from datasets import load_dataset -from tqdm import tqdm -from zerorvc import RVCTrainer, pretrained_checkpoints - -HF_TOKEN = os.environ.get("HF_TOKEN") -EPOCHS = 100 -BATCH_SIZE = 8 -DATASET = "JacobLinCool/my-rvc-dataset" -MODEL = "JacobLinCool/my-rvc-model" - -dataset = load_dataset(DATASET, token=HF_TOKEN) -print(dataset) - -trainer = RVCTrainer(checkpoint_dir="./checkpoints") -training = tqdm( - trainer.train( - dataset=dataset["train"], - resume_from=pretrained_checkpoints(), # resume training from the pretrained VCTK checkpoint - epochs=EPOCHS, - batch_size=BATCH_SIZE, - ), - total=EPOCHS, -) - -# Training loop: iterate over epochs -for checkpoint in training: - training.set_description( - f"Epoch {checkpoint.epoch}/{EPOCHS} loss: (gen: {checkpoint.loss_gen:.4f}, fm: {checkpoint.loss_fm:.4f}, mel: {checkpoint.loss_mel:.4f}, kl: {checkpoint.loss_kl:.4f}, disc: {checkpoint.loss_disc:.4f})" - ) - - # Save checkpoint every 10 epochs - if checkpoint.epoch % 10 == 0: - checkpoint.save(checkpoint_dir=trainer.checkpoint_dir) - # Directly push the synthesizer to the Hugging Face Hub - 
checkpoint.G.push_to_hub(MODEL, token=HF_TOKEN, private=True) - -print("Training completed.") diff --git a/headers.yaml b/headers.yaml deleted file mode 100644 index 6bc455f032de9d816533598ffe92b73b438f77d2..0000000000000000000000000000000000000000 --- a/headers.yaml +++ /dev/null @@ -1,8 +0,0 @@ -title: ZeroRVC -emoji: 🎙️ -colorFrom: gray -colorTo: gray -sdk: gradio -sdk_version: 4.37.2 -app_file: app.py -pinned: false diff --git a/i18n/i18n.py b/i18n/i18n.py new file mode 100644 index 0000000000000000000000000000000000000000..00e91bf3fddc0658586eab33868ca824ef425688 --- /dev/null +++ b/i18n/i18n.py @@ -0,0 +1,27 @@ +import json +import locale +import os + + +def load_language_list(language): + with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: + language_list = json.load(f) + return language_list + + +class I18nAuto: + def __init__(self, language=None): + if language in ["Auto", None]: + language = locale.getdefaultlocale()[ + 0 + ] # getlocale can't identify the system's language ((None, None)) + if not os.path.exists(f"./i18n/locale/{language}.json"): + language = "en_US" + self.language = language + self.language_map = load_language_list(language) + + def __call__(self, key): + return self.language_map.get(key, key) + + def __repr__(self): + return "Use Language: " + self.language diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json new file mode 100644 index 0000000000000000000000000000000000000000..f208d8dfe2f110063d958e5f17d8c1aea1e4341d --- /dev/null +++ b/i18n/locale/en_US.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.", + "A模型权重": "Weight (w) for Model A:", + "A模型路径": "Path to Model A:", + "B模型路径": "Path to Model B:", + "E:\\语音音频+标注\\米津玄师\\src": "C:\\Users\\Desktop\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 curve file (optional). One pitch per line. 
Replaces the default F0 and pitch modulation:", + "Index Rate": "Index Rate", + "Onnx导出": "Export Onnx", + "Onnx输出路径": "Onnx Export Path:", + "RVC模型路径": "RVC Model Path:", + "ckpt处理": "ckpt Processing", + "harvest进程数": "Number of CPU processes used for harvest pitch algorithm", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Enter the GPU index(es) separated by '-', e.g., 0-0-1 to use 2 processes in GPU0 and 1 process in GPU1", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.", + "step1:正在处理数据": "Step 1: Processing data", + "step2:正在提取音高&正在提取特征": "step2:Pitch extraction & feature extraction", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):", + "step3: 填写训练设置, 开始训练模型和索引": "Step 3: Fill in the training settings and start training the model and index", + "step3a:正在训练模型": "Step 3a: Model training started", + "一键训练": "One-click training", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be imported. If a folder path exists, this input is ignored.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch processing for vocal accompaniment separation using the UVR5 model.
Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).
The model is divided into three categories:
1. Preserve vocals: Choose this option for audio without harmonies. It preserves vocals better than HP5. It includes two built-in models: HP2 and HP3. HP3 may slightly leak accompaniment but preserves vocals slightly better than HP2.
2. Preserve main vocals only: Choose this option for audio with harmonies. It may weaken the main vocals. It includes one built-in model: HP5.
3. De-reverb and de-delay models (by FoxJoy):
  (1) MDX-Net: The best choice for stereo reverb removal but cannot remove mono reverb;
 (234) DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverb and can remove mono reverb, but not very effectively for heavily reverberated high-frequency content.
De-reverb/de-delay notes:
1. The processing time for the DeEcho-DeReverb model is approximately twice as long as the other two DeEcho models.
2. The MDX-Net-Dereverb model is quite slow.
3. The recommended cleanest configuration is to apply MDX-Net first and then DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Enter the GPU index(es) separated by '-', e.g., 0-1-2 to use GPU 0, 1, and 2:", + "伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Save name:", + "保存的文件名, 默认空为和源文件同名": "Save file name (default: same as the source file):", + "保存的模型名不带后缀": "Saved model name (without extension):", + "保存频率save_every_epoch": "Save frequency (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy:", + "修改": "Modify", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modify model information (only supported for small model files extracted from the 'weights' folder)", + "停止音频转换": "Stop audio conversion", + "全流程结束!": "All processes have been completed!", + "刷新音色列表和索引路径": "Refresh voice list and index path", + "加载模型": "Load model", + "加载预训练底模D路径": "Load pre-trained base model D path:", + "加载预训练底模G路径": "Load pre-trained base model G path:", + "单次推理": "Single Inference", + "卸载音色省显存": "Unload voice to save GPU memory:", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. 
Set to 0 for no resampling:", + "否": "No", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Response threshold", + "响度因子": "loudness factor", + "处理数据": "Process data", + "导出Onnx模型": "Export Onnx Model", + "导出文件格式": "Export file format", + "常见问题解答": "FAQ (Frequently Asked Questions)", + "常规设置": "General settings", + "开始音频转换": "Start audio conversion", + "很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.", + "性能设置": "Performance settings", + "总训练轮数total_epoch": "Total training epochs (total_epoch):", + "批量推理": "Batch Inference", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').", + "指定输出主人声文件夹": "Specify the output folder for vocals:", + "指定输出文件夹": "Specify output folder:", + "指定输出非主人声文件夹": "Specify the output folder for accompaniment:", + "推理时间(ms):": "Inference time (ms):", + "推理音色": "Inferencing voice:", + "提取": "Extract", + "提取音高和处理数据使用的CPU进程数": "Number of CPU processes used for pitch extraction and data processing:", + "是": "Yes", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Save only the latest '.ckpt' file to save disk space:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Cache all training sets to GPU memory. Caching small datasets (less than 10 minutes) can speed up training, but caching large datasets will consume a lot of GPU memory and may not provide much speed improvement:", + "显卡信息": "GPU Information", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible.
If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory Agreement-LICENSE.txt for details.", + "查看": "View", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "View model information (only supported for small model files extracted from the 'weights' folder)", + "检索特征占比": "Search feature ratio (controls accent strength, too high has artifacting):", + "模型": "Model", + "模型推理": "Model Inference", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model extraction (enter the path of the large file model under the 'logs' folder). This is useful if you want to stop training halfway and manually extract and save a small model file, or if you want to test an intermediate model:", + "模型是否带音高指导": "Whether the model has pitch guidance:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Whether the model has pitch guidance (required for singing, optional for speech):", + "模型是否带音高指导,1是0否": "Whether the model has pitch guidance (1: yes, 0: no):", + "模型版本型号": "Model architecture version:", + "模型融合, 可用于测试音色融合": "Model fusion, can be used to test timbre fusion", + "模型路径": "Path to Model:", + "每张显卡的batch_size": "Batch size per GPU:", + "淡入淡出长度": "Fade length", + "版本": "Version", + "特征提取": "Feature extraction", + "特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. 
If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.", + "目标采样率": "Target sample rate:", + "算法延迟(ms):": "Algorithmic delays(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:", + "融合": "Fusion", + "要改的模型信息": "Model information to be modified:", + "要置入的模型信息": "Model information to be placed:", + "训练": "Train", + "训练模型": "Train model", + "训练特征索引": "Train feature index", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.", + "设备类型": "设备类型", + "请指定说话人id": "Please specify the speaker/singer ID:", + "请选择index文件": "Please choose the .index file", + "请选择pth文件": "Please choose the .pth file", + "请选择说话人id": "Select Speaker/Singer ID:", + "转换": "Convert", + "输入实验名": "Enter the experiment name:", + "输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):", + "输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimics the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. 
Closer to 1 will be more of a consistently loud volume:", + "输入监听": "Input voice monitor", + "输入训练文件夹路径": "Enter the path of the training folder:", + "输入设备": "Input device", + "输入降噪": "Input noise reduction", + "输出信息": "Output information", + "输出变声": "Output converted voice", + "输出设备": "Output device", + "输出降噪": "Output noise reduction", + "输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)", + "选择.index文件": "Select the .index file", + "选择.pth文件": "Select the .pth file", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Select the pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive), 'rmvpe': best quality, and little GPU requirement", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Select the pitch extraction algorithm: when extracting singing, you can use 'pm' to speed up. For high-quality speech with fast performance, but worse CPU usage, you can use 'dio'. 'harvest' results in better quality but is slower. 
'rmvpe' has the best results and consumes less CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Sample length", + "重载设备列表": "Reload device list", + "音调设置": "Pitch settings", + "音频设备": "Audio device", + "音高算法": "pitch detection algorithm", + "额外推理时长": "Extra inference time" +} diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json new file mode 100644 index 0000000000000000000000000000000000000000..73b2e86b1578e1a4012491e0f18587ba5b6665e3 --- /dev/null +++ b/i18n/locale/es_ES.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si es >=3, entonces use el resultado del reconocimiento de tono de 'harvest' con filtro de mediana, el valor es el radio del filtro, su uso puede debilitar el sonido sordo", + "A模型权重": "Un peso modelo para el modelo A.", + "A模型路径": "Modelo A ruta.", + "B模型路径": "Modelo B ruta.", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Archivo de curva F0, opcional, un tono por línea, en lugar de F0 predeterminado y cambio de tono", + "Index Rate": "Tasa de índice", + "Onnx导出": "Exportar Onnx", + "Onnx输出路径": "Ruta de salida Onnx", + "RVC模型路径": "Ruta del modelo RVC", + "ckpt处理": "Procesamiento de recibos", + "harvest进程数": "Número de procesos", + "index文件路径不可包含中文": "La ruta del archivo .index no debe contener caracteres chinos.", + "pth文件路径不可包含中文": "La ruta del archivo .pth no debe contener caracteres chinos.", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. 
La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.", + "step1:正在处理数据": "Paso 1: Procesando datos", + "step2:正在提取音高&正在提取特征": "Paso 2: Extracción del tono y extracción de características", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Paso 2a: Recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. Actualmente, solo se admite la capacitación de una sola persona.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Paso 2b: Use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).", + "step3: 填写训练设置, 开始训练模型和索引": "Paso 3: Complete la configuración de entrenamiento y comience a entrenar el modelo y el índice.", + "step3a:正在训练模型": "Paso 3a: Entrenando el modelo", + "一键训练": "Entrenamiento con un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden importar varios archivos de audio. Si existe una ruta de carpeta, esta entrada se ignora.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.
Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).
El modelo se divide en tres categorías:
1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.
2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.
3. Modelos de des-reverberación y des-retardo (por FoxJoy):
  (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;
 (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.
Notas de des-reverberación/des-retardo:
1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.
2. El modelo MDX-Net-Dereverb es bastante lento.
3. La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Separe los números de identificación de la GPU con '-' al ingresarlos. Por ejemplo, '0-1-2' significa usar GPU 0, GPU 1 y GPU 2.", + "伴奏人声分离&去混响&去回声": "Separación de voz acompañante & eliminación de reverberación & eco", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Guardar nombre", + "保存的文件名, 默认空为和源文件同名": "Nombre del archivo que se guardará, el valor predeterminado es el mismo que el nombre del archivo de origen", + "保存的模型名不带后缀": "Nombre del modelo guardado sin extensión.", + "保存频率save_every_epoch": "Frecuencia de guardado (save_every_epoch)", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteger las consonantes claras y la respiración, prevenir artefactos como la distorsión de sonido electrónico, 0.5 no está activado, reducir aumentará la protección pero puede reducir el efecto del índice", + "修改": "Modificar", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar la información del modelo (solo admite archivos de modelos pequeños extraídos en la carpeta weights)", + "停止音频转换": "Detener la conversión de audio", + "全流程结束!": "¡Todo el proceso ha terminado!", + "刷新音色列表和索引路径": "Actualizar la lista de modelos e índice de rutas", + "加载模型": "Cargar modelo", + "加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.", + "加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.", + "单次推理": "单次推理", + "卸载音色省显存": "Descargue la voz para ahorrar memoria GPU", + "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", + "后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear", + "否": "No", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Umbral de respuesta", + "响度因子": "factor de sonoridad", + "处理数据": "Procesar datos", + "导出Onnx模型": "Exportar modelo Onnx", + "导出文件格式": "Formato de archivo de exportación", + 
"常见问题解答": "Preguntas frecuentes", + "常规设置": "Configuración general", + "开始音频转换": "Iniciar conversión de audio", + "很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento", + "性能设置": "Configuración de rendimiento", + "总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).", + "指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal", + "指定输出文件夹": "Especificar carpeta de salida", + "指定输出非主人声文件夹": "Especifique la carpeta de salida para las voces no principales", + "推理时间(ms):": "Inferir tiempo (ms):", + "推理音色": "inferencia de voz", + "提取": "Extraer", + "提取音高和处理数据使用的CPU进程数": "Número de procesos de CPU utilizados para extraer el tono y procesar los datos", + "是": "Sí", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Guardar solo el archivo ckpt más reciente para ahorrar espacio en disco", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Guardar pequeño modelo final en la carpeta 'weights' en cada punto de guardado", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Si almacenar en caché todos los conjuntos de entrenamiento en la memoria de la GPU. Los conjuntos de datos pequeños (menos de 10 minutos) se pueden almacenar en caché para acelerar el entrenamiento, pero el almacenamiento en caché de conjuntos de datos grandes puede causar errores de memoria en la GPU y no aumenta la velocidad de manera significativa.", + "显卡信息": "información de la GPU", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software es de código abierto bajo la licencia MIT, el autor no tiene ningún control sobre el software, y aquellos que usan el software y difunden los sonidos exportados por el software son los únicos responsables.
Si no está de acuerdo con esta cláusula, no puede utilizar ni citar ningún código ni archivo del paquete de software. Consulte el directorio raíz Agreement-LICENSE.txt para obtener más información.", + "查看": "Ver", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Ver información del modelo (solo aplicable a archivos de modelos pequeños extraídos de la carpeta 'pesos')", + "检索特征占比": "Proporción de función de búsqueda", + "模型": "Modelo", + "模型推理": "inferencia del modelo", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extracción de modelo (ingrese la ruta de un archivo de modelo grande en la carpeta 'logs'), aplicable cuando desea extraer un archivo de modelo pequeño después de entrenar a mitad de camino y no se guardó automáticamente, o cuando desea probar un modelo intermedio", + "模型是否带音高指导": "Si el modelo tiene guía de tono.", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Si el modelo tiene guía de tono (necesaria para cantar, pero no para hablar)", + "模型是否带音高指导,1是0否": "Si el modelo tiene guía de tono, 1 para sí, 0 para no", + "模型版本型号": "Versión y modelo del modelo", + "模型融合, 可用于测试音色融合": "Fusión de modelos, se puede utilizar para fusionar diferentes voces", + "模型路径": "Ruta del modelo", + "每张显卡的batch_size": "Tamaño del lote (batch_size) por tarjeta gráfica", + "淡入淡出长度": "Duración del fundido de entrada/salida", + "版本": "Versión", + "特征提取": "Extracción de características", + "特征检索库文件路径,为空则使用下拉的选择结果": "Ruta del archivo de la biblioteca de características, si está vacío, se utilizará el resultado de la selección desplegable", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tecla +12 recomendada para conversión de voz de hombre a mujer, tecla -12 para conversión de voz de mujer a hombre. 
Si el rango de tono es demasiado amplio y causa distorsión, ajústelo usted mismo a un rango adecuado.", + "目标采样率": "Tasa de muestreo objetivo", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Detección automática de la ruta del índice, selección desplegable (dropdown)", + "融合": "Fusión", + "要改的模型信息": "Información del modelo a modificar", + "要置入的模型信息": "Información del modelo a colocar.", + "训练": "Entrenamiento", + "训练模型": "Entrenar Modelo", + "训练特征索引": "Índice de características", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entrenamiento finalizado, puede ver el registro de entrenamiento en la consola o en el archivo train.log en la carpeta del experimento", + "设备类型": "设备类型", + "请指定说话人id": "ID del modelo", + "请选择index文件": "Seleccione el archivo .index", + "请选择pth文件": "Seleccione el archivo .pth", + "请选择说话人id": "Seleccione una identificación de altavoz", + "转换": "Conversión", + "输入实验名": "Ingrese el nombre del modelo", + "输入待处理音频文件夹路径": "Ingrese la ruta a la carpeta de audio que se procesará", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Ingrese la ruta a la carpeta de audio que se procesará (simplemente cópiela desde la barra de direcciones del administrador de archivos)", + "输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Proporción de fusión para reemplazar el sobre de volumen de entrada con el sobre de volumen de salida, cuanto más cerca de 1, más se utiliza el sobre de salida", + "输入监听": "输入监听", + "输入训练文件夹路径": "Introduzca la ruta de la carpeta de entrenamiento", + "输入设备": "Dispositivo de entrada", + "输入降噪": "Reducción de ruido de entrada", + "输出信息": "Información de salida", + "输出变声": "输出变声", + "输出设备": "Dispositivo de salida", + "输出降噪": "Reducción de ruido de salida", + "输出音频(右下角三个点,点了可以下载)": "Salida de audio (haga clic en los tres puntos en la esquina inferior derecha para descargar)", + "选择.index文件": "Seleccione el archivo .index", + 
"选择.pth文件": "Seleccione el archivo .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleccione el algoritmo de extracción de tono, las voces de entrada se pueden acelerar con pm, harvest tiene buenos graves pero es muy lento, crepe es bueno pero se come las GPUs", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleccione el algoritmo de extracción de tono, use 'pm' para acelerar la entrada de canto, 'harvest' es bueno para los graves pero extremadamente lento, 'crepe' tiene buenos resultados pero consume GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Seleccione el algoritmo de extracción de tono: la canción de entrada se puede acelerar con pm, la voz de alta calidad pero CPU pobre se puede acelerar con dio, harvest es mejor pero más lento, rmvpe es el mejor y se come ligeramente la CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Longitud de muestreo", + "重载设备列表": "Actualizar lista de dispositivos", + "音调设置": "Ajuste de tono", + "音频设备": "Dispositivo de audio", + "音高算法": "Algoritmo de tono", + "额外推理时长": "Tiempo de inferencia adicional" +} diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf5b3565daab2322e676d43bef1cfc90d791496 --- /dev/null +++ b/i18n/locale/fr_FR.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Si >=3 : appliquer un filtrage médian aux résultats de la reconnaissance de la hauteur de récolte. La valeur représente le rayon du filtre et peut réduire la respiration.", + "A模型权重": "Poids (w) pour le modèle A :", + "A模型路径": "Chemin d'accès au modèle A :", + "B模型路径": "Chemin d'accès au modèle B :", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Fichier de courbe F0 (facultatif). Une hauteur par ligne. 
Remplace la fréquence fondamentale par défaut et la modulation de la hauteur :", + "Index Rate": "Taux d'indexation", + "Onnx导出": "Exporter en ONNX", + "Onnx输出路径": "Chemin d'exportation ONNX :", + "RVC模型路径": "Chemin du modèle RVC :", + "ckpt处理": "Traitement des fichiers .ckpt", + "harvest进程数": "Nombre de processus CPU utilisés pour l'algorithme de reconnaissance de la hauteur (pitch) dans le cadre de la récolte (harvest).", + "index文件路径不可包含中文": "Le chemin du fichier d'index ne doit pas contenir de caractères chinois.", + "pth文件路径不可包含中文": "Le chemin du fichier .pth ne doit pas contenir de caractères chinois.", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Configuration des numéros de carte RMVPE : séparez les index GPU par des tirets \"-\", par exemple, 0-0-1 pour utiliser 2 processus sur GPU0 et 1 processus sur GPU1.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Étape 1 : Remplissez la configuration expérimentale. Les données expérimentales sont stockées dans le dossier 'logs', avec chaque expérience ayant un dossier distinct. Entrez manuellement le chemin du nom de l'expérience, qui contient la configuration expérimentale, les journaux et les fichiers de modèle entraînés.", + "step1:正在处理数据": "Étape 1 : Traitement des données en cours.", + "step2:正在提取音高&正在提取特征": "Étape 2 : Extraction de la hauteur et extraction des caractéristiques en cours.", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Étape 2a : Parcours automatique de tous les fichiers du dossier d'entraînement qui peuvent être décodés en fichiers audio et réalisation d'une normalisation par tranches. Génère 2 dossiers wav dans le répertoire de l'expérience. 
Actuellement, seule la formation avec un seul chanteur/locuteur est prise en charge.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Étape 2b : Utilisez le CPU pour extraire la hauteur (si le modèle le permet), utilisez le GPU pour extraire les caractéristiques (sélectionnez l'index du GPU) :", + "step3: 填写训练设置, 开始训练模型和索引": "Étape 3 : Remplissez les paramètres d'entraînement et démarrez l'entraînement du modèle ainsi que l'indexation.", + "step3a:正在训练模型": "Étape 3a : L'entraînement du modèle a commencé.", + "一键训练": "Entraînement en un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Il est également possible d'importer plusieurs fichiers audio. Si un chemin de dossier existe, cette entrée est ignorée.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Traitement en lot pour la séparation de la voix et de l'accompagnement vocal à l'aide du modèle UVR5.
Exemple d'un format de chemin de dossier valide : D:\\chemin\\vers\\dossier\\d'entrée (copiez-le depuis la barre d'adresse du gestionnaire de fichiers).
Le modèle est divisé en trois catégories :
1. Préserver la voix : Choisissez cette option pour l'audio sans harmonies. Elle préserve la voix mieux que HP5. Elle comprend deux modèles intégrés : HP2 et HP3. HP3 peut légèrement laisser passer l'accompagnement mais préserve légèrement mieux la voix que HP2.
2. Préserver uniquement la voix principale : Choisissez cette option pour l'audio avec harmonies. Cela peut affaiblir la voix principale. Elle comprend un modèle intégré : HP5.
3. Modèles de suppression de la réverbération et du délai (par FoxJoy) :
  (1) MDX-Net : Le meilleur choix pour la suppression de la réverbération stéréo, mais ne peut pas supprimer la réverbération mono.
  (234) DeEcho : Supprime les effets de délai. Le mode Aggressive supprime plus efficacement que le mode Normal. DeReverb supprime également la réverbération et peut supprimer la réverbération mono, mais pas très efficacement pour les contenus à haute fréquence fortement réverbérés.
Notes sur la suppression de la réverbération et du délai :
1. Le temps de traitement pour le modèle DeEcho-DeReverb est environ deux fois plus long que pour les deux autres modèles DeEcho.
2. Le modèle MDX-Net-Dereverb est assez lent.
3. La configuration la plus propre recommandée est d'appliquer d'abord MDX-Net, puis DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Entrez le(s) index GPU séparé(s) par '-', par exemple, 0-1-2 pour utiliser les GPU 0, 1 et 2 :", + "伴奏人声分离&去混响&去回声": "Séparation des voix/accompagnement et suppression de la réverbération", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Nom de sauvegarde :", + "保存的文件名, 默认空为和源文件同名": "Nom du fichier de sauvegarde (par défaut : identique au nom du fichier source) :", + "保存的模型名不带后缀": "Nom du modèle enregistré (sans extension) :", + "保存频率save_every_epoch": "Fréquence de sauvegarde (save_every_epoch) :", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Protéger les consonnes sourdes et les bruits de respiration pour éviter les artefacts tels que le déchirement dans la musique électronique. Réglez à 0,5 pour désactiver. Diminuez la valeur pour renforcer la protection, mais cela peut réduire la précision de l'indexation :", + "修改": "Modifier", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifier les informations du modèle (uniquement pris en charge pour les petits fichiers de modèle extraits du dossier 'weights')", + "停止音频转换": "Arrêter la conversion audio", + "全流程结束!": "Toutes les étapes ont été terminées !", + "刷新音色列表和索引路径": "Actualiser la liste des voix et le vers l'index.", + "加载模型": "Charger le modèle.", + "加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :", + "加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :", + "单次推理": "单次推理", + "卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :", + "后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. 
Réglez sur 0 pour ne pas effectuer de rééchantillonnage :", + "否": "Non", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Seuil de réponse", + "响度因子": "Facteur de volume sonore", + "处理数据": "Traitement des données", + "导出Onnx模型": "Exporter le modèle au format ONNX.", + "导出文件格式": "Format de fichier d'exportation", + "常见问题解答": "FAQ (Foire Aux Questions)", + "常规设置": "Paramètres généraux", + "开始音频转换": "Démarrer la conversion audio.", + "很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.", + "性能设置": "Paramètres de performance", + "总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').", + "指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :", + "指定输出文件夹": "Spécifiez le dossier de sortie :", + "指定输出非主人声文件夹": "Spécifiez le dossier de sortie pour l'accompagnement :", + "推理时间(ms):": "Temps d'inférence (ms) :", + "推理音色": "Voix pour l'inférence", + "提取": "Extraire", + "提取音高和处理数据使用的CPU进程数": "Nombre de processus CPU utilisés pour l'extraction de la hauteur et le traitement des données :", + "是": "Oui", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Enregistrer uniquement le dernier fichier '.ckpt' pour économiser de l'espace disque :", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Enregistrer un petit modèle final dans le dossier 'weights' à chaque point de sauvegarde :", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Mettre en cache tous les ensembles d'entrainement dans la mémoire GPU. 
Mettre en cache de petits ensembles de données (moins de 10 minutes) peut accélérer l'entrainement, mais mettre en cache de grands ensembles de données consommera beaucoup de mémoire GPU et peut ne pas apporter beaucoup d'amélioration de vitesse :", + "显卡信息": "Informations sur la carte graphique (GPU)", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Ce logiciel est open source sous la licence MIT. L'auteur n'a aucun contrôle sur le logiciel. Les utilisateurs qui utilisent le logiciel et distribuent les sons exportés par le logiciel en sont entièrement responsables.
Si vous n'acceptez pas cette clause, vous ne pouvez pas utiliser ou faire référence à aucun code ni fichier contenu dans le package logiciel. Consultez le fichier Agreement-LICENSE.txt dans le répertoire racine pour plus de détails.", + "查看": "Voir", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Afficher les informations sur le modèle (uniquement pour les petits fichiers de modèle extraits du dossier \"weights\")", + "检索特征占比": "Rapport de recherche de caractéristiques (contrôle l'intensité de l'accent, un rapport trop élevé provoque des artefacts) :", + "模型": "Modèle", + "模型推理": "Inférence du modèle", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extraction du modèle (saisissez le chemin d'accès au modèle du grand fichier dans le dossier \"logs\"). Cette fonction est utile si vous souhaitez arrêter l'entrainement à mi-chemin et extraire et enregistrer manuellement un petit fichier de modèle, ou si vous souhaitez tester un modèle intermédiaire :", + "模型是否带音高指导": "Indique si le modèle dispose d'un guidage en hauteur :", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Indique si le modèle dispose d'un système de guidage de la hauteur (obligatoire pour le chant, facultatif pour la parole) :", + "模型是否带音高指导,1是0否": "Le modèle dispose-t-il d'un guide de hauteur (1 : oui, 0 : non) ?", + "模型版本型号": "Version de l'architecture du modèle :", + "模型融合, 可用于测试音色融合": "Fusion de modèles, peut être utilisée pour tester la fusion de timbres", + "模型路径": "Le chemin vers le modèle :", + "每张显卡的batch_size": "Taille du batch par GPU :", + "淡入淡出长度": "Longueur de la transition", + "版本": "Version", + "特征提取": "Extraction des caractéristiques", + "特征检索库文件路径,为空则使用下拉的选择结果": "Chemin d'accès au fichier d'index des caractéristiques. Laisser vide pour utiliser le résultat sélectionné dans la liste déroulante :", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "Il est recommandé d'utiliser la clé +12 pour la conversion homme-femme et la clé -12 pour la conversion femme-homme. Si la plage sonore est trop large et que la voix est déformée, vous pouvez également l'ajuster vous-même à la plage appropriée.", + "目标采样率": "Taux d'échantillonnage cible :", + "算法延迟(ms):": "Délais algorithmiques (ms):", + "自动检测index路径,下拉式选择(dropdown)": "Détecter automatiquement le chemin d'accès à l'index et le sélectionner dans la liste déroulante :", + "融合": "Fusion", + "要改的模型信息": "Informations sur le modèle à modifier :", + "要置入的模型信息": "Informations sur le modèle à placer :", + "训练": "Entraîner", + "训练模型": "Entraîner le modèle", + "训练特征索引": "Entraîner l'index des caractéristiques", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entraînement terminé. Vous pouvez consulter les rapports d'entraînement dans la console ou dans le fichier 'train.log' situé dans le dossier de l'expérience.", + "设备类型": "设备类型", + "请指定说话人id": "Veuillez spécifier l'ID de l'orateur ou du chanteur :", + "请选择index文件": "Veuillez sélectionner le fichier d'index", + "请选择pth文件": "Veuillez sélectionner le fichier pth", + "请选择说话人id": "Sélectionner l'ID de l'orateur ou du chanteur :", + "转换": "Convertir", + "输入实验名": "Saisissez le nom de l'expérience :", + "输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter :", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Entrez le chemin du dossier audio à traiter (copiez-le depuis la barre d'adresse du gestionnaire de fichiers) :", + "输入待处理音频文件路径(默认是正确格式示例)": "Entrez le chemin d'accès du fichier audio à traiter (par défaut, l'exemple de format correct) :", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Ajustez l'échelle de l'enveloppe de volume. Plus il est proche de 0, plus il imite le volume des voix originales. Cela peut aider à masquer les bruits et à rendre le volume plus naturel lorsqu'il est réglé relativement bas. 
Plus le volume est proche de 1, plus le volume sera fort et constant :", + "输入监听": "Moniteur vocal d'entrée", + "输入训练文件夹路径": "Indiquez le chemin d'accès au dossier d'entraînement :", + "输入设备": "Dispositif d'entrée", + "输入降噪": "Réduction du bruit d'entrée", + "输出信息": "Informations sur la sortie", + "输出变声": "Sortie voix convertie", + "输出设备": "Dispositif de sortie", + "输出降噪": "Réduction du bruit de sortie", + "输出音频(右下角三个点,点了可以下载)": "Exporter l'audio (cliquer sur les trois points dans le coin inférieur droit pour télécharger)", + "选择.index文件": "Sélectionner le fichier .index", + "选择.pth文件": "Sélectionner le fichier .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Sélection de l'algorithme d'extraction de la hauteur, les voix d'entrée peuvent être accélérées avec pm, harvest a de bonnes basses mais est très lent, crepe est bon mais consomme beaucoup de ressources GPU.", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Sélectionnez l'algorithme d'extraction de la hauteur de ton (\"pm\" : extraction plus rapide mais parole de moindre qualité ; \"harvest\" : meilleure basse mais extrêmement lente ; \"crepe\" : meilleure qualité mais utilisation intensive du GPU), \"rmvpe\" : meilleure qualité et peu d'utilisation du GPU.", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Sélection de l'algorithme d'extraction de la hauteur : la chanson d'entrée peut être traitée plus rapidement par pm, avec une voix de haute qualité mais un CPU médiocre, par dio, harvest est meilleur mais plus lent, rmvpe est le meilleur, mais consomme légèrement le CPU/GPU.", + "采样率:": "采样率:", + "采样长度": "Longueur de l'échantillon", + "重载设备列表": "Recharger la liste des dispositifs", + "音调设置": "Réglages de la hauteur", + "音频设备": "Périphérique audio", + "音高算法": "algorithme de détection de la hauteur", + "额外推理时长": "Temps d'inférence supplémentaire" +} diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json new file mode 100644 index 
0000000000000000000000000000000000000000..c6aa02df0b60313dce6fa5a045932dcc46bafd57 --- /dev/null +++ b/i18n/locale/it_IT.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Se >=3: applica il filtro mediano ai risultati del pitch raccolto. ", + "A模型权重": "Peso (w) per il modello A:", + "A模型路径": "Percorso per il modello A:", + "B模型路径": "Percorso per il modello B:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "File curva F0 (opzionale). ", + "Index Rate": "Tasso di indice", + "Onnx导出": "Esporta Onnx", + "Onnx输出路径": "Percorso di esportazione Onnx:", + "RVC模型路径": "Percorso modello RVC:", + "ckpt处理": "Elaborazione ckpt", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "Il percorso del file pth non può contenere caratteri cinesi", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ", + "step1:正在处理数据": "Passaggio 1: elaborazione dei dati", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. 
", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):", + "step3: 填写训练设置, 开始训练模型和索引": "Passaggio 3: compilare le impostazioni di addestramento e avviare l'addestramento del modello e dell'indice", + "step3a:正在训练模型": "Passaggio 3a: è iniziato l'addestramento del modello", + "一键训练": "Addestramento con un clic", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Elaborazione batch per la separazione dell'accompagnamento vocale utilizzando il modello UVR5.
Esempio di un formato di percorso di cartella valido: D:\\path\\to\\input\\folder (copialo dalla barra degli indirizzi del file manager).
Il modello è suddiviso in tre categorie:
1. Conserva la voce: scegli questa opzione per l'audio senza armonie.
2. Mantieni solo la voce principale: scegli questa opzione per l'audio con armonie.
3. Modelli di de-riverbero e de-delay (di FoxJoy):
  (1) MDX-Net: la scelta migliore per la rimozione del riverbero stereo ma non può rimuovere il riverbero mono;

Note di de-riverbero/de-delay:
1. Il tempo di elaborazione per il modello DeEcho-DeReverb è circa il doppio rispetto agli altri due modelli DeEcho.
2. Il modello MDX-Net-Dereverb è piuttosto lento.
3. La configurazione più pulita consigliata consiste nell'applicare prima MDX-Net e poi DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Inserisci gli indici GPU separati da '-', ad esempio 0-1-2 per utilizzare GPU 0, 1 e 2:", + "伴奏人声分离&去混响&去回声": "Separazione voce/accompagnamento", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Salva nome:", + "保存的文件名, 默认空为和源文件同名": "Salva il nome del file (predefinito: uguale al file di origine):", + "保存的模型名不带后缀": "Nome del modello salvato (senza estensione):", + "保存频率save_every_epoch": "Frequenza di salvataggio (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteggi le consonanti senza voce e i suoni del respiro per evitare artefatti come il tearing nella musica elettronica. ", + "修改": "Modificare", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifica le informazioni sul modello (supportato solo per i file di modello di piccole dimensioni estratti dalla cartella 'weights')", + "停止音频转换": "Arresta la conversione audio", + "全流程结束!": "Tutti i processi sono stati completati!", + "刷新音色列表和索引路径": "Aggiorna l'elenco delle voci e il percorso dell'indice", + "加载模型": "Carica modello", + "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", + "加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:", + "单次推理": "单次推理", + "卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:", + "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. 
", + "否": "NO", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Soglia di risposta", + "响度因子": "fattore di sonorità", + "处理数据": "Processa dati", + "导出Onnx模型": "Esporta modello Onnx", + "导出文件格式": "Formato file di esportazione", + "常见问题解答": "FAQ (Domande frequenti)", + "常规设置": "Impostazioni generali", + "开始音频转换": "Avvia la conversione audio", + "很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.", + "性能设置": "Impostazioni delle prestazioni", + "总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ", + "指定输出主人声文件夹": "Specifica la cartella di output per le voci:", + "指定输出文件夹": "Specifica la cartella di output:", + "指定输出非主人声文件夹": "Specificare la cartella di output per l'accompagnamento:", + "推理时间(ms):": "Tempo di inferenza (ms):", + "推理音色": "Voce di inferenza:", + "提取": "Estrai", + "提取音高和处理数据使用的CPU进程数": "Numero di processi CPU utilizzati per l'estrazione del tono e l'elaborazione dei dati:", + "是": "SÌ", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Salva solo l'ultimo file '.ckpt' per risparmiare spazio su disco:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Salva un piccolo modello finale nella cartella \"weights\" in ogni punto di salvataggio:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Memorizza nella cache tutti i set di addestramento nella memoria della GPU. ", + "显卡信息": "Informazioni GPU", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Questo software è open source con licenza MIT.
Se non si accetta questa clausola, non è possibile utilizzare o fare riferimento a codici e file all'interno del pacchetto software. Contratto-LICENZA.txt per dettagli.", + "查看": "Visualizzazione", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Visualizza le informazioni sul modello (supportato solo per file di modello piccoli estratti dalla cartella 'weights')", + "检索特征占比": "Rapporto funzionalità di ricerca (controlla la forza dell'accento, troppo alto ha artefatti):", + "模型": "Modello", + "模型推理": "Inferenza del modello", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Estrazione del modello (inserire il percorso del modello di file di grandi dimensioni nella cartella \"logs\"). ", + "模型是否带音高指导": "Se il modello ha una guida del tono:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Se il modello ha una guida del tono (necessario per il canto, facoltativo per il parlato):", + "模型是否带音高指导,1是0否": "Se il modello ha una guida del tono (1: sì, 0: no):", + "模型版本型号": "Versione dell'architettura del modello:", + "模型融合, 可用于测试音色融合": "Model fusion, può essere utilizzato per testare la fusione timbrica", + "模型路径": "Percorso al modello:", + "每张显卡的batch_size": "Dimensione batch per GPU:", + "淡入淡出长度": "Lunghezza dissolvenza", + "版本": "Versione", + "特征提取": "Estrazione delle caratteristiche", + "特征检索库文件路径,为空则使用下拉的选择结果": "Percorso del file di indice delle caratteristiche. ", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tonalità +12 consigliata per la conversione da maschio a femmina e tonalità -12 per la conversione da femmina a maschio. 
", + "目标采样率": "Frequenza di campionamento target:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Rileva automaticamente il percorso dell'indice e seleziona dal menu a tendina:", + "融合": "Fusione", + "要改的模型信息": "Informazioni sul modello da modificare:", + "要置入的模型信息": "Informazioni sul modello da posizionare:", + "训练": "Addestramento", + "训练模型": "Addestra modello", + "训练特征索引": "Addestra indice delle caratteristiche", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Addestramento completato. ", + "设备类型": "设备类型", + "请指定说话人id": "Si prega di specificare l'ID del locutore/cantante:", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth 文件", + "请选择说话人id": "Seleziona ID locutore/cantante:", + "转换": "Convertire", + "输入实验名": "Inserisci il nome dell'esperimento:", + "输入待处理音频文件夹路径": "Immettere il percorso della cartella audio da elaborare:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Immettere il percorso della cartella audio da elaborare (copiarlo dalla barra degli indirizzi del file manager):", + "输入待处理音频文件路径(默认是正确格式示例)": "Immettere il percorso del file audio da elaborare (l'impostazione predefinita è l'esempio di formato corretto):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Regola il ridimensionamento dell'inviluppo del volume. 
", + "输入监听": "输入监听", + "输入训练文件夹路径": "Inserisci il percorso della cartella di addestramento:", + "输入设备": "Dispositivo di input", + "输入降噪": "Riduzione del rumore in ingresso", + "输出信息": "Informazioni sull'uscita", + "输出变声": "输出变声", + "输出设备": "Dispositivo di uscita", + "输出降噪": "Riduzione del rumore in uscita", + "输出音频(右下角三个点,点了可以下载)": "Esporta audio (clicca sui tre puntini in basso a destra per scaricarlo)", + "选择.index文件": "Seleziona il file .index", + "选择.pth文件": "Seleziona il file .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più veloce ma risultato di qualità inferiore; \"harvest\": bassi migliori ma estremamente lenti; \"crepe\": qualità migliore ma utilizzo intensivo della GPU):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Lunghezza del campione", + "重载设备列表": "Ricaricare l'elenco dei dispositivi", + "音调设置": "Impostazioni del tono", + "音频设备": "Dispositivo audio", + "音高算法": "音高算法", + "额外推理时长": "Tempo di inferenza extra" +} diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce5355cd3954160ee899ebcd95873bfa869236 --- /dev/null +++ b/i18n/locale/ja_JP.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3 次に、harvestピッチの認識結果に対してメディアンフィルタを使用します。値はフィルター半径で、ミュートを減衰させるために使用します。", + "A模型权重": "Aモデルの重み", + "A模型路径": "Aモデルのパス", + "B模型路径": "Bモデルのパス", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0(最低共振周波数)カーブファイル(オプション、1行に1ピッチ、デフォルトのF0(最低共振周波数)とエレベーションを置き換えます。)", + "Index Rate": "Index Rate", + "Onnx导出": "Onnxエクスポート", + "Onnx输出路径": "Onnx出力パス", + "RVC模型路径": "RVCモデルパス", + "ckpt处理": 
"ckptファイルの処理", + "harvest进程数": "harvestプロセス数", + "index文件路径不可包含中文": "indexファイルのパスに漢字を含んではいけません", + "pth文件路径不可包含中文": "pthファイルのパスに漢字を含んではいけません", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpeカード番号設定:異なるプロセスに使用するカード番号を入力する。例えば、0-0-1でカード0に2つのプロセス、カード1に1つのプロセスを実行する。", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。", + "step1:正在处理数据": "step1:処理中のデータ", + "step2:正在提取音高&正在提取特征": "step2:ピッチ抽出と特徴抽出", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)", + "step3: 填写训练设置, 开始训练模型和索引": "ステップ3: トレーニング設定を入力して、モデルとインデックスのトレーニングを開始します", + "step3a:正在训练模型": "step3a:トレーニング中のモデル", + "一键训练": "ワンクリックトレーニング", + "也可批量输入音频文件, 二选一, 优先读文件夹": "複数のオーディオファイルをインポートすることもできます。フォルダパスが存在する場合、この入力は無視されます。", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "UVR5モデルを使用したボーカル伴奏の分離バッチ処理。
有効なフォルダーパスフォーマットの例: D:\\path\\to\\input\\folder (エクスプローラーのアドレスバーからコピーします)。
モデルは三つのカテゴリに分かれています:
1. ボーカルを保持: ハーモニーのないオーディオに対してこれを選択します。HP5よりもボーカルをより良く保持します。HP2とHP3の二つの内蔵モデルが含まれています。HP3は伴奏をわずかに漏らす可能性がありますが、HP2よりもわずかにボーカルをより良く保持します。
2. 主なボーカルのみを保持: ハーモニーのあるオーディオに対してこれを選択します。主なボーカルを弱める可能性があります。HP5の一つの内蔵モデルが含まれています。
3. ディリバーブとディレイモデル (by FoxJoy):
  (1) MDX-Net: ステレオリバーブの除去に最適な選択肢ですが、モノリバーブは除去できません;
 (234) DeEcho: ディレイ効果を除去します。AggressiveモードはNormalモードよりも徹底的に除去します。DeReverbはさらにリバーブを除去し、モノリバーブを除去することができますが、高周波のリバーブが強い内容に対しては非常に効果的ではありません。
ディリバーブ/ディレイに関する注意点:
1. DeEcho-DeReverbモデルの処理時間は、他の二つのDeEchoモデルの約二倍です。
2. MDX-Net-Dereverbモデルは非常に遅いです。
3. 推奨される最もクリーンな設定は、最初にMDX-Netを適用し、その後にDeEcho-Aggressiveを適用することです。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "ハイフンで区切って使用するGPUの番号を入力します。例えば0-1-2はGPU0、GPU1、GPU2を使用します", + "伴奏人声分离&去混响&去回声": "伴奏ボーカル分離&残響除去&エコー除去", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "保存ファイル名", + "保存的文件名, 默认空为和源文件同名": "保存するファイル名、デフォルトでは空欄で元のファイル名と同じ名前になります", + "保存的模型名不带后缀": "拡張子のない保存するモデル名", + "保存频率save_every_epoch": "エポックごとの保存頻度", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "明確な子音と呼吸音を保護し、電子音の途切れやその他のアーティファクトを防止します。0.5でオフになります。下げると保護が強化されますが、indexの効果が低下する可能性があります。", + "修改": "変更", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報の修正(weightsフォルダから抽出された小さなモデルファイルのみ対応)", + "停止音频转换": "音声変換を停止", + "全流程结束!": "全工程が完了!", + "刷新音色列表和索引路径": "音源リストとインデックスパスの更新", + "加载模型": "モデルをロード", + "加载预训练底模D路径": "事前学習済みのDモデルのパス", + "加载预训练底模G路径": "事前学習済みのGモデルのパス", + "单次推理": "单次推理", + "卸载音色省显存": "音源を削除してメモリを節約", + "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", + "后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0", + "否": "いいえ", + "启用相位声码器": "启用相位声码器", + "响应阈值": "反応閾値", + "响度因子": "ラウドネス係数", + "处理数据": "データ処理", + "导出Onnx模型": "Onnxに変換", + "导出文件格式": "エクスポート形式", + "常见问题解答": "よくある質問", + "常规设置": "一般設定", + "开始音频转换": "音声変換を開始", + "很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。", + "性能设置": "パフォーマンス設定", + "总训练轮数total_epoch": "総エポック数", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。", + "指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する", + "指定输出文件夹": "出力フォルダを指定してください", + "指定输出非主人声文件夹": "マスター以外の出力音声フォルダーを指定する", + "推理时间(ms):": "推論時間(ms):", + "推理音色": "音源推論", + "提取": "抽出", + "提取音高和处理数据使用的CPU进程数": "ピッチの抽出やデータ処理に使用するCPUスレッド数", + "是": "はい", + "是否仅保存最新的ckpt文件以节省硬盘空间": "ハードディスク容量を節約するため、最新のckptファイルのみを保存しますか?", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "各保存時点の小モデルを全部weightsフォルダに保存するかどうか", + "是否缓存所有训练集至显存. 
10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "すべてのトレーニングデータをメモリにキャッシュするかどうか。10分以下の小さなデータはキャッシュしてトレーニングを高速化できますが、大きなデータをキャッシュするとメモリが破裂し、あまり速度が上がりません。", + "显卡信息": "GPU情報", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本ソフトウェアはMITライセンスに基づくオープンソースであり、製作者は本ソフトウェアに対していかなる責任を持ちません。本ソフトウェアの利用者および本ソフトウェアから派生した音源(成果物)を配布する者は、本ソフトウェアに対して自身で責任を負うものとします。
この条項に同意しない場合、パッケージ内のコードやファイルを使用や参照を禁じます。詳しくはLICENSEをご覧ください。", + "查看": "表示", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "モデル情報を表示する(小さいモデルファイルはweightsフォルダーからのみサポートされています)", + "检索特征占比": "検索特徴率", + "模型": "モデル", + "模型推理": "モデル推論", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "モデル抽出(ログフォルダー内の大きなファイルのモデルパスを入力)、モデルを半分までトレーニングし、自動的に小さいファイルモデルを保存しなかったり、中間モデルをテストしたい場合に適用されます。", + "模型是否带音高指导": "モデルに音高ガイドを付けるかどうか", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "モデルに音高ガイドがあるかどうか(歌唱には必要ですが、音声には必要ありません)", + "模型是否带音高指导,1是0否": "モデルに音高ガイドを付けるかどうか、1は付ける、0は付けない", + "模型版本型号": "モデルのバージョン", + "模型融合, 可用于测试音色融合": "モデルのマージ、音源のマージテストに使用できます", + "模型路径": "モデルパス", + "每张显卡的batch_size": "GPUごとのバッチサイズ", + "淡入淡出长度": "フェードイン/フェードアウト長", + "版本": "バージョン", + "特征提取": "特徴抽出", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徴検索ライブラリへのパス 空の場合はドロップダウンで選択", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性から女性へは+12キーをお勧めします。女性から男性へは-12キーをお勧めします。音域が広すぎて音質が劣化した場合は、適切な音域に自分で調整してください。", + "目标采样率": "目標サンプリングレート", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "インデックスパスの自動検出 ドロップダウンで選択", + "融合": "マージ", + "要改的模型信息": "変更するモデル情報", + "要置入的模型信息": "挿入するモデル情報", + "训练": "トレーニング", + "训练模型": "モデルのトレーニング", + "训练特征索引": "特徴インデックスのトレーニング", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "トレーニング終了時に、トレーニングログやフォルダ内のtrain.logを確認することができます", + "设备类型": "设备类型", + "请指定说话人id": "話者IDを指定してください", + "请选择index文件": "indexファイルを選択してください", + "请选择pth文件": "pthファイルを選択してください", + "请选择说话人id": "話者IDを選択してください", + "转换": "変換", + "输入实验名": "モデル名", + "输入待处理音频文件夹路径": "処理するオーディオファイルのフォルダパスを入力してください", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "処理対象音声フォルダーのパスを入力してください(エクスプローラーのアドレスバーからコピーしてください)", + "输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "入力ソースの音量エンベロープと出力音量エンベロープの融合率 1に近づくほど、出力音量エンベロープの割合が高くなる", + "输入监听": "输入监听", + "输入训练文件夹路径": "トレーニング用フォルダのパスを入力してください", + "输入设备": "入力デバイス", + "输入降噪": "入力ノイズの低減", + "输出信息": "出力情報", + "输出变声": "输出变声", + "输出设备": "出力デバイス", + "输出降噪": 
"出力ノイズの低減", + "输出音频(右下角三个点,点了可以下载)": "出力音声(右下の三点をクリックしてダウンロードできます)", + "选择.index文件": ".indexファイルを選択", + "选择.pth文件": ".pthファイルを選択", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを食います。", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "ピッチ抽出アルゴリズムの選択、歌声はpmで高速化でき、harvestは低音が良いが信じられないほど遅く、crepeは良く動くがGPUを喰います", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "ピッチ抽出アルゴリズムの選択:歌声はpmで高速化でき、入力した音声が高音質でCPUが貧弱な場合はdioで高速化でき、harvestの方が良いが遅く、rmvpeがベストだがCPU/GPUを若干食います。", + "采样率:": "采样率:", + "采样长度": "サンプル長", + "重载设备列表": "デバイスリストをリロードする", + "音调设置": "音程設定", + "音频设备": "オーディオデバイス", + "音高算法": "ピッチアルゴリズム", + "额外推理时长": "追加推論時間" +} diff --git a/i18n/locale/ko_KR.json b/i18n/locale/ko_KR.json new file mode 100644 index 0000000000000000000000000000000000000000..dcaab6371ea2f8e87ef18bdf6b71a62c1b20b1b8 --- /dev/null +++ b/i18n/locale/ko_KR.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3인 경우 harvest 피치 인식 결과에 중간값 필터 적용, 필터 반경은 값으로 지정, 사용 시 무성음 감소 가능", + "A模型权重": "A 모델 가중치", + "A模型路径": "A 모델 경로", + "B模型路径": "B 모델 경로", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\음성 오디오+표시\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 곡선 파일, 선택적, 한 줄에 하나의 피치, 기본 F0 및 음높이 조절 대체", + "Index Rate": "인덱스 비율", + "Onnx导出": "Onnx 내보내기", + "Onnx输出路径": "Onnx 출력 경로", + "RVC模型路径": "RVC 모델 경로", + "ckpt处理": "ckpt 처리", + "harvest进程数": "harvest 프로세스 수", + "index文件路径不可包含中文": "index 파일 경로는 중국어를 포함할 수 없음", + "pth文件路径不可包含中文": "pth 파일 경로는 중국어를 포함할 수 없음", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe 카드 번호 설정: -로 구분된 입력 사용 카드 번호, 예: 0-0-1은 카드 0에서 2개 프로세스, 카드 1에서 1개 프로세스 실행", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 실험 구성 작성. 
실험 데이터는 logs에 저장, 각 실험은 하나의 폴더, 수동으로 실험 이름 경로 입력 필요, 실험 구성, 로그, 훈련된 모델 파일 포함.", + "step1:正在处理数据": "step1: 데이터 처리 중", + "step2:正在提取音高&正在提取特征": "step2: 음높이 추출 & 특징 추출 중", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 훈련 폴더 아래 모든 오디오로 디코딩 가능한 파일을 자동 순회하며 슬라이스 정규화 진행, 실험 디렉토리 아래 2개의 wav 폴더 생성; 현재 단일 사용자 훈련만 지원.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: CPU를 사용하여 음높이 추출(모델이 음높이 포함 시), GPU를 사용하여 특징 추출(카드 번호 선택)", + "step3: 填写训练设置, 开始训练模型和索引": "step3: 훈련 설정 작성, 모델 및 인덱스 훈련 시작", + "step3a:正在训练模型": "step3a: 모델 훈련 중", + "一键训练": "원클릭 훈련", + "也可批量输入音频文件, 二选一, 优先读文件夹": "여러 오디오 파일을 일괄 입력할 수도 있음, 둘 중 하나 선택, 폴더 우선 읽기", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "인간 목소리와 반주 분리 배치 처리, UVR5 모델 사용.
적절한 폴더 경로 예시: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(파일 관리자 주소 표시줄에서 복사하면 됨).
모델은 세 가지 유형으로 나뉨:
1. 인간 목소리 보존: 화음이 없는 오디오에 이것을 선택, HP5보다 주된 인간 목소리 보존에 더 좋음. 내장된 HP2와 HP3 두 모델, HP3는 약간의 반주 누락 가능성이 있지만 HP2보다 주된 인간 목소리 보존이 약간 더 좋음;
2. 주된 인간 목소리만 보존: 화음이 있는 오디오에 이것을 선택, 주된 인간 목소리에 약간의 약화 가능성 있음. 내장된 HP5 모델 하나;
3. 혼효음 제거, 지연 제거 모델(by FoxJoy):
  (1)MDX-Net(onnx_dereverb): 이중 채널 혼효음에는 최선의 선택, 단일 채널 혼효음은 제거할 수 없음;
 (234)DeEcho: 지연 제거 효과. Aggressive는 Normal보다 더 철저하게 제거, DeReverb는 추가로 혼효음을 제거, 단일 채널 혼효음은 제거 가능하지만 고주파 중심의 판 혼효음은 완전히 제거하기 어려움.
혼효음/지연 제거, 부록:
1. DeEcho-DeReverb 모델의 처리 시간은 다른 두 개의 DeEcho 모델의 거의 2배임;
2. MDX-Net-Dereverb 모델은 상당히 느림;
3. 개인적으로 추천하는 가장 깨끗한 구성은 MDX-Net 다음에 DeEcho-Aggressive 사용.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "-로 구분하여 입력하는 카드 번호, 예: 0-1-2는 카드 0, 카드 1, 카드 2 사용", + "伴奏人声分离&去混响&去回声": "반주 인간 목소리 분리 & 혼효음 제거 & 에코 제거", + "使用模型采样率": "모델 샘플링 레이트 사용", + "使用设备采样率": "장치 샘플링 레이트 사용", + "保存名": "저장 이름", + "保存的文件名, 默认空为和源文件同名": "저장될 파일명, 기본적으로 빈 공간은 원본 파일과 동일한 이름으로", + "保存的模型名不带后缀": "저장된 모델명은 접미사 없음", + "保存频率save_every_epoch": "저장 빈도 save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "청자음과 호흡 소리를 보호, 전자음 찢김 등의 아티팩트 방지, 0.5까지 올려서 비활성화, 낮추면 보호 강도 증가하지만 인덱스 효과 감소 가능성 있음", + "修改": "수정", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "모델 정보 수정(오직 weights 폴더 아래에서 추출된 작은 모델 파일만 지원)", + "停止音频转换": "오디오 변환 중지", + "全流程结束!": "전체 과정 완료!", + "刷新音色列表和索引路径": "음색 목록 및 인덱스 경로 새로고침", + "加载模型": "모델 로드", + "加载预训练底模D路径": "미리 훈련된 베이스 모델 D 경로 로드", + "加载预训练底模G路径": "미리 훈련된 베이스 모델 G 경로 로드", + "单次推理": "단일 추론", + "卸载音色省显存": "음색 언로드로 디스플레이 메모리 절약", + "变调(整数, 半音数量, 升八度12降八度-12)": "키 변경(정수, 반음 수, 옥타브 상승 12, 옥타브 하강 -12)", + "后处理重采样至最终采样率,0为不进行重采样": "후처리 재샘플링을 최종 샘플링 레이트로, 0은 재샘플링하지 않음", + "否": "아니오", + "启用相位声码器": "위상 보코더 활성화", + "响应阈值": "응답 임계값", + "响度因子": "음량 인자", + "处理数据": "데이터 처리", + "导出Onnx模型": "Onnx 모델 내보내기", + "导出文件格式": "내보낼 파일 형식", + "常见问题解答": "자주 묻는 질문", + "常规设置": "일반 설정", + "开始音频转换": "오디오 변환 시작", + "很遗憾您这没有能用的显卡来支持您训练": "사용 가능한 그래픽 카드가 없어 훈련을 지원할 수 없습니다", + "性能设置": "성능 설정", + "总训练轮数total_epoch": "총 훈련 라운드 수 total_epoch", + "批量推理": "일괄 추론", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "일괄 변환, 변환할 오디오 파일 폴더 입력 또는 여러 오디오 파일 업로드, 지정된 폴더(기본값 opt)에 변환된 오디오 출력.", + "指定输出主人声文件夹": "주된 목소리 출력 폴더 지정", + "指定输出文件夹": "출력 파일 폴더 지정", + "指定输出非主人声文件夹": "주된 목소리가 아닌 출력 폴더 지정", + "推理时间(ms):": "추론 시간(ms):", + "推理音色": "추론 음색", + "提取": "추출", + "提取音高和处理数据使用的CPU进程数": "음높이 추출 및 데이터 처리에 사용되는 CPU 프로세스 수", + "是": "예", + "是否仅保存最新的ckpt文件以节省硬盘空间": "디스크 공간을 절약하기 위해 최신 ckpt 파일만 저장할지 여부", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "저장 시마다 최종 소형 모델을 weights 폴더에 저장할지 여부", + "是否缓存所有训练集至显存. 
10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "모든 훈련 세트를 VRAM에 캐시할지 여부. 10분 미만의 소량 데이터는 캐시하여 훈련 속도를 높일 수 있지만, 대량 데이터 캐시는 VRAM을 과부하시키고 속도를 크게 향상시키지 못함", + "显卡信息": "그래픽 카드 정보", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "이 소프트웨어는 MIT 라이선스로 공개되며, 저자는 소프트웨어에 대해 어떠한 통제권도 가지지 않습니다. 모든 귀책사유는 소프트웨어 사용자 및 소프트웨어에서 생성된 결과물을 사용하는 당사자에게 있습니다.
해당 조항을 인정하지 않는 경우, 소프트웨어 패키지의 어떠한 코드나 파일도 사용하거나 인용할 수 없습니다. 자세한 내용은 루트 디렉토리의 LICENSE를 참조하세요.", + "查看": "보기", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "모델 정보 보기(오직 weights 폴더에서 추출된 소형 모델 파일만 지원)", + "检索特征占比": "검색 특징 비율", + "模型": "모델", + "模型推理": "모델 추론", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "모델 추출(logs 폴더 아래의 큰 파일 모델 경로 입력), 훈련 중간에 중단한 모델의 자동 추출 및 소형 파일 모델 저장이 안 되거나 중간 모델을 테스트하고 싶은 경우에 적합", + "模型是否带音高指导": "모델이 음높이 지도를 포함하는지 여부", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "모델이 음높이 지도를 포함하는지 여부(노래에는 반드시 필요, 음성에는 필요 없음)", + "模型是否带音高指导,1是0否": "모델이 음높이 지도를 포함하는지 여부, 1은 예, 0은 아니오", + "模型版本型号": "모델 버전 및 모델", + "模型融合, 可用于测试音色融合": "모델 융합, 음색 융합 테스트에 사용 가능", + "模型路径": "모델 경로", + "每张显卡的batch_size": "각 그래픽 카드의 batch_size", + "淡入淡出长度": "페이드 인/아웃 길이", + "版本": "버전", + "特征提取": "특징 추출", + "特征检索库文件路径,为空则使用下拉的选择结果": "특징 검색 라이브러리 파일 경로, 비어 있으면 드롭다운 선택 결과 사용", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "남성에서 여성으로 변경 시 +12 키 권장, 여성에서 남성으로 변경 시 -12 키 권장, 음역대 폭발로 음색이 왜곡되면 적절한 음역대로 조정 가능.", + "目标采样率": "목표 샘플링률", + "算法延迟(ms):": "알고리즘 지연(ms):", + "自动检测index路径,下拉式选择(dropdown)": "자동으로 index 경로 감지, 드롭다운 선택(dropdown)", + "融合": "융합", + "要改的模型信息": "변경할 모델 정보", + "要置入的模型信息": "삽입할 모델 정보", + "训练": "훈련", + "训练模型": "모델 훈련", + "训练特征索引": "특징 인덱스 훈련", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "훈련 완료, 콘솔 훈련 로그 또는 실험 폴더 내의 train.log 확인 가능", + "设备类型": "设备类型", + "请指定说话人id": "화자 ID 지정 필요", + "请选择index文件": "index 파일 선택", + "请选择pth文件": "pth 파일 선택", + "请选择说话人id": "화자 ID 선택", + "转换": "변환", + "输入实验名": "실험명 입력", + "输入待处理音频文件夹路径": "처리할 오디오 파일 폴더 경로 입력", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "처리할 오디오 파일 폴더 경로 입력(파일 탐색기 주소 표시줄에서 복사)", + "输入待处理音频文件路径(默认是正确格式示例)": "처리할 오디오 파일 경로 입력(기본적으로 올바른 형식 예시)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "입력 소스 볼륨 엔벨로프와 출력 볼륨 엔벨로프의 결합 비율 입력, 1에 가까울수록 출력 엔벨로프 사용", + "输入监听": "입력 모니터링", + "输入训练文件夹路径": "훈련 파일 폴더 경로 입력", + "输入设备": "입력 장치", + "输入降噪": "입력 노이즈 감소", + "输出信息": "출력 정보", + "输出变声": "출력 음성 변조", + "输出设备": "출력 장치", + "输出降噪": "출력 노이즈 
감소", + "输出音频(右下角三个点,点了可以下载)": "출력 오디오(오른쪽 하단 세 개의 점, 클릭하면 다운로드 가능)", + "选择.index文件": ".index 파일 선택", + "选择.pth文件": ".pth 파일 선택", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "음높이 추출 알고리즘 선택, 노래 입력 시 pm으로 속도 향상, harvest는 저음이 좋지만 매우 느림, crepe는 효과가 좋지만 GPU 사용", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "음높이 추출 알고리즘 선택, 노래 입력 시 pm으로 속도 향상, harvest는 저음이 좋지만 매우 느림, crepe는 효과가 좋지만 GPU 사용, rmvpe는 효과가 가장 좋으며 GPU를 적게 사용", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "음높이 추출 알고리즘 선택: 노래 입력 시 pm으로 속도 향상, 고품질 음성에는 CPU가 부족할 때 dio 사용, harvest는 품질이 더 좋지만 느림, rmvpe는 효과가 가장 좋으며 CPU/GPU를 적게 사용", + "采样率:": "샘플링률:", + "采样长度": "샘플링 길이", + "重载设备列表": "장치 목록 재로드", + "音调设置": "음조 설정", + "音频设备": "音频设备", + "音高算法": "음높이 알고리즘", + "额外推理时长": "추가 추론 시간" +} diff --git a/i18n/locale/pt_BR.json b/i18n/locale/pt_BR.json new file mode 100644 index 0000000000000000000000000000000000000000..3d87b081d0d93bd0951f0f13fe62f773526357a4 --- /dev/null +++ b/i18n/locale/pt_BR.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3, use o filtro mediano para o resultado do reconhecimento do tom da heverst, e o valor é o raio do filtro, que pode enfraquecer o mudo.", + "A模型权重": "Peso (w) para o modelo A:", + "A模型路径": "Caminho para o Modelo A:", + "B模型路径": "Caminho para o Modelo B:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\meu-dataset", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Arquivo de curva F0 (opcional). Um arremesso por linha. 
Substitui a modulação padrão F0 e tom:", + "Index Rate": "Taxa do Index", + "Onnx导出": "Exportar Onnx", + "Onnx输出路径": "Caminho de exportação ONNX:", + "RVC模型路径": "Caminho do Modelo RVC:", + "ckpt处理": "processamento ckpt", + "harvest进程数": "Número de processos harvest", + "index文件路径不可包含中文": "O caminho do arquivo de Index não pode conter caracteres chineses", + "pth文件路径不可包含中文": "o caminho do arquivo pth não pode conter caracteres chineses", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Configuração do número do cartão rmvpe: Use - para separar os números dos cartões de entrada de diferentes processos. Por exemplo, 0-0-1 é usado para executar 2 processos no cartão 0 e 1 processo no cartão 1.", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Etapa 1: Preencha a configuração experimental. Os dados experimentais são armazenados na pasta 'logs', com cada experimento tendo uma pasta separada. Digite manualmente o caminho do nome do experimento, que contém a configuração experimental, os logs e os arquivos de modelo treinados.", + "step1:正在处理数据": "Etapa 1: Processamento de dados", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Etapa 2a: Percorra automaticamente todos os arquivos na pasta de treinamento que podem ser decodificados em áudio e execute a normalização da fatia. Gera 2 pastas wav no diretório do experimento. 
Atualmente, apenas o treinamento de um único cantor/palestrante é suportado.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Etapa 2b: Use a CPU para extrair o tom (se o modelo tiver tom), use a GPU para extrair recursos (selecione o índice da GPU):", + "step3: 填写训练设置, 开始训练模型和索引": "Etapa 3: Preencha as configurações de treinamento e comece a treinar o modelo e o Index", + "step3a:正在训练模型": "Etapa 3a: Treinamento do modelo iniciado", + "一键训练": "Treinamento com um clique", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Você também pode inserir arquivos de áudio em lotes. Escolha uma das duas opções. É dada prioridade à leitura da pasta.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Processamento em lote para separação de acompanhamento vocal usando o modelo UVR5.
Exemplo de um formato de caminho de pasta válido: D:\\caminho\\para a pasta\\entrada\\ (copie-o da barra de endereços do gerenciador de arquivos).
O modelo é dividido em três categorias:
1. Preservar vocais: Escolha esta opção para áudio sem harmonias. Ele preserva os vocais melhor do que o HP5. Inclui dois modelos integrados: HP2 e HP3. O HP3 pode vazar ligeiramente o acompanhamento, mas preserva os vocais um pouco melhor do que o HP2.
2. Preservar apenas os vocais principais: Escolha esta opção para áudio com harmonias. Isso pode enfraquecer os vocais principais. Ele inclui um modelo embutido: HP5.
3. Modelos de de-reverb e de-delay (por FoxJoy):
  (1) MDX-Net: A melhor escolha para remoção de reverb estéreo, mas não pode remover reverb mono;
 (234) DeEcho: Remove efeitos de atraso. O modo agressivo remove mais completamente do que o modo normal. O DeReverb também remove reverb e pode remover reverb mono, mas não de forma muito eficaz para conteúdo de alta frequência fortemente reverberado.
Notas de de-reverb/de-delay:
1. O tempo de processamento para o modelo DeEcho-DeReverb é aproximadamente duas vezes maior que os outros dois modelos DeEcho.
2. O modelo MDX-Net-Dereverb é bastante lento.
3. A configuração mais limpa recomendada é aplicar MDX-Net primeiro e depois DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Digite o (s) índice(s) da GPU separados por '-', por exemplo, 0-1-2 para usar a GPU 0, 1 e 2:", + "伴奏人声分离&去混响&去回声": "UVR5", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Salvar nome", + "保存的文件名, 默认空为和源文件同名": "Salvar nome do arquivo (padrão: igual ao arquivo de origem):", + "保存的模型名不带后缀": "Nome do modelo salvo (sem extensão):", + "保存频率save_every_epoch": "Faça backup a cada # de Epoch:", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteja consoantes sem voz e sons respiratórios, evite artefatos como quebra de som eletrônico e desligue-o quando estiver cheio de 0,5. Diminua-o para aumentar a proteção, mas pode reduzir o efeito de indexação:", + "修改": "Editar", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modificar informações do modelo (suportado apenas para arquivos de modelo pequenos extraídos da pasta 'weights')", + "停止音频转换": "Conversão de áudio", + "全流程结束!": "Todos os processos foram concluídos!", + "刷新音色列表和索引路径": "Atualizar lista de voz e caminho do Index", + "加载模型": "Modelo", + "加载预训练底模D路径": "Carregue o caminho D do modelo base pré-treinado:", + "加载预训练底模G路径": "Carregue o caminho G do modelo base pré-treinado:", + "单次推理": "Único", + "卸载音色省显存": "Descarregue a voz para liberar a memória da GPU:", + "变调(整数, 半音数量, 升八度12降八度-12)": "Mude o tom aqui. 
Se a voz for do mesmo sexo, não é necessario alterar (12 caso seja Masculino para feminino, -12 caso seja ao contrário).", + "后处理重采样至最终采样率,0为不进行重采样": "Reamostragem pós-processamento para a taxa de amostragem final, 0 significa sem reamostragem:", + "否": "Não", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Limiar de resposta", + "响度因子": "Fator de volume", + "处理数据": "Processar o Conjunto de Dados", + "导出Onnx模型": "Exportar Modelo Onnx", + "导出文件格式": "Qual formato de arquivo você prefere?", + "常见问题解答": "FAQ (Perguntas frequentes)", + "常规设置": "Configurações gerais", + "开始音频转换": "Iniciar conversão de áudio", + "很遗憾您这没有能用的显卡来支持您训练": "Infelizmente, não há GPU compatível disponível para apoiar o seu treinamento.", + "性能设置": "Configurações de desempenho.", + "总训练轮数total_epoch": "Número total de ciclos(epoch) de treino (se escolher um valor alto demais, o seu modelo parecerá terrivelmente sobretreinado):", + "批量推理": "Conversão em Lote", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversão em Massa.", + "指定输出主人声文件夹": "Especifique a pasta de saída para vocais:", + "指定输出文件夹": "Especifique a pasta de saída:", + "指定输出非主人声文件夹": "Informar a pasta de saída para acompanhamento:", + "推理时间(ms):": "Tempo de inferência (ms):", + "推理音色": "Escolha o seu Modelo:", + "提取": "Extrato", + "提取音高和处理数据使用的CPU进程数": "Número de processos de CPU usados para extração de tom e processamento de dados:", + "是": "Sim", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Só deve salvar apenas o arquivo ckpt mais recente para economizar espaço em disco:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Salve um pequeno modelo final na pasta 'weights' em cada ponto de salvamento:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Se deve armazenar em cache todos os conjuntos de treinamento na memória de vídeo. 
Pequenos dados com menos de 10 minutos podem ser armazenados em cache para acelerar o treinamento, e um cache de dados grande irá explodir a memória de vídeo e não aumentar muito a velocidade:", + "显卡信息": "Informações da GPU", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "
The Mangio-RVC 💻 | Tradução por Krisp e Rafael Godoy Ebert | AI HUB BRASIL
Este software é de código aberto sob a licença MIT. O autor não tem qualquer controle sobre o software. Aqueles que usam o software e divulgam os sons exportados pelo software são totalmente responsáveis.
Se você não concorda com este termo, você não pode usar ou citar nenhum código e arquivo no pacote de software. Para obter detalhes, consulte o arquivo LICENSE no diretório raiz.
", + "查看": "Visualizar", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Exibir informações do modelo (suportado apenas para arquivos de modelo pequenos extraídos da pasta 'weights')", + "检索特征占比": "Taxa de recurso de recuperação:", + "模型": "Modelo", + "模型推理": "Inference", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Extração do modelo (insira o caminho do modelo de arquivo grande na pasta 'logs'). Isso é útil se você quiser interromper o treinamento no meio do caminho e extrair e salvar manualmente um arquivo de modelo pequeno, ou se quiser testar um modelo intermediário:", + "模型是否带音高指导": "Se o modelo tem orientação de tom:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Se o modelo tem orientação de tom (necessário para cantar, opcional para fala):", + "模型是否带音高指导,1是0否": "Se o modelo tem orientação de passo (1: sim, 0: não):", + "模型版本型号": "Versão:", + "模型融合, 可用于测试音色融合": "A fusão modelo, pode ser usada para testar a fusão do timbre", + "模型路径": "Caminho para o Modelo:", + "每张显卡的batch_size": "Batch Size (DEIXE COMO ESTÁ a menos que saiba o que está fazendo, no Colab pode deixar até 20!):", + "淡入淡出长度": "Comprimento de desvanecimento", + "版本": "Versão", + "特征提取": "Extrair Tom", + "特征检索库文件路径,为空则使用下拉的选择结果": "Caminho para o arquivo de Index. Deixe em branco para usar o resultado selecionado no menu debaixo:", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recomendado +12 chave para conversão de homem para mulher e -12 chave para conversão de mulher para homem. 
Se a faixa de som for muito longe e a voz estiver distorcida, você também pode ajustá-la à faixa apropriada por conta própria.", + "目标采样率": "Taxa de amostragem:", + "算法延迟(ms):": "Atrasos algorítmicos (ms):", + "自动检测index路径,下拉式选择(dropdown)": "Detecte automaticamente o caminho do Index e selecione no menu suspenso:", + "融合": "Fusão", + "要改的模型信息": "Informações do modelo a ser modificado:", + "要置入的模型信息": "Informações do modelo a ser colocado:", + "训练": "Treinar", + "训练模型": "Treinar Modelo", + "训练特征索引": "Treinar Index", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Após o término do treinamento, você pode verificar o log de treinamento do console ou train.log na pasta de experimentos", + "设备类型": "设备类型", + "请指定说话人id": "Especifique o ID do locutor/cantor:", + "请选择index文件": "Selecione o arquivo de Index", + "请选择pth文件": "Selecione o arquivo pth", + "请选择说话人id": "Selecione Palestrantes/Cantores ID:", + "转换": "Converter", + "输入实验名": "Nome da voz:", + "输入待处理音频文件夹路径": "Caminho da pasta de áudio a ser processada:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Caminho da pasta de áudio a ser processada (copie-o da barra de endereços do gerenciador de arquivos):", + "输入待处理音频文件路径(默认是正确格式示例)": "Caminho para o seu conjunto de dados (áudios, não zipado):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "O envelope de volume da fonte de entrada substitui a taxa de fusão do envelope de volume de saída, quanto mais próximo de 1, mais o envelope de saída é usado:", + "输入监听": "Monitoramento de entrada", + "输入训练文件夹路径": "Caminho da pasta de treinamento:", + "输入设备": "Dispositivo de entrada", + "输入降噪": "Redução de ruído de entrada", + "输出信息": "Informação de saída", + "输出变声": "Mudança de voz de saída", + "输出设备": "Dispositivo de saída", + "输出降噪": "Redução de ruído de saída", + "输出音频(右下角三个点,点了可以下载)": "Exportar áudio (clique nos três pontos no canto inferior direito para baixar)", + "选择.index文件": "Selecione o Index", + "选择.pth文件": "Selecione o Arquivo", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": 
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Selecione o algoritmo de extração de tom \n'pm': extração mais rápida, mas discurso de qualidade inferior; \n'harvest': graves melhores, mas extremamente lentos; \n'harvest': melhor qualidade, mas extração mais lenta); 'crepe': melhor qualidade, mas intensivo em GPU; 'magio-crepe': melhor opção; 'RMVPE': um modelo robusto para estimativa de afinação vocal em música polifônica;", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "Selecione o algoritmo de extração de tom \n'pm': extração mais rápida, mas discurso de qualidade inferior; \n'harvest': graves melhores, mas extremamente lentos; \n'crepe': melhor qualidade (mas intensivo em GPU);\n rmvpe tem o melhor efeito e consome menos CPU/GPU.", + "采样率:": "采样率:", + "采样长度": "Comprimento da Amostra", + "重载设备列表": "Recarregar lista de dispositivos", + "音调设置": "Configurações de tom", + "音频设备": "音频设备", + "音高算法": "Algoritmo de detecção de pitch", + "额外推理时长": "Tempo extra de inferência" +} diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json new file mode 100644 index 0000000000000000000000000000000000000000..42f2bd6b8c9d54c9fc0066c76104bb387013eb38 --- /dev/null +++ b/i18n/locale/ru_RU.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Если значение больше 3: применить медианную фильтрацию к вытащенным тональностям. Значение контролирует радиус фильтра и может уменьшить излишнее дыхание.", + "A模型权重": "Весы (w) модели А:", + "A模型路径": "Путь к модели А:", + "B模型路径": "Путь к модели Б:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Файл дуги F0 (не обязательно). Одна тональность на каждую строчку. 
Заменяет обычный F0 и модуляцию тональности:", + "Index Rate": "Темп индекса", + "Onnx导出": "Экспорт ONNX", + "Onnx输出路径": "Путь для сохранения модели в формате ONNX:", + "RVC模型路径": "Путь к модели RVC:", + "ckpt处理": "Обработка ckpt", + "harvest进程数": "Количество процессор harvest", + "index文件路径不可包含中文": "Путь к файлу индекса", + "pth文件路径不可包含中文": "Путь к файлу pth", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Введите номера графических процессоров, разделенные символом «-», например, 0-0-1, чтобы запустить два процесса на GPU 0 и один процесс на GPU 1:", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1. Конфигурирование модели. Данные обучения модели сохраняются в папку 'logs', и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.", + "step1:正在处理数据": "Шаг 1. Переработка данных", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2А. Автоматическая обработка исходных аудиозаписей для обучения и выполнение нормализации среза. Создаст 2 папки wav в папке модели. В данный момент поддерживается обучение только на одноголосных записях.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2Б. Оценка и извлечение тональности в аудиофайлах с помощью процессора (если включена поддержка изменения высоты звука), извлечение черт с помощью GPU (выберите номер GPU):", + "step3: 填写训练设置, 开始训练模型和索引": "Шаг 3. Заполнение дополнительных настроек обучения и запуск обучения модели и индекса", + "step3a:正在训练模型": "Шаг 3. Запуск обучения модели", + "一键训练": "Обучение в одно нажатие", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Можно также импортировать несколько аудиофайлов. Если путь к папке существует, то этот ввод игнорируется.", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Пакетная обработка для разделения вокального сопровождения с использованием модели UVR5.
Пример допустимого формата пути к папке: D:\\path\\to\\input\\folder
Модель разделена на три категории:
1. Сохранить вокал: выберите этот вариант для звука без гармоний. Он сохраняет вокал лучше, чем HP5. Он включает в себя две встроенные модели: HP2 и HP3. HP3 может немного пропускать инструментал, но сохраняет вокал немного лучше, чем HP2.
2. Сохранить только основной вокал: выберите этот вариант для звука с гармониями. Это может ослабить основной вокал. Он включает одну встроенную модель: HP5.
3. Модели удаления реверберации и задержки (от FoxJoy):
  (1) MDX-Net: лучший выбор для удаления стереореверберации, но он не может удалить монореверберацию;
 (234) DeEcho: удаляет эффекты задержки. Агрессивный режим удаляет более тщательно, чем Нормальный режим. DeReverb дополнительно удаляет реверберацию и может удалять монореверберацию, но не очень эффективно для сильно реверберированного высокочастотного контента.
Примечания по удалению реверберации/задержки:
1. Время обработки для модели DeEcho-DeReverb примерно в два раза больше, чем для двух других моделей DeEcho.
2. Модель MDX-Net-Dereverb довольно медленная.
3. Рекомендуемая самая чистая конфигурация — сначала применить MDX-Net, а затем DeEcho-Aggressive.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Введите, какие(-ую) GPU(-у) хотите использовать через '-', например 0-1-2, чтобы использовать GPU с номерами 0, 1 и 2:", + "伴奏人声分离&去混响&去回声": "Разделение вокала/аккомпанемента и удаление эхо", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Имя файла для сохранения:", + "保存的文件名, 默认空为和源文件同名": "Название сохранённого файла (по умолчанию: такое же, как и у входного):", + "保存的模型名不带后缀": "Имя файла модели для сохранения (без расширения):", + "保存频率save_every_epoch": "Частота сохранения (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Защитить глухие согласные и звуки дыхания для предотвращения артефактов, например, разрывания в электронной музыке. Поставьте на 0.5, чтобы выключить. Уменьшите значение для повышения защиты, но учтите, что при этом может ухудшиться точность индексирования:", + "修改": "Изменить", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Изменить информацию о модели (работает только с маленькими моделями, взятыми из папки 'weights')", + "停止音频转换": "Закончить конвертацию аудио", + "全流程结束!": "Все процессы завершены!", + "刷新音色列表和索引路径": "Обновить список голосов и индексов", + "加载模型": "Загрузить модель", + "加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:", + "加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:", + "单次推理": "单次推理", + "卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов", + "变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. 
Поставьте 0, чтобы ничего не изменялось:", + "否": "Нет", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Порог ответа", + "响度因子": "коэффициент громкости", + "处理数据": "Обработать данные", + "导出Onnx模型": "Экспортировать модель", + "导出文件格式": "Формат выходных файлов", + "常见问题解答": "ЧаВо (часто задаваемые вопросы)", + "常规设置": "Основные настройки", + "开始音频转换": "Начать конвертацию аудио", + "很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.", + "性能设置": "Настройки быстроты", + "总训练轮数total_epoch": "Полное количество эпох (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').", + "指定输出主人声文件夹": "Путь к папке для сохранения вокала:", + "指定输出文件夹": "Папка для результатов:", + "指定输出非主人声文件夹": "Путь к папке для сохранения аккомпанемента:", + "推理时间(ms):": "Время переработки (мс):", + "推理音色": "Желаемый голос:", + "提取": "Создать модель", + "提取音高和处理数据使用的CPU进程数": "Число процессов ЦП, используемое для оценки высоты голоса и обработки данных:", + "是": "Да", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Сохранять только последний файл '.ckpt', чтобы сохранить место на диске:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Сохранять маленькую финальную модель в папку 'weights' на каждой точке сохранения:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Кэшировать все тренировочные сеты в видеопамять. Кэширование маленький датасетов (меньше 10 минут) может ускорить тренировку, но кэширование больших, наоборот, займёт много видеопамяти и не сильно ускорит тренировку:", + "显卡信息": "Информация о графических процессорах (GPUs):", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Это программное обеспечение с открытым исходным кодом распространяется по лицензии MIT. Автор никак не контролирует это программное обеспечение. Пользователи, которые используют эту программу и распространяют аудиозаписи, полученные с помощью этой программы, несут полную ответственность за это. Если вы не согласны с этим, вы не можете использовать какие-либо коды и файлы в рамках этой программы или ссылаться на них. Подробнее в файле Agreement-LICENSE.txt в корневом каталоге программы.", + "查看": "Просмотреть информацию", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Просмотреть информацию о модели (работает только с маленькими моделями, взятыми из папки 'weights')", + "检索特征占比": "Соотношение поиска черт:", + "模型": "Модели", + "模型推理": "Изменение голоса", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Создание модели из данных, полученных в процессе обучения (введите путь к большому файлу модели в папке 'logs'). Может пригодиться, если вам нужно завершить обучение и получить маленький файл готовой модели, или если вам нужно проверить недообученную модель:", + "模型是否带音高指导": "Поддерживает ли модель изменение высоты голоса (1: да, 0: нет):", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Поддержка изменения высоты звука (обязательно для пения, необязательно для речи):", + "模型是否带音高指导,1是0否": "Поддерживает ли модель изменение высоты голоса (1: да, 0: нет):", + "模型版本型号": "Версия архитектуры модели:", + "模型融合, 可用于测试音色融合": "Слияние моделей, может быть использовано для проверки слияния тембра", + "模型路径": "Путь к папке:", + "每张显卡的batch_size": "Размер пачки для GPU:", + "淡入淡出长度": "Длина затухания", + "版本": "Версия архитектуры модели:", + "特征提取": "Извлечь черты", + "特征检索库文件路径,为空则使用下拉的选择结果": "Путь к файлу индекса черт. Оставьте пустым, чтобы использовать выбранный вариант из списка ниже:", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "Рекомендуется выбрать +12 для конвертирования мужского голоса в женский и -12 для конвертирования женского в мужской. Если диапазон голоса слишком велик, и голос искажается, можно выбрать значение на свой вкус.", + "目标采样率": "Частота дискретизации аудио:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "Автоматически найденные файлы индексов черт (выберите вариант из списка):", + "融合": "Запустить слияние", + "要改的模型信息": "Информация, которая будет изменена:", + "要置入的模型信息": "Информация о модели:", + "训练": "Обучение модели", + "训练模型": "Обучить модель", + "训练特征索引": "Обучить индекс черт", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Обучение модели завершено. Журнал обучения можно просмотреть в консоли или в файле 'train.log' в папке с моделью.", + "设备类型": "设备类型", + "请指定说话人id": "Номер говорящего/поющего:", + "请选择index文件": "Пожалуйста, выберите файл индекса", + "请选择pth文件": "Пожалуйста, выберите файл pth", + "请选择说话人id": "Номер говорящего:", + "转换": "Преобразовать", + "输入实验名": "Название модели:", + "输入待处理音频文件夹路径": "Путь к папке с аудиофайлами для обработки:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Путь к папке с аудиофайлами для переработки (можно скопировать путь из адресной строки файлового менеджера):", + "输入待处理音频文件路径(默认是正确格式示例)": "Путь к аудиофайлу, который хотите обработать (ниже указан пример пути к файлу):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Использовать громкость входного файла для замены или перемешивания с громкостью выходного файла. 
Чем ближе соотношение к 1, тем больше используется звука из выходного файла:", + "输入监听": "输入监听", + "输入训练文件夹路径": "Путь к папке с аудиозаписями, на которых будет обучаться модель:", + "输入设备": "Входное устройство", + "输入降噪": "Уменьшение входного шума", + "输出信息": "Статистика", + "输出变声": "输出变声", + "输出设备": "Выходное устройство", + "输出降噪": "Уменьшение выходного шума", + "输出音频(右下角三个点,点了可以下载)": "Аудиофайл (чтобы скачать, нажмите на три точки справа в плеере)", + "选择.index文件": "Выбрать файл .index", + "选择.pth文件": "Выбрать файл .pth", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Выберите алгоритм оценки высоты голоса ('pm': работает быстро, но даёт низкое качество речи; 'harvest': басы лучше, но работает очень медленно; 'crepe': лучшее качество, но сильно нагружает GPU; 'rmvpe': лучшее качество и минимальная нагрузка на GPU):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Длина сэмпла", + "重载设备列表": "Обновить список устройств", + "音调设置": "Настройка высоты звука", + "音频设备": "Аудиоустройство", + "音高算法": "Алгоритм оценки высоты звука", + "额外推理时长": "Доп. время переработки" +} diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json new file mode 100644 index 0000000000000000000000000000000000000000..90c47d603f1836a0b455f037ea00a3aa1e415280 --- /dev/null +++ b/i18n/locale/tr_TR.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Eğer >=3 ise, elde edilen pitch sonuçlarına median filtreleme uygula. Bu değer, filtre yarıçapını temsil eder ve nefesliliği azaltabilir.", + "A模型权重": "A Modeli Ağırlığı:", + "A模型路径": "A Modeli Yolu:", + "B模型路径": "B Modeli Yolu:", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 eğrisi dosyası (isteğe bağlı). 
Her satırda bir pitch değeri bulunur. Varsayılan F0 ve pitch modülasyonunu değiştirir:", + "Index Rate": "Index Oranı", + "Onnx导出": "Onnx Dışa Aktar", + "Onnx输出路径": "Onnx Dışa Aktarım Yolu:", + "RVC模型路径": "RVC Model Yolu:", + "ckpt处理": "ckpt İşleme", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": ".index dosya yolu Çince karakter içeremez", + "pth文件路径不可包含中文": ".pth dosya yolu Çince karakter içeremez", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.", + "step1:正在处理数据": "Adım 1: Veri işleme", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):", + "step3: 填写训练设置, 开始训练模型和索引": "Adım 3: Eğitim ayarlarını doldurun ve modeli ve dizini eğitmeye başlayın", + "step3a:正在训练模型": "Adım 3a: Model eğitimi başladı", + "一键训练": "Tek Tuşla Eğit", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Ses dosyaları ayrıca toplu olarak, iki seçimle, öncelikli okuma klasörüyle içe aktarılabilir", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch işleme kullanarak vokal eşlik ayrımı için UVR5 modeli kullanılır.
Geçerli bir klasör yol formatı örneği: D:\\path\\to\\input\\folder (dosya yöneticisi adres çubuğundan kopyalanır).
Model üç kategoriye ayrılır:
1. Vokalleri koru: Bu seçeneği, harmoni içermeyen sesler için kullanın. HP5'ten daha iyi bir şekilde vokalleri korur. İki dahili model içerir: HP2 ve HP3. HP3, eşlik sesini hafifçe sızdırabilir, ancak vokalleri HP2'den biraz daha iyi korur.
2. Sadece ana vokalleri koru: Bu seçeneği, harmoni içeren sesler için kullanın. Ana vokalleri zayıflatabilir. Bir dahili model içerir: HP5.
3. Reverb ve gecikme giderme modelleri (FoxJoy tarafından):
  (1) MDX-Net: Stereo reverb'i kaldırmak için en iyi seçenek, ancak mono reverb'i kaldıramaz;
 (234) DeEcho: Gecikme efektlerini kaldırır. Agresif mod, Normal moda göre daha kapsamlı bir şekilde kaldırma yapar. DeReverb ayrıca reverb'i kaldırır ve mono reverb'i kaldırabilir, ancak yoğun yankılı yüksek frekanslı içerikler için çok etkili değildir.
Reverb/gecikme giderme notları:
1. DeEcho-DeReverb modelinin işleme süresi diğer iki DeEcho modeline göre yaklaşık olarak iki kat daha uzundur.
2. MDX-Net-Dereverb modeli oldukça yavaştır.
3. Tavsiye edilen en temiz yapılandırma önce MDX-Net'i uygulamak ve ardından DeEcho-Aggressive uygulamaktır.", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "GPU indekslerini '-' ile ayırarak girin, örneğin 0-1-2, GPU 0, 1 ve 2'yi kullanmak için:", + "伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "Kaydetme Adı:", + "保存的文件名, 默认空为和源文件同名": "Kaydedilecek dosya adı (varsayılan: kaynak dosya ile aynı):", + "保存的模型名不带后缀": "Kaydedilecek model adı (uzantı olmadan):", + "保存频率save_every_epoch": "Kaydetme sıklığı (save_every_epoch):", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Sessiz ünsüzleri ve nefes seslerini koruyarak elektronik müzikte yırtılma gibi sanal hataların oluşmasını engeller. 0.5 olarak ayarlandığında devre dışı kalır. Değerin azaltılması korumayı artırabilir, ancak indeksleme doğruluğunu azaltabilir:", + "修改": "Düzenle", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini düzenle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "停止音频转换": "Ses dönüştürmeyi durdur", + "全流程结束!": "Tüm işlemler tamamlandı!", + "刷新音色列表和索引路径": "Ses listesini ve indeks yolunu yenile", + "加载模型": "Model yükle", + "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", + "加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:", + "单次推理": "单次推理", + "卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", + "后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 
0 değeri için yeniden örnekleme yapılmaz:", + "否": "Hayır", + "启用相位声码器": "启用相位声码器", + "响应阈值": "Tepki eşiği", + "响度因子": "ses yüksekliği faktörü", + "处理数据": "Verileri işle", + "导出Onnx模型": "Onnx Modeli Dışa Aktar", + "导出文件格式": "Dışa aktarma dosya formatı", + "常见问题解答": "Sıkça Sorulan Sorular (SSS)", + "常规设置": "Genel ayarlar", + "开始音频转换": "Ses dönüştürmeyi başlat", + "很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.", + "性能设置": "Performans ayarları", + "总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir", + "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:", + "指定输出文件夹": "Çıkış klasörünü belirt:", + "指定输出非主人声文件夹": "Müzik ve diğer sesler için çıkış klasörünü belirtin:", + "推理时间(ms):": "Çıkarsama süresi (ms):", + "推理音色": "Ses çıkartma (Inference):", + "提取": "Çıkart", + "提取音高和处理数据使用的CPU进程数": "Ses yüksekliği çıkartmak (Pitch) ve verileri işlemek için kullanılacak CPU işlemci sayısı:", + "是": "Evet", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Sadece en son '.ckpt' dosyasını kaydet:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Her kaydetme noktasında son küçük bir modeli 'weights' klasörüne kaydetmek için:", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Tüm eğitim verilerini GPU belleğine önbelleğe alıp almayacağınızı belirtin. Küçük veri setlerini (10 dakikadan az) önbelleğe almak eğitimi hızlandırabilir, ancak büyük veri setlerini önbelleğe almak çok fazla GPU belleği tüketir ve çok fazla hız artışı sağlamaz:", + "显卡信息": "GPU Bilgisi", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Bu yazılım, MIT lisansı altında açık kaynaklıdır. Yazarın yazılım üzerinde herhangi bir kontrolü yoktur. Yazılımı kullanan ve yazılım tarafından dışa aktarılan sesleri dağıtan kullanıcılar sorumludur.
Eğer bu maddeyle aynı fikirde değilseniz, yazılım paketi içindeki herhangi bir kod veya dosyayı kullanamaz veya referans göremezsiniz. Detaylar için kök dizindeki Agreement-LICENSE.txt dosyasına bakınız.", + "查看": "Görüntüle", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini görüntüle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "检索特征占比": "Arama özelliği oranı (vurgu gücünü kontrol eder, çok yüksek olması sanal etkilere neden olur)", + "模型": "Model", + "模型推理": "Model çıkartma (Inference)", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model çıkartma (büyük dosya modeli yolunu 'logs' klasöründe girin). Bu, eğitimi yarıda bırakmak istediğinizde ve manuel olarak küçük bir model dosyası çıkartmak ve kaydetmek istediğinizde veya bir ara modeli test etmek istediğinizde kullanışlıdır:", + "模型是否带音高指导": "Modelin ses yüksekliği rehberi içerip içermediği:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Modelin ses yüksekliği (Pitch) rehberliği içerip içermediği (şarkı söyleme için şarttır, konuşma için isteğe bağlıdır):", + "模型是否带音高指导,1是0否": "Modelin ses yüksekliği rehberi içerip içermediği (1: evet, 0: hayır):", + "模型版本型号": "Model mimari versiyonu:", + "模型融合, 可用于测试音色融合": "Model birleştirme, ses rengi birleştirmesi için kullanılabilir", + "模型路径": "Model Yolu:", + "每张显卡的batch_size": "Her GPU için yığın boyutu (batch_size):", + "淡入淡出长度": "Geçiş (Fade) uzunluğu", + "版本": "Sürüm", + "特征提取": "Özellik çıkartma", + "特征检索库文件路径,为空则使用下拉的选择结果": "Özellik indeksi dosyasının yolunu belirtin. Seçilen sonucu kullanmak için boş bırakın veya açılır menüden seçim yapın.", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Erkekten kadına çevirmek için +12 tuş önerilir, kadından erkeğe çevirmek için ise -12 tuş önerilir. 
Eğer ses aralığı çok fazla genişler ve ses bozulursa, isteğe bağlı olarak uygun aralığa kendiniz de ayarlayabilirsiniz.", + "目标采样率": "Hedef örnekleme oranı:", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "İndeks yolunu otomatik olarak tespit et ve açılır menüden seçim yap.", + "融合": "Birleştir", + "要改的模型信息": "Düzenlenecek model bilgileri:", + "要置入的模型信息": "Eklemek için model bilgileri:", + "训练": "Eğitim", + "训练模型": "Modeli Eğit", + "训练特征索引": "Özellik Dizinini Eğit", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Eğitim tamamlandı. Eğitim günlüklerini konsolda veya deney klasörü altındaki train.log dosyasında kontrol edebilirsiniz.", + "设备类型": "设备类型", + "请指定说话人id": "Lütfen konuşmacı/sanatçı no belirtin:", + "请选择index文件": "Lütfen .index dosyası seçin", + "请选择pth文件": "Lütfen .pth dosyası seçin", + "请选择说话人id": "Konuşmacı/Şarkıcı No seçin:", + "转换": "Dönüştür", + "输入实验名": "Deneysel adı girin:", + "输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "İşlenecek ses klasörünün yolunu girin (dosya yöneticisinin adres çubuğundan kopyalayın):", + "输入待处理音频文件路径(默认是正确格式示例)": "İşlenecek ses dosyasının yolunu girin (varsayılan doğru format örneğidir):", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Sesin hacim zarfını ayarlayın. 0'a yakın değerler, sesin orijinal vokallerin hacmine benzer olmasını sağlar. Düşük bir değerle ses gürültüsünü maskeleyebilir ve hacmi daha doğal bir şekilde duyulabilir hale getirebilirsiniz. 
1'e yaklaştıkça sürekli bir yüksek ses seviyesi elde edilir:", + "输入监听": "输入监听", + "输入训练文件夹路径": "Eğitim klasörünün yolunu girin:", + "输入设备": "Giriş cihazı", + "输入降噪": "Giriş gürültü azaltma", + "输出信息": "Çıkış bilgisi", + "输出变声": "输出变声", + "输出设备": "Çıkış cihazı", + "输出降噪": "Çıkış gürültü azaltma", + "输出音频(右下角三个点,点了可以下载)": "Ses dosyasını dışa aktar (indirmek için sağ alt köşedeki üç noktaya tıklayın)", + "选择.index文件": ".index dosyası seç", + "选择.pth文件": ".pth dosyası seç", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": "Pitch algoritmasını seçin ('pm': daha hızlı çıkarır ancak daha düşük kaliteli konuşma; 'harvest': daha iyi konuşma sesi ancak son derece yavaş; 'crepe': daha da iyi kalite ancak GPU yoğunluğu gerektirir):", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "Örnekleme uzunluğu", + "重载设备列表": "Cihaz listesini yeniden yükle", + "音调设置": "Pitch ayarları", + "音频设备": "Ses cihazı", + "音高算法": "音高算法", + "额外推理时长": "Ekstra çıkartma süresi" +} diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json new file mode 100644 index 0000000000000000000000000000000000000000..6beb467b1bc2e9b3143004df0375ad3004441a59 --- /dev/null +++ b/i18n/locale/zh_CN.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音", + "A模型权重": "A模型权重", + "A模型路径": "A模型路径", + "B模型路径": "B模型路径", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调", + "Index Rate": "检索特征占比", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt处理", + "harvest进程数": "harvest进程数", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + 
"rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", + "step3: 填写训练设置, 开始训练模型和索引": "step3: 填写训练设置, 开始训练模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一键训练", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "保存名", + "保存的文件名, 默认空为和源文件同名": "保存的文件名, 默认空为和源文件同名", + "保存的模型名不带后缀": "保存的模型名不带后缀", + "保存频率save_every_epoch": "保存频率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型信息(仅支持weights文件夹下提取的小模型文件)", + "停止音频转换": "停止音频转换", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路径", + "加载模型": "加载模型", + "加载预训练底模D路径": "加载预训练底模D路径", + "加载预训练底模G路径": "加载预训练底模G路径", + "单次推理": "单次推理", + "卸载音色省显存": "卸载音色省显存", + "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "响应阈值", + "响度因子": "响度因子", + "处理数据": "处理数据", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "导出文件格式", + "常见问题解答": "常见问题解答", + "常规设置": "常规设置", + "开始音频转换": "开始音频转换", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "性能设置", + "总训练轮数total_epoch": "总训练轮数total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定输出文件夹", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理时间(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速", + "显卡信息": "显卡信息", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型信息(仅支持weights文件夹下提取的小模型文件)", + "检索特征占比": "检索特征占比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况", + "模型是否带音高指导": "模型是否带音高指导", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)", + "模型是否带音高指导,1是0否": "模型是否带音高指导,1是0否", + "模型版本型号": "模型版本型号", + "模型融合, 可用于测试音色融合": "模型融合, 可用于测试音色融合", + "模型路径": "模型路径", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出长度", + "版本": "版本", + "特征提取": "特征提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特征检索库文件路径,为空则使用下拉的选择结果", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ", + "目标采样率": "目标采样率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自动检测index路径,下拉式选择(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型信息", + "要置入的模型信息": "要置入的模型信息", + "训练": "训练", + "训练模型": "训练模型", + "训练特征索引": "训练特征索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "设备类型": "设备类型", + "请指定说话人id": "请指定说话人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "请选择说话人id", + "转换": "转换", + "输入实验名": "输入实验名", + "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)", + "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络", + "输入监听": "输入监听", + "输入训练文件夹路径": "输入训练文件夹路径", + "输入设备": "输入设备", + "输入降噪": "输入降噪", + "输出信息": "输出信息", + "输出变声": "输出变声", + "输出设备": "输出设备", + "输出降噪": "输出降噪", + "输出音频(右下角三个点,点了可以下载)": "输出音频(右下角三个点,点了可以下载)", + "选择.index文件": "选择.index文件", + "选择.pth文件": "选择.pth文件", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": 
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "采样长度", + "重载设备列表": "重载设备列表", + "音调设置": "音调设置", + "音频设备": "音频设备", + "音高算法": "音高算法", + "额外推理时长": "额外推理时长" +} diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json new file mode 100644 index 0000000000000000000000000000000000000000..60432010157f8c6d93fface327bca3cca056aff9 --- /dev/null +++ b/i18n/locale/zh_HK.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "设备类型": "设备类型", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": 
"選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "取樣長度", + "重载设备列表": "重載設備列表", + "音调设置": "音調設定", + "音频设备": "音訊設備", + "音高算法": "音高演算法", + "额外推理时长": "額外推理時長" +} diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json new file mode 100644 index 0000000000000000000000000000000000000000..60432010157f8c6d93fface327bca3cca056aff9 --- /dev/null +++ b/i18n/locale/zh_SG.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "设备类型": "设备类型", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": 
"選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU": "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢,rmvpe效果最好且微吃CPU/GPU", + "采样率:": "采样率:", + "采样长度": "取樣長度", + "重载设备列表": "重載設備列表", + "音调设置": "音調設定", + "音频设备": "音訊設備", + "音高算法": "音高演算法", + "额外推理时长": "額外推理時長" +} diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json new file mode 100644 index 0000000000000000000000000000000000000000..60432010157f8c6d93fface327bca3cca056aff9 --- /dev/null +++ b/i18n/locale/zh_TW.json @@ -0,0 +1,137 @@ +{ + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", + "A模型权重": "A模型權重", + "A模型路径": "A模型路徑", + "B模型路径": "B模型路徑", + "E:\\语音音频+标注\\米津玄师\\src": "E:\\语音音频+标注\\米津玄师\\src", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0曲線檔案,可選,一行一個音高,代替預設的F0及升降調", + "Index Rate": "Index Rate", + "Onnx导出": "Onnx导出", + "Onnx输出路径": "Onnx输出路径", + "RVC模型路径": "RVC模型路径", + "ckpt处理": "ckpt處理", + "harvest进程数": "harvest進程數", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", + "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", + "step3: 填写训练设置, 开始训练模型和索引": "步驟3: 填寫訓練設定, 開始訓練模型和索引", + "step3a:正在训练模型": "step3a:正在训练模型", + "一键训练": "一鍵訓練", + "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "以-分隔輸入使用的卡號, 例如 0-1-2 使用卡0和卡1和卡2", + "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "使用模型采样率": "使用模型采样率", + "使用设备采样率": "使用设备采样率", + "保存名": "儲存名", + "保存的文件名, 默认空为和源文件同名": "儲存的檔案名,預設空為與來源檔案同名", + "保存的模型名不带后缀": "儲存的模型名不帶副檔名", + "保存频率save_every_epoch": "保存頻率save_every_epoch", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "保護清輔音和呼吸聲,防止電音撕裂等artifact,拉滿0.5不開啟,調低加大保護力度但可能降低索引效果", + "修改": "修改", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "修改模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "停止音频转换": "停止音訊轉換", + "全流程结束!": "全流程结束!", + "刷新音色列表和索引路径": "刷新音色列表和索引路徑", + "加载模型": "載入模型", + "加载预训练底模D路径": "加載預訓練底模D路徑", + "加载预训练底模G路径": "加載預訓練底模G路徑", + "单次推理": "单次推理", + "卸载音色省显存": "卸載音色節省 VRAM", + "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", + "后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣", + "否": "否", + "启用相位声码器": "启用相位声码器", + "响应阈值": "響應閾值", + "响度因子": "響度因子", + "处理数据": "處理資料", + "导出Onnx模型": "导出Onnx模型", + "导出文件格式": "導出檔格式", + "常见问题解答": "常見問題解答", + "常规设置": "一般設定", + "开始音频转换": "開始音訊轉換", + "很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练", + "性能设置": "效能設定", + "总训练轮数total_epoch": "總訓練輪數total_epoch", + "批量推理": "批量推理", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。", + "指定输出主人声文件夹": "指定输出主人声文件夹", + "指定输出文件夹": "指定輸出資料夾", + "指定输出非主人声文件夹": "指定输出非主人声文件夹", + "推理时间(ms):": "推理時間(ms):", + "推理音色": "推理音色", + "提取": "提取", + "提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數", + "是": "是", + "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt檔案以節省硬碟空間", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights檔夾", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "是否緩存所有訓練集至 VRAM。小於10分鐘的小數據可緩存以加速訓練,大數據緩存會爆 VRAM 也加不了多少速度", + "显卡信息": "顯示卡資訊", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "查看": "查看", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "查看模型資訊(僅支援weights資料夾下提取的小模型檔案)", + "检索特征占比": "檢索特徵佔比", + "模型": "模型", + "模型推理": "模型推理", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "模型提取(輸入logs資料夾下大檔案模型路徑),適用於訓一半不想訓了模型沒有自動提取儲存小檔案模型,或者想測試中間模型的情況", + "模型是否带音高指导": "模型是否帶音高指導", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)", + "模型是否带音高指导,1是0否": "模型是否帶音高指導,1是0否", + "模型版本型号": "模型版本型號", + "模型融合, 可用于测试音色融合": "模型融合,可用於測試音色融合", + "模型路径": "模型路徑", + "每张显卡的batch_size": "每张显卡的batch_size", + "淡入淡出长度": "淡入淡出長度", + "版本": "版本", + "特征提取": "特徵提取", + "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", + "独占 WASAPI 设备": "独占 WASAPI 设备", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", + "目标采样率": "目標取樣率", + "算法延迟(ms):": "算法延迟(ms):", + "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", + "融合": "融合", + "要改的模型信息": "要改的模型資訊", + "要置入的模型信息": "要置入的模型資訊", + "训练": "訓練", + "训练模型": "訓練模型", + "训练特征索引": "訓練特徵索引", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", + "设备类型": "设备类型", + "请指定说话人id": "請指定說話人id", + "请选择index文件": "请选择index文件", + "请选择pth文件": "请选择pth文件", + "请选择说话人id": "請選擇說話人ID", + "转换": "轉換", + "输入实验名": "輸入實驗名稱", + "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)", + "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "輸入源音量包絡替換輸出音量包絡融合比例,越靠近1越使用輸出包絡", + "输入监听": "输入监听", + "输入训练文件夹路径": "輸入訓練檔案夾路徑", + "输入设备": "輸入設備", + "输入降噪": "輸入降噪", + "输出信息": "輸出訊息", + "输出变声": "输出变声", + "输出设备": "輸出設備", + "输出降噪": "輸出降噪", + "输出音频(右下角三个点,点了可以下载)": "輸出音頻(右下角三個點,點了可以下載)", + "选择.index文件": "選擇 .index 檔案", + "选择.pth文件": "選擇 .pth 檔案", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU": 
"""Synchronise every locale JSON file with the reference ``zh_CN.json``.

For each language file found in ``locale/``:

* keys present in the reference but absent from the language file are
  added, using the key itself as the placeholder translation;
* keys absent from the reference are dropped;
* the file is rewritten as UTF-8 JSON, 4-space indented, keys sorted.

Run from the ``i18n`` directory: all paths are relative to the current
working directory.
"""
import json
import os
from collections import OrderedDict

# Reference locale: its key set is the source of truth for all others.
standard_file = "locale/zh_CN.json"

# Directory scanned for locale files.
dir_path = "locale/"


def find_language_files(directory, reference):
    """Return the ``.json`` paths in *directory*, excluding *reference*.

    The comparison is made on the joined path: comparing the bare file
    name against ``reference`` (which carries the directory prefix) can
    never match, which would cause the reference file to be processed as
    if it were a translation of itself.
    """
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    return [
        path
        for path in candidates
        if path.endswith(".json") and path != reference
    ]


def sync_locale(standard_data, lang_data):
    """Return a new mapping holding exactly the keys of *standard_data*.

    Existing translations are kept, missing keys fall back to the key
    itself, extra keys are discarded.  The result follows the key order of
    *standard_data*; *lang_data* is not modified.  Runs in a single pass
    instead of calling ``list.index`` once per key.
    """
    return OrderedDict((key, lang_data.get(key, key)) for key in standard_data)


def main():
    """Rewrite every language file so it matches the reference key set."""
    with open(standard_file, "r", encoding="utf-8") as f:
        standard_data = json.load(f, object_pairs_hook=OrderedDict)

    for lang_file in find_language_files(dir_path, standard_file):
        with open(lang_file, "r", encoding="utf-8") as f:
            lang_data = json.load(f, object_pairs_hook=OrderedDict)

        lang_data = sync_locale(standard_data, lang_data)

        # sort_keys=True is kept so the on-disk layout of existing locale
        # files does not change; it makes the final order alphabetical.
        with open(lang_file, "w", encoding="utf-8") as f:
            json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
            f.write("\n")


if __name__ == "__main__":
    main()
def extract_i18n_strings(node):
    """Collect the string literals passed positionally to ``i18n(...)`` calls.

    The AST rooted at *node* is walked depth-first, in source order.  A
    call is considered only when its callee is the bare name ``i18n``
    (attribute calls such as ``obj.i18n(...)`` are ignored).  Non-string
    and non-literal arguments are skipped, but their sub-trees are still
    searched, so nested ``i18n`` calls are found.

    Args:
        node: any ``ast.AST`` node, typically the result of ``ast.parse``.

    Returns:
        A list of the literal strings, duplicates included, in traversal
        order.
    """
    i18n_strings = []

    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "i18n"
    ):
        for arg in node.args:
            # String literals are ast.Constant nodes on Python 3.8+.
            # ast.Str was deprecated in 3.8 and removed in 3.12, so relying
            # on it raises AttributeError on current interpreters.
            if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
                i18n_strings.append(arg.value)

    for child_node in ast.iter_child_nodes(node):
        i18n_strings.extend(extract_i18n_strings(child_node))

    return i18n_strings
def wav2(i, o, format):
    """Transcode the first audio stream of *i* into *o* using PyAV.

    Args:
        i: input handed to ``av.open`` for reading.
        o: output handed to ``av.open`` for writing.
        format: requested output format.  ``"m4a"`` is written into an
            ``mp4`` container; ``ogg`` is encoded with ``libvorbis`` and
            ``mp4``/``m4a`` with ``aac``.  Any other value is used as both
            the container format and the codec name.
    """
    # Container names and codec names differ for some formats.
    container_format = "mp4" if format == "m4a" else format
    codec_name = {"ogg": "libvorbis", "mp4": "aac"}.get(
        container_format, container_format
    )

    inp = av.open(i, "rb")
    try:
        out = av.open(o, "wb", format=container_format)
        try:
            ostream = out.add_stream(codec_name)

            for frame in inp.decode(audio=0):
                for p in ostream.encode(frame):
                    out.mux(p)

            # Passing None flushes whatever the encoder still buffers.
            for p in ostream.encode(None):
                out.mux(p)
        finally:
            # Close even when decoding/encoding fails so that file handles
            # are not leaked; output first, then input, as before.
            out.close()
    finally:
        inp.close()


def load_audio(file, sr):
    """Decode *file* to a mono float32 waveform resampled to *sr* Hz.

    Decoding is delegated to the ``ffmpeg`` command line tool through the
    ``ffmpeg-python`` package (same approach as
    https://github.com/openai/whisper/blob/main/whisper/audio.py#L26), so
    both must be installed.

    Args:
        file: path to the audio file; cleaned with ``clean_path`` first
            because pasted paths often carry spaces, quotes or newlines.
        sr: target sampling rate passed to ffmpeg as ``ar``.

    Returns:
        A 1-D ``numpy.float32`` array with the decoded samples.

    Raises:
        RuntimeError: if the path does not exist or decoding fails.  The
            message starts with ``"Failed to load audio: "`` and the
            original exception is attached as ``__cause__``.
    """
    try:
        file = clean_path(file)
        if not os.path.exists(file):
            raise RuntimeError(
                "You input a wrong audio path that does not exists, please fix it!"
            )
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        traceback.print_exc()
        raise RuntimeError(f"Failed to load audio: {e}") from e

    return np.frombuffer(out, np.float32).flatten()


def clean_path(path_str):
    """Normalise a user-supplied path string.

    On Windows forward slashes are converted to backslashes.  Leading and
    trailing spaces, double quotes and newlines are then removed in any
    combination.  A fixed sequence of single-character strips leaves junk
    behind when the characters are interleaved, e.g. ``'\\n "x" \\n'``
    would keep its quotes.
    """
    if platform.system() == "Windows":
        path_str = path_str.replace("/", "\\")
    return path_str.strip(' "\n')
import commons -from .modules import LayerNorm +from infer.lib.infer_pack import commons, modules +from infer.lib.infer_pack.modules import LayerNorm class Encoder(nn.Module): def __init__( self, - hidden_channels: int, - filter_channels: int, - n_heads: int, - n_layers: int, + hidden_channels, + filter_channels, + n_heads, + n_layers, kernel_size=1, p_dropout=0.0, window_size=10, + **kwargs ): - super().__init__() + super(Encoder, self).__init__() self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads @@ -77,16 +80,17 @@ class Encoder(nn.Module): class Decoder(nn.Module): def __init__( self, - hidden_channels: int, - filter_channels: int, - n_heads: int, - n_layers: int, + hidden_channels, + filter_channels, + n_heads, + n_layers, kernel_size=1, p_dropout=0.0, proximal_bias=False, proximal_init=True, + **kwargs ): - super().__init__() + super(Decoder, self).__init__() self.hidden_channels = hidden_channels self.filter_channels = filter_channels self.n_heads = n_heads @@ -133,13 +137,7 @@ class Decoder(nn.Module): ) self.norm_layers_2.append(LayerNorm(hidden_channels)) - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - h: torch.Tensor, - h_mask: torch.Tensor, - ): + def forward(self, x, x_mask, h, h_mask): """ x: decoder input h: encoder output @@ -168,17 +166,17 @@ class Decoder(nn.Module): class MultiHeadAttention(nn.Module): def __init__( self, - channels: int, - out_channels: int, - n_heads: int, + channels, + out_channels, + n_heads, p_dropout=0.0, - window_size: int = None, + window_size=None, heads_share=True, - block_length: int = None, + block_length=None, proximal_bias=False, proximal_init=False, ): - super().__init__() + super(MultiHeadAttention, self).__init__() assert channels % n_heads == 0 self.channels = channels @@ -289,7 +287,7 @@ class MultiHeadAttention(nn.Module): ) # [b, n_h, t_t, d_k] -> [b, d, t_t] return output, p_attn - def _matmul_with_relative_values(self, x: torch.Tensor, y: 
torch.Tensor): + def _matmul_with_relative_values(self, x, y): """ x: [b, h, l, m] y: [h or 1, m, d] @@ -298,7 +296,7 @@ class MultiHeadAttention(nn.Module): ret = torch.matmul(x, y.unsqueeze(0)) return ret - def _matmul_with_relative_keys(self, x: torch.Tensor, y: torch.Tensor): + def _matmul_with_relative_keys(self, x, y): """ x: [b, h, l, d] y: [h or 1, m, d] @@ -307,8 +305,8 @@ class MultiHeadAttention(nn.Module): ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) return ret - def _get_relative_embeddings(self, relative_embeddings: torch.Tensor, length: int): - # max_relative_position = 2 * self.window_size + 1 + def _get_relative_embeddings(self, relative_embeddings, length: int): + max_relative_position = 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. pad_length: int = max(length - (self.window_size + 1), 0) slice_start_position = max((self.window_size + 1) - length, 0) @@ -326,7 +324,7 @@ class MultiHeadAttention(nn.Module): ] return used_relative_embeddings - def _relative_position_to_absolute_position(self, x: torch.Tensor): + def _relative_position_to_absolute_position(self, x): """ x: [b, h, l, 2*l-1] ret: [b, h, l, l] @@ -353,7 +351,7 @@ class MultiHeadAttention(nn.Module): ] return x_final - def _absolute_position_to_relative_position(self, x: torch.Tensor): + def _absolute_position_to_relative_position(self, x): """ x: [b, h, l, l] ret: [b, h, l, 2*l-1] @@ -390,15 +388,15 @@ class MultiHeadAttention(nn.Module): class FFN(nn.Module): def __init__( self, - in_channels: int, - out_channels: int, - filter_channels: int, - kernel_size: int, + in_channels, + out_channels, + filter_channels, + kernel_size, p_dropout=0.0, activation: str = None, causal=False, ): - super().__init__() + super(FFN, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.filter_channels = filter_channels @@ -434,7 +432,7 @@ class FFN(nn.Module): x = self.conv_2(self.padding(x, x_mask)) return x * x_mask - def 
_causal_padding(self, x: torch.Tensor): + def _causal_padding(self, x): if self.kernel_size == 1: return x pad_l: int = self.kernel_size - 1 @@ -447,7 +445,7 @@ class FFN(nn.Module): ) return x - def _same_padding(self, x: torch.Tensor): + def _same_padding(self, x): if self.kernel_size == 1: return x pad_l: int = (self.kernel_size - 1) // 2 diff --git a/zerorvc/synthesizer/commons.py b/infer/lib/infer_pack/commons.py similarity index 89% rename from zerorvc/synthesizer/commons.py rename to infer/lib/infer_pack/commons.py index 909800d14771c9194f8d1aa41c53405b0d78a10d..4ec6c244e228647b125429f62b8c9fddbe40eba9 100644 --- a/zerorvc/synthesizer/commons.py +++ b/infer/lib/infer_pack/commons.py @@ -1,7 +1,9 @@ from typing import List, Optional import math +import numpy as np import torch +from torch import nn from torch.nn import functional as F @@ -11,7 +13,7 @@ def init_weights(m, mean=0.0, std=0.01): m.weight.data.normal_(mean, std) -def get_padding(kernel_size: int, dilation=1): +def get_padding(kernel_size, dilation=1): return int((kernel_size * dilation - dilation) / 2) @@ -21,9 +23,7 @@ def get_padding(kernel_size: int, dilation=1): # return pad_shape -def kl_divergence( - m_p: torch.Tensor, logs_p: torch.Tensor, m_q: torch.Tensor, logs_q: torch.Tensor -): +def kl_divergence(m_p, logs_p, m_q, logs_q): """KL(P||Q)""" kl = (logs_q - logs_p) - 0.5 kl += ( @@ -38,12 +38,12 @@ def rand_gumbel(shape): return -torch.log(-torch.log(uniform_samples)) -def rand_gumbel_like(x: torch.Tensor): +def rand_gumbel_like(x): g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) return g -def slice_segments(x: torch.Tensor, ids_str, segment_size=4): +def slice_segments(x, ids_str, segment_size=4): ret = torch.zeros_like(x[:, :, :segment_size]) for i in range(x.size(0)): idx_str = ids_str[i] @@ -52,7 +52,7 @@ def slice_segments(x: torch.Tensor, ids_str, segment_size=4): return ret -def slice_segments2(x: torch.Tensor, ids_str, segment_size=4): +def slice_segments2(x, ids_str, 
class TextEncoder(nn.Module):
    """Encode phone features (plus optional coarse pitch) into prior statistics.

    Phone features go through a linear layer, optionally summed with a
    256-entry pitch embedding, then through ``attentions.Encoder`` and a
    1x1 convolution that yields ``2 * out_channels`` channels, split into
    mean and log-scale.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        dropout = float(p_dropout)

        # Hyper-parameters kept on the instance.
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = dropout

        # Sub-modules; creation order is preserved.
        self.emb_phone = nn.Linear(in_channels, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0 == True:
            # Pitch is quantised into 256 bins.
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            dropout,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self,
        phone: torch.Tensor,
        pitch: torch.Tensor,
        lengths: torch.Tensor,
        skip_head: Optional[torch.Tensor] = None,
    ):
        """Return ``(m, logs, x_mask)`` for the given inputs.

        When ``skip_head`` is given, that many leading frames are dropped
        from the encoder output and from the mask before projection.
        """
        x = self.emb_phone(phone)
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = x.transpose(1, -1)  # [b, h, t]

        valid = commons.sequence_mask(lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)
        x = self.encoder(x * x_mask, x_mask)

        if skip_head is not None:
            # The isinstance assert narrows the Optional type for TorchScript.
            assert isinstance(skip_head, torch.Tensor)
            head = int(skip_head.item())
            x = x[:, :, head:]
            x_mask = x_mask[:, :, head:]

        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
class PosteriorEncoder(nn.Module):
    """Encode an input tensor into a sampled latent and its Gaussian statistics.

    Pipeline: 1x1 conv -> ``modules.WN`` stack (optionally conditioned on
    ``g``) -> 1x1 conv producing ``2 * out_channels`` channels that are
    split into mean and log-scale.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()

        # Hyper-parameters kept on the instance.
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        # Sub-modules; creation order is preserved.
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
    ):
        """Return ``(z, m, logs, x_mask)``.

        ``z`` is drawn as ``m + eps * exp(logs)`` with ``eps`` standard
        normal noise, and is masked by ``x_mask``.
        """
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        eps = torch.randn_like(m)
        z = (m + eps * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegate weight-norm removal to the WN stack."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Remove a weight-norm hook registered directly on ``self.enc``, then return self."""
        for hook in self.enc._forward_pre_hooks.values():
            # The WeightNorm class is matched by module and class name
            # rather than by isinstance.
            from_weight_norm = hook.__module__ == "torch.nn.utils.weight_norm"
            if from_weight_norm and hook.__class__.__name__ == "WeightNorm":
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
+ ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward( + self, + x: torch.Tensor, + g: Optional[torch.Tensor] = None, + n_res: Optional[torch.Tensor] = None, + ): + if n_res is not None: + assert isinstance(n_res, torch.Tensor) + n = int(n_res.item()) + if n != x.shape[-1]: + x = F.interpolate(x, size=n, mode="linear") + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is 
class SineGen(torch.nn.Module):
    """Sine-wave excitation generator driven by an F0 contour.

    Constructor arguments:
        samp_rate: sampling rate in Hz.
        harmonic_num: number of harmonic overtones on top of the
            fundamental (default 0); the output has ``harmonic_num + 1``
            channels.
        sine_amp: amplitude of the sine waveform (default 0.1).
        noise_std: std of the Gaussian noise added in voiced regions
            (default 0.003).
        voiced_threshold: F0 values strictly above this are treated as
            voiced (default 0).
        flag_for_pulse: accepted for signature compatibility; it is not
            stored or read anywhere in this class.
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a voiced/unvoiced mask: 1 where ``f0 > voiced_threshold``, else 0."""
        # generate uv signal
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        if uv.device.type == "privateuseone":  # for DirectML
            uv = uv.float()
        return uv

    def forward(self, f0: torch.Tensor, upp: int):
        """Generate the sine excitation for an F0 contour.

        Args:
            f0: frame-level F0 in Hz; unvoiced frames should be 0.
                NOTE(review): the ``f0[:, None].transpose(1, 2)`` below
                implies a 2-D ``(batch, frames)`` input - confirm against
                callers.
            upp: integer upsampling factor from frames to samples.

        Returns:
            ``(sine_waves, uv, noise)``, each with ``frames * upp`` steps
            along dim 1; ``sine_waves`` and ``noise`` have ``self.dim``
            channels, ``uv`` has one.
        """
        with torch.no_grad():
            # Bring f0 to (batch, frames, 1).
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in range(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Per-frame phase increment in cycles, wrapped to [0, 1).
            rad_values = (
                f0_buf / self.sampling_rate
            ) % 1  # the % 1 here means the per-harmonic products cannot be optimised in post-processing
            # Random initial phase per channel; the fundamental starts at 0.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(
                rad_values, 1
            )  # % 1  # applying % 1 here would mean the cumsum below could not be optimised further
            # Scale the accumulated phase to sample resolution, then
            # upsample it (linear) and the increments (nearest) by upp.
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=float(upp),
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(
                2, 1
            )
            tmp_over_one %= 1
            # Wherever the wrapped phase decreases, a full cycle was
            # crossed: subtract 1 there so the running sum below stays
            # bounded.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(2, 1)
            # Voiced: small additive noise (noise_std).
            # Unvoiced: noise only, with amplitude sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class GeneratorNSF(torch.nn.Module):
    """Upsampling generator with a sine-source excitation branch.

    The latent ``x`` is upsampled stage by stage with weight-normalised
    transposed convolutions.  At every stage the harmonic excitation from
    ``SourceModuleHnNSF`` is brought to the current resolution by a strided
    convolution and added, then the averaged output of ``num_kernels``
    residual blocks is taken.

    Args:
        initial_channel: channel count of the input latent.
        resblock: ``"1"`` selects ``modules.ResBlock1``; anything else
            selects ``modules.ResBlock2``.
        resblock_kernel_sizes, resblock_dilation_sizes: per-resblock
            kernel sizes and dilations, zipped together.
        upsample_rates, upsample_kernel_sizes: stride and kernel size of
            each upsampling stage, zipped together.
        upsample_initial_channel: channels after ``conv_pre``; halved at
            every stage.
        gin_channels: channels of the conditioning input ``g``; ``0``
            disables the conditioning convolution.
        sr: sampling rate in Hz handed to the source module.
        is_half: forwarded to ``SourceModuleHnNSF``.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        # Not used by forward() in this class; kept so the module set is
        # unchanged.
        self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            if i + 1 < len(upsample_rates):
                # The excitation is at full sample rate; stride by the
                # product of the remaining upsample rates so it lines up
                # with this stage's resolution.
                stride_f0 = math.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                # Last stage already runs at full resolution.
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor from frames to samples.
        self.upp = math.prod(upsample_rates)

        self.lrelu_slope = modules.LRELU_SLOPE

    def forward(
        self,
        x,
        f0,
        g: Optional[torch.Tensor] = None,
        n_res: Optional[torch.Tensor] = None,
    ):
        """Synthesise a waveform from latent ``x`` and pitch contour ``f0``.

        Args:
            x: latent input to ``conv_pre``.
            f0: pitch contour handed to the source module.
            g: optional conditioning, added after ``conv_pre`` via
                ``self.cond``.
            n_res: optional single-element tensor.  When given, ``x`` is
                linearly resampled to that many frames and the excitation
                to ``n_res * upp`` samples.

        Returns:
            The ``tanh``-squashed single-channel output of ``conv_post``.
        """
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        if n_res is not None:
            assert isinstance(n_res, torch.Tensor)
            n = int(n_res.item())
            if n * self.upp != har_source.shape[-1]:
                har_source = F.interpolate(har_source, size=n * self.upp, mode="linear")
            if n != x.shape[-1]:
                x = F.interpolate(x, size=n, mode="linear")
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        # torch.jit.script() does not support direct indexing of torch modules
        # That's why I wrote this
        for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
            if i < self.num_upsamples:
                x = F.leaky_relu(x, self.lrelu_slope)
                x = ups(x)
                x_source = noise_convs(har_source)
                x = x + x_source
                xs: Optional[torch.Tensor] = None
                # Indices of the residual blocks that belong to stage i.
                l = [i * self.num_kernels + j for j in range(self.num_kernels)]
                for j, resblock in enumerate(self.resblocks):
                    if j in l:
                        if xs is None:
                            xs = resblock(x)
                        else:
                            xs += resblock(x)
                # This assertion cannot be ignored! \
                # If ignored, it will cause torch.jit.script() compilation errors
                assert isinstance(xs, torch.Tensor)
                x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Remove weight-norm hooks before TorchScript compilation; returns self."""
        for l in self.ups:
            # Iterate over a snapshot: remove_weight_norm deletes the hook
            # from the very dict being walked.
            for hook in list(l._forward_pre_hooks.values()):
                # The hook we want to remove is an instance of WeightNorm class, so
                # normally we would do `if isinstance(...)` but this class is not accessible
                # because of shadowing, so we check the module name directly.
                # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
                if (
                    hook.__module__ == "torch.nn.utils.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(l)
        for l in self.resblocks:
            # Inspect the hooks of the block itself.  Looking at the hooks
            # of the enclosing ModuleList (self.resblocks) never matches a
            # per-block hook.
            for hook in list(l._forward_pre_hooks.values()):
                if (
                    hook.__module__ == "torch.nn.utils.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(l)
        return self
resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + pitchf: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + ds: Optional[torch.Tensor] = None, + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + if 
skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + flow_head = torch.clamp(skip_head - 24, min=0) + dec_head = head - int(flow_head.item()) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, flow_head) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + z = z[:, :, dec_head : dec_head + length] + x_mask = x_mask[:, :, dec_head : dec_head + length] + nsff0 = nsff0[:, head : head + length] + else: + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid, self).__init__( + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ) + del self.enc_p + self.enc_p = TextEncoder( + 768, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + 
spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs256NSFsid_nono, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + 256, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() 
+ if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, 
torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + flow_head = torch.clamp(skip_head - 24, min=0) + dec_head = head - int(flow_head.item()) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths, flow_head) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + z = z[:, :, dec_head : dec_head + length] + x_mask = x_mask[:, :, dec_head : dec_head + length] + else: + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g, n_res=return_length2) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMs256NSFsid_nono): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid_nono, self).__init__( + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ) + del self.enc_p + self.enc_p = TextEncoder( + 768, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + 
# periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + 
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + if has_xpu and x.dtype == torch.bfloat16: + x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to( + dtype=torch.bfloat16 + ) + else: + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, 
fmap diff --git a/zerorvc/synthesizer/models.py b/infer/lib/infer_pack/models_onnx.py similarity index 64% rename from zerorvc/synthesizer/models.py rename to infer/lib/infer_pack/models_onnx.py index b13b13849dee8e2e0eae9ff5f8b5e30bbfa1ab14..a6d321f753f3b7ba5d2132ca98519eae2c493d7c 100644 --- a/zerorvc/synthesizer/models.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -1,31 +1,29 @@ import math import logging -from typing import List, Literal, Optional +logger = logging.getLogger(__name__) + +import numpy as np import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm -from torch.nn.utils.parametrizations import weight_norm -from huggingface_hub import PyTorchModelHubMixin +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm -from . import attentions, commons, modules -from .commons import get_padding, init_weights +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights -logger = logging.getLogger(__name__) - -class TextEncoder(nn.Module): +class TextEncoder256(nn.Module): def __init__( self, - in_channels: int, - out_channels: int, - hidden_channels: int, - filter_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: float, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, f0=True, ): super().__init__() @@ -35,29 +33,65 @@ class TextEncoder(nn.Module): self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.emb_phone = nn.Linear(in_channels, hidden_channels) + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = 
attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward( + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( self, - phone: torch.Tensor, - pitch: torch.Tensor, - lengths: torch.Tensor, - skip_head: Optional[torch.Tensor] = None, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, ): - if pitch is None: + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) @@ -68,12 +102,8 @@ class TextEncoder(nn.Module): x.dtype ) x = self.encoder(x * x_mask, x_mask) - if skip_head is not None: - assert 
isinstance(skip_head, torch.Tensor) - head = int(skip_head.item()) - x = x[:, :, head:] - x_mask = x_mask[:, :, head:] stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) return m, logs, x_mask @@ -81,11 +111,11 @@ class TextEncoder(nn.Module): class ResidualCouplingBlock(nn.Module): def __init__( self, - channels: int, - hidden_channels: int, - kernel_size: int, - dilation_rate: float, - n_layers: int, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, n_flows=4, gin_channels=0, ): @@ -113,19 +143,13 @@ class ResidualCouplingBlock(nn.Module): ) self.flows.append(modules.Flip()) - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse: bool = False, - ): + def forward(self, x, x_mask, g=None, reverse=False): if not reverse: for flow in self.flows: x, _ = flow(x, x_mask, g=g, reverse=reverse) else: - for flow in self.flows[::-1]: - x, _ = flow.forward(x, x_mask, g=g, reverse=reverse) + for flow in reversed(self.flows): + x, _ = flow(x, x_mask, g=g, reverse=reverse) return x def remove_weight_norm(self): @@ -136,12 +160,12 @@ class ResidualCouplingBlock(nn.Module): class PosteriorEncoder(nn.Module): def __init__( self, - in_channels: int, - out_channels: int, - hidden_channels: int, - kernel_size: int, - dilation_rate: float, - n_layers: int, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, gin_channels=0, ): super().__init__() @@ -163,9 +187,7 @@ class PosteriorEncoder(nn.Module): ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward( - self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None - ): + def forward(self, x, x_lengths, g=None): x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( x.dtype ) @@ -183,19 +205,19 @@ class PosteriorEncoder(nn.Module): class Generator(torch.nn.Module): def __init__( self, - initial_channel: int, - 
resblock: Literal["1", "2"], + initial_channel, + resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel: int, + upsample_initial_channel, upsample_kernel_sizes, gin_channels=0, ): - super().__init__() + super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.conv_pre = nn.Conv1d( + self.conv_pre = Conv1d( initial_channel, upsample_initial_channel, 7, 1, padding=3 ) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 @@ -204,7 +226,7 @@ class Generator(torch.nn.Module): for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): self.ups.append( weight_norm( - nn.ConvTranspose1d( + ConvTranspose1d( upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, @@ -222,23 +244,13 @@ class Generator(torch.nn.Module): ): self.resblocks.append(resblock(ch, k, d)) - self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) self.ups.apply(init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward( - self, - x: torch.Tensor, - g: Optional[torch.Tensor] = None, - n_res: Optional[torch.Tensor] = None, - ): - if n_res is not None: - assert isinstance(n_res, torch.Tensor) - n = int(n_res.item()) - if n != x.shape[-1]: - x = F.interpolate(x, size=n, mode="linear") + def forward(self, x, g=None): x = self.conv_pre(x) if g is not None: x = x + self.cond(g) @@ -279,7 +291,7 @@ class SineGen(torch.nn.Module): voiced_thoreshold: F0 threshold for U/V classification (default 0) flag_for_pulse: this SinGen is used inside PulseGen (default False) Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(torch.pi) or cos(0) + segment is always sin(np.pi) or cos(0) """ def __init__( @@ -291,7 +303,7 @@ class SineGen(torch.nn.Module): voiced_threshold=0, 
flag_for_pulse=False, ): - super().__init__() + super(SineGen, self).__init__() self.sine_amp = sine_amp self.noise_std = noise_std self.harmonic_num = harmonic_num @@ -305,7 +317,7 @@ class SineGen(torch.nn.Module): uv = uv * (f0 > self.voiced_threshold) return uv - def forward(self, f0: torch.Tensor, upp: int): + def forward(self, f0, upp): """sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 @@ -317,7 +329,7 @@ class SineGen(torch.nn.Module): f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component f0_buf[:, :, 0] = f0[:, :, 0] - for idx in range(self.harmonic_num): + for idx in np.arange(self.harmonic_num): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( idx + 2 ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic @@ -335,12 +347,12 @@ class SineGen(torch.nn.Module): tmp_over_one *= upp tmp_over_one = F.interpolate( tmp_over_one.transpose(2, 1), - scale_factor=float(upp), + scale_factor=upp, mode="linear", align_corners=True, ).transpose(2, 1) rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose( 2, 1 ) ####### @@ -349,12 +361,12 @@ class SineGen(torch.nn.Module): cumsum_shift = torch.zeros_like(rad_values) cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi ) sine_waves = sine_waves * self.sine_amp uv = self._f02uv(f0) uv = F.interpolate( - uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + uv.transpose(2, 1), scale_factor=upp, mode="nearest" ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) @@ -382,16 +394,18 @@ class SourceModuleHnNSF(torch.nn.Module): def __init__( self, - sampling_rate: int, + 
sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0, + is_half=True, ): - super().__init__() + super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std + self.is_half = is_half # to produce sine waveforms self.l_sin_gen = SineGen( sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod @@ -400,17 +414,11 @@ class SourceModuleHnNSF(torch.nn.Module): # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - # self.ddtype:int = -1 - def forward(self, x: torch.Tensor, upp: int = 1): - # if self.ddtype ==-1: - # self.ddtype = self.l_linear.weight.dtype + def forward(self, x, upp=None): sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - # if sine_wavs.dtype != self.l_linear.weight.dtype: - sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + if self.is_half: + sine_wavs = sine_wavs.half() sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None # noise, uv @@ -427,18 +435,18 @@ class GeneratorNSF(torch.nn.Module): upsample_kernel_sizes, gin_channels, sr, + is_half=False, ): - super().__init__() + super(GeneratorNSF, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) - self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sampling_rate=sr, - harmonic_num=0, + sampling_rate=sr, harmonic_num=0, is_half=is_half ) self.noise_convs = nn.ModuleList() - self.conv_pre = nn.Conv1d( + self.conv_pre = Conv1d( initial_channel, upsample_initial_channel, 7, 1, padding=3 ) resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 @@ -448,7 
+456,7 @@ class GeneratorNSF(torch.nn.Module): c_cur = upsample_initial_channel // (2 ** (i + 1)) self.ups.append( weight_norm( - nn.ConvTranspose1d( + ConvTranspose1d( upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, @@ -458,9 +466,9 @@ class GeneratorNSF(torch.nn.Module): ) ) if i + 1 < len(upsample_rates): - stride_f0 = math.prod(upsample_rates[i + 1 :]) + stride_f0 = np.prod(upsample_rates[i + 1 :]) self.noise_convs.append( - nn.Conv1d( + Conv1d( 1, c_cur, kernel_size=stride_f0 * 2, @@ -469,7 +477,7 @@ class GeneratorNSF(torch.nn.Module): ) ) else: - self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1)) + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() for i in range(len(self.ups)): @@ -479,54 +487,36 @@ class GeneratorNSF(torch.nn.Module): ): self.resblocks.append(resblock(ch, k, d)) - self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) self.ups.apply(init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - self.upp = math.prod(upsample_rates) + self.upp = np.prod(upsample_rates) - self.lrelu_slope = modules.LRELU_SLOPE - - def forward( - self, - x, - f0, - g: Optional[torch.Tensor] = None, - n_res: Optional[torch.Tensor] = None, - ): + def forward(self, x, f0, g=None): har_source, noi_source, uv = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) - if n_res is not None: - assert isinstance(n_res, torch.Tensor) - n = int(n_res.item()) - if n * self.upp != har_source.shape[-1]: - har_source = F.interpolate(har_source, size=n * self.upp, mode="linear") - if n != x.shape[-1]: - x = F.interpolate(x, size=n, mode="linear") x = self.conv_pre(x) if g is not None: x = x + self.cond(g) - for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): - if i < self.num_upsamples: - x = F.leaky_relu(x, self.lrelu_slope) - x = ups(x) - x_source = 
noise_convs(har_source) - x = x + x_source - xs: torch.Tensor = None - l = [i * self.num_kernels + j for j in range(self.num_kernels)] - for j, resblock in enumerate(self.resblocks): - if j in l: - if xs is None: - xs = resblock(x) - else: - xs += resblock(x) - x = xs / self.num_kernels + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) - return x def remove_weight_norm(self): @@ -536,7 +526,14 @@ class GeneratorNSF(torch.nn.Module): l.remove_weight_norm() -class SynthesizerTrnMs256NSFsid(nn.Module): +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMsNSFsidM(nn.Module): def __init__( self, spec_channels, @@ -557,8 +554,12 @@ class SynthesizerTrnMs256NSFsid(nn.Module): spk_embed_dim, gin_channels, sr, + version, + **kwargs, ): super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels @@ -566,7 +567,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): self.n_heads = n_heads self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) + self.p_dropout = p_dropout self.resblock = resblock self.resblock_kernel_sizes = resblock_kernel_sizes self.resblock_dilation_sizes = resblock_dilation_sizes @@ -577,16 +578,26 @@ class SynthesizerTrnMs256NSFsid(nn.Module): self.gin_channels = gin_channels # self.hop_length = hop_length# self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder( - 256, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) + if 
version == "v1": + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + else: + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) self.dec = GeneratorNSF( inter_channels, resblock, @@ -597,6 +608,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, + is_half=kwargs["is_half"], ) self.enc_q = PosteriorEncoder( spec_channels, @@ -611,134 +623,72 @@ class SynthesizerTrnMs256NSFsid(nn.Module): inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels ) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + self.speaker_map = None logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) + f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}" ) def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() - if hasattr(self, "enc_q"): - self.enc_q.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self, n_speaker): + self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) + for i in range(n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) - def forward( - self, - phone: torch.Tensor, - phone_lengths: torch.Tensor, - pitch: torch.Tensor, - pitchf: torch.Tensor, - y: torch.Tensor, - y_lengths: 
torch.Tensor, - ds: Optional[torch.Tensor] = None, - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o - def infer( - self, - phone: torch.Tensor, - phone_lengths: torch.Tensor, - pitch: torch.Tensor, - nsff0: torch.Tensor, - sid: torch.Tensor, - skip_head: Optional[torch.Tensor] = None, - return_length: Optional[torch.Tensor] = None, - return_length2: Optional[torch.Tensor] = None, - ): - g = self.emb_g(sid).unsqueeze(-1) - if skip_head is not None and return_length is not None: - assert isinstance(skip_head, torch.Tensor) - assert isinstance(return_length, torch.Tensor) - head = int(skip_head.item()) - length = int(return_length.item()) - flow_head = torch.clamp(skip_head - 24, min=0) - dec_head = head - int(flow_head.item()) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, flow_head) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - z = z[:, :, dec_head : dec_head + length] - x_mask = x_mask[:, :, dec_head : dec_head + length] - nsff0 = nsff0[:, head : head + length] - else: - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * 
torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2) - return o, x_mask, (z, z_p, m_p, logs_p) +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] -class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid, PyTorchModelHubMixin): - def __init__( - self, - spec_channels: int, - segment_size: int, - inter_channels: int, - hidden_channels: int, - filter_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: float, - resblock: Literal["1", "2"], - resblock_kernel_sizes: List[int], - resblock_dilation_sizes: list[list[int]], - upsample_rates: list[int], - upsample_initial_channel: int, - upsample_kernel_sizes: list[int], - spk_embed_dim: int, - gin_channels: int, - sr: int, - ): - super().__init__( - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - ) - del self.enc_p - self.enc_p = TextEncoder( - 768, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + 
y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) -class MultiPeriodDiscriminator(torch.nn.Module): + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): def __init__(self, use_spectral_norm=False): - super().__init__() + super(MultiPeriodDiscriminatorV2, self).__init__() # periods = [2, 3, 5, 7, 11, 17] periods = [2, 3, 5, 7, 11, 17, 23, 37] @@ -768,19 +718,19 @@ class MultiPeriodDiscriminator(torch.nn.Module): class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): - super().__init__() + super(DiscriminatorS, self).__init__() norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList( [ - norm_f(nn.Conv1d(1, 16, 15, 1, padding=7)), - norm_f(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)), + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), ] ) - self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1)) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) def forward(self, x): fmap = [] @@ -798,14 +748,14 @@ class DiscriminatorS(torch.nn.Module): class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super().__init__() + super(DiscriminatorP, self).__init__() self.period = period self.use_spectral_norm = use_spectral_norm norm_f = weight_norm if use_spectral_norm == False else spectral_norm self.convs = nn.ModuleList( [ norm_f( - nn.Conv2d( + 
Conv2d( 1, 32, (kernel_size, 1), @@ -814,7 +764,7 @@ class DiscriminatorP(torch.nn.Module): ) ), norm_f( - nn.Conv2d( + Conv2d( 32, 128, (kernel_size, 1), @@ -823,7 +773,7 @@ class DiscriminatorP(torch.nn.Module): ) ), norm_f( - nn.Conv2d( + Conv2d( 128, 512, (kernel_size, 1), @@ -832,7 +782,7 @@ class DiscriminatorP(torch.nn.Module): ) ), norm_f( - nn.Conv2d( + Conv2d( 512, 1024, (kernel_size, 1), @@ -841,7 +791,7 @@ class DiscriminatorP(torch.nn.Module): ) ), norm_f( - nn.Conv2d( + Conv2d( 1024, 1024, (kernel_size, 1), @@ -851,7 +801,7 @@ class DiscriminatorP(torch.nn.Module): ), ] ) - self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) def forward(self, x): fmap = [] diff --git a/zerorvc/synthesizer/modules.py b/infer/lib/infer_pack/modules.py similarity index 76% rename from zerorvc/synthesizer/modules.py rename to infer/lib/infer_pack/modules.py index 4c3b18b07b004f44cef49523357f682b71f77769..51aeaf0799819c59714aeed0c8b6a3f8b2872f36 100644 --- a/zerorvc/synthesizer/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -1,22 +1,25 @@ +import copy import math from typing import Optional, Tuple +import numpy as np +import scipy import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils import remove_weight_norm, weight_norm -from . 
import commons -from .commons import get_padding, init_weights -from .transforms import piecewise_rational_quadratic_transform +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import get_padding, init_weights +from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 class LayerNorm(nn.Module): def __init__(self, channels, eps=1e-5): - super().__init__() + super(LayerNorm, self).__init__() self.channels = channels self.eps = eps @@ -39,7 +42,7 @@ class ConvReluNorm(nn.Module): n_layers, p_dropout, ): - super().__init__() + super(ConvReluNorm, self).__init__() self.in_channels = in_channels self.hidden_channels = hidden_channels self.out_channels = out_channels @@ -87,7 +90,7 @@ class DDSConv(nn.Module): """ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() + super(DDSConv, self).__init__() self.channels = channels self.kernel_size = kernel_size self.n_layers = n_layers @@ -130,7 +133,7 @@ class DDSConv(nn.Module): return x * x_mask -class WN(nn.Module): +class WN(torch.nn.Module): def __init__( self, hidden_channels, @@ -140,7 +143,7 @@ class WN(nn.Module): gin_channels=0, p_dropout=0, ): - super().__init__() + super(WN, self).__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -149,25 +152,27 @@ class WN(nn.Module): self.gin_channels = gin_channels self.p_dropout = float(p_dropout) - self.in_layers = nn.ModuleList() - self.res_skip_layers = nn.ModuleList() + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() self.drop = nn.Dropout(float(p_dropout)) if gin_channels != 0: - cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - self.cond_layer = weight_norm(cond_layer, name="weight") + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") 
for i in range(n_layers): dilation = dilation_rate**i padding = int((kernel_size * dilation - dilation) / 2) - in_layer = nn.Conv1d( + in_layer = torch.nn.Conv1d( hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding, ) - in_layer = weight_norm(in_layer, name="weight") + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) # last one is not necessary @@ -176,8 +181,8 @@ class WN(nn.Module): else: res_skip_channels = hidden_channels - res_skip_layer = nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = weight_norm(res_skip_layer, name="weight") + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) def forward( @@ -213,20 +218,44 @@ class WN(nn.Module): def remove_weight_norm(self): if self.gin_channels != 0: - remove_weight_norm(self.cond_layer) + torch.nn.utils.remove_weight_norm(self.cond_layer) for l in self.in_layers: - remove_weight_norm(l) + torch.nn.utils.remove_weight_norm(l) for l in self.res_skip_layers: - remove_weight_norm(l) + torch.nn.utils.remove_weight_norm(l) + + def __prepare_scriptable__(self): + if self.gin_channels != 0: + for hook in self.cond_layer._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self -class ResBlock1(nn.Module): 
+class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super().__init__() + super(ResBlock1, self).__init__() self.convs1 = nn.ModuleList( [ weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -236,7 +265,7 @@ class ResBlock1(nn.Module): ) ), weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -246,7 +275,7 @@ class ResBlock1(nn.Module): ) ), weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -262,7 +291,7 @@ class ResBlock1(nn.Module): self.convs2 = nn.ModuleList( [ weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -272,7 +301,7 @@ class ResBlock1(nn.Module): ) ), weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -282,7 +311,7 @@ class ResBlock1(nn.Module): ) ), weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -317,14 +346,31 @@ class ResBlock1(nn.Module): for l in self.convs2: remove_weight_norm(l) + def __prepare_scriptable__(self): + for l in self.convs1: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.convs2: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + -class ResBlock2(nn.Module): +class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super().__init__() + super(ResBlock2, self).__init__() self.convs = nn.ModuleList( [ weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -334,7 +380,7 @@ class ResBlock2(nn.Module): ) ), weight_norm( - nn.Conv1d( + Conv1d( channels, channels, kernel_size, @@ -363,6 +409,16 @@ class ResBlock2(nn.Module): for l in self.convs: remove_weight_norm(l) + def 
__prepare_scriptable__(self): + for l in self.convs: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + class Log(nn.Module): def forward( @@ -402,7 +458,7 @@ class Flip(nn.Module): class ElementwiseAffine(nn.Module): def __init__(self, channels): - super().__init__() + super(ElementwiseAffine, self).__init__() self.channels = channels self.m = nn.Parameter(torch.zeros(channels, 1)) self.logs = nn.Parameter(torch.zeros(channels, 1)) @@ -431,7 +487,7 @@ class ResidualCouplingLayer(nn.Module): mean_only=False, ): assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() + super(ResidualCouplingLayer, self).__init__() self.channels = channels self.hidden_channels = hidden_channels self.kernel_size = kernel_size @@ -483,6 +539,15 @@ class ResidualCouplingLayer(nn.Module): def remove_weight_norm(self): self.enc.remove_weight_norm() + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + class ConvFlow(nn.Module): def __init__( @@ -494,7 +559,7 @@ class ConvFlow(nn.Module): num_bins=10, tail_bound=5.0, ): - super().__init__() + super(ConvFlow, self).__init__() self.in_channels = in_channels self.filter_channels = filter_channels self.kernel_size = kernel_size diff --git a/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..e69a603440709fc7dc60e92079addbfa490778fd --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -0,0 +1,91 @@ +import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import 
class F0Predictor(object):
    """Common interface of the F0 (pitch) predictors.

    Both methods are placeholders that do nothing and return ``None``;
    concrete predictors override them.
    """

    def compute_f0(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        pass

    def compute_f0_uv(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        pass


class DioF0Predictor(F0Predictor):
    """F0 predictor using ``pyworld.dio`` refined by ``pyworld.stonemask``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (``<= 0``) frames of an F0 contour.

        A leading unvoiced run is filled with the next voiced value, an inner
        gap is filled linearly between its voiced neighbours, and a run that
        reaches the end of the contour is filled with the last voiced value.

        Returns ``(f0, vuv)``: the filled contour and a float32 vector that is
        1.0 where the *input* frame was voiced and 0.0 elsewhere.

        NOTE: ``np.reshape`` normally returns a view, so the caller's array is
        overwritten in place.
        """
        data = np.reshape(f0, (f0.size, 1))

        # The voiced/unvoiced flags are taken before any frame is filled in.
        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data  # alias, not a copy: writes below also change ``data``

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame; j stays at the last index when
                # none exists (or at i + 1 when i is already the last frame).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample contour ``x`` to ``target_len`` frames.

        Values below 0.001 are turned into NaN before interpolation and NaNs
        in the result are converted back with ``np.nan_to_num``.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, uv)`` for ``wav``, resized to ``p_len`` frames.

        ``p_len`` defaults to ``wav.shape[0] // hop_length``.
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)
        f0, t = pyworld.dio(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.interpolate_f0(self.resize_f0(f0, p_len))

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour of ``wav``."""
        return self.compute_f0_uv(wav, p_len)[0]


class HarvestF0Predictor(F0Predictor):
    """F0 predictor using ``pyworld.harvest`` refined by ``pyworld.stonemask``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (``<= 0``) frames of an F0 contour.

        A leading unvoiced run is filled with the next voiced value, an inner
        gap is filled linearly between its voiced neighbours, and a run that
        reaches the end of the contour is filled with the last voiced value.

        Returns ``(f0, vuv)``: the filled contour and a float32 vector that is
        1.0 where the *input* frame was voiced and 0.0 elsewhere.

        NOTE: ``np.reshape`` normally returns a view, so the caller's array is
        overwritten in place.
        """
        data = np.reshape(f0, (f0.size, 1))

        # The voiced/unvoiced flags are taken before any frame is filled in.
        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data  # alias, not a copy: writes below also change ``data``

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame; j stays at the last index when
                # none exists (or at i + 1 when i is already the last frame).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Linearly resample contour ``x`` to ``target_len`` frames.

        Values below 0.001 are turned into NaN before interpolation and NaNs
        in the result are converted back with ``np.nan_to_num``.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, uv)`` for ``wav``, resized to ``p_len`` frames.

        ``p_len`` defaults to ``wav.shape[0] // hop_length``.
        """
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)
        f0, t = pyworld.harvest(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        return self.interpolate_f0(self.resize_f0(f0, p_len))

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour of ``wav``.

        Delegates to ``compute_f0_uv``; the former separate copy read the
        non-existent attribute ``self.fs`` and raised ``AttributeError``.
        """
        return self.compute_f0_uv(wav, p_len)[0]


class PMF0Predictor(F0Predictor):
    """F0 predictor using Praat's autocorrelation pitch via ``parselmouth``."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Fill unvoiced (``<= 0``) frames of an F0 contour.

        A leading unvoiced run is filled with the next voiced value, an inner
        gap is filled linearly between its voiced neighbours, and a run that
        reaches the end of the contour is filled with the last voiced value.

        Returns ``(f0, vuv)``: the filled contour and a float32 vector that is
        1.0 where the *input* frame was voiced and 0.0 elsewhere.

        NOTE: ``np.reshape`` normally returns a view, so the caller's array is
        overwritten in place.
        """
        data = np.reshape(f0, (f0.size, 1))

        # The voiced/unvoiced flags are taken before any frame is filled in.
        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data  # alias, not a copy: writes below also change ``data``

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame; j stays at the last index when
                # none exists (or at i + 1 when i is already the last frame).
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return ``(f0, uv)`` for ``wav``, zero-padded towards ``p_len`` frames.

        ``p_len`` defaults to ``wav.shape[0] // hop_length``; an explicit value
        must be within 4 frames of that default (checked with ``assert``).
        """
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        # Split the missing frames between the head and the tail of the track.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0, uv

    def compute_f0(self, wav, p_len=None):
        """Return only the interpolated F0 contour of ``wav``."""
        return self.compute_f0_uv(wav, p_len)[0]
logger = logging.getLogger(__name__)


def _select_providers(device):
    """Return the ONNX Runtime execution providers to request for ``device``.

    Raises ``RuntimeError`` for anything other than ``None``, ``"cpu"``,
    ``"cuda"`` or ``"dml"``.
    """
    if device == "cpu" or device is None:
        return ["CPUExecutionProvider"]
    if device == "cuda":
        return ["CUDAExecutionProvider", "CPUExecutionProvider"]
    if device == "dml":
        return ["DmlExecutionProvider"]
    raise RuntimeError("Unsportted Device")


class ContentVec:
    """ONNX content-feature extractor wrapped in an ``InferenceSession``."""

    def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
        logger.info("Load model(s) from {}".format(vec_path))
        providers = _select_providers(device)
        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)

    def __call__(self, wav):
        return self.forward(wav)

    def forward(self, wav):
        """Run the model on a waveform and return its first output transposed.

        A 2-D input is averaged over its last axis (treated as channels); the
        signal is then fed with two leading singleton axes.
        """
        feats = wav
        if feats.ndim == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.ndim == 1, feats.ndim
        feats = np.expand_dims(np.expand_dims(feats, 0), 0)
        onnx_input = {self.model.get_inputs()[0].name: feats}
        logits = self.model.run(None, onnx_input)[0]
        return logits.transpose(0, 2, 1)


def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
    """Build the F0 predictor named ``"pm"``, ``"harvest"`` or ``"dio"``.

    The predictor modules are imported lazily so only the selected backend is
    loaded. Extra keyword arguments are accepted but currently ignored.

    Raises ``Exception`` for an unknown predictor name.
    """
    # The package root is ``infer`` (as in the predictor modules' own
    # imports); the former ``lib.infer_pack...`` path does not resolve from
    # the repository root.
    if f0_predictor == "pm":
        from infer.lib.infer_pack.modules.F0Predictor.PMF0Predictor import (
            PMF0Predictor,
        )

        f0_predictor_object = PMF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "harvest":
        from infer.lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
            HarvestF0Predictor,
        )

        f0_predictor_object = HarvestF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "dio":
        from infer.lib.infer_pack.modules.F0Predictor.DioF0Predictor import (
            DioF0Predictor,
        )

        f0_predictor_object = DioF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    else:
        raise Exception("Unknown f0 predictor")
    return f0_predictor_object


class OnnxRVC:
    """Voice conversion with an exported ONNX synthesizer and ContentVec."""

    def __init__(
        self,
        model_path,
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",
        device="cpu",
    ):
        # ``vec_path`` is a bare model name resolved under ``pretrained/``.
        vec_path = f"pretrained/{vec_path}.onnx"
        self.vec_model = ContentVec(vec_path, device)
        providers = _select_providers(device)
        self.model = onnxruntime.InferenceSession(model_path, providers=providers)
        self.sampling_rate = sr
        self.hop_size = hop_size

    def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
        """Run the synthesizer and return its first output as int16 samples.

        The six arguments are bound, in order, to the model's first six
        inputs; the output is scaled by 32767 before the int16 cast.
        """
        inputs = self.model.get_inputs()
        onnx_input = {
            inputs[0].name: hubert,
            inputs[1].name: hubert_length,
            inputs[2].name: pitch,
            inputs[3].name: pitchf,
            inputs[4].name: ds,
            inputs[5].name: rnd,
        }
        return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)

    def inference(
        self,
        raw_path,
        sid,
        f0_method="dio",
        f0_up_key=0,
        pad_time=0.5,
        cr_threshold=0.02,
    ):
        """Convert the audio file at ``raw_path`` with speaker id ``sid``.

        ``f0_up_key`` transposes the pitch in semitones. ``cr_threshold`` is
        forwarded to ``get_f0_predictor`` (which currently ignores it) and
        ``pad_time`` is not used.

        Returns int16 samples truncated to the input length. Raises
        ``RuntimeError`` when the input is longer than 50 seconds.
        """
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_predictor = get_f0_predictor(
            f0_method,
            hop_length=self.hop_size,
            sampling_rate=self.sampling_rate,
            threshold=cr_threshold,
        )
        wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
        org_length = len(wav)
        if org_length / sr > 50.0:
            raise RuntimeError("Reached Max Length")

        wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)

        hubert = self.vec_model(wav16k)
        # Duplicate every feature frame, then reorder the axes for the model.
        hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
        hubert_length = hubert.shape[1]

        pitchf = f0_predictor.compute_f0(wav, hubert_length)
        pitchf = pitchf * 2 ** (f0_up_key / 12)
        pitch = pitchf.copy()
        # Quantise F0 on the mel scale into integer bins clamped to [1, 255].
        f0_mel = 1127 * np.log(1 + pitch / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch = np.rint(f0_mel).astype(np.int64)

        pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
        pitch = pitch.reshape(1, len(pitch))
        ds = np.array([sid]).astype(np.int64)

        # NOTE(review): 192 is presumably the synthesizer's latent channel
        # count -- confirm against the exported model.
        rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
        hubert_length = np.array([hubert_length]).astype(np.int64)

        out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
        out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
        return out_wav[0:org_length]
def load_inputs(path, device, is_half=False):
    """Load a dict of example tensors and move them to ``device``.

    float32 tensors are cast to half when ``is_half`` is true and float16
    tensors are cast to float otherwise; other dtypes are left unchanged.
    """
    parm = torch.load(path, map_location=torch.device("cpu"))
    for key, value in parm.items():
        value = value.to(device)
        if is_half and value.dtype == torch.float32:
            value = value.half()
        elif not is_half and value.dtype == torch.float16:
            value = value.float()
        parm[key] = value
    return parm


def benchmark(
    model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
):
    """Call ``model`` ``epoch`` times on the saved inputs and print the mean time."""
    parm = load_inputs(inputs_path, device, is_half)
    total_ts = 0.0
    for _ in tqdm(range(epoch)):
        start_time = time.perf_counter()
        model(**parm)
        total_ts += time.perf_counter() - start_time
    print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")


def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
    """Warm a model up by running a short ``benchmark``."""
    benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)


def to_jit_model(
    model_path,
    model_type: str,
    mode: str = "trace",
    inputs_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Load a model by type and convert it with ``torch.jit``.

    ``model_type`` is ``"synthesizer"``, ``"rmvpe"`` or ``"hubert"``
    (case-insensitive). ``mode`` is ``"trace"`` (needs ``inputs_path``) or
    ``"script"``.

    Returns ``(model, model_jit)``. Raises ``ValueError`` for an unknown model
    type or mode, or when tracing without ``inputs_path``.
    """
    model = None
    kind = model_type.lower()
    if kind == "synthesizer":
        from .get_synthesizer import get_synthesizer

        model, _ = get_synthesizer(model_path, device)
        model.forward = model.infer
    elif kind == "rmvpe":
        from .get_rmvpe import get_rmvpe

        model = get_rmvpe(model_path, device)
    elif kind == "hubert":
        from .get_hubert import get_hubert_model

        model = get_hubert_model(model_path, device)
        model.forward = model.infer
    else:
        raise ValueError(f"No model type named {model_type}")
    model = model.eval()
    model = model.half() if is_half else model.float()
    if mode == "trace":
        # Tracing needs example inputs; the former ``assert not inputs_path``
        # rejected exactly the argument that is required here.
        if not inputs_path:
            raise ValueError("inputs_path is required when mode is 'trace'")
        inputs = load_inputs(inputs_path, device, is_half)
        model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
    elif mode == "script":
        model_jit = torch.jit.script(model)
    else:
        raise ValueError(f"Unsupported mode: {mode}")
    model_jit.to(device)
    model_jit = model_jit.half() if is_half else model_jit.float()
    return (model, model_jit)


def export(
    model: torch.nn.Module,
    mode: str = "trace",
    inputs: dict = None,
    device=torch.device("cpu"),
    is_half: bool = False,
) -> dict:
    """Convert ``model`` with ``torch.jit`` and serialise it into a dict.

    Returns an ``OrderedDict`` with ``"model"`` (the bytes written by
    ``torch.jit.save``) and ``"is_half"``. Raises ``ValueError`` for an
    unknown ``mode``; trace mode asserts that ``inputs`` is given.
    """
    model = model.half() if is_half else model.float()
    model.eval()
    if mode == "trace":
        assert inputs is not None
        model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
    elif mode == "script":
        model_jit = torch.jit.script(model)
    else:
        raise ValueError(f"Unsupported mode: {mode}")
    model_jit.to(device)
    model_jit = model_jit.half() if is_half else model_jit.float()
    buffer = BytesIO()
    torch.jit.save(model_jit, buffer)
    del model_jit
    cpt = OrderedDict()
    cpt["model"] = buffer.getvalue()
    cpt["is_half"] = is_half
    return cpt


def load(path: str):
    """Unpickle and return the checkpoint stored at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code; only open files
    from a trusted source.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def save(ckpt: dict, save_path: str):
    """Pickle ``ckpt`` to ``save_path``."""
    with open(save_path, "wb") as f:
        pickle.dump(ckpt, f)


def _default_jit_path(model_path, is_half):
    """Derive the output path: drop a trailing ``.pth``, add ``[.half].jit``."""
    suffix = ".pth"
    # ``str.rstrip(".pth")`` strips a *set of characters*, so it also ate
    # trailing letters of the stem ("depth.pth" -> "de").
    stem = model_path[: -len(suffix)] if model_path.endswith(suffix) else model_path
    return stem + (".half.jit" if is_half else ".jit")


def _resolve_device(device):
    """Pin a bare ``cuda`` device to ``cuda:0``; return others unchanged."""
    if "cuda" in str(device) and ":" not in str(device):
        return torch.device("cuda:0")
    return device


def rmvpe_jit_export(
    model_path: str,
    mode: str = "script",
    inputs_path: str = None,
    save_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Export an RMVPE model to a pickled JIT checkpoint and return it.

    ``save_path`` defaults to ``model_path`` with ``.pth`` replaced by
    ``.jit`` (``.half.jit`` when ``is_half``). The device is stored as a
    string under ``"device"``.
    """
    if not save_path:
        save_path = _default_jit_path(model_path, is_half)
    device = _resolve_device(device)
    from .get_rmvpe import get_rmvpe

    model = get_rmvpe(model_path, device)
    inputs = None
    if mode == "trace":
        inputs = load_inputs(inputs_path, device, is_half)
    ckpt = export(model, mode, inputs, device, is_half)
    ckpt["device"] = str(device)
    save(ckpt, save_path)
    return ckpt


def synthesizer_jit_export(
    model_path: str,
    mode: str = "script",
    inputs_path: str = None,
    save_path: str = None,
    device=torch.device("cpu"),
    is_half=False,
):
    """Export a synthesizer to a pickled JIT checkpoint and return it.

    The loaded checkpoint dict is reused: ``"weight"`` is removed, ``"model"``
    holds the serialised JIT module and ``"device"`` holds the device object
    (not a string, unlike ``rmvpe_jit_export``).
    """
    if not save_path:
        save_path = _default_jit_path(model_path, is_half)
    device = _resolve_device(device)
    from .get_synthesizer import get_synthesizer

    model, cpt = get_synthesizer(model_path, device)
    assert isinstance(cpt, dict)
    model.forward = model.infer  # export the inference path
    inputs = None
    if mode == "trace":
        inputs = load_inputs(inputs_path, device, is_half)
    ckpt = export(model, mode, inputs, device, is_half)
    cpt.pop("weight")
    cpt["model"] = ckpt["model"]
    cpt["device"] = device
    save(cpt, save_path)
    return cpt


# @torch.jit.script
def pad_to_multiple(x, multiple, dim=-1, value=0):
    """Right-pad ``x`` along ``dim`` up to the next multiple of ``multiple``.

    Returns ``(tensor, pad_length)``; ``(None, 0)`` when ``x`` is ``None`` and
    ``(x, 0)`` when no padding is needed. ``dim`` is expected to be negative.
    """
    # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
    if x is None:
        return None, 0
    tsz = x.size(dim)
    if int(tsz % multiple) == 0:
        return x, 0
    remainder = math.ceil(tsz / multiple) * multiple - tsz
    # F.pad takes (left, right) pairs starting at the last dimension, so add
    # one zero pair for every dimension that follows ``dim``.
    pad_offset = (0,) * (-1 - dim) * 2

    return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
def _sample_mask_lengths(num_mask, mask_length, mask_type, mask_other):
    """Return ``num_mask`` span lengths (list of ints) for the given strategy."""
    if mask_type == "static":
        return [int(mask_length)] * num_mask
    if mask_type == "uniform":
        # torch.randint needs integer bounds; mask_other is declared as float.
        return torch.randint(
            int(mask_other), mask_length * 2 + 1, size=[num_mask]
        ).tolist()
    if mask_type == "normal":
        drawn = torch.normal(float(mask_length), float(mask_other), size=[num_mask])
        return [max(1, int(round(v))) for v in drawn.tolist()]
    if mask_type == "poisson":
        return [int(v) for v in np.random.poisson(mask_length, size=num_mask)]
    raise ValueError("unknown mask selection " + mask_type)


def _place_non_overlapping(sz, lengths, min_space):
    """Place spans inside ``[0, sz)`` without overlap; return the masked indices."""
    mask_idc = []
    parts = [(0, sz)]
    keep_length = min(lengths)
    for length in sorted(lengths, reverse=True):
        # A free part is eligible when the span (plus spacing) fits and there is
        # at least one valid start position for the half-open randint below.
        weights = torch.tensor(
            [
                e - s if (e - s >= length + min_space and e - length > s) else 0
                for s, e in parts
            ],
            dtype=torch.float,
        )
        if weights.sum() == 0:
            break
        # Pick ONE part, with probability proportional to its size.
        chosen = torch.multinomial(weights / weights.sum(), 1).item()
        s, e = parts.pop(chosen)
        span_start = torch.randint(low=s, high=e - length, size=[1]).item()
        mask_idc.extend(range(span_start, span_start + length))

        # Put the remaining free space on either side back into the pool.
        if span_start - s - min_space >= keep_length:
            parts.append((s, span_start - min_space + 1))
        if e - span_start - length - min_space > keep_length:
            parts.append((span_start + length + min_space, e))
    return mask_idc


def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
) -> torch.Tensor:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: length of each span for the "static" type; the other types use it as
            the upper bound / mean / rate of the sampled lengths (see mask_type)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        mask_other: secondary parameter of the length distribution (see mask_type)
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example

    Returns:
        A boolean tensor of size ``shape`` that is True at masked positions.

    Raises:
        ValueError: if ``mask_type`` is not one of the supported strategies.
    """

    bsz, all_sz = shape
    mask = torch.full((bsz, all_sz), False)

    all_num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * all_sz / float(mask_length)
        + torch.rand([1]).item()
    )
    all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            sz = all_sz - int(padding_mask[i].long().sum().item())
            num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask

        lengths = _sample_mask_lengths(num_mask, mask_length, mask_type, mask_other)
        if not lengths:
            # Nothing to mask in this row (previously an IndexError).
            mask_idcs.append(torch.empty(0, dtype=torch.long))
            continue
        if sum(lengths) == 0:
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            idc = _place_non_overlapping(sz, lengths, min_space)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1
            starts = random.sample(range(sz - min_len), num_mask)
            idc = [
                start + offset
                for start, length in zip(starts, lengths)
                for offset in range(length)
            ]

        mask_idc = torch.tensor(idc, dtype=torch.long)
        # Drop spans running past the unpadded length and de-duplicate overlaps.
        mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))

    if not mask_idcs:
        return mask

    min_len = min(len(m) for m in mask_idcs)
    for i, mask_idc in enumerate(mask_idcs):
        if len(mask_idc) > min_len and require_same_masks:
            # Keep a random subset of the masked INDICES so every row ends up
            # with the same number of masked positions.
            keep = random.sample(range(len(mask_idc)), min_len)
            mask_idc = mask_idc[torch.tensor(keep, dtype=torch.long)]
        if mask_dropout > 0:
            num_holes = int(round(len(mask_idc) * mask_dropout))
            keep = random.sample(range(len(mask_idc)), len(mask_idc) - num_holes)
            mask_idc = mask_idc[torch.tensor(keep, dtype=torch.long)]

        mask[i, mask_idc] = True

    return mask
def get_synthesizer(pth_path, device=torch.device("cpu")):
    """Load a synthesizer checkpoint and build the matching inference network.

    Args:
        pth_path: Path of the checkpoint; it is read with ``torch.load`` onto
            the CPU and must contain ``"config"`` and ``"weight"`` entries.
        device: Device the network is moved to after the weights are loaded.

    Returns:
        ``(net_g, cpt)``: the network in float32 eval mode with weight norm
        removed, and the loaded checkpoint dict (its ``"config"`` is updated
        in place, see below).

    Raises:
        ValueError: If the checkpoint's ``"version"`` is neither ``"v1"`` nor
            ``"v2"``.
    """
    from infer.lib.infer_pack.models import (
        SynthesizerTrnMs256NSFsid,
        SynthesizerTrnMs256NSFsid_nono,
        SynthesizerTrnMs768NSFsid,
        SynthesizerTrnMs768NSFsid_nono,
    )

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    cpt = torch.load(pth_path, map_location=torch.device("cpu"))
    # Overwrite config[-3] with the number of rows of the speaker-embedding
    # table actually stored in the checkpoint (presumably the speaker count --
    # confirm against the model constructor).
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    else:
        # Previously an unknown version fell through and crashed below with an
        # UnboundLocalError on ``net_g``.
        raise ValueError(f"Unsupported synthesizer version: {version!r}")
    # enc_q is dropped before loading; strict=False lets load_state_dict
    # tolerate the keys that no longer match.
    del net_g.enc_q
    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g = net_g.float()
    net_g.eval().to(device)
    net_g.remove_weight_norm()
    return net_g, cpt
+ This is a bit tricky so there are some cases that probably won't work as working + out the same sizes before and after in all overlap add setups is tough. Right now, + this code should work with hop lengths that are half the filter length (50% overlap + between frames). + + Keyword Arguments: + filter_length {int} -- Length of filters used (default: {1024}) + hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) + win_length {[type]} -- Length of the window function applied to each frame (if not specified, it + equals the filter length). (default: {None}) + window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) + (default: {'hann'}) + """ + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length if win_length else filter_length + self.window = window + self.forward_transform = None + self.pad_amount = int(self.filter_length / 2) + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack( + [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] + ) + forward_basis = torch.FloatTensor(fourier_basis) + inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis)) + + assert filter_length >= self.win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, self.win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis = (inverse_basis.T * fft_window).T + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + self.register_buffer("fft_window", fft_window.float()) + + def transform(self, input_data, return_phase=False): + """Take input data (audio) to STFT domain. 
+ + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + """ + input_data = F.pad( + input_data, + (self.pad_amount, self.pad_amount), + mode="reflect", + ) + forward_transform = input_data.unfold( + 1, self.filter_length, self.hop_length + ).permute(0, 2, 1) + forward_transform = torch.matmul(self.forward_basis, forward_transform) + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + magnitude = torch.sqrt(real_part**2 + imag_part**2) + if return_phase: + phase = torch.atan2(imag_part.data, real_part.data) + return magnitude, phase + else: + return magnitude + + def inverse(self, magnitude, phase): + """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced + by the ```transform``` function. + + Arguments: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + + Returns: + inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. 
Of + shape (num_batch, num_samples) + """ + cat = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + fold = torch.nn.Fold( + output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), + kernel_size=(1, self.filter_length), + stride=(1, self.hop_length), + ) + inverse_transform = torch.matmul(self.inverse_basis, cat) + inverse_transform = fold(inverse_transform)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + window_square_sum = ( + self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0) + ) + window_square_sum = fold(window_square_sum)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + inverse_transform /= window_square_sum + return inverse_transform + + def forward(self, input_data): + """Take input data (audio) to STFT domain and then back to audio. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + reconstruction {tensor} -- Reconstructed audio given magnitude and phase. 
class ConvBlockRes(nn.Module):
    """Residual block: two (3x3 conv, BatchNorm, ReLU) stages plus a skip path.

    When ``in_channels`` differs from ``out_channels`` the skip path goes
    through a 1x1 convolution stored as ``self.shortcut``; otherwise the input
    is added unchanged and no ``shortcut`` attribute exists.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()

        def conv_bn_relu(channels_in):
            # One stage of the main path; spatial size is preserved
            # (3x3 kernel, stride 1, padding 1).
            return [
                nn.Conv2d(
                    in_channels=channels_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]

        self.conv = nn.Sequential(
            *conv_bn_relu(in_channels),
            *conv_bn_relu(out_channels),
        )
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        # Keep the hasattr test as a plain ``if`` statement so the branch can
        # still be resolved when the module is compiled with torch.jit.script.
        if hasattr(self, "shortcut"):
            return self.conv(x) + self.shortcut(x)
        return self.conv(x) + x
class ResEncoderBlock(nn.Module):
    """Stack of ``n_blocks`` residual conv blocks with optional average pooling.

    With ``kernel_size`` set, ``forward`` returns ``(features, pooled)``;
    with ``kernel_size=None`` no pooling layer is created and ``forward``
    returns the features alone.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # The first block changes the channel count; the rest keep it.
        blocks = [ConvBlockRes(in_channels, out_channels, momentum)]
        blocks += [
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        ]
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for block in self.conv:
            x = block(x)
        if self.kernel_size is not None:
            return x, self.pool(x)
        return x
class E2E(nn.Module):
    """U-Net front end followed by a per-frame classifier head.

    ``forward`` takes a mel tensor, runs it through ``DeepUnet`` and a 3x3
    convolution with 3 output channels, flattens channels and mel bins into
    one feature vector per frame, and maps it to 360 sigmoid outputs per frame.
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(E2E, self).__init__()
        # Sizes of the head. The previous non-GRU branch referenced
        # ``nn.N_MELS`` / ``nn.N_CLASS``, which do not exist in torch.nn and
        # raised AttributeError; use the same 128 / 360 as the GRU branch.
        n_mels = 128
        n_class = 360
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * n_mels, 256, n_gru),
                nn.Linear(512, n_class),  # 512 = 2 directions x 256 hidden
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * n_mels, n_class), nn.Dropout(0.25), nn.Sigmoid()
            )

    def forward(self, mel):
        # Swap the last two axes and add a channel axis for the 2-D convolutions.
        mel = mel.transpose(-1, -2).unsqueeze(1)
        # Move the channel axis behind the frame axis, then merge the last two
        # axes into one feature vector per frame.
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
class RMVPE:
    """Pitch (f0) estimator: mel front end, RMVPE network and cents decoding.

    On ``privateuseone`` devices the network runs through an ONNX Runtime
    session; otherwise it is the PyTorch ``E2E`` model, optionally loaded as a
    cached TorchScript export.
    """

    def __init__(self, model_path: str, is_half, device=None, use_jit=False):
        """Load the model.

        Args:
            model_path: Path of the RMVPE weights (a state dict for ``E2E``).
            is_half: Run the PyTorch model in float16 when true.
            device: Target device; defaults to ``cuda:0`` when CUDA is
                available, else ``cpu``.
            use_jit: Use (and create if needed) a TorchScript export cached
                next to ``model_path``. Falls back to the regular model for
                float16 on CPU.
        """
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        if "privateuseone" in str(device):
            import onnxruntime as ort

            ort_session = ort.InferenceSession(
                "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
                providers=["DmlExecutionProvider"],
            )
            self.model = ort_session
        else:
            if str(self.device) == "cuda":
                self.device = torch.device("cuda:0")

            def get_jit_model():
                jit_model_path = self._jit_model_path(model_path, is_half)
                ckpt = None
                if os.path.exists(jit_model_path):
                    # NOTE(review): jit.load appears to unpickle the cache
                    # file; only trusted cache files should be placed here.
                    ckpt = jit.load(jit_model_path)
                    # A cache exported for another device is re-exported.
                    if ckpt["device"] != str(self.device):
                        ckpt = None
                if ckpt is None:
                    ckpt = jit.rmvpe_jit_export(
                        model_path=model_path,
                        mode="script",
                        inputs_path=None,
                        save_path=jit_model_path,
                        device=device,
                        is_half=is_half,
                    )
                return torch.jit.load(BytesIO(ckpt["model"]), map_location=device)

            def get_default_model():
                model = E2E(4, 1, (2, 2))
                ckpt = torch.load(model_path, map_location="cpu")
                model.load_state_dict(ckpt)
                model.eval()
                return model.half() if is_half else model.float()

            if use_jit and is_half and "cpu" in str(self.device):
                logger.warning(
                    "Use default rmvpe model. \
                                 Jit is not supported on the CPU for half floating point"
                )
                self.model = get_default_model()
            elif use_jit:
                self.model = get_jit_model()
            else:
                self.model = get_default_model()

            self.model = self.model.to(device)
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        # Padded by 4 on each side (368 entries) to match the padded salience
        # used in to_local_average_cents.
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    @staticmethod
    def _jit_model_path(model_path, is_half):
        """Return the TorchScript cache path derived from ``model_path``.

        A trailing ``.pth``/``.pt`` extension is replaced by ``.jit``
        (``.half.jit`` for float16). ``str.rstrip(".pth")`` must not be used
        here: it strips a character set, so "depth.pth" would become "de".
        """
        stem = model_path
        for suffix in (".pth", ".pt"):
            if stem.endswith(suffix):
                stem = stem[: -len(suffix)]
                break
        return stem + (".half.jit" if is_half else ".jit")

    def mel2hidden(self, mel):
        """Run the network on ``mel`` and return the output for the real frames.

        The last axis of ``mel`` is zero-padded up to a multiple of 32 before
        inference and the output is cut back to the original frame count.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            if "privateuseone" in str(self.device):
                onnx_input_name = self.model.get_inputs()[0].name
                onnx_outputs_names = self.model.get_outputs()[0].name
                hidden = self.model.run(
                    [onnx_outputs_names],
                    input_feed={onnx_input_name: mel.cpu().numpy()},
                )[0]
            else:
                mel = mel.half() if self.is_half else mel.float()
                hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert per-frame salience to f0; frames below ``thred`` become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # cents == 0 marks an unvoiced frame and maps to exactly 10 above.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for a 1-D audio array or tensor; returns a NumPy array."""
        if not torch.is_tensor(audio):
            audio = torch.from_numpy(audio)
        mel = self.mel_extractor(
            audio.float().to(self.device).unsqueeze(0), center=True
        )
        hidden = self.mel2hidden(mel)
        if "privateuseone" not in str(self.device):
            hidden = hidden.squeeze(0).cpu().numpy()
        else:
            hidden = hidden[0]
        if self.is_half:
            hidden = hidden.astype("float32")

        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """Salience-weighted average of cents over 9 bins around each frame's peak.

        Args:
            salience: Array of shape (n_frames, 360).
            thred: Frames whose peak salience is <= ``thred`` yield 0.

        Returns:
            Array of shape (n_frames,) with the estimated cents per frame.
        """
        center = np.argmax(salience, axis=1)  # peak bin per frame
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368)
        # After padding, bin c sits in column c + 4, so the 9-bin window
        # centred on it covers columns c .. c + 8.
        window = center[:, None] + np.arange(9)  # (n_frames, 9)
        todo_salience = np.take_along_axis(salience, window, axis=1)
        todo_cents_mapping = self.cents_mapping[window]
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
        devided = product_sum / weight_sum  # (n_frames,)
        maxx = np.max(salience, axis=1)  # (n_frames,)
        devided[maxx <= thred] = 0
        return devided
# This function is obtained from librosa.
def get_rms(
    y,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):
    """Compute the root-mean-square value of each frame of ``y``.

    ``y`` is padded by ``frame_length // 2`` on both sides, cut into frames of
    ``frame_length`` samples taken every ``hop_length`` samples along the last
    axis, and the RMS of every frame is returned. The frame axis of the result
    is kept with size 1, so a 1-D input yields shape ``(1, n_frames)``.
    """
    half_frame = int(frame_length // 2)
    y = np.pad(y, (half_frame, half_frame), mode=pad_mode)

    # Zero-copy view of every length-``frame_length`` window along the last
    # axis: shape (..., n_windows, frame_length).
    n_windows = y.shape[-1] - (frame_length - 1)
    windowed_shape = y.shape[:-1] + (n_windows, frame_length)
    windowed_strides = y.strides + (y.strides[-1],)
    frames = np.lib.stride_tricks.as_strided(
        y, shape=windowed_shape, strides=windowed_strides
    )
    # Put the within-frame axis in front of the window-index axis.
    frames = np.moveaxis(frames, -1, -2)
    # Keep only every ``hop_length``-th window.
    frames = frames[..., ::hop_length]

    # Mean power over the samples of each frame, then the square root.
    power = np.mean(np.abs(frames) ** 2, axis=-2, keepdims=True)
    return np.sqrt(power)
ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1d1dbfbcba8a83e019950492d047263f7b4e73 --- /dev/null +++ b/infer/lib/train/data_utils.py @@ -0,0 +1,517 @@ +import os +import traceback +import logging + +logger = 
logging.getLogger(__name__) + +import numpy as np +import torch +import torch.utils.data + +from infer.lib.train.mel_processing import spectrogram_torch +from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. + """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + pitch = audiopath_and_text[2] + pitchf = audiopath_and_text[3] + dv = audiopath_and_text[4] + + phone, pitch, pitchf = self.get_labels(phone, pitch, 
pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + # print(123,phone.shape,pitch.shape,spec.shape) + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + # amor + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + # print(234,phone.shape,pitch.shape) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + 
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollateMultiNSFsid: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + # dv = torch.FloatTensor(len(batch), 256)#gin=256 + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = 
phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + # dv[i] = row[5] + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + # dv + sid, + ) + + +class TextAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. + """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + dv = audiopath_and_text[2] + + phone = self.get_labels(phone) 
+ spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + phone = phone[:len_min, :] + return (spec, wav, phone, dv) + + def get_labels(self, phone): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + phone = phone[:n_num, :] + phone = torch.FloatTensor(phone) + return phone + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollate: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = 
return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) + phone_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + sid[i] = row[3] + + return ( + phone_padded, + phone_lengths, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
+ """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): # + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for 
j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/zerorvc/utils/losses.py b/infer/lib/train/losses.py similarity index 73% rename from zerorvc/utils/losses.py rename to infer/lib/train/losses.py index 9d369d130162ed01c6c5e67a9c0275042156c494..aa7bd81cf596884a8b33e802ae49254d7810a860 100644 --- a/zerorvc/utils/losses.py +++ b/infer/lib/train/losses.py @@ -1,7 +1,7 @@ import torch -def feature_loss(fmap_r: list[torch.Tensor], fmap_g: list[torch.Tensor]): +def feature_loss(fmap_r, fmap_g): loss = 0 for dr, dg in zip(fmap_r, fmap_g): for rl, gl in zip(dr, dg): @@ -12,9 +12,7 @@ def feature_loss(fmap_r: list[torch.Tensor], fmap_g: list[torch.Tensor]): return loss * 2 -def discriminator_loss( - disc_real_outputs: list[torch.Tensor], disc_generated_outputs: list[torch.Tensor] -): +def discriminator_loss(disc_real_outputs, disc_generated_outputs): loss = 0 r_losses = [] g_losses = [] @@ -30,7 +28,7 @@ def discriminator_loss( return loss, r_losses, g_losses -def generator_loss(disc_outputs: list[torch.Tensor]): +def generator_loss(disc_outputs): loss = 0 gen_losses = [] for dg in disc_outputs: @@ -42,13 +40,7 @@ def generator_loss(disc_outputs: list[torch.Tensor]): return loss, 
gen_losses -def kl_loss( - z_p: torch.Tensor, - logs_q: torch.Tensor, - m_p: torch.Tensor, - logs_p: torch.Tensor, - z_mask: torch.Tensor, -): +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): """ z_p, logs_q: [b, h, t_t] m_p, logs_p: [b, h, t_t] diff --git a/zerorvc/utils/mel_processing.py b/infer/lib/train/mel_processing.py similarity index 94% rename from zerorvc/utils/mel_processing.py rename to infer/lib/train/mel_processing.py index e7e35f7cbb571e55a85cabf72760c04f33d7d17e..3751f1eab1ea8137088f2f7d7c8294190403b4ce 100644 --- a/zerorvc/utils/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -1,6 +1,6 @@ import torch import torch.utils.data -import librosa +from librosa.filters import mel as librosa_mel_fn import logging logger = logging.getLogger(__name__) @@ -95,7 +95,7 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): dtype_device = str(spec.dtype) + "_" + str(spec.device) fmax_dtype_device = str(fmax) + "_" + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa.filters.mel( + mel = librosa_mel_fn( sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax ) mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..3f131e1e5a95adc2cf0eac2b503c8492b5bbf351 --- /dev/null +++ b/infer/lib/train/process_ckpt.py @@ -0,0 +1,261 @@ +import os +import sys +import traceback +from collections import OrderedDict + +import torch + +from i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +def savee(ckpt, sr, if_f0, name, epoch, version, hps): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = [ + hps.data.filter_length // 2 + 1, + 32, + hps.model.inter_channels, + hps.model.hidden_channels, + hps.model.filter_channels, + hps.model.n_heads, + 
hps.model.n_layers, + hps.model.kernel_size, + hps.model.p_dropout, + hps.model.resblock, + hps.model.resblock_kernel_sizes, + hps.model.resblock_dilation_sizes, + hps.model.upsample_rates, + hps.model.upsample_initial_channel, + hps.model.upsample_kernel_sizes, + hps.model.spk_embed_dim, + hps.model.gin_channels, + hps.data.sampling_rate, + ] + opt["info"] = "%sepoch" % epoch + opt["sr"] = sr + opt["f0"] = if_f0 + opt["version"] = version + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def show_info(path): + try: + a = torch.load(path, map_location="cpu") + return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( + a.get("info", "None"), + a.get("sr", "None"), + a.get("f0", "None"), + a.get("version", "None"), + ) + except: + return traceback.format_exc() + + +def extract_small_model(path, out, sr, if_f0, info, version): + try: + ckpt = torch.load(path, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + if version == "v1": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + else: + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [12, 10, 2, 2], + 512, + [24, 20, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + if version == "v1": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 
4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + else: + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 8, 2, 2], + 512, + [20, 16, 4, 4], + 109, + 256, + 32000, + ] + if info == "": + info = "Extracted model." + opt["info"] = info + opt["version"] = version + opt["sr"] = sr + opt["f0"] = int(if_f0) + torch.save(opt, out) + return "Success." + except: + return traceback.format_exc() + + +def change_info(path, info, name): + try: + ckpt = torch.load(path, map_location="cpu") + ckpt["info"] = info + if name == "": + name = os.path.basename(path) + torch.save(ckpt, "assets/weights/%s" % name) + return "Success." + except: + return traceback.format_exc() + + +def merge(path1, path2, alpha1, sr, f0, info, name, version): + try: + + def extract(ckpt): + a = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + ckpt1 = torch.load(path1, map_location="cpu") + ckpt2 = torch.load(path2, map_location="cpu") + cfg = ckpt1["config"] + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." 
+ opt = OrderedDict() + opt["weight"] = {} + for key in ckpt1.keys(): + # try: + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + alpha1 * (ckpt1[key][:min_shape0].float()) + + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) + ).half() + # except: + # pdb.set_trace() + opt["config"] = cfg + """ + if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] + elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] + elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] + """ + opt["sr"] = sr + opt["f0"] = 1 if f0 == i18n("是") else 0 + opt["version"] = version + opt["info"] = info + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." 
+ except: + return traceback.format_exc() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..765c54c61da07cf9aca6dfe06f1da9847b7f177c --- /dev/null +++ b/infer/lib/train/utils.py @@ -0,0 +1,483 @@ +import argparse +import glob +import json +import logging +import os +import subprocess +import sys +import shutil + +import numpy as np +import torch +from scipy.io.wavfile import read + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + ################## + def go(model, bkey): + saved_state_dict = checkpoint_dict[bkey] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch. 
need: %s, get: %s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + return model + + go(combd, "combd") + model = go(sbd, "sbd") + ############# + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +# def load_checkpoint(checkpoint_path, model, optimizer=None): +# assert os.path.isfile(checkpoint_path) +# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') +# iteration = checkpoint_dict['iteration'] +# learning_rate = checkpoint_dict['learning_rate'] +# if optimizer is not None: +# optimizer.load_state_dict(checkpoint_dict['optimizer']) +# # print(1111) +# saved_state_dict = checkpoint_dict['model'] +# # print(1111) +# +# if hasattr(model, 'module'): +# state_dict = model.module.state_dict() +# else: +# state_dict = model.state_dict() +# new_state_dict= {} +# for k, v in state_dict.items(): +# try: +# new_state_dict[k] = saved_state_dict[k] +# except: +# logger.info("%s is not in the checkpoint" % k) +# new_state_dict[k] = v +# if hasattr(model, 'module'): +# model.module.load_state_dict(new_state_dict) +# else: +# model.load_state_dict(new_state_dict) +# logger.info("Loaded checkpoint '{}' (epoch {})" .format( +# checkpoint_path, iteration)) +# return model, optimizer, learning_rate, 
iteration +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch|need-%s|get-%s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def save_checkpoint_d(combd, sbd, optimizer, 
learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(combd, "module"): + state_dict_combd = combd.module.state_dict() + else: + state_dict_combd = combd.state_dict() + if hasattr(sbd, "module"): + state_dict_sbd = sbd.module.state_dict() + else: + state_dict_sbd = sbd.state_dict() + torch.save( + { + "combd": state_dict_combd, + "sbd": state_dict_sbd, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + logger.debug(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return 
data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + try: + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + except UnicodeDecodeError: + with open(filename) as f: + filepaths_and_text = [line.strip().split(split) for line in f] + + return filepaths_and_text + + +def get_hparams(init=True): + """ + todo: + 结尾七人组: + 保存频率、总epoch done + bs done + pretrainG、pretrainD done + 卡号:os.en["CUDA_VISIBLE_DEVICES"] done + if_latest done + 模型:if_f0 done + 采样率:自动选择config done + 是否缓存数据集进GPU:if_cache_data_in_gpu done + + -m: + 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done + -c不要了 + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-se", + "--save_every_epoch", + type=int, + required=True, + help="checkpoint save frequency (epoch)", + ) + parser.add_argument( + "-te", "--total_epoch", type=int, required=True, help="total_epoch" + ) + parser.add_argument( + "-pg", "--pretrainG", type=str, default="", 
help="Pretrained Generator path" + ) + parser.add_argument( + "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path" + ) + parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") + parser.add_argument( + "-bs", "--batch_size", type=int, required=True, help="batch size" + ) + parser.add_argument( + "-e", "--experiment_dir", type=str, required=True, help="experiment dir" + ) # -m + parser.add_argument( + "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" + ) + parser.add_argument( + "-sw", + "--save_every_weights", + type=str, + default="0", + help="save the extracted model in weights directory when saving checkpoints", + ) + parser.add_argument( + "-v", "--version", type=str, required=True, help="model version" + ) + parser.add_argument( + "-f0", + "--if_f0", + type=int, + required=True, + help="use f0 as one of the inputs of the model, 1 or 0", + ) + parser.add_argument( + "-l", + "--if_latest", + type=int, + required=True, + help="if only save the latest G/D pth file, 1 or 0", + ) + parser.add_argument( + "-c", + "--if_cache_data_in_gpu", + type=int, + required=True, + help="if caching the dataset in GPU memory, 1 or 0", + ) + + args = parser.parse_args() + name = args.experiment_dir + experiment_dir = os.path.join("./logs", args.experiment_dir) + + config_save_path = os.path.join(experiment_dir, "config.json") + with open(config_save_path, "r") as f: + config = json.load(f) + + hparams = HParams(**config) + hparams.model_dir = hparams.experiment_dir = experiment_dir + hparams.save_every_epoch = args.save_every_epoch + hparams.name = name + hparams.total_epoch = args.total_epoch + hparams.pretrainG = args.pretrainG + hparams.pretrainD = args.pretrainD + hparams.version = args.version + hparams.gpus = args.gpus + hparams.train.batch_size = args.batch_size + hparams.sample_rate = args.sample_rate + hparams.if_f0 = args.if_f0 + hparams.if_latest = args.if_latest + hparams.save_every_weights 
def get_hparams_from_dir(model_dir):
    """Load ``config.json`` from *model_dir* into an ``HParams`` object.

    Args:
        model_dir: directory that contains ``config.json``.

    Returns:
        ``HParams`` built from the JSON content, with ``model_dir`` set to
        the given directory.

    Raises:
        OSError: if the config file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    config_save_path = os.path.join(model_dir, "config.json")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(config_save_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
class HParams:
    """Attribute-style container for (possibly nested) hyper-parameters.

    Keyword arguments become instance attributes; any value that is a
    ``dict`` (or a ``dict`` subclass) is wrapped recursively in another
    ``HParams``.  The object also supports the mapping operations defined
    below (``keys``/``items``/``values``, ``len``, ``in`` and item access),
    which is enough for it to be expanded with ``**``.
    """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # isinstance (rather than an exact type comparison) so that dict
            # subclasses such as OrderedDict are wrapped as well.
            if isinstance(v, dict):
                v = HParams(**v)
            self[k] = v

    def keys(self):
        """Return a view of the stored parameter names."""
        return self.__dict__.keys()

    def items(self):
        """Return a view of ``(name, value)`` pairs."""
        return self.__dict__.items()

    def values(self):
        """Return a view of the stored values."""
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        # Item access is routed through attribute access.
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
class FeatureInput(object):
    """Compute F0 tracks for audio files and store raw and quantised copies.

    ``compute_f0`` extracts a frame-level F0 contour with one of several
    back-ends, ``coarse_f0`` maps it onto integer bins on a mel scale, and
    ``go`` runs both over a list of files and saves the results with
    ``np.save``.
    """

    # Back-ends understood by compute_f0.
    _F0_METHODS = ("pm", "harvest", "dio", "rmvpe")

    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        """Return the F0 contour of the audio at *path*.

        Args:
            path: audio file handed to ``load_audio`` at ``self.fs``.
            f0_method: one of ``"pm"``, ``"harvest"``, ``"dio"``, ``"rmvpe"``.

        Raises:
            ValueError: if *f0_method* is not a supported back-end.
        """
        # Reject unknown methods up front; previously they fell through every
        # branch and failed on the unassigned ``f0`` at the return statement.
        if f0_method not in self._F0_METHODS:
            raise ValueError(
                "unknown f0 method %r, expected one of %s"
                % (f0_method, ", ".join(self._F0_METHODS))
            )

        x = load_audio(path, self.fs)
        p_len = x.shape[0] // self.hop
        if f0_method == "pm":
            # Use the instance's hop / rate / F0 limits (identical to the
            # previously hard-coded 160 / 16000 / 50 / 1100 for the defaults)
            # so this branch agrees with the harvest and dio branches.
            f0 = (
                parselmouth.Sound(x, self.fs)
                .to_pitch_ac(
                    time_step=self.hop / self.fs,
                    voicing_threshold=0.6,
                    pitch_floor=self.f0_min,
                    pitch_ceiling=self.f0_max,
                )
                .selected_array["frequency"]
            )
            # Zero-pad both ends so the track has p_len frames.
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method in ("harvest", "dio"):
            estimator = pyworld.harvest if f0_method == "harvest" else pyworld.dio
            signal_double = x.astype(np.double)
            f0, t = estimator(
                signal_double,
                fs=self.fs,
                f0_ceil=self.f0_max,
                f0_floor=self.f0_min,
                frame_period=1000 * self.hop / self.fs,
            )
            f0 = pyworld.stonemask(signal_double, f0, t, self.fs)
        else:  # "rmvpe"
            if not hasattr(self, "model_rmvpe"):
                self.model_rmvpe = rmvpe
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        """Quantise an F0 array (Hz) to integer bins in ``[1, f0_bin - 1]``.

        Positive mel values are scaled linearly between ``f0_mel_min`` and
        ``f0_mel_max``; everything at or below bin 1 is set to 1 and
        everything above ``f0_bin - 1`` is clipped.
        """
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        """Extract and save F0 features for every entry in *paths*.

        Args:
            paths: sequence of ``(inp_path, opt_path1, opt_path2)``; the
                coarse track is saved to ``opt_path1`` and the raw track to
                ``opt_path2``.
            f0_method: back-end name forwarded to ``compute_f0``.

        A failure on one file is logged through ``printt`` and does not stop
        the remaining files.
        """
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most ~5 progress lines per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    # Skip files whose two outputs already exist.
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except Exception:
                    # Catch Exception (not a bare except) so KeyboardInterrupt
                    # and SystemExit still stop the worker.
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
logging + +import numpy as np + +from infer.lib.audio import load_audio +from model import rmvpe, fp16 + +logging.getLogger("numba").setLevel(logging.WARNING) + + +class FeatureInput(object): + def __init__(self, exp_dir, samplerate=16000, hop_size=160): + self.exp_dir = exp_dir + self.logfile = open("%s/extract_f0_feature.log" % exp_dir, "a+") + self.fs = samplerate + self.hop = hop_size + self.is_half = fp16 + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def println(self, strr): + print(strr) + self.logfile.write("%s\n" % strr) + self.logfile.flush() + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + # p_len = x.shape[0] // self.hop + if f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + self.model_rmvpe = rmvpe + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + self.println("no-f0-todo") + else: + self.println("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + self.println( + "f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path) + ) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + 
class HubertFeatureExtractor:
    """Extract HuBERT features for every wav in an experiment directory.

    Reads ``<exp_dir>/1_16k_wavs/*.wav`` and writes one ``.npy`` file per wav
    into ``<exp_dir>/3_feature768``.  Progress and errors are printed and
    appended to ``<exp_dir>/extract_f0_feature.log``.
    """

    def __init__(self, exp_dir: str):
        self.exp_dir = exp_dir
        self.logfile = open("%s/extract_f0_feature.log" % exp_dir, "a+")
        self.wavPath = "%s/1_16k_wavs" % exp_dir
        self.outPath = "%s/3_feature768" % exp_dir
        os.makedirs(self.outPath, exist_ok=True)

    def println(self, strr):
        """Print *strr* and append it to the log file, flushing immediately."""
        print(strr)
        self.logfile.write("%s\n" % strr)
        self.logfile.flush()

    def run(self):
        """Process all wavs; files whose output already exists are skipped."""
        todo = sorted(os.listdir(self.wavPath))
        n = max(1, len(todo) // 10)  # print at most ~10 progress lines
        if len(todo) == 0:
            self.println("no-feature-todo")
        else:
            self.println("all-feature-%s" % len(todo))
            for idx, file in enumerate(todo):
                try:
                    if file.endswith(".wav"):
                        wav_path = "%s/%s" % (self.wavPath, file)
                        # Swap only the extension; str.replace("wav", "npy")
                        # also rewrote "wav" inside the base name.
                        out_path = "%s/%s.npy" % (
                            self.outPath,
                            os.path.splitext(file)[0],
                        )

                        if os.path.exists(out_path):
                            continue

                        feats = readwave(wav_path, normalize=hubert_cfg.task.normalize)
                        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                        inputs = {
                            "source": (
                                feats.half().to(device) if is_half else feats.to(device)
                            ),
                            "padding_mask": padding_mask.to(device),
                            "output_layer": 12,
                        }
                        with torch.no_grad():
                            logits = hubert.extract_features(**inputs)
                            feats = logits[0]

                        feats = feats.squeeze(0).float().cpu().numpy()
                        # Only persist features that are free of NaNs.
                        if np.isnan(feats).sum() == 0:
                            np.save(out_path, feats, allow_pickle=False)
                        else:
                            self.println("%s-contains nan" % file)
                        if idx % n == 0:
                            # "now" is the current index and "all" the total;
                            # the two values were previously swapped.
                            self.println(
                                "now-%s,all-%s,%s,%s"
                                % (idx, len(todo), file, feats.shape)
                            )
                except Exception:
                    # One bad file must not abort the batch, but
                    # KeyboardInterrupt / SystemExit should still propagate.
                    self.println(traceback.format_exc())
            self.println("all-feature-done")
class PreProcess:
    """Slice, normalise and resample training audio for one experiment.

    Each input file is high-pass filtered, split by ``Slicer``, cut into
    overlapping segments of ``per`` seconds, normalised, and written twice:
    at the original rate into ``<exp_dir>/0_gt_wavs`` and at 16 kHz into
    ``<exp_dir>/1_16k_wavs``.  Messages go to stdout and
    ``<exp_dir>/preprocess.log``.
    """

    def __init__(self, sr, exp_dir, per=3.7, noparallel=False):
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
        self.per = per
        self.overlap = 0.3
        self.tail = self.per + self.overlap
        self.max = 0.9
        self.alpha = 0.75
        self.exp_dir = exp_dir
        self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
        self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
        self.noparallel = noparallel
        # Create the directories BEFORE opening the log file inside exp_dir;
        # the previous order failed when exp_dir did not exist yet.
        os.makedirs(self.exp_dir, exist_ok=True)
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)
        self.logfile = open("%s/preprocess.log" % exp_dir, "a+")

    def println(self, strr):
        """Print *strr* and append it to the log file, flushing immediately."""
        print(strr)
        self.logfile.write("%s\n" % strr)
        self.logfile.flush()

    def norm_write(self, tmp_audio, idx0, idx1):
        """Normalise one segment and write it at both sample rates.

        Segments whose peak absolute value exceeds 2.5 are reported as
        filtered and not written.
        """
        tmp_max = np.abs(tmp_audio).max()
        if tmp_max > 2.5:
            # Route through println so the notice also reaches the log file.
            self.println("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
            return
        # Blend the peak-normalised signal (weight alpha) with the original
        # signal (weight 1 - alpha).
        tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
            1 - self.alpha
        ) * tmp_audio
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
            tmp_audio.astype(np.float32),
        )
        tmp_audio = librosa.resample(
            tmp_audio, orig_sr=self.sr, target_sr=16000
        )  # , res_type="soxr_vhq"
        wavfile.write(
            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
            16000,
            tmp_audio.astype(np.float32),
        )

    def pipeline(self, path, idx0):
        """Process one input file; failures are logged, not raised."""
        try:
            audio = load_audio(path, self.sr)
            # zero phased digital filter cause pre-ringing noise...
            # audio = signal.filtfilt(self.bh, self.ah, audio)
            audio = signal.lfilter(self.bh, self.ah, audio)

            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
                while 1:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
                    else:
                        # Remainder of the slice; written after the loop.
                        tmp_audio = audio[start:]
                        idx1 += 1
                        break
                self.norm_write(tmp_audio, idx0, idx1)
            self.println("%s\t-> Success" % path)
        except Exception:
            self.println("%s\t-> %s" % (path, traceback.format_exc()))

    def pipeline_mp(self, infos):
        """Run ``pipeline`` for each ``(path, idx0)`` pair in *infos*."""
        for path, idx0 in infos:
            self.pipeline(path, idx0)

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        """Process every file in *inp_root*, split into *n_p* interleaved shards.

        With ``noparallel`` set the shards run sequentially in this process;
        otherwise each shard runs in its own ``multiprocessing.Process``.
        """
        try:
            infos = [
                ("%s/%s" % (inp_root, name), idx)
                for idx, name in enumerate(sorted(os.listdir(inp_root)))
            ]
            if self.noparallel:
                for i in range(n_p):
                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
                    p = multiprocessing.Process(
                        target=self.pipeline_mp, args=(infos[i::n_p],)
                    )
                    ps.append(p)
                    p.start()
                for p in ps:
                    p.join()
        except Exception:
            self.println("Fail. %s" % traceback.format_exc())
class EpochRecorder:
    """Stopwatch that reports the wall-clock time between ``record`` calls."""

    def __init__(self):
        # Reference point for the first interval.
        self.last_time = ttime()

    def record(self):
        """Return ``"[<timestamp>] | (<elapsed>)"`` and restart the interval."""
        previous, self.last_time = self.last_time, ttime()
        elapsed = datetime.timedelta(seconds=self.last_time - previous)
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"[{stamp}] | ({elapsed})"
logger = utils.get_logger(hps.model_dir) + logger.info(hps) + # utils.check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir=hps.model_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) + + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + if hps.if_f0 == 1: + train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) + else: + train_dataset = TextAudioLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size * n_gpus, + # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s + [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. + # num_workers=8 -> num_workers=4 + if hps.if_f0 == 1: + collate_fn = TextAudioCollateMultiNSFsid() + else: + collate_fn = TextAudioCollate() + train_loader = DataLoader( + train_dataset, + # num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + # persistent_workers=True, + # prefetch_factor=8, + ) + if hps.if_f0 == 1: + net_g = RVC_Model_f0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + sr=hps.sample_rate, + ) + else: + net_g = RVC_Model_nof0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + ) + if torch.cuda.is_available(): + net_g = net_g.cuda(rank) + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) + if torch.cuda.is_available(): + net_d = net_d.cuda(rank) + optim_g = torch.optim.AdamW( + net_g.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + 
hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d + ) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g + ) + state["global_step"] = (epoch_str - 1) * len(train_loader) + print("loaded", epoch_str) + epoch_str += 1 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + state["global_step"] = 0 + if hps.pretrainG != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainG)) + if hasattr(net_g, "module"): + logger.info( + net_g.module.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu")["model"] + ) + ) ##测试不加载优化器 + else: + logger.info( + net_g.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu")["model"] + ) + ) ##测试不加载优化器 + if hps.pretrainD != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainD)) + if hasattr(net_d, "module"): + logger.info( + net_d.module.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu")["model"] + ) + ) + else: + logger.info( + net_d.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu")["model"] + ) + ) + print("new") + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + + scaler = GradScaler(enabled=hps.train.fp16_run) + + cache = [] + saved = 0 + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + 
scaler, + [train_loader, None], + logger, + [writer, writer_eval], + cache, + state, + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + cache, + state, + ) + scheduler_g.step() + scheduler_d.step() + + if epoch % hps.save_every_epoch == 0 and rank == 0: + saved += 1 + if saved >= 2: + break + + +def train_and_evaluate( + rank, + epoch, + hps, + nets, + optims, + schedulers, + scaler, + loaders, + logger, + writers, + cache, + state, +): + net_g, net_d = nets + optim_g, optim_d = optims + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + + net_g.train() + net_d.train() + + # Prepare data iterator + if hps.if_cache_data_in_gpu == True: + # Use Cache + data_iterator = cache + if cache == []: + # Make new cache + for batch_idx, info in enumerate(train_loader): + # Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + # Load on CUDA + if torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + # Cache on list + if hps.if_f0 == 1: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + cache.append( + ( + batch_idx, + ( + 
phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + # Load shuffled cache + shuffle(cache) + else: + # Loader + data_iterator = enumerate(train_loader) + + # Run steps + epoch_recorder = EpochRecorder() + for batch_idx, info in data_iterator: + # Data + ## Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info + ## Load on CUDA + if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + # wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + + # Calculate + with autocast(enabled=hps.train.fp16_run): + if hps.if_f0 == 1: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + with autocast(enabled=False): + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + 
hps.data.mel_fmax, + ) + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() + wave = commons.slice_segments( + wave, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if state["global_step"] % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) + # Amor For Tensorboard display + if loss_mel > 75: + loss_mel = 75 + if loss_kl > 9: + loss_kl = 9 + + logger.info([state["global_step"], lr]) + logger.info( + f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + ) + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + ) + + 
scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} + ) + scalar_dict.update( + {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} + ) + scalar_dict.update( + {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} + ) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=state["global_step"], + images=image_dict, + scalars=scalar_dict, + ) + state["global_step"] += 1 + # /Run steps + + if epoch % hps.save_every_epoch == 0 and rank == 0: + if hps.if_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(state["global_step"])), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(state["global_step"])), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), + ) + if rank == 0 and hps.save_every_weights == "1": + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.sample_rate, + hps.if_f0, + hps.name + "_e%s_s%s" % (epoch, state["global_step"]), + epoch, + hps.version, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record())) + if epoch >= hps.total_epoch and rank == 0: + logger.info("Training is done. 
The program is closed.") + + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving final ckpt:%s" + % ( + savee( + ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps + ) + ) + ) diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..0da7b41bc4519c3f46f58180bb0428e8a937306e --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,305 @@ +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import soundfile as sf +import torch +from io import BytesIO + +from infer.lib.audio import load_audio, wav2 +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * +from model import hubert + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + logger.info("Get sid: " + sid) + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5 + ), + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33 + ), + "__type__": "update", + } + + if sid == "" or sid == []: + if ( + self.hubert_model is not None + ): # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + 
logger.info("Clean model cache") + del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt + self.hubert_model = self.net_g = self.n_spk = self.hubert_model = ( + self.tgt_sr + ) = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + ###楼下不这么折腾清理不干净 + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return ( + {"visible": False, "__type__": "update"}, + { + "visible": True, + "value": to_return_protect0, + "__type__": "update", + }, + { + "visible": True, + "value": to_return_protect1, + "__type__": "update", + }, + "", + "", + ) + person = sid + logger.info(f"Loading: {person}") + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = 
self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + # index = {"value": get_index_path_from_model(sid), "__type__": "update"} + # logger.info("Select index: " + index["value"]) + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + # index, + # index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = hubert + + if file_index: + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + elif file_index2: + file_index = file_index2 + else: + file_index = "" # 防止小白写错,自动帮他替换掉 + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + tgt_sr = resample_sr + else: + tgt_sr = self.tgt_sr + index_info = ( + "Index:\n%s." % file_index + if os.path.exists(file_index) + else "Index not used." + ) + return ( + "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs." 
+ % (index_info, *times), + (tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + logger.warning(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.%s" % ( + opt_root, + os.path.basename(path), + format1, + ) + with BytesIO() as wavf: + sf.write(wavf, audio_opt, tgt_sr, format="wav") + wavf.seek(0, 0) + with open(path, "wb") as outf: + wav2(wavf, outf, format1) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..15037e6103c1d8b2cbfa1668b5ffa81f98f12302 --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,449 @@ +import os +import sys +import traceback +import logging + +logger = 
logging.getLogger(__name__) + +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal +from model import rmvpe, device, fp16 + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + ) + self.is_half = fp16 + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + 
self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = device + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + self.model_rmvpe = rmvpe + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + logger.info("Cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is 
not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = 
np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + hasp = pitch is not None and pitchf is not None + arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) + audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() + del hasp, arg + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = 
None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += np.abs(audio_pad[i : i - self.window]) + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + audio_sum[t - self.t_query : t + self.t_query] + == audio_sum[t - self.t_query : t + self.t_query].min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if "mps" not in str(self.device) or "xpu" not in str(self.device): + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + 
)[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7df184fccf1482d9d4f812a5f1b32ef9756fffa0 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,17 @@ +import os + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, _, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) diff --git a/logs/mute/0_gt_wavs/mute32k.wav b/logs/mute/0_gt_wavs/mute32k.wav new file mode 100644 index 
0000000000000000000000000000000000000000..a83c72a4079056aa7b9994c05082d5018dc1b60a --- /dev/null +++ b/logs/mute/0_gt_wavs/mute32k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9edcf85ec77e88bd01edf3d887bdc418d3596d573f7ad2694da546f41dae6baf +size 192078 diff --git a/logs/mute/0_gt_wavs/mute40k.spec.pt b/logs/mute/0_gt_wavs/mute40k.spec.pt new file mode 100644 index 0000000000000000000000000000000000000000..91b61497ce00cf91cd2d6b4d8989de5642b396d9 --- /dev/null +++ b/logs/mute/0_gt_wavs/mute40k.spec.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bbca900dcff32be4d664383a705d53ebc6829027ac8a07d78308d472c9087a1 +size 1230339 diff --git a/logs/mute/0_gt_wavs/mute40k.wav b/logs/mute/0_gt_wavs/mute40k.wav new file mode 100644 index 0000000000000000000000000000000000000000..60e81785a92525bc7a39d98fa16d8209279da9cd --- /dev/null +++ b/logs/mute/0_gt_wavs/mute40k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67a816e77b50cb9f016e49e5c01f07e080c4e3b82b7a8ac3e64bcb143f90f31b +size 240078 diff --git a/zerorvc/assets/mute/mute48k.wav b/logs/mute/0_gt_wavs/mute48k.wav similarity index 100% rename from zerorvc/assets/mute/mute48k.wav rename to logs/mute/0_gt_wavs/mute48k.wav diff --git a/logs/mute/1_16k_wavs/mute.wav b/logs/mute/1_16k_wavs/mute.wav new file mode 100644 index 0000000000000000000000000000000000000000..e40db260891baa6c988dc73c41ec8a14ae23e9ac --- /dev/null +++ b/logs/mute/1_16k_wavs/mute.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e233e86ba1be365e1133f157d56b61110086b89650ecfbdfc013c759e466250 +size 96078 diff --git a/logs/mute/2a_f0/mute.wav.npy b/logs/mute/2a_f0/mute.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..dd7e9afd2e7f2aefaa30bcd4541a23ce96a9e150 --- /dev/null +++ b/logs/mute/2a_f0/mute.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9b9acf9ab7facdb032e1d687fe35182670b0b94566c4b209ae48c239d19956a6 +size 1332 diff --git a/logs/mute/2b-f0nsf/mute.wav.npy b/logs/mute/2b-f0nsf/mute.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..7644e325ddd34bd186153ecf7461aa1593a054f3 --- /dev/null +++ b/logs/mute/2b-f0nsf/mute.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30792849c8e72d67e6691754077f2888b101cb741e9c7f193c91dd9692870c87 +size 2536 diff --git a/logs/mute/3_feature256/mute.npy b/logs/mute/3_feature256/mute.npy new file mode 100644 index 0000000000000000000000000000000000000000..c57ae95d19d969788ef186a81cdc2f4b462ed6df --- /dev/null +++ b/logs/mute/3_feature256/mute.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d5abbac078e19a3f649c0d78a02cb33a71407ded3ddf2db78e6b803d0c0126 +size 152704 diff --git a/logs/mute/3_feature768/mute.npy b/logs/mute/3_feature768/mute.npy new file mode 100644 index 0000000000000000000000000000000000000000..ea5f9dddca08ff210791b27e3db3fc5676eabc90 --- /dev/null +++ b/logs/mute/3_feature768/mute.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ef62b957887ac9f0913aa5158f18983afff1ef5a3e4c5fd067ac20fc380d54 +size 457856 diff --git a/model.py b/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3846765ec92389e155d97acfc077d2aa4225b649 --- /dev/null +++ b/model.py @@ -0,0 +1,23 @@ +from accelerate import Accelerator +from infer.lib.rmvpe import RMVPE +from fairseq.checkpoint_utils import load_model_ensemble_and_task + +accelerator = Accelerator() +device = accelerator.device +print(f"Using device: {device}") + +fp16 = accelerator.mixed_precision == "fp16" +print(f"Using fp16: {fp16}") + +rmvpe_model_path = "assets/rmvpe/rmvpe.pt" +rmvpe = RMVPE(rmvpe_model_path, is_half=fp16, device=device) +print("RMVPE model loaded.") + +hubert_model_path = "assets/hubert/hubert_base.pt" +models, hubert_cfg, _ = 
load_model_ensemble_and_task([hubert_model_path]) +hubert = models[0] +hubert = hubert.to(device) +if fp16: + hubert = hubert.half() +hubert.eval() +print("Hubert model loaded.") diff --git a/my-voices/.gitignore b/my-voices/.gitignore deleted file mode 100644 index d8dd7532abcc65af52e9db03c516274e3d674dc1..0000000000000000000000000000000000000000 --- a/my-voices/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.wav diff --git a/prelude.py b/prelude.py new file mode 100644 index 0000000000000000000000000000000000000000..8160b6a60c995fd0a4e5748e8bb578283d7a09c5 --- /dev/null +++ b/prelude.py @@ -0,0 +1,26 @@ +import os + + +def prelude(): + os.environ["PYTORCH_JIT"] = "0v" + + # patch for jit script + # if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py + # patch it with `def expand_2d_or_3d_tensor(x: Tensor,` + FAIRSEQ_CODE = ( + "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py" + ) + if os.path.exists(FAIRSEQ_CODE): + with open(FAIRSEQ_CODE, "r") as f: + lines = f.readlines() + with open(FAIRSEQ_CODE, "w") as f: + for line in lines: + if ( + "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" + in line + ): + f.write( + "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n" + ) + else: + f.write(line) diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 85834ecd25ab572c921d81044784d8b943e5edf8..0000000000000000000000000000000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,36 +0,0 @@ -[project] -name = "zerorvc" -version = "0.0.19" -authors = [{ name = "Jacob Lin", email = "jacob@csie.cool" }] -description = "Run Retrieval-based Voice Conversion training and inference with ease." 
-readme = "README.md" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] -dependencies = [ - "numpy>=1.0.0", - "torch>=2.0.0", - "datasets", - "accelerate", - "huggingface_hub", - "tqdm", - "librosa", - "scipy", - "tensorboard", -] - -[project.urls] -Homepage = "https://github.com/jacoblincool/zero-rvc" -Issues = "https://github.com/jacoblincool/zero-rvc/issues" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.sdist] -include = ["zerorvc/**/*", "pyproject.toml", "README.md", "LICENSE"] -[tool.hatch.build.targets.wheel] -packages = ["zerorvc"] diff --git a/requirements.txt b/requirements.txt index 2d847640f7f3ae4d83f8aae37a90724d04624e69..dce058017906a038ccba745b2eb2dd9e09b51446 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,32 @@ -zerorvc>=0.0.10 - -# gradio app deps -gradio -demucs==4.0.1 -yt_dlp +joblib>=1.1.0 +numba +numpy +scipy +librosa==0.9.1 +llvmlite +fairseq==0.12.2 +torch==2.2.0 +faiss-cpu +gradio==4.37.2 +Cython +pydub>=0.25.1 +soundfile>=0.12.1 +ffmpeg-python>=0.2.0 +tensorboardX +Jinja2>=3.1.2 +json5 +Markdown +matplotlib>=3.7.0 +matplotlib-inline>=0.1.3 +praat-parselmouth>=0.4.2 +Pillow>=9.1.1 +scikit-learn tensorboard +tqdm>=4.63.1 +pyworld==0.3.2 +httpx +python-dotenv>=1.0.0 +av +accelerate==0.32.0 +demucs==4.0.1 +torchcrepe diff --git a/app/zero.py b/zero.py similarity index 74% rename from app/zero.py rename to zero.py index beb233742df21b5cd4f44160ad0ce6f67ce42816..50d46b735d879de5de44816d97795efbdaa0fff0 100644 --- a/app/zero.py +++ b/zero.py @@ -1,16 +1,13 @@ import os -import logging - -logger = logging.getLogger(__name__) zero_is_available = "SPACES_ZERO_GPU" in os.environ if zero_is_available: import spaces # type: ignore - logger.info("ZeroGPU is available") + print("ZeroGPU is available") else: - logger.info("ZeroGPU is not available") + 
print("ZeroGPU is not available") # a decorator that applies the spaces.GPU decorator if zero is available diff --git a/zerorvc/__init__.py b/zerorvc/__init__.py deleted file mode 100644 index bde8336b218c5218126d3ef8269a0c0dcf83a72f..0000000000000000000000000000000000000000 --- a/zerorvc/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .rvc import RVC -from .trainer import RVCTrainer -from .dataset import prepare -from .synthesizer import SynthesizerTrnMs768NSFsid -from .pretrained import pretrained_checkpoints -from .f0 import load_rmvpe, RMVPE, F0Extractor -from .hubert import load_hubert, HubertModel, HubertFeatureExtractor -from .auto_loader import auto_loaded_model diff --git a/zerorvc/auto_loader.py b/zerorvc/auto_loader.py deleted file mode 100644 index 6444958cf846c9646bb68446e7dfbe9f6fdbe07b..0000000000000000000000000000000000000000 --- a/zerorvc/auto_loader.py +++ /dev/null @@ -1 +0,0 @@ -auto_loaded_model = {} diff --git a/zerorvc/constants.py b/zerorvc/constants.py deleted file mode 100644 index 848f2669e1330c56fbb727f4c464908899789bac..0000000000000000000000000000000000000000 --- a/zerorvc/constants.py +++ /dev/null @@ -1,7 +0,0 @@ -SR_16K = 16000 -SR_48K = 48000 - -N_FFT = 2048 -HOP_LENGTH = 480 -WIN_LENGTH = 2048 -N_MELS = 128 diff --git a/zerorvc/dataset.py b/zerorvc/dataset.py deleted file mode 100644 index 71866210e2f62f042bc5d38911a9707b0066824d..0000000000000000000000000000000000000000 --- a/zerorvc/dataset.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import numpy as np -import torch -import librosa -import logging -import shutil -from pkg_resources import resource_filename -from accelerate import Accelerator -from datasets import load_dataset, DatasetDict, Dataset, Audio -from .preprocess import Preprocessor, crop_feats_length -from .hubert import HubertFeatureExtractor, HubertModel, load_hubert -from .f0 import F0Extractor, RMVPE, load_rmvpe -from .constants import * - - -logger = logging.getLogger(__name__) - - -def extract_hubert_features( 
- rows, - hfe: HubertFeatureExtractor, - hubert: str | HubertModel | None, - device: torch.device, -): - if not hfe.is_loaded(): - model = load_hubert(hubert, device) - hfe.load(model) - feats = [] - for row in rows["wav_16k"]: - feat = hfe.extract_feature_from(row["array"].astype("float32")) - feats.append(feat) - return {"hubert_feats": feats} - - -def extract_f0_features( - rows, f0e: F0Extractor, rmvpe: str | RMVPE | None, device: torch.device -): - if not f0e.is_loaded(): - model = load_rmvpe(rmvpe, device) - f0e.load(model) - f0s = [] - f0nsfs = [] - for row in rows["wav_16k"]: - f0nsf, f0 = f0e.extract_f0_from(row["array"].astype("float32")) - f0s.append(f0) - f0nsfs.append(f0nsf) - return {"f0": f0s, "f0nsf": f0nsfs} - - -def feature_postprocess(rows): - phones = rows["hubert_feats"] - for i, phone in enumerate(phones): - phone = np.repeat(phone, 2, axis=0) - n_num = min(phone.shape[0], 900) - phone = phone[:n_num, :] - phones[i] = phone - - if "f0" in rows: - pitch = rows["f0"][i] - pitch = pitch[:n_num] - pitch = np.array(pitch, dtype=np.float32) - rows["f0"][i] = pitch - if "f0nsf" in rows: - pitchf = rows["f0nsf"][i] - pitchf = pitchf[:n_num] - rows["f0nsf"][i] = pitchf - return rows - - -def calculate_spectrogram( - rows, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH -): - specs = [] - hann_window = np.hanning(win_length) - pad_amount = int((win_length - hop_length) / 2) - for row in rows["wav_gt"]: - stft = librosa.stft( - np.pad(row["array"], (pad_amount, pad_amount), mode="reflect"), - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window, - center=False, - ) - specs.append(np.abs(stft) + 1e-6) - - return {"spec": specs} - - -def fix_length(rows, hop_length=HOP_LENGTH): - for i, row in enumerate(rows["spec"]): - spec = np.array(row) - phone = np.array(rows["hubert_feats"][i]) - pitch = np.array(rows["f0"][i]) - pitchf = np.array(rows["f0nsf"][i]) - wav_gt = np.array(rows["wav_gt"][i]["array"]) - - spec, 
def prepare(
    dir: str | DatasetDict,
    sr=SR_48K,
    hubert: str | HubertModel | None = None,
    rmvpe: str | RMVPE | None = None,
    batch_size=1,
    max_slice_length: float | None = 3.0,
    accelerator: Accelerator = None,
    include_mute=True,
    stage=3,
):
    """
    Prepare the dataset for training or evaluation.

    The work is split into stages; processing stops after the stage given by `stage`:
    stage 0 only loads the audio and casts it to `sr`, stage 1 slices and resamples
    ("wav_gt" / "wav_16k"), stage 2 adds HuBERT and F0 features, stage 3 post-processes
    the features, computes spectrograms and aligns lengths.

    Args:
        dir (str | DatasetDict): The directory path or DatasetDict object containing the dataset.
        sr (int, optional): The target sampling rate. Defaults to SR_48K.
        hubert (str | HubertModel | None, optional): The Hubert model or its name to use for feature extraction. Defaults to None.
        rmvpe (str | RMVPE | None, optional): The RMVPE model or its name to use for feature extraction. Defaults to None.
        batch_size (int, optional): The batch size for processing the dataset. Defaults to 1.
        max_slice_length (float | None, optional): Passed to `Preprocessor` as the maximum slice length.
            When None, no `Preprocessor` is created and each audio row is used whole. Defaults to 3.0.
        accelerator (Accelerator, optional): The accelerator object for distributed training. Defaults to None,
            in which case a new `Accelerator()` is created.
        include_mute (bool, optional): Whether to include a mute audio file in the directory dataset. Defaults to True.
        stage (int, optional): The dataset preparation level to perform. Defaults to 3. (Stage 1 and 3 are CPU intensive, Stage 2 is GPU intensive.)

    Returns:
        DatasetDict: The prepared dataset.
    """
    if accelerator is None:
        accelerator = Accelerator()

    if isinstance(dir, (DatasetDict, Dataset)):
        ds = dir
    else:
        # Directory input: optionally drop the packaged mute sample into the folder
        # (only if it is not already there) before loading it as an audiofolder.
        mute_source = resource_filename("zerorvc", "assets/mute/mute48k.wav")
        mute_dest = os.path.join(dir, "mute.wav")
        if include_mute and not os.path.exists(mute_dest):
            logger.info(f"Copying {mute_source} to {mute_dest}")
            shutil.copy(mute_source, mute_dest)

        ds: DatasetDict | Dataset = load_dataset("audiofolder", data_dir=dir)

    # Keep only the "audio" column of every split.
    # NOTE(review): this indexes `ds` by split name, which presumably assumes a
    # DatasetDict even though a plain Dataset passes the isinstance check above — confirm.
    for key in ds:
        ds[key] = ds[key].remove_columns(
            [col for col in ds[key].column_names if col != "audio"]
        )
    ds = ds.cast_column("audio", Audio(sampling_rate=sr))

    if stage <= 0:
        return ds

    # Stage 1, CPU intensive

    pp = Preprocessor(sr, max_slice_length) if max_slice_length is not None else None

    def preprocess(rows):
        # One input row may yield several output rows (one per slice), which is why
        # the original "audio" column is removed by the map call below.
        wav_gt = []
        wav_16k = []
        for row in rows["audio"]:
            if pp is not None:
                slices = pp.preprocess_audio(row["array"])
                for slice in slices:
                    wav_gt.append({"path": "", "array": slice, "sampling_rate": sr})
                    slice16k = librosa.resample(slice, orig_sr=sr, target_sr=SR_16K)
                    wav_16k.append(
                        {"path": "", "array": slice16k, "sampling_rate": SR_16K}
                    )
            else:
                slice = row["array"]
                wav_gt.append({"path": "", "array": slice, "sampling_rate": sr})
                slice16k = librosa.resample(slice, orig_sr=sr, target_sr=SR_16K)
                wav_16k.append({"path": "", "array": slice16k, "sampling_rate": SR_16K})
        return {"wav_gt": wav_gt, "wav_16k": wav_16k}

    ds = ds.map(
        preprocess, batched=True, batch_size=batch_size, remove_columns=["audio"]
    )
    ds = ds.cast_column("wav_gt", Audio(sampling_rate=sr))
    ds = ds.cast_column("wav_16k", Audio(sampling_rate=SR_16K))

    if stage <= 1:
        return ds

    # Stage 2, GPU intensive

    # The extractors start empty; the map functions load the models on first use.
    hfe = HubertFeatureExtractor()
    ds = ds.map(
        extract_hubert_features,
        batched=True,
        batch_size=batch_size,
        fn_kwargs={"hfe": hfe, "hubert": hubert, "device": accelerator.device},
    )

    f0e = F0Extractor()
    ds = ds.map(
        extract_f0_features,
        batched=True,
        batch_size=batch_size,
        fn_kwargs={"f0e": f0e, "rmvpe": rmvpe, "device": accelerator.device},
    )

    if stage <= 2:
        return ds

    # Stage 3, CPU intensive

    ds = ds.map(feature_postprocess, batched=True, batch_size=batch_size)
    ds = ds.map(calculate_spectrogram, batched=True, batch_size=batch_size)
    ds = ds.map(fix_length, batched=True, batch_size=batch_size)

    return ds
def load_rmvpe(
    rmvpe: str | RMVPE | None = None, device: torch.device = torch.device("cpu")
) -> RMVPE:
    """
    Load the RMVPE model from a file or download it if necessary.
    If a loaded model is provided, it is only moved to `device` and returned.

    Models created here (from a path or from the default download) are switched to
    evaluation mode: the network contains Dropout and BatchNorm layers, which would
    otherwise stay in training mode and disturb the pitch estimates at inference time.

    Args:
        rmvpe (str | RMVPE | None): The path to the RMVPE model file or the pre-loaded RMVPE model. If None, the default model will be downloaded.
        device (torch.device): The device to load the model on.

    Returns:
        RMVPE: The loaded RMVPE model, placed on `device`.

    Raises:
        If the model file does not exist.
    """
    if isinstance(rmvpe, RMVPE):
        # Caller-owned instance: respect whatever train/eval mode it is already in.
        return rmvpe.to(device)

    if isinstance(rmvpe, str):
        model = RMVPE(4, 1, (2, 2))
        model.load_state_dict(torch.load(rmvpe, map_location=device, weights_only=True))
        model.to(device)
        model.eval()
        return model

    if "rmvpe" not in auto_loaded_model:
        checkpoint = hf_hub_download("lj1995/VoiceConversionWebUI", "rmvpe.pt")
        model = RMVPE(4, 1, (2, 2))
        model.load_state_dict(
            torch.load(checkpoint, map_location="cpu", weights_only=True)
        )
        model.eval()
        auto_loaded_model["rmvpe"] = model

    # Move on every call: the cached model may have been placed on a different
    # device by an earlier caller.
    return auto_loaded_model["rmvpe"].to(device)
class ConvBlockRes(nn.Module):
    """
    Residual block made of two (3x3 conv -> batch norm -> ReLU) stages.

    The input is added back to the stage output. When the channel count changes,
    the input first goes through a 1x1 convolution stored as `shortcut`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        momentum (float, optional): Momentum of both batch-norm layers. Defaults to 0.01.
    """

    def __init__(self, in_channels: int, out_channels: int, momentum=0.01):
        super().__init__()
        stages = []
        for stage_in in (in_channels, out_channels):
            stages += [
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]
        self.conv = nn.Sequential(*stages)
        # `shortcut` is created only when needed, so the attribute (and its
        # state-dict entries) is absent for same-width blocks.
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        out = self.conv(x)
        if hasattr(self, "shortcut"):
            return out + self.shortcut(x)
        return out + x
class ResEncoderBlock(nn.Module):
    """
    Stack of `ConvBlockRes` blocks, optionally followed by average pooling.

    Args:
        in_channels (int): Channels of the input tensor.
        out_channels (int): Channels produced by every block in the stack.
        kernel_size: Pooling kernel size; when None no pooling layer is created.
        n_blocks (int, optional): Number of residual blocks (at least one is always built). Defaults to 1.
        momentum (float, optional): Batch-norm momentum passed to each block. Defaults to 0.01.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        n_blocks=1,
        momentum=0.01,
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # The first block adapts the channel count, the remaining ones keep it.
        blocks = [ConvBlockRes(in_channels, out_channels, momentum)]
        blocks.extend(
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        )
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the residual stack.

        Returns:
            The stack output alone when `kernel_size` is None, otherwise a tuple
            `(output, pooled_output)`.
        """
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
class MelSpectrogram(nn.Module):
    """
    Log-mel spectrogram front end built on `torch.stft` and a librosa mel filterbank.

    Args:
        n_mel_channels (int): Number of mel bands.
        sampling_rate (int): Sampling rate used to build the filterbank.
        win_length (int): STFT window length.
        hop_length (int): STFT hop length.
        n_fft (int, optional): FFT size; falls back to `win_length` when None.
        mel_fmin (int, optional): Lowest filterbank frequency. Defaults to 0.
        mel_fmax (int, optional): Highest filterbank frequency. Defaults to None.
        clamp (float, optional): Floor applied to the mel energies before the log. Defaults to 1e-5.
    """

    def __init__(
        self,
        n_mel_channels: int,
        sampling_rate: int,
        win_length: int,
        hop_length: int,
        n_fft: int = None,
        mel_fmin: int = 0,
        mel_fmax: int = None,
        clamp: float = 1e-5,
    ):
        super().__init__()
        if n_fft is None:
            n_fft = win_length
        filterbank = librosa.filters.mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        # Non-persistent: the filterbank is rebuilt on construction, not stored in checkpoints.
        self.register_buffer(
            "mel_basis", torch.from_numpy(filterbank).float(), persistent=False
        )
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

        # Key shift and speed are fixed at their neutral values, so the derived
        # STFT sizes equal the configured ones.
        self.keyshift = 0
        self.speed = 1
        self.factor = 2 ** (self.keyshift / 12)
        self.n_fft_new = int(np.round(self.n_fft * self.factor))
        self.win_length_new = int(np.round(self.win_length * self.factor))
        self.hop_length_new = int(np.round(self.hop_length * self.speed))
        self.register_buffer(
            "hann_window_0", torch.hann_window(self.win_length_new), persistent=False
        )

    def forward(self, audio: torch.Tensor, center=True):
        """Return the log of the clamped mel-filtered STFT magnitude of `audio`."""
        spectrum = torch.stft(
            audio,
            n_fft=self.n_fft_new,
            hop_length=self.hop_length_new,
            win_length=self.win_length_new,
            window=self.hann_window_0,
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))
        mel_energy = torch.matmul(self.mel_basis, magnitude)
        return torch.log(torch.clamp(mel_energy, min=self.clamp))
class RMVPE(nn.Module):
    """
    RMVPE pitch estimator: mel spectrogram -> DeepUnet -> 3-channel conv -> GRU/linear
    head producing a per-frame salience over N_CLASS pitch bins.

    Args:
        n_blocks (int): Number of residual blocks per U-Net stage.
        n_gru (int): Number of BiGRU layers in the head; a falsy value selects a plain linear head.
        kernel_size: Pooling kernel / stride forwarded to the U-Net.
        en_de_layers (int, optional): Encoder/decoder depth. Defaults to 5.
        inter_layers (int, optional): Number of intermediate layers. Defaults to 4.
        in_channels (int, optional): Input channels of the U-Net. Defaults to 1.
        en_out_channels (int, optional): Channels after the first encoder stage. Defaults to 16.
    """

    def __init__(
        self,
        n_blocks: int,
        n_gru: int,
        kernel_size: int,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.device = torch.device("cpu")
        self.mel_extractor = MelSpectrogram(
            N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, None, MEL_FMIN, MEL_FMAX
        )
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * N_MELS, 256, n_gru),
                nn.Linear(512, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
            )

        # Cents value of every pitch bin, zero-padded by 4 on both sides so a
        # 9-bin window around any bin is always in range (368 entries).
        cents_mapping = 20 * np.arange(360) + MAGIC_CONST
        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Map a mel spectrogram to per-frame pitch-bin salience."""
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x

    def to(self, device):
        """Move the module to `device` and remember it for `infer_from_audio`."""
        self.device = device
        return super().to(device)

    def mel2hidden(self, mel: torch.Tensor):
        """Run the network without gradients, padding frames to a multiple of 32."""
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            hidden = self(mel)
            # Drop the frames that only exist because of the padding.
            return hidden[:, :n_frames]

    def decode(self, hidden: np.ndarray, thred=0.03):
        """Convert salience (frames, bins) to f0 values; frames below `thred` become 0."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # A cents value of 0 marks a rejected frame and maps to exactly 10; report it as 0.
        f0[f0 == 10] = 0
        return f0

    def infer(self, audio: torch.Tensor, thred=0.03):
        """Estimate f0 for a 1-D audio tensor that is already on the model device."""
        mel = self.mel_extractor(audio.unsqueeze(0), center=True)
        hidden = self.mel2hidden(mel)
        hidden = hidden[0]
        # Hand NumPy a real ndarray instead of relying on implicit tensor coercion.
        f0 = self.decode(hidden.float().cpu().numpy(), thred=thred)
        return f0

    def infer_from_audio(self, audio: np.ndarray, thred=0.03):
        """Estimate f0 for a NumPy waveform by moving it to the model device first."""
        audio = torch.from_numpy(audio).to(self.device)
        return self.infer(audio, thred=thred)

    def to_local_average_cents(self, salience: np.ndarray, thred=0.05) -> np.ndarray:
        """
        Salience-weighted average of the cents values in a 9-bin window around each frame's peak.

        Args:
            salience (np.ndarray): Array of shape (frames, bins).
            thred (float, optional): Frames whose maximum salience is <= `thred` yield 0. Defaults to 0.05.

        Returns:
            np.ndarray: One cents value per frame (empty for zero frames).
        """
        salience = np.asarray(salience)
        n_frames = salience.shape[0]
        center = np.argmax(salience, axis=1)  # (frames,) index of the peak bin
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (frames, 368)

        # After padding, bin c sits at column c + 4, so the window c-4..c+4
        # occupies padded columns c..c+8.
        window = center[:, None] + np.arange(9)  # (frames, 9)
        frame_idx = np.arange(n_frames)[:, None]
        todo_salience = salience[frame_idx, window]  # (frames, 9)
        todo_cents_mapping = self.cents_mapping[window]  # (frames, 9)

        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)  # (frames,)
        devided = product_sum / weight_sum  # (frames,)

        maxx = np.max(salience, axis=1)  # (frames,)
        devided[maxx <= thred] = 0
        return devided
class HubertFeatureExtractor:
    """
    Extract content features from audio with a HuBERT model.

    Args:
        hubert (HubertModel, optional): A loaded model; when given it is attached
            immediately via `load`. Defaults to None.
        sr (int, optional): Sampling rate of the audio passed to the processor. Defaults to SR_16K.
    """

    def __init__(self, hubert: HubertModel = None, sr=SR_16K):
        self.sr = sr
        if hubert is not None:
            self.load(hubert)

    def load(self, hubert: HubertModel):
        """Attach `hubert`, record the device of its parameters and load the processor."""
        self.hubert = hubert
        self.device = next(hubert.parameters()).device
        self.processor = AutoProcessor.from_pretrained("safe-models/ContentVec")
        logger.info(f"HuBERT model is on {self.device}")

    def is_loaded(self) -> bool:
        """Return True once a model has been attached with `load`."""
        return hasattr(self, "hubert")

    def extract_feature_from(self, y: np.ndarray) -> np.ndarray:
        """
        Compute features for one waveform.

        Args:
            y (np.ndarray): Audio samples at `self.sr`.

        Returns:
            np.ndarray: Entry 12 of the model's hidden states with the batch axis
            squeezed out; NaN values are replaced via `np.nan_to_num`.
        """
        import torch  # local import: this module does not import torch at file level

        input_values = self.processor(
            y, sampling_rate=self.sr, return_tensors="pt"
        ).input_values
        input_values = input_values.to(self.device)
        # Pure inference: skip autograd bookkeeping instead of building a graph
        # that is thrown away right after.
        with torch.no_grad():
            feats = self.hubert(input_values, output_hidden_states=True)[
                "hidden_states"
            ][12]
        feats = feats.squeeze(0).float().cpu().detach().numpy()
        if np.isnan(feats).any():
            feats = np.nan_to_num(feats)
        return feats

    def extract_feature(self, wav_file: str) -> np.ndarray:
        """Load `wav_file` at `self.sr` and return its features."""
        y, _ = librosa.load(wav_file, sr=self.sr)
        return self.extract_feature_from(y)
def crop_feats_length(
    spec: np.ndarray, phone: np.ndarray, pitch: np.ndarray, pitchf: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Trim all four feature arrays to the shorter of the phone and spectrogram lengths.

    The phone length is `phone.shape[0]` and the spectrogram length is
    `spec.shape[1]`. When they already match, every input is returned untouched.

    Returns:
        The tuple `(spec, phone, pitch, pitchf)`.
    """
    n_phone, n_spec = phone.shape[0], spec.shape[1]
    if n_phone == n_spec:
        return spec, phone, pitch, pitchf

    keep = min(n_phone, n_spec)
    return spec[:, :keep], phone[:keep, :], pitch[:keep], pitchf[:keep]
class Preprocessor:
    """
    Slice raw audio into overlapping, normalised training segments.

    Args:
        sr (int): Sampling rate of the audio to process.
        max_slice_length (float, optional): Length in seconds of a full slice. Defaults to 3.0.
        min_slice_length (float, optional): A trailing slice is kept only if it is
            longer than this many seconds. Defaults to 0.5.
    """

    def __init__(
        self, sr: int, max_slice_length: float = 3.0, min_slice_length: float = 0.5
    ):
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        # 5th-order Butterworth high-pass filter with a 48 Hz cutoff.
        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
        self.max_slice_length = max_slice_length
        self.min_slice_length = min_slice_length
        self.overlap = 0.3
        self.tail = self.max_slice_length + self.overlap
        self.max = 0.9
        self.alpha = 0.75

    def norm(self, samples: np.ndarray) -> np.ndarray:
        """
        Blend peak-normalised audio with the original signal.

        The result is `alpha * (samples / peak * max) + (1 - alpha) * samples`.
        Empty or all-zero input has no peak to normalise against and is returned
        as an unchanged copy instead of dividing by zero (which would yield NaNs).
        """
        samples = np.asarray(samples)
        sample_max = np.abs(samples).max() if samples.size else 0
        if sample_max == 0:
            return samples.copy()
        normalized = samples / sample_max * self.max
        return (normalized * self.alpha) + (samples * (1 - self.alpha))

    def preprocess_audio(self, y: np.ndarray) -> list[np.ndarray]:
        """
        High-pass filter `y`, split it with the slicer and cut each part into slices.

        Consecutive slices start `max_slice_length - overlap` seconds apart. While
        more than `tail` seconds remain a full `max_slice_length` slice is taken;
        the remainder becomes a final slice if it is longer than `min_slice_length`.

        Returns:
            list[np.ndarray]: The normalised slices.
        """
        y = signal.filtfilt(self.bh, self.ah, y)
        audios = []
        for audio in self.slicer.slice(y):
            i = 0
            while True:
                start = int(self.sr * (self.max_slice_length - self.overlap) * i)
                i += 1
                remaining = audio[start:]
                if len(remaining) > self.tail * self.sr:
                    chunk = audio[start : start + int(self.max_slice_length * self.sr)]
                    audios.append(self.norm(chunk))
                else:
                    if len(remaining) > self.min_slice_length * self.sr:
                        audios.append(self.norm(remaining))
                    break
        return audios

    def preprocess_file(self, file_path: str) -> list[np.ndarray]:
        """Load `file_path` at `self.sr` and return its normalised slices."""
        y, _ = librosa.load(file_path, sr=self.sr)
        return self.preprocess_audio(y)
def pretrained_checkpoints() -> Tuple[str, str]:
    """
    Fetch the pretrained 48k checkpoints from the Hugging Face Hub.

    Returns:
        A tuple `(G, D)` with the local paths of the downloaded generator and
        discriminator checkpoints, in that order.
    """
    repo_id = "lj1995/VoiceConversionWebUI"
    generator_path = hf_hub_download(repo_id, "pretrained_v2/f0G48k.pth")
    discriminator_path = hf_hub_download(repo_id, "pretrained_v2/f0D48k.pth")
    return generator_path, discriminator_path
- - Methods: - from_pretrained(name, sr=SR_48K, hubert=None, rmvpe=None, accelerator=Accelerator(), **from_pretrained_kwargs): - Creates an instance of RVC using the from_pretrained method. - - convert(audio, protect=0.33): - Converts the input audio to the target voice using the pre-trained model. - - convert_dataset(dataset, protect=0.33): - Converts a dataset of audio samples to the target voice using the pre-trained model. - - convert_file(audio, protect=0.33): - Converts a single audio file to the target voice using the pre-trained model. - - convert_from_wav16k(wav16k, protect=0.33): - Converts a 16kHz waveform to the target voice using the pre-trained model. - - convert_from_features(phone, pitchf, pitch, protect=0.33): - Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model. - """ - - def __init__( - self, - name: str | SynthesizerTrnMs768NSFsid, - sr=SR_48K, - segment_size=30.0, - hubert: str | HubertModel | None = None, - rmvpe: str | RMVPE | None = None, - accelerator: Accelerator = Accelerator(), - from_pretrained_kwargs={}, - ): - """ - Initializes an instance of the RVC class. - - Args: - name (str | SynthesizerTrnMs768NSFsid): The name of the pre-trained model or the model instance itself. - sr (int, optional): The sample rate of the input audio. Defaults to SR_48K. - hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None. - rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None. - accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator(). - from_pretrained_kwargs (dict, optional): Additional keyword arguments for loading the pre-trained model. Defaults to {}. 
- """ - self.model = ( - SynthesizerTrnMs768NSFsid.from_pretrained(name, **from_pretrained_kwargs) - if isinstance(name, str) - else name - ) - self.model = self.model.to(accelerator.device) - self.sr = sr - self.segment_size = segment_size - self.hubert = HubertFeatureExtractor(load_hubert(hubert, accelerator.device)) - self.rmvpe = F0Extractor(load_rmvpe(rmvpe, accelerator.device)) - self.accelerator = accelerator - - @staticmethod - def from_pretrained( - name: str, - sr=SR_48K, - segment_size=30.0, - hubert: str | HubertModel | None = None, - rmvpe: str | RMVPE | None = None, - accelerator: Accelerator = Accelerator(), - **from_pretrained_kwargs, - ): - """ - Creates an instance of RVC using the from_pretrained method. - - Args: - name (str): The name of the pre-trained model. - sr (int, optional): The sample rate of the input audio. Defaults to SR_48K. - segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds. - hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None. - rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None. - accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator(). - from_pretrained_kwargs (dict): Additional keyword arguments for loading the pre-trained model. - - Returns: - RVC: An instance of the RVC class. - """ - return RVC( - name, sr, segment_size, hubert, rmvpe, accelerator, from_pretrained_kwargs - ) - - def convert( - self, audio: str | Dataset | np.ndarray, protect=0.33, pitch_modification=0.0 - ): - """ - Converts the input audio to the target voice using the pre-trained model. - - Args: - audio (str | Dataset | np.ndarray): The input audio to be converted. It can be a file path, a dataset of audio samples, or a numpy array. 
- protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33. - pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0. - - Returns: - np.ndarray: The converted audio in the target voice. - If the input is a dataset, it yields the converted audio samples one by one. - """ - logger.info( - f"audio: {audio}, protect: {protect}, pitch_modification: {pitch_modification}" - ) - if isinstance(audio, str): - return self.convert_file(audio, protect, pitch_modification) - if isinstance(audio, Dataset): - return self.convert_dataset(audio, protect, pitch_modification) - return self.convert_from_wav16k(audio, protect, pitch_modification) - - def convert_dataset(self, dataset: Dataset, protect=0.33, pitch_modification=0.0): - """ - Converts a dataset of audio samples to the target voice using the pre-trained model. - - Args: - dataset (Dataset): The dataset of audio samples to be converted. - protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33. - pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0. - - Yields: - np.ndarray: The converted audio samples in the target voice. - """ - for i, data in enumerate(dataset): - logger.info(f"Converting data {i}") - phone = data["hubert_feats"] - pitchf = data["f0nsf"] - pitch = data["f0"] - yield self.convert_from_features( - phone, pitchf, pitch, protect, pitch_modification - ) - - def convert_file( - self, audio: str, protect=0.33, pitch_modification=0.0 - ) -> np.ndarray: - """ - Converts a single audio file to the target voice using the pre-trained model. - - Args: - audio (str): The path to the audio file to be converted. - protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33. - pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0. - - Returns: - np.ndarray: The converted audio in the target voice. 
- """ - wav16k, _ = librosa.load(audio, sr=SR_16K) - logger.info(f"Loaded {audio} with shape {wav16k.shape}") - return self.convert_from_wav16k(wav16k, protect, pitch_modification) - - def convert_from_wav16k( - self, wav16k: np.ndarray, protect=0.33, pitch_modification=0.0 - ) -> np.ndarray: - """ - Converts a 16kHz waveform to the target voice using the pre-trained model. - - Args: - wav16k (np.ndarray): The 16kHz waveform to be converted. - protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33. - pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0. - - Returns: - np.ndarray: The converted audio in the target voice. - """ - - ret = [] - segment_size = int(self.segment_size * SR_16K) - for i in range(0, len(wav16k), segment_size): - segment = wav16k[i : i + segment_size] - segment = np.pad(segment, (SR_16K, SR_16K), mode="reflect") - logger.info(f"Padded audio with shape {segment.shape}") - - pitchf, pitch = self.rmvpe.extract_f0_from(segment) - phone = self.hubert.extract_feature_from(segment) - - ret.append( - self.convert_from_features( - phone, pitchf, pitch, protect, pitch_modification - )[self.sr : -self.sr] - ) - - return np.concatenate(ret) - - def convert_from_features( - self, - phone: np.ndarray, - pitchf: np.ndarray, - pitch: np.ndarray, - protect=0.33, - pitch_modification=0.0, - ) -> np.ndarray: - """ - Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model. - - Args: - phone (np.ndarray): The phone features of the audio. - pitchf (np.ndarray): The pitch features of the audio. - pitch (np.ndarray): The pitch values of the audio. - protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33. - pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0. - - Returns: - np.ndarray: The converted audio in the target voice. 
- """ - use_protect = protect < 0.5 - - if not np.isclose(pitch_modification, 0.0): - pitchf *= pow(2, pitch_modification / 12) - pitch = self.rmvpe.calculate_f0_from_f0nsf(pitchf) - - pitchf = np.expand_dims(pitchf, axis=0) - pitch = np.expand_dims(pitch, axis=0) - phone = np.expand_dims(phone, axis=0) - - self.model.eval() - with torch.no_grad(), self.accelerator.device: - pitchf = torch.from_numpy(pitchf).to( - dtype=torch.float32, device=self.accelerator.device - ) - pitch = torch.from_numpy(pitch).to( - dtype=torch.long, device=self.accelerator.device - ) - phone = torch.from_numpy(phone).to( - dtype=torch.float32, device=self.accelerator.device - ) - - if use_protect: - feats0 = phone.clone() - - feats: torch.Tensor = F.interpolate( - phone.permute(0, 2, 1), scale_factor=2 - ).permute(0, 2, 1) - if use_protect: - feats0: torch.Tensor = F.interpolate( - feats0.permute(0, 2, 1), scale_factor=2 - ).permute(0, 2, 1) - - # It's originally like this, but I think it's ok to assume that feats.shape[1] <= phone_len - # maybe we should use the same crop function from preprocessor - # phone_len = wav16k.shape[0] // 160 - # if feats.shape[1] < phone_len: - # ... 
- phone_len = feats.shape[1] - pitch = pitch[:, :phone_len] - pitchf = pitchf[:, :phone_len] - - if use_protect: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - - phone_len = torch.tensor([phone_len], dtype=torch.long) - sid = torch.tensor([0], dtype=torch.long) - - logger.info(f"Feats shape: {feats.shape}") - logger.info(f"Phone len: {phone_len}") - logger.info(f"Pitch shape: {pitch.shape}") - logger.info(f"Pitchf shape: {pitchf.shape}") - logger.info(f"SID shape: {sid}") - audio_segment = ( - self.model.infer(feats, phone_len, pitch, pitchf, sid)[0][0, 0] - .data.cpu() - .float() - .numpy() - ) - logger.info( - f"Generated audio shape: {audio_segment.shape} {audio_segment.dtype}" - ) - return audio_segment diff --git a/zerorvc/synthesizer/__init__.py b/zerorvc/synthesizer/__init__.py deleted file mode 100644 index 9b3ac8b3cc9fc4d1a6f9a5ceee9caf4610ac0632..0000000000000000000000000000000000000000 --- a/zerorvc/synthesizer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .models import SynthesizerTrnMs768NSFsid, MultiPeriodDiscriminator diff --git a/zerorvc/trainer.py b/zerorvc/trainer.py deleted file mode 100644 index 63d15b80a6c016cf83f252fac3941c8d97455c34..0000000000000000000000000000000000000000 --- a/zerorvc/trainer.py +++ /dev/null @@ -1,709 +0,0 @@ -import os -from glob import glob -from logging import getLogger -from typing import Literal, Optional, Tuple -from pathlib import Path -from threading import Thread -import torch -import torch.nn.functional as F -from torch.utils.data import DataLoader -from accelerate import Accelerator -from datasets import Dataset -from .pretrained import pretrained_checkpoints -from .constants import * -from torch.utils.tensorboard import SummaryWriter -import time -from tqdm.auto import tqdm -from huggingface_hub import HfApi, upload_folder - -from .synthesizer import 
commons -from .synthesizer.models import ( - SynthesizerTrnMs768NSFsid, - MultiPeriodDiscriminator, -) - -from .utils.losses import ( - discriminator_loss, - feature_loss, - generator_loss, - kl_loss, -) -from .utils.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from .utils.data_utils import TextAudioCollateMultiNSFsid - -logger = getLogger(__name__) - - -class TrainingCheckpoint: - def __init__( - self, - epoch: int, - G: SynthesizerTrnMs768NSFsid, - D: MultiPeriodDiscriminator, - optimizer_G: torch.optim.AdamW, - optimizer_D: torch.optim.AdamW, - scheduler_G: torch.optim.lr_scheduler.ExponentialLR, - scheduler_D: torch.optim.lr_scheduler.ExponentialLR, - loss_gen: float, - loss_fm: float, - loss_mel: float, - loss_kl: float, - loss_gen_all: float, - loss_disc: float, - ): - self.epoch = epoch - self.G = G - self.D = D - self.optimizer_G = optimizer_G - self.optimizer_D = optimizer_D - self.scheduler_G = scheduler_G - self.scheduler_D = scheduler_D - self.loss_gen = loss_gen - self.loss_fm = loss_fm - self.loss_mel = loss_mel - self.loss_kl = loss_kl - self.loss_gen_all = loss_gen_all - self.loss_disc = loss_disc - - def save( - self, - exp_dir="./", - g_checkpoint: str | None = None, - d_checkpoint: str | None = None, - ): - g_path = g_checkpoint if g_checkpoint is not None else f"G_latest.pth" - d_path = d_checkpoint if d_checkpoint is not None else f"D_latest.pth" - torch.save( - { - "epoch": self.epoch, - "model": self.G.state_dict(), - "optimizer": self.optimizer_G.state_dict(), - "scheduler": self.scheduler_G.state_dict(), - "loss_gen": self.loss_gen, - "loss_fm": self.loss_fm, - "loss_mel": self.loss_mel, - "loss_kl": self.loss_kl, - "loss_gen_all": self.loss_gen_all, - "loss_disc": self.loss_disc, - }, - os.path.join(exp_dir, g_path), - ) - torch.save( - { - "epoch": self.epoch, - "model": self.D.state_dict(), - "optimizer": self.optimizer_D.state_dict(), - "scheduler": self.scheduler_D.state_dict(), - }, - os.path.join(exp_dir, d_path), 
- ) - - -def latest_checkpoint_file(files: list[str]) -> str: - try: - return max(files, key=lambda x: int(Path(x).stem.split("_")[1])) - except: - return max(files, key=os.path.getctime) - - -class RVCTrainer: - def __init__( - self, - exp_dir: str, - dataset_train: Dataset, - dataset_test: Optional[Dataset] = None, - sr: int = SR_48K, - ): - self.exp_dir = exp_dir - self.dataset_train = dataset_train - self.dataset_test = dataset_test - self.sr = sr - self.writer = SummaryWriter( - os.path.join(exp_dir, "logs", time.strftime("%Y%m%d-%H%M%S")) - ) - - def latest_checkpoint(self, fallback_to_pretrained: bool = True): - files_g = glob(os.path.join(self.exp_dir, "G_*.pth")) - if not files_g: - return pretrained_checkpoints() if fallback_to_pretrained else None - latest_g = latest_checkpoint_file(files_g) - - files_d = glob(os.path.join(self.exp_dir, "D_*.pth")) - if not files_d: - return pretrained_checkpoints() if fallback_to_pretrained else None - latest_d = latest_checkpoint_file(files_d) - - return latest_g, latest_d - - def setup_models( - self, - resume_from: Tuple[str, str] | None = None, - accelerator: Accelerator | None = None, - lr=1e-4, - lr_decay=0.999875, - betas: Tuple[float, float] = (0.8, 0.99), - eps=1e-9, - use_spectral_norm=False, - segment_size=17280, - filter_length=N_FFT, - hop_length=HOP_LENGTH, - inter_channels=192, - hidden_channels=192, - filter_channels=768, - n_heads=2, - n_layers=6, - kernel_size=3, - p_dropout=0.0, - resblock: Literal["1", "2"] = "1", - resblock_kernel_sizes: list[int] = [3, 7, 11], - resblock_dilation_sizes: list[list[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_initial_channel=512, - upsample_rates: list[int] = [12, 10, 2, 2], - upsample_kernel_sizes: list[int] = [24, 20, 4, 4], - spk_embed_dim=109, - gin_channels=256, - ) -> Tuple[ - SynthesizerTrnMs768NSFsid, - MultiPeriodDiscriminator, - torch.optim.AdamW, - torch.optim.AdamW, - torch.optim.lr_scheduler.ExponentialLR, - 
torch.optim.lr_scheduler.ExponentialLR, - int, - ]: - if accelerator is None: - accelerator = Accelerator() - - G = SynthesizerTrnMs768NSFsid( - spec_channels=filter_length // 2 + 1, - segment_size=segment_size // hop_length, - inter_channels=inter_channels, - hidden_channels=hidden_channels, - filter_channels=filter_channels, - n_heads=n_heads, - n_layers=n_layers, - kernel_size=kernel_size, - p_dropout=p_dropout, - resblock=resblock, - resblock_kernel_sizes=resblock_kernel_sizes, - resblock_dilation_sizes=resblock_dilation_sizes, - upsample_initial_channel=upsample_initial_channel, - upsample_rates=upsample_rates, - upsample_kernel_sizes=upsample_kernel_sizes, - spk_embed_dim=spk_embed_dim, - gin_channels=gin_channels, - sr=self.sr, - ).to(accelerator.device) - D = MultiPeriodDiscriminator(use_spectral_norm=use_spectral_norm).to( - accelerator.device - ) - - optimizer_G = torch.optim.AdamW( - G.parameters(), - lr, - betas=betas, - eps=eps, - ) - optimizer_D = torch.optim.AdamW( - D.parameters(), - lr, - betas=betas, - eps=eps, - ) - - if resume_from is not None: - g_checkpoint, d_checkpoint = resume_from - logger.info(f"Resuming from {g_checkpoint} and {d_checkpoint}") - - G_checkpoint = torch.load( - g_checkpoint, map_location=accelerator.device, weights_only=True - ) - D_checkpoint = torch.load( - d_checkpoint, map_location=accelerator.device, weights_only=True - ) - - if "epoch" in G_checkpoint: - finished_epoch = int(G_checkpoint["epoch"]) - try: - finished_epoch = int(Path(g_checkpoint).stem.split("_")[1]) - except: - finished_epoch = 0 - - scheduler_G = torch.optim.lr_scheduler.ExponentialLR( - optimizer_G, gamma=lr_decay, last_epoch=finished_epoch - 1 - ) - scheduler_D = torch.optim.lr_scheduler.ExponentialLR( - optimizer_D, gamma=lr_decay, last_epoch=finished_epoch - 1 - ) - - G.load_state_dict(G_checkpoint["model"]) - if "optimizer" in G_checkpoint: - optimizer_G.load_state_dict(G_checkpoint["optimizer"]) - if "scheduler" in G_checkpoint: - 
scheduler_G.load_state_dict(G_checkpoint["scheduler"]) - - D.load_state_dict(D_checkpoint["model"]) - if "optimizer" in D_checkpoint: - optimizer_D.load_state_dict(D_checkpoint["optimizer"]) - if "scheduler" in D_checkpoint: - scheduler_D.load_state_dict(D_checkpoint["scheduler"]) - else: - finished_epoch = 0 - scheduler_G = torch.optim.lr_scheduler.ExponentialLR( - optimizer_G, gamma=lr_decay, last_epoch=-1 - ) - scheduler_D = torch.optim.lr_scheduler.ExponentialLR( - optimizer_D, gamma=lr_decay, last_epoch=-1 - ) - - G, D, optimizer_G, optimizer_D = accelerator.prepare( - G, D, optimizer_G, optimizer_D - ) - - return G, D, optimizer_G, optimizer_D, scheduler_G, scheduler_D, finished_epoch - - def setup_dataloader( - self, - dataset: Dataset, - batch_size=1, - shuffle=True, - accelerator: Accelerator | None = None, - ): - if accelerator is None: - accelerator = Accelerator() - - dataset = dataset.with_format("torch", device=accelerator.device) - loader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=shuffle, - collate_fn=TextAudioCollateMultiNSFsid(), - ) - loader = accelerator.prepare(loader) - return loader - - def run( - self, - G, - D, - optimizer_G, - optimizer_D, - scheduler_G, - scheduler_D, - finished_epoch, - loader_train, - loader_test, - accelerator: Accelerator | None = None, - epochs=100, - segment_size=17280, - filter_length=N_FFT, - hop_length=HOP_LENGTH, - n_mel_channels=N_MELS, - win_length=WIN_LENGTH, - mel_fmin=0.0, - mel_fmax: float | None = None, - c_mel=45, - c_kl=1.0, - upload_to_hub: str | None = None, - upload_window_minutes=5, - ): - if accelerator is None: - accelerator = Accelerator() - - if accelerator.is_main_process: - logger.info("Start training") - - upload_state_last = 0.0 - - prev_loss_gen = -1.0 - prev_loss_fm = -1.0 - prev_loss_mel = -1.0 - prev_loss_kl = -1.0 - prev_loss_disc = -1.0 - prev_loss_gen_all = -1.0 - - with accelerator.autocast(): - epoch_iterator = tqdm( - range(1, epochs + 1), - desc="Training", - 
disable=not accelerator.is_main_process, - ) - for epoch in epoch_iterator: - if epoch <= finished_epoch: - continue - - G.train() - D.train() - - epoch_loss_gen = 0.0 - epoch_loss_fm = 0.0 - epoch_loss_mel = 0.0 - epoch_loss_kl = 0.0 - epoch_loss_disc = 0.0 - epoch_loss_gen_all = 0.0 - num_batches = 0 - - batch_iterator = tqdm( - loader_train, - desc=f"Epoch {epoch}", - leave=False, - disable=not accelerator.is_main_process, - ) - for batch in batch_iterator: - ( - phone, - phone_lengths, - pitch, - pitchf, - spec, - spec_lengths, - wave, - wave_lengths, - sid, - ) = batch - - # Generator - optimizer_G.zero_grad() - ( - y_hat, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - ) = G( - phone, - phone_lengths, - pitch, - pitchf, - spec, - spec_lengths, - sid, - ) - mel = spec_to_mel_torch( - spec, - filter_length, - n_mel_channels, - self.sr, - mel_fmin, - mel_fmax, - ) - y_mel = commons.slice_segments( - mel, ids_slice, segment_size // hop_length - ) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - filter_length, - n_mel_channels, - self.sr, - hop_length, - win_length, - mel_fmin, - mel_fmax, - ) - wave = commons.slice_segments( - wave, ids_slice * hop_length, segment_size - ) - - # Discriminator - optimizer_D.zero_grad() - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = D(wave, y_hat.detach()) - - # Update Discriminator - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) - accelerator.backward(loss_disc) - optimizer_D.step() - - # Re-compute discriminator output (since we just got a "better" discriminator) - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = D(wave, y_hat) - - # Update Generator - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_mel = F.l1_loss(y_mel, y_hat_mel) * c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * c_kl - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - accelerator.backward(loss_gen_all) - optimizer_G.step() - - 
prev_loss_gen = loss_gen.item() - prev_loss_fm = loss_fm.item() - prev_loss_mel = loss_mel.item() - prev_loss_kl = loss_kl.item() - prev_loss_disc = loss_disc.item() - prev_loss_gen_all = loss_gen_all.item() - - # Update progress bar with current losses - if accelerator.is_main_process: - batch_iterator.set_postfix( - { - "g_loss": f"{prev_loss_gen:.4f}", - "d_loss": f"{prev_loss_disc:.4f}", - "mel_loss": f"{prev_loss_mel:.4f}", - "total": f"{prev_loss_gen_all:.4f}", - } - ) - - epoch_loss_gen += prev_loss_gen - epoch_loss_fm += prev_loss_fm - epoch_loss_mel += prev_loss_mel - epoch_loss_kl += prev_loss_kl - epoch_loss_disc += prev_loss_disc - epoch_loss_gen_all += prev_loss_gen_all - num_batches += 1 - - scheduler_G.step() - scheduler_D.step() - - if accelerator.is_main_process and num_batches > 0: - avg_gen = epoch_loss_gen / num_batches - avg_disc = epoch_loss_disc / num_batches - avg_fm = epoch_loss_fm / num_batches - avg_mel = epoch_loss_mel / num_batches - avg_kl = epoch_loss_kl / num_batches - avg_total = epoch_loss_gen_all / num_batches - - logger.info( - f"Epoch {epoch} | " - f"Generator Loss: {avg_gen:.4f} | " - f"Discriminator Loss: {avg_disc:.4f} | " - f"Mel Loss: {avg_mel:.4f} | " - f"Total Loss: {avg_total:.4f}" - ) - - # Update epoch progress bar - epoch_iterator.set_postfix( - { - "g_loss": f"{avg_gen:.4f}", - "d_loss": f"{avg_disc:.4f}", - "total": f"{avg_total:.4f}", - } - ) - - self.writer.add_scalar("Loss/Generator", avg_gen, epoch) - self.writer.add_scalar("Loss/Feature_Matching", avg_fm, epoch) - self.writer.add_scalar("Loss/Mel", avg_mel, epoch) - self.writer.add_scalar("Loss/KL", avg_kl, epoch) - self.writer.add_scalar("Loss/Discriminator", avg_disc, epoch) - self.writer.add_scalar("Loss/Generator_Total", avg_total, epoch) - self.writer.add_scalar( - "Learning_Rate/Generator", - scheduler_G.get_last_lr()[0], - epoch, - ) - self.writer.add_scalar( - "Learning_Rate/Discriminator", - scheduler_D.get_last_lr()[0], - epoch, - ) - - if loader_test 
is not None: - with torch.no_grad(): - sample_idx = 0 - test_iterator = tqdm( - loader_test, - desc=f"Testing epoch {epoch}", - leave=False, - disable=not accelerator.is_main_process, - ) - for batch_idx, ( - phone, - phone_lengths, - pitch, - pitchf, - spec, - spec_lengths, - wave, - wave_lengths, - sid, - ) in enumerate(test_iterator): - # Generate audio for each sample in the batch - audio_segments = G.infer( - phone, phone_lengths, pitch, pitchf, sid - )[0] - - # Log each audio sample in the batch - for i, audio in enumerate(audio_segments): - audio_numpy = audio[0].data.cpu().float().numpy() - self.writer.add_audio( - f"Audio/{sample_idx}", - audio_numpy, - epoch, - sample_rate=self.sr, - ) - sample_idx += 1 - - res = TrainingCheckpoint( - epoch, - G, - D, - optimizer_G, - optimizer_D, - scheduler_G, - scheduler_D, - prev_loss_gen, - prev_loss_fm, - prev_loss_mel, - prev_loss_kl, - prev_loss_gen_all, - prev_loss_disc, - ) - - res.save(self.exp_dir) - G.save_pretrained(self.exp_dir) - - if upload_to_hub is not None: - if ( - time.time() - upload_state_last > 60 * upload_window_minutes - or epoch == epochs - ): - try: - self.push_to_hub(upload_to_hub) - upload_state_last = time.time() - except Exception: - logger.error(f"Failed to upload to Hub.", exc_info=1) - else: - next_upload = 60 * upload_window_minutes - ( - time.time() - upload_state_last - ) - logger.info( - f"Skipping upload to Hub (next upload in {next_upload:.0f} seconds)" - ) - - def train( - self, - resume_from: Tuple[str, str] | None = None, - accelerator: Accelerator | None = None, - batch_size=1, - epochs=100, - lr=1e-4, - lr_decay=0.999875, - betas: Tuple[float, float] = (0.8, 0.99), - eps=1e-9, - use_spectral_norm=False, - segment_size=17280, - filter_length=N_FFT, - hop_length=HOP_LENGTH, - inter_channels=192, - hidden_channels=192, - filter_channels=768, - n_heads=2, - n_layers=6, - kernel_size=3, - p_dropout=0.0, - resblock: Literal["1", "2"] = "1", - resblock_kernel_sizes: list[int] = [3, 
7, 11], - resblock_dilation_sizes: list[list[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_initial_channel=512, - upsample_rates: list[int] = [12, 10, 2, 2], - upsample_kernel_sizes: list[int] = [24, 20, 4, 4], - spk_embed_dim=109, - gin_channels=256, - n_mel_channels=N_MELS, - win_length=WIN_LENGTH, - mel_fmin=0.0, - mel_fmax: float | None = None, - c_mel=45, - c_kl=1.0, - upload_to_hub: str | None = None, - ): - if not os.path.exists(self.exp_dir): - os.makedirs(self.exp_dir) - - if accelerator is None: - accelerator = Accelerator() - - ( - G, - D, - optimizer_G, - optimizer_D, - scheduler_G, - scheduler_D, - finished_epoch, - ) = self.setup_models( - resume_from=resume_from or self.latest_checkpoint(), - accelerator=accelerator, - lr=lr, - lr_decay=lr_decay, - betas=betas, - eps=eps, - use_spectral_norm=use_spectral_norm, - segment_size=segment_size, - filter_length=filter_length, - hop_length=hop_length, - inter_channels=inter_channels, - hidden_channels=hidden_channels, - filter_channels=filter_channels, - n_heads=n_heads, - n_layers=n_layers, - kernel_size=kernel_size, - p_dropout=p_dropout, - resblock=resblock, - resblock_kernel_sizes=resblock_kernel_sizes, - resblock_dilation_sizes=resblock_dilation_sizes, - upsample_initial_channel=upsample_initial_channel, - upsample_rates=upsample_rates, - upsample_kernel_sizes=upsample_kernel_sizes, - spk_embed_dim=spk_embed_dim, - gin_channels=gin_channels, - ) - - loader_train = self.setup_dataloader( - self.dataset_train, - batch_size=batch_size, - accelerator=accelerator, - ) - - loader_test = ( - self.setup_dataloader( - self.dataset_test, - batch_size=batch_size, - accelerator=accelerator, - shuffle=False, - ) - if self.dataset_test is not None - else None - ) - - return self.run( - G, - D, - optimizer_G, - optimizer_D, - scheduler_G, - scheduler_D, - finished_epoch, - loader_train, - loader_test, - accelerator, - epochs=epochs, - segment_size=segment_size, - filter_length=filter_length, - 
hop_length=hop_length, - n_mel_channels=n_mel_channels, - win_length=win_length, - mel_fmin=mel_fmin, - mel_fmax=mel_fmax, - c_mel=c_mel, - c_kl=c_kl, - upload_to_hub=upload_to_hub, - ) - - def push_to_hub(self, repo: str, private: bool = True): - if not os.path.exists(self.exp_dir): - raise FileNotFoundError("exp_dir not found") - - api = HfApi() - repo_id = api.create_repo(repo_id=repo, private=private, exist_ok=True).repo_id - - return upload_folder( - repo_id=repo_id, - folder_path=self.exp_dir, - commit_message="Upload via ZeroRVC", - ) - - def __del__(self): - if hasattr(self, "writer"): - self.writer.close() diff --git a/zerorvc/utils/__init__.py b/zerorvc/utils/__init__.py deleted file mode 100644 index 0c1c353ce548d21eedfad46c8be289716d4bcccd..0000000000000000000000000000000000000000 --- a/zerorvc/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .data_utils import * -from .mel_processing import * -from .losses import * diff --git a/zerorvc/utils/data_utils.py b/zerorvc/utils/data_utils.py deleted file mode 100644 index e874262448c3108b92177040afc1a26995eb7d6f..0000000000000000000000000000000000000000 --- a/zerorvc/utils/data_utils.py +++ /dev/null @@ -1,85 +0,0 @@ -import logging - -import torch -import torch.utils.data - -logger = logging.getLogger(__name__) - - -class TextAudioCollateMultiNSFsid: - """Zero-pads model inputs and targets""" - - def __init__(self): - pass - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - device = batch[0]["spec"].device - - with device: - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.tensor([x["spec"].size(1) for x in batch], dtype=torch.long), - dim=0, - descending=True, - ) - - max_spec_len = max([x["spec"].size(1) for x in batch]) - max_wave_len = max([x["wav_gt"]["array"].size(0) for x in batch]) - spec_lengths = 
torch.zeros(len(batch), dtype=torch.long) - wave_lengths = torch.zeros(len(batch), dtype=torch.long) - spec_padded = torch.zeros( - len(batch), batch[0]["spec"].size(0), max_spec_len, dtype=torch.float32 - ) - wave_padded = torch.zeros(len(batch), 1, max_wave_len, dtype=torch.float32) - - max_phone_len = max([x["hubert_feats"].size(0) for x in batch]) - phone_lengths = torch.zeros(len(batch), dtype=torch.long) - phone_padded = torch.zeros( - len(batch), - max_phone_len, - batch[0]["hubert_feats"].shape[1], - dtype=torch.float32, - ) # (spec, wav, phone, pitch) - pitch_padded = torch.zeros(len(batch), max_phone_len, dtype=torch.long) - pitchf_padded = torch.zeros(len(batch), max_phone_len, dtype=torch.float32) - # dv = torch.FloatTensor(len(batch), 256)#gin=256 - sid = torch.zeros(len(batch), dtype=torch.long) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row["spec"] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row["wav_gt"]["array"] - wave_padded[i, :, : wave.size(0)] = wave - wave_lengths[i] = wave.size(0) - - phone = row["hubert_feats"] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - pitch = row["f0"] - pitch_padded[i, : pitch.size(0)] = pitch - pitchf = row["f0nsf"] - pitchf_padded[i, : pitchf.size(0)] = pitchf - - sid[i] = torch.tensor([0], dtype=torch.long) - - return ( - phone_padded, - phone_lengths, - pitch_padded, - pitchf_padded, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - sid, - )