Commit 883d44b by ArkanDash
1 parent: 31a8225

feat: major update

app.py CHANGED
@@ -17,7 +17,7 @@ import io
 import wave
 from datetime import datetime
 from fairseq import checkpoint_utils
-from infer_pack.models import (
+from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
     SynthesizerTrnMs768NSFsid,
@@ -29,14 +29,25 @@ config = Config()
 logging.getLogger("numba").setLevel(logging.WARNING)
 limitation = os.getenv("SYSTEM") == "spaces"

-def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
+audio_mode = []
+f0method_mode = []
+f0method_info = ""
+if limitation is True:
+    audio_mode = ["Upload audio", "TTS Audio"]
+    f0method_mode = ["pm", "harvest"]
+    f0method_info = "PM is fast, Harvest is good but extremely slow. (Default: PM)"
+else:
+    audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
+    f0method_mode = ["pm", "harvest", "crepe"]
+    f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
+
+def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
     def vc_fn(
         vc_audio_mode,
         vc_input,
         vc_upload,
         tts_text,
         tts_voice,
-        spk_item,
         f0_up_key,
         f0_method,
         index_rate,
@@ -73,13 +84,14 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             audio_opt = vc.pipeline(
                 hubert_model,
                 net_g,
-                spk_item,
+                0,
                 audio,
                 vc_input,
                 times,
                 f0_up_key,
                 f0_method,
                 file_index,
+                # file_big_npy,
                 index_rate,
                 if_f0,
                 filter_radius,
@@ -91,7 +103,7 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
                 f0_file=None,
             )
             info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-            print(info)
+            print(f"{model_title} | {info}")
             return info, (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
@@ -99,6 +111,57 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             return info, (None, None)
     return vc_fn

+def load_model():
+    categories = []
+    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
+        folder_info = json.load(f)
+    for category_name, category_info in folder_info.items():
+        if not category_info['enable']:
+            continue
+        category_title = category_info['title']
+        category_folder = category_info['folder_path']
+        description = category_info['description']
+        models = []
+        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
+            models_info = json.load(f)
+        for character_name, info in models_info.items():
+            if not info['enable']:
+                continue
+            model_title = info['title']
+            model_name = info['model_path']
+            model_author = info.get("author", None)
+            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
+            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+            tgt_sr = cpt["config"][-1]
+            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+            if_f0 = cpt.get("f0", 1)
+            version = cpt.get("version", "v1")
+            if version == "v1":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+                model_version = "V1"
+            elif version == "v2":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+                model_version = "V2"
+            del net_g.enc_q
+            print(net_g.load_state_dict(cpt["weight"], strict=False))
+            net_g.eval().to(config.device)
+            if config.is_half:
+                net_g = net_g.half()
+            else:
+                net_g = net_g.float()
+            vc = VC(tgt_sr, config)
+            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
+        categories.append([category_title, category_folder, description, models])
+    return categories
+
 def cut_vocal_and_inst(url, audio_provider, split_model):
     if url != "":
         if not os.path.exists("dl_audio"):
@@ -275,61 +338,15 @@ def change_audio_mode(vc_audio_mode):

 if __name__ == '__main__':
     load_hubert()
-    categories = []
+    categories = load_model()
     tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
-    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
-        folder_info = json.load(f)
-    for category_name, category_info in folder_info.items():
-        if not category_info['enable']:
-            continue
-        category_title = category_info['title']
-        category_folder = category_info['folder_path']
-        description = category_info['description']
-        models = []
-        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
-            models_info = json.load(f)
-        for model_name, info in models_info.items():
-            if not info['enable']:
-                continue
-            model_title = info['title']
-            model_author = info.get("author", None)
-            model_cover = f"weights/{category_folder}/{model_name}/{info['cover']}"
-            model_index = f"weights/{category_folder}/{model_name}/{info['feature_retrieval_library']}"
-            cpt = torch.load(f"weights/{category_folder}/{model_name}/{model_name}.pth", map_location="cpu")
-            tgt_sr = cpt["config"][-1]
-            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-            if_f0 = cpt.get("f0", 1)
-            version = cpt.get("version", "v1")
-            if version == "v1":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-                nodel_version = "V1"
-            elif version == "v2":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-                nodel_version = "V2"
-            del net_g.enc_q
-            print(net_g.load_state_dict(cpt["weight"], strict=False))
-            net_g.eval().to(config.device)
-            if config.is_half:
-                net_g = net_g.half()
-            else:
-                net_g = net_g.float()
-            vc = VC(tgt_sr, config)
-            print(f"Model loaded: {model_name}")
-            models.append((model_name, model_title, model_author, model_cover, nodel_version, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
-        categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
             "# <center> RVC Genshin Impact Inference\n"
             "### <center> [Recommended to use Google Colab to use more character & more feature](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n"
             "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
-            "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
+            "### [This spaces use Multi Model RVC Inference](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
         )
         for (folder_title, folder, description, models) in categories:
             with gr.TabItem(folder_title):
@@ -353,7 +370,7 @@ if __name__ == '__main__':
                     )
                     with gr.Row():
                         with gr.Column():
-                            vc_audio_mode = gr.Dropdown(label="Input voice", choices=["Upload audio", "TTS Audio"], allow_custom_value=False, value="Upload audio")
+                            vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
                             # Input and Upload
                             vc_input = gr.Textbox(label="Input audio path", visible=False)
                             vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
@@ -369,22 +386,13 @@ if __name__ == '__main__':
                             tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                         with gr.Column():
-                            spk_item = gr.Slider(
-                                minimum=0,
-                                maximum=2333,
-                                step=1,
-                                label="Speaker ID",
-                                info="(Default: 0)",
-                                value=0,
-                                interactive=True,
-                            )
                             vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                             f0method0 = gr.Radio(
                                 label="Pitch extraction algorithm",
-                                info="PM is fast, Harvest is good but extremely slow (Default: PM)",
-                                choices=["pm", "harvest"],
+                                info=f0method_info,
+                                choices=f0method_mode,
                                 value="pm",
-                                interactive=True,
+                                interactive=True
                             )
                             index_rate1 = gr.Slider(
                                 minimum=0,
@@ -425,7 +433,16 @@ if __name__ == '__main__':
                                 maximum=0.5,
                                 label="Voice Protection",
                                 info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
-                                value=0.35,
+                                value=0.4,
+                                step=0.01,
+                                interactive=True,
+                            )
+                            protect0 = gr.Slider(
+                                minimum=0,
+                                maximum=0.5,
+                                label="Voice Protection",
+                                info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
+                                value=0.5,
                                 step=0.01,
                                 interactive=True,
                             )
@@ -453,7 +470,6 @@ if __name__ == '__main__':
                                     vc_upload,
                                     tts_text,
                                     tts_voice,
-                                    spk_item,
                                     vc_transform0,
                                     f0method0,
                                     index_rate1,
config.py DELETED
@@ -1,105 +0,0 @@
-import argparse
-import torch
-from multiprocessing import cpu_count
-
-class Config:
-    def __init__(self):
-        self.device = "cuda:0"
-        self.is_half = True
-        self.n_cpu = 0
-        self.gpu_name = None
-        self.gpu_mem = None
-        (
-            self.python_cmd,
-            self.listen_port,
-            self.colab,
-            self.noparallel,
-            self.noautoopen,
-            self.api
-        ) = self.arg_parse()
-        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-
-    @staticmethod
-    def arg_parse() -> tuple:
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--port", type=int, default=7865, help="Listen port")
-        parser.add_argument(
-            "--pycmd", type=str, default="python", help="Python command"
-        )
-        parser.add_argument("--colab", action="store_true", help="Launch in colab")
-        parser.add_argument(
-            "--noparallel", action="store_true", help="Disable parallel processing"
-        )
-        parser.add_argument(
-            "--noautoopen",
-            action="store_true",
-            help="Do not open in browser automatically",
-        )
-        parser.add_argument("--api", action="store_true", help="Launch with api")
-        cmd_opts = parser.parse_args()
-
-        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
-
-        return (
-            cmd_opts.pycmd,
-            cmd_opts.port,
-            cmd_opts.colab,
-            cmd_opts.noparallel,
-            cmd_opts.noautoopen,
-            cmd_opts.api
-        )
-
-    def device_config(self) -> tuple:
-        if torch.cuda.is_available():
-            i_device = int(self.device.split(":")[-1])
-            self.gpu_name = torch.cuda.get_device_name(i_device)
-            if (
-                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
-                or "P40" in self.gpu_name.upper()
-                or "1060" in self.gpu_name
-                or "1070" in self.gpu_name
-                or "1080" in self.gpu_name
-            ):
-                print("16-series/10-series GPUs and the P40 are forced to single precision")
-                self.is_half = False
-            else:
-                self.gpu_name = None
-            self.gpu_mem = int(
-                torch.cuda.get_device_properties(i_device).total_memory
-                / 1024
-                / 1024
-                / 1024
-                + 0.4
-            )
-        elif torch.backends.mps.is_available():
-            print("No supported NVIDIA GPU found, using MPS for inference")
-            self.device = "mps"
-            self.is_half = False
-        else:
-            print("No supported NVIDIA GPU found, using CPU for inference")
-            self.device = "cpu"
-            self.is_half = False
-
-        if self.n_cpu == 0:
-            self.n_cpu = cpu_count()
-
-        if self.is_half:
-            # 6 GB VRAM configuration
-            x_pad = 3
-            x_query = 10
-            x_center = 60
-            x_max = 65
-        else:
-            # 5 GB VRAM configuration
-            x_pad = 1
-            x_query = 6
-            x_center = 38
-            x_max = 41
-
-        if self.gpu_mem != None and self.gpu_mem <= 4:
-            x_pad = 1
-            x_query = 5
-            x_center = 30
-            x_max = 32
-
-        return x_pad, x_query, x_center, x_max
configs/32k.json DELETED
@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 12800,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 32000,
-    "filter_length": 1024,
-    "hop_length": 320,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,4,2,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
configs/40k.json DELETED
@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 12800,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 40000,
-    "filter_length": 2048,
-    "hop_length": 400,
-    "win_length": 2048,
-    "n_mel_channels": 125,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,10,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
configs/48k.json DELETED
@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 11520,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 48000,
-    "filter_length": 2048,
-    "hop_length": 480,
-    "win_length": 2048,
-    "n_mel_channels": 128,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,6,2,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
infer_pack/models_onnx_moess.py DELETED
@@ -1,849 +0,0 @@
-import math, pdb, os
-from time import time as ttime
-import torch
-from torch import nn
-from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
-import numpy as np
-from infer_pack import commons
-
-
-class TextEncoder256(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        f0=True,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(256, hidden_channels)
-        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if f0 == True:
-            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-        self.encoder = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, phone, pitch, lengths):
-        if pitch == None:
-            x = self.emb_phone(phone)
-        else:
-            x = self.emb_phone(phone) + self.emb_pitch(pitch)
-        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-        x = self.lrelu(x)
-        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.encoder(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        return m, logs, x_mask
-
-
-class TextEncoder256Sim(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        f0=True,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(256, hidden_channels)
-        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if f0 == True:
-            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-        self.encoder = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-
-    def forward(self, phone, pitch, lengths):
-        if pitch == None:
-            x = self.emb_phone(phone)
-        else:
-            x = self.emb_phone(phone) + self.emb_pitch(pitch)
-        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-        x = self.lrelu(x)
-        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.encoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x, x_mask
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(
-        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        n_flows=4,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(
-                    channels,
-                    hidden_channels,
-                    kernel_size,
-                    dilation_rate,
-                    n_layers,
-                    gin_channels=gin_channels,
-                    mean_only=True,
-                )
-            )
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-    def remove_weight_norm(self):
-        for i in range(self.n_flows):
-            self.flows[i * 2].remove_weight_norm()
-
-
-class PosteriorEncoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(
-            hidden_channels,
-            kernel_size,
-            dilation_rate,
-            n_layers,
-            gin_channels=gin_channels,
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-    def remove_weight_norm(self):
-        self.enc.remove_weight_norm()
-
-
-class Generator(torch.nn.Module):
-    def __init__(
-        self,
-        initial_channel,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels=0,
-    ):
-        super(Generator, self).__init__()
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        self.conv_pre = Conv1d(
-            initial_channel, upsample_initial_channel, 7, 1, padding=3
-        )
-        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(ch, k, d))
-
-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-        self.ups.apply(init_weights)
-
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-    def forward(self, x, g=None):
-        x = self.conv_pre(x)
-        if g is not None:
-            x = x + self.cond(g)
-
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-
-
-class SineGen(torch.nn.Module):
-    """Definition of sine generator
-    SineGen(samp_rate, harmonic_num = 0,
-            sine_amp = 0.1, noise_std = 0.003,
-            voiced_threshold = 0,
-            flag_for_pulse=False)
-    samp_rate: sampling rate in Hz
-    harmonic_num: number of harmonic overtones (default 0)
-    sine_amp: amplitude of sine-wavefrom (default 0.1)
-    noise_std: std of Gaussian noise (default 0.003)
-    voiced_thoreshold: F0 threshold for U/V classification (default 0)
-    flag_for_pulse: this SinGen is used inside PulseGen (default False)
-    Note: when flag_for_pulse is True, the first time step of a voiced
-    segment is always sin(np.pi) or cos(0)
-    """
-
-    def __init__(
-        self,
-        samp_rate,
-        harmonic_num=0,
-        sine_amp=0.1,
-        noise_std=0.003,
-        voiced_threshold=0,
-        flag_for_pulse=False,
-    ):
-        super(SineGen, self).__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
-        self.harmonic_num = harmonic_num
-        self.dim = self.harmonic_num + 1
-        self.sampling_rate = samp_rate
-        self.voiced_threshold = voiced_threshold
-
-    def _f02uv(self, f0):
-        # generate uv signal
-        uv = torch.ones_like(f0)
-        uv = uv * (f0 > self.voiced_threshold)
-        return uv
-
-    def forward(self, f0, upp):
-        """sine_tensor, uv = forward(f0)
-        input F0: tensor(batchsize=1, length, dim=1)
-        f0 for unvoiced steps should be 0
-        output sine_tensor: tensor(batchsize=1, length, dim)
-        output uv: tensor(batchsize=1, length, 1)
-        """
-        with torch.no_grad():
-            f0 = f0[:, None].transpose(1, 2)
-            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
-            # fundamental component
-            f0_buf[:, :, 0] = f0[:, :, 0]
-            for idx in np.arange(self.harmonic_num):
-                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
-                    idx + 2
-                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
-            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away in post-processing
-            rand_ini = torch.rand(
-                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
-            )
-            rand_ini[:, 0] = 0
-            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
-            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # a % 1 here would make the following cumsum impossible to optimize
-            tmp_over_one *= upp
-            tmp_over_one = F.interpolate(
-                tmp_over_one.transpose(2, 1),
-                scale_factor=upp,
-                mode="linear",
-                align_corners=True,
-            ).transpose(2, 1)
-            rad_values = F.interpolate(
-                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
-            ).transpose(
-                2, 1
-            )  #######
-            tmp_over_one %= 1
-            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
-            cumsum_shift = torch.zeros_like(rad_values)
-            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-            sine_waves = torch.sin(
-                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
-            )
-            sine_waves = sine_waves * self.sine_amp
-            uv = self._f02uv(f0)
-            uv = F.interpolate(
-                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
-            ).transpose(2, 1)
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
-
-
-class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-
-    def __init__(
-        self,
-        sampling_rate,
-        harmonic_num=0,
-        sine_amp=0.1,
-        add_noise_std=0.003,
-        voiced_threshod=0,
-        is_half=True,
-    ):
-        super(SourceModuleHnNSF, self).__init__()
-
-        self.sine_amp = sine_amp
-        self.noise_std = add_noise_std
-        self.is_half = is_half
-        # to produce sine waveforms
-        self.l_sin_gen = SineGen(
-            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
-        )
-
-        # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = torch.nn.Tanh()
-
-    def forward(self, x, upp=None):
-        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
-        if self.is_half:
-            sine_wavs = sine_wavs.half()
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-        return sine_merge, None, None  # noise, uv
-
-
-class GeneratorNSF(torch.nn.Module):
-    def __init__(
-        self,
-        initial_channel,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels,
-        sr,
-        is_half=False,
-    ):
-        super(GeneratorNSF, self).__init__()
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-
-        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
-        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sr, harmonic_num=0, is_half=is_half
-        )
-        self.noise_convs = nn.ModuleList()
-        self.conv_pre = Conv1d(
-            initial_channel, upsample_initial_channel, 7, 1, padding=3
-        )
-        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-            if i + 1 < len(upsample_rates):
-                stride_f0 = np.prod(upsample_rates[i + 1 :])
-                self.noise_convs.append(
-                    Conv1d(
-                        1,
-                        c_cur,
-                        kernel_size=stride_f0 * 2,
-                        stride=stride_f0,
-                        padding=stride_f0 // 2,
-                    )
-                )
-            else:
-                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(ch, k, d))
-
-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-        self.ups.apply(init_weights)
-
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-        self.upp = np.prod(upsample_rates)
-
-    def forward(self, x, f0, g=None):
-        har_source, noi_source, uv = self.m_source(f0, self.upp)
-        har_source = har_source.transpose(1, 2)
-        x = self.conv_pre(x)
-        if g is not None:
-            x = x + self.cond(g)
-
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            x = self.ups[i](x)
-            x_source = self.noise_convs[i](har_source)
-            x = x + x_source
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-
-
-sr2sr = {
-    "32k": 32000,
-    "40k": 40000,
-    "48k": 48000,
-}
-
-
-class SynthesizerTrnMs256NSFsidM(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        sr,
-        **kwargs
-    ):
-        super().__init__()
-        if type(sr) == type("strr"):
-            sr = sr2sr[sr]
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length#
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            sr=sr,
-            is_half=kwargs["is_half"],
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
-        return o
-
-
-class SynthesizerTrnMs256NSFsid_sim(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        # hop_length,
-        gin_channels=0,
-        use_sdp=True,
-        **kwargs
-    ):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length#
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256Sim(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            is_half=kwargs["is_half"],
-        )
-
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def forward(
-        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
-    ):  # y (the spec) is no longer needed now
-        g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]  # the 1 is t, broadcast
-        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        x = self.flow(x, x_mask, g=g, reverse=True)
-        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
-        return o
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2, 3, 5, 7, 11, 17]
-        # periods = [3, 5, 7, 11, 17, 23, 37]
-
-        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-        discs = discs + [
-            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-        ]
-        self.discriminators = nn.ModuleList(discs)
-
-    def forward(self, y, y_hat):
-        y_d_rs = []  #
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            # for j in range(len(fmap_r)):
-            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
-            y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
-            fmap_rs.append(fmap_r)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        1024,
-                        1024,
-                        (kernel_size, 1),
-                        1,
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-            ]
-        )
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
{infer_pack → lib/infer_pack}/attentions.py RENAMED
@@ -5,9 +5,9 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from infer_pack import commons
-from infer_pack import modules
-from infer_pack.modules import LayerNorm
+from lib.infer_pack import commons
+from lib.infer_pack import modules
+from lib.infer_pack.modules import LayerNorm


 class Encoder(nn.Module):
{infer_pack → lib/infer_pack}/commons.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/models.py RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
 import torch
 from torch import nn
 from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
+from lib.infer_pack import modules
+from lib.infer_pack import attentions
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
+from lib.infer_pack.commons import init_weights
 import numpy as np
-from infer_pack import commons
+from lib.infer_pack import commons


 class TextEncoder256(nn.Module):
{infer_pack → lib/infer_pack}/models_onnx.py RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
 import torch
 from torch import nn
 from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
+from lib.infer_pack import modules
+from lib.infer_pack import attentions
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
+from lib.infer_pack.commons import init_weights
 import numpy as np
-from infer_pack import commons
+from lib.infer_pack import commons


 class TextEncoder256(nn.Module):
{infer_pack → lib/infer_pack}/modules.py RENAMED
@@ -9,9 +9,9 @@ from torch.nn import functional as F
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm

-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
-from infer_pack.transforms import piecewise_rational_quadratic_transform
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
+from lib.infer_pack.transforms import piecewise_rational_quadratic_transform


 LRELU_SLOPE = 0.1
{infer_pack → lib/infer_pack}/modules/F0Predictor/DioF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import pyworld
 import numpy as np

{infer_pack → lib/infer_pack}/modules/F0Predictor/F0Predictor.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/modules/F0Predictor/HarvestF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import pyworld
 import numpy as np

{infer_pack → lib/infer_pack}/modules/F0Predictor/PMF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import parselmouth
 import numpy as np

{infer_pack → lib/infer_pack}/modules/F0Predictor/__init__.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/onnx_inference.py RENAMED
@@ -3,6 +3,7 @@ import librosa
 import numpy as np
 import soundfile

+
 class ContentVec:
     def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
         print("load model(s) from {}".format(vec_path))
@@ -32,19 +33,19 @@ class ContentVec:

 def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
     if f0_predictor == "pm":
-        from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
+        from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor

         f0_predictor_object = PMF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
         )
     elif f0_predictor == "harvest":
-        from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
+        from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor

         f0_predictor_object = HarvestF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
         )
     elif f0_predictor == "dio":
-        from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
+        from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

         f0_predictor_object = DioF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
{infer_pack → lib/infer_pack}/transforms.py RENAMED
File without changes
requirements.txt CHANGED
@@ -1,26 +1,21 @@
-setuptools
 wheel
-httpx==0.23.0
-fairseq==0.12.2
-gradio
+setuptools
 ffmpeg
-praat-parselmouth
-pyworld
-numpy==1.23.5
 numba==0.56.4
-librosa==0.9.2
-faiss-cpu==1.7.3
-faiss-gpu
+numpy==1.23.5
 scipy==1.9.3
+librosa==0.9.1
+fairseq==0.12.2
+faiss-cpu==1.7.3
+gradio==3.34.0
 pyworld>=0.3.2
+soundfile>=0.12.1
+praat-parselmouth>=0.4.2
+httpx==0.23.0
 tensorboard
 tensorboardX
-onnxruntime
-pyngrok==4.1.12
-soundfile>=0.12.1
-tqdm>=4.63.1
 torchcrepe
-asyncio
-edge-tts
+onnxruntime
 demucs
-yt_dlp
+edge-tts
+yt_dlp
vc_infer_pipeline.py CHANGED
@@ -184,7 +184,7 @@ class VC(object):
         with torch.no_grad():
             logits = model.extract_features(**inputs)
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             feats0 = feats.clone()
         if (
             isinstance(index, type(None)) == False
@@ -211,7 +211,7 @@ class VC(object):
             )

         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                 0, 2, 1
             )
@@ -223,7 +223,7 @@ class VC(object):
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]

-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
             pitchff[pitchf < 1] = protect