kiramayatu committed on
Commit
67b73a1
1 Parent(s): e4a4487

Upload 16 files

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/VITS_voice_conversion.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="Python 3.7 (VITS)" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="132">
8
+ <item index="0" class="java.lang.String" itemvalue="ccxt" />
9
+ <item index="1" class="java.lang.String" itemvalue="lz4" />
10
+ <item index="2" class="java.lang.String" itemvalue="pre-commit" />
11
+ <item index="3" class="java.lang.String" itemvalue="elegantrl" />
12
+ <item index="4" class="java.lang.String" itemvalue="setuptools" />
13
+ <item index="5" class="java.lang.String" itemvalue="ray" />
14
+ <item index="6" class="java.lang.String" itemvalue="gputil" />
15
+ <item index="7" class="java.lang.String" itemvalue="google-pasta" />
16
+ <item index="8" class="java.lang.String" itemvalue="tensorflow-estimator" />
17
+ <item index="9" class="java.lang.String" itemvalue="scikit-learn" />
18
+ <item index="10" class="java.lang.String" itemvalue="tabulate" />
19
+ <item index="11" class="java.lang.String" itemvalue="multitasking" />
20
+ <item index="12" class="java.lang.String" itemvalue="pickleshare" />
21
+ <item index="13" class="java.lang.String" itemvalue="pyasn1-modules" />
22
+ <item index="14" class="java.lang.String" itemvalue="ipython-genutils" />
23
+ <item index="15" class="java.lang.String" itemvalue="Pygments" />
24
+ <item index="16" class="java.lang.String" itemvalue="mccabe" />
25
+ <item index="17" class="java.lang.String" itemvalue="astunparse" />
26
+ <item index="18" class="java.lang.String" itemvalue="lxml" />
27
+ <item index="19" class="java.lang.String" itemvalue="Werkzeug" />
28
+ <item index="20" class="java.lang.String" itemvalue="tensorboard-data-server" />
29
+ <item index="21" class="java.lang.String" itemvalue="jupyter-client" />
30
+ <item index="22" class="java.lang.String" itemvalue="pexpect" />
31
+ <item index="23" class="java.lang.String" itemvalue="click" />
32
+ <item index="24" class="java.lang.String" itemvalue="ipykernel" />
33
+ <item index="25" class="java.lang.String" itemvalue="pandas-datareader" />
34
+ <item index="26" class="java.lang.String" itemvalue="psutil" />
35
+ <item index="27" class="java.lang.String" itemvalue="jedi" />
36
+ <item index="28" class="java.lang.String" itemvalue="regex" />
37
+ <item index="29" class="java.lang.String" itemvalue="tensorboard" />
38
+ <item index="30" class="java.lang.String" itemvalue="platformdirs" />
39
+ <item index="31" class="java.lang.String" itemvalue="matplotlib" />
40
+ <item index="32" class="java.lang.String" itemvalue="idna" />
41
+ <item index="33" class="java.lang.String" itemvalue="rsa" />
42
+ <item index="34" class="java.lang.String" itemvalue="decorator" />
43
+ <item index="35" class="java.lang.String" itemvalue="numpy" />
44
+ <item index="36" class="java.lang.String" itemvalue="pyasn1" />
45
+ <item index="37" class="java.lang.String" itemvalue="requests" />
46
+ <item index="38" class="java.lang.String" itemvalue="tensorflow" />
47
+ <item index="39" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
48
+ <item index="40" class="java.lang.String" itemvalue="Deprecated" />
49
+ <item index="41" class="java.lang.String" itemvalue="nest-asyncio" />
50
+ <item index="42" class="java.lang.String" itemvalue="prompt-toolkit" />
51
+ <item index="43" class="java.lang.String" itemvalue="keras-tuner" />
52
+ <item index="44" class="java.lang.String" itemvalue="scipy" />
53
+ <item index="45" class="java.lang.String" itemvalue="dataclasses" />
54
+ <item index="46" class="java.lang.String" itemvalue="tornado" />
55
+ <item index="47" class="java.lang.String" itemvalue="google-auth-oauthlib" />
56
+ <item index="48" class="java.lang.String" itemvalue="black" />
57
+ <item index="49" class="java.lang.String" itemvalue="toml" />
58
+ <item index="50" class="java.lang.String" itemvalue="Quandl" />
59
+ <item index="51" class="java.lang.String" itemvalue="pandas" />
60
+ <item index="52" class="java.lang.String" itemvalue="termcolor" />
61
+ <item index="53" class="java.lang.String" itemvalue="pylint" />
62
+ <item index="54" class="java.lang.String" itemvalue="typing_extensions" />
63
+ <item index="55" class="java.lang.String" itemvalue="cachetools" />
64
+ <item index="56" class="java.lang.String" itemvalue="debugpy" />
65
+ <item index="57" class="java.lang.String" itemvalue="isort" />
66
+ <item index="58" class="java.lang.String" itemvalue="pytz" />
67
+ <item index="59" class="java.lang.String" itemvalue="inflection" />
68
+ <item index="60" class="java.lang.String" itemvalue="Pillow" />
69
+ <item index="61" class="java.lang.String" itemvalue="traitlets" />
70
+ <item index="62" class="java.lang.String" itemvalue="absl-py" />
71
+ <item index="63" class="java.lang.String" itemvalue="protobuf" />
72
+ <item index="64" class="java.lang.String" itemvalue="joblib" />
73
+ <item index="65" class="java.lang.String" itemvalue="threadpoolctl" />
74
+ <item index="66" class="java.lang.String" itemvalue="opt-einsum" />
75
+ <item index="67" class="java.lang.String" itemvalue="python-dateutil" />
76
+ <item index="68" class="java.lang.String" itemvalue="gpflow" />
77
+ <item index="69" class="java.lang.String" itemvalue="astroid" />
78
+ <item index="70" class="java.lang.String" itemvalue="cycler" />
79
+ <item index="71" class="java.lang.String" itemvalue="gast" />
80
+ <item index="72" class="java.lang.String" itemvalue="kt-legacy" />
81
+ <item index="73" class="java.lang.String" itemvalue="appdirs" />
82
+ <item index="74" class="java.lang.String" itemvalue="tensorflow-probability" />
83
+ <item index="75" class="java.lang.String" itemvalue="pip" />
84
+ <item index="76" class="java.lang.String" itemvalue="pyzmq" />
85
+ <item index="77" class="java.lang.String" itemvalue="certifi" />
86
+ <item index="78" class="java.lang.String" itemvalue="oauthlib" />
87
+ <item index="79" class="java.lang.String" itemvalue="pyparsing" />
88
+ <item index="80" class="java.lang.String" itemvalue="Markdown" />
89
+ <item index="81" class="java.lang.String" itemvalue="h5py" />
90
+ <item index="82" class="java.lang.String" itemvalue="wrapt" />
91
+ <item index="83" class="java.lang.String" itemvalue="kiwisolver" />
92
+ <item index="84" class="java.lang.String" itemvalue="empyrical" />
93
+ <item index="85" class="java.lang.String" itemvalue="backcall" />
94
+ <item index="86" class="java.lang.String" itemvalue="charset-normalizer" />
95
+ <item index="87" class="java.lang.String" itemvalue="multipledispatch" />
96
+ <item index="88" class="java.lang.String" itemvalue="pathspec" />
97
+ <item index="89" class="java.lang.String" itemvalue="jupyter-core" />
98
+ <item index="90" class="java.lang.String" itemvalue="matplotlib-inline" />
99
+ <item index="91" class="java.lang.String" itemvalue="ptyprocess" />
100
+ <item index="92" class="java.lang.String" itemvalue="more-itertools" />
101
+ <item index="93" class="java.lang.String" itemvalue="mypy-extensions" />
102
+ <item index="94" class="java.lang.String" itemvalue="cloudpickle" />
103
+ <item index="95" class="java.lang.String" itemvalue="wcwidth" />
104
+ <item index="96" class="java.lang.String" itemvalue="requests-oauthlib" />
105
+ <item index="97" class="java.lang.String" itemvalue="Keras-Preprocessing" />
106
+ <item index="98" class="java.lang.String" itemvalue="yfinance" />
107
+ <item index="99" class="java.lang.String" itemvalue="tomli" />
108
+ <item index="100" class="java.lang.String" itemvalue="urllib3" />
109
+ <item index="101" class="java.lang.String" itemvalue="six" />
110
+ <item index="102" class="java.lang.String" itemvalue="parso" />
111
+ <item index="103" class="java.lang.String" itemvalue="wheel" />
112
+ <item index="104" class="java.lang.String" itemvalue="ipython" />
113
+ <item index="105" class="java.lang.String" itemvalue="packaging" />
114
+ <item index="106" class="java.lang.String" itemvalue="lazy-object-proxy" />
115
+ <item index="107" class="java.lang.String" itemvalue="grpcio" />
116
+ <item index="108" class="java.lang.String" itemvalue="dm-tree" />
117
+ <item index="109" class="java.lang.String" itemvalue="google-auth" />
118
+ <item index="110" class="java.lang.String" itemvalue="seaborn" />
119
+ <item index="111" class="java.lang.String" itemvalue="thop" />
120
+ <item index="112" class="java.lang.String" itemvalue="torch" />
121
+ <item index="113" class="java.lang.String" itemvalue="torchvision" />
122
+ <item index="114" class="java.lang.String" itemvalue="d2l" />
123
+ <item index="115" class="java.lang.String" itemvalue="keyboard" />
124
+ <item index="116" class="java.lang.String" itemvalue="transformers" />
125
+ <item index="117" class="java.lang.String" itemvalue="phonemizer" />
126
+ <item index="118" class="java.lang.String" itemvalue="Unidecode" />
127
+ <item index="119" class="java.lang.String" itemvalue="nltk" />
128
+ <item index="120" class="java.lang.String" itemvalue="pinecone-client" />
129
+ <item index="121" class="java.lang.String" itemvalue="sentence-transformers" />
130
+ <item index="122" class="java.lang.String" itemvalue="whisper" />
131
+ <item index="123" class="java.lang.String" itemvalue="datasets" />
132
+ <item index="124" class="java.lang.String" itemvalue="pyaudio" />
133
+ <item index="125" class="java.lang.String" itemvalue="torchsummary" />
134
+ <item index="126" class="java.lang.String" itemvalue="openjtalk" />
135
+ <item index="127" class="java.lang.String" itemvalue="hydra-core" />
136
+ <item index="128" class="java.lang.String" itemvalue="museval" />
137
+ <item index="129" class="java.lang.String" itemvalue="mypy" />
138
+ <item index="130" class="java.lang.String" itemvalue="hydra-colorlog" />
139
+ <item index="131" class="java.lang.String" itemvalue="flake8" />
140
+ </list>
141
+ </value>
142
+ </option>
143
+ </inspection_tool>
144
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
145
+ <option name="ignoredIdentifiers">
146
+ <list>
147
+ <option value="sentiment_classification.model_predictions.audio_path" />
148
+ <option value="sentiment_classification.model_predictions.sample_rate" />
149
+ <option value="sentiment_classification.model_predictions.num_samples" />
150
+ </list>
151
+ </option>
152
+ </inspection_tool>
153
+ </profile>
154
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (VITS)" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" filepath="$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
configs/modified_finetune_speaker.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 10,
4
+ "eval_interval": 100,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "final_annotation_train.txt",
24
+ "validation_files": "final_annotation_val.txt",
25
+ "text_cleaners": [
26
+ "chinese_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 22050,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 2,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256
87
+ },
88
+ "symbols": [
89
+ "_",
90
+ "\uff1b",
91
+ "\uff1a",
92
+ "\uff0c",
93
+ "\u3002",
94
+ "\uff01",
95
+ "\uff1f",
96
+ "-",
97
+ "\u201c",
98
+ "\u201d",
99
+ "\u300a",
100
+ "\u300b",
101
+ "\u3001",
102
+ "\uff08",
103
+ "\uff09",
104
+ "\u2026",
105
+ "\u2014",
106
+ " ",
107
+ "A",
108
+ "B",
109
+ "C",
110
+ "D",
111
+ "E",
112
+ "F",
113
+ "G",
114
+ "H",
115
+ "I",
116
+ "J",
117
+ "K",
118
+ "L",
119
+ "M",
120
+ "N",
121
+ "O",
122
+ "P",
123
+ "Q",
124
+ "R",
125
+ "S",
126
+ "T",
127
+ "U",
128
+ "V",
129
+ "W",
130
+ "X",
131
+ "Y",
132
+ "Z",
133
+ "a",
134
+ "b",
135
+ "c",
136
+ "d",
137
+ "e",
138
+ "f",
139
+ "g",
140
+ "h",
141
+ "i",
142
+ "j",
143
+ "k",
144
+ "l",
145
+ "m",
146
+ "n",
147
+ "o",
148
+ "p",
149
+ "q",
150
+ "r",
151
+ "s",
152
+ "t",
153
+ "u",
154
+ "v",
155
+ "w",
156
+ "x",
157
+ "y",
158
+ "z",
159
+ "1",
160
+ "2",
161
+ "3",
162
+ "4",
163
+ "5",
164
+ "0",
165
+ "\uff22",
166
+ "\uff30"
167
+ ],
168
+ "speakers": {
169
+ "dingzhen": 0,
170
+ "taffy": 1
171
+ }
172
+ }
configs/uma_trilingual.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 16,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.train.txt.cleaned",
21
+ "validation_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.val.txt.cleaned",
22
+ "text_cleaners":["cjke_cleaners2"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 999,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
54
+ }
inference/G_latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44f9141fcac34c950376594d08a288d9159a32d6add851155b6fd0ecee242419
3
+ size 158887401
inference/ONNXVITS_inference.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging

logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
import ONNXVITS_infer
from text import text_to_sequence


def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids.

    Blanks (id 0) are interspersed between symbols when the hparams ask for it.
    """
    sequence = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)


# Load hyper-parameters, build the synthesizer, and restore the checkpoint.
hps = utils.get_hparams_from_file("../vits/pretrained_models/uma87.json")

net_g = ONNXVITS_infer.SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
net_g.eval()

utils.load_checkpoint("../vits/pretrained_models/uma_1153000.pth", net_g)

# Synthesize one fixed Japanese sentence for speaker 0 and dump the waveform.
stn_tst = get_text("おはようございます。", hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    sid = torch.LongTensor([0])
    audio = net_g.infer(
        x_tst, x_tst_lengths, sid=sid,
        noise_scale=.667, noise_scale_w=0.8, length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()
print(audio)
inference/VC_inference.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from torch import no_grad, LongTensor
5
+ import argparse
6
+ import commons
7
+ from mel_processing import spectrogram_torch
8
+ import utils
9
+ from models import SynthesizerTrn
10
+ import gradio as gr
11
+ import librosa
12
+ import webbrowser
13
+
14
+ from text import text_to_sequence, _clean_text
15
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
16
+ language_marks = {
17
+ "Japanese": "",
18
+ "日本語": "[JA]",
19
+ "简体中文": "[ZH]",
20
+ "English": "[EN]",
21
+ "Mix": "",
22
+ }
23
+ lang = ['日本語', '简体中文', 'English', 'Mix']
24
def get_text(text, hps, is_symbol):
    """Turn *text* into a LongTensor of symbol ids.

    When ``is_symbol`` is true the input is already a symbol sequence, so no
    text cleaners are applied. Blank tokens (id 0) are interspersed when the
    hparams request it.
    """
    cleaners = [] if is_symbol else hps.data.text_cleaners
    sequence = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence)
30
+
31
def create_tts_fn(model, hps, speaker_ids):
    """Build a gradio-compatible TTS callback bound to *model* and *hps*.

    The returned ``tts_fn(text, speaker, language, speed)`` yields
    ``("Success", (sampling_rate, audio))`` with a float32 numpy waveform.
    """
    def tts_fn(text, speaker, language, speed):
        if language is not None:
            # Wrap the text in the language's marker tokens (may be empty).
            mark = language_marks[language]
            text = mark + text + mark
        chosen_id = speaker_ids[speaker]
        stn_tst = get_text(text, hps, False)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([chosen_id]).to(device)
            # length_scale is the inverse of the requested speed.
            audio = model.infer(
                x_tst, x_tst_lengths, sid=sid,
                noise_scale=.667, noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )[0][0, 0].data.cpu().float().numpy()
        del stn_tst, x_tst, x_tst_lengths, sid
        return "Success", (hps.data.sampling_rate, audio)

    return tts_fn
47
+
48
def create_vc_fn(model, hps, speaker_ids):
    """Build a gradio-compatible voice-conversion callback.

    The returned ``vc_fn(original_speaker, target_speaker, record_audio,
    upload_audio)`` converts the recorded (preferred) or uploaded clip from the
    source speaker's timbre to the target's and returns
    ``("Success", (sampling_rate, audio))``, or ``(error_message, None)`` when
    no audio was provided.
    """
    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        if input_audio is None:
            return "You need to record or upload an audio", None
        sampling_rate, audio = input_audio
        original_speaker_id = speaker_ids[original_speaker]
        target_speaker_id = speaker_ids[target_speaker]

        # Normalize to float32 in [-1, 1]. The original code unconditionally
        # divided by np.iinfo(audio.dtype).max, which raises ValueError when
        # gradio hands back a float array; only integer PCM needs scaling.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            # Multi-channel (samples, channels) -> mono.
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with no_grad():
            y = torch.FloatTensor(audio)
            # NOTE(review): dividing by 0.99 pushes the peak slightly above 1;
            # "* 0.99" was probably intended — kept as-is to preserve behavior.
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(device)
            spec_lengths = LongTensor([spec.size(-1)]).to(device)
            sid_src = LongTensor([original_speaker_id]).to(device)
            sid_tgt = LongTensor([target_speaker_id]).to(device)
            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn
79
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
    parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
    # Parse the flag value explicitly: with no `type`, any string passed on the
    # command line (even "False") was truthy, so "--share False" still created
    # a public link. "--share True" keeps working unchanged.
    parser.add_argument("--share", type=lambda s: str(s).lower() in ("true", "1", "yes"),
                        default=False, help="make link public (used in colab)")

    args = parser.parse_args()
    hps = utils.get_hparams_from_file(args.config_dir)

    # Build the synthesizer and restore the fine-tuned checkpoint.
    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model_dir, net_g, None)
    speaker_ids = hps.speakers
    speakers = list(hps.speakers.keys())
    tts_fn = create_tts_fn(net_g, hps, speaker_ids)
    vc_fn = create_vc_fn(net_g, hps, speaker_ids)

    # Gradio UI: one tab for text-to-speech, one for voice conversion.
    app = gr.Blocks()
    with app:
        with gr.Tab("Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    textbox = gr.TextArea(label="Text",
                                          placeholder="Type your sentence here",
                                          value="こんにちわ。", elem_id=f"tts-input")
                    # select character
                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
                                                label='速度 Speed')
                with gr.Column():
                    text_output = gr.Textbox(label="Message")
                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                    btn = gr.Button("Generate!")
                    btn.click(tts_fn,
                              inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
                              outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
                            录制或上传声音,并选择要转换的音色。
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
                converted_audio = gr.Audio(label='converted audio')
            btn = gr.Button("Convert!")
            btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                      outputs=[message_box, converted_audio])
    # Open the local UI in a browser, then block on the gradio server.
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
139
+
inference/finetune_speaker.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 100,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 16,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "final_annotation_train.txt",
24
+ "validation_files": "final_annotation_val.txt",
25
+ "text_cleaners": [
26
+ "zh_ja_mixture_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 22050,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 3,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256
87
+ },
88
+ "speakers": {
89
+ "Hana": 0,
90
+ "specialweek": 1,
91
+ "zhongli": 2
92
+ },
93
+ "symbols": [
94
+ "_",
95
+ ",",
96
+ ".",
97
+ "!",
98
+ "?",
99
+ "-",
100
+ "~",
101
+ "\u2026",
102
+ "A",
103
+ "E",
104
+ "I",
105
+ "N",
106
+ "O",
107
+ "Q",
108
+ "U",
109
+ "a",
110
+ "b",
111
+ "d",
112
+ "e",
113
+ "f",
114
+ "g",
115
+ "h",
116
+ "i",
117
+ "j",
118
+ "k",
119
+ "l",
120
+ "m",
121
+ "n",
122
+ "o",
123
+ "p",
124
+ "r",
125
+ "s",
126
+ "t",
127
+ "u",
128
+ "v",
129
+ "w",
130
+ "y",
131
+ "z",
132
+ "\u0283",
133
+ "\u02a7",
134
+ "\u02a6",
135
+ "\u026f",
136
+ "\u0279",
137
+ "\u0259",
138
+ "\u0265",
139
+ "\u207c",
140
+ "\u02b0",
141
+ "`",
142
+ "\u2192",
143
+ "\u2193",
144
+ "\u2191",
145
+ " "
146
+ ]
147
+ }
monotonic_align/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from .monotonic_align.core import maximum_path_c
4
+
5
+
6
def maximum_path(neg_cent, mask):
    """Cython-optimized monotonic alignment search.

    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    Returns a tensor of the same shape/dtype/device with 1 on the best path.
    """
    orig_device = neg_cent.device
    orig_dtype = neg_cent.dtype

    # The Cython kernel works on contiguous float32/int32 host arrays.
    scores = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(scores.shape, dtype=np.int32)

    # Valid (unpadded) extents per batch element, taken from the mask.
    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)

    maximum_path_c(path, scores, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=orig_device, dtype=orig_dtype)
monotonic_align/core.pyx ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
  # Monotonic alignment search for a single batch element (Viterbi-style DP).
  # path:  [t_y, t_x] output; cells on the chosen monotonic path are set to 1.
  # value: [t_y, t_x] scores, overwritten in place with running DP maxima
  #        during the forward pass.
  # t_y, t_x: valid (unpadded) extents for this element.
  cdef int x
  cdef int y
  cdef float v_prev
  cdef float v_cur
  cdef float tmp  # NOTE(review): unused; kept to leave the code unchanged
  cdef int index = t_x - 1

  # Forward pass: value[y, x] becomes the best cumulative score of any
  # monotonic path reaching (y, x). Only cells with x <= y and
  # x >= t_x + y - t_y can lie on a complete path, hence the clamped range.
  for y in range(t_y):
    for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
      if x == y:
        # No "stay in same column" predecessor exists on the diagonal edge.
        v_cur = max_neg_val
      else:
        v_cur = value[y-1, x]
      if x == 0:
        if y == 0:
          v_prev = 0.  # origin cell: no predecessor cost
        else:
          v_prev = max_neg_val
      else:
        v_prev = value[y-1, x-1]
      value[y, x] += max(v_prev, v_cur)

  # Backward pass: trace the argmax path from the bottom-right corner,
  # stepping one column left whenever the diagonal predecessor scored better.
  for y in range(t_y - 1, -1, -1):
    path[y, index] = 1
    if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
      index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
  # Batched entry point: run the per-element search over the batch dimension
  # in parallel via OpenMP prange; each element uses its own valid extents.
  cdef int b = paths.shape[0]
  cdef int i
  for i in prange(b, nogil=True):
    maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
monotonic_align/setup.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from distutils.core import setup
from Cython.Build import cythonize
import numpy

# Compile the monotonic alignment search kernel (core.pyx) in place.
# NumPy headers are needed because the extension operates on ndarray buffers.
setup(
    name="monotonic_align",
    ext_modules=cythonize("core.pyx"),
    include_dirs=[numpy.get_include()],
)