kiramayatu committed
Commit
2fbb3ac
1 Parent(s): 5091c1d

Delete VITS-fast-fine-tuning

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. VITS-fast-fine-tuning/.idea/.gitignore +0 -3
  2. VITS-fast-fine-tuning/.idea/VITS_voice_conversion.iml +0 -12
  3. VITS-fast-fine-tuning/.idea/inspectionProfiles/Project_Default.xml +0 -154
  4. VITS-fast-fine-tuning/.idea/inspectionProfiles/profiles_settings.xml +0 -6
  5. VITS-fast-fine-tuning/.idea/misc.xml +0 -4
  6. VITS-fast-fine-tuning/.idea/modules.xml +0 -8
  7. VITS-fast-fine-tuning/.idea/vcs.xml +0 -6
  8. VITS-fast-fine-tuning/DATA.MD +0 -42
  9. VITS-fast-fine-tuning/DATA_EN.MD +0 -46
  10. VITS-fast-fine-tuning/LICENSE +0 -201
  11. VITS-fast-fine-tuning/README.md +0 -55
  12. VITS-fast-fine-tuning/README_ZH.md +0 -60
  13. VITS-fast-fine-tuning/VC_inference.py +0 -139
  14. VITS-fast-fine-tuning/attentions.py +0 -303
  15. VITS-fast-fine-tuning/cmd_inference.py +0 -106
  16. VITS-fast-fine-tuning/commons.py +0 -164
  17. VITS-fast-fine-tuning/configs/modified_finetune_speaker.json +0 -172
  18. VITS-fast-fine-tuning/configs/uma_trilingual.json +0 -54
  19. VITS-fast-fine-tuning/data_utils.py +0 -267
  20. VITS-fast-fine-tuning/denoise_audio.py +0 -18
  21. VITS-fast-fine-tuning/download_model.py +0 -4
  22. VITS-fast-fine-tuning/download_video.py +0 -37
  23. VITS-fast-fine-tuning/finetune_speaker_v2.py +0 -321
  24. VITS-fast-fine-tuning/inference/G_latest.pth +0 -3
  25. VITS-fast-fine-tuning/inference/ONNXVITS_inference.py +0 -36
  26. VITS-fast-fine-tuning/inference/VC_inference.py +0 -139
  27. VITS-fast-fine-tuning/inference/finetune_speaker.json +0 -147
  28. VITS-fast-fine-tuning/long_audio_transcribe.py +0 -71
  29. VITS-fast-fine-tuning/losses.py +0 -61
  30. VITS-fast-fine-tuning/mel_processing.py +0 -112
  31. VITS-fast-fine-tuning/models.py +0 -533
  32. VITS-fast-fine-tuning/models_infer.py +0 -402
  33. VITS-fast-fine-tuning/modules.py +0 -390
  34. VITS-fast-fine-tuning/monotonic_align/__init__.py +0 -19
  35. VITS-fast-fine-tuning/monotonic_align/core.pyx +0 -42
  36. VITS-fast-fine-tuning/monotonic_align/setup.py +0 -9
  37. VITS-fast-fine-tuning/preprocess_v2.py +0 -151
  38. VITS-fast-fine-tuning/rearrange_speaker.py +0 -37
  39. VITS-fast-fine-tuning/requirements.txt +0 -24
  40. VITS-fast-fine-tuning/short_audio_transcribe.py +0 -111
  41. VITS-fast-fine-tuning/text/LICENSE +0 -19
  42. VITS-fast-fine-tuning/text/__init__.py +0 -60
  43. VITS-fast-fine-tuning/text/__pycache__/__init__.cpython-37.pyc +0 -0
  44. VITS-fast-fine-tuning/text/__pycache__/cleaners.cpython-37.pyc +0 -0
  45. VITS-fast-fine-tuning/text/__pycache__/english.cpython-37.pyc +0 -0
  46. VITS-fast-fine-tuning/text/__pycache__/japanese.cpython-37.pyc +0 -0
  47. VITS-fast-fine-tuning/text/__pycache__/korean.cpython-37.pyc +0 -0
  48. VITS-fast-fine-tuning/text/__pycache__/mandarin.cpython-37.pyc +0 -0
  49. VITS-fast-fine-tuning/text/__pycache__/sanskrit.cpython-37.pyc +0 -0
  50. VITS-fast-fine-tuning/text/__pycache__/symbols.cpython-37.pyc +0 -0
VITS-fast-fine-tuning/.idea/.gitignore DELETED
@@ -1,3 +0,0 @@
- # Default ignored files
- /shelf/
- /workspace.xml
VITS-fast-fine-tuning/.idea/VITS_voice_conversion.iml DELETED
@@ -1,12 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <module type="PYTHON_MODULE" version="4">
- <component name="NewModuleRootManager">
- <content url="file://$MODULE_DIR$" />
- <orderEntry type="jdk" jdkName="Python 3.7 (VITS)" jdkType="Python SDK" />
- <orderEntry type="sourceFolder" forTests="false" />
- </component>
- <component name="PyDocumentationSettings">
- <option name="format" value="PLAIN" />
- <option name="myDocStringFormat" value="Plain" />
- </component>
- </module>
VITS-fast-fine-tuning/.idea/inspectionProfiles/Project_Default.xml DELETED
@@ -1,154 +0,0 @@
1
- <component name="InspectionProjectProfileManager">
2
- <profile version="1.0">
3
- <option name="myName" value="Project Default" />
4
- <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
- <option name="ignoredPackages">
6
- <value>
7
- <list size="132">
8
- <item index="0" class="java.lang.String" itemvalue="ccxt" />
9
- <item index="1" class="java.lang.String" itemvalue="lz4" />
10
- <item index="2" class="java.lang.String" itemvalue="pre-commit" />
11
- <item index="3" class="java.lang.String" itemvalue="elegantrl" />
12
- <item index="4" class="java.lang.String" itemvalue="setuptools" />
13
- <item index="5" class="java.lang.String" itemvalue="ray" />
14
- <item index="6" class="java.lang.String" itemvalue="gputil" />
15
- <item index="7" class="java.lang.String" itemvalue="google-pasta" />
16
- <item index="8" class="java.lang.String" itemvalue="tensorflow-estimator" />
17
- <item index="9" class="java.lang.String" itemvalue="scikit-learn" />
18
- <item index="10" class="java.lang.String" itemvalue="tabulate" />
19
- <item index="11" class="java.lang.String" itemvalue="multitasking" />
20
- <item index="12" class="java.lang.String" itemvalue="pickleshare" />
21
- <item index="13" class="java.lang.String" itemvalue="pyasn1-modules" />
22
- <item index="14" class="java.lang.String" itemvalue="ipython-genutils" />
23
- <item index="15" class="java.lang.String" itemvalue="Pygments" />
24
- <item index="16" class="java.lang.String" itemvalue="mccabe" />
25
- <item index="17" class="java.lang.String" itemvalue="astunparse" />
26
- <item index="18" class="java.lang.String" itemvalue="lxml" />
27
- <item index="19" class="java.lang.String" itemvalue="Werkzeug" />
28
- <item index="20" class="java.lang.String" itemvalue="tensorboard-data-server" />
29
- <item index="21" class="java.lang.String" itemvalue="jupyter-client" />
30
- <item index="22" class="java.lang.String" itemvalue="pexpect" />
31
- <item index="23" class="java.lang.String" itemvalue="click" />
32
- <item index="24" class="java.lang.String" itemvalue="ipykernel" />
33
- <item index="25" class="java.lang.String" itemvalue="pandas-datareader" />
34
- <item index="26" class="java.lang.String" itemvalue="psutil" />
35
- <item index="27" class="java.lang.String" itemvalue="jedi" />
36
- <item index="28" class="java.lang.String" itemvalue="regex" />
37
- <item index="29" class="java.lang.String" itemvalue="tensorboard" />
38
- <item index="30" class="java.lang.String" itemvalue="platformdirs" />
39
- <item index="31" class="java.lang.String" itemvalue="matplotlib" />
40
- <item index="32" class="java.lang.String" itemvalue="idna" />
41
- <item index="33" class="java.lang.String" itemvalue="rsa" />
42
- <item index="34" class="java.lang.String" itemvalue="decorator" />
43
- <item index="35" class="java.lang.String" itemvalue="numpy" />
44
- <item index="36" class="java.lang.String" itemvalue="pyasn1" />
45
- <item index="37" class="java.lang.String" itemvalue="requests" />
46
- <item index="38" class="java.lang.String" itemvalue="tensorflow" />
47
- <item index="39" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
48
- <item index="40" class="java.lang.String" itemvalue="Deprecated" />
49
- <item index="41" class="java.lang.String" itemvalue="nest-asyncio" />
50
- <item index="42" class="java.lang.String" itemvalue="prompt-toolkit" />
51
- <item index="43" class="java.lang.String" itemvalue="keras-tuner" />
52
- <item index="44" class="java.lang.String" itemvalue="scipy" />
53
- <item index="45" class="java.lang.String" itemvalue="dataclasses" />
54
- <item index="46" class="java.lang.String" itemvalue="tornado" />
55
- <item index="47" class="java.lang.String" itemvalue="google-auth-oauthlib" />
56
- <item index="48" class="java.lang.String" itemvalue="black" />
57
- <item index="49" class="java.lang.String" itemvalue="toml" />
58
- <item index="50" class="java.lang.String" itemvalue="Quandl" />
59
- <item index="51" class="java.lang.String" itemvalue="pandas" />
60
- <item index="52" class="java.lang.String" itemvalue="termcolor" />
61
- <item index="53" class="java.lang.String" itemvalue="pylint" />
62
- <item index="54" class="java.lang.String" itemvalue="typing_extensions" />
63
- <item index="55" class="java.lang.String" itemvalue="cachetools" />
64
- <item index="56" class="java.lang.String" itemvalue="debugpy" />
65
- <item index="57" class="java.lang.String" itemvalue="isort" />
66
- <item index="58" class="java.lang.String" itemvalue="pytz" />
67
- <item index="59" class="java.lang.String" itemvalue="inflection" />
68
- <item index="60" class="java.lang.String" itemvalue="Pillow" />
69
- <item index="61" class="java.lang.String" itemvalue="traitlets" />
70
- <item index="62" class="java.lang.String" itemvalue="absl-py" />
71
- <item index="63" class="java.lang.String" itemvalue="protobuf" />
72
- <item index="64" class="java.lang.String" itemvalue="joblib" />
73
- <item index="65" class="java.lang.String" itemvalue="threadpoolctl" />
74
- <item index="66" class="java.lang.String" itemvalue="opt-einsum" />
75
- <item index="67" class="java.lang.String" itemvalue="python-dateutil" />
76
- <item index="68" class="java.lang.String" itemvalue="gpflow" />
77
- <item index="69" class="java.lang.String" itemvalue="astroid" />
78
- <item index="70" class="java.lang.String" itemvalue="cycler" />
79
- <item index="71" class="java.lang.String" itemvalue="gast" />
80
- <item index="72" class="java.lang.String" itemvalue="kt-legacy" />
81
- <item index="73" class="java.lang.String" itemvalue="appdirs" />
82
- <item index="74" class="java.lang.String" itemvalue="tensorflow-probability" />
83
- <item index="75" class="java.lang.String" itemvalue="pip" />
84
- <item index="76" class="java.lang.String" itemvalue="pyzmq" />
85
- <item index="77" class="java.lang.String" itemvalue="certifi" />
86
- <item index="78" class="java.lang.String" itemvalue="oauthlib" />
87
- <item index="79" class="java.lang.String" itemvalue="pyparsing" />
88
- <item index="80" class="java.lang.String" itemvalue="Markdown" />
89
- <item index="81" class="java.lang.String" itemvalue="h5py" />
90
- <item index="82" class="java.lang.String" itemvalue="wrapt" />
91
- <item index="83" class="java.lang.String" itemvalue="kiwisolver" />
92
- <item index="84" class="java.lang.String" itemvalue="empyrical" />
93
- <item index="85" class="java.lang.String" itemvalue="backcall" />
94
- <item index="86" class="java.lang.String" itemvalue="charset-normalizer" />
95
- <item index="87" class="java.lang.String" itemvalue="multipledispatch" />
96
- <item index="88" class="java.lang.String" itemvalue="pathspec" />
97
- <item index="89" class="java.lang.String" itemvalue="jupyter-core" />
98
- <item index="90" class="java.lang.String" itemvalue="matplotlib-inline" />
99
- <item index="91" class="java.lang.String" itemvalue="ptyprocess" />
100
- <item index="92" class="java.lang.String" itemvalue="more-itertools" />
101
- <item index="93" class="java.lang.String" itemvalue="mypy-extensions" />
102
- <item index="94" class="java.lang.String" itemvalue="cloudpickle" />
103
- <item index="95" class="java.lang.String" itemvalue="wcwidth" />
104
- <item index="96" class="java.lang.String" itemvalue="requests-oauthlib" />
105
- <item index="97" class="java.lang.String" itemvalue="Keras-Preprocessing" />
106
- <item index="98" class="java.lang.String" itemvalue="yfinance" />
107
- <item index="99" class="java.lang.String" itemvalue="tomli" />
108
- <item index="100" class="java.lang.String" itemvalue="urllib3" />
109
- <item index="101" class="java.lang.String" itemvalue="six" />
110
- <item index="102" class="java.lang.String" itemvalue="parso" />
111
- <item index="103" class="java.lang.String" itemvalue="wheel" />
112
- <item index="104" class="java.lang.String" itemvalue="ipython" />
113
- <item index="105" class="java.lang.String" itemvalue="packaging" />
114
- <item index="106" class="java.lang.String" itemvalue="lazy-object-proxy" />
115
- <item index="107" class="java.lang.String" itemvalue="grpcio" />
116
- <item index="108" class="java.lang.String" itemvalue="dm-tree" />
117
- <item index="109" class="java.lang.String" itemvalue="google-auth" />
118
- <item index="110" class="java.lang.String" itemvalue="seaborn" />
119
- <item index="111" class="java.lang.String" itemvalue="thop" />
120
- <item index="112" class="java.lang.String" itemvalue="torch" />
121
- <item index="113" class="java.lang.String" itemvalue="torchvision" />
122
- <item index="114" class="java.lang.String" itemvalue="d2l" />
123
- <item index="115" class="java.lang.String" itemvalue="keyboard" />
124
- <item index="116" class="java.lang.String" itemvalue="transformers" />
125
- <item index="117" class="java.lang.String" itemvalue="phonemizer" />
126
- <item index="118" class="java.lang.String" itemvalue="Unidecode" />
127
- <item index="119" class="java.lang.String" itemvalue="nltk" />
128
- <item index="120" class="java.lang.String" itemvalue="pinecone-client" />
129
- <item index="121" class="java.lang.String" itemvalue="sentence-transformers" />
130
- <item index="122" class="java.lang.String" itemvalue="whisper" />
131
- <item index="123" class="java.lang.String" itemvalue="datasets" />
132
- <item index="124" class="java.lang.String" itemvalue="pyaudio" />
133
- <item index="125" class="java.lang.String" itemvalue="torchsummary" />
134
- <item index="126" class="java.lang.String" itemvalue="openjtalk" />
135
- <item index="127" class="java.lang.String" itemvalue="hydra-core" />
136
- <item index="128" class="java.lang.String" itemvalue="museval" />
137
- <item index="129" class="java.lang.String" itemvalue="mypy" />
138
- <item index="130" class="java.lang.String" itemvalue="hydra-colorlog" />
139
- <item index="131" class="java.lang.String" itemvalue="flake8" />
140
- </list>
141
- </value>
142
- </option>
143
- </inspection_tool>
144
- <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
145
- <option name="ignoredIdentifiers">
146
- <list>
147
- <option value="sentiment_classification.model_predictions.audio_path" />
148
- <option value="sentiment_classification.model_predictions.sample_rate" />
149
- <option value="sentiment_classification.model_predictions.num_samples" />
150
- </list>
151
- </option>
152
- </inspection_tool>
153
- </profile>
154
- </component>
VITS-fast-fine-tuning/.idea/inspectionProfiles/profiles_settings.xml DELETED
@@ -1,6 +0,0 @@
- <component name="InspectionProjectProfileManager">
- <settings>
- <option name="USE_PROJECT_PROFILE" value="false" />
- <version value="1.0" />
- </settings>
- </component>
VITS-fast-fine-tuning/.idea/misc.xml DELETED
@@ -1,4 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (VITS)" project-jdk-type="Python SDK" />
- </project>
VITS-fast-fine-tuning/.idea/modules.xml DELETED
@@ -1,8 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="ProjectModuleManager">
- <modules>
- <module fileurl="file://$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" filepath="$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" />
- </modules>
- </component>
- </project>
VITS-fast-fine-tuning/.idea/vcs.xml DELETED
@@ -1,6 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="VcsDirectoryMappings">
- <mapping directory="$PROJECT_DIR$" vcs="Git" />
- </component>
- </project>
VITS-fast-fine-tuning/DATA.MD DELETED
@@ -1,42 +0,0 @@
- The pipeline of this repo supports several ways of uploading voice samples; simply pick whichever one (or several) matches the data you have.
-
- 1. Short audios packed in a single `.zip` file and organized by character name. The archive structure should look like this:
- ```
- Your-zip-file.zip
- ├───Character_name_1
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───Character_name_2
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───...
-
- └───Character_name_n
- ├───xxx.wav
- ├───...
- ├───yyy.mp3
- └───zzz.wav
- ```
- Note that the format and file names of the audio files do not matter, as long as they are audio files.
- Quality requirement: longer than 2 seconds, shorter than 10 seconds, with as little background noise as possible.
- Quantity requirement: at least 10 clips per character; 20+ clips per character is recommended.
- 2. Long audio files named after the character. Each audio must contain a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.wav`
- (e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`). They must be `.wav` files and no longer than 20 minutes (otherwise processing will run out of memory).
-
- 3. Long video files named after the character. Each video must contain a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.mp4`
- (e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`). They must be `.mp4` files and no longer than 20 minutes (otherwise processing will run out of memory).
- Note: in the file name, `CharacterName` must consist of English characters only; `random_number` distinguishes multiple files of the same character, must be included, and can be any integer between 0 and 999999.
-
- 4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, formatted as follows:
- ```
- Char1|https://xyz.com/video1/
- Char2|https://xyz.com/video2/
- Char2|https://xyz.com/video3/
- Char3|https://xyz.com/video4/
- ```
- Each video must contain a single speaker only; background sound will be removed automatically. Currently only videos from bilibili are supported; URLs from other sites have not been tested.
- If you have any questions about the formats, sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
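A note on the requirements above: the clip count and the 2-10 second length are easy to verify locally before packing the `.zip`. The sketch below is not part of the repo; the folder name `custom_character_voice` is just an example of a directory laid out like the archive shown above, and decoding `.mp3` files with librosa requires ffmpeg/audioread to be available.
```
import os
import librosa

ROOT = "custom_character_voice"  # example folder mirroring the zip layout above

for character in sorted(os.listdir(ROOT)):
    folder = os.path.join(ROOT, character)
    if not os.path.isdir(folder):
        continue
    clips = [f for f in os.listdir(folder) if f.lower().endswith((".wav", ".mp3"))]
    outside_range = 0
    for clip in clips:
        # load at the native sampling rate just to measure the duration
        y, sr = librosa.load(os.path.join(folder, clip), sr=None)
        if not 2.0 <= len(y) / sr <= 10.0:
            outside_range += 1
    verdict = "OK" if len(clips) >= 10 else "fewer than 10 clips"
    print(f"{character}: {len(clips)} clips, {outside_range} outside 2-10 s -> {verdict}")
```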
VITS-fast-fine-tuning/DATA_EN.MD DELETED
@@ -1,46 +0,0 @@
- The pipeline of this repo supports multiple voice uploading options; you can choose one or more of them depending on the data you have.
-
- 1. Short audios packed in a single `.zip` file, whose file structure should be as shown below:
- ```
- Your-zip-file.zip
- ├───Character_name_1
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───Character_name_2
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───...
-
- └───Character_name_n
- ├───xxx.wav
- ├───...
- ├───yyy.mp3
- └───zzz.wav
- ```
- Note that the format of the audio files does not matter as long as they are audio files.
- Quality requirement: >=2 s and <=10 s each, with as little background sound as possible.
- Quantity requirement: at least 10 per character; 20+ per character is recommended.
- 2. Long audio files named by character name, each of which should contain a single character's voice only. Background sound is
- acceptable since it will be removed automatically. File name format: `{CharacterName}_{random_number}.wav`
- (e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); they must be `.wav` files.
-
-
- 3. Long video files named by character name, each of which should contain a single character's voice only. Background sound is
- acceptable since it will be removed automatically. File name format: `{CharacterName}_{random_number}.mp4`
- (e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); they must be `.mp4` files.
- Note: `CharacterName` must consist of English characters only; `random_number` identifies multiple files for one character
- and must be included. It can be any integer between 0 and 999999.
-
- 4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, which should be formatted as follows:
- ```
- Char1|https://xyz.com/video1/
- Char2|https://xyz.com/video2/
- Char2|https://xyz.com/video3/
- Char3|https://xyz.com/video4/
- ```
- One video should contain a single speaker only. Currently only video links from bilibili are supported; other websites have not been tested yet.
- Questions about the data format? Sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
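The naming rules in options 2-4 above are the most common source of upload errors, so a quick local check can help. This is a minimal sketch, not part of the repo; the regex, the `./raw_uploads` folder and the `./video_list.txt` file name are illustrative assumptions.
```
import re
from pathlib import Path

# {CharacterName}_{random_number}.wav / .mp4, e.g. Diana_234135.wav
NAME_RE = re.compile(r"^[A-Za-z0-9]+_\d{1,6}\.(wav|mp4)$")

def check_media_names(folder):
    """Flag long audio/video files that do not follow the naming convention above."""
    for p in Path(folder).iterdir():
        if p.suffix.lower() in (".wav", ".mp4") and not NAME_RE.match(p.name):
            print(f"rename needed: {p.name}")

def check_video_list(txt_path):
    """Flag malformed lines in a {CharacterName}|{video_url} list (option 4)."""
    for i, line in enumerate(Path(txt_path).read_text(encoding="utf-8").splitlines(), 1):
        if not line.strip():
            continue
        parts = line.split("|")
        if len(parts) != 2 or not parts[0] or not parts[1].startswith("http"):
            print(f"line {i} looks malformed: {line!r}")

check_media_names("./raw_uploads")    # example folder
check_video_list("./video_list.txt")  # example file
```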
VITS-fast-fine-tuning/LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
VITS-fast-fine-tuning/README.md DELETED
@@ -1,55 +0,0 @@
- [For the Chinese documentation, please click here](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md)
- # VITS Fast Fine-tuning
- This repo will guide you through adding your own character voices, or even your own voice, to an existing VITS TTS model
- so that, in less than 1 hour, it can do the following tasks:
-
- 1. Many-to-many voice conversion between any characters you added & preset characters in the model.
- 2. English, Japanese & Chinese text-to-speech synthesis with the characters you added & preset characters.
-
-
- Welcome to play around with the base models!
- Chinese & English & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me
-
- Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
-
-
- ### Currently Supported Tasks:
- - [x] Clone a character's voice from 10+ short audios
- - [x] Clone a character's voice from long audio(s) >= 3 minutes (one audio should contain a single speaker only)
- - [x] Clone a character's voice from video(s) >= 3 minutes (one video should contain a single speaker only)
- - [x] Clone a character's voice from BILIBILI video links (one video should contain a single speaker only)
-
- ### Currently Supported Characters for TTS & VC:
- - [x] Any character you wish, as long as you have their voice samples!
- (Note that voice conversion can only be conducted between any two speakers in the model)
-
-
-
- ## Fine-tuning
- It's recommended to perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
- because the original VITS has some dependencies that are difficult to configure.
-
- ### How long does it take?
- 1. Install dependencies (3 min)
- 2. Choose a pretrained model to start from. The detailed differences between them are described in the [Colab Notebook](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
- 3. Upload the voice samples of the characters you wish to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD) for the detailed uploading options.
- 4. Start fine-tuning. The time taken varies from 20 minutes to 2 hours, depending on the number of voices you uploaded.
-
-
- ## Inference or Usage (currently supports Windows only)
- 0. Remember to download your fine-tuned model!
- 1. Download the latest release
- 2. Put your model & config file into the folder `inference`; they should be named `G_latest.pth` and `finetune_speaker.json`, respectively.
- 3. The file structure should be as follows:
- ```
- inference
- ├───inference.exe
- ├───...
- ├───finetune_speaker.json
- └───G_latest.pth
- ```
- 4. Run `inference.exe`; the browser should pop up automatically.
-
- ## Use in MoeGoe
- 0. Prepare the downloaded model & config file, named `G_latest.pth` and `moegoe_config.json`, respectively.
- 1. Follow the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page instructions to install it, configure the paths, and use it.
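A small optional check, not part of the release, that verifies the folder layout from step 3 before you run `inference.exe`:
```
from pathlib import Path

# Pre-flight check for the layout shown in step 3 above.
inference_dir = Path("inference")
expected = ["inference.exe", "finetune_speaker.json", "G_latest.pth"]
missing = [name for name in expected if not (inference_dir / name).exists()]
print("missing files:", missing if missing else "none -- ready to run inference.exe")
```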
VITS-fast-fine-tuning/README_ZH.md DELETED
@@ -1,60 +0,0 @@
- For the English documentation, please click [here](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README.md)
- # VITS Fast Fine-tuning
- This repo will guide you through adding custom characters (or even your own voice) to a pretrained VITS model; after less than an hour of fine-tuning, the model will be able to:
- 1. Perform voice conversion between any two speakers included in the model
- 2. Synthesize Chinese, Japanese and English text-to-speech with the voices of the characters you added.
-
- The base models used in this project cover common anime-style male/female voices (from the Genshin Impact dataset) as well as common real-world male/female voices (from the VCTK dataset). They support Chinese, Japanese and English, which ensures the model adapts to new voices quickly during fine-tuning.
-
- You are welcome to try out the base models used for fine-tuning!
-
- Chinese, Japanese & English: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: me
-
- Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
-
- ### Currently supported tasks:
- - [x] Clone a character's voice from 10+ short audio clips
- - [x] Clone a character's voice from 3+ minutes of long audio (each audio file must contain a single speaker only)
- - [x] Clone a character's voice from 3+ minutes of video (each video must contain a single speaker only)
- - [x] Clone a character's voice from bilibili video links (each video must contain a single speaker only)
-
- ### Characters currently supported for voice conversion and Chinese/Japanese/English TTS
- - [x] Any character (as long as you have voice samples of that character)
- (Note: voice conversion can only be performed between any two speakers that exist in the model)
-
-
-
-
- ## Fine-tuning
- It is recommended to run the fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing),
- because some of VITS's multilingual environment dependencies are quite difficult to configure.
- ### How long will it take on Google Colab?
- 1. Install dependencies (3 min)
- 2. Choose a pretrained model; see the [Colab notebook page](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing) for the detailed differences between them.
- 3. Upload the voice samples of the other characters you want to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA.MD) for the detailed upload options.
- 4. Run the fine-tuning. Depending on the chosen method and the number of samples, it may take anywhere from 20 minutes to 2 hours.
-
- When fine-tuning has finished, you can download the fine-tuned model directly and run it locally later (no GPU required).
-
- ## Running and inference locally
- 0. Remember to download the fine-tuned model and config file!
- 1. Download the latest release package (on the right side of the GitHub page)
- 2. Put the downloaded model and config file into the `inference` folder, named `G_latest.pth` and `finetune_speaker.json` respectively.
- 3. Once everything is in place, the file structure should look like this:
- ```
- inference
- ├───inference.exe
- ├───...
- ├───finetune_speaker.json
- └───G_latest.pth
- ```
- 4. Run `inference.exe`; a browser window will pop up automatically. Note that the path it is located in must not contain Chinese characters or spaces.
-
- ## Using the model in MoeGoe
- 0. MoeGoe and other similar VITS inference UIs use a slightly different config format; the files to download are the model `G_latest.pth` and the config file `moegoe_config.json`.
- 1. Follow the instructions on the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page to configure the paths, and it is ready to use.
- 2. When entering a sentence in MoeGoe, wrap it with the corresponding language tag so it can be synthesized correctly ([JA] for Japanese, [ZH] for Chinese, [EN] for English), for example:
- [JA]こんにちわ。[JA]
- [ZH]你好![ZH]
- [EN]Hello![EN]
-
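The language tags in step 2 above are the same markers the repo's own inference code uses internally (the `language_marks` dict in `VC_inference.py`; the "Mix" mode uses no marker). A tiny helper, shown here only as a sketch, makes the wrapping less error-prone:
```
# Same markers as the language_marks dict in VC_inference.py.
LANGUAGE_MARKS = {"日本語": "[JA]", "简体中文": "[ZH]", "English": "[EN]"}

def wrap_for_moegoe(text, language):
    mark = LANGUAGE_MARKS[language]
    return f"{mark}{text}{mark}"

print(wrap_for_moegoe("こんにちわ。", "日本語"))  # [JA]こんにちわ。[JA]
print(wrap_for_moegoe("你好!", "简体中文"))      # [ZH]你好![ZH]
print(wrap_for_moegoe("Hello!", "English"))       # [EN]Hello![EN]
```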
VITS-fast-fine-tuning/VC_inference.py DELETED
@@ -1,139 +0,0 @@
- import os
- import numpy as np
- import torch
- from torch import no_grad, LongTensor
- import argparse
- import commons
- from mel_processing import spectrogram_torch
- import utils
- from models import SynthesizerTrn
- import gradio as gr
- import librosa
- import webbrowser
-
- from text import text_to_sequence, _clean_text
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- language_marks = {
-     "Japanese": "",
-     "日本語": "[JA]",
-     "简体中文": "[ZH]",
-     "English": "[EN]",
-     "Mix": "",
- }
- lang = ['日本語', '简体中文', 'English', 'Mix']
- def get_text(text, hps, is_symbol):
-     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
-     if hps.data.add_blank:
-         text_norm = commons.intersperse(text_norm, 0)
-     text_norm = LongTensor(text_norm)
-     return text_norm
-
- def create_tts_fn(model, hps, speaker_ids):
-     def tts_fn(text, speaker, language, speed):
-         if language is not None:
-             text = language_marks[language] + text + language_marks[language]
-         speaker_id = speaker_ids[speaker]
-         stn_tst = get_text(text, hps, False)
-         with no_grad():
-             x_tst = stn_tst.unsqueeze(0).to(device)
-             x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
-             sid = LongTensor([speaker_id]).to(device)
-             audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
-                                 length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
-         del stn_tst, x_tst, x_tst_lengths, sid
-         return "Success", (hps.data.sampling_rate, audio)
-
-     return tts_fn
-
- def create_vc_fn(model, hps, speaker_ids):
-     def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
-         input_audio = record_audio if record_audio is not None else upload_audio
-         if input_audio is None:
-             return "You need to record or upload an audio", None
-         sampling_rate, audio = input_audio
-         original_speaker_id = speaker_ids[original_speaker]
-         target_speaker_id = speaker_ids[target_speaker]
-
-         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-         if len(audio.shape) > 1:
-             audio = librosa.to_mono(audio.transpose(1, 0))
-         if sampling_rate != hps.data.sampling_rate:
-             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
-         with no_grad():
-             y = torch.FloatTensor(audio)
-             y = y / max(-y.min(), y.max()) / 0.99
-             y = y.to(device)
-             y = y.unsqueeze(0)
-             spec = spectrogram_torch(y, hps.data.filter_length,
-                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                                      center=False).to(device)
-             spec_lengths = LongTensor([spec.size(-1)]).to(device)
-             sid_src = LongTensor([original_speaker_id]).to(device)
-             sid_tgt = LongTensor([target_speaker_id]).to(device)
-             audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
-                 0, 0].data.cpu().float().numpy()
-         del y, spec, spec_lengths, sid_src, sid_tgt
-         return "Success", (hps.data.sampling_rate, audio)
-
-     return vc_fn
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model_dir", default="./G_latest.pth", help="path to your fine-tuned model")
-     parser.add_argument("--config_dir", default="./finetune_speaker.json", help="path to your model config file")
-     parser.add_argument("--share", default=False, help="make link public (used in colab)")
-
-     args = parser.parse_args()
-     hps = utils.get_hparams_from_file(args.config_dir)
-
-
-     net_g = SynthesizerTrn(
-         len(hps.symbols),
-         hps.data.filter_length // 2 + 1,
-         hps.train.segment_size // hps.data.hop_length,
-         n_speakers=hps.data.n_speakers,
-         **hps.model).to(device)
-     _ = net_g.eval()
-
-     _ = utils.load_checkpoint(args.model_dir, net_g, None)
-     speaker_ids = hps.speakers
-     speakers = list(hps.speakers.keys())
-     tts_fn = create_tts_fn(net_g, hps, speaker_ids)
-     vc_fn = create_vc_fn(net_g, hps, speaker_ids)
-     app = gr.Blocks()
-     with app:
-         with gr.Tab("Text-to-Speech"):
-             with gr.Row():
-                 with gr.Column():
-                     textbox = gr.TextArea(label="Text",
-                                           placeholder="Type your sentence here",
-                                           value="こんにちわ。", elem_id=f"tts-input")
-                     # select character
-                     char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
-                     language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
-                     duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
-                                                 label='速度 Speed')
-                 with gr.Column():
-                     text_output = gr.Textbox(label="Message")
-                     audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
-                     btn = gr.Button("Generate!")
-                     btn.click(tts_fn,
-                               inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
-                               outputs=[text_output, audio_output])
-         with gr.Tab("Voice Conversion"):
-             gr.Markdown("""
-                         Record or upload your voice, then choose the target voice to convert to.
-             """)
-             with gr.Column():
-                 record_audio = gr.Audio(label="record your voice", source="microphone")
-                 upload_audio = gr.Audio(label="or upload audio here", source="upload")
-                 source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
-                 target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
-             with gr.Column():
-                 message_box = gr.Textbox(label="Message")
-                 converted_audio = gr.Audio(label='converted audio')
-             btn = gr.Button("Convert!")
-             btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
-                       outputs=[message_box, converted_audio])
-     webbrowser.open("http://127.0.0.1:7860")
-     app.launch(share=args.share)
-
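For completeness, the TTS path above can also be driven without the Gradio UI by repeating the loading steps from the `__main__` block. The sketch below assumes the repo's modules are importable and uses the same default file names (`finetune_speaker.json`, `G_latest.pth`); the speaker choice and output file name are arbitrary examples.
```
# Sketch: programmatic TTS with the functions above (assumes the repo files are importable).
import utils
from scipy.io import wavfile
from models import SynthesizerTrn
from VC_inference import create_tts_fn, device

hps = utils.get_hparams_from_file("./finetune_speaker.json")
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).to(device)
net_g.eval()
utils.load_checkpoint("./G_latest.pth", net_g, None)

tts_fn = create_tts_fn(net_g, hps, hps.speakers)
first_speaker = list(hps.speakers.keys())[0]
status, (sr, audio) = tts_fn("こんにちわ。", first_speaker, "日本語", speed=1.0)
wavfile.write("demo.wav", sr, audio)  # float32 samples at the model's sampling rate
```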
VITS-fast-fine-tuning/attentions.py DELETED
@@ -1,303 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
-
8
- import commons
9
- import modules
10
- from modules import LayerNorm
11
-
12
-
13
- class Encoder(nn.Module):
14
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15
- super().__init__()
16
- self.hidden_channels = hidden_channels
17
- self.filter_channels = filter_channels
18
- self.n_heads = n_heads
19
- self.n_layers = n_layers
20
- self.kernel_size = kernel_size
21
- self.p_dropout = p_dropout
22
- self.window_size = window_size
23
-
24
- self.drop = nn.Dropout(p_dropout)
25
- self.attn_layers = nn.ModuleList()
26
- self.norm_layers_1 = nn.ModuleList()
27
- self.ffn_layers = nn.ModuleList()
28
- self.norm_layers_2 = nn.ModuleList()
29
- for i in range(self.n_layers):
30
- self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
- self.norm_layers_1.append(LayerNorm(hidden_channels))
32
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33
- self.norm_layers_2.append(LayerNorm(hidden_channels))
34
-
35
- def forward(self, x, x_mask):
36
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37
- x = x * x_mask
38
- for i in range(self.n_layers):
39
- y = self.attn_layers[i](x, x, attn_mask)
40
- y = self.drop(y)
41
- x = self.norm_layers_1[i](x + y)
42
-
43
- y = self.ffn_layers[i](x, x_mask)
44
- y = self.drop(y)
45
- x = self.norm_layers_2[i](x + y)
46
- x = x * x_mask
47
- return x
48
-
49
-
50
- class Decoder(nn.Module):
51
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52
- super().__init__()
53
- self.hidden_channels = hidden_channels
54
- self.filter_channels = filter_channels
55
- self.n_heads = n_heads
56
- self.n_layers = n_layers
57
- self.kernel_size = kernel_size
58
- self.p_dropout = p_dropout
59
- self.proximal_bias = proximal_bias
60
- self.proximal_init = proximal_init
61
-
62
- self.drop = nn.Dropout(p_dropout)
63
- self.self_attn_layers = nn.ModuleList()
64
- self.norm_layers_0 = nn.ModuleList()
65
- self.encdec_attn_layers = nn.ModuleList()
66
- self.norm_layers_1 = nn.ModuleList()
67
- self.ffn_layers = nn.ModuleList()
68
- self.norm_layers_2 = nn.ModuleList()
69
- for i in range(self.n_layers):
70
- self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71
- self.norm_layers_0.append(LayerNorm(hidden_channels))
72
- self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73
- self.norm_layers_1.append(LayerNorm(hidden_channels))
74
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75
- self.norm_layers_2.append(LayerNorm(hidden_channels))
76
-
77
- def forward(self, x, x_mask, h, h_mask):
78
- """
79
- x: decoder input
80
- h: encoder output
81
- """
82
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84
- x = x * x_mask
85
- for i in range(self.n_layers):
86
- y = self.self_attn_layers[i](x, x, self_attn_mask)
87
- y = self.drop(y)
88
- x = self.norm_layers_0[i](x + y)
89
-
90
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91
- y = self.drop(y)
92
- x = self.norm_layers_1[i](x + y)
93
-
94
- y = self.ffn_layers[i](x, x_mask)
95
- y = self.drop(y)
96
- x = self.norm_layers_2[i](x + y)
97
- x = x * x_mask
98
- return x
99
-
100
-
101
- class MultiHeadAttention(nn.Module):
102
- def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103
- super().__init__()
104
- assert channels % n_heads == 0
105
-
106
- self.channels = channels
107
- self.out_channels = out_channels
108
- self.n_heads = n_heads
109
- self.p_dropout = p_dropout
110
- self.window_size = window_size
111
- self.heads_share = heads_share
112
- self.block_length = block_length
113
- self.proximal_bias = proximal_bias
114
- self.proximal_init = proximal_init
115
- self.attn = None
116
-
117
- self.k_channels = channels // n_heads
118
- self.conv_q = nn.Conv1d(channels, channels, 1)
119
- self.conv_k = nn.Conv1d(channels, channels, 1)
120
- self.conv_v = nn.Conv1d(channels, channels, 1)
121
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
122
- self.drop = nn.Dropout(p_dropout)
123
-
124
- if window_size is not None:
125
- n_heads_rel = 1 if heads_share else n_heads
126
- rel_stddev = self.k_channels**-0.5
127
- self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128
- self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129
-
130
- nn.init.xavier_uniform_(self.conv_q.weight)
131
- nn.init.xavier_uniform_(self.conv_k.weight)
132
- nn.init.xavier_uniform_(self.conv_v.weight)
133
- if proximal_init:
134
- with torch.no_grad():
135
- self.conv_k.weight.copy_(self.conv_q.weight)
136
- self.conv_k.bias.copy_(self.conv_q.bias)
137
-
138
- def forward(self, x, c, attn_mask=None):
139
- q = self.conv_q(x)
140
- k = self.conv_k(c)
141
- v = self.conv_v(c)
142
-
143
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
144
-
145
- x = self.conv_o(x)
146
- return x
147
-
148
- def attention(self, query, key, value, mask=None):
149
- # reshape [b, d, t] -> [b, n_h, t, d_k]
150
- b, d, t_s, t_t = (*key.size(), query.size(2))
151
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154
-
155
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156
- if self.window_size is not None:
157
- assert t_s == t_t, "Relative attention is only available for self-attention."
158
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159
- rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
160
- scores_local = self._relative_position_to_absolute_position(rel_logits)
161
- scores = scores + scores_local
162
- if self.proximal_bias:
163
- assert t_s == t_t, "Proximal bias is only available for self-attention."
164
- scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165
- if mask is not None:
166
- scores = scores.masked_fill(mask == 0, -1e4)
167
- if self.block_length is not None:
168
- assert t_s == t_t, "Local attention is only available for self-attention."
169
- block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170
- scores = scores.masked_fill(block_mask == 0, -1e4)
171
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172
- p_attn = self.drop(p_attn)
173
- output = torch.matmul(p_attn, value)
174
- if self.window_size is not None:
175
- relative_weights = self._absolute_position_to_relative_position(p_attn)
176
- value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177
- output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178
- output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179
- return output, p_attn
180
-
181
- def _matmul_with_relative_values(self, x, y):
182
- """
183
- x: [b, h, l, m]
184
- y: [h or 1, m, d]
185
- ret: [b, h, l, d]
186
- """
187
- ret = torch.matmul(x, y.unsqueeze(0))
188
- return ret
189
-
190
- def _matmul_with_relative_keys(self, x, y):
191
- """
192
- x: [b, h, l, d]
193
- y: [h or 1, m, d]
194
- ret: [b, h, l, m]
195
- """
196
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197
- return ret
198
-
199
- def _get_relative_embeddings(self, relative_embeddings, length):
200
- max_relative_position = 2 * self.window_size + 1
201
- # Pad first before slice to avoid using cond ops.
202
- pad_length = max(length - (self.window_size + 1), 0)
203
- slice_start_position = max((self.window_size + 1) - length, 0)
204
- slice_end_position = slice_start_position + 2 * length - 1
205
- if pad_length > 0:
206
- padded_relative_embeddings = F.pad(
207
- relative_embeddings,
208
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209
- else:
210
- padded_relative_embeddings = relative_embeddings
211
- used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212
- return used_relative_embeddings
213
-
214
- def _relative_position_to_absolute_position(self, x):
215
- """
216
- x: [b, h, l, 2*l-1]
217
- ret: [b, h, l, l]
218
- """
219
- batch, heads, length, _ = x.size()
220
- # Concat columns of pad to shift from relative to absolute indexing.
221
- x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222
-
223
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
224
- x_flat = x.view([batch, heads, length * 2 * length])
225
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226
-
227
- # Reshape and slice out the padded elements.
228
- x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229
- return x_final
230
-
231
- def _absolute_position_to_relative_position(self, x):
232
- """
233
- x: [b, h, l, l]
234
- ret: [b, h, l, 2*l-1]
235
- """
236
- batch, heads, length, _ = x.size()
237
- # padd along column
238
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239
- x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240
- # add 0's in the beginning that will skew the elements after reshape
241
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242
- x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243
- return x_final
244
-
245
- def _attention_bias_proximal(self, length):
246
- """Bias for self-attention to encourage attention to close positions.
247
- Args:
248
- length: an integer scalar.
249
- Returns:
250
- a Tensor with shape [1, 1, length, length]
251
- """
252
- r = torch.arange(length, dtype=torch.float32)
253
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255
-
256
-
257
- class FFN(nn.Module):
258
- def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259
- super().__init__()
260
- self.in_channels = in_channels
261
- self.out_channels = out_channels
262
- self.filter_channels = filter_channels
263
- self.kernel_size = kernel_size
264
- self.p_dropout = p_dropout
265
- self.activation = activation
266
- self.causal = causal
267
-
268
- if causal:
269
- self.padding = self._causal_padding
270
- else:
271
- self.padding = self._same_padding
272
-
273
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275
- self.drop = nn.Dropout(p_dropout)
276
-
277
- def forward(self, x, x_mask):
278
- x = self.conv_1(self.padding(x * x_mask))
279
- if self.activation == "gelu":
280
- x = x * torch.sigmoid(1.702 * x)
281
- else:
282
- x = torch.relu(x)
283
- x = self.drop(x)
284
- x = self.conv_2(self.padding(x * x_mask))
285
- return x * x_mask
286
-
287
- def _causal_padding(self, x):
288
- if self.kernel_size == 1:
289
- return x
290
- pad_l = self.kernel_size - 1
291
- pad_r = 0
292
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293
- x = F.pad(x, commons.convert_pad_shape(padding))
294
- return x
295
-
296
- def _same_padding(self, x):
297
- if self.kernel_size == 1:
298
- return x
299
- pad_l = (self.kernel_size - 1) // 2
300
- pad_r = self.kernel_size // 2
301
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302
- x = F.pad(x, commons.convert_pad_shape(padding))
303
- return x
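As a quick sanity check of the `Encoder` defined above, the following snippet pushes a dummy batch through it. It assumes the repo's `commons.py` and `modules.py` are importable next to `attentions.py`; the hyperparameter values are representative choices for illustration, not taken from any config in this commit.
```
import torch
from attentions import Encoder  # requires the repo's commons.py and modules.py

# Representative (assumed) hyperparameters, just for a shape check.
enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1)
enc.eval()

x = torch.randn(1, 192, 50)    # [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 50)  # all frames valid

with torch.no_grad():
    y = enc(x, x_mask)
print(y.shape)  # torch.Size([1, 192, 50]) -- the [b, d, t] layout is preserved
```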
VITS-fast-fine-tuning/cmd_inference.py DELETED
@@ -1,106 +0,0 @@
1
- """该模块用于生成VITS文件
2
- 使用方法
3
-
4
- python cmd_inference.py -m 模型路径 -c 配置文件路径 -o 输出文件路径 -l 输入的语言 -t 输入文本 -s 合成目标说话人名称
5
-
6
- 可选参数
7
- -ns 感情变化程度
8
- -nsw 音素发音长度
9
- -ls 整体语速
10
- -on 输出文件的名称
11
-
12
- """
13
-
14
- from pathlib import Path
15
- import utils
16
- from models import SynthesizerTrn
17
- import torch
18
- from torch import no_grad, LongTensor
19
- import librosa
20
- from text import text_to_sequence, _clean_text
21
- import commons
22
- import scipy.io.wavfile as wavf
23
- import os
24
-
25
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
26
-
27
- language_marks = {
28
- "Japanese": "",
29
- "日本語": "[JA]",
30
- "简体中文": "[ZH]",
31
- "English": "[EN]",
32
- "Mix": "",
33
- }
34
-
35
-
36
- def get_text(text, hps, is_symbol):
37
- text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
38
- if hps.data.add_blank:
39
- text_norm = commons.intersperse(text_norm, 0)
40
- text_norm = LongTensor(text_norm)
41
- return text_norm
42
-
43
-
44
-
45
- if __name__ == "__main__":
46
- import argparse
47
-
48
- parser = argparse.ArgumentParser(description='vits inference')
49
- #必须参数
50
- parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径')
51
- parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径')
52
- parser.add_argument('-o', '--output_path', type=str, default="output/vits", help='输出文件路径')
53
- parser.add_argument('-l', '--language', type=str, default="日本語", help='输入的语言')
54
- parser.add_argument('-t', '--text', type=str, help='输入文本')
55
- parser.add_argument('-s', '--spk', type=str, help='合成目标说话人名称')
56
- #可选参数
57
- parser.add_argument('-on', '--output_name', type=str, default="output", help='输出文件的名称')
58
- parser.add_argument('-ns', '--noise_scale', type=float,default= .667,help='感情变化程度')
59
- parser.add_argument('-nsw', '--noise_scale_w', type=float,default=0.6, help='音素发音长度')
60
- parser.add_argument('-ls', '--length_scale', type=float,default=1, help='整体语速')
61
-
62
- args = parser.parse_args()
63
-
64
- model_path = args.model_path
65
- config_path = args.config_path
66
- output_dir = Path(args.output_path)
67
- output_dir.mkdir(parents=True, exist_ok=True)
68
-
69
- language = args.language
70
- text = args.text
71
- spk = args.spk
72
- noise_scale = args.noise_scale
73
- noise_scale_w = args.noise_scale_w
74
- length = args.length_scale
75
- output_name = args.output_name
76
-
77
- hps = utils.get_hparams_from_file(config_path)
78
- net_g = SynthesizerTrn(
79
- len(hps.symbols),
80
- hps.data.filter_length // 2 + 1,
81
- hps.train.segment_size // hps.data.hop_length,
82
- n_speakers=hps.data.n_speakers,
83
- **hps.model).to(device)
84
- _ = net_g.eval()
85
- _ = utils.load_checkpoint(model_path, net_g, None)
86
-
87
- speaker_ids = hps.speakers
88
-
89
-
90
- if language is not None:
91
- text = language_marks[language] + text + language_marks[language]
92
- speaker_id = speaker_ids[spk]
93
- stn_tst = get_text(text, hps, False)
94
- with no_grad():
95
- x_tst = stn_tst.unsqueeze(0).to(device)
96
- x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
97
- sid = LongTensor([speaker_id]).to(device)
98
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
99
- length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
100
- del stn_tst, x_tst, x_tst_lengths, sid
101
-
102
- wavf.write(str(output_dir)+"/"+output_name+".wav",hps.data.sampling_rate,audio)
103
-
104
-
105
-
106
-
VITS-fast-fine-tuning/commons.py DELETED
@@ -1,164 +0,0 @@
1
- import math
2
- import numpy as np
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
-
8
- def init_weights(m, mean=0.0, std=0.01):
9
- classname = m.__class__.__name__
10
- if classname.find("Conv") != -1:
11
- m.weight.data.normal_(mean, std)
12
-
13
-
14
- def get_padding(kernel_size, dilation=1):
15
- return int((kernel_size*dilation - dilation)/2)
16
-
17
-
18
- def convert_pad_shape(pad_shape):
19
- l = pad_shape[::-1]
20
- pad_shape = [item for sublist in l for item in sublist]
21
- return pad_shape
22
-
23
-
24
- def intersperse(lst, item):
25
- result = [item] * (len(lst) * 2 + 1)
26
- result[1::2] = lst
27
- return result
28
-
29
-
30
- def kl_divergence(m_p, logs_p, m_q, logs_q):
31
- """KL(P||Q)"""
32
- kl = (logs_q - logs_p) - 0.5
33
- kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
- return kl
35
-
36
-
37
- def rand_gumbel(shape):
38
- """Sample from the Gumbel distribution, protect from overflows."""
39
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
- return -torch.log(-torch.log(uniform_samples))
41
-
42
-
43
- def rand_gumbel_like(x):
44
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
- return g
46
-
47
-
48
- def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- try:
54
- ret[i] = x[i, :, idx_str:idx_end]
55
- except RuntimeError:
56
- print("?")
57
- return ret
58
-
59
-
60
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
61
- b, d, t = x.size()
62
- if x_lengths is None:
63
- x_lengths = t
64
- ids_str_max = x_lengths - segment_size + 1
65
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
66
- ret = slice_segments(x, ids_str, segment_size)
67
- return ret, ids_str
68
-
69
-
70
- def get_timing_signal_1d(
71
- length, channels, min_timescale=1.0, max_timescale=1.0e4):
72
- position = torch.arange(length, dtype=torch.float)
73
- num_timescales = channels // 2
74
- log_timescale_increment = (
75
- math.log(float(max_timescale) / float(min_timescale)) /
76
- (num_timescales - 1))
77
- inv_timescales = min_timescale * torch.exp(
78
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
79
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
80
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
81
- signal = F.pad(signal, [0, 0, 0, channels % 2])
82
- signal = signal.view(1, channels, length)
83
- return signal
84
-
85
-
86
- def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
87
- b, channels, length = x.size()
88
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
89
- return x + signal.to(dtype=x.dtype, device=x.device)
90
-
91
-
92
- def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
93
- b, channels, length = x.size()
94
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
95
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
96
-
97
-
98
- def subsequent_mask(length):
99
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
100
- return mask
101
-
102
-
103
- @torch.jit.script
104
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
105
- n_channels_int = n_channels[0]
106
- in_act = input_a + input_b
107
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
108
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
109
- acts = t_act * s_act
110
- return acts
111
-
112
-
113
- def convert_pad_shape(pad_shape):
114
- l = pad_shape[::-1]
115
- pad_shape = [item for sublist in l for item in sublist]
116
- return pad_shape
117
-
118
-
119
- def shift_1d(x):
120
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
121
- return x
122
-
123
-
124
- def sequence_mask(length, max_length=None):
125
- if max_length is None:
126
- max_length = length.max()
127
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
128
- return x.unsqueeze(0) < length.unsqueeze(1)
129
-
130
-
131
- def generate_path(duration, mask):
132
- """
133
- duration: [b, 1, t_x]
134
- mask: [b, 1, t_y, t_x]
135
- """
136
- device = duration.device
137
-
138
- b, _, t_y, t_x = mask.shape
139
- cum_duration = torch.cumsum(duration, -1)
140
-
141
- cum_duration_flat = cum_duration.view(b * t_x)
142
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
143
- path = path.view(b, t_x, t_y)
144
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
145
- path = path.unsqueeze(1).transpose(2,3) * mask
146
- return path
147
-
148
-
149
- def clip_grad_value_(parameters, clip_value, norm_type=2):
150
- if isinstance(parameters, torch.Tensor):
151
- parameters = [parameters]
152
- parameters = list(filter(lambda p: p.grad is not None, parameters))
153
- norm_type = float(norm_type)
154
- if clip_value is not None:
155
- clip_value = float(clip_value)
156
-
157
- total_norm = 0
158
- for p in parameters:
159
- param_norm = p.grad.data.norm(norm_type)
160
- total_norm += param_norm.item() ** norm_type
161
- if clip_value is not None:
162
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
163
- total_norm = total_norm ** (1. / norm_type)
164
- return total_norm
VITS-fast-fine-tuning/configs/modified_finetune_speaker.json DELETED
@@ -1,172 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 10,
4
- "eval_interval": 100,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0002,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 16,
14
- "fp16_run": true,
15
- "lr_decay": 0.999875,
16
- "segment_size": 8192,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0
21
- },
22
- "data": {
23
- "training_files": "final_annotation_train.txt",
24
- "validation_files": "final_annotation_val.txt",
25
- "text_cleaners": [
26
- "chinese_cleaners"
27
- ],
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 22050,
30
- "filter_length": 1024,
31
- "hop_length": 256,
32
- "win_length": 1024,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": null,
36
- "add_blank": true,
37
- "n_speakers": 2,
38
- "cleaned_text": true
39
- },
40
- "model": {
41
- "inter_channels": 192,
42
- "hidden_channels": 192,
43
- "filter_channels": 768,
44
- "n_heads": 2,
45
- "n_layers": 6,
46
- "kernel_size": 3,
47
- "p_dropout": 0.1,
48
- "resblock": "1",
49
- "resblock_kernel_sizes": [
50
- 3,
51
- 7,
52
- 11
53
- ],
54
- "resblock_dilation_sizes": [
55
- [
56
- 1,
57
- 3,
58
- 5
59
- ],
60
- [
61
- 1,
62
- 3,
63
- 5
64
- ],
65
- [
66
- 1,
67
- 3,
68
- 5
69
- ]
70
- ],
71
- "upsample_rates": [
72
- 8,
73
- 8,
74
- 2,
75
- 2
76
- ],
77
- "upsample_initial_channel": 512,
78
- "upsample_kernel_sizes": [
79
- 16,
80
- 16,
81
- 4,
82
- 4
83
- ],
84
- "n_layers_q": 3,
85
- "use_spectral_norm": false,
86
- "gin_channels": 256
87
- },
88
- "symbols": [
89
- "_",
90
- "\uff1b",
91
- "\uff1a",
92
- "\uff0c",
93
- "\u3002",
94
- "\uff01",
95
- "\uff1f",
96
- "-",
97
- "\u201c",
98
- "\u201d",
99
- "\u300a",
100
- "\u300b",
101
- "\u3001",
102
- "\uff08",
103
- "\uff09",
104
- "\u2026",
105
- "\u2014",
106
- " ",
107
- "A",
108
- "B",
109
- "C",
110
- "D",
111
- "E",
112
- "F",
113
- "G",
114
- "H",
115
- "I",
116
- "J",
117
- "K",
118
- "L",
119
- "M",
120
- "N",
121
- "O",
122
- "P",
123
- "Q",
124
- "R",
125
- "S",
126
- "T",
127
- "U",
128
- "V",
129
- "W",
130
- "X",
131
- "Y",
132
- "Z",
133
- "a",
134
- "b",
135
- "c",
136
- "d",
137
- "e",
138
- "f",
139
- "g",
140
- "h",
141
- "i",
142
- "j",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "o",
148
- "p",
149
- "q",
150
- "r",
151
- "s",
152
- "t",
153
- "u",
154
- "v",
155
- "w",
156
- "x",
157
- "y",
158
- "z",
159
- "1",
160
- "2",
161
- "3",
162
- "4",
163
- "5",
164
- "0",
165
- "\uff22",
166
- "\uff30"
167
- ],
168
- "speakers": {
169
- "dingzhen": 0,
170
- "taffy": 1
171
- }
172
- }
VITS-fast-fine-tuning/configs/uma_trilingual.json DELETED
@@ -1,54 +0,0 @@
- {
-   "train": {
-     "log_interval": 200,
-     "eval_interval": 1000,
-     "seed": 1234,
-     "epochs": 10000,
-     "learning_rate": 2e-4,
-     "betas": [0.8, 0.99],
-     "eps": 1e-9,
-     "batch_size": 16,
-     "fp16_run": true,
-     "lr_decay": 0.999875,
-     "segment_size": 8192,
-     "init_lr_ratio": 1,
-     "warmup_epochs": 0,
-     "c_mel": 45,
-     "c_kl": 1.0
-   },
-   "data": {
-     "training_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.train.txt.cleaned",
-     "validation_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.val.txt.cleaned",
-     "text_cleaners":["cjke_cleaners2"],
-     "max_wav_value": 32768.0,
-     "sampling_rate": 22050,
-     "filter_length": 1024,
-     "hop_length": 256,
-     "win_length": 1024,
-     "n_mel_channels": 80,
-     "mel_fmin": 0.0,
-     "mel_fmax": null,
-     "add_blank": true,
-     "n_speakers": 999,
-     "cleaned_text": true
-   },
-   "model": {
-     "inter_channels": 192,
-     "hidden_channels": 192,
-     "filter_channels": 768,
-     "n_heads": 2,
-     "n_layers": 6,
-     "kernel_size": 3,
-     "p_dropout": 0.1,
-     "resblock": "1",
-     "resblock_kernel_sizes": [3,7,11],
-     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-     "upsample_rates": [8,8,2,2],
-     "upsample_initial_channel": 512,
-     "upsample_kernel_sizes": [16,16,4,4],
-     "n_layers_q": 3,
-     "use_spectral_norm": false,
-     "gin_channels": 256
-   },
-   "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
- }
VITS-fast-fine-tuning/data_utils.py DELETED
@@ -1,267 +0,0 @@
1
- import time
2
- import os
3
- import random
4
- import numpy as np
5
- import torch
6
- import torch.utils.data
7
- import torchaudio
8
-
9
- import commons
10
- from mel_processing import spectrogram_torch
11
- from utils import load_wav_to_torch, load_filepaths_and_text
12
- from text import text_to_sequence, cleaned_text_to_sequence
13
- """Multi speaker version"""
14
-
15
-
16
- class TextAudioSpeakerLoader(torch.utils.data.Dataset):
17
- """
18
- 1) loads audio, speaker_id, text pairs
19
- 2) normalizes text and converts them to sequences of integers
20
- 3) computes spectrograms from audio files.
21
- """
22
-
23
- def __init__(self, audiopaths_sid_text, hparams, symbols):
24
- self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
25
- self.text_cleaners = hparams.text_cleaners
26
- self.max_wav_value = hparams.max_wav_value
27
- self.sampling_rate = hparams.sampling_rate
28
- self.filter_length = hparams.filter_length
29
- self.hop_length = hparams.hop_length
30
- self.win_length = hparams.win_length
31
- self.sampling_rate = hparams.sampling_rate
32
-
33
- self.cleaned_text = getattr(hparams, "cleaned_text", False)
34
-
35
- self.add_blank = hparams.add_blank
36
- self.min_text_len = getattr(hparams, "min_text_len", 1)
37
- self.max_text_len = getattr(hparams, "max_text_len", 190)
38
- self.symbols = symbols
39
-
40
- random.seed(1234)
41
- random.shuffle(self.audiopaths_sid_text)
42
- self._filter()
43
-
44
- def _filter(self):
45
- """
46
- Filter text & store spec lengths
47
- """
48
- # Store spectrogram lengths for Bucketing
49
- # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
50
- # spec_length = wav_length // hop_length
51
-
52
- audiopaths_sid_text_new = []
53
- lengths = []
54
- for audiopath, sid, text in self.audiopaths_sid_text:
55
- # audiopath = "./user_voice/" + audiopath
56
-
57
- if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
58
- audiopaths_sid_text_new.append([audiopath, sid, text])
59
- lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
60
- self.audiopaths_sid_text = audiopaths_sid_text_new
61
- self.lengths = lengths
62
-
63
- def get_audio_text_speaker_pair(self, audiopath_sid_text):
64
- # separate filename, speaker_id and text
65
- audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
66
- text = self.get_text(text)
67
- spec, wav = self.get_audio(audiopath)
68
- sid = self.get_sid(sid)
69
- return (text, spec, wav, sid)
70
-
71
- def get_audio(self, filename):
72
- # audio, sampling_rate = load_wav_to_torch(filename)
73
- # if sampling_rate != self.sampling_rate:
74
- # raise ValueError("{} {} SR doesn't match target {} SR".format(
75
- # sampling_rate, self.sampling_rate))
76
- # audio_norm = audio / self.max_wav_value if audio.max() > 10 else audio
77
- # audio_norm = audio_norm.unsqueeze(0)
78
- audio_norm, sampling_rate = torchaudio.load(filename, frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
79
- # spec_filename = filename.replace(".wav", ".spec.pt")
80
- # if os.path.exists(spec_filename):
81
- # spec = torch.load(spec_filename)
82
- # else:
83
- # try:
84
- spec = spectrogram_torch(audio_norm, self.filter_length,
85
- self.sampling_rate, self.hop_length, self.win_length,
86
- center=False)
87
- spec = spec.squeeze(0)
88
- # except NotImplementedError:
89
- # print("?")
90
- # spec = torch.squeeze(spec, 0)
91
- # torch.save(spec, spec_filename)
92
- return spec, audio_norm
93
-
94
- def get_text(self, text):
95
- if self.cleaned_text:
96
- text_norm = cleaned_text_to_sequence(text, self.symbols)
97
- else:
98
- text_norm = text_to_sequence(text, self.text_cleaners)
99
- if self.add_blank:
100
- text_norm = commons.intersperse(text_norm, 0)
101
- text_norm = torch.LongTensor(text_norm)
102
- return text_norm
103
-
104
- def get_sid(self, sid):
105
- sid = torch.LongTensor([int(sid)])
106
- return sid
107
-
108
- def __getitem__(self, index):
109
- return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
110
-
111
- def __len__(self):
112
- return len(self.audiopaths_sid_text)
113
-
114
-
115
- class TextAudioSpeakerCollate():
116
- """ Zero-pads model inputs and targets
117
- """
118
-
119
- def __init__(self, return_ids=False):
120
- self.return_ids = return_ids
121
-
122
- def __call__(self, batch):
123
- """Collate's training batch from normalized text, audio and speaker identities
124
- PARAMS
125
- ------
126
- batch: [text_normalized, spec_normalized, wav_normalized, sid]
127
- """
128
- # Right zero-pad all one-hot text sequences to max input length
129
- _, ids_sorted_decreasing = torch.sort(
130
- torch.LongTensor([x[1].size(1) for x in batch]),
131
- dim=0, descending=True)
132
-
133
- max_text_len = max([len(x[0]) for x in batch])
134
- max_spec_len = max([x[1].size(1) for x in batch])
135
- max_wav_len = max([x[2].size(1) for x in batch])
136
-
137
- text_lengths = torch.LongTensor(len(batch))
138
- spec_lengths = torch.LongTensor(len(batch))
139
- wav_lengths = torch.LongTensor(len(batch))
140
- sid = torch.LongTensor(len(batch))
141
-
142
- text_padded = torch.LongTensor(len(batch), max_text_len)
143
- spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
144
- wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
145
- text_padded.zero_()
146
- spec_padded.zero_()
147
- wav_padded.zero_()
148
- for i in range(len(ids_sorted_decreasing)):
149
- row = batch[ids_sorted_decreasing[i]]
150
-
151
- text = row[0]
152
- text_padded[i, :text.size(0)] = text
153
- text_lengths[i] = text.size(0)
154
-
155
- spec = row[1]
156
- spec_padded[i, :, :spec.size(1)] = spec
157
- spec_lengths[i] = spec.size(1)
158
-
159
- wav = row[2]
160
- wav_padded[i, :, :wav.size(1)] = wav
161
- wav_lengths[i] = wav.size(1)
162
-
163
- sid[i] = row[3]
164
-
165
- if self.return_ids:
166
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
167
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
168
-
169
-
170
- class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
171
- """
172
- Maintain similar input lengths in a batch.
173
- Length groups are specified by boundaries.
174
- Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
175
-
176
- It removes samples which are not included in the boundaries.
177
- Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
178
- """
179
-
180
- def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
181
- super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
182
- self.lengths = dataset.lengths
183
- self.batch_size = batch_size
184
- self.boundaries = boundaries
185
-
186
- self.buckets, self.num_samples_per_bucket = self._create_buckets()
187
- self.total_size = sum(self.num_samples_per_bucket)
188
- self.num_samples = self.total_size // self.num_replicas
189
-
190
- def _create_buckets(self):
191
- buckets = [[] for _ in range(len(self.boundaries) - 1)]
192
- for i in range(len(self.lengths)):
193
- length = self.lengths[i]
194
- idx_bucket = self._bisect(length)
195
- if idx_bucket != -1:
196
- buckets[idx_bucket].append(i)
197
-
198
- for i in range(len(buckets) - 1, 0, -1):
199
- if len(buckets[i]) == 0:
200
- buckets.pop(i)
201
- self.boundaries.pop(i + 1)
202
-
203
- num_samples_per_bucket = []
204
- for i in range(len(buckets)):
205
- len_bucket = len(buckets[i])
206
- total_batch_size = self.num_replicas * self.batch_size
207
- rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
208
- num_samples_per_bucket.append(len_bucket + rem)
209
- return buckets, num_samples_per_bucket
210
-
211
- def __iter__(self):
212
- # deterministically shuffle based on epoch
213
- g = torch.Generator()
214
- g.manual_seed(self.epoch)
215
-
216
- indices = []
217
- if self.shuffle:
218
- for bucket in self.buckets:
219
- indices.append(torch.randperm(len(bucket), generator=g).tolist())
220
- else:
221
- for bucket in self.buckets:
222
- indices.append(list(range(len(bucket))))
223
-
224
- batches = []
225
- for i in range(len(self.buckets)):
226
- bucket = self.buckets[i]
227
- len_bucket = len(bucket)
228
- ids_bucket = indices[i]
229
- num_samples_bucket = self.num_samples_per_bucket[i]
230
-
231
- # add extra samples to make it evenly divisible
232
- rem = num_samples_bucket - len_bucket
233
- ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
234
-
235
- # subsample
236
- ids_bucket = ids_bucket[self.rank::self.num_replicas]
237
-
238
- # batching
239
- for j in range(len(ids_bucket) // self.batch_size):
240
- batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
241
- batches.append(batch)
242
-
243
- if self.shuffle:
244
- batch_ids = torch.randperm(len(batches), generator=g).tolist()
245
- batches = [batches[i] for i in batch_ids]
246
- self.batches = batches
247
-
248
- assert len(self.batches) * self.batch_size == self.num_samples
249
- return iter(self.batches)
250
-
251
- def _bisect(self, x, lo=0, hi=None):
252
- if hi is None:
253
- hi = len(self.boundaries) - 1
254
-
255
- if hi > lo:
256
- mid = (hi + lo) // 2
257
- if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
258
- return mid
259
- elif x <= self.boundaries[mid]:
260
- return self._bisect(x, lo, mid)
261
- else:
262
- return self._bisect(x, mid + 1, hi)
263
- else:
264
- return -1
265
-
266
- def __len__(self):
267
- return self.num_samples // self.batch_size
VITS-fast-fine-tuning/denoise_audio.py DELETED
@@ -1,18 +0,0 @@
- import os
- import torchaudio
- raw_audio_dir = "./raw_audio/"
- denoise_audio_dir = "./denoised_audio/"
- filelist = list(os.walk(raw_audio_dir))[0][2]
-
- for file in filelist:
-     if file.endswith(".wav"):
-         os.system(f"demucs --two-stems=vocals {raw_audio_dir}{file}")
- for file in filelist:
-     file = file.replace(".wav", "")
-     wav, sr = torchaudio.load(f"./separated/htdemucs/{file}/vocals.wav", frame_offset=0, num_frames=-1, normalize=True,
-                               channels_first=True)
-     # merge two channels into one
-     wav = wav.mean(dim=0).unsqueeze(0)
-     if sr != 22050:
-         wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
-     torchaudio.save(denoise_audio_dir + file + ".wav", wav, 22050, channels_first=True)
VITS-fast-fine-tuning/download_model.py DELETED
@@ -1,4 +0,0 @@
- from google.colab import files
- files.download("./G_latest.pth")
- files.download("./finetune_speaker.json")
- files.download("./moegoe_config.json")
VITS-fast-fine-tuning/download_video.py DELETED
@@ -1,37 +0,0 @@
- import os
- import random
- import shutil
- from concurrent.futures import ThreadPoolExecutor
- from google.colab import files
-
- basepath = os.getcwd()
- uploaded = files.upload()  # upload files
- for filename in uploaded.keys():
-     assert (filename.endswith(".txt")), "speaker-videolink info could only be .txt file!"
-     shutil.move(os.path.join(basepath, filename), os.path.join("./speaker_links.txt"))
-
-
- def generate_infos():
-     infos = []
-     with open("./speaker_links.txt", 'r', encoding='utf-8') as f:
-         lines = f.readlines()
-     for line in lines:
-         line = line.replace("\n", "").replace(" ", "")
-         if line == "":
-             continue
-         speaker, link = line.split("|")
-         filename = speaker + "_" + str(random.randint(0, 1000000))
-         infos.append({"link": link, "filename": filename})
-     return infos
-
-
- def download_video(info):
-     link = info["link"]
-     filename = info["filename"]
-     os.system(f"youtube-dl -f 0 {link} -o ./video_data/{filename}.mp4")
-
-
- if __name__ == "__main__":
-     infos = generate_infos()
-     with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
-         executor.map(download_video, infos)
VITS-fast-fine-tuning/finetune_speaker_v2.py DELETED
@@ -1,321 +0,0 @@
1
- import os
2
- import json
3
- import argparse
4
- import itertools
5
- import math
6
- import torch
7
- from torch import nn, optim
8
- from torch.nn import functional as F
9
- from torch.utils.data import DataLoader
10
- from torch.utils.tensorboard import SummaryWriter
11
- import torch.multiprocessing as mp
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- from torch.cuda.amp import autocast, GradScaler
15
- from tqdm import tqdm
16
-
17
- import librosa
18
- import logging
19
-
20
- logging.getLogger('numba').setLevel(logging.WARNING)
21
-
22
- import commons
23
- import utils
24
- from data_utils import (
25
- TextAudioSpeakerLoader,
26
- TextAudioSpeakerCollate,
27
- DistributedBucketSampler
28
- )
29
- from models import (
30
- SynthesizerTrn,
31
- MultiPeriodDiscriminator,
32
- )
33
- from losses import (
34
- generator_loss,
35
- discriminator_loss,
36
- feature_loss,
37
- kl_loss
38
- )
39
- from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
40
-
41
-
42
- torch.backends.cudnn.benchmark = True
43
- global_step = 0
44
-
45
-
46
- def main():
47
- """Assume Single Node Multi GPUs Training Only"""
48
- assert torch.cuda.is_available(), "CPU training is not allowed."
49
-
50
- n_gpus = torch.cuda.device_count()
51
- os.environ['MASTER_ADDR'] = 'localhost'
52
- os.environ['MASTER_PORT'] = '8000'
53
-
54
- hps = utils.get_hparams()
55
- mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
56
-
57
-
58
- def run(rank, n_gpus, hps):
59
- global global_step
60
- symbols = hps['symbols']
61
- if rank == 0:
62
- logger = utils.get_logger(hps.model_dir)
63
- logger.info(hps)
64
- utils.check_git_hash(hps.model_dir)
65
- writer = SummaryWriter(log_dir=hps.model_dir)
66
- writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
67
-
68
- # Use gloo backend on Windows for Pytorch
69
- dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
70
- torch.manual_seed(hps.train.seed)
71
- torch.cuda.set_device(rank)
72
-
73
- train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data, symbols)
74
- train_sampler = DistributedBucketSampler(
75
- train_dataset,
76
- hps.train.batch_size,
77
- [32,300,400,500,600,700,800,900,1000],
78
- num_replicas=n_gpus,
79
- rank=rank,
80
- shuffle=True)
81
- collate_fn = TextAudioSpeakerCollate()
82
- train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, pin_memory=True,
83
- collate_fn=collate_fn, batch_sampler=train_sampler)
84
- # train_loader = DataLoader(train_dataset, batch_size=hps.train.batch_size, num_workers=2, shuffle=False, pin_memory=True,
85
- # collate_fn=collate_fn)
86
- if rank == 0:
87
- eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, symbols)
88
- eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
89
- batch_size=hps.train.batch_size, pin_memory=True,
90
- drop_last=False, collate_fn=collate_fn)
91
-
92
- net_g = SynthesizerTrn(
93
- len(symbols),
94
- hps.data.filter_length // 2 + 1,
95
- hps.train.segment_size // hps.data.hop_length,
96
- n_speakers=hps.data.n_speakers,
97
- **hps.model).cuda(rank)
98
- net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
99
-
100
- # load existing model
101
- _, _, _, _ = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None, drop_speaker_emb=hps.drop_speaker_embed)
102
- _, _, _, _ = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
103
- epoch_str = 1
104
- global_step = 0
105
- # freeze all other layers except speaker embedding
106
- for p in net_g.parameters():
107
- p.requires_grad = True
108
- for p in net_d.parameters():
109
- p.requires_grad = True
110
- # for p in net_d.parameters():
111
- # p.requires_grad = False
112
- # net_g.emb_g.weight.requires_grad = True
113
- optim_g = torch.optim.AdamW(
114
- net_g.parameters(),
115
- hps.train.learning_rate,
116
- betas=hps.train.betas,
117
- eps=hps.train.eps)
118
- optim_d = torch.optim.AdamW(
119
- net_d.parameters(),
120
- hps.train.learning_rate,
121
- betas=hps.train.betas,
122
- eps=hps.train.eps)
123
- # optim_d = None
124
- net_g = DDP(net_g, device_ids=[rank])
125
- net_d = DDP(net_d, device_ids=[rank])
126
-
127
- scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay)
128
- scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay)
129
-
130
- scaler = GradScaler(enabled=hps.train.fp16_run)
131
-
132
- for epoch in range(epoch_str, hps.train.epochs + 1):
133
- if rank==0:
134
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
135
- else:
136
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
137
- scheduler_g.step()
138
- scheduler_d.step()
139
-
140
-
141
- def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
142
- net_g, net_d = nets
143
- optim_g, optim_d = optims
144
- scheduler_g, scheduler_d = schedulers
145
- train_loader, eval_loader = loaders
146
- if writers is not None:
147
- writer, writer_eval = writers
148
-
149
- # train_loader.batch_sampler.set_epoch(epoch)
150
- global global_step
151
-
152
- net_g.train()
153
- net_d.train()
154
- for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(tqdm(train_loader)):
155
- x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
156
- spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
157
- y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
158
- speakers = speakers.cuda(rank, non_blocking=True)
159
-
160
- with autocast(enabled=hps.train.fp16_run):
161
- y_hat, l_length, attn, ids_slice, x_mask, z_mask,\
162
- (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers)
163
-
164
- mel = spec_to_mel_torch(
165
- spec,
166
- hps.data.filter_length,
167
- hps.data.n_mel_channels,
168
- hps.data.sampling_rate,
169
- hps.data.mel_fmin,
170
- hps.data.mel_fmax)
171
- y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
172
- y_hat_mel = mel_spectrogram_torch(
173
- y_hat.squeeze(1),
174
- hps.data.filter_length,
175
- hps.data.n_mel_channels,
176
- hps.data.sampling_rate,
177
- hps.data.hop_length,
178
- hps.data.win_length,
179
- hps.data.mel_fmin,
180
- hps.data.mel_fmax
181
- )
182
-
183
- y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
184
-
185
- # Discriminator
186
- y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
187
- with autocast(enabled=False):
188
- loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
189
- loss_disc_all = loss_disc
190
- optim_d.zero_grad()
191
- scaler.scale(loss_disc_all).backward()
192
- scaler.unscale_(optim_d)
193
- grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
194
- scaler.step(optim_d)
195
-
196
- with autocast(enabled=hps.train.fp16_run):
197
- # Generator
198
- y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
199
- with autocast(enabled=False):
200
- loss_dur = torch.sum(l_length.float())
201
- loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
202
- loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
203
-
204
- loss_fm = feature_loss(fmap_r, fmap_g)
205
- loss_gen, losses_gen = generator_loss(y_d_hat_g)
206
- loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl
207
- optim_g.zero_grad()
208
- scaler.scale(loss_gen_all).backward()
209
- scaler.unscale_(optim_g)
210
- grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
211
- scaler.step(optim_g)
212
- scaler.update()
213
-
214
- if rank==0:
215
- if global_step % hps.train.log_interval == 0:
216
- lr = optim_g.param_groups[0]['lr']
217
- losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl]
218
- logger.info('Train Epoch: {} [{:.0f}%]'.format(
219
- epoch,
220
- 100. * batch_idx / len(train_loader)))
221
- logger.info([x.item() for x in losses] + [global_step, lr])
222
-
223
- scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_g": grad_norm_g}
224
- scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl})
225
-
226
- scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
227
- scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
228
- scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
229
- image_dict = {
230
- "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
231
- "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
232
- "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
233
- "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
234
- }
235
- utils.summarize(
236
- writer=writer,
237
- global_step=global_step,
238
- images=image_dict,
239
- scalars=scalar_dict)
240
-
241
- if global_step % hps.train.eval_interval == 0:
242
- evaluate(hps, net_g, eval_loader, writer_eval)
243
- utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
244
- utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
245
- os.path.join(hps.model_dir, "G_latest.pth".format(global_step)))
246
- # utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
247
- old_g=os.path.join(hps.model_dir, "G_{}.pth".format(global_step-4000))
248
- # old_d=os.path.join(hps.model_dir, "D_{}.pth".format(global_step-400))
249
- if os.path.exists(old_g):
250
- os.remove(old_g)
251
- # if os.path.exists(old_d):
252
- # os.remove(old_d)
253
- global_step += 1
254
- if epoch > hps.max_epochs:
255
- print("Maximum epoch reached, closing training...")
256
- exit()
257
-
258
- if rank == 0:
259
- logger.info('====> Epoch: {}'.format(epoch))
260
-
261
-
262
- def evaluate(hps, generator, eval_loader, writer_eval):
263
- generator.eval()
264
- with torch.no_grad():
265
- for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader):
266
- x, x_lengths = x.cuda(0), x_lengths.cuda(0)
267
- spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
268
- y, y_lengths = y.cuda(0), y_lengths.cuda(0)
269
- speakers = speakers.cuda(0)
270
-
271
- # remove else
272
- x = x[:1]
273
- x_lengths = x_lengths[:1]
274
- spec = spec[:1]
275
- spec_lengths = spec_lengths[:1]
276
- y = y[:1]
277
- y_lengths = y_lengths[:1]
278
- speakers = speakers[:1]
279
- break
280
- y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000)
281
- y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
282
-
283
- mel = spec_to_mel_torch(
284
- spec,
285
- hps.data.filter_length,
286
- hps.data.n_mel_channels,
287
- hps.data.sampling_rate,
288
- hps.data.mel_fmin,
289
- hps.data.mel_fmax)
290
- y_hat_mel = mel_spectrogram_torch(
291
- y_hat.squeeze(1).float(),
292
- hps.data.filter_length,
293
- hps.data.n_mel_channels,
294
- hps.data.sampling_rate,
295
- hps.data.hop_length,
296
- hps.data.win_length,
297
- hps.data.mel_fmin,
298
- hps.data.mel_fmax
299
- )
300
- image_dict = {
301
- "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
302
- }
303
- audio_dict = {
304
- "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
305
- }
306
- if global_step == 0:
307
- image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
308
- audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
309
-
310
- utils.summarize(
311
- writer=writer_eval,
312
- global_step=global_step,
313
- images=image_dict,
314
- audios=audio_dict,
315
- audio_sampling_rate=hps.data.sampling_rate
316
- )
317
- generator.train()
318
-
319
-
320
- if __name__ == "__main__":
321
- main()
VITS-fast-fine-tuning/inference/G_latest.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:44f9141fcac34c950376594d08a288d9159a32d6add851155b6fd0ecee242419
- size 158887401
VITS-fast-fine-tuning/inference/ONNXVITS_inference.py DELETED
@@ -1,36 +0,0 @@
- import logging
- logging.getLogger('numba').setLevel(logging.WARNING)
- import IPython.display as ipd
- import torch
- import commons
- import utils
- import ONNXVITS_infer
- from text import text_to_sequence
-
- def get_text(text, hps):
-     text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
-     if hps.data.add_blank:
-         text_norm = commons.intersperse(text_norm, 0)
-     text_norm = torch.LongTensor(text_norm)
-     return text_norm
-
- hps = utils.get_hparams_from_file("../vits/pretrained_models/uma87.json")
-
- net_g = ONNXVITS_infer.SynthesizerTrn(
-     len(hps.symbols),
-     hps.data.filter_length // 2 + 1,
-     hps.train.segment_size // hps.data.hop_length,
-     n_speakers=hps.data.n_speakers,
-     **hps.model)
- _ = net_g.eval()
-
- _ = utils.load_checkpoint("../vits/pretrained_models/uma_1153000.pth", net_g)
-
- text1 = get_text("おはようございます。", hps)
- stn_tst = text1
- with torch.no_grad():
-     x_tst = stn_tst.unsqueeze(0)
-     x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-     sid = torch.LongTensor([0])
-     audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
- print(audio)
VITS-fast-fine-tuning/inference/VC_inference.py DELETED
@@ -1,139 +0,0 @@
1
- import os
2
- import numpy as np
3
- import torch
4
- from torch import no_grad, LongTensor
5
- import argparse
6
- import commons
7
- from mel_processing import spectrogram_torch
8
- import utils
9
- from models import SynthesizerTrn
10
- import gradio as gr
11
- import librosa
12
- import webbrowser
13
-
14
- from text import text_to_sequence, _clean_text
15
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
16
- language_marks = {
17
- "Japanese": "",
18
- "日本語": "[JA]",
19
- "简体中文": "[ZH]",
20
- "English": "[EN]",
21
- "Mix": "",
22
- }
23
- lang = ['日本語', '简体中文', 'English', 'Mix']
24
- def get_text(text, hps, is_symbol):
25
- text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
26
- if hps.data.add_blank:
27
- text_norm = commons.intersperse(text_norm, 0)
28
- text_norm = LongTensor(text_norm)
29
- return text_norm
30
-
31
- def create_tts_fn(model, hps, speaker_ids):
32
- def tts_fn(text, speaker, language, speed):
33
- if language is not None:
34
- text = language_marks[language] + text + language_marks[language]
35
- speaker_id = speaker_ids[speaker]
36
- stn_tst = get_text(text, hps, False)
37
- with no_grad():
38
- x_tst = stn_tst.unsqueeze(0).to(device)
39
- x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
40
- sid = LongTensor([speaker_id]).to(device)
41
- audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
42
- length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
43
- del stn_tst, x_tst, x_tst_lengths, sid
44
- return "Success", (hps.data.sampling_rate, audio)
45
-
46
- return tts_fn
47
-
48
- def create_vc_fn(model, hps, speaker_ids):
49
- def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
50
- input_audio = record_audio if record_audio is not None else upload_audio
51
- if input_audio is None:
52
- return "You need to record or upload an audio", None
53
- sampling_rate, audio = input_audio
54
- original_speaker_id = speaker_ids[original_speaker]
55
- target_speaker_id = speaker_ids[target_speaker]
56
-
57
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
58
- if len(audio.shape) > 1:
59
- audio = librosa.to_mono(audio.transpose(1, 0))
60
- if sampling_rate != hps.data.sampling_rate:
61
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
62
- with no_grad():
63
- y = torch.FloatTensor(audio)
64
- y = y / max(-y.min(), y.max()) / 0.99
65
- y = y.to(device)
66
- y = y.unsqueeze(0)
67
- spec = spectrogram_torch(y, hps.data.filter_length,
68
- hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
69
- center=False).to(device)
70
- spec_lengths = LongTensor([spec.size(-1)]).to(device)
71
- sid_src = LongTensor([original_speaker_id]).to(device)
72
- sid_tgt = LongTensor([target_speaker_id]).to(device)
73
- audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
74
- 0, 0].data.cpu().float().numpy()
75
- del y, spec, spec_lengths, sid_src, sid_tgt
76
- return "Success", (hps.data.sampling_rate, audio)
77
-
78
- return vc_fn
79
- if __name__ == "__main__":
80
- parser = argparse.ArgumentParser()
81
- parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
82
- parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
83
- parser.add_argument("--share", default=False, help="make link public (used in colab)")
84
-
85
- args = parser.parse_args()
86
- hps = utils.get_hparams_from_file(args.config_dir)
87
-
88
-
89
- net_g = SynthesizerTrn(
90
- len(hps.symbols),
91
- hps.data.filter_length // 2 + 1,
92
- hps.train.segment_size // hps.data.hop_length,
93
- n_speakers=hps.data.n_speakers,
94
- **hps.model).to(device)
95
- _ = net_g.eval()
96
-
97
- _ = utils.load_checkpoint(args.model_dir, net_g, None)
98
- speaker_ids = hps.speakers
99
- speakers = list(hps.speakers.keys())
100
- tts_fn = create_tts_fn(net_g, hps, speaker_ids)
101
- vc_fn = create_vc_fn(net_g, hps, speaker_ids)
102
- app = gr.Blocks()
103
- with app:
104
- with gr.Tab("Text-to-Speech"):
105
- with gr.Row():
106
- with gr.Column():
107
- textbox = gr.TextArea(label="Text",
108
- placeholder="Type your sentence here",
109
- value="こんにちわ。", elem_id=f"tts-input")
110
- # select character
111
- char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
112
- language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
113
- duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
114
- label='速度 Speed')
115
- with gr.Column():
116
- text_output = gr.Textbox(label="Message")
117
- audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
118
- btn = gr.Button("Generate!")
119
- btn.click(tts_fn,
120
- inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
121
- outputs=[text_output, audio_output])
122
- with gr.Tab("Voice Conversion"):
123
- gr.Markdown("""
124
- 录制或上传声音,并选择要转换的音色。
125
- """)
126
- with gr.Column():
127
- record_audio = gr.Audio(label="record your voice", source="microphone")
128
- upload_audio = gr.Audio(label="or upload audio here", source="upload")
129
- source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
130
- target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
131
- with gr.Column():
132
- message_box = gr.Textbox(label="Message")
133
- converted_audio = gr.Audio(label='converted audio')
134
- btn = gr.Button("Convert!")
135
- btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
136
- outputs=[message_box, converted_audio])
137
- webbrowser.open("http://127.0.0.1:7860")
138
- app.launch(share=args.share)
139
-
VITS-fast-fine-tuning/inference/finetune_speaker.json DELETED
@@ -1,147 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 100,
4
- "eval_interval": 1000,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0002,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 16,
14
- "fp16_run": true,
15
- "lr_decay": 0.999875,
16
- "segment_size": 8192,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0
21
- },
22
- "data": {
23
- "training_files": "final_annotation_train.txt",
24
- "validation_files": "final_annotation_val.txt",
25
- "text_cleaners": [
26
- "zh_ja_mixture_cleaners"
27
- ],
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 22050,
30
- "filter_length": 1024,
31
- "hop_length": 256,
32
- "win_length": 1024,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": null,
36
- "add_blank": true,
37
- "n_speakers": 3,
38
- "cleaned_text": true
39
- },
40
- "model": {
41
- "inter_channels": 192,
42
- "hidden_channels": 192,
43
- "filter_channels": 768,
44
- "n_heads": 2,
45
- "n_layers": 6,
46
- "kernel_size": 3,
47
- "p_dropout": 0.1,
48
- "resblock": "1",
49
- "resblock_kernel_sizes": [
50
- 3,
51
- 7,
52
- 11
53
- ],
54
- "resblock_dilation_sizes": [
55
- [
56
- 1,
57
- 3,
58
- 5
59
- ],
60
- [
61
- 1,
62
- 3,
63
- 5
64
- ],
65
- [
66
- 1,
67
- 3,
68
- 5
69
- ]
70
- ],
71
- "upsample_rates": [
72
- 8,
73
- 8,
74
- 2,
75
- 2
76
- ],
77
- "upsample_initial_channel": 512,
78
- "upsample_kernel_sizes": [
79
- 16,
80
- 16,
81
- 4,
82
- 4
83
- ],
84
- "n_layers_q": 3,
85
- "use_spectral_norm": false,
86
- "gin_channels": 256
87
- },
88
- "speakers": {
89
- "Hana": 0,
90
- "specialweek": 1,
91
- "zhongli": 2
92
- },
93
- "symbols": [
94
- "_",
95
- ",",
96
- ".",
97
- "!",
98
- "?",
99
- "-",
100
- "~",
101
- "\u2026",
102
- "A",
103
- "E",
104
- "I",
105
- "N",
106
- "O",
107
- "Q",
108
- "U",
109
- "a",
110
- "b",
111
- "d",
112
- "e",
113
- "f",
114
- "g",
115
- "h",
116
- "i",
117
- "j",
118
- "k",
119
- "l",
120
- "m",
121
- "n",
122
- "o",
123
- "p",
124
- "r",
125
- "s",
126
- "t",
127
- "u",
128
- "v",
129
- "w",
130
- "y",
131
- "z",
132
- "\u0283",
133
- "\u02a7",
134
- "\u02a6",
135
- "\u026f",
136
- "\u0279",
137
- "\u0259",
138
- "\u0265",
139
- "\u207c",
140
- "\u02b0",
141
- "`",
142
- "\u2192",
143
- "\u2193",
144
- "\u2191",
145
- " "
146
- ]
147
- }
VITS-fast-fine-tuning/long_audio_transcribe.py DELETED
@@ -1,71 +0,0 @@
- from moviepy.editor import AudioFileClip
- import whisper
- import os
- import torchaudio
- import librosa
- import torch
- import argparse
- parent_dir = "./denoised_audio/"
- filelist = list(os.walk(parent_dir))[0][2]
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--languages", default="CJE")
-     parser.add_argument("--whisper_size", default="medium")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-             "en": "[EN]",
-         }
-     elif args.languages == "CJ":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-         }
-     elif args.languages == "C":
-         lang2token = {
-             'zh': "[ZH]",
-         }
-     assert(torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
-     model = whisper.load_model(args.whisper_size)
-     speaker_annos = []
-     for file in filelist:
-         print(f"transcribing {parent_dir + file}...\n")
-         options = dict(beam_size=5, best_of=5)
-         transcribe_options = dict(task="transcribe", **options)
-         result = model.transcribe(parent_dir + file, **transcribe_options)
-         segments = result["segments"]
-         # result = model.transcribe(parent_dir + file)
-         lang = result['language']
-         if result['language'] not in list(lang2token.keys()):
-             print(f"{lang} not supported, ignoring...\n")
-             continue
-         # segment audio based on segment results
-         character_name = file.rstrip(".wav").split("_")[0]
-         code = file.rstrip(".wav").split("_")[1]
-         if not os.path.exists("./segmented_character_voice/" + character_name):
-             os.mkdir("./segmented_character_voice/" + character_name)
-         wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
-                                   channels_first=True)
-
-         for i, seg in enumerate(result['segments']):
-             start_time = seg['start']
-             end_time = seg['end']
-             text = seg['text']
-             text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
-             text = text + "\n"
-             wav_seg = wav[:, int(start_time*sr):int(end_time*sr)]
-             wav_seg_name = f"{character_name}_{code}_{i}.wav"
-             savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
-             speaker_annos.append(savepth + "|" + character_name + "|" + text)
-             print(f"Transcribed segment: {speaker_annos[-1]}")
-             # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
-             # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
-             torchaudio.save(savepth, wav_seg, 22050, channels_first=True)
-     if len(speaker_annos) == 0:
-         print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
-         print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
-     with open("long_character_anno.txt", 'w', encoding='utf-8') as f:
-         for line in speaker_annos:
-             f.write(line)
VITS-fast-fine-tuning/losses.py DELETED
@@ -1,61 +0,0 @@
- import torch
- from torch.nn import functional as F
-
- import commons
-
-
- def feature_loss(fmap_r, fmap_g):
-     loss = 0
-     for dr, dg in zip(fmap_r, fmap_g):
-         for rl, gl in zip(dr, dg):
-             rl = rl.float().detach()
-             gl = gl.float()
-             loss += torch.mean(torch.abs(rl - gl))
-
-     return loss * 2
-
-
- def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-     loss = 0
-     r_losses = []
-     g_losses = []
-     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-         dr = dr.float()
-         dg = dg.float()
-         r_loss = torch.mean((1-dr)**2)
-         g_loss = torch.mean(dg**2)
-         loss += (r_loss + g_loss)
-         r_losses.append(r_loss.item())
-         g_losses.append(g_loss.item())
-
-     return loss, r_losses, g_losses
-
-
- def generator_loss(disc_outputs):
-     loss = 0
-     gen_losses = []
-     for dg in disc_outputs:
-         dg = dg.float()
-         l = torch.mean((1-dg)**2)
-         gen_losses.append(l)
-         loss += l
-
-     return loss, gen_losses
-
-
- def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
-     """
-     z_p, logs_q: [b, h, t_t]
-     m_p, logs_p: [b, h, t_t]
-     """
-     z_p = z_p.float()
-     logs_q = logs_q.float()
-     m_p = m_p.float()
-     logs_p = logs_p.float()
-     z_mask = z_mask.float()
-
-     kl = logs_p - logs_q - 0.5
-     kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
-     kl = torch.sum(kl * z_mask)
-     l = kl / torch.sum(z_mask)
-     return l
VITS-fast-fine-tuning/mel_processing.py DELETED
@@ -1,112 +0,0 @@
1
- import math
2
- import os
3
- import random
4
- import torch
5
- from torch import nn
6
- import torch.nn.functional as F
7
- import torch.utils.data
8
- import numpy as np
9
- import librosa
10
- import librosa.util as librosa_util
11
- from librosa.util import normalize, pad_center, tiny
12
- from scipy.signal import get_window
13
- from scipy.io.wavfile import read
14
- from librosa.filters import mel as librosa_mel_fn
15
-
16
- MAX_WAV_VALUE = 32768.0
17
-
18
-
19
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
- """
21
- PARAMS
22
- ------
23
- C: compression factor
24
- """
25
- return torch.log(torch.clamp(x, min=clip_val) * C)
26
-
27
-
28
- def dynamic_range_decompression_torch(x, C=1):
29
- """
30
- PARAMS
31
- ------
32
- C: compression factor used to compress
33
- """
34
- return torch.exp(x) / C
35
-
36
-
37
- def spectral_normalize_torch(magnitudes):
38
- output = dynamic_range_compression_torch(magnitudes)
39
- return output
40
-
41
-
42
- def spectral_de_normalize_torch(magnitudes):
43
- output = dynamic_range_decompression_torch(magnitudes)
44
- return output
45
-
46
-
47
- mel_basis = {}
48
- hann_window = {}
49
-
50
-
51
- def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
- if torch.min(y) < -1.:
53
- print('min value is ', torch.min(y))
54
- if torch.max(y) > 1.:
55
- print('max value is ', torch.max(y))
56
-
57
- global hann_window
58
- dtype_device = str(y.dtype) + '_' + str(y.device)
59
- wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
- if wnsize_dtype_device not in hann_window:
61
- hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
-
63
- y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
- y = y.squeeze(1)
65
-
66
- spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
- center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
68
-
69
- spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
- return spec
71
-
72
-
73
- def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
- global mel_basis
75
- dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
- fmax_dtype_device = str(fmax) + '_' + dtype_device
77
- if fmax_dtype_device not in mel_basis:
78
- mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
79
- mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
- spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
- spec = spectral_normalize_torch(spec)
82
- return spec
83
-
84
-
85
- def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
- if torch.min(y) < -1.:
87
- print('min value is ', torch.min(y))
88
- if torch.max(y) > 1.:
89
- print('max value is ', torch.max(y))
90
-
91
- global mel_basis, hann_window
92
- dtype_device = str(y.dtype) + '_' + str(y.device)
93
- fmax_dtype_device = str(fmax) + '_' + dtype_device
94
- wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
- if fmax_dtype_device not in mel_basis:
96
- mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
97
- mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
- if wnsize_dtype_device not in hann_window:
99
- hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
-
101
- y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
- y = y.squeeze(1)
103
-
104
- spec = torch.stft(y.float(), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
- center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106
-
107
- spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
-
109
- spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
- spec = spectral_normalize_torch(spec)
111
-
112
- return spec
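
A usage sketch for the module above, assuming it is importable as mel_processing; the n_fft/hop/win/num_mels values are the common 22.05 kHz VITS settings and are assumptions here, not values read from this commit's config files.

import torch
from mel_processing import spectrogram_torch, spec_to_mel_torch

wav = torch.randn(1, 22050).clamp(-1.0, 1.0)   # one second of fake audio in [-1, 1]

spec = spectrogram_torch(wav, n_fft=1024, sampling_rate=22050,
                         hop_size=256, win_size=1024, center=False)
# Project the linear spectrogram onto 80 mel bands (spec_to_mel_torch builds the
# mel filter bank through librosa's positional API, i.e. an older librosa release).
mel = spec_to_mel_torch(spec, n_fft=1024, num_mels=80,
                        sampling_rate=22050, fmin=0.0, fmax=None)
print(spec.shape, mel.shape)   # [1, 513, frames], [1, 80, frames]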
 
VITS-fast-fine-tuning/models.py DELETED
@@ -1,533 +0,0 @@
1
- import copy
2
- import math
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- import commons
8
- import modules
9
- import attentions
10
- import monotonic_align
11
-
12
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
13
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14
- from commons import init_weights, get_padding
15
-
16
-
17
- class StochasticDurationPredictor(nn.Module):
18
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
19
- super().__init__()
20
- filter_channels = in_channels # it needs to be removed from future version.
21
- self.in_channels = in_channels
22
- self.filter_channels = filter_channels
23
- self.kernel_size = kernel_size
24
- self.p_dropout = p_dropout
25
- self.n_flows = n_flows
26
- self.gin_channels = gin_channels
27
-
28
- self.log_flow = modules.Log()
29
- self.flows = nn.ModuleList()
30
- self.flows.append(modules.ElementwiseAffine(2))
31
- for i in range(n_flows):
32
- self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
33
- self.flows.append(modules.Flip())
34
-
35
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
36
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
37
- self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
38
- self.post_flows = nn.ModuleList()
39
- self.post_flows.append(modules.ElementwiseAffine(2))
40
- for i in range(4):
41
- self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
42
- self.post_flows.append(modules.Flip())
43
-
44
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
45
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
46
- self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
47
- if gin_channels != 0:
48
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
49
-
50
- def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
51
- x = torch.detach(x)
52
- x = self.pre(x)
53
- if g is not None:
54
- g = torch.detach(g)
55
- x = x + self.cond(g)
56
- x = self.convs(x, x_mask)
57
- x = self.proj(x) * x_mask
58
-
59
- if not reverse:
60
- flows = self.flows
61
- assert w is not None
62
-
63
- logdet_tot_q = 0
64
- h_w = self.post_pre(w)
65
- h_w = self.post_convs(h_w, x_mask)
66
- h_w = self.post_proj(h_w) * x_mask
67
- e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
68
- z_q = e_q
69
- for flow in self.post_flows:
70
- z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
71
- logdet_tot_q += logdet_q
72
- z_u, z1 = torch.split(z_q, [1, 1], 1)
73
- u = torch.sigmoid(z_u) * x_mask
74
- z0 = (w - u) * x_mask
75
- logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
76
- logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
77
-
78
- logdet_tot = 0
79
- z0, logdet = self.log_flow(z0, x_mask)
80
- logdet_tot += logdet
81
- z = torch.cat([z0, z1], 1)
82
- for flow in flows:
83
- z, logdet = flow(z, x_mask, g=x, reverse=reverse)
84
- logdet_tot = logdet_tot + logdet
85
- nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
86
- return nll + logq # [b]
87
- else:
88
- flows = list(reversed(self.flows))
89
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
90
- z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
91
- for flow in flows:
92
- z = flow(z, x_mask, g=x, reverse=reverse)
93
- z0, z1 = torch.split(z, [1, 1], 1)
94
- logw = z0
95
- return logw
96
-
97
-
98
- class DurationPredictor(nn.Module):
99
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
100
- super().__init__()
101
-
102
- self.in_channels = in_channels
103
- self.filter_channels = filter_channels
104
- self.kernel_size = kernel_size
105
- self.p_dropout = p_dropout
106
- self.gin_channels = gin_channels
107
-
108
- self.drop = nn.Dropout(p_dropout)
109
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
110
- self.norm_1 = modules.LayerNorm(filter_channels)
111
- self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
112
- self.norm_2 = modules.LayerNorm(filter_channels)
113
- self.proj = nn.Conv1d(filter_channels, 1, 1)
114
-
115
- if gin_channels != 0:
116
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
117
-
118
- def forward(self, x, x_mask, g=None):
119
- x = torch.detach(x)
120
- if g is not None:
121
- g = torch.detach(g)
122
- x = x + self.cond(g)
123
- x = self.conv_1(x * x_mask)
124
- x = torch.relu(x)
125
- x = self.norm_1(x)
126
- x = self.drop(x)
127
- x = self.conv_2(x * x_mask)
128
- x = torch.relu(x)
129
- x = self.norm_2(x)
130
- x = self.drop(x)
131
- x = self.proj(x * x_mask)
132
- return x * x_mask
133
-
134
-
135
- class TextEncoder(nn.Module):
136
- def __init__(self,
137
- n_vocab,
138
- out_channels,
139
- hidden_channels,
140
- filter_channels,
141
- n_heads,
142
- n_layers,
143
- kernel_size,
144
- p_dropout):
145
- super().__init__()
146
- self.n_vocab = n_vocab
147
- self.out_channels = out_channels
148
- self.hidden_channels = hidden_channels
149
- self.filter_channels = filter_channels
150
- self.n_heads = n_heads
151
- self.n_layers = n_layers
152
- self.kernel_size = kernel_size
153
- self.p_dropout = p_dropout
154
-
155
- self.emb = nn.Embedding(n_vocab, hidden_channels)
156
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
157
-
158
- self.encoder = attentions.Encoder(
159
- hidden_channels,
160
- filter_channels,
161
- n_heads,
162
- n_layers,
163
- kernel_size,
164
- p_dropout)
165
- self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
166
-
167
- def forward(self, x, x_lengths):
168
- x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
169
- x = torch.transpose(x, 1, -1) # [b, h, t]
170
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
171
-
172
- x = self.encoder(x * x_mask, x_mask)
173
- stats = self.proj(x) * x_mask
174
-
175
- m, logs = torch.split(stats, self.out_channels, dim=1)
176
- return x, m, logs, x_mask
177
-
178
-
179
- class ResidualCouplingBlock(nn.Module):
180
- def __init__(self,
181
- channels,
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- n_flows=4,
187
- gin_channels=0):
188
- super().__init__()
189
- self.channels = channels
190
- self.hidden_channels = hidden_channels
191
- self.kernel_size = kernel_size
192
- self.dilation_rate = dilation_rate
193
- self.n_layers = n_layers
194
- self.n_flows = n_flows
195
- self.gin_channels = gin_channels
196
-
197
- self.flows = nn.ModuleList()
198
- for i in range(n_flows):
199
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
200
- self.flows.append(modules.Flip())
201
-
202
- def forward(self, x, x_mask, g=None, reverse=False):
203
- if not reverse:
204
- for flow in self.flows:
205
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
206
- else:
207
- for flow in reversed(self.flows):
208
- x = flow(x, x_mask, g=g, reverse=reverse)
209
- return x
210
-
211
-
212
- class PosteriorEncoder(nn.Module):
213
- def __init__(self,
214
- in_channels,
215
- out_channels,
216
- hidden_channels,
217
- kernel_size,
218
- dilation_rate,
219
- n_layers,
220
- gin_channels=0):
221
- super().__init__()
222
- self.in_channels = in_channels
223
- self.out_channels = out_channels
224
- self.hidden_channels = hidden_channels
225
- self.kernel_size = kernel_size
226
- self.dilation_rate = dilation_rate
227
- self.n_layers = n_layers
228
- self.gin_channels = gin_channels
229
-
230
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
231
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
232
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
233
-
234
- def forward(self, x, x_lengths, g=None):
235
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
236
- x = self.pre(x) * x_mask
237
- x = self.enc(x, x_mask, g=g)
238
- stats = self.proj(x) * x_mask
239
- m, logs = torch.split(stats, self.out_channels, dim=1)
240
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
241
- return z, m, logs, x_mask
242
-
243
-
244
- class Generator(torch.nn.Module):
245
- def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
246
- super(Generator, self).__init__()
247
- self.num_kernels = len(resblock_kernel_sizes)
248
- self.num_upsamples = len(upsample_rates)
249
- self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
250
- resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
251
-
252
- self.ups = nn.ModuleList()
253
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
254
- self.ups.append(weight_norm(
255
- ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
256
- k, u, padding=(k-u)//2)))
257
-
258
- self.resblocks = nn.ModuleList()
259
- for i in range(len(self.ups)):
260
- ch = upsample_initial_channel//(2**(i+1))
261
- for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
262
- self.resblocks.append(resblock(ch, k, d))
263
-
264
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
265
- self.ups.apply(init_weights)
266
-
267
- if gin_channels != 0:
268
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
269
-
270
- def forward(self, x, g=None):
271
- x = self.conv_pre(x)
272
- if g is not None:
273
- x = x + self.cond(g)
274
-
275
- for i in range(self.num_upsamples):
276
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
277
- x = self.ups[i](x)
278
- xs = None
279
- for j in range(self.num_kernels):
280
- if xs is None:
281
- xs = self.resblocks[i*self.num_kernels+j](x)
282
- else:
283
- xs += self.resblocks[i*self.num_kernels+j](x)
284
- x = xs / self.num_kernels
285
- x = F.leaky_relu(x)
286
- x = self.conv_post(x)
287
- x = torch.tanh(x)
288
-
289
- return x
290
-
291
- def remove_weight_norm(self):
292
- print('Removing weight norm...')
293
- for l in self.ups:
294
- remove_weight_norm(l)
295
- for l in self.resblocks:
296
- l.remove_weight_norm()
297
-
298
-
299
- class DiscriminatorP(torch.nn.Module):
300
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
301
- super(DiscriminatorP, self).__init__()
302
- self.period = period
303
- self.use_spectral_norm = use_spectral_norm
304
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
305
- self.convs = nn.ModuleList([
306
- norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
307
- norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
308
- norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
309
- norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
310
- norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
311
- ])
312
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
313
-
314
- def forward(self, x):
315
- fmap = []
316
-
317
- # 1d to 2d
318
- b, c, t = x.shape
319
- if t % self.period != 0: # pad first
320
- n_pad = self.period - (t % self.period)
321
- x = F.pad(x, (0, n_pad), "reflect")
322
- t = t + n_pad
323
- x = x.view(b, c, t // self.period, self.period)
324
-
325
- for l in self.convs:
326
- x = l(x)
327
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
328
- fmap.append(x)
329
- x = self.conv_post(x)
330
- fmap.append(x)
331
- x = torch.flatten(x, 1, -1)
332
-
333
- return x, fmap
334
-
335
-
336
- class DiscriminatorS(torch.nn.Module):
337
- def __init__(self, use_spectral_norm=False):
338
- super(DiscriminatorS, self).__init__()
339
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
340
- self.convs = nn.ModuleList([
341
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
342
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
343
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
344
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
345
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
346
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
347
- ])
348
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
349
-
350
- def forward(self, x):
351
- fmap = []
352
-
353
- for l in self.convs:
354
- x = l(x)
355
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
356
- fmap.append(x)
357
- x = self.conv_post(x)
358
- fmap.append(x)
359
- x = torch.flatten(x, 1, -1)
360
-
361
- return x, fmap
362
-
363
-
364
- class MultiPeriodDiscriminator(torch.nn.Module):
365
- def __init__(self, use_spectral_norm=False):
366
- super(MultiPeriodDiscriminator, self).__init__()
367
- periods = [2,3,5,7,11]
368
-
369
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
370
- discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
371
- self.discriminators = nn.ModuleList(discs)
372
-
373
- def forward(self, y, y_hat):
374
- y_d_rs = []
375
- y_d_gs = []
376
- fmap_rs = []
377
- fmap_gs = []
378
- for i, d in enumerate(self.discriminators):
379
- y_d_r, fmap_r = d(y)
380
- y_d_g, fmap_g = d(y_hat)
381
- y_d_rs.append(y_d_r)
382
- y_d_gs.append(y_d_g)
383
- fmap_rs.append(fmap_r)
384
- fmap_gs.append(fmap_g)
385
-
386
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
387
-
388
-
389
-
390
- class SynthesizerTrn(nn.Module):
391
- """
392
- Synthesizer for Training
393
- """
394
-
395
- def __init__(self,
396
- n_vocab,
397
- spec_channels,
398
- segment_size,
399
- inter_channels,
400
- hidden_channels,
401
- filter_channels,
402
- n_heads,
403
- n_layers,
404
- kernel_size,
405
- p_dropout,
406
- resblock,
407
- resblock_kernel_sizes,
408
- resblock_dilation_sizes,
409
- upsample_rates,
410
- upsample_initial_channel,
411
- upsample_kernel_sizes,
412
- n_speakers=0,
413
- gin_channels=0,
414
- use_sdp=True,
415
- **kwargs):
416
-
417
- super().__init__()
418
- self.n_vocab = n_vocab
419
- self.spec_channels = spec_channels
420
- self.inter_channels = inter_channels
421
- self.hidden_channels = hidden_channels
422
- self.filter_channels = filter_channels
423
- self.n_heads = n_heads
424
- self.n_layers = n_layers
425
- self.kernel_size = kernel_size
426
- self.p_dropout = p_dropout
427
- self.resblock = resblock
428
- self.resblock_kernel_sizes = resblock_kernel_sizes
429
- self.resblock_dilation_sizes = resblock_dilation_sizes
430
- self.upsample_rates = upsample_rates
431
- self.upsample_initial_channel = upsample_initial_channel
432
- self.upsample_kernel_sizes = upsample_kernel_sizes
433
- self.segment_size = segment_size
434
- self.n_speakers = n_speakers
435
- self.gin_channels = gin_channels
436
-
437
- self.use_sdp = use_sdp
438
-
439
- self.enc_p = TextEncoder(n_vocab,
440
- inter_channels,
441
- hidden_channels,
442
- filter_channels,
443
- n_heads,
444
- n_layers,
445
- kernel_size,
446
- p_dropout)
447
- self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
448
- self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
449
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
450
-
451
- if use_sdp:
452
- self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
453
- else:
454
- self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
455
-
456
- if n_speakers >= 1:
457
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
458
-
459
- def forward(self, x, x_lengths, y, y_lengths, sid=None):
460
-
461
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
462
- if self.n_speakers > 0:
463
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
464
- else:
465
- g = None
466
-
467
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
468
- z_p = self.flow(z, y_mask, g=g)
469
-
470
- with torch.no_grad():
471
- # negative cross-entropy
472
- s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
473
- neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
474
- neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
475
- neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
476
- neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
477
- neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
478
-
479
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
480
- attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
481
-
482
- w = attn.sum(2)
483
- if self.use_sdp:
484
- l_length = self.dp(x, x_mask, w, g=g)
485
- l_length = l_length / torch.sum(x_mask)
486
- else:
487
- logw_ = torch.log(w + 1e-6) * x_mask
488
- logw = self.dp(x, x_mask, g=g)
489
- l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
490
-
491
- # expand prior
492
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
493
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
494
-
495
- z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
496
- o = self.dec(z_slice, g=g)
497
- return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
498
-
499
- def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
500
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
501
- if self.n_speakers > 0:
502
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
503
- else:
504
- g = None
505
-
506
- if self.use_sdp:
507
- logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
508
- else:
509
- logw = self.dp(x, x_mask, g=g)
510
- w = torch.exp(logw) * x_mask * length_scale
511
- w_ceil = torch.ceil(w)
512
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
513
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
514
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
515
- attn = commons.generate_path(w_ceil, attn_mask)
516
-
517
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
518
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
519
-
520
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
521
- z = self.flow(z_p, y_mask, g=g, reverse=True)
522
- o = self.dec((z * y_mask)[:,:,:max_len], g=g)
523
- return o, attn, y_mask, (z, z_p, m_p, logs_p)
524
-
525
- def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
526
- assert self.n_speakers > 0, "n_speakers have to be larger than 0."
527
- g_src = self.emb_g(sid_src).unsqueeze(-1)
528
- g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
529
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
530
- z_p = self.flow(z, y_mask, g=g_src)
531
- z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
532
- o_hat = self.dec(z_hat * y_mask, g=g_tgt)
533
- return o_hat, y_mask, (z, z_p, z_hat)
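
An end-to-end inference sketch for the SynthesizerTrn above, assuming models.py and its local imports (commons, modules, attentions, and a compiled monotonic_align) are importable; every hyperparameter below is an assumption chosen to be self-consistent rather than a value taken from this commit's JSON configs.

import torch
from models import SynthesizerTrn

net_g = SynthesizerTrn(
    n_vocab=100,                      # assumed symbol-set size
    spec_channels=513,                # 1024-point FFT -> 513 linear bins
    segment_size=32,
    inter_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=2, gin_channels=256,
).eval()

x = torch.randint(1, 100, (1, 20))    # fake phoneme ids; real use would run text cleaning first
x_lengths = torch.LongTensor([20])
sid = torch.LongTensor([0])           # speaker id for the conditioning embedding
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(x, x_lengths, sid=sid, noise_scale=0.667,
                                         noise_scale_w=0.8, length_scale=1.0)
print(audio.shape)                    # [1, 1, samples]; 256x upsampling of predicted frames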
 
VITS-fast-fine-tuning/models_infer.py DELETED
@@ -1,402 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import modules
8
- import attentions
9
-
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from commons import init_weights, get_padding
13
-
14
-
15
- class StochasticDurationPredictor(nn.Module):
16
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
17
- super().__init__()
18
- filter_channels = in_channels # it needs to be removed from future version.
19
- self.in_channels = in_channels
20
- self.filter_channels = filter_channels
21
- self.kernel_size = kernel_size
22
- self.p_dropout = p_dropout
23
- self.n_flows = n_flows
24
- self.gin_channels = gin_channels
25
-
26
- self.log_flow = modules.Log()
27
- self.flows = nn.ModuleList()
28
- self.flows.append(modules.ElementwiseAffine(2))
29
- for i in range(n_flows):
30
- self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
31
- self.flows.append(modules.Flip())
32
-
33
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
34
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
35
- self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
36
- self.post_flows = nn.ModuleList()
37
- self.post_flows.append(modules.ElementwiseAffine(2))
38
- for i in range(4):
39
- self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
40
- self.post_flows.append(modules.Flip())
41
-
42
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
43
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
44
- self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
45
- if gin_channels != 0:
46
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
47
-
48
- def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
49
- x = torch.detach(x)
50
- x = self.pre(x)
51
- if g is not None:
52
- g = torch.detach(g)
53
- x = x + self.cond(g)
54
- x = self.convs(x, x_mask)
55
- x = self.proj(x) * x_mask
56
-
57
- if not reverse:
58
- flows = self.flows
59
- assert w is not None
60
-
61
- logdet_tot_q = 0
62
- h_w = self.post_pre(w)
63
- h_w = self.post_convs(h_w, x_mask)
64
- h_w = self.post_proj(h_w) * x_mask
65
- e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
66
- z_q = e_q
67
- for flow in self.post_flows:
68
- z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
69
- logdet_tot_q += logdet_q
70
- z_u, z1 = torch.split(z_q, [1, 1], 1)
71
- u = torch.sigmoid(z_u) * x_mask
72
- z0 = (w - u) * x_mask
73
- logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
74
- logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
75
-
76
- logdet_tot = 0
77
- z0, logdet = self.log_flow(z0, x_mask)
78
- logdet_tot += logdet
79
- z = torch.cat([z0, z1], 1)
80
- for flow in flows:
81
- z, logdet = flow(z, x_mask, g=x, reverse=reverse)
82
- logdet_tot = logdet_tot + logdet
83
- nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
84
- return nll + logq # [b]
85
- else:
86
- flows = list(reversed(self.flows))
87
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
88
- z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
89
- for flow in flows:
90
- z = flow(z, x_mask, g=x, reverse=reverse)
91
- z0, z1 = torch.split(z, [1, 1], 1)
92
- logw = z0
93
- return logw
94
-
95
-
96
- class DurationPredictor(nn.Module):
97
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
98
- super().__init__()
99
-
100
- self.in_channels = in_channels
101
- self.filter_channels = filter_channels
102
- self.kernel_size = kernel_size
103
- self.p_dropout = p_dropout
104
- self.gin_channels = gin_channels
105
-
106
- self.drop = nn.Dropout(p_dropout)
107
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
108
- self.norm_1 = modules.LayerNorm(filter_channels)
109
- self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
110
- self.norm_2 = modules.LayerNorm(filter_channels)
111
- self.proj = nn.Conv1d(filter_channels, 1, 1)
112
-
113
- if gin_channels != 0:
114
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
115
-
116
- def forward(self, x, x_mask, g=None):
117
- x = torch.detach(x)
118
- if g is not None:
119
- g = torch.detach(g)
120
- x = x + self.cond(g)
121
- x = self.conv_1(x * x_mask)
122
- x = torch.relu(x)
123
- x = self.norm_1(x)
124
- x = self.drop(x)
125
- x = self.conv_2(x * x_mask)
126
- x = torch.relu(x)
127
- x = self.norm_2(x)
128
- x = self.drop(x)
129
- x = self.proj(x * x_mask)
130
- return x * x_mask
131
-
132
-
133
- class TextEncoder(nn.Module):
134
- def __init__(self,
135
- n_vocab,
136
- out_channels,
137
- hidden_channels,
138
- filter_channels,
139
- n_heads,
140
- n_layers,
141
- kernel_size,
142
- p_dropout):
143
- super().__init__()
144
- self.n_vocab = n_vocab
145
- self.out_channels = out_channels
146
- self.hidden_channels = hidden_channels
147
- self.filter_channels = filter_channels
148
- self.n_heads = n_heads
149
- self.n_layers = n_layers
150
- self.kernel_size = kernel_size
151
- self.p_dropout = p_dropout
152
-
153
- self.emb = nn.Embedding(n_vocab, hidden_channels)
154
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
155
-
156
- self.encoder = attentions.Encoder(
157
- hidden_channels,
158
- filter_channels,
159
- n_heads,
160
- n_layers,
161
- kernel_size,
162
- p_dropout)
163
- self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
164
-
165
- def forward(self, x, x_lengths):
166
- x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
167
- x = torch.transpose(x, 1, -1) # [b, h, t]
168
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
169
-
170
- x = self.encoder(x * x_mask, x_mask)
171
- stats = self.proj(x) * x_mask
172
-
173
- m, logs = torch.split(stats, self.out_channels, dim=1)
174
- return x, m, logs, x_mask
175
-
176
-
177
- class ResidualCouplingBlock(nn.Module):
178
- def __init__(self,
179
- channels,
180
- hidden_channels,
181
- kernel_size,
182
- dilation_rate,
183
- n_layers,
184
- n_flows=4,
185
- gin_channels=0):
186
- super().__init__()
187
- self.channels = channels
188
- self.hidden_channels = hidden_channels
189
- self.kernel_size = kernel_size
190
- self.dilation_rate = dilation_rate
191
- self.n_layers = n_layers
192
- self.n_flows = n_flows
193
- self.gin_channels = gin_channels
194
-
195
- self.flows = nn.ModuleList()
196
- for i in range(n_flows):
197
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
198
- self.flows.append(modules.Flip())
199
-
200
- def forward(self, x, x_mask, g=None, reverse=False):
201
- if not reverse:
202
- for flow in self.flows:
203
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
204
- else:
205
- for flow in reversed(self.flows):
206
- x = flow(x, x_mask, g=g, reverse=reverse)
207
- return x
208
-
209
-
210
- class PosteriorEncoder(nn.Module):
211
- def __init__(self,
212
- in_channels,
213
- out_channels,
214
- hidden_channels,
215
- kernel_size,
216
- dilation_rate,
217
- n_layers,
218
- gin_channels=0):
219
- super().__init__()
220
- self.in_channels = in_channels
221
- self.out_channels = out_channels
222
- self.hidden_channels = hidden_channels
223
- self.kernel_size = kernel_size
224
- self.dilation_rate = dilation_rate
225
- self.n_layers = n_layers
226
- self.gin_channels = gin_channels
227
-
228
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
229
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
230
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
231
-
232
- def forward(self, x, x_lengths, g=None):
233
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
234
- x = self.pre(x) * x_mask
235
- x = self.enc(x, x_mask, g=g)
236
- stats = self.proj(x) * x_mask
237
- m, logs = torch.split(stats, self.out_channels, dim=1)
238
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
239
- return z, m, logs, x_mask
240
-
241
-
242
- class Generator(torch.nn.Module):
243
- def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
244
- super(Generator, self).__init__()
245
- self.num_kernels = len(resblock_kernel_sizes)
246
- self.num_upsamples = len(upsample_rates)
247
- self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
248
- resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
249
-
250
- self.ups = nn.ModuleList()
251
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
252
- self.ups.append(weight_norm(
253
- ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
254
- k, u, padding=(k-u)//2)))
255
-
256
- self.resblocks = nn.ModuleList()
257
- for i in range(len(self.ups)):
258
- ch = upsample_initial_channel//(2**(i+1))
259
- for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
260
- self.resblocks.append(resblock(ch, k, d))
261
-
262
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
263
- self.ups.apply(init_weights)
264
-
265
- if gin_channels != 0:
266
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
267
-
268
- def forward(self, x, g=None):
269
- x = self.conv_pre(x)
270
- if g is not None:
271
- x = x + self.cond(g)
272
-
273
- for i in range(self.num_upsamples):
274
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
275
- x = self.ups[i](x)
276
- xs = None
277
- for j in range(self.num_kernels):
278
- if xs is None:
279
- xs = self.resblocks[i*self.num_kernels+j](x)
280
- else:
281
- xs += self.resblocks[i*self.num_kernels+j](x)
282
- x = xs / self.num_kernels
283
- x = F.leaky_relu(x)
284
- x = self.conv_post(x)
285
- x = torch.tanh(x)
286
-
287
- return x
288
-
289
- def remove_weight_norm(self):
290
- print('Removing weight norm...')
291
- for l in self.ups:
292
- remove_weight_norm(l)
293
- for l in self.resblocks:
294
- l.remove_weight_norm()
295
-
296
-
297
-
298
- class SynthesizerTrn(nn.Module):
299
- """
300
- Synthesizer for Training
301
- """
302
-
303
- def __init__(self,
304
- n_vocab,
305
- spec_channels,
306
- segment_size,
307
- inter_channels,
308
- hidden_channels,
309
- filter_channels,
310
- n_heads,
311
- n_layers,
312
- kernel_size,
313
- p_dropout,
314
- resblock,
315
- resblock_kernel_sizes,
316
- resblock_dilation_sizes,
317
- upsample_rates,
318
- upsample_initial_channel,
319
- upsample_kernel_sizes,
320
- n_speakers=0,
321
- gin_channels=0,
322
- use_sdp=True,
323
- **kwargs):
324
-
325
- super().__init__()
326
- self.n_vocab = n_vocab
327
- self.spec_channels = spec_channels
328
- self.inter_channels = inter_channels
329
- self.hidden_channels = hidden_channels
330
- self.filter_channels = filter_channels
331
- self.n_heads = n_heads
332
- self.n_layers = n_layers
333
- self.kernel_size = kernel_size
334
- self.p_dropout = p_dropout
335
- self.resblock = resblock
336
- self.resblock_kernel_sizes = resblock_kernel_sizes
337
- self.resblock_dilation_sizes = resblock_dilation_sizes
338
- self.upsample_rates = upsample_rates
339
- self.upsample_initial_channel = upsample_initial_channel
340
- self.upsample_kernel_sizes = upsample_kernel_sizes
341
- self.segment_size = segment_size
342
- self.n_speakers = n_speakers
343
- self.gin_channels = gin_channels
344
-
345
- self.use_sdp = use_sdp
346
-
347
- self.enc_p = TextEncoder(n_vocab,
348
- inter_channels,
349
- hidden_channels,
350
- filter_channels,
351
- n_heads,
352
- n_layers,
353
- kernel_size,
354
- p_dropout)
355
- self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
356
- self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
357
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
358
-
359
- if use_sdp:
360
- self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
361
- else:
362
- self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
363
-
364
- if n_speakers > 1:
365
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
366
-
367
- def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
368
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
369
- if self.n_speakers > 0:
370
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
371
- else:
372
- g = None
373
-
374
- if self.use_sdp:
375
- logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
376
- else:
377
- logw = self.dp(x, x_mask, g=g)
378
- w = torch.exp(logw) * x_mask * length_scale
379
- w_ceil = torch.ceil(w)
380
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
381
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
382
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
383
- attn = commons.generate_path(w_ceil, attn_mask)
384
-
385
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
386
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
387
-
388
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
389
- z = self.flow(z_p, y_mask, g=g, reverse=True)
390
- o = self.dec((z * y_mask)[:,:,:max_len], g=g)
391
- return o, attn, y_mask, (z, z_p, m_p, logs_p)
392
-
393
- def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
394
- assert self.n_speakers > 0, "n_speakers have to be larger than 0."
395
- g_src = self.emb_g(sid_src).unsqueeze(-1)
396
- g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
397
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
398
- z_p = self.flow(z, y_mask, g=g_src)
399
- z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
400
- o_hat = self.dec(z_hat * y_mask, g=g_tgt)
401
- return o_hat, y_mask, (z, z_p, z_hat)
402
-
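
models_infer.py mirrors models.py but keeps only the inference paths, so a freshly built (or checkpoint-loaded) network can be driven directly for voice conversion; the hyperparameters below reuse the assumed values from the models.py sketch above and the spectrogram is random toy data.

import torch
from models_infer import SynthesizerTrn

hps = dict(n_vocab=100, spec_channels=513, segment_size=32, inter_channels=192,
           hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
           kernel_size=3, p_dropout=0.1, resblock="1",
           resblock_kernel_sizes=[3, 7, 11],
           resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
           upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
           upsample_kernel_sizes=[16, 16, 4, 4], n_speakers=2, gin_channels=256)
net_g = SynthesizerTrn(**hps).eval()

# Re-synthesize a (fake) linear spectrogram of speaker 0 with speaker 1's embedding.
spec = torch.randn(1, 513, 40)            # [batch, spec_channels, frames]
spec_lengths = torch.LongTensor([40])
with torch.no_grad():
    audio, y_mask, _ = net_g.voice_conversion(spec, spec_lengths,
                                              sid_src=torch.LongTensor([0]),
                                              sid_tgt=torch.LongTensor([1]))
print(audio.shape)                        # [1, 1, frames * 256]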
 
VITS-fast-fine-tuning/modules.py DELETED
@@ -1,390 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import scipy
5
- import torch
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm
11
-
12
- import commons
13
- from commons import init_weights, get_padding
14
- from transforms import piecewise_rational_quadratic_transform
15
-
16
-
17
- LRELU_SLOPE = 0.1
18
-
19
-
20
- class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
25
-
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
28
-
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
-
34
-
35
- class ConvReluNorm(nn.Module):
36
- def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37
- super().__init__()
38
- self.in_channels = in_channels
39
- self.hidden_channels = hidden_channels
40
- self.out_channels = out_channels
41
- self.kernel_size = kernel_size
42
- self.n_layers = n_layers
43
- self.p_dropout = p_dropout
44
- assert n_layers > 1, "Number of layers should be larger than 0."
45
-
46
- self.conv_layers = nn.ModuleList()
47
- self.norm_layers = nn.ModuleList()
48
- self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49
- self.norm_layers.append(LayerNorm(hidden_channels))
50
- self.relu_drop = nn.Sequential(
51
- nn.ReLU(),
52
- nn.Dropout(p_dropout))
53
- for _ in range(n_layers-1):
54
- self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55
- self.norm_layers.append(LayerNorm(hidden_channels))
56
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57
- self.proj.weight.data.zero_()
58
- self.proj.bias.data.zero_()
59
-
60
- def forward(self, x, x_mask):
61
- x_org = x
62
- for i in range(self.n_layers):
63
- x = self.conv_layers[i](x * x_mask)
64
- x = self.norm_layers[i](x)
65
- x = self.relu_drop(x)
66
- x = x_org + self.proj(x)
67
- return x * x_mask
68
-
69
-
70
- class DDSConv(nn.Module):
71
- """
72
- Dialted and Depth-Separable Convolution
73
- """
74
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75
- super().__init__()
76
- self.channels = channels
77
- self.kernel_size = kernel_size
78
- self.n_layers = n_layers
79
- self.p_dropout = p_dropout
80
-
81
- self.drop = nn.Dropout(p_dropout)
82
- self.convs_sep = nn.ModuleList()
83
- self.convs_1x1 = nn.ModuleList()
84
- self.norms_1 = nn.ModuleList()
85
- self.norms_2 = nn.ModuleList()
86
- for i in range(n_layers):
87
- dilation = kernel_size ** i
88
- padding = (kernel_size * dilation - dilation) // 2
89
- self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90
- groups=channels, dilation=dilation, padding=padding
91
- ))
92
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93
- self.norms_1.append(LayerNorm(channels))
94
- self.norms_2.append(LayerNorm(channels))
95
-
96
- def forward(self, x, x_mask, g=None):
97
- if g is not None:
98
- x = x + g
99
- for i in range(self.n_layers):
100
- y = self.convs_sep[i](x * x_mask)
101
- y = self.norms_1[i](y)
102
- y = F.gelu(y)
103
- y = self.convs_1x1[i](y)
104
- y = self.norms_2[i](y)
105
- y = F.gelu(y)
106
- y = self.drop(y)
107
- x = x + y
108
- return x * x_mask
109
-
110
-
111
- class WN(torch.nn.Module):
112
- def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113
- super(WN, self).__init__()
114
- assert(kernel_size % 2 == 1)
115
- self.hidden_channels =hidden_channels
116
- self.kernel_size = kernel_size,
117
- self.dilation_rate = dilation_rate
118
- self.n_layers = n_layers
119
- self.gin_channels = gin_channels
120
- self.p_dropout = p_dropout
121
-
122
- self.in_layers = torch.nn.ModuleList()
123
- self.res_skip_layers = torch.nn.ModuleList()
124
- self.drop = nn.Dropout(p_dropout)
125
-
126
- if gin_channels != 0:
127
- cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129
-
130
- for i in range(n_layers):
131
- dilation = dilation_rate ** i
132
- padding = int((kernel_size * dilation - dilation) / 2)
133
- in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134
- dilation=dilation, padding=padding)
135
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136
- self.in_layers.append(in_layer)
137
-
138
- # last one is not necessary
139
- if i < n_layers - 1:
140
- res_skip_channels = 2 * hidden_channels
141
- else:
142
- res_skip_channels = hidden_channels
143
-
144
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146
- self.res_skip_layers.append(res_skip_layer)
147
-
148
- def forward(self, x, x_mask, g=None, **kwargs):
149
- output = torch.zeros_like(x)
150
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
151
-
152
- if g is not None:
153
- g = self.cond_layer(g)
154
-
155
- for i in range(self.n_layers):
156
- x_in = self.in_layers[i](x)
157
- if g is not None:
158
- cond_offset = i * 2 * self.hidden_channels
159
- g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160
- else:
161
- g_l = torch.zeros_like(x_in)
162
-
163
- acts = commons.fused_add_tanh_sigmoid_multiply(
164
- x_in,
165
- g_l,
166
- n_channels_tensor)
167
- acts = self.drop(acts)
168
-
169
- res_skip_acts = self.res_skip_layers[i](acts)
170
- if i < self.n_layers - 1:
171
- res_acts = res_skip_acts[:,:self.hidden_channels,:]
172
- x = (x + res_acts) * x_mask
173
- output = output + res_skip_acts[:,self.hidden_channels:,:]
174
- else:
175
- output = output + res_skip_acts
176
- return output * x_mask
177
-
178
- def remove_weight_norm(self):
179
- if self.gin_channels != 0:
180
- torch.nn.utils.remove_weight_norm(self.cond_layer)
181
- for l in self.in_layers:
182
- torch.nn.utils.remove_weight_norm(l)
183
- for l in self.res_skip_layers:
184
- torch.nn.utils.remove_weight_norm(l)
185
-
186
-
187
- class ResBlock1(torch.nn.Module):
188
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
189
- super(ResBlock1, self).__init__()
190
- self.convs1 = nn.ModuleList([
191
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
192
- padding=get_padding(kernel_size, dilation[0]))),
193
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
194
- padding=get_padding(kernel_size, dilation[1]))),
195
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
196
- padding=get_padding(kernel_size, dilation[2])))
197
- ])
198
- self.convs1.apply(init_weights)
199
-
200
- self.convs2 = nn.ModuleList([
201
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
202
- padding=get_padding(kernel_size, 1))),
203
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
204
- padding=get_padding(kernel_size, 1))),
205
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
206
- padding=get_padding(kernel_size, 1)))
207
- ])
208
- self.convs2.apply(init_weights)
209
-
210
- def forward(self, x, x_mask=None):
211
- for c1, c2 in zip(self.convs1, self.convs2):
212
- xt = F.leaky_relu(x, LRELU_SLOPE)
213
- if x_mask is not None:
214
- xt = xt * x_mask
215
- xt = c1(xt)
216
- xt = F.leaky_relu(xt, LRELU_SLOPE)
217
- if x_mask is not None:
218
- xt = xt * x_mask
219
- xt = c2(xt)
220
- x = xt + x
221
- if x_mask is not None:
222
- x = x * x_mask
223
- return x
224
-
225
- def remove_weight_norm(self):
226
- for l in self.convs1:
227
- remove_weight_norm(l)
228
- for l in self.convs2:
229
- remove_weight_norm(l)
230
-
231
-
232
- class ResBlock2(torch.nn.Module):
233
- def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
234
- super(ResBlock2, self).__init__()
235
- self.convs = nn.ModuleList([
236
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
237
- padding=get_padding(kernel_size, dilation[0]))),
238
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
239
- padding=get_padding(kernel_size, dilation[1])))
240
- ])
241
- self.convs.apply(init_weights)
242
-
243
- def forward(self, x, x_mask=None):
244
- for c in self.convs:
245
- xt = F.leaky_relu(x, LRELU_SLOPE)
246
- if x_mask is not None:
247
- xt = xt * x_mask
248
- xt = c(xt)
249
- x = xt + x
250
- if x_mask is not None:
251
- x = x * x_mask
252
- return x
253
-
254
- def remove_weight_norm(self):
255
- for l in self.convs:
256
- remove_weight_norm(l)
257
-
258
-
259
- class Log(nn.Module):
260
- def forward(self, x, x_mask, reverse=False, **kwargs):
261
- if not reverse:
262
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263
- logdet = torch.sum(-y, [1, 2])
264
- return y, logdet
265
- else:
266
- x = torch.exp(x) * x_mask
267
- return x
268
-
269
-
270
- class Flip(nn.Module):
271
- def forward(self, x, *args, reverse=False, **kwargs):
272
- x = torch.flip(x, [1])
273
- if not reverse:
274
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275
- return x, logdet
276
- else:
277
- return x
278
-
279
-
280
- class ElementwiseAffine(nn.Module):
281
- def __init__(self, channels):
282
- super().__init__()
283
- self.channels = channels
284
- self.m = nn.Parameter(torch.zeros(channels,1))
285
- self.logs = nn.Parameter(torch.zeros(channels,1))
286
-
287
- def forward(self, x, x_mask, reverse=False, **kwargs):
288
- if not reverse:
289
- y = self.m + torch.exp(self.logs) * x
290
- y = y * x_mask
291
- logdet = torch.sum(self.logs * x_mask, [1,2])
292
- return y, logdet
293
- else:
294
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
295
- return x
296
-
297
-
298
- class ResidualCouplingLayer(nn.Module):
299
- def __init__(self,
300
- channels,
301
- hidden_channels,
302
- kernel_size,
303
- dilation_rate,
304
- n_layers,
305
- p_dropout=0,
306
- gin_channels=0,
307
- mean_only=False):
308
- assert channels % 2 == 0, "channels should be divisible by 2"
309
- super().__init__()
310
- self.channels = channels
311
- self.hidden_channels = hidden_channels
312
- self.kernel_size = kernel_size
313
- self.dilation_rate = dilation_rate
314
- self.n_layers = n_layers
315
- self.half_channels = channels // 2
316
- self.mean_only = mean_only
317
-
318
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319
- self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321
- self.post.weight.data.zero_()
322
- self.post.bias.data.zero_()
323
-
324
- def forward(self, x, x_mask, g=None, reverse=False):
325
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326
- h = self.pre(x0) * x_mask
327
- h = self.enc(h, x_mask, g=g)
328
- stats = self.post(h) * x_mask
329
- if not self.mean_only:
330
- m, logs = torch.split(stats, [self.half_channels]*2, 1)
331
- else:
332
- m = stats
333
- logs = torch.zeros_like(m)
334
-
335
- if not reverse:
336
- x1 = m + x1 * torch.exp(logs) * x_mask
337
- x = torch.cat([x0, x1], 1)
338
- logdet = torch.sum(logs, [1,2])
339
- return x, logdet
340
- else:
341
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
342
- x = torch.cat([x0, x1], 1)
343
- return x
344
-
345
-
346
- class ConvFlow(nn.Module):
347
- def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348
- super().__init__()
349
- self.in_channels = in_channels
350
- self.filter_channels = filter_channels
351
- self.kernel_size = kernel_size
352
- self.n_layers = n_layers
353
- self.num_bins = num_bins
354
- self.tail_bound = tail_bound
355
- self.half_channels = in_channels // 2
356
-
357
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360
- self.proj.weight.data.zero_()
361
- self.proj.bias.data.zero_()
362
-
363
- def forward(self, x, x_mask, g=None, reverse=False):
364
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365
- h = self.pre(x0)
366
- h = self.convs(h, x_mask, g=g)
367
- h = self.proj(h) * x_mask
368
-
369
- b, c, t = x0.shape
370
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
371
-
372
- unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373
- unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374
- unnormalized_derivatives = h[..., 2 * self.num_bins:]
375
-
376
- x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377
- unnormalized_widths,
378
- unnormalized_heights,
379
- unnormalized_derivatives,
380
- inverse=reverse,
381
- tails='linear',
382
- tail_bound=self.tail_bound
383
- )
384
-
385
- x = torch.cat([x0, x1], 1) * x_mask
386
- logdet = torch.sum(logabsdet * x_mask, [1,2])
387
- if not reverse:
388
- return x, logdet
389
- else:
390
- return x
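
A small sanity-check sketch for the coupling layer above, assuming modules.py and the commons/transforms helpers it imports are available: running the flow forward and then in reverse should reconstruct the input (the layer sizes here are arbitrary).

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)
x = torch.randn(3, 4, 10)                  # [batch, channels, time]
x_mask = torch.ones(3, 1, 10)

y, logdet = layer(x, x_mask)               # forward pass: transformed tensor + log-determinant
x_rec = layer(y, x_mask, reverse=True)     # reverse pass inverts the transform
print(torch.allclose(x, x_rec, atol=1e-5)) # expected: True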
 
VITS-fast-fine-tuning/monotonic_align/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- import numpy as np
2
- import torch
3
- from .monotonic_align.core import maximum_path_c
4
-
5
-
6
- def maximum_path(neg_cent, mask):
7
- """ Cython optimized version.
8
- neg_cent: [b, t_t, t_s]
9
- mask: [b, t_t, t_s]
10
- """
11
- device = neg_cent.device
12
- dtype = neg_cent.dtype
13
- neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
14
- path = np.zeros(neg_cent.shape, dtype=np.int32)
15
-
16
- t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
17
- t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
18
- maximum_path_c(path, neg_cent, t_t_max, t_s_max)
19
- return torch.from_numpy(path).to(device=device, dtype=dtype)
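
A toy call of the alignment search above, assuming the Cython extension has already been compiled (see the note after setup.py below). As used in models.py, dimension 1 holds output frames and dimension 2 holds input tokens, so frames should be at least as long as tokens for a valid monotonic path.

import torch
from monotonic_align import maximum_path

b, n_frames, n_tokens = 1, 6, 4
neg_cent = torch.randn(b, n_frames, n_tokens)   # alignment scores (negative cross-entropy)
mask = torch.ones(b, n_frames, n_tokens)        # all positions valid
path = maximum_path(neg_cent, mask)             # hard 0/1 monotonic alignment, same shape
print(path[0])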
 
VITS-fast-fine-tuning/monotonic_align/core.pyx DELETED
@@ -1,42 +0,0 @@
1
- cimport cython
2
- from cython.parallel import prange
3
-
4
-
5
- @cython.boundscheck(False)
6
- @cython.wraparound(False)
7
- cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
8
- cdef int x
9
- cdef int y
10
- cdef float v_prev
11
- cdef float v_cur
12
- cdef float tmp
13
- cdef int index = t_x - 1
14
-
15
- for y in range(t_y):
16
- for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
17
- if x == y:
18
- v_cur = max_neg_val
19
- else:
20
- v_cur = value[y-1, x]
21
- if x == 0:
22
- if y == 0:
23
- v_prev = 0.
24
- else:
25
- v_prev = max_neg_val
26
- else:
27
- v_prev = value[y-1, x-1]
28
- value[y, x] += max(v_prev, v_cur)
29
-
30
- for y in range(t_y - 1, -1, -1):
31
- path[y, index] = 1
32
- if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
33
- index = index - 1
34
-
35
-
36
- @cython.boundscheck(False)
37
- @cython.wraparound(False)
38
- cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
39
- cdef int b = paths.shape[0]
40
- cdef int i
41
- for i in prange(b, nogil=True):
42
- maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
 
VITS-fast-fine-tuning/monotonic_align/setup.py DELETED
@@ -1,9 +0,0 @@
1
- from distutils.core import setup
2
- from Cython.Build import cythonize
3
- import numpy
4
-
5
- setup(
6
- name = 'monotonic_align',
7
- ext_modules = cythonize("core.pyx"),
8
- include_dirs=[numpy.get_include()]
9
- )
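
The extension built from core.pyx is what monotonic_align/__init__.py imports as .monotonic_align.core, so it has to be compiled before models.py can be imported. The usual convention (not documented in this commit itself) is to run "python setup.py build_ext --inplace" from inside the monotonic_align directory, with a nested monotonic_align/ folder present so the compiled core module lands where the relative import above expects it.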
 
VITS-fast-fine-tuning/preprocess_v2.py DELETED
@@ -1,151 +0,0 @@
- import os
- import argparse
- import json
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
-     parser.add_argument("--languages", default="CJE")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         langs = ["[ZH]", "[JA]", "[EN]"]
-     elif args.languages == "CJ":
-         langs = ["[ZH]", "[JA]"]
-     elif args.languages == "C":
-         langs = ["[ZH]"]
-     new_annos = []
-     # Source 1: transcribed short audios
-     if os.path.exists("short_character_anno.txt"):
-         with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
-             short_character_anno = f.readlines()
-             new_annos += short_character_anno
-     # Source 2: transcribed long audio segments
-     if os.path.exists("long_character_anno.txt"):
-         with open("long_character_anno.txt", 'r', encoding='utf-8') as f:
-             long_character_anno = f.readlines()
-             new_annos += long_character_anno
-
-     # Get all speaker names
-     speakers = []
-     for line in new_annos:
-         path, speaker, text = line.split("|")
-         if speaker not in speakers:
-             speakers.append(speaker)
-     assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."
-     # Source 3 (Optional): sampled audios as extra training helpers
-     if args.add_auxiliary_data:
-         with open("sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
-             old_annos = f.readlines()
-         # filter old_annos according to supported languages
-         filtered_old_annos = []
-         for line in old_annos:
-             for lang in langs:
-                 if lang in line:
-                     filtered_old_annos.append(line)
-         old_annos = filtered_old_annos
-         for line in old_annos:
-             path, speaker, text = line.split("|")
-             if speaker not in speakers:
-                 speakers.append(speaker)
-         num_old_voices = len(old_annos)
-         num_new_voices = len(new_annos)
-         # STEP 1: balance number of new & old voices
-         cc_duplicate = num_old_voices // num_new_voices
-         if cc_duplicate == 0:
-             cc_duplicate = 1
-
-
-         # STEP 2: modify config file
-         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-             hps = json.load(f)
-
-         # assign ids to new speakers
-         speaker2id = {}
-         for i, speaker in enumerate(speakers):
-             speaker2id[speaker] = i
-         # modify n_speakers
-         hps['data']["n_speakers"] = len(speakers)
-         # overwrite speaker names
-         hps['speakers'] = speaker2id
-         hps['train']['log_interval'] = 100
-         hps['train']['eval_interval'] = 1000
-         hps['train']['batch_size'] = 16
-         hps['data']['training_files'] = "final_annotation_train.txt"
-         hps['data']['validation_files'] = "final_annotation_val.txt"
-         # save modified config
-         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-
-         # STEP 3: clean annotations, replace speaker names with assigned speaker IDs
-         import text
-         cleaned_new_annos = []
-         for i, line in enumerate(new_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-         cleaned_old_annos = []
-         for i, line in enumerate(old_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_old_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-         # merge with old annotation
-         final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
-         # save annotation file
-         with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
-             for line in final_annos:
-                 f.write(line)
-         # save annotation file for validation
-         with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
-             for line in cleaned_new_annos:
-                 f.write(line)
-         print("finished")
-     else:
-         # Do not add extra helper data
-         # STEP 1: modify config file
-         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-             hps = json.load(f)
-
-         # assign ids to new speakers
-         speaker2id = {}
-         for i, speaker in enumerate(speakers):
-             speaker2id[speaker] = i
-         # modify n_speakers
-         hps['data']["n_speakers"] = len(speakers)
-         # overwrite speaker names
-         hps['speakers'] = speaker2id
-         hps['train']['log_interval'] = 10
-         hps['train']['eval_interval'] = 100
-         hps['train']['batch_size'] = 16
-         hps['data']['training_files'] = "final_annotation_train.txt"
-         hps['data']['validation_files'] = "final_annotation_val.txt"
-         # save modified config
-         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-
-         # STEP 2: clean annotations, replace speaker names with assigned speaker IDs
-         import text
-
-         cleaned_new_annos = []
-         for i, line in enumerate(new_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners']).replace("[ZH]", "")
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-
-         final_annos = cleaned_new_annos
-         # save annotation file
-         with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
-             for line in final_annos:
-                 f.write(line)
-         # save annotation file for validation
-         with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
-             for line in cleaned_new_annos:
-                 f.write(line)
-         print("finished")
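The script reads and writes pipe-delimited annotation lines of the form audio_path|speaker|text, rewrites the speaker field to a numeric ID, and expects the text to carry language tokens such as [ZH]. A tiny illustration of that rewrite (file name, speaker, and text are hypothetical, and the real script additionally runs text._clean_text on the text field):

    speaker2id = {"alice": 0}                                     # hypothetical mapping built above
    raw = "./custom_character_voice/alice/processed_0.wav|alice|[ZH]你好[ZH]\n"
    path, speaker, txt = raw.split("|")
    cleaned = path + "|" + str(speaker2id[speaker]) + "|" + txt   # what lands in final_annotation_train.txt
    print(cleaned)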
 
VITS-fast-fine-tuning/rearrange_speaker.py DELETED
@@ -1,37 +0,0 @@
- import torch
- import argparse
- import json
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model_dir", type=str, default="./OUTPUT_MODEL/G_latest.pth")
-     parser.add_argument("--config_dir", type=str, default="./configs/modified_finetune_speaker.json")
-     args = parser.parse_args()
-
-     model_sd = torch.load(args.model_dir, map_location='cpu')
-     with open(args.config_dir, 'r', encoding='utf-8') as f:
-         hps = json.load(f)
-
-     valid_speakers = list(hps['speakers'].keys())
-     if hps['data']['n_speakers'] > len(valid_speakers):
-         new_emb_g = torch.zeros([len(valid_speakers), 256])
-         old_emb_g = model_sd['model']['emb_g.weight']
-         for i, speaker in enumerate(valid_speakers):
-             new_emb_g[i, :] = old_emb_g[hps['speakers'][speaker], :]
-             hps['speakers'][speaker] = i
-         hps['data']['n_speakers'] = len(valid_speakers)
-         model_sd['model']['emb_g.weight'] = new_emb_g
-         with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-         torch.save(model_sd, "./G_latest.pth")
-     else:
-         with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-         torch.save(model_sd, "./G_latest.pth")
-     # save another config file copy in MoeGoe format
-     hps['speakers'] = valid_speakers
-     with open("./moegoe_config.json", 'w', encoding='utf-8') as f:
-         json.dump(hps, f, indent=2)
-
-
-
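The pruning above keeps only the embedding rows whose speakers appear in the config, so the exported checkpoint matches the reduced speaker list. A toy illustration with made-up sizes and names:

    import torch

    old_emb_g = torch.randn(1004, 256)                    # n_speakers rows in the fine-tuned checkpoint
    speakers = {"alice": 1001, "bob": 1002}               # hypothetical name -> old speaker id mapping
    new_emb_g = torch.zeros(len(speakers), 256)
    for new_id, (name, old_id) in enumerate(speakers.items()):
        new_emb_g[new_id] = old_emb_g[old_id]             # copy only the rows that are still needed
    print(new_emb_g.shape)                                # torch.Size([2, 256])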
 
VITS-fast-fine-tuning/requirements.txt DELETED
@@ -1,24 +0,0 @@
- Cython
- librosa==0.9.1
- numpy
- scipy
- tensorboard
- torch==1.13.1
- torchvision==0.14.1
- torchaudio==0.13.1
- unidecode
- pyopenjtalk
- jamo
- pypinyin
- jieba
- protobuf
- cn2an
- inflect
- eng_to_ipa
- ko_pron
- indic_transliteration==2.3.37
- num_thai==0.0.5
- opencc==1.1.1
- demucs
- openai-whisper
- gradio
 
VITS-fast-fine-tuning/short_audio_transcribe.py DELETED
@@ -1,111 +0,0 @@
- import whisper
- import os
- import torchaudio
- import argparse
- import torch
-
- lang2token = {
-     'zh': "[ZH]",
-     'ja': "[JA]",
-     "en": "[EN]",
- }
- def transcribe_one(audio_path):
-     # load audio and pad/trim it to fit 30 seconds
-     audio = whisper.load_audio(audio_path)
-     audio = whisper.pad_or_trim(audio)
-
-     # make log-Mel spectrogram and move to the same device as the model
-     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-     # detect the spoken language
-     _, probs = model.detect_language(mel)
-     print(f"Detected language: {max(probs, key=probs.get)}")
-     lang = max(probs, key=probs.get)
-     # decode the audio
-     options = whisper.DecodingOptions()
-     result = whisper.decode(model, mel, options)
-
-     # print the recognized text
-     print(result.text)
-     return lang, result.text
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--languages", default="CJE")
-     parser.add_argument("--whisper_size", default="medium")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-             "en": "[EN]",
-         }
-     elif args.languages == "CJ":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-         }
-     elif args.languages == "C":
-         lang2token = {
-             'zh': "[ZH]",
-         }
-     assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
-     model = whisper.load_model(args.whisper_size)
-     parent_dir = "./custom_character_voice/"
-     speaker_names = list(os.walk(parent_dir))[0][1]
-     speaker_annos = []
-     # resample audios
-     for speaker in speaker_names:
-         for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
-             # try to load file as audio
-             if wavfile.startswith("processed_"):
-                 continue
-             try:
-                 wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
-                                           channels_first=True)
-                 wav = wav.mean(dim=0).unsqueeze(0)
-                 if sr != 22050:
-                     wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
-                 if wav.shape[1] / sr > 20:
-                     print(f"{wavfile} too long, ignoring\n")
-                 save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
-                 torchaudio.save(save_path, wav, 22050, channels_first=True)
-                 # transcribe text
-                 lang, text = transcribe_one(save_path)
-                 if lang not in list(lang2token.keys()):
-                     print(f"{lang} not supported, ignoring\n")
-                     continue
-                 text = lang2token[lang] + text + lang2token[lang] + "\n"
-                 speaker_annos.append(save_path + "|" + speaker + "|" + text)
-             except:
-                 continue
-
-     # # clean annotation
-     # import argparse
-     # import text
-     # from utils import load_filepaths_and_text
-     # for i, line in enumerate(speaker_annos):
-     #     path, sid, txt = line.split("|")
-     #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
-     #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-     #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
-     # write into annotation
-     if len(speaker_annos) == 0:
-         print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
-         print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
-     with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
-         for line in speaker_annos:
-             f.write(line)
-
-     # import json
-     # # generate new config
-     # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-     #     hps = json.load(f)
-     # # modify n_speakers
-     # hps['data']["n_speakers"] = 1000 + len(speaker2id)
-     # # add speaker names
-     # for speaker in speaker_names:
-     #     hps['speakers'][speaker] = speaker2id[speaker]
-     # # save modified config
-     # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-     #     json.dump(hps, f, indent=2)
-     # print("finished")
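Each accepted clip produces one pipe-delimited line in short_character_anno.txt, with the transcription wrapped in the detected language token. For example (speaker name and transcription are made up):

    lang2token = {'zh': "[ZH]", 'ja': "[JA]", "en": "[EN]"}
    lang, text, speaker = 'zh', "你好", "alice"          # hypothetical Whisper output and speaker folder
    save_path = "./custom_character_voice/alice/processed_0.wav"
    line = save_path + "|" + speaker + "|" + lang2token[lang] + text + lang2token[lang] + "\n"
    print(line)   # ./custom_character_voice/alice/processed_0.wav|alice|[ZH]你好[ZH]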
 
VITS-fast-fine-tuning/text/LICENSE DELETED
@@ -1,19 +0,0 @@
- Copyright (c) 2017 Keith Ito
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
 
VITS-fast-fine-tuning/text/__init__.py DELETED
@@ -1,60 +0,0 @@
- """ from https://github.com/keithito/tacotron """
- from text import cleaners
- from text.symbols import symbols
-
-
- # Mappings from symbol to numeric ID and vice versa:
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
- def text_to_sequence(text, symbols, cleaner_names):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-         cleaner_names: names of the cleaner functions to run the text through
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = []
-     symbol_to_id = {s: i for i, s in enumerate(symbols)}
-     clean_text = _clean_text(text, cleaner_names)
-     print(clean_text)
-     print(f" length:{len(clean_text)}")
-     for symbol in clean_text:
-         if symbol not in symbol_to_id.keys():
-             continue
-         symbol_id = symbol_to_id[symbol]
-         sequence += [symbol_id]
-     print(f" length:{len(sequence)}")
-     return sequence
-
-
- def cleaned_text_to_sequence(cleaned_text, symbols):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     symbol_to_id = {s: i for i, s in enumerate(symbols)}
-     sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
-     return sequence
-
-
- def sequence_to_text(sequence):
-     '''Converts a sequence of IDs back to a string'''
-     result = ''
-     for symbol_id in sequence:
-         s = _id_to_symbol[symbol_id]
-         result += s
-     return result
-
-
- def _clean_text(text, cleaner_names):
-     for name in cleaner_names:
-         cleaner = getattr(cleaners, name)
-         if not cleaner:
-             raise Exception('Unknown cleaner: %s' % name)
-         text = cleaner(text)
-     return text
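cleaned_text_to_sequence is the lookup used once the cleaners have already run. A minimal round-trip with a toy symbol list (the real list comes from text/symbols.py, and importing the text package needs the cleaner dependencies from requirements.txt):

    from text import cleaned_text_to_sequence

    toy_symbols = ["_", "a", "b", "c", " "]                  # hypothetical symbol inventory
    print(cleaned_text_to_sequence("ab cab", toy_symbols))   # [1, 2, 4, 3, 1, 2]; unknown symbols are dropped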
 
VITS-fast-fine-tuning/text/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (2.34 kB)
 
VITS-fast-fine-tuning/text/__pycache__/cleaners.cpython-37.pyc DELETED
Binary file (5.45 kB)
 
VITS-fast-fine-tuning/text/__pycache__/english.cpython-37.pyc DELETED
Binary file (4.93 kB)
 
VITS-fast-fine-tuning/text/__pycache__/japanese.cpython-37.pyc DELETED
Binary file (4.6 kB)
 
VITS-fast-fine-tuning/text/__pycache__/korean.cpython-37.pyc DELETED
Binary file (5.75 kB)
 
VITS-fast-fine-tuning/text/__pycache__/mandarin.cpython-37.pyc DELETED
Binary file (7.51 kB)
 
VITS-fast-fine-tuning/text/__pycache__/sanskrit.cpython-37.pyc DELETED
Binary file (1.63 kB)
 
VITS-fast-fine-tuning/text/__pycache__/symbols.cpython-37.pyc DELETED
Binary file (417 Bytes)