kiramayatu committed
Commit
2fbb3ac
1 Parent(s): 5091c1d

Delete VITS-fast-fine-tuning

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. VITS-fast-fine-tuning/.idea/.gitignore +0 -3
  2. VITS-fast-fine-tuning/.idea/VITS_voice_conversion.iml +0 -12
  3. VITS-fast-fine-tuning/.idea/inspectionProfiles/Project_Default.xml +0 -154
  4. VITS-fast-fine-tuning/.idea/inspectionProfiles/profiles_settings.xml +0 -6
  5. VITS-fast-fine-tuning/.idea/misc.xml +0 -4
  6. VITS-fast-fine-tuning/.idea/modules.xml +0 -8
  7. VITS-fast-fine-tuning/.idea/vcs.xml +0 -6
  8. VITS-fast-fine-tuning/DATA.MD +0 -42
  9. VITS-fast-fine-tuning/DATA_EN.MD +0 -46
  10. VITS-fast-fine-tuning/LICENSE +0 -201
  11. VITS-fast-fine-tuning/README.md +0 -55
  12. VITS-fast-fine-tuning/README_ZH.md +0 -60
  13. VITS-fast-fine-tuning/VC_inference.py +0 -139
  14. VITS-fast-fine-tuning/attentions.py +0 -303
  15. VITS-fast-fine-tuning/cmd_inference.py +0 -106
  16. VITS-fast-fine-tuning/commons.py +0 -164
  17. VITS-fast-fine-tuning/configs/modified_finetune_speaker.json +0 -172
  18. VITS-fast-fine-tuning/configs/uma_trilingual.json +0 -54
  19. VITS-fast-fine-tuning/data_utils.py +0 -267
  20. VITS-fast-fine-tuning/denoise_audio.py +0 -18
  21. VITS-fast-fine-tuning/download_model.py +0 -4
  22. VITS-fast-fine-tuning/download_video.py +0 -37
  23. VITS-fast-fine-tuning/finetune_speaker_v2.py +0 -321
  24. VITS-fast-fine-tuning/inference/G_latest.pth +0 -3
  25. VITS-fast-fine-tuning/inference/ONNXVITS_inference.py +0 -36
  26. VITS-fast-fine-tuning/inference/VC_inference.py +0 -139
  27. VITS-fast-fine-tuning/inference/finetune_speaker.json +0 -147
  28. VITS-fast-fine-tuning/long_audio_transcribe.py +0 -71
  29. VITS-fast-fine-tuning/losses.py +0 -61
  30. VITS-fast-fine-tuning/mel_processing.py +0 -112
  31. VITS-fast-fine-tuning/models.py +0 -533
  32. VITS-fast-fine-tuning/models_infer.py +0 -402
  33. VITS-fast-fine-tuning/modules.py +0 -390
  34. VITS-fast-fine-tuning/monotonic_align/__init__.py +0 -19
  35. VITS-fast-fine-tuning/monotonic_align/core.pyx +0 -42
  36. VITS-fast-fine-tuning/monotonic_align/setup.py +0 -9
  37. VITS-fast-fine-tuning/preprocess_v2.py +0 -151
  38. VITS-fast-fine-tuning/rearrange_speaker.py +0 -37
  39. VITS-fast-fine-tuning/requirements.txt +0 -24
  40. VITS-fast-fine-tuning/short_audio_transcribe.py +0 -111
  41. VITS-fast-fine-tuning/text/LICENSE +0 -19
  42. VITS-fast-fine-tuning/text/__init__.py +0 -60
  43. VITS-fast-fine-tuning/text/__pycache__/__init__.cpython-37.pyc +0 -0
  44. VITS-fast-fine-tuning/text/__pycache__/cleaners.cpython-37.pyc +0 -0
  45. VITS-fast-fine-tuning/text/__pycache__/english.cpython-37.pyc +0 -0
  46. VITS-fast-fine-tuning/text/__pycache__/japanese.cpython-37.pyc +0 -0
  47. VITS-fast-fine-tuning/text/__pycache__/korean.cpython-37.pyc +0 -0
  48. VITS-fast-fine-tuning/text/__pycache__/mandarin.cpython-37.pyc +0 -0
  49. VITS-fast-fine-tuning/text/__pycache__/sanskrit.cpython-37.pyc +0 -0
  50. VITS-fast-fine-tuning/text/__pycache__/symbols.cpython-37.pyc +0 -0
VITS-fast-fine-tuning/.idea/.gitignore DELETED
@@ -1,3 +0,0 @@
- # Default ignored files
- /shelf/
- /workspace.xml
VITS-fast-fine-tuning/.idea/VITS_voice_conversion.iml DELETED
@@ -1,12 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <module type="PYTHON_MODULE" version="4">
- <component name="NewModuleRootManager">
- <content url="file://$MODULE_DIR$" />
- <orderEntry type="jdk" jdkName="Python 3.7 (VITS)" jdkType="Python SDK" />
- <orderEntry type="sourceFolder" forTests="false" />
- </component>
- <component name="PyDocumentationSettings">
- <option name="format" value="PLAIN" />
- <option name="myDocStringFormat" value="Plain" />
- </component>
- </module>
VITS-fast-fine-tuning/.idea/inspectionProfiles/Project_Default.xml DELETED
@@ -1,154 +0,0 @@
1
- <component name="InspectionProjectProfileManager">
2
- <profile version="1.0">
3
- <option name="myName" value="Project Default" />
4
- <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
- <option name="ignoredPackages">
6
- <value>
7
- <list size="132">
8
- <item index="0" class="java.lang.String" itemvalue="ccxt" />
9
- <item index="1" class="java.lang.String" itemvalue="lz4" />
10
- <item index="2" class="java.lang.String" itemvalue="pre-commit" />
11
- <item index="3" class="java.lang.String" itemvalue="elegantrl" />
12
- <item index="4" class="java.lang.String" itemvalue="setuptools" />
13
- <item index="5" class="java.lang.String" itemvalue="ray" />
14
- <item index="6" class="java.lang.String" itemvalue="gputil" />
15
- <item index="7" class="java.lang.String" itemvalue="google-pasta" />
16
- <item index="8" class="java.lang.String" itemvalue="tensorflow-estimator" />
17
- <item index="9" class="java.lang.String" itemvalue="scikit-learn" />
18
- <item index="10" class="java.lang.String" itemvalue="tabulate" />
19
- <item index="11" class="java.lang.String" itemvalue="multitasking" />
20
- <item index="12" class="java.lang.String" itemvalue="pickleshare" />
21
- <item index="13" class="java.lang.String" itemvalue="pyasn1-modules" />
22
- <item index="14" class="java.lang.String" itemvalue="ipython-genutils" />
23
- <item index="15" class="java.lang.String" itemvalue="Pygments" />
24
- <item index="16" class="java.lang.String" itemvalue="mccabe" />
25
- <item index="17" class="java.lang.String" itemvalue="astunparse" />
26
- <item index="18" class="java.lang.String" itemvalue="lxml" />
27
- <item index="19" class="java.lang.String" itemvalue="Werkzeug" />
28
- <item index="20" class="java.lang.String" itemvalue="tensorboard-data-server" />
29
- <item index="21" class="java.lang.String" itemvalue="jupyter-client" />
30
- <item index="22" class="java.lang.String" itemvalue="pexpect" />
31
- <item index="23" class="java.lang.String" itemvalue="click" />
32
- <item index="24" class="java.lang.String" itemvalue="ipykernel" />
33
- <item index="25" class="java.lang.String" itemvalue="pandas-datareader" />
34
- <item index="26" class="java.lang.String" itemvalue="psutil" />
35
- <item index="27" class="java.lang.String" itemvalue="jedi" />
36
- <item index="28" class="java.lang.String" itemvalue="regex" />
37
- <item index="29" class="java.lang.String" itemvalue="tensorboard" />
38
- <item index="30" class="java.lang.String" itemvalue="platformdirs" />
39
- <item index="31" class="java.lang.String" itemvalue="matplotlib" />
40
- <item index="32" class="java.lang.String" itemvalue="idna" />
41
- <item index="33" class="java.lang.String" itemvalue="rsa" />
42
- <item index="34" class="java.lang.String" itemvalue="decorator" />
43
- <item index="35" class="java.lang.String" itemvalue="numpy" />
44
- <item index="36" class="java.lang.String" itemvalue="pyasn1" />
45
- <item index="37" class="java.lang.String" itemvalue="requests" />
46
- <item index="38" class="java.lang.String" itemvalue="tensorflow" />
47
- <item index="39" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
48
- <item index="40" class="java.lang.String" itemvalue="Deprecated" />
49
- <item index="41" class="java.lang.String" itemvalue="nest-asyncio" />
50
- <item index="42" class="java.lang.String" itemvalue="prompt-toolkit" />
51
- <item index="43" class="java.lang.String" itemvalue="keras-tuner" />
52
- <item index="44" class="java.lang.String" itemvalue="scipy" />
53
- <item index="45" class="java.lang.String" itemvalue="dataclasses" />
54
- <item index="46" class="java.lang.String" itemvalue="tornado" />
55
- <item index="47" class="java.lang.String" itemvalue="google-auth-oauthlib" />
56
- <item index="48" class="java.lang.String" itemvalue="black" />
57
- <item index="49" class="java.lang.String" itemvalue="toml" />
58
- <item index="50" class="java.lang.String" itemvalue="Quandl" />
59
- <item index="51" class="java.lang.String" itemvalue="pandas" />
60
- <item index="52" class="java.lang.String" itemvalue="termcolor" />
61
- <item index="53" class="java.lang.String" itemvalue="pylint" />
62
- <item index="54" class="java.lang.String" itemvalue="typing_extensions" />
63
- <item index="55" class="java.lang.String" itemvalue="cachetools" />
64
- <item index="56" class="java.lang.String" itemvalue="debugpy" />
65
- <item index="57" class="java.lang.String" itemvalue="isort" />
66
- <item index="58" class="java.lang.String" itemvalue="pytz" />
67
- <item index="59" class="java.lang.String" itemvalue="inflection" />
68
- <item index="60" class="java.lang.String" itemvalue="Pillow" />
69
- <item index="61" class="java.lang.String" itemvalue="traitlets" />
70
- <item index="62" class="java.lang.String" itemvalue="absl-py" />
71
- <item index="63" class="java.lang.String" itemvalue="protobuf" />
72
- <item index="64" class="java.lang.String" itemvalue="joblib" />
73
- <item index="65" class="java.lang.String" itemvalue="threadpoolctl" />
74
- <item index="66" class="java.lang.String" itemvalue="opt-einsum" />
75
- <item index="67" class="java.lang.String" itemvalue="python-dateutil" />
76
- <item index="68" class="java.lang.String" itemvalue="gpflow" />
77
- <item index="69" class="java.lang.String" itemvalue="astroid" />
78
- <item index="70" class="java.lang.String" itemvalue="cycler" />
79
- <item index="71" class="java.lang.String" itemvalue="gast" />
80
- <item index="72" class="java.lang.String" itemvalue="kt-legacy" />
81
- <item index="73" class="java.lang.String" itemvalue="appdirs" />
82
- <item index="74" class="java.lang.String" itemvalue="tensorflow-probability" />
83
- <item index="75" class="java.lang.String" itemvalue="pip" />
84
- <item index="76" class="java.lang.String" itemvalue="pyzmq" />
85
- <item index="77" class="java.lang.String" itemvalue="certifi" />
86
- <item index="78" class="java.lang.String" itemvalue="oauthlib" />
87
- <item index="79" class="java.lang.String" itemvalue="pyparsing" />
88
- <item index="80" class="java.lang.String" itemvalue="Markdown" />
89
- <item index="81" class="java.lang.String" itemvalue="h5py" />
90
- <item index="82" class="java.lang.String" itemvalue="wrapt" />
91
- <item index="83" class="java.lang.String" itemvalue="kiwisolver" />
92
- <item index="84" class="java.lang.String" itemvalue="empyrical" />
93
- <item index="85" class="java.lang.String" itemvalue="backcall" />
94
- <item index="86" class="java.lang.String" itemvalue="charset-normalizer" />
95
- <item index="87" class="java.lang.String" itemvalue="multipledispatch" />
96
- <item index="88" class="java.lang.String" itemvalue="pathspec" />
97
- <item index="89" class="java.lang.String" itemvalue="jupyter-core" />
98
- <item index="90" class="java.lang.String" itemvalue="matplotlib-inline" />
99
- <item index="91" class="java.lang.String" itemvalue="ptyprocess" />
100
- <item index="92" class="java.lang.String" itemvalue="more-itertools" />
101
- <item index="93" class="java.lang.String" itemvalue="mypy-extensions" />
102
- <item index="94" class="java.lang.String" itemvalue="cloudpickle" />
103
- <item index="95" class="java.lang.String" itemvalue="wcwidth" />
104
- <item index="96" class="java.lang.String" itemvalue="requests-oauthlib" />
105
- <item index="97" class="java.lang.String" itemvalue="Keras-Preprocessing" />
106
- <item index="98" class="java.lang.String" itemvalue="yfinance" />
107
- <item index="99" class="java.lang.String" itemvalue="tomli" />
108
- <item index="100" class="java.lang.String" itemvalue="urllib3" />
109
- <item index="101" class="java.lang.String" itemvalue="six" />
110
- <item index="102" class="java.lang.String" itemvalue="parso" />
111
- <item index="103" class="java.lang.String" itemvalue="wheel" />
112
- <item index="104" class="java.lang.String" itemvalue="ipython" />
113
- <item index="105" class="java.lang.String" itemvalue="packaging" />
114
- <item index="106" class="java.lang.String" itemvalue="lazy-object-proxy" />
115
- <item index="107" class="java.lang.String" itemvalue="grpcio" />
116
- <item index="108" class="java.lang.String" itemvalue="dm-tree" />
117
- <item index="109" class="java.lang.String" itemvalue="google-auth" />
118
- <item index="110" class="java.lang.String" itemvalue="seaborn" />
119
- <item index="111" class="java.lang.String" itemvalue="thop" />
120
- <item index="112" class="java.lang.String" itemvalue="torch" />
121
- <item index="113" class="java.lang.String" itemvalue="torchvision" />
122
- <item index="114" class="java.lang.String" itemvalue="d2l" />
123
- <item index="115" class="java.lang.String" itemvalue="keyboard" />
124
- <item index="116" class="java.lang.String" itemvalue="transformers" />
125
- <item index="117" class="java.lang.String" itemvalue="phonemizer" />
126
- <item index="118" class="java.lang.String" itemvalue="Unidecode" />
127
- <item index="119" class="java.lang.String" itemvalue="nltk" />
128
- <item index="120" class="java.lang.String" itemvalue="pinecone-client" />
129
- <item index="121" class="java.lang.String" itemvalue="sentence-transformers" />
130
- <item index="122" class="java.lang.String" itemvalue="whisper" />
131
- <item index="123" class="java.lang.String" itemvalue="datasets" />
132
- <item index="124" class="java.lang.String" itemvalue="pyaudio" />
133
- <item index="125" class="java.lang.String" itemvalue="torchsummary" />
134
- <item index="126" class="java.lang.String" itemvalue="openjtalk" />
135
- <item index="127" class="java.lang.String" itemvalue="hydra-core" />
136
- <item index="128" class="java.lang.String" itemvalue="museval" />
137
- <item index="129" class="java.lang.String" itemvalue="mypy" />
138
- <item index="130" class="java.lang.String" itemvalue="hydra-colorlog" />
139
- <item index="131" class="java.lang.String" itemvalue="flake8" />
140
- </list>
141
- </value>
142
- </option>
143
- </inspection_tool>
144
- <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
145
- <option name="ignoredIdentifiers">
146
- <list>
147
- <option value="sentiment_classification.model_predictions.audio_path" />
148
- <option value="sentiment_classification.model_predictions.sample_rate" />
149
- <option value="sentiment_classification.model_predictions.num_samples" />
150
- </list>
151
- </option>
152
- </inspection_tool>
153
- </profile>
154
- </component>
VITS-fast-fine-tuning/.idea/inspectionProfiles/profiles_settings.xml DELETED
@@ -1,6 +0,0 @@
- <component name="InspectionProjectProfileManager">
- <settings>
- <option name="USE_PROJECT_PROFILE" value="false" />
- <version value="1.0" />
- </settings>
- </component>
VITS-fast-fine-tuning/.idea/misc.xml DELETED
@@ -1,4 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (VITS)" project-jdk-type="Python SDK" />
- </project>
VITS-fast-fine-tuning/.idea/modules.xml DELETED
@@ -1,8 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="ProjectModuleManager">
- <modules>
- <module fileurl="file://$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" filepath="$PROJECT_DIR$/.idea/VITS_voice_conversion.iml" />
- </modules>
- </component>
- </project>
VITS-fast-fine-tuning/.idea/vcs.xml DELETED
@@ -1,6 +0,0 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
- <component name="VcsDirectoryMappings">
- <mapping directory="$PROJECT_DIR$" vcs="Git" />
- </component>
- </project>
VITS-fast-fine-tuning/DATA.MD DELETED
@@ -1,42 +0,0 @@
- The pipeline of this repo supports several ways of uploading voice samples; simply pick whichever one (or several) matches the data you have.
-
- 1. Short audios packed in a single `.zip` file and organized by character name. The archive structure should look like this:
- ```
- Your-zip-file.zip
- ├───Character_name_1
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───Character_name_2
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───...
-
- └───Character_name_n
- ├───xxx.wav
- ├───...
- ├───yyy.mp3
- └───zzz.wav
- ```
- Note that the format and file names of the audio files do not matter, as long as they are audio files.
- Quality requirement: longer than 2 seconds, shorter than 10 seconds, with as little background noise as possible.
- Quantity requirement: at least 10 clips per character; 20+ clips per character is recommended.
- 2. Long audio files named after the character. Each audio must contain a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.wav`
- (e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`). They must be `.wav` files and no longer than 20 minutes (otherwise processing will run out of memory).
-
- 3. Long video files named after the character. Each video must contain a single speaker only; background sound will be removed automatically. Naming format: `{CharacterName}_{random_number}.mp4`
- (e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`). They must be `.mp4` files and no longer than 20 minutes (otherwise processing will run out of memory).
- Note: in the file name, `CharacterName` must consist of English characters only; `random_number` distinguishes multiple files of the same character, must be included, and can be any integer between 0 and 999999.
-
- 4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, formatted as follows:
- ```
- Char1|https://xyz.com/video1/
- Char2|https://xyz.com/video2/
- Char2|https://xyz.com/video3/
- Char3|https://xyz.com/video4/
- ```
- Each video must contain a single speaker only; background sound will be removed automatically. Currently only videos from bilibili are supported; URLs from other sites have not been tested.
- If you have any questions about the formats, sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
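A note on the requirements above: the clip count and the 2-10 second length are easy to verify locally before packing the `.zip`. The sketch below is not part of the repo; the folder name `custom_character_voice` is just an example of a directory laid out like the archive shown above, and decoding `.mp3` files with librosa requires ffmpeg/audioread to be available.
```
import os
import librosa

ROOT = "custom_character_voice"  # example folder mirroring the zip layout above

for character in sorted(os.listdir(ROOT)):
    folder = os.path.join(ROOT, character)
    if not os.path.isdir(folder):
        continue
    clips = [f for f in os.listdir(folder) if f.lower().endswith((".wav", ".mp3"))]
    outside_range = 0
    for clip in clips:
        # load at the native sampling rate just to measure the duration
        y, sr = librosa.load(os.path.join(folder, clip), sr=None)
        if not 2.0 <= len(y) / sr <= 10.0:
            outside_range += 1
    verdict = "OK" if len(clips) >= 10 else "fewer than 10 clips"
    print(f"{character}: {len(clips)} clips, {outside_range} outside 2-10 s -> {verdict}")
```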
VITS-fast-fine-tuning/DATA_EN.MD DELETED
@@ -1,46 +0,0 @@
- The pipeline of this repo supports multiple voice uploading options; you can choose one or more of them depending on the data you have.
-
- 1. Short audios packed in a single `.zip` file, whose file structure should be as shown below:
- ```
- Your-zip-file.zip
- ├───Character_name_1
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───Character_name_2
- ├ ├───xxx.wav
- ├ ├───...
- ├ ├───yyy.mp3
- ├ └───zzz.wav
- ├───...
-
- └───Character_name_n
- ├───xxx.wav
- ├───...
- ├───yyy.mp3
- └───zzz.wav
- ```
- Note that the format of the audio files does not matter as long as they are audio files.
- Quality requirement: >=2 s and <=10 s each, with as little background sound as possible.
- Quantity requirement: at least 10 per character; 20+ per character is recommended.
- 2. Long audio files named by character name, each of which should contain a single character's voice only. Background sound is
- acceptable since it will be removed automatically. File name format: `{CharacterName}_{random_number}.wav`
- (e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); they must be `.wav` files.
-
-
- 3. Long video files named by character name, each of which should contain a single character's voice only. Background sound is
- acceptable since it will be removed automatically. File name format: `{CharacterName}_{random_number}.mp4`
- (e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); they must be `.mp4` files.
- Note: `CharacterName` must consist of English characters only; `random_number` identifies multiple files for one character
- and must be included. It can be any integer between 0 and 999999.
-
- 4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, which should be formatted as follows:
- ```
- Char1|https://xyz.com/video1/
- Char2|https://xyz.com/video2/
- Char2|https://xyz.com/video3/
- Char3|https://xyz.com/video4/
- ```
- One video should contain a single speaker only. Currently only video links from bilibili are supported; other websites have not been tested yet.
- Questions about the data format? Sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
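The naming rules in options 2-4 above are the most common source of upload errors, so a quick local check can help. This is a minimal sketch, not part of the repo; the regex, the `./raw_uploads` folder and the `./video_list.txt` file name are illustrative assumptions.
```
import re
from pathlib import Path

# {CharacterName}_{random_number}.wav / .mp4, e.g. Diana_234135.wav
NAME_RE = re.compile(r"^[A-Za-z0-9]+_\d{1,6}\.(wav|mp4)$")

def check_media_names(folder):
    """Flag long audio/video files that do not follow the naming convention above."""
    for p in Path(folder).iterdir():
        if p.suffix.lower() in (".wav", ".mp4") and not NAME_RE.match(p.name):
            print(f"rename needed: {p.name}")

def check_video_list(txt_path):
    """Flag malformed lines in a {CharacterName}|{video_url} list (option 4)."""
    for i, line in enumerate(Path(txt_path).read_text(encoding="utf-8").splitlines(), 1):
        if not line.strip():
            continue
        parts = line.split("|")
        if len(parts) != 2 or not parts[0] or not parts[1].startswith("http"):
            print(f"line {i} looks malformed: {line!r}")

check_media_names("./raw_uploads")    # example folder
check_video_list("./video_list.txt")  # example file
```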
VITS-fast-fine-tuning/LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
VITS-fast-fine-tuning/README.md DELETED
@@ -1,55 +0,0 @@
- [For the Chinese documentation, please click here](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md)
- # VITS Fast Fine-tuning
- This repo will guide you through adding your own character voices, or even your own voice, to an existing VITS TTS model
- so that, in less than 1 hour, it can do the following tasks:
-
- 1. Many-to-many voice conversion between any characters you added & preset characters in the model.
- 2. English, Japanese & Chinese text-to-speech synthesis with the characters you added & preset characters.
-
-
- Welcome to play around with the base models!
- Chinese & English & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me
-
- Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
-
-
- ### Currently Supported Tasks:
- - [x] Clone a character's voice from 10+ short audios
- - [x] Clone a character's voice from long audio(s) >= 3 minutes (one audio should contain a single speaker only)
- - [x] Clone a character's voice from video(s) >= 3 minutes (one video should contain a single speaker only)
- - [x] Clone a character's voice from BILIBILI video links (one video should contain a single speaker only)
-
- ### Currently Supported Characters for TTS & VC:
- - [x] Any character you wish, as long as you have their voice samples!
- (Note that voice conversion can only be conducted between any two speakers in the model)
-
-
-
- ## Fine-tuning
- It's recommended to perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
- because the original VITS has some dependencies that are difficult to configure.
-
- ### How long does it take?
- 1. Install dependencies (3 min)
- 2. Choose a pretrained model to start from. The detailed differences between them are described in the [Colab Notebook](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
- 3. Upload the voice samples of the characters you wish to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD) for the detailed uploading options.
- 4. Start fine-tuning. The time taken varies from 20 minutes to 2 hours, depending on the number of voices you uploaded.
-
-
- ## Inference or Usage (currently supports Windows only)
- 0. Remember to download your fine-tuned model!
- 1. Download the latest release
- 2. Put your model & config file into the folder `inference`; they should be named `G_latest.pth` and `finetune_speaker.json`, respectively.
- 3. The file structure should be as follows:
- ```
- inference
- ├───inference.exe
- ├───...
- ├───finetune_speaker.json
- └───G_latest.pth
- ```
- 4. Run `inference.exe`; the browser should pop up automatically.
-
- ## Use in MoeGoe
- 0. Prepare the downloaded model & config file, named `G_latest.pth` and `moegoe_config.json`, respectively.
- 1. Follow the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page instructions to install it, configure the paths, and use it.
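A small optional check, not part of the release, that verifies the folder layout from step 3 before you run `inference.exe`:
```
from pathlib import Path

# Pre-flight check for the layout shown in step 3 above.
inference_dir = Path("inference")
expected = ["inference.exe", "finetune_speaker.json", "G_latest.pth"]
missing = [name for name in expected if not (inference_dir / name).exists()]
print("missing files:", missing if missing else "none -- ready to run inference.exe")
```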
VITS-fast-fine-tuning/README_ZH.md DELETED
@@ -1,60 +0,0 @@
- For the English documentation, please click [here](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README.md)
- # VITS Fast Fine-tuning
- This repo will guide you through adding custom characters (or even your own voice) to a pretrained VITS model; after less than an hour of fine-tuning, the model will be able to:
- 1. Perform voice conversion between any two speakers included in the model
- 2. Synthesize Chinese, Japanese and English text-to-speech with the voices of the characters you added.
-
- The base models used in this project cover common anime-style male/female voices (from the Genshin Impact dataset) as well as common real-world male/female voices (from the VCTK dataset). They support Chinese, Japanese and English, which ensures the model adapts to new voices quickly during fine-tuning.
-
- You are welcome to try out the base models used for fine-tuning!
-
- Chinese, Japanese & English: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: me
-
- Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
-
- ### Currently supported tasks:
- - [x] Clone a character's voice from 10+ short audio clips
- - [x] Clone a character's voice from 3+ minutes of long audio (each audio file must contain a single speaker only)
- - [x] Clone a character's voice from 3+ minutes of video (each video must contain a single speaker only)
- - [x] Clone a character's voice from bilibili video links (each video must contain a single speaker only)
-
- ### Characters currently supported for voice conversion and Chinese/Japanese/English TTS
- - [x] Any character (as long as you have voice samples of that character)
- (Note: voice conversion can only be performed between any two speakers that exist in the model)
-
-
-
-
- ## Fine-tuning
- It is recommended to run the fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing),
- because some of VITS's multilingual environment dependencies are quite difficult to configure.
- ### How long will it take on Google Colab?
- 1. Install dependencies (3 min)
- 2. Choose a pretrained model; see the [Colab notebook page](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing) for the detailed differences between them.
- 3. Upload the voice samples of the other characters you want to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA.MD) for the detailed upload options.
- 4. Run the fine-tuning. Depending on the chosen method and the number of samples, it may take anywhere from 20 minutes to 2 hours.
-
- When fine-tuning has finished, you can download the fine-tuned model directly and run it locally later (no GPU required).
-
- ## Running and inference locally
- 0. Remember to download the fine-tuned model and config file!
- 1. Download the latest release package (on the right side of the GitHub page)
- 2. Put the downloaded model and config file into the `inference` folder, named `G_latest.pth` and `finetune_speaker.json` respectively.
- 3. Once everything is in place, the file structure should look like this:
- ```
- inference
- ├───inference.exe
- ├───...
- ├───finetune_speaker.json
- └───G_latest.pth
- ```
- 4. Run `inference.exe`; a browser window will pop up automatically. Note that the path it is located in must not contain Chinese characters or spaces.
-
- ## Using the model in MoeGoe
- 0. MoeGoe and other similar VITS inference UIs use a slightly different config format; the files to download are the model `G_latest.pth` and the config file `moegoe_config.json`.
- 1. Follow the instructions on the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page to configure the paths, and it is ready to use.
- 2. When entering a sentence in MoeGoe, wrap it with the corresponding language tag so it can be synthesized correctly ([JA] for Japanese, [ZH] for Chinese, [EN] for English), for example:
- [JA]こんにちわ。[JA]
- [ZH]你好![ZH]
- [EN]Hello![EN]
-
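The language tags in step 2 above are the same markers the repo's own inference code uses internally (the `language_marks` dict in `VC_inference.py`; the "Mix" mode uses no marker). A tiny helper, shown here only as a sketch, makes the wrapping less error-prone:
```
# Same markers as the language_marks dict in VC_inference.py.
LANGUAGE_MARKS = {"日本語": "[JA]", "简体中文": "[ZH]", "English": "[EN]"}

def wrap_for_moegoe(text, language):
    mark = LANGUAGE_MARKS[language]
    return f"{mark}{text}{mark}"

print(wrap_for_moegoe("こんにちわ。", "日本語"))  # [JA]こんにちわ。[JA]
print(wrap_for_moegoe("你好!", "简体中文"))      # [ZH]你好![ZH]
print(wrap_for_moegoe("Hello!", "English"))       # [EN]Hello![EN]
```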
VITS-fast-fine-tuning/VC_inference.py DELETED
@@ -1,139 +0,0 @@
- import os
- import numpy as np
- import torch
- from torch import no_grad, LongTensor
- import argparse
- import commons
- from mel_processing import spectrogram_torch
- import utils
- from models import SynthesizerTrn
- import gradio as gr
- import librosa
- import webbrowser
-
- from text import text_to_sequence, _clean_text
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- language_marks = {
-     "Japanese": "",
-     "日本語": "[JA]",
-     "简体中文": "[ZH]",
-     "English": "[EN]",
-     "Mix": "",
- }
- lang = ['日本語', '简体中文', 'English', 'Mix']
- def get_text(text, hps, is_symbol):
-     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
-     if hps.data.add_blank:
-         text_norm = commons.intersperse(text_norm, 0)
-     text_norm = LongTensor(text_norm)
-     return text_norm
-
- def create_tts_fn(model, hps, speaker_ids):
-     def tts_fn(text, speaker, language, speed):
-         if language is not None:
-             text = language_marks[language] + text + language_marks[language]
-         speaker_id = speaker_ids[speaker]
-         stn_tst = get_text(text, hps, False)
-         with no_grad():
-             x_tst = stn_tst.unsqueeze(0).to(device)
-             x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
-             sid = LongTensor([speaker_id]).to(device)
-             audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
-                                 length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
-         del stn_tst, x_tst, x_tst_lengths, sid
-         return "Success", (hps.data.sampling_rate, audio)
-
-     return tts_fn
-
- def create_vc_fn(model, hps, speaker_ids):
-     def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
-         input_audio = record_audio if record_audio is not None else upload_audio
-         if input_audio is None:
-             return "You need to record or upload an audio", None
-         sampling_rate, audio = input_audio
-         original_speaker_id = speaker_ids[original_speaker]
-         target_speaker_id = speaker_ids[target_speaker]
-
-         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-         if len(audio.shape) > 1:
-             audio = librosa.to_mono(audio.transpose(1, 0))
-         if sampling_rate != hps.data.sampling_rate:
-             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
-         with no_grad():
-             y = torch.FloatTensor(audio)
-             y = y / max(-y.min(), y.max()) / 0.99
-             y = y.to(device)
-             y = y.unsqueeze(0)
-             spec = spectrogram_torch(y, hps.data.filter_length,
-                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                                      center=False).to(device)
-             spec_lengths = LongTensor([spec.size(-1)]).to(device)
-             sid_src = LongTensor([original_speaker_id]).to(device)
-             sid_tgt = LongTensor([target_speaker_id]).to(device)
-             audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
-                 0, 0].data.cpu().float().numpy()
-         del y, spec, spec_lengths, sid_src, sid_tgt
-         return "Success", (hps.data.sampling_rate, audio)
-
-     return vc_fn
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model_dir", default="./G_latest.pth", help="path to your fine-tuned model")
-     parser.add_argument("--config_dir", default="./finetune_speaker.json", help="path to your model config file")
-     parser.add_argument("--share", default=False, help="make link public (used in colab)")
-
-     args = parser.parse_args()
-     hps = utils.get_hparams_from_file(args.config_dir)
-
-
-     net_g = SynthesizerTrn(
-         len(hps.symbols),
-         hps.data.filter_length // 2 + 1,
-         hps.train.segment_size // hps.data.hop_length,
-         n_speakers=hps.data.n_speakers,
-         **hps.model).to(device)
-     _ = net_g.eval()
-
-     _ = utils.load_checkpoint(args.model_dir, net_g, None)
-     speaker_ids = hps.speakers
-     speakers = list(hps.speakers.keys())
-     tts_fn = create_tts_fn(net_g, hps, speaker_ids)
-     vc_fn = create_vc_fn(net_g, hps, speaker_ids)
-     app = gr.Blocks()
-     with app:
-         with gr.Tab("Text-to-Speech"):
-             with gr.Row():
-                 with gr.Column():
-                     textbox = gr.TextArea(label="Text",
-                                           placeholder="Type your sentence here",
-                                           value="こんにちわ。", elem_id=f"tts-input")
-                     # select character
-                     char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
-                     language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
-                     duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
-                                                 label='速度 Speed')
-                 with gr.Column():
-                     text_output = gr.Textbox(label="Message")
-                     audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
-                     btn = gr.Button("Generate!")
-                     btn.click(tts_fn,
-                               inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
-                               outputs=[text_output, audio_output])
-         with gr.Tab("Voice Conversion"):
-             gr.Markdown("""
-                         Record or upload your voice, then choose the target voice to convert to.
-             """)
-             with gr.Column():
-                 record_audio = gr.Audio(label="record your voice", source="microphone")
-                 upload_audio = gr.Audio(label="or upload audio here", source="upload")
-                 source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
-                 target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
-             with gr.Column():
-                 message_box = gr.Textbox(label="Message")
-                 converted_audio = gr.Audio(label='converted audio')
-             btn = gr.Button("Convert!")
-             btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
-                       outputs=[message_box, converted_audio])
-     webbrowser.open("http://127.0.0.1:7860")
-     app.launch(share=args.share)
-
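For completeness, the TTS path above can also be driven without the Gradio UI by repeating the loading steps from the `__main__` block. The sketch below assumes the repo's modules are importable and uses the same default file names (`finetune_speaker.json`, `G_latest.pth`); the speaker choice and output file name are arbitrary examples.
```
# Sketch: programmatic TTS with the functions above (assumes the repo files are importable).
import utils
from scipy.io import wavfile
from models import SynthesizerTrn
from VC_inference import create_tts_fn, device

hps = utils.get_hparams_from_file("./finetune_speaker.json")
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).to(device)
net_g.eval()
utils.load_checkpoint("./G_latest.pth", net_g, None)

tts_fn = create_tts_fn(net_g, hps, hps.speakers)
first_speaker = list(hps.speakers.keys())[0]
status, (sr, audio) = tts_fn("こんにちわ。", first_speaker, "日本語", speed=1.0)
wavfile.write("demo.wav", sr, audio)  # float32 samples at the model's sampling rate
```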
VITS-fast-fine-tuning/attentions.py DELETED
@@ -1,303 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
-
8
- import commons
9
- import modules
10
- from modules import LayerNorm
11
-
12
-
13
- class Encoder(nn.Module):
14
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15
- super().__init__()
16
- self.hidden_channels = hidden_channels
17
- self.filter_channels = filter_channels
18
- self.n_heads = n_heads
19
- self.n_layers = n_layers
20
- self.kernel_size = kernel_size
21
- self.p_dropout = p_dropout
22
- self.window_size = window_size
23
-
24
- self.drop = nn.Dropout(p_dropout)
25
- self.attn_layers = nn.ModuleList()
26
- self.norm_layers_1 = nn.ModuleList()
27
- self.ffn_layers = nn.ModuleList()
28
- self.norm_layers_2 = nn.ModuleList()
29
- for i in range(self.n_layers):
30
- self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
- self.norm_layers_1.append(LayerNorm(hidden_channels))
32
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33
- self.norm_layers_2.append(LayerNorm(hidden_channels))
34
-
35
- def forward(self, x, x_mask):
36
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37
- x = x * x_mask
38
- for i in range(self.n_layers):
39
- y = self.attn_layers[i](x, x, attn_mask)
40
- y = self.drop(y)
41
- x = self.norm_layers_1[i](x + y)
42
-
43
- y = self.ffn_layers[i](x, x_mask)
44
- y = self.drop(y)
45
- x = self.norm_layers_2[i](x + y)
46
- x = x * x_mask
47
- return x
48
-
49
-
50
- class Decoder(nn.Module):
51
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52
- super().__init__()
53
- self.hidden_channels = hidden_channels
54
- self.filter_channels = filter_channels
55
- self.n_heads = n_heads
56
- self.n_layers = n_layers
57
- self.kernel_size = kernel_size
58
- self.p_dropout = p_dropout
59
- self.proximal_bias = proximal_bias
60
- self.proximal_init = proximal_init
61
-
62
- self.drop = nn.Dropout(p_dropout)
63
- self.self_attn_layers = nn.ModuleList()
64
- self.norm_layers_0 = nn.ModuleList()
65
- self.encdec_attn_layers = nn.ModuleList()
66
- self.norm_layers_1 = nn.ModuleList()
67
- self.ffn_layers = nn.ModuleList()
68
- self.norm_layers_2 = nn.ModuleList()
69
- for i in range(self.n_layers):
70
- self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71
- self.norm_layers_0.append(LayerNorm(hidden_channels))
72
- self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73
- self.norm_layers_1.append(LayerNorm(hidden_channels))
74
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75
- self.norm_layers_2.append(LayerNorm(hidden_channels))
76
-
77
- def forward(self, x, x_mask, h, h_mask):
78
- """
79
- x: decoder input
80
- h: encoder output
81
- """
82
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84
- x = x * x_mask
85
- for i in range(self.n_layers):
86
- y = self.self_attn_layers[i](x, x, self_attn_mask)
87
- y = self.drop(y)
88
- x = self.norm_layers_0[i](x + y)
89
-
90
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91
- y = self.drop(y)
92
- x = self.norm_layers_1[i](x + y)
93
-
94
- y = self.ffn_layers[i](x, x_mask)
95
- y = self.drop(y)
96
- x = self.norm_layers_2[i](x + y)
97
- x = x * x_mask
98
- return x
99
-
100
-
101
- class MultiHeadAttention(nn.Module):
102
- def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103
- super().__init__()
104
- assert channels % n_heads == 0
105
-
106
- self.channels = channels
107
- self.out_channels = out_channels
108
- self.n_heads = n_heads
109
- self.p_dropout = p_dropout
110
- self.window_size = window_size
111
- self.heads_share = heads_share
112
- self.block_length = block_length
113
- self.proximal_bias = proximal_bias
114
- self.proximal_init = proximal_init
115
- self.attn = None
116
-
117
- self.k_channels = channels // n_heads
118
- self.conv_q = nn.Conv1d(channels, channels, 1)
119
- self.conv_k = nn.Conv1d(channels, channels, 1)
120
- self.conv_v = nn.Conv1d(channels, channels, 1)
121
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
122
- self.drop = nn.Dropout(p_dropout)
123
-
124
- if window_size is not None:
125
- n_heads_rel = 1 if heads_share else n_heads
126
- rel_stddev = self.k_channels**-0.5
127
- self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128
- self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129
-
130
- nn.init.xavier_uniform_(self.conv_q.weight)
131
- nn.init.xavier_uniform_(self.conv_k.weight)
132
- nn.init.xavier_uniform_(self.conv_v.weight)
133
- if proximal_init:
134
- with torch.no_grad():
135
- self.conv_k.weight.copy_(self.conv_q.weight)
136
- self.conv_k.bias.copy_(self.conv_q.bias)
137
-
138
- def forward(self, x, c, attn_mask=None):
139
- q = self.conv_q(x)
140
- k = self.conv_k(c)
141
- v = self.conv_v(c)
142
-
143
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
144
-
145
- x = self.conv_o(x)
146
- return x
147
-
148
- def attention(self, query, key, value, mask=None):
149
- # reshape [b, d, t] -> [b, n_h, t, d_k]
150
- b, d, t_s, t_t = (*key.size(), query.size(2))
151
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154
-
155
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156
- if self.window_size is not None:
157
- assert t_s == t_t, "Relative attention is only available for self-attention."
158
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159
- rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
160
- scores_local = self._relative_position_to_absolute_position(rel_logits)
161
- scores = scores + scores_local
162
- if self.proximal_bias:
163
- assert t_s == t_t, "Proximal bias is only available for self-attention."
164
- scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165
- if mask is not None:
166
- scores = scores.masked_fill(mask == 0, -1e4)
167
- if self.block_length is not None:
168
- assert t_s == t_t, "Local attention is only available for self-attention."
169
- block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170
- scores = scores.masked_fill(block_mask == 0, -1e4)
171
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172
- p_attn = self.drop(p_attn)
173
- output = torch.matmul(p_attn, value)
174
- if self.window_size is not None:
175
- relative_weights = self._absolute_position_to_relative_position(p_attn)
176
- value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177
- output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178
- output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179
- return output, p_attn
180
-
181
- def _matmul_with_relative_values(self, x, y):
182
- """
183
- x: [b, h, l, m]
184
- y: [h or 1, m, d]
185
- ret: [b, h, l, d]
186
- """
187
- ret = torch.matmul(x, y.unsqueeze(0))
188
- return ret
189
-
190
- def _matmul_with_relative_keys(self, x, y):
191
- """
192
- x: [b, h, l, d]
193
- y: [h or 1, m, d]
194
- ret: [b, h, l, m]
195
- """
196
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197
- return ret
198
-
199
- def _get_relative_embeddings(self, relative_embeddings, length):
200
- max_relative_position = 2 * self.window_size + 1
201
- # Pad first before slice to avoid using cond ops.
202
- pad_length = max(length - (self.window_size + 1), 0)
203
- slice_start_position = max((self.window_size + 1) - length, 0)
204
- slice_end_position = slice_start_position + 2 * length - 1
205
- if pad_length > 0:
206
- padded_relative_embeddings = F.pad(
207
- relative_embeddings,
208
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209
- else:
210
- padded_relative_embeddings = relative_embeddings
211
- used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212
- return used_relative_embeddings
213
-
214
- def _relative_position_to_absolute_position(self, x):
215
- """
216
- x: [b, h, l, 2*l-1]
217
- ret: [b, h, l, l]
218
- """
219
- batch, heads, length, _ = x.size()
220
- # Concat columns of pad to shift from relative to absolute indexing.
221
- x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222
-
223
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
224
- x_flat = x.view([batch, heads, length * 2 * length])
225
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226
-
227
- # Reshape and slice out the padded elements.
228
- x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229
- return x_final
230
-
231
- def _absolute_position_to_relative_position(self, x):
232
- """
233
- x: [b, h, l, l]
234
- ret: [b, h, l, 2*l-1]
235
- """
236
- batch, heads, length, _ = x.size()
237
- # padd along column
238
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239
- x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240
- # add 0's in the beginning that will skew the elements after reshape
241
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242
- x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243
- return x_final
244
-
245
- def _attention_bias_proximal(self, length):
246
- """Bias for self-attention to encourage attention to close positions.
247
- Args:
248
- length: an integer scalar.
249
- Returns:
250
- a Tensor with shape [1, 1, length, length]
251
- """
252
- r = torch.arange(length, dtype=torch.float32)
253
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255
-
256
-
257
- class FFN(nn.Module):
258
- def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259
- super().__init__()
260
- self.in_channels = in_channels
261
- self.out_channels = out_channels
262
- self.filter_channels = filter_channels
263
- self.kernel_size = kernel_size
264
- self.p_dropout = p_dropout
265
- self.activation = activation
266
- self.causal = causal
267
-
268
- if causal:
269
- self.padding = self._causal_padding
270
- else:
271
- self.padding = self._same_padding
272
-
273
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275
- self.drop = nn.Dropout(p_dropout)
276
-
277
- def forward(self, x, x_mask):
278
- x = self.conv_1(self.padding(x * x_mask))
279
- if self.activation == "gelu":
280
- x = x * torch.sigmoid(1.702 * x)
281
- else:
282
- x = torch.relu(x)
283
- x = self.drop(x)
284
- x = self.conv_2(self.padding(x * x_mask))
285
- return x * x_mask
286
-
287
- def _causal_padding(self, x):
288
- if self.kernel_size == 1:
289
- return x
290
- pad_l = self.kernel_size - 1
291
- pad_r = 0
292
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293
- x = F.pad(x, commons.convert_pad_shape(padding))
294
- return x
295
-
296
- def _same_padding(self, x):
297
- if self.kernel_size == 1:
298
- return x
299
- pad_l = (self.kernel_size - 1) // 2
300
- pad_r = self.kernel_size // 2
301
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302
- x = F.pad(x, commons.convert_pad_shape(padding))
303
- return x
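As a quick sanity check of the `Encoder` defined above, the following snippet pushes a dummy batch through it. It assumes the repo's `commons.py` and `modules.py` are importable next to `attentions.py`; the hyperparameter values are representative choices for illustration, not taken from any config in this commit.
```
import torch
from attentions import Encoder  # requires the repo's commons.py and modules.py

# Representative (assumed) hyperparameters, just for a shape check.
enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1)
enc.eval()

x = torch.randn(1, 192, 50)    # [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 50)  # all frames valid

with torch.no_grad():
    y = enc(x, x_mask)
print(y.shape)  # torch.Size([1, 192, 50]) -- the [b, d, t] layout is preserved
```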
VITS-fast-fine-tuning/cmd_inference.py DELETED
@@ -1,106 +0,0 @@
1
- """该模块用于生成VITS文件
2
- 使用方法
3
-
4
- python cmd_inference.py -m 模型路径 -c 配置文件路径 -o 输出文件路径 -l 输入的语言 -t 输入文本 -s 合成目标说话人名称
5
-
6
- 可选参数
7
- -ns 感情变化程度
8
- -nsw 音素发音长度
9
- -ls 整体语速
10
- -on 输出文件的名称
11
-
12
- """
13
-
14
- from pathlib import Path
15
- import utils
16
- from models import SynthesizerTrn
17
- import torch
18
- from torch import no_grad, LongTensor
19
- import librosa
20
- from text import text_to_sequence, _clean_text
21
- import commons
22
- import scipy.io.wavfile as wavf
23
- import os
24
-
25
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
26
-
27
- language_marks = {
28
- "Japanese": "",
29
- "日本語": "[JA]",
30
- "简体中文": "[ZH]",
31
- "English": "[EN]",
32
- "Mix": "",
33
- }
34
-
35
-
36
- def get_text(text, hps, is_symbol):
37
- text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
38
- if hps.data.add_blank:
39
- text_norm = commons.intersperse(text_norm, 0)
40
- text_norm = LongTensor(text_norm)
41
- return text_norm
42
-
43
-
44
-
45
- if __name__ == "__main__":
46
- import argparse
47
-
48
- parser = argparse.ArgumentParser(description='vits inference')
49
- #必须参数
50
- parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径')
51
- parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径')
52
- parser.add_argument('-o', '--output_path', type=str, default="output/vits", help='输出文件路径')
53
- parser.add_argument('-l', '--language', type=str, default="日本語", help='输入的语言')
54
- parser.add_argument('-t', '--text', type=str, help='输入文本')
55
- parser.add_argument('-s', '--spk', type=str, help='合成目标说话人名称')
56
- #可选参数
57
- parser.add_argument('-on', '--output_name', type=str, default="output", help='输出文件的名称')
58
- parser.add_argument('-ns', '--noise_scale', type=float,default= .667,help='感情变化程度')
59
- parser.add_argument('-nsw', '--noise_scale_w', type=float,default=0.6, help='音素发音长度')
60
- parser.add_argument('-ls', '--length_scale', type=float,default=1, help='整体语速')
61
-
62
- args = parser.parse_args()
63
-
64
- model_path = args.model_path
65
- config_path = args.config_path
66
- output_dir = Path(args.output_path)
67
- output_dir.mkdir(parents=True, exist_ok=True)
68
-
69
- language = args.language
70
- text = args.text
71
- spk = args.spk
72
- noise_scale = args.noise_scale
73
- noise_scale_w = args.noise_scale_w
74
- length = args.length_scale
75
- output_name = args.output_name
76
-
77
- hps = utils.get_hparams_from_file(config_path)
78
- net_g = SynthesizerTrn(
79
- len(hps.symbols),
80
- hps.data.filter_length // 2 + 1,
81
- hps.train.segment_size // hps.data.hop_length,
82
- n_speakers=hps.data.n_speakers,
83
- **hps.model).to(device)
84
- _ = net_g.eval()
85
- _ = utils.load_checkpoint(model_path, net_g, None)
86
-
87
- speaker_ids = hps.speakers
88
-
89
-
90
- if language is not None:
91
- text = language_marks[language] + text + language_marks[language]
92
- speaker_id = speaker_ids[spk]
93
- stn_tst = get_text(text, hps, False)
94
- with no_grad():
95
- x_tst = stn_tst.unsqueeze(0).to(device)
96
- x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
97
- sid = LongTensor([speaker_id]).to(device)
98
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
99
- length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
100
- del stn_tst, x_tst, x_tst_lengths, sid
101
-
102
- wavf.write(str(output_dir)+"/"+output_name+".wav",hps.data.sampling_rate,audio)
103
-
104
-
105
-
106
-
VITS-fast-fine-tuning/commons.py DELETED
@@ -1,164 +0,0 @@
1
- import math
2
- import numpy as np
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
-
8
- def init_weights(m, mean=0.0, std=0.01):
9
- classname = m.__class__.__name__
10
- if classname.find("Conv") != -1:
11
- m.weight.data.normal_(mean, std)
12
-
13
-
14
- def get_padding(kernel_size, dilation=1):
15
- return int((kernel_size*dilation - dilation)/2)
16
-
17
-
18
- def convert_pad_shape(pad_shape):
19
- l = pad_shape[::-1]
20
- pad_shape = [item for sublist in l for item in sublist]
21
- return pad_shape
22
-
23
-
24
- def intersperse(lst, item):
25
- result = [item] * (len(lst) * 2 + 1)
26
- result[1::2] = lst
27
- return result
28
-
29
-
30
- def kl_divergence(m_p, logs_p, m_q, logs_q):
31
- """KL(P||Q)"""
32
- kl = (logs_q - logs_p) - 0.5
33
- kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
- return kl
35
-
36
-
37
- def rand_gumbel(shape):
38
- """Sample from the Gumbel distribution, protect from overflows."""
39
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
- return -torch.log(-torch.log(uniform_samples))
41
-
42
-
43
- def rand_gumbel_like(x):
44
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
- return g
46
-
47
-
48
- def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- try:
54
- ret[i] = x[i, :, idx_str:idx_end]
55
- except RuntimeError:
56
- print("?")
57
- return ret
58
-
59
-
60
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
61
- b, d, t = x.size()
62
- if x_lengths is None:
63
- x_lengths = t
64
- ids_str_max = x_lengths - segment_size + 1
65
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
66
- ret = slice_segments(x, ids_str, segment_size)
67
- return ret, ids_str
68
-
69
-
70
- def get_timing_signal_1d(
71
- length, channels, min_timescale=1.0, max_timescale=1.0e4):
72
- position = torch.arange(length, dtype=torch.float)
73
- num_timescales = channels // 2
74
- log_timescale_increment = (
75
- math.log(float(max_timescale) / float(min_timescale)) /
76
- (num_timescales - 1))
77
- inv_timescales = min_timescale * torch.exp(
78
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
79
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
80
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
81
- signal = F.pad(signal, [0, 0, 0, channels % 2])
82
- signal = signal.view(1, channels, length)
83
- return signal
84
-
85
-
86
- def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
87
- b, channels, length = x.size()
88
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
89
- return x + signal.to(dtype=x.dtype, device=x.device)
90
-
91
-
92
- def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
93
- b, channels, length = x.size()
94
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
95
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
96
-
97
-
98
- def subsequent_mask(length):
99
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
100
- return mask
101
-
102
-
103
- @torch.jit.script
104
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
105
- n_channels_int = n_channels[0]
106
- in_act = input_a + input_b
107
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
108
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
109
- acts = t_act * s_act
110
- return acts
111
-
112
-
113
- def convert_pad_shape(pad_shape):
114
- l = pad_shape[::-1]
115
- pad_shape = [item for sublist in l for item in sublist]
116
- return pad_shape
117
-
118
-
119
- def shift_1d(x):
120
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
121
- return x
122
-
123
-
124
- def sequence_mask(length, max_length=None):
125
- if max_length is None:
126
- max_length = length.max()
127
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
128
- return x.unsqueeze(0) < length.unsqueeze(1)
129
-
130
-
131
- def generate_path(duration, mask):
132
- """
133
- duration: [b, 1, t_x]
134
- mask: [b, 1, t_y, t_x]
135
- """
136
- device = duration.device
137
-
138
- b, _, t_y, t_x = mask.shape
139
- cum_duration = torch.cumsum(duration, -1)
140
-
141
- cum_duration_flat = cum_duration.view(b * t_x)
142
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
143
- path = path.view(b, t_x, t_y)
144
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
145
- path = path.unsqueeze(1).transpose(2,3) * mask
146
- return path
147
-
148
-
149
- def clip_grad_value_(parameters, clip_value, norm_type=2):
150
- if isinstance(parameters, torch.Tensor):
151
- parameters = [parameters]
152
- parameters = list(filter(lambda p: p.grad is not None, parameters))
153
- norm_type = float(norm_type)
154
- if clip_value is not None:
155
- clip_value = float(clip_value)
156
-
157
- total_norm = 0
158
- for p in parameters:
159
- param_norm = p.grad.data.norm(norm_type)
160
- total_norm += param_norm.item() ** norm_type
161
- if clip_value is not None:
162
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
163
- total_norm = total_norm ** (1. / norm_type)
164
- return total_norm
VITS-fast-fine-tuning/configs/modified_finetune_speaker.json DELETED
@@ -1,172 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 10,
4
- "eval_interval": 100,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0002,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 16,
14
- "fp16_run": true,
15
- "lr_decay": 0.999875,
16
- "segment_size": 8192,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0
21
- },
22
- "data": {
23
- "training_files": "final_annotation_train.txt",
24
- "validation_files": "final_annotation_val.txt",
25
- "text_cleaners": [
26
- "chinese_cleaners"
27
- ],
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 22050,
30
- "filter_length": 1024,
31
- "hop_length": 256,
32
- "win_length": 1024,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": null,
36
- "add_blank": true,
37
- "n_speakers": 2,
38
- "cleaned_text": true
39
- },
40
- "model": {
41
- "inter_channels": 192,
42
- "hidden_channels": 192,
43
- "filter_channels": 768,
44
- "n_heads": 2,
45
- "n_layers": 6,
46
- "kernel_size": 3,
47
- "p_dropout": 0.1,
48
- "resblock": "1",
49
- "resblock_kernel_sizes": [
50
- 3,
51
- 7,
52
- 11
53
- ],
54
- "resblock_dilation_sizes": [
55
- [
56
- 1,
57
- 3,
58
- 5
59
- ],
60
- [
61
- 1,
62
- 3,
63
- 5
64
- ],
65
- [
66
- 1,
67
- 3,
68
- 5
69
- ]
70
- ],
71
- "upsample_rates": [
72
- 8,
73
- 8,
74
- 2,
75
- 2
76
- ],
77
- "upsample_initial_channel": 512,
78
- "upsample_kernel_sizes": [
79
- 16,
80
- 16,
81
- 4,
82
- 4
83
- ],
84
- "n_layers_q": 3,
85
- "use_spectral_norm": false,
86
- "gin_channels": 256
87
- },
88
- "symbols": [
89
- "_",
90
- "\uff1b",
91
- "\uff1a",
92
- "\uff0c",
93
- "\u3002",
94
- "\uff01",
95
- "\uff1f",
96
- "-",
97
- "\u201c",
98
- "\u201d",
99
- "\u300a",
100
- "\u300b",
101
- "\u3001",
102
- "\uff08",
103
- "\uff09",
104
- "\u2026",
105
- "\u2014",
106
- " ",
107
- "A",
108
- "B",
109
- "C",
110
- "D",
111
- "E",
112
- "F",
113
- "G",
114
- "H",
115
- "I",
116
- "J",
117
- "K",
118
- "L",
119
- "M",
120
- "N",
121
- "O",
122
- "P",
123
- "Q",
124
- "R",
125
- "S",
126
- "T",
127
- "U",
128
- "V",
129
- "W",
130
- "X",
131
- "Y",
132
- "Z",
133
- "a",
134
- "b",
135
- "c",
136
- "d",
137
- "e",
138
- "f",
139
- "g",
140
- "h",
141
- "i",
142
- "j",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "o",
148
- "p",
149
- "q",
150
- "r",
151
- "s",
152
- "t",
153
- "u",
154
- "v",
155
- "w",
156
- "x",
157
- "y",
158
- "z",
159
- "1",
160
- "2",
161
- "3",
162
- "4",
163
- "5",
164
- "0",
165
- "\uff22",
166
- "\uff30"
167
- ],
168
- "speakers": {
169
- "dingzhen": 0,
170
- "taffy": 1
171
- }
172
- }
VITS-fast-fine-tuning/configs/uma_trilingual.json DELETED
@@ -1,54 +0,0 @@
- {
-   "train": {
-     "log_interval": 200,
-     "eval_interval": 1000,
-     "seed": 1234,
-     "epochs": 10000,
-     "learning_rate": 2e-4,
-     "betas": [0.8, 0.99],
-     "eps": 1e-9,
-     "batch_size": 16,
-     "fp16_run": true,
-     "lr_decay": 0.999875,
-     "segment_size": 8192,
-     "init_lr_ratio": 1,
-     "warmup_epochs": 0,
-     "c_mel": 45,
-     "c_kl": 1.0
-   },
-   "data": {
-     "training_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.train.txt.cleaned",
-     "validation_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.val.txt.cleaned",
-     "text_cleaners":["cjke_cleaners2"],
-     "max_wav_value": 32768.0,
-     "sampling_rate": 22050,
-     "filter_length": 1024,
-     "hop_length": 256,
-     "win_length": 1024,
-     "n_mel_channels": 80,
-     "mel_fmin": 0.0,
-     "mel_fmax": null,
-     "add_blank": true,
-     "n_speakers": 999,
-     "cleaned_text": true
-   },
-   "model": {
-     "inter_channels": 192,
-     "hidden_channels": 192,
-     "filter_channels": 768,
-     "n_heads": 2,
-     "n_layers": 6,
-     "kernel_size": 3,
-     "p_dropout": 0.1,
-     "resblock": "1",
-     "resblock_kernel_sizes": [3,7,11],
-     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-     "upsample_rates": [8,8,2,2],
-     "upsample_initial_channel": 512,
-     "upsample_kernel_sizes": [16,16,4,4],
-     "n_layers_q": 3,
-     "use_spectral_norm": false,
-     "gin_channels": 256
-   },
-   "symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
- }
VITS-fast-fine-tuning/data_utils.py DELETED
@@ -1,267 +0,0 @@
1
- import time
2
- import os
3
- import random
4
- import numpy as np
5
- import torch
6
- import torch.utils.data
7
- import torchaudio
8
-
9
- import commons
10
- from mel_processing import spectrogram_torch
11
- from utils import load_wav_to_torch, load_filepaths_and_text
12
- from text import text_to_sequence, cleaned_text_to_sequence
13
- """Multi speaker version"""
14
-
15
-
16
- class TextAudioSpeakerLoader(torch.utils.data.Dataset):
17
- """
18
- 1) loads audio, speaker_id, text pairs
19
- 2) normalizes text and converts them to sequences of integers
20
- 3) computes spectrograms from audio files.
21
- """
22
-
23
- def __init__(self, audiopaths_sid_text, hparams, symbols):
24
- self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
25
- self.text_cleaners = hparams.text_cleaners
26
- self.max_wav_value = hparams.max_wav_value
27
- self.sampling_rate = hparams.sampling_rate
28
- self.filter_length = hparams.filter_length
29
- self.hop_length = hparams.hop_length
30
- self.win_length = hparams.win_length
31
- self.sampling_rate = hparams.sampling_rate
32
-
33
- self.cleaned_text = getattr(hparams, "cleaned_text", False)
34
-
35
- self.add_blank = hparams.add_blank
36
- self.min_text_len = getattr(hparams, "min_text_len", 1)
37
- self.max_text_len = getattr(hparams, "max_text_len", 190)
38
- self.symbols = symbols
39
-
40
- random.seed(1234)
41
- random.shuffle(self.audiopaths_sid_text)
42
- self._filter()
43
-
44
- def _filter(self):
45
- """
46
- Filter text & store spec lengths
47
- """
48
- # Store spectrogram lengths for Bucketing
49
- # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
50
- # spec_length = wav_length // hop_length
51
-
52
- audiopaths_sid_text_new = []
53
- lengths = []
54
- for audiopath, sid, text in self.audiopaths_sid_text:
55
- # audiopath = "./user_voice/" + audiopath
56
-
57
- if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
58
- audiopaths_sid_text_new.append([audiopath, sid, text])
59
- lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
60
- self.audiopaths_sid_text = audiopaths_sid_text_new
61
- self.lengths = lengths
62
-
63
- def get_audio_text_speaker_pair(self, audiopath_sid_text):
64
- # separate filename, speaker_id and text
65
- audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
66
- text = self.get_text(text)
67
- spec, wav = self.get_audio(audiopath)
68
- sid = self.get_sid(sid)
69
- return (text, spec, wav, sid)
70
-
71
- def get_audio(self, filename):
72
- # audio, sampling_rate = load_wav_to_torch(filename)
73
- # if sampling_rate != self.sampling_rate:
74
- # raise ValueError("{} {} SR doesn't match target {} SR".format(
75
- # sampling_rate, self.sampling_rate))
76
- # audio_norm = audio / self.max_wav_value if audio.max() > 10 else audio
77
- # audio_norm = audio_norm.unsqueeze(0)
78
- audio_norm, sampling_rate = torchaudio.load(filename, frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
79
- # spec_filename = filename.replace(".wav", ".spec.pt")
80
- # if os.path.exists(spec_filename):
81
- # spec = torch.load(spec_filename)
82
- # else:
83
- # try:
84
- spec = spectrogram_torch(audio_norm, self.filter_length,
85
- self.sampling_rate, self.hop_length, self.win_length,
86
- center=False)
87
- spec = spec.squeeze(0)
88
- # except NotImplementedError:
89
- # print("?")
90
- # spec = torch.squeeze(spec, 0)
91
- # torch.save(spec, spec_filename)
92
- return spec, audio_norm
93
-
94
- def get_text(self, text):
95
- if self.cleaned_text:
96
- text_norm = cleaned_text_to_sequence(text, self.symbols)
97
- else:
98
- text_norm = text_to_sequence(text, self.text_cleaners)
99
- if self.add_blank:
100
- text_norm = commons.intersperse(text_norm, 0)
101
- text_norm = torch.LongTensor(text_norm)
102
- return text_norm
103
-
104
- def get_sid(self, sid):
105
- sid = torch.LongTensor([int(sid)])
106
- return sid
107
-
108
- def __getitem__(self, index):
109
- return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
110
-
111
- def __len__(self):
112
- return len(self.audiopaths_sid_text)
113
-
114
-
115
- class TextAudioSpeakerCollate():
116
- """ Zero-pads model inputs and targets
117
- """
118
-
119
- def __init__(self, return_ids=False):
120
- self.return_ids = return_ids
121
-
122
- def __call__(self, batch):
123
- """Collate's training batch from normalized text, audio and speaker identities
124
- PARAMS
125
- ------
126
- batch: [text_normalized, spec_normalized, wav_normalized, sid]
127
- """
128
- # Right zero-pad all one-hot text sequences to max input length
129
- _, ids_sorted_decreasing = torch.sort(
130
- torch.LongTensor([x[1].size(1) for x in batch]),
131
- dim=0, descending=True)
132
-
133
- max_text_len = max([len(x[0]) for x in batch])
134
- max_spec_len = max([x[1].size(1) for x in batch])
135
- max_wav_len = max([x[2].size(1) for x in batch])
136
-
137
- text_lengths = torch.LongTensor(len(batch))
138
- spec_lengths = torch.LongTensor(len(batch))
139
- wav_lengths = torch.LongTensor(len(batch))
140
- sid = torch.LongTensor(len(batch))
141
-
142
- text_padded = torch.LongTensor(len(batch), max_text_len)
143
- spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
144
- wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
145
- text_padded.zero_()
146
- spec_padded.zero_()
147
- wav_padded.zero_()
148
- for i in range(len(ids_sorted_decreasing)):
149
- row = batch[ids_sorted_decreasing[i]]
150
-
151
- text = row[0]
152
- text_padded[i, :text.size(0)] = text
153
- text_lengths[i] = text.size(0)
154
-
155
- spec = row[1]
156
- spec_padded[i, :, :spec.size(1)] = spec
157
- spec_lengths[i] = spec.size(1)
158
-
159
- wav = row[2]
160
- wav_padded[i, :, :wav.size(1)] = wav
161
- wav_lengths[i] = wav.size(1)
162
-
163
- sid[i] = row[3]
164
-
165
- if self.return_ids:
166
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
167
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
168
-
169
-
170
- class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
171
- """
172
- Maintain similar input lengths in a batch.
173
- Length groups are specified by boundaries.
174
- Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
175
-
176
- It removes samples which are not included in the boundaries.
177
- Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
178
- """
179
-
180
- def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
181
- super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
182
- self.lengths = dataset.lengths
183
- self.batch_size = batch_size
184
- self.boundaries = boundaries
185
-
186
- self.buckets, self.num_samples_per_bucket = self._create_buckets()
187
- self.total_size = sum(self.num_samples_per_bucket)
188
- self.num_samples = self.total_size // self.num_replicas
189
-
190
- def _create_buckets(self):
191
- buckets = [[] for _ in range(len(self.boundaries) - 1)]
192
- for i in range(len(self.lengths)):
193
- length = self.lengths[i]
194
- idx_bucket = self._bisect(length)
195
- if idx_bucket != -1:
196
- buckets[idx_bucket].append(i)
197
-
198
- for i in range(len(buckets) - 1, 0, -1):
199
- if len(buckets[i]) == 0:
200
- buckets.pop(i)
201
- self.boundaries.pop(i + 1)
202
-
203
- num_samples_per_bucket = []
204
- for i in range(len(buckets)):
205
- len_bucket = len(buckets[i])
206
- total_batch_size = self.num_replicas * self.batch_size
207
- rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
208
- num_samples_per_bucket.append(len_bucket + rem)
209
- return buckets, num_samples_per_bucket
210
-
211
- def __iter__(self):
212
- # deterministically shuffle based on epoch
213
- g = torch.Generator()
214
- g.manual_seed(self.epoch)
215
-
216
- indices = []
217
- if self.shuffle:
218
- for bucket in self.buckets:
219
- indices.append(torch.randperm(len(bucket), generator=g).tolist())
220
- else:
221
- for bucket in self.buckets:
222
- indices.append(list(range(len(bucket))))
223
-
224
- batches = []
225
- for i in range(len(self.buckets)):
226
- bucket = self.buckets[i]
227
- len_bucket = len(bucket)
228
- ids_bucket = indices[i]
229
- num_samples_bucket = self.num_samples_per_bucket[i]
230
-
231
- # add extra samples to make it evenly divisible
232
- rem = num_samples_bucket - len_bucket
233
- ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
234
-
235
- # subsample
236
- ids_bucket = ids_bucket[self.rank::self.num_replicas]
237
-
238
- # batching
239
- for j in range(len(ids_bucket) // self.batch_size):
240
- batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
241
- batches.append(batch)
242
-
243
- if self.shuffle:
244
- batch_ids = torch.randperm(len(batches), generator=g).tolist()
245
- batches = [batches[i] for i in batch_ids]
246
- self.batches = batches
247
-
248
- assert len(self.batches) * self.batch_size == self.num_samples
249
- return iter(self.batches)
250
-
251
- def _bisect(self, x, lo=0, hi=None):
252
- if hi is None:
253
- hi = len(self.boundaries) - 1
254
-
255
- if hi > lo:
256
- mid = (hi + lo) // 2
257
- if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
258
- return mid
259
- elif x <= self.boundaries[mid]:
260
- return self._bisect(x, lo, mid)
261
- else:
262
- return self._bisect(x, mid + 1, hi)
263
- else:
264
- return -1
265
-
266
- def __len__(self):
267
- return self.num_samples // self.batch_size
VITS-fast-fine-tuning/denoise_audio.py DELETED
@@ -1,18 +0,0 @@
- import os
- import torchaudio
- raw_audio_dir = "./raw_audio/"
- denoise_audio_dir = "./denoised_audio/"
- filelist = list(os.walk(raw_audio_dir))[0][2]
-
- for file in filelist:
-     if file.endswith(".wav"):
-         os.system(f"demucs --two-stems=vocals {raw_audio_dir}{file}")
- for file in filelist:
-     file = file.replace(".wav", "")
-     wav, sr = torchaudio.load(f"./separated/htdemucs/{file}/vocals.wav", frame_offset=0, num_frames=-1, normalize=True,
-                               channels_first=True)
-     # merge two channels into one
-     wav = wav.mean(dim=0).unsqueeze(0)
-     if sr != 22050:
-         wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
-     torchaudio.save(denoise_audio_dir + file + ".wav", wav, 22050, channels_first=True)
VITS-fast-fine-tuning/download_model.py DELETED
@@ -1,4 +0,0 @@
- from google.colab import files
- files.download("./G_latest.pth")
- files.download("./finetune_speaker.json")
- files.download("./moegoe_config.json")
VITS-fast-fine-tuning/download_video.py DELETED
@@ -1,37 +0,0 @@
- import os
- import random
- import shutil
- from concurrent.futures import ThreadPoolExecutor
- from google.colab import files
-
- basepath = os.getcwd()
- uploaded = files.upload()  # upload files
- for filename in uploaded.keys():
-     assert (filename.endswith(".txt")), "speaker-videolink info could only be .txt file!"
-     shutil.move(os.path.join(basepath, filename), os.path.join("./speaker_links.txt"))
-
-
- def generate_infos():
-     infos = []
-     with open("./speaker_links.txt", 'r', encoding='utf-8') as f:
-         lines = f.readlines()
-     for line in lines:
-         line = line.replace("\n", "").replace(" ", "")
-         if line == "":
-             continue
-         speaker, link = line.split("|")
-         filename = speaker + "_" + str(random.randint(0, 1000000))
-         infos.append({"link": link, "filename": filename})
-     return infos
-
-
- def download_video(info):
-     link = info["link"]
-     filename = info["filename"]
-     os.system(f"youtube-dl -f 0 {link} -o ./video_data/{filename}.mp4")
-
-
- if __name__ == "__main__":
-     infos = generate_infos()
-     with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
-         executor.map(download_video, infos)
VITS-fast-fine-tuning/finetune_speaker_v2.py DELETED
@@ -1,321 +0,0 @@
1
- import os
2
- import json
3
- import argparse
4
- import itertools
5
- import math
6
- import torch
7
- from torch import nn, optim
8
- from torch.nn import functional as F
9
- from torch.utils.data import DataLoader
10
- from torch.utils.tensorboard import SummaryWriter
11
- import torch.multiprocessing as mp
12
- import torch.distributed as dist
13
- from torch.nn.parallel import DistributedDataParallel as DDP
14
- from torch.cuda.amp import autocast, GradScaler
15
- from tqdm import tqdm
16
-
17
- import librosa
18
- import logging
19
-
20
- logging.getLogger('numba').setLevel(logging.WARNING)
21
-
22
- import commons
23
- import utils
24
- from data_utils import (
25
- TextAudioSpeakerLoader,
26
- TextAudioSpeakerCollate,
27
- DistributedBucketSampler
28
- )
29
- from models import (
30
- SynthesizerTrn,
31
- MultiPeriodDiscriminator,
32
- )
33
- from losses import (
34
- generator_loss,
35
- discriminator_loss,
36
- feature_loss,
37
- kl_loss
38
- )
39
- from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
40
-
41
-
42
- torch.backends.cudnn.benchmark = True
43
- global_step = 0
44
-
45
-
46
- def main():
47
- """Assume Single Node Multi GPUs Training Only"""
48
- assert torch.cuda.is_available(), "CPU training is not allowed."
49
-
50
- n_gpus = torch.cuda.device_count()
51
- os.environ['MASTER_ADDR'] = 'localhost'
52
- os.environ['MASTER_PORT'] = '8000'
53
-
54
- hps = utils.get_hparams()
55
- mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
56
-
57
-
58
- def run(rank, n_gpus, hps):
59
- global global_step
60
- symbols = hps['symbols']
61
- if rank == 0:
62
- logger = utils.get_logger(hps.model_dir)
63
- logger.info(hps)
64
- utils.check_git_hash(hps.model_dir)
65
- writer = SummaryWriter(log_dir=hps.model_dir)
66
- writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
67
-
68
- # Use gloo backend on Windows for Pytorch
69
- dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
70
- torch.manual_seed(hps.train.seed)
71
- torch.cuda.set_device(rank)
72
-
73
- train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data, symbols)
74
- train_sampler = DistributedBucketSampler(
75
- train_dataset,
76
- hps.train.batch_size,
77
- [32,300,400,500,600,700,800,900,1000],
78
- num_replicas=n_gpus,
79
- rank=rank,
80
- shuffle=True)
81
- collate_fn = TextAudioSpeakerCollate()
82
- train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, pin_memory=True,
83
- collate_fn=collate_fn, batch_sampler=train_sampler)
84
- # train_loader = DataLoader(train_dataset, batch_size=hps.train.batch_size, num_workers=2, shuffle=False, pin_memory=True,
85
- # collate_fn=collate_fn)
86
- if rank == 0:
87
- eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, symbols)
88
- eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
89
- batch_size=hps.train.batch_size, pin_memory=True,
90
- drop_last=False, collate_fn=collate_fn)
91
-
92
- net_g = SynthesizerTrn(
93
- len(symbols),
94
- hps.data.filter_length // 2 + 1,
95
- hps.train.segment_size // hps.data.hop_length,
96
- n_speakers=hps.data.n_speakers,
97
- **hps.model).cuda(rank)
98
- net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
99
-
100
- # load existing model
101
- _, _, _, _ = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None, drop_speaker_emb=hps.drop_speaker_embed)
102
- _, _, _, _ = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
103
- epoch_str = 1
104
- global_step = 0
105
- # freeze all other layers except speaker embedding
106
- for p in net_g.parameters():
107
- p.requires_grad = True
108
- for p in net_d.parameters():
109
- p.requires_grad = True
110
- # for p in net_d.parameters():
111
- # p.requires_grad = False
112
- # net_g.emb_g.weight.requires_grad = True
113
- optim_g = torch.optim.AdamW(
114
- net_g.parameters(),
115
- hps.train.learning_rate,
116
- betas=hps.train.betas,
117
- eps=hps.train.eps)
118
- optim_d = torch.optim.AdamW(
119
- net_d.parameters(),
120
- hps.train.learning_rate,
121
- betas=hps.train.betas,
122
- eps=hps.train.eps)
123
- # optim_d = None
124
- net_g = DDP(net_g, device_ids=[rank])
125
- net_d = DDP(net_d, device_ids=[rank])
126
-
127
- scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay)
128
- scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay)
129
-
130
- scaler = GradScaler(enabled=hps.train.fp16_run)
131
-
132
- for epoch in range(epoch_str, hps.train.epochs + 1):
133
- if rank==0:
134
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
135
- else:
136
- train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
137
- scheduler_g.step()
138
- scheduler_d.step()
139
-
140
-
141
- def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
142
- net_g, net_d = nets
143
- optim_g, optim_d = optims
144
- scheduler_g, scheduler_d = schedulers
145
- train_loader, eval_loader = loaders
146
- if writers is not None:
147
- writer, writer_eval = writers
148
-
149
- # train_loader.batch_sampler.set_epoch(epoch)
150
- global global_step
151
-
152
- net_g.train()
153
- net_d.train()
154
- for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(tqdm(train_loader)):
155
- x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
156
- spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
157
- y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
158
- speakers = speakers.cuda(rank, non_blocking=True)
159
-
160
- with autocast(enabled=hps.train.fp16_run):
161
- y_hat, l_length, attn, ids_slice, x_mask, z_mask,\
162
- (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers)
163
-
164
- mel = spec_to_mel_torch(
165
- spec,
166
- hps.data.filter_length,
167
- hps.data.n_mel_channels,
168
- hps.data.sampling_rate,
169
- hps.data.mel_fmin,
170
- hps.data.mel_fmax)
171
- y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
172
- y_hat_mel = mel_spectrogram_torch(
173
- y_hat.squeeze(1),
174
- hps.data.filter_length,
175
- hps.data.n_mel_channels,
176
- hps.data.sampling_rate,
177
- hps.data.hop_length,
178
- hps.data.win_length,
179
- hps.data.mel_fmin,
180
- hps.data.mel_fmax
181
- )
182
-
183
- y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
184
-
185
- # Discriminator
186
- y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
187
- with autocast(enabled=False):
188
- loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
189
- loss_disc_all = loss_disc
190
- optim_d.zero_grad()
191
- scaler.scale(loss_disc_all).backward()
192
- scaler.unscale_(optim_d)
193
- grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
194
- scaler.step(optim_d)
195
-
196
- with autocast(enabled=hps.train.fp16_run):
197
- # Generator
198
- y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
199
- with autocast(enabled=False):
200
- loss_dur = torch.sum(l_length.float())
201
- loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
202
- loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
203
-
204
- loss_fm = feature_loss(fmap_r, fmap_g)
205
- loss_gen, losses_gen = generator_loss(y_d_hat_g)
206
- loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl
207
- optim_g.zero_grad()
208
- scaler.scale(loss_gen_all).backward()
209
- scaler.unscale_(optim_g)
210
- grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
211
- scaler.step(optim_g)
212
- scaler.update()
213
-
214
- if rank==0:
215
- if global_step % hps.train.log_interval == 0:
216
- lr = optim_g.param_groups[0]['lr']
217
- losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl]
218
- logger.info('Train Epoch: {} [{:.0f}%]'.format(
219
- epoch,
220
- 100. * batch_idx / len(train_loader)))
221
- logger.info([x.item() for x in losses] + [global_step, lr])
222
-
223
- scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_g": grad_norm_g}
224
- scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl})
225
-
226
- scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
227
- scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
228
- scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
229
- image_dict = {
230
- "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
231
- "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
232
- "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
233
- "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
234
- }
235
- utils.summarize(
236
- writer=writer,
237
- global_step=global_step,
238
- images=image_dict,
239
- scalars=scalar_dict)
240
-
241
- if global_step % hps.train.eval_interval == 0:
242
- evaluate(hps, net_g, eval_loader, writer_eval)
243
- utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
244
- utils.save_checkpoint(net_g, None, hps.train.learning_rate, epoch,
245
- os.path.join(hps.model_dir, "G_latest.pth".format(global_step)))
246
- # utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
247
- old_g=os.path.join(hps.model_dir, "G_{}.pth".format(global_step-4000))
248
- # old_d=os.path.join(hps.model_dir, "D_{}.pth".format(global_step-400))
249
- if os.path.exists(old_g):
250
- os.remove(old_g)
251
- # if os.path.exists(old_d):
252
- # os.remove(old_d)
253
- global_step += 1
254
- if epoch > hps.max_epochs:
255
- print("Maximum epoch reached, closing training...")
256
- exit()
257
-
258
- if rank == 0:
259
- logger.info('====> Epoch: {}'.format(epoch))
260
-
261
-
262
- def evaluate(hps, generator, eval_loader, writer_eval):
263
- generator.eval()
264
- with torch.no_grad():
265
- for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader):
266
- x, x_lengths = x.cuda(0), x_lengths.cuda(0)
267
- spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
268
- y, y_lengths = y.cuda(0), y_lengths.cuda(0)
269
- speakers = speakers.cuda(0)
270
-
271
- # remove else
272
- x = x[:1]
273
- x_lengths = x_lengths[:1]
274
- spec = spec[:1]
275
- spec_lengths = spec_lengths[:1]
276
- y = y[:1]
277
- y_lengths = y_lengths[:1]
278
- speakers = speakers[:1]
279
- break
280
- y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000)
281
- y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
282
-
283
- mel = spec_to_mel_torch(
284
- spec,
285
- hps.data.filter_length,
286
- hps.data.n_mel_channels,
287
- hps.data.sampling_rate,
288
- hps.data.mel_fmin,
289
- hps.data.mel_fmax)
290
- y_hat_mel = mel_spectrogram_torch(
291
- y_hat.squeeze(1).float(),
292
- hps.data.filter_length,
293
- hps.data.n_mel_channels,
294
- hps.data.sampling_rate,
295
- hps.data.hop_length,
296
- hps.data.win_length,
297
- hps.data.mel_fmin,
298
- hps.data.mel_fmax
299
- )
300
- image_dict = {
301
- "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
302
- }
303
- audio_dict = {
304
- "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
305
- }
306
- if global_step == 0:
307
- image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
308
- audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
309
-
310
- utils.summarize(
311
- writer=writer_eval,
312
- global_step=global_step,
313
- images=image_dict,
314
- audios=audio_dict,
315
- audio_sampling_rate=hps.data.sampling_rate
316
- )
317
- generator.train()
318
-
319
-
320
- if __name__ == "__main__":
321
- main()
VITS-fast-fine-tuning/inference/G_latest.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:44f9141fcac34c950376594d08a288d9159a32d6add851155b6fd0ecee242419
- size 158887401
VITS-fast-fine-tuning/inference/ONNXVITS_inference.py DELETED
@@ -1,36 +0,0 @@
- import logging
- logging.getLogger('numba').setLevel(logging.WARNING)
- import IPython.display as ipd
- import torch
- import commons
- import utils
- import ONNXVITS_infer
- from text import text_to_sequence
-
- def get_text(text, hps):
-     text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
-     if hps.data.add_blank:
-         text_norm = commons.intersperse(text_norm, 0)
-     text_norm = torch.LongTensor(text_norm)
-     return text_norm
-
- hps = utils.get_hparams_from_file("../vits/pretrained_models/uma87.json")
-
- net_g = ONNXVITS_infer.SynthesizerTrn(
-     len(hps.symbols),
-     hps.data.filter_length // 2 + 1,
-     hps.train.segment_size // hps.data.hop_length,
-     n_speakers=hps.data.n_speakers,
-     **hps.model)
- _ = net_g.eval()
-
- _ = utils.load_checkpoint("../vits/pretrained_models/uma_1153000.pth", net_g)
-
- text1 = get_text("おはようございます。", hps)
- stn_tst = text1
- with torch.no_grad():
-     x_tst = stn_tst.unsqueeze(0)
-     x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-     sid = torch.LongTensor([0])
-     audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
- print(audio)
VITS-fast-fine-tuning/inference/VC_inference.py DELETED
@@ -1,139 +0,0 @@
1
- import os
2
- import numpy as np
3
- import torch
4
- from torch import no_grad, LongTensor
5
- import argparse
6
- import commons
7
- from mel_processing import spectrogram_torch
8
- import utils
9
- from models import SynthesizerTrn
10
- import gradio as gr
11
- import librosa
12
- import webbrowser
13
-
14
- from text import text_to_sequence, _clean_text
15
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
16
- language_marks = {
17
- "Japanese": "",
18
- "日本語": "[JA]",
19
- "简体中文": "[ZH]",
20
- "English": "[EN]",
21
- "Mix": "",
22
- }
23
- lang = ['日本語', '简体中文', 'English', 'Mix']
24
- def get_text(text, hps, is_symbol):
25
- text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
26
- if hps.data.add_blank:
27
- text_norm = commons.intersperse(text_norm, 0)
28
- text_norm = LongTensor(text_norm)
29
- return text_norm
30
-
31
- def create_tts_fn(model, hps, speaker_ids):
32
- def tts_fn(text, speaker, language, speed):
33
- if language is not None:
34
- text = language_marks[language] + text + language_marks[language]
35
- speaker_id = speaker_ids[speaker]
36
- stn_tst = get_text(text, hps, False)
37
- with no_grad():
38
- x_tst = stn_tst.unsqueeze(0).to(device)
39
- x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
40
- sid = LongTensor([speaker_id]).to(device)
41
- audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
42
- length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
43
- del stn_tst, x_tst, x_tst_lengths, sid
44
- return "Success", (hps.data.sampling_rate, audio)
45
-
46
- return tts_fn
47
-
48
- def create_vc_fn(model, hps, speaker_ids):
49
- def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
50
- input_audio = record_audio if record_audio is not None else upload_audio
51
- if input_audio is None:
52
- return "You need to record or upload an audio", None
53
- sampling_rate, audio = input_audio
54
- original_speaker_id = speaker_ids[original_speaker]
55
- target_speaker_id = speaker_ids[target_speaker]
56
-
57
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
58
- if len(audio.shape) > 1:
59
- audio = librosa.to_mono(audio.transpose(1, 0))
60
- if sampling_rate != hps.data.sampling_rate:
61
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
62
- with no_grad():
63
- y = torch.FloatTensor(audio)
64
- y = y / max(-y.min(), y.max()) / 0.99
65
- y = y.to(device)
66
- y = y.unsqueeze(0)
67
- spec = spectrogram_torch(y, hps.data.filter_length,
68
- hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
69
- center=False).to(device)
70
- spec_lengths = LongTensor([spec.size(-1)]).to(device)
71
- sid_src = LongTensor([original_speaker_id]).to(device)
72
- sid_tgt = LongTensor([target_speaker_id]).to(device)
73
- audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
74
- 0, 0].data.cpu().float().numpy()
75
- del y, spec, spec_lengths, sid_src, sid_tgt
76
- return "Success", (hps.data.sampling_rate, audio)
77
-
78
- return vc_fn
79
- if __name__ == "__main__":
80
- parser = argparse.ArgumentParser()
81
- parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
82
- parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
83
- parser.add_argument("--share", default=False, help="make link public (used in colab)")
84
-
85
- args = parser.parse_args()
86
- hps = utils.get_hparams_from_file(args.config_dir)
87
-
88
-
89
- net_g = SynthesizerTrn(
90
- len(hps.symbols),
91
- hps.data.filter_length // 2 + 1,
92
- hps.train.segment_size // hps.data.hop_length,
93
- n_speakers=hps.data.n_speakers,
94
- **hps.model).to(device)
95
- _ = net_g.eval()
96
-
97
- _ = utils.load_checkpoint(args.model_dir, net_g, None)
98
- speaker_ids = hps.speakers
99
- speakers = list(hps.speakers.keys())
100
- tts_fn = create_tts_fn(net_g, hps, speaker_ids)
101
- vc_fn = create_vc_fn(net_g, hps, speaker_ids)
102
- app = gr.Blocks()
103
- with app:
104
- with gr.Tab("Text-to-Speech"):
105
- with gr.Row():
106
- with gr.Column():
107
- textbox = gr.TextArea(label="Text",
108
- placeholder="Type your sentence here",
109
- value="こんにちわ。", elem_id=f"tts-input")
110
- # select character
111
- char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
112
- language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
113
- duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
114
- label='速度 Speed')
115
- with gr.Column():
116
- text_output = gr.Textbox(label="Message")
117
- audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
118
- btn = gr.Button("Generate!")
119
- btn.click(tts_fn,
120
- inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
121
- outputs=[text_output, audio_output])
122
- with gr.Tab("Voice Conversion"):
123
- gr.Markdown("""
124
- 录制或上传声音,并选择要转换的音色。
125
- """)
126
- with gr.Column():
127
- record_audio = gr.Audio(label="record your voice", source="microphone")
128
- upload_audio = gr.Audio(label="or upload audio here", source="upload")
129
- source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
130
- target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
131
- with gr.Column():
132
- message_box = gr.Textbox(label="Message")
133
- converted_audio = gr.Audio(label='converted audio')
134
- btn = gr.Button("Convert!")
135
- btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
136
- outputs=[message_box, converted_audio])
137
- webbrowser.open("http://127.0.0.1:7860")
138
- app.launch(share=args.share)
139
-
VITS-fast-fine-tuning/inference/finetune_speaker.json DELETED
@@ -1,147 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 100,
4
- "eval_interval": 1000,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0002,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 16,
14
- "fp16_run": true,
15
- "lr_decay": 0.999875,
16
- "segment_size": 8192,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0
21
- },
22
- "data": {
23
- "training_files": "final_annotation_train.txt",
24
- "validation_files": "final_annotation_val.txt",
25
- "text_cleaners": [
26
- "zh_ja_mixture_cleaners"
27
- ],
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 22050,
30
- "filter_length": 1024,
31
- "hop_length": 256,
32
- "win_length": 1024,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": null,
36
- "add_blank": true,
37
- "n_speakers": 3,
38
- "cleaned_text": true
39
- },
40
- "model": {
41
- "inter_channels": 192,
42
- "hidden_channels": 192,
43
- "filter_channels": 768,
44
- "n_heads": 2,
45
- "n_layers": 6,
46
- "kernel_size": 3,
47
- "p_dropout": 0.1,
48
- "resblock": "1",
49
- "resblock_kernel_sizes": [
50
- 3,
51
- 7,
52
- 11
53
- ],
54
- "resblock_dilation_sizes": [
55
- [
56
- 1,
57
- 3,
58
- 5
59
- ],
60
- [
61
- 1,
62
- 3,
63
- 5
64
- ],
65
- [
66
- 1,
67
- 3,
68
- 5
69
- ]
70
- ],
71
- "upsample_rates": [
72
- 8,
73
- 8,
74
- 2,
75
- 2
76
- ],
77
- "upsample_initial_channel": 512,
78
- "upsample_kernel_sizes": [
79
- 16,
80
- 16,
81
- 4,
82
- 4
83
- ],
84
- "n_layers_q": 3,
85
- "use_spectral_norm": false,
86
- "gin_channels": 256
87
- },
88
- "speakers": {
89
- "Hana": 0,
90
- "specialweek": 1,
91
- "zhongli": 2
92
- },
93
- "symbols": [
94
- "_",
95
- ",",
96
- ".",
97
- "!",
98
- "?",
99
- "-",
100
- "~",
101
- "\u2026",
102
- "A",
103
- "E",
104
- "I",
105
- "N",
106
- "O",
107
- "Q",
108
- "U",
109
- "a",
110
- "b",
111
- "d",
112
- "e",
113
- "f",
114
- "g",
115
- "h",
116
- "i",
117
- "j",
118
- "k",
119
- "l",
120
- "m",
121
- "n",
122
- "o",
123
- "p",
124
- "r",
125
- "s",
126
- "t",
127
- "u",
128
- "v",
129
- "w",
130
- "y",
131
- "z",
132
- "\u0283",
133
- "\u02a7",
134
- "\u02a6",
135
- "\u026f",
136
- "\u0279",
137
- "\u0259",
138
- "\u0265",
139
- "\u207c",
140
- "\u02b0",
141
- "`",
142
- "\u2192",
143
- "\u2193",
144
- "\u2191",
145
- " "
146
- ]
147
- }
VITS-fast-fine-tuning/long_audio_transcribe.py DELETED
@@ -1,71 +0,0 @@
- from moviepy.editor import AudioFileClip
- import whisper
- import os
- import torchaudio
- import librosa
- import torch
- import argparse
- parent_dir = "./denoised_audio/"
- filelist = list(os.walk(parent_dir))[0][2]
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--languages", default="CJE")
-     parser.add_argument("--whisper_size", default="medium")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-             "en": "[EN]",
-         }
-     elif args.languages == "CJ":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-         }
-     elif args.languages == "C":
-         lang2token = {
-             'zh': "[ZH]",
-         }
-     assert(torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
-     model = whisper.load_model(args.whisper_size)
-     speaker_annos = []
-     for file in filelist:
-         print(f"transcribing {parent_dir + file}...\n")
-         options = dict(beam_size=5, best_of=5)
-         transcribe_options = dict(task="transcribe", **options)
-         result = model.transcribe(parent_dir + file, **transcribe_options)
-         segments = result["segments"]
-         # result = model.transcribe(parent_dir + file)
-         lang = result['language']
-         if result['language'] not in list(lang2token.keys()):
-             print(f"{lang} not supported, ignoring...\n")
-             continue
-         # segment audio based on segment results
-         character_name = file.rstrip(".wav").split("_")[0]
-         code = file.rstrip(".wav").split("_")[1]
-         if not os.path.exists("./segmented_character_voice/" + character_name):
-             os.mkdir("./segmented_character_voice/" + character_name)
-         wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
-                                   channels_first=True)
-
-         for i, seg in enumerate(result['segments']):
-             start_time = seg['start']
-             end_time = seg['end']
-             text = seg['text']
-             text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
-             text = text + "\n"
-             wav_seg = wav[:, int(start_time*sr):int(end_time*sr)]
-             wav_seg_name = f"{character_name}_{code}_{i}.wav"
-             savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
-             speaker_annos.append(savepth + "|" + character_name + "|" + text)
-             print(f"Transcribed segment: {speaker_annos[-1]}")
-             # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
-             # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
-             torchaudio.save(savepth, wav_seg, 22050, channels_first=True)
-     if len(speaker_annos) == 0:
-         print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
-         print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
-     with open("long_character_anno.txt", 'w', encoding='utf-8') as f:
-         for line in speaker_annos:
-             f.write(line)
VITS-fast-fine-tuning/losses.py DELETED
@@ -1,61 +0,0 @@
- import torch
- from torch.nn import functional as F
-
- import commons
-
-
- def feature_loss(fmap_r, fmap_g):
-     loss = 0
-     for dr, dg in zip(fmap_r, fmap_g):
-         for rl, gl in zip(dr, dg):
-             rl = rl.float().detach()
-             gl = gl.float()
-             loss += torch.mean(torch.abs(rl - gl))
-
-     return loss * 2
-
-
- def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-     loss = 0
-     r_losses = []
-     g_losses = []
-     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-         dr = dr.float()
-         dg = dg.float()
-         r_loss = torch.mean((1-dr)**2)
-         g_loss = torch.mean(dg**2)
-         loss += (r_loss + g_loss)
-         r_losses.append(r_loss.item())
-         g_losses.append(g_loss.item())
-
-     return loss, r_losses, g_losses
-
-
- def generator_loss(disc_outputs):
-     loss = 0
-     gen_losses = []
-     for dg in disc_outputs:
-         dg = dg.float()
-         l = torch.mean((1-dg)**2)
-         gen_losses.append(l)
-         loss += l
-
-     return loss, gen_losses
-
-
- def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
-     """
-     z_p, logs_q: [b, h, t_t]
-     m_p, logs_p: [b, h, t_t]
-     """
-     z_p = z_p.float()
-     logs_q = logs_q.float()
-     m_p = m_p.float()
-     logs_p = logs_p.float()
-     z_mask = z_mask.float()
-
-     kl = logs_p - logs_q - 0.5
-     kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
-     kl = torch.sum(kl * z_mask)
-     l = kl / torch.sum(z_mask)
-     return l
VITS-fast-fine-tuning/mel_processing.py DELETED
@@ -1,112 +0,0 @@
1
- import math
2
- import os
3
- import random
4
- import torch
5
- from torch import nn
6
- import torch.nn.functional as F
7
- import torch.utils.data
8
- import numpy as np
9
- import librosa
10
- import librosa.util as librosa_util
11
- from librosa.util import normalize, pad_center, tiny
12
- from scipy.signal import get_window
13
- from scipy.io.wavfile import read
14
- from librosa.filters import mel as librosa_mel_fn
15
-
16
- MAX_WAV_VALUE = 32768.0
17
-
18
-
19
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
- """
21
- PARAMS
22
- ------
23
- C: compression factor
24
- """
25
- return torch.log(torch.clamp(x, min=clip_val) * C)
26
-
27
-
28
- def dynamic_range_decompression_torch(x, C=1):
29
- """
30
- PARAMS
31
- ------
32
- C: compression factor used to compress
33
- """
34
- return torch.exp(x) / C
35
-
36
-
37
- def spectral_normalize_torch(magnitudes):
38
- output = dynamic_range_compression_torch(magnitudes)
39
- return output
40
-
41
-
42
- def spectral_de_normalize_torch(magnitudes):
43
- output = dynamic_range_decompression_torch(magnitudes)
44
- return output
45
-
46
-
47
- mel_basis = {}
48
- hann_window = {}
49
-
50
-
51
- def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
- if torch.min(y) < -1.:
53
- print('min value is ', torch.min(y))
54
- if torch.max(y) > 1.:
55
- print('max value is ', torch.max(y))
56
-
57
- global hann_window
58
- dtype_device = str(y.dtype) + '_' + str(y.device)
59
- wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
- if wnsize_dtype_device not in hann_window:
61
- hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
-
63
- y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
- y = y.squeeze(1)
65
-
66
- spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
- center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
68
-
69
- spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
- return spec
71
-
72
-
73
- def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
- global mel_basis
75
- dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
- fmax_dtype_device = str(fmax) + '_' + dtype_device
77
- if fmax_dtype_device not in mel_basis:
78
- mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
79
- mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
- spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
- spec = spectral_normalize_torch(spec)
82
- return spec
83
-
84
-
85
- def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
- if torch.min(y) < -1.:
87
- print('min value is ', torch.min(y))
88
- if torch.max(y) > 1.:
89
- print('max value is ', torch.max(y))
90
-
91
- global mel_basis, hann_window
92
- dtype_device = str(y.dtype) + '_' + str(y.device)
93
- fmax_dtype_device = str(fmax) + '_' + dtype_device
94
- wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
- if fmax_dtype_device not in mel_basis:
96
- mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
97
- mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
- if wnsize_dtype_device not in hann_window:
99
- hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
-
101
- y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
- y = y.squeeze(1)
103
-
104
- spec = torch.stft(y.float(), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
- center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106
-
107
- spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
-
109
- spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
- spec = spectral_normalize_torch(spec)
111
-
112
- return spec
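
A usage sketch for the module above, assuming it is importable as mel_processing; the n_fft/hop/win/num_mels values are the common 22.05 kHz VITS settings and are assumptions here, not values read from this commit's config files.

import torch
from mel_processing import spectrogram_torch, spec_to_mel_torch

wav = torch.randn(1, 22050).clamp(-1.0, 1.0)   # one second of fake audio in [-1, 1]

spec = spectrogram_torch(wav, n_fft=1024, sampling_rate=22050,
                         hop_size=256, win_size=1024, center=False)
# Project the linear spectrogram onto 80 mel bands (spec_to_mel_torch builds the
# mel filter bank through librosa's positional API, i.e. an older librosa release).
mel = spec_to_mel_torch(spec, n_fft=1024, num_mels=80,
                        sampling_rate=22050, fmin=0.0, fmax=None)
print(spec.shape, mel.shape)   # [1, 513, frames], [1, 80, frames]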
 
VITS-fast-fine-tuning/models.py DELETED
@@ -1,533 +0,0 @@
1
- import copy
2
- import math
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- import commons
8
- import modules
9
- import attentions
10
- import monotonic_align
11
-
12
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
13
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14
- from commons import init_weights, get_padding
15
-
16
-
17
- class StochasticDurationPredictor(nn.Module):
18
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
19
- super().__init__()
20
- filter_channels = in_channels # it needs to be removed from future version.
21
- self.in_channels = in_channels
22
- self.filter_channels = filter_channels
23
- self.kernel_size = kernel_size
24
- self.p_dropout = p_dropout
25
- self.n_flows = n_flows
26
- self.gin_channels = gin_channels
27
-
28
- self.log_flow = modules.Log()
29
- self.flows = nn.ModuleList()
30
- self.flows.append(modules.ElementwiseAffine(2))
31
- for i in range(n_flows):
32
- self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
33
- self.flows.append(modules.Flip())
34
-
35
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
36
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
37
- self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
38
- self.post_flows = nn.ModuleList()
39
- self.post_flows.append(modules.ElementwiseAffine(2))
40
- for i in range(4):
41
- self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
42
- self.post_flows.append(modules.Flip())
43
-
44
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
45
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
46
- self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
47
- if gin_channels != 0:
48
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
49
-
50
- def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
51
- x = torch.detach(x)
52
- x = self.pre(x)
53
- if g is not None:
54
- g = torch.detach(g)
55
- x = x + self.cond(g)
56
- x = self.convs(x, x_mask)
57
- x = self.proj(x) * x_mask
58
-
59
- if not reverse:
60
- flows = self.flows
61
- assert w is not None
62
-
63
- logdet_tot_q = 0
64
- h_w = self.post_pre(w)
65
- h_w = self.post_convs(h_w, x_mask)
66
- h_w = self.post_proj(h_w) * x_mask
67
- e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
68
- z_q = e_q
69
- for flow in self.post_flows:
70
- z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
71
- logdet_tot_q += logdet_q
72
- z_u, z1 = torch.split(z_q, [1, 1], 1)
73
- u = torch.sigmoid(z_u) * x_mask
74
- z0 = (w - u) * x_mask
75
- logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
76
- logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
77
-
78
- logdet_tot = 0
79
- z0, logdet = self.log_flow(z0, x_mask)
80
- logdet_tot += logdet
81
- z = torch.cat([z0, z1], 1)
82
- for flow in flows:
83
- z, logdet = flow(z, x_mask, g=x, reverse=reverse)
84
- logdet_tot = logdet_tot + logdet
85
- nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
86
- return nll + logq # [b]
87
- else:
88
- flows = list(reversed(self.flows))
89
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
90
- z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
91
- for flow in flows:
92
- z = flow(z, x_mask, g=x, reverse=reverse)
93
- z0, z1 = torch.split(z, [1, 1], 1)
94
- logw = z0
95
- return logw
96
-
97
-
98
- class DurationPredictor(nn.Module):
99
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
100
- super().__init__()
101
-
102
- self.in_channels = in_channels
103
- self.filter_channels = filter_channels
104
- self.kernel_size = kernel_size
105
- self.p_dropout = p_dropout
106
- self.gin_channels = gin_channels
107
-
108
- self.drop = nn.Dropout(p_dropout)
109
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
110
- self.norm_1 = modules.LayerNorm(filter_channels)
111
- self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
112
- self.norm_2 = modules.LayerNorm(filter_channels)
113
- self.proj = nn.Conv1d(filter_channels, 1, 1)
114
-
115
- if gin_channels != 0:
116
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
117
-
118
- def forward(self, x, x_mask, g=None):
119
- x = torch.detach(x)
120
- if g is not None:
121
- g = torch.detach(g)
122
- x = x + self.cond(g)
123
- x = self.conv_1(x * x_mask)
124
- x = torch.relu(x)
125
- x = self.norm_1(x)
126
- x = self.drop(x)
127
- x = self.conv_2(x * x_mask)
128
- x = torch.relu(x)
129
- x = self.norm_2(x)
130
- x = self.drop(x)
131
- x = self.proj(x * x_mask)
132
- return x * x_mask
133
-
134
-
135
- class TextEncoder(nn.Module):
136
- def __init__(self,
137
- n_vocab,
138
- out_channels,
139
- hidden_channels,
140
- filter_channels,
141
- n_heads,
142
- n_layers,
143
- kernel_size,
144
- p_dropout):
145
- super().__init__()
146
- self.n_vocab = n_vocab
147
- self.out_channels = out_channels
148
- self.hidden_channels = hidden_channels
149
- self.filter_channels = filter_channels
150
- self.n_heads = n_heads
151
- self.n_layers = n_layers
152
- self.kernel_size = kernel_size
153
- self.p_dropout = p_dropout
154
-
155
- self.emb = nn.Embedding(n_vocab, hidden_channels)
156
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
157
-
158
- self.encoder = attentions.Encoder(
159
- hidden_channels,
160
- filter_channels,
161
- n_heads,
162
- n_layers,
163
- kernel_size,
164
- p_dropout)
165
- self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
166
-
167
- def forward(self, x, x_lengths):
168
- x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
169
- x = torch.transpose(x, 1, -1) # [b, h, t]
170
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
171
-
172
- x = self.encoder(x * x_mask, x_mask)
173
- stats = self.proj(x) * x_mask
174
-
175
- m, logs = torch.split(stats, self.out_channels, dim=1)
176
- return x, m, logs, x_mask
177
-
178
-
179
- class ResidualCouplingBlock(nn.Module):
180
- def __init__(self,
181
- channels,
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- n_flows=4,
187
- gin_channels=0):
188
- super().__init__()
189
- self.channels = channels
190
- self.hidden_channels = hidden_channels
191
- self.kernel_size = kernel_size
192
- self.dilation_rate = dilation_rate
193
- self.n_layers = n_layers
194
- self.n_flows = n_flows
195
- self.gin_channels = gin_channels
196
-
197
- self.flows = nn.ModuleList()
198
- for i in range(n_flows):
199
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
200
- self.flows.append(modules.Flip())
201
-
202
- def forward(self, x, x_mask, g=None, reverse=False):
203
- if not reverse:
204
- for flow in self.flows:
205
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
206
- else:
207
- for flow in reversed(self.flows):
208
- x = flow(x, x_mask, g=g, reverse=reverse)
209
- return x
210
-
211
-
212
- class PosteriorEncoder(nn.Module):
213
- def __init__(self,
214
- in_channels,
215
- out_channels,
216
- hidden_channels,
217
- kernel_size,
218
- dilation_rate,
219
- n_layers,
220
- gin_channels=0):
221
- super().__init__()
222
- self.in_channels = in_channels
223
- self.out_channels = out_channels
224
- self.hidden_channels = hidden_channels
225
- self.kernel_size = kernel_size
226
- self.dilation_rate = dilation_rate
227
- self.n_layers = n_layers
228
- self.gin_channels = gin_channels
229
-
230
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
231
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
232
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
233
-
234
- def forward(self, x, x_lengths, g=None):
235
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
236
- x = self.pre(x) * x_mask
237
- x = self.enc(x, x_mask, g=g)
238
- stats = self.proj(x) * x_mask
239
- m, logs = torch.split(stats, self.out_channels, dim=1)
240
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
241
- return z, m, logs, x_mask
242
-
243
-
244
- class Generator(torch.nn.Module):
245
- def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
246
- super(Generator, self).__init__()
247
- self.num_kernels = len(resblock_kernel_sizes)
248
- self.num_upsamples = len(upsample_rates)
249
- self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
250
- resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
251
-
252
- self.ups = nn.ModuleList()
253
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
254
- self.ups.append(weight_norm(
255
- ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
256
- k, u, padding=(k-u)//2)))
257
-
258
- self.resblocks = nn.ModuleList()
259
- for i in range(len(self.ups)):
260
- ch = upsample_initial_channel//(2**(i+1))
261
- for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
262
- self.resblocks.append(resblock(ch, k, d))
263
-
264
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
265
- self.ups.apply(init_weights)
266
-
267
- if gin_channels != 0:
268
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
269
-
270
- def forward(self, x, g=None):
271
- x = self.conv_pre(x)
272
- if g is not None:
273
- x = x + self.cond(g)
274
-
275
- for i in range(self.num_upsamples):
276
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
277
- x = self.ups[i](x)
278
- xs = None
279
- for j in range(self.num_kernels):
280
- if xs is None:
281
- xs = self.resblocks[i*self.num_kernels+j](x)
282
- else:
283
- xs += self.resblocks[i*self.num_kernels+j](x)
284
- x = xs / self.num_kernels
285
- x = F.leaky_relu(x)
286
- x = self.conv_post(x)
287
- x = torch.tanh(x)
288
-
289
- return x
290
-
291
- def remove_weight_norm(self):
292
- print('Removing weight norm...')
293
- for l in self.ups:
294
- remove_weight_norm(l)
295
- for l in self.resblocks:
296
- l.remove_weight_norm()
297
-
298
-
299
- class DiscriminatorP(torch.nn.Module):
300
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
301
- super(DiscriminatorP, self).__init__()
302
- self.period = period
303
- self.use_spectral_norm = use_spectral_norm
304
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
305
- self.convs = nn.ModuleList([
306
- norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
307
- norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
308
- norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
309
- norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
310
- norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
311
- ])
312
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
313
-
314
- def forward(self, x):
315
- fmap = []
316
-
317
- # 1d to 2d
318
- b, c, t = x.shape
319
- if t % self.period != 0: # pad first
320
- n_pad = self.period - (t % self.period)
321
- x = F.pad(x, (0, n_pad), "reflect")
322
- t = t + n_pad
323
- x = x.view(b, c, t // self.period, self.period)
324
-
325
- for l in self.convs:
326
- x = l(x)
327
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
328
- fmap.append(x)
329
- x = self.conv_post(x)
330
- fmap.append(x)
331
- x = torch.flatten(x, 1, -1)
332
-
333
- return x, fmap
334
-
335
-
336
- class DiscriminatorS(torch.nn.Module):
337
- def __init__(self, use_spectral_norm=False):
338
- super(DiscriminatorS, self).__init__()
339
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
340
- self.convs = nn.ModuleList([
341
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
342
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
343
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
344
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
345
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
346
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
347
- ])
348
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
349
-
350
- def forward(self, x):
351
- fmap = []
352
-
353
- for l in self.convs:
354
- x = l(x)
355
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
356
- fmap.append(x)
357
- x = self.conv_post(x)
358
- fmap.append(x)
359
- x = torch.flatten(x, 1, -1)
360
-
361
- return x, fmap
362
-
363
-
364
- class MultiPeriodDiscriminator(torch.nn.Module):
365
- def __init__(self, use_spectral_norm=False):
366
- super(MultiPeriodDiscriminator, self).__init__()
367
- periods = [2,3,5,7,11]
368
-
369
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
370
- discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
371
- self.discriminators = nn.ModuleList(discs)
372
-
373
- def forward(self, y, y_hat):
374
- y_d_rs = []
375
- y_d_gs = []
376
- fmap_rs = []
377
- fmap_gs = []
378
- for i, d in enumerate(self.discriminators):
379
- y_d_r, fmap_r = d(y)
380
- y_d_g, fmap_g = d(y_hat)
381
- y_d_rs.append(y_d_r)
382
- y_d_gs.append(y_d_g)
383
- fmap_rs.append(fmap_r)
384
- fmap_gs.append(fmap_g)
385
-
386
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
387
-
388
-
389
-
390
- class SynthesizerTrn(nn.Module):
391
- """
392
- Synthesizer for Training
393
- """
394
-
395
- def __init__(self,
396
- n_vocab,
397
- spec_channels,
398
- segment_size,
399
- inter_channels,
400
- hidden_channels,
401
- filter_channels,
402
- n_heads,
403
- n_layers,
404
- kernel_size,
405
- p_dropout,
406
- resblock,
407
- resblock_kernel_sizes,
408
- resblock_dilation_sizes,
409
- upsample_rates,
410
- upsample_initial_channel,
411
- upsample_kernel_sizes,
412
- n_speakers=0,
413
- gin_channels=0,
414
- use_sdp=True,
415
- **kwargs):
416
-
417
- super().__init__()
418
- self.n_vocab = n_vocab
419
- self.spec_channels = spec_channels
420
- self.inter_channels = inter_channels
421
- self.hidden_channels = hidden_channels
422
- self.filter_channels = filter_channels
423
- self.n_heads = n_heads
424
- self.n_layers = n_layers
425
- self.kernel_size = kernel_size
426
- self.p_dropout = p_dropout
427
- self.resblock = resblock
428
- self.resblock_kernel_sizes = resblock_kernel_sizes
429
- self.resblock_dilation_sizes = resblock_dilation_sizes
430
- self.upsample_rates = upsample_rates
431
- self.upsample_initial_channel = upsample_initial_channel
432
- self.upsample_kernel_sizes = upsample_kernel_sizes
433
- self.segment_size = segment_size
434
- self.n_speakers = n_speakers
435
- self.gin_channels = gin_channels
436
-
437
- self.use_sdp = use_sdp
438
-
439
- self.enc_p = TextEncoder(n_vocab,
440
- inter_channels,
441
- hidden_channels,
442
- filter_channels,
443
- n_heads,
444
- n_layers,
445
- kernel_size,
446
- p_dropout)
447
- self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
448
- self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
449
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
450
-
451
- if use_sdp:
452
- self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
453
- else:
454
- self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
455
-
456
- if n_speakers >= 1:
457
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
458
-
459
- def forward(self, x, x_lengths, y, y_lengths, sid=None):
460
-
461
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
462
- if self.n_speakers > 0:
463
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
464
- else:
465
- g = None
466
-
467
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
468
- z_p = self.flow(z, y_mask, g=g)
469
-
470
- with torch.no_grad():
471
- # negative cross-entropy
472
- s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
473
- neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
474
- neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
475
- neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
476
- neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
477
- neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
478
-
479
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
480
- attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
481
-
482
- w = attn.sum(2)
483
- if self.use_sdp:
484
- l_length = self.dp(x, x_mask, w, g=g)
485
- l_length = l_length / torch.sum(x_mask)
486
- else:
487
- logw_ = torch.log(w + 1e-6) * x_mask
488
- logw = self.dp(x, x_mask, g=g)
489
- l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
490
-
491
- # expand prior
492
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
493
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
494
-
495
- z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
496
- o = self.dec(z_slice, g=g)
497
- return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
498
-
499
- def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
500
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
501
- if self.n_speakers > 0:
502
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
503
- else:
504
- g = None
505
-
506
- if self.use_sdp:
507
- logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
508
- else:
509
- logw = self.dp(x, x_mask, g=g)
510
- w = torch.exp(logw) * x_mask * length_scale
511
- w_ceil = torch.ceil(w)
512
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
513
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
514
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
515
- attn = commons.generate_path(w_ceil, attn_mask)
516
-
517
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
518
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
519
-
520
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
521
- z = self.flow(z_p, y_mask, g=g, reverse=True)
522
- o = self.dec((z * y_mask)[:,:,:max_len], g=g)
523
- return o, attn, y_mask, (z, z_p, m_p, logs_p)
524
-
525
- def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
526
- assert self.n_speakers > 0, "n_speakers have to be larger than 0."
527
- g_src = self.emb_g(sid_src).unsqueeze(-1)
528
- g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
529
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
530
- z_p = self.flow(z, y_mask, g=g_src)
531
- z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
532
- o_hat = self.dec(z_hat * y_mask, g=g_tgt)
533
- return o_hat, y_mask, (z, z_p, z_hat)
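
An end-to-end inference sketch for the SynthesizerTrn above, assuming models.py and its local imports (commons, modules, attentions, and a compiled monotonic_align) are importable; every hyperparameter below is an assumption chosen to be self-consistent rather than a value taken from this commit's JSON configs.

import torch
from models import SynthesizerTrn

net_g = SynthesizerTrn(
    n_vocab=100,                      # assumed symbol-set size
    spec_channels=513,                # 1024-point FFT -> 513 linear bins
    segment_size=32,
    inter_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=2, gin_channels=256,
).eval()

x = torch.randint(1, 100, (1, 20))    # fake phoneme ids; real use would run text cleaning first
x_lengths = torch.LongTensor([20])
sid = torch.LongTensor([0])           # speaker id for the conditioning embedding
with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(x, x_lengths, sid=sid, noise_scale=0.667,
                                         noise_scale_w=0.8, length_scale=1.0)
print(audio.shape)                    # [1, 1, samples]; 256x upsampling of predicted frames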
 
VITS-fast-fine-tuning/models_infer.py DELETED
@@ -1,402 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import modules
8
- import attentions
9
-
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from commons import init_weights, get_padding
13
-
14
-
15
- class StochasticDurationPredictor(nn.Module):
16
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
17
- super().__init__()
18
- filter_channels = in_channels # it needs to be removed from future version.
19
- self.in_channels = in_channels
20
- self.filter_channels = filter_channels
21
- self.kernel_size = kernel_size
22
- self.p_dropout = p_dropout
23
- self.n_flows = n_flows
24
- self.gin_channels = gin_channels
25
-
26
- self.log_flow = modules.Log()
27
- self.flows = nn.ModuleList()
28
- self.flows.append(modules.ElementwiseAffine(2))
29
- for i in range(n_flows):
30
- self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
31
- self.flows.append(modules.Flip())
32
-
33
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
34
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
35
- self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
36
- self.post_flows = nn.ModuleList()
37
- self.post_flows.append(modules.ElementwiseAffine(2))
38
- for i in range(4):
39
- self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
40
- self.post_flows.append(modules.Flip())
41
-
42
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
43
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
44
- self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
45
- if gin_channels != 0:
46
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
47
-
48
- def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
49
- x = torch.detach(x)
50
- x = self.pre(x)
51
- if g is not None:
52
- g = torch.detach(g)
53
- x = x + self.cond(g)
54
- x = self.convs(x, x_mask)
55
- x = self.proj(x) * x_mask
56
-
57
- if not reverse:
58
- flows = self.flows
59
- assert w is not None
60
-
61
- logdet_tot_q = 0
62
- h_w = self.post_pre(w)
63
- h_w = self.post_convs(h_w, x_mask)
64
- h_w = self.post_proj(h_w) * x_mask
65
- e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
66
- z_q = e_q
67
- for flow in self.post_flows:
68
- z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
69
- logdet_tot_q += logdet_q
70
- z_u, z1 = torch.split(z_q, [1, 1], 1)
71
- u = torch.sigmoid(z_u) * x_mask
72
- z0 = (w - u) * x_mask
73
- logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
74
- logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
75
-
76
- logdet_tot = 0
77
- z0, logdet = self.log_flow(z0, x_mask)
78
- logdet_tot += logdet
79
- z = torch.cat([z0, z1], 1)
80
- for flow in flows:
81
- z, logdet = flow(z, x_mask, g=x, reverse=reverse)
82
- logdet_tot = logdet_tot + logdet
83
- nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
84
- return nll + logq # [b]
85
- else:
86
- flows = list(reversed(self.flows))
87
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
88
- z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
89
- for flow in flows:
90
- z = flow(z, x_mask, g=x, reverse=reverse)
91
- z0, z1 = torch.split(z, [1, 1], 1)
92
- logw = z0
93
- return logw
94
-
95
-
96
- class DurationPredictor(nn.Module):
97
- def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
98
- super().__init__()
99
-
100
- self.in_channels = in_channels
101
- self.filter_channels = filter_channels
102
- self.kernel_size = kernel_size
103
- self.p_dropout = p_dropout
104
- self.gin_channels = gin_channels
105
-
106
- self.drop = nn.Dropout(p_dropout)
107
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
108
- self.norm_1 = modules.LayerNorm(filter_channels)
109
- self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
110
- self.norm_2 = modules.LayerNorm(filter_channels)
111
- self.proj = nn.Conv1d(filter_channels, 1, 1)
112
-
113
- if gin_channels != 0:
114
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
115
-
116
- def forward(self, x, x_mask, g=None):
117
- x = torch.detach(x)
118
- if g is not None:
119
- g = torch.detach(g)
120
- x = x + self.cond(g)
121
- x = self.conv_1(x * x_mask)
122
- x = torch.relu(x)
123
- x = self.norm_1(x)
124
- x = self.drop(x)
125
- x = self.conv_2(x * x_mask)
126
- x = torch.relu(x)
127
- x = self.norm_2(x)
128
- x = self.drop(x)
129
- x = self.proj(x * x_mask)
130
- return x * x_mask
131
-
132
-
133
- class TextEncoder(nn.Module):
134
- def __init__(self,
135
- n_vocab,
136
- out_channels,
137
- hidden_channels,
138
- filter_channels,
139
- n_heads,
140
- n_layers,
141
- kernel_size,
142
- p_dropout):
143
- super().__init__()
144
- self.n_vocab = n_vocab
145
- self.out_channels = out_channels
146
- self.hidden_channels = hidden_channels
147
- self.filter_channels = filter_channels
148
- self.n_heads = n_heads
149
- self.n_layers = n_layers
150
- self.kernel_size = kernel_size
151
- self.p_dropout = p_dropout
152
-
153
- self.emb = nn.Embedding(n_vocab, hidden_channels)
154
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
155
-
156
- self.encoder = attentions.Encoder(
157
- hidden_channels,
158
- filter_channels,
159
- n_heads,
160
- n_layers,
161
- kernel_size,
162
- p_dropout)
163
- self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
164
-
165
- def forward(self, x, x_lengths):
166
- x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
167
- x = torch.transpose(x, 1, -1) # [b, h, t]
168
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
169
-
170
- x = self.encoder(x * x_mask, x_mask)
171
- stats = self.proj(x) * x_mask
172
-
173
- m, logs = torch.split(stats, self.out_channels, dim=1)
174
- return x, m, logs, x_mask
175
-
176
-
177
- class ResidualCouplingBlock(nn.Module):
178
- def __init__(self,
179
- channels,
180
- hidden_channels,
181
- kernel_size,
182
- dilation_rate,
183
- n_layers,
184
- n_flows=4,
185
- gin_channels=0):
186
- super().__init__()
187
- self.channels = channels
188
- self.hidden_channels = hidden_channels
189
- self.kernel_size = kernel_size
190
- self.dilation_rate = dilation_rate
191
- self.n_layers = n_layers
192
- self.n_flows = n_flows
193
- self.gin_channels = gin_channels
194
-
195
- self.flows = nn.ModuleList()
196
- for i in range(n_flows):
197
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
198
- self.flows.append(modules.Flip())
199
-
200
- def forward(self, x, x_mask, g=None, reverse=False):
201
- if not reverse:
202
- for flow in self.flows:
203
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
204
- else:
205
- for flow in reversed(self.flows):
206
- x = flow(x, x_mask, g=g, reverse=reverse)
207
- return x
208
-
209
-
210
- class PosteriorEncoder(nn.Module):
211
- def __init__(self,
212
- in_channels,
213
- out_channels,
214
- hidden_channels,
215
- kernel_size,
216
- dilation_rate,
217
- n_layers,
218
- gin_channels=0):
219
- super().__init__()
220
- self.in_channels = in_channels
221
- self.out_channels = out_channels
222
- self.hidden_channels = hidden_channels
223
- self.kernel_size = kernel_size
224
- self.dilation_rate = dilation_rate
225
- self.n_layers = n_layers
226
- self.gin_channels = gin_channels
227
-
228
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
229
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
230
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
231
-
232
- def forward(self, x, x_lengths, g=None):
233
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
234
- x = self.pre(x) * x_mask
235
- x = self.enc(x, x_mask, g=g)
236
- stats = self.proj(x) * x_mask
237
- m, logs = torch.split(stats, self.out_channels, dim=1)
238
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
239
- return z, m, logs, x_mask
240
-
241
-
242
- class Generator(torch.nn.Module):
243
- def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
244
- super(Generator, self).__init__()
245
- self.num_kernels = len(resblock_kernel_sizes)
246
- self.num_upsamples = len(upsample_rates)
247
- self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
248
- resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
249
-
250
- self.ups = nn.ModuleList()
251
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
252
- self.ups.append(weight_norm(
253
- ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
254
- k, u, padding=(k-u)//2)))
255
-
256
- self.resblocks = nn.ModuleList()
257
- for i in range(len(self.ups)):
258
- ch = upsample_initial_channel//(2**(i+1))
259
- for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
260
- self.resblocks.append(resblock(ch, k, d))
261
-
262
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
263
- self.ups.apply(init_weights)
264
-
265
- if gin_channels != 0:
266
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
267
-
268
- def forward(self, x, g=None):
269
- x = self.conv_pre(x)
270
- if g is not None:
271
- x = x + self.cond(g)
272
-
273
- for i in range(self.num_upsamples):
274
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
275
- x = self.ups[i](x)
276
- xs = None
277
- for j in range(self.num_kernels):
278
- if xs is None:
279
- xs = self.resblocks[i*self.num_kernels+j](x)
280
- else:
281
- xs += self.resblocks[i*self.num_kernels+j](x)
282
- x = xs / self.num_kernels
283
- x = F.leaky_relu(x)
284
- x = self.conv_post(x)
285
- x = torch.tanh(x)
286
-
287
- return x
288
-
289
- def remove_weight_norm(self):
290
- print('Removing weight norm...')
291
- for l in self.ups:
292
- remove_weight_norm(l)
293
- for l in self.resblocks:
294
- l.remove_weight_norm()
295
-
296
-
297
-
298
- class SynthesizerTrn(nn.Module):
299
- """
300
- Synthesizer for Training
301
- """
302
-
303
- def __init__(self,
304
- n_vocab,
305
- spec_channels,
306
- segment_size,
307
- inter_channels,
308
- hidden_channels,
309
- filter_channels,
310
- n_heads,
311
- n_layers,
312
- kernel_size,
313
- p_dropout,
314
- resblock,
315
- resblock_kernel_sizes,
316
- resblock_dilation_sizes,
317
- upsample_rates,
318
- upsample_initial_channel,
319
- upsample_kernel_sizes,
320
- n_speakers=0,
321
- gin_channels=0,
322
- use_sdp=True,
323
- **kwargs):
324
-
325
- super().__init__()
326
- self.n_vocab = n_vocab
327
- self.spec_channels = spec_channels
328
- self.inter_channels = inter_channels
329
- self.hidden_channels = hidden_channels
330
- self.filter_channels = filter_channels
331
- self.n_heads = n_heads
332
- self.n_layers = n_layers
333
- self.kernel_size = kernel_size
334
- self.p_dropout = p_dropout
335
- self.resblock = resblock
336
- self.resblock_kernel_sizes = resblock_kernel_sizes
337
- self.resblock_dilation_sizes = resblock_dilation_sizes
338
- self.upsample_rates = upsample_rates
339
- self.upsample_initial_channel = upsample_initial_channel
340
- self.upsample_kernel_sizes = upsample_kernel_sizes
341
- self.segment_size = segment_size
342
- self.n_speakers = n_speakers
343
- self.gin_channels = gin_channels
344
-
345
- self.use_sdp = use_sdp
346
-
347
- self.enc_p = TextEncoder(n_vocab,
348
- inter_channels,
349
- hidden_channels,
350
- filter_channels,
351
- n_heads,
352
- n_layers,
353
- kernel_size,
354
- p_dropout)
355
- self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
356
- self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
357
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
358
-
359
- if use_sdp:
360
- self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
361
- else:
362
- self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
363
-
364
- if n_speakers > 1:
365
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
366
-
367
- def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
368
- x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
369
- if self.n_speakers > 0:
370
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
371
- else:
372
- g = None
373
-
374
- if self.use_sdp:
375
- logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
376
- else:
377
- logw = self.dp(x, x_mask, g=g)
378
- w = torch.exp(logw) * x_mask * length_scale
379
- w_ceil = torch.ceil(w)
380
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
381
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
382
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
383
- attn = commons.generate_path(w_ceil, attn_mask)
384
-
385
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
386
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
387
-
388
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
389
- z = self.flow(z_p, y_mask, g=g, reverse=True)
390
- o = self.dec((z * y_mask)[:,:,:max_len], g=g)
391
- return o, attn, y_mask, (z, z_p, m_p, logs_p)
392
-
393
- def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
394
- assert self.n_speakers > 0, "n_speakers have to be larger than 0."
395
- g_src = self.emb_g(sid_src).unsqueeze(-1)
396
- g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
397
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
398
- z_p = self.flow(z, y_mask, g=g_src)
399
- z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
400
- o_hat = self.dec(z_hat * y_mask, g=g_tgt)
401
- return o_hat, y_mask, (z, z_p, z_hat)
402
-
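
models_infer.py mirrors models.py but keeps only the inference paths, so a freshly built (or checkpoint-loaded) network can be driven directly for voice conversion; the hyperparameters below reuse the assumed values from the models.py sketch above and the spectrogram is random toy data.

import torch
from models_infer import SynthesizerTrn

hps = dict(n_vocab=100, spec_channels=513, segment_size=32, inter_channels=192,
           hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
           kernel_size=3, p_dropout=0.1, resblock="1",
           resblock_kernel_sizes=[3, 7, 11],
           resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
           upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
           upsample_kernel_sizes=[16, 16, 4, 4], n_speakers=2, gin_channels=256)
net_g = SynthesizerTrn(**hps).eval()

# Re-synthesize a (fake) linear spectrogram of speaker 0 with speaker 1's embedding.
spec = torch.randn(1, 513, 40)            # [batch, spec_channels, frames]
spec_lengths = torch.LongTensor([40])
with torch.no_grad():
    audio, y_mask, _ = net_g.voice_conversion(spec, spec_lengths,
                                              sid_src=torch.LongTensor([0]),
                                              sid_tgt=torch.LongTensor([1]))
print(audio.shape)                        # [1, 1, frames * 256]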
 
VITS-fast-fine-tuning/modules.py DELETED
@@ -1,390 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import scipy
5
- import torch
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm
11
-
12
- import commons
13
- from commons import init_weights, get_padding
14
- from transforms import piecewise_rational_quadratic_transform
15
-
16
-
17
- LRELU_SLOPE = 0.1
18
-
19
-
20
- class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
25
-
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
28
-
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
-
34
-
35
- class ConvReluNorm(nn.Module):
36
- def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37
- super().__init__()
38
- self.in_channels = in_channels
39
- self.hidden_channels = hidden_channels
40
- self.out_channels = out_channels
41
- self.kernel_size = kernel_size
42
- self.n_layers = n_layers
43
- self.p_dropout = p_dropout
44
- assert n_layers > 1, "Number of layers should be larger than 0."
45
-
46
- self.conv_layers = nn.ModuleList()
47
- self.norm_layers = nn.ModuleList()
48
- self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49
- self.norm_layers.append(LayerNorm(hidden_channels))
50
- self.relu_drop = nn.Sequential(
51
- nn.ReLU(),
52
- nn.Dropout(p_dropout))
53
- for _ in range(n_layers-1):
54
- self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55
- self.norm_layers.append(LayerNorm(hidden_channels))
56
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57
- self.proj.weight.data.zero_()
58
- self.proj.bias.data.zero_()
59
-
60
- def forward(self, x, x_mask):
61
- x_org = x
62
- for i in range(self.n_layers):
63
- x = self.conv_layers[i](x * x_mask)
64
- x = self.norm_layers[i](x)
65
- x = self.relu_drop(x)
66
- x = x_org + self.proj(x)
67
- return x * x_mask
68
-
69
-
70
- class DDSConv(nn.Module):
71
- """
72
- Dialted and Depth-Separable Convolution
73
- """
74
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75
- super().__init__()
76
- self.channels = channels
77
- self.kernel_size = kernel_size
78
- self.n_layers = n_layers
79
- self.p_dropout = p_dropout
80
-
81
- self.drop = nn.Dropout(p_dropout)
82
- self.convs_sep = nn.ModuleList()
83
- self.convs_1x1 = nn.ModuleList()
84
- self.norms_1 = nn.ModuleList()
85
- self.norms_2 = nn.ModuleList()
86
- for i in range(n_layers):
87
- dilation = kernel_size ** i
88
- padding = (kernel_size * dilation - dilation) // 2
89
- self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90
- groups=channels, dilation=dilation, padding=padding
91
- ))
92
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93
- self.norms_1.append(LayerNorm(channels))
94
- self.norms_2.append(LayerNorm(channels))
95
-
96
- def forward(self, x, x_mask, g=None):
97
- if g is not None:
98
- x = x + g
99
- for i in range(self.n_layers):
100
- y = self.convs_sep[i](x * x_mask)
101
- y = self.norms_1[i](y)
102
- y = F.gelu(y)
103
- y = self.convs_1x1[i](y)
104
- y = self.norms_2[i](y)
105
- y = F.gelu(y)
106
- y = self.drop(y)
107
- x = x + y
108
- return x * x_mask
109
-
110
-
111
- class WN(torch.nn.Module):
112
- def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113
- super(WN, self).__init__()
114
- assert(kernel_size % 2 == 1)
115
- self.hidden_channels =hidden_channels
116
- self.kernel_size = kernel_size,
117
- self.dilation_rate = dilation_rate
118
- self.n_layers = n_layers
119
- self.gin_channels = gin_channels
120
- self.p_dropout = p_dropout
121
-
122
- self.in_layers = torch.nn.ModuleList()
123
- self.res_skip_layers = torch.nn.ModuleList()
124
- self.drop = nn.Dropout(p_dropout)
125
-
126
- if gin_channels != 0:
127
- cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129
-
130
- for i in range(n_layers):
131
- dilation = dilation_rate ** i
132
- padding = int((kernel_size * dilation - dilation) / 2)
133
- in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134
- dilation=dilation, padding=padding)
135
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136
- self.in_layers.append(in_layer)
137
-
138
- # last one is not necessary
139
- if i < n_layers - 1:
140
- res_skip_channels = 2 * hidden_channels
141
- else:
142
- res_skip_channels = hidden_channels
143
-
144
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146
- self.res_skip_layers.append(res_skip_layer)
147
-
148
- def forward(self, x, x_mask, g=None, **kwargs):
149
- output = torch.zeros_like(x)
150
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
151
-
152
- if g is not None:
153
- g = self.cond_layer(g)
154
-
155
- for i in range(self.n_layers):
156
- x_in = self.in_layers[i](x)
157
- if g is not None:
158
- cond_offset = i * 2 * self.hidden_channels
159
- g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160
- else:
161
- g_l = torch.zeros_like(x_in)
162
-
163
- acts = commons.fused_add_tanh_sigmoid_multiply(
164
- x_in,
165
- g_l,
166
- n_channels_tensor)
167
- acts = self.drop(acts)
168
-
169
- res_skip_acts = self.res_skip_layers[i](acts)
170
- if i < self.n_layers - 1:
171
- res_acts = res_skip_acts[:,:self.hidden_channels,:]
172
- x = (x + res_acts) * x_mask
173
- output = output + res_skip_acts[:,self.hidden_channels:,:]
174
- else:
175
- output = output + res_skip_acts
176
- return output * x_mask
177
-
178
- def remove_weight_norm(self):
179
- if self.gin_channels != 0:
180
- torch.nn.utils.remove_weight_norm(self.cond_layer)
181
- for l in self.in_layers:
182
- torch.nn.utils.remove_weight_norm(l)
183
- for l in self.res_skip_layers:
184
- torch.nn.utils.remove_weight_norm(l)
185
-
186
-
187
- class ResBlock1(torch.nn.Module):
188
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
189
- super(ResBlock1, self).__init__()
190
- self.convs1 = nn.ModuleList([
191
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
192
- padding=get_padding(kernel_size, dilation[0]))),
193
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
194
- padding=get_padding(kernel_size, dilation[1]))),
195
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
196
- padding=get_padding(kernel_size, dilation[2])))
197
- ])
198
- self.convs1.apply(init_weights)
199
-
200
- self.convs2 = nn.ModuleList([
201
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
202
- padding=get_padding(kernel_size, 1))),
203
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
204
- padding=get_padding(kernel_size, 1))),
205
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
206
- padding=get_padding(kernel_size, 1)))
207
- ])
208
- self.convs2.apply(init_weights)
209
-
210
- def forward(self, x, x_mask=None):
211
- for c1, c2 in zip(self.convs1, self.convs2):
212
- xt = F.leaky_relu(x, LRELU_SLOPE)
213
- if x_mask is not None:
214
- xt = xt * x_mask
215
- xt = c1(xt)
216
- xt = F.leaky_relu(xt, LRELU_SLOPE)
217
- if x_mask is not None:
218
- xt = xt * x_mask
219
- xt = c2(xt)
220
- x = xt + x
221
- if x_mask is not None:
222
- x = x * x_mask
223
- return x
224
-
225
- def remove_weight_norm(self):
226
- for l in self.convs1:
227
- remove_weight_norm(l)
228
- for l in self.convs2:
229
- remove_weight_norm(l)
230
-
231
-
232
- class ResBlock2(torch.nn.Module):
233
- def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
234
- super(ResBlock2, self).__init__()
235
- self.convs = nn.ModuleList([
236
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
237
- padding=get_padding(kernel_size, dilation[0]))),
238
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
239
- padding=get_padding(kernel_size, dilation[1])))
240
- ])
241
- self.convs.apply(init_weights)
242
-
243
- def forward(self, x, x_mask=None):
244
- for c in self.convs:
245
- xt = F.leaky_relu(x, LRELU_SLOPE)
246
- if x_mask is not None:
247
- xt = xt * x_mask
248
- xt = c(xt)
249
- x = xt + x
250
- if x_mask is not None:
251
- x = x * x_mask
252
- return x
253
-
254
- def remove_weight_norm(self):
255
- for l in self.convs:
256
- remove_weight_norm(l)
257
-
258
-
259
- class Log(nn.Module):
260
- def forward(self, x, x_mask, reverse=False, **kwargs):
261
- if not reverse:
262
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263
- logdet = torch.sum(-y, [1, 2])
264
- return y, logdet
265
- else:
266
- x = torch.exp(x) * x_mask
267
- return x
268
-
269
-
270
- class Flip(nn.Module):
271
- def forward(self, x, *args, reverse=False, **kwargs):
272
- x = torch.flip(x, [1])
273
- if not reverse:
274
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275
- return x, logdet
276
- else:
277
- return x
278
-
279
-
280
- class ElementwiseAffine(nn.Module):
281
- def __init__(self, channels):
282
- super().__init__()
283
- self.channels = channels
284
- self.m = nn.Parameter(torch.zeros(channels,1))
285
- self.logs = nn.Parameter(torch.zeros(channels,1))
286
-
287
- def forward(self, x, x_mask, reverse=False, **kwargs):
288
- if not reverse:
289
- y = self.m + torch.exp(self.logs) * x
290
- y = y * x_mask
291
- logdet = torch.sum(self.logs * x_mask, [1,2])
292
- return y, logdet
293
- else:
294
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
295
- return x
296
-
297
-
298
- class ResidualCouplingLayer(nn.Module):
299
- def __init__(self,
300
- channels,
301
- hidden_channels,
302
- kernel_size,
303
- dilation_rate,
304
- n_layers,
305
- p_dropout=0,
306
- gin_channels=0,
307
- mean_only=False):
308
- assert channels % 2 == 0, "channels should be divisible by 2"
309
- super().__init__()
310
- self.channels = channels
311
- self.hidden_channels = hidden_channels
312
- self.kernel_size = kernel_size
313
- self.dilation_rate = dilation_rate
314
- self.n_layers = n_layers
315
- self.half_channels = channels // 2
316
- self.mean_only = mean_only
317
-
318
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319
- self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321
- self.post.weight.data.zero_()
322
- self.post.bias.data.zero_()
323
-
324
- def forward(self, x, x_mask, g=None, reverse=False):
325
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326
- h = self.pre(x0) * x_mask
327
- h = self.enc(h, x_mask, g=g)
328
- stats = self.post(h) * x_mask
329
- if not self.mean_only:
330
- m, logs = torch.split(stats, [self.half_channels]*2, 1)
331
- else:
332
- m = stats
333
- logs = torch.zeros_like(m)
334
-
335
- if not reverse:
336
- x1 = m + x1 * torch.exp(logs) * x_mask
337
- x = torch.cat([x0, x1], 1)
338
- logdet = torch.sum(logs, [1,2])
339
- return x, logdet
340
- else:
341
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
342
- x = torch.cat([x0, x1], 1)
343
- return x
344
-
345
-
346
- class ConvFlow(nn.Module):
347
- def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348
- super().__init__()
349
- self.in_channels = in_channels
350
- self.filter_channels = filter_channels
351
- self.kernel_size = kernel_size
352
- self.n_layers = n_layers
353
- self.num_bins = num_bins
354
- self.tail_bound = tail_bound
355
- self.half_channels = in_channels // 2
356
-
357
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359
- self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360
- self.proj.weight.data.zero_()
361
- self.proj.bias.data.zero_()
362
-
363
- def forward(self, x, x_mask, g=None, reverse=False):
364
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365
- h = self.pre(x0)
366
- h = self.convs(h, x_mask, g=g)
367
- h = self.proj(h) * x_mask
368
-
369
- b, c, t = x0.shape
370
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
371
-
372
- unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373
- unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374
- unnormalized_derivatives = h[..., 2 * self.num_bins:]
375
-
376
- x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377
- unnormalized_widths,
378
- unnormalized_heights,
379
- unnormalized_derivatives,
380
- inverse=reverse,
381
- tails='linear',
382
- tail_bound=self.tail_bound
383
- )
384
-
385
- x = torch.cat([x0, x1], 1) * x_mask
386
- logdet = torch.sum(logabsdet * x_mask, [1,2])
387
- if not reverse:
388
- return x, logdet
389
- else:
390
- return x
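
A small sanity-check sketch for the coupling layer above, assuming modules.py and the commons/transforms helpers it imports are available: running the flow forward and then in reverse should reconstruct the input (the layer sizes here are arbitrary).

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)
x = torch.randn(3, 4, 10)                  # [batch, channels, time]
x_mask = torch.ones(3, 1, 10)

y, logdet = layer(x, x_mask)               # forward pass: transformed tensor + log-determinant
x_rec = layer(y, x_mask, reverse=True)     # reverse pass inverts the transform
print(torch.allclose(x, x_rec, atol=1e-5)) # expected: True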
 
VITS-fast-fine-tuning/monotonic_align/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- import numpy as np
2
- import torch
3
- from .monotonic_align.core import maximum_path_c
4
-
5
-
6
- def maximum_path(neg_cent, mask):
7
- """ Cython optimized version.
8
- neg_cent: [b, t_t, t_s]
9
- mask: [b, t_t, t_s]
10
- """
11
- device = neg_cent.device
12
- dtype = neg_cent.dtype
13
- neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
14
- path = np.zeros(neg_cent.shape, dtype=np.int32)
15
-
16
- t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
17
- t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
18
- maximum_path_c(path, neg_cent, t_t_max, t_s_max)
19
- return torch.from_numpy(path).to(device=device, dtype=dtype)
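
A toy call of the alignment search above, assuming the Cython extension has already been compiled (see the note after setup.py below). As used in models.py, dimension 1 holds output frames and dimension 2 holds input tokens, so frames should be at least as long as tokens for a valid monotonic path.

import torch
from monotonic_align import maximum_path

b, n_frames, n_tokens = 1, 6, 4
neg_cent = torch.randn(b, n_frames, n_tokens)   # alignment scores (negative cross-entropy)
mask = torch.ones(b, n_frames, n_tokens)        # all positions valid
path = maximum_path(neg_cent, mask)             # hard 0/1 monotonic alignment, same shape
print(path[0])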
 
VITS-fast-fine-tuning/monotonic_align/core.pyx DELETED
@@ -1,42 +0,0 @@
1
- cimport cython
2
- from cython.parallel import prange
3
-
4
-
5
- @cython.boundscheck(False)
6
- @cython.wraparound(False)
7
- cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
8
- cdef int x
9
- cdef int y
10
- cdef float v_prev
11
- cdef float v_cur
12
- cdef float tmp
13
- cdef int index = t_x - 1
14
-
15
- for y in range(t_y):
16
- for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
17
- if x == y:
18
- v_cur = max_neg_val
19
- else:
20
- v_cur = value[y-1, x]
21
- if x == 0:
22
- if y == 0:
23
- v_prev = 0.
24
- else:
25
- v_prev = max_neg_val
26
- else:
27
- v_prev = value[y-1, x-1]
28
- value[y, x] += max(v_prev, v_cur)
29
-
30
- for y in range(t_y - 1, -1, -1):
31
- path[y, index] = 1
32
- if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
33
- index = index - 1
34
-
35
-
36
- @cython.boundscheck(False)
37
- @cython.wraparound(False)
38
- cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
39
- cdef int b = paths.shape[0]
40
- cdef int i
41
- for i in prange(b, nogil=True):
42
- maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
 
VITS-fast-fine-tuning/monotonic_align/setup.py DELETED
@@ -1,9 +0,0 @@
1
- from distutils.core import setup
2
- from Cython.Build import cythonize
3
- import numpy
4
-
5
- setup(
6
- name = 'monotonic_align',
7
- ext_modules = cythonize("core.pyx"),
8
- include_dirs=[numpy.get_include()]
9
- )
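
The extension built from core.pyx is what monotonic_align/__init__.py imports as .monotonic_align.core, so it has to be compiled before models.py can be imported. The usual convention (not documented in this commit itself) is to run "python setup.py build_ext --inplace" from inside the monotonic_align directory, with a nested monotonic_align/ folder present so the compiled core module lands where the relative import above expects it.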
 
VITS-fast-fine-tuning/preprocess_v2.py DELETED
@@ -1,151 +0,0 @@
- import os
- import argparse
- import json
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
-     parser.add_argument("--languages", default="CJE")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         langs = ["[ZH]", "[JA]", "[EN]"]
-     elif args.languages == "CJ":
-         langs = ["[ZH]", "[JA]"]
-     elif args.languages == "C":
-         langs = ["[ZH]"]
-     new_annos = []
-     # Source 1: transcribed short audios
-     if os.path.exists("short_character_anno.txt"):
-         with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
-             short_character_anno = f.readlines()
-             new_annos += short_character_anno
-     # Source 2: transcribed long audio segments
-     if os.path.exists("long_character_anno.txt"):
-         with open("long_character_anno.txt", 'r', encoding='utf-8') as f:
-             long_character_anno = f.readlines()
-             new_annos += long_character_anno
-
-     # Get all speaker names
-     speakers = []
-     for line in new_annos:
-         path, speaker, text = line.split("|")
-         if speaker not in speakers:
-             speakers.append(speaker)
-     assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."
-     # Source 3 (Optional): sampled audios as extra training helpers
-     if args.add_auxiliary_data:
-         with open("sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
-             old_annos = f.readlines()
-         # filter old_annos according to supported languages
-         filtered_old_annos = []
-         for line in old_annos:
-             for lang in langs:
-                 if lang in line:
-                     filtered_old_annos.append(line)
-         old_annos = filtered_old_annos
-         for line in old_annos:
-             path, speaker, text = line.split("|")
-             if speaker not in speakers:
-                 speakers.append(speaker)
-         num_old_voices = len(old_annos)
-         num_new_voices = len(new_annos)
-         # STEP 1: balance number of new & old voices
-         cc_duplicate = num_old_voices // num_new_voices
-         if cc_duplicate == 0:
-             cc_duplicate = 1
-
-
-         # STEP 2: modify config file
-         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-             hps = json.load(f)
-
-         # assign ids to new speakers
-         speaker2id = {}
-         for i, speaker in enumerate(speakers):
-             speaker2id[speaker] = i
-         # modify n_speakers
-         hps['data']["n_speakers"] = len(speakers)
-         # overwrite speaker names
-         hps['speakers'] = speaker2id
-         hps['train']['log_interval'] = 100
-         hps['train']['eval_interval'] = 1000
-         hps['train']['batch_size'] = 16
-         hps['data']['training_files'] = "final_annotation_train.txt"
-         hps['data']['validation_files'] = "final_annotation_val.txt"
-         # save modified config
-         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-
-         # STEP 3: clean annotations, replace speaker names with assigned speaker IDs
-         import text
-         cleaned_new_annos = []
-         for i, line in enumerate(new_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-         cleaned_old_annos = []
-         for i, line in enumerate(old_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_old_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-         # merge with old annotation
-         final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
-         # save annotation file
-         with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
-             for line in final_annos:
-                 f.write(line)
-         # save annotation file for validation
-         with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
-             for line in cleaned_new_annos:
-                 f.write(line)
-         print("finished")
-     else:
-         # Do not add extra helper data
-         # STEP 1: modify config file
-         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-             hps = json.load(f)
-
-         # assign ids to new speakers
-         speaker2id = {}
-         for i, speaker in enumerate(speakers):
-             speaker2id[speaker] = i
-         # modify n_speakers
-         hps['data']["n_speakers"] = len(speakers)
-         # overwrite speaker names
-         hps['speakers'] = speaker2id
-         hps['train']['log_interval'] = 10
-         hps['train']['eval_interval'] = 100
-         hps['train']['batch_size'] = 16
-         hps['data']['training_files'] = "final_annotation_train.txt"
-         hps['data']['validation_files'] = "final_annotation_val.txt"
-         # save modified config
-         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-
-         # STEP 2: clean annotations, replace speaker names with assigned speaker IDs
-         import text
-
-         cleaned_new_annos = []
-         for i, line in enumerate(new_annos):
-             path, speaker, txt = line.split("|")
-             if len(txt) > 150:
-                 continue
-             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners']).replace("[ZH]", "")
-             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
-
-         final_annos = cleaned_new_annos
-         # save annotation file
-         with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
-             for line in final_annos:
-                 f.write(line)
-         # save annotation file for validation
-         with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
-             for line in cleaned_new_annos:
-                 f.write(line)
-         print("finished")
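The script reads and writes pipe-delimited annotation lines of the form audio_path|speaker|text, rewrites the speaker field to a numeric ID, and expects the text to carry language tokens such as [ZH]. A tiny illustration of that rewrite (file name, speaker, and text are hypothetical, and the real script additionally runs text._clean_text on the text field):

    speaker2id = {"alice": 0}                                     # hypothetical mapping built above
    raw = "./custom_character_voice/alice/processed_0.wav|alice|[ZH]你好[ZH]\n"
    path, speaker, txt = raw.split("|")
    cleaned = path + "|" + str(speaker2id[speaker]) + "|" + txt   # what lands in final_annotation_train.txt
    print(cleaned)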
 
VITS-fast-fine-tuning/rearrange_speaker.py DELETED
@@ -1,37 +0,0 @@
- import torch
- import argparse
- import json
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--model_dir", type=str, default="./OUTPUT_MODEL/G_latest.pth")
-     parser.add_argument("--config_dir", type=str, default="./configs/modified_finetune_speaker.json")
-     args = parser.parse_args()
-
-     model_sd = torch.load(args.model_dir, map_location='cpu')
-     with open(args.config_dir, 'r', encoding='utf-8') as f:
-         hps = json.load(f)
-
-     valid_speakers = list(hps['speakers'].keys())
-     if hps['data']['n_speakers'] > len(valid_speakers):
-         new_emb_g = torch.zeros([len(valid_speakers), 256])
-         old_emb_g = model_sd['model']['emb_g.weight']
-         for i, speaker in enumerate(valid_speakers):
-             new_emb_g[i, :] = old_emb_g[hps['speakers'][speaker], :]
-             hps['speakers'][speaker] = i
-         hps['data']['n_speakers'] = len(valid_speakers)
-         model_sd['model']['emb_g.weight'] = new_emb_g
-         with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-         torch.save(model_sd, "./G_latest.pth")
-     else:
-         with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
-             json.dump(hps, f, indent=2)
-         torch.save(model_sd, "./G_latest.pth")
-     # save another config file copy in MoeGoe format
-     hps['speakers'] = valid_speakers
-     with open("./moegoe_config.json", 'w', encoding='utf-8') as f:
-         json.dump(hps, f, indent=2)
-
-
-
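The pruning above keeps only the embedding rows whose speakers appear in the config, so the exported checkpoint matches the reduced speaker list. A toy illustration with made-up sizes and names:

    import torch

    old_emb_g = torch.randn(1004, 256)                    # n_speakers rows in the fine-tuned checkpoint
    speakers = {"alice": 1001, "bob": 1002}               # hypothetical name -> old speaker id mapping
    new_emb_g = torch.zeros(len(speakers), 256)
    for new_id, (name, old_id) in enumerate(speakers.items()):
        new_emb_g[new_id] = old_emb_g[old_id]             # copy only the rows that are still needed
    print(new_emb_g.shape)                                # torch.Size([2, 256])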
 
VITS-fast-fine-tuning/requirements.txt DELETED
@@ -1,24 +0,0 @@
- Cython
- librosa==0.9.1
- numpy
- scipy
- tensorboard
- torch==1.13.1
- torchvision==0.14.1
- torchaudio==0.13.1
- unidecode
- pyopenjtalk
- jamo
- pypinyin
- jieba
- protobuf
- cn2an
- inflect
- eng_to_ipa
- ko_pron
- indic_transliteration==2.3.37
- num_thai==0.0.5
- opencc==1.1.1
- demucs
- openai-whisper
- gradio
 
VITS-fast-fine-tuning/short_audio_transcribe.py DELETED
@@ -1,111 +0,0 @@
- import whisper
- import os
- import torchaudio
- import argparse
- import torch
-
- lang2token = {
-     'zh': "[ZH]",
-     'ja': "[JA]",
-     "en": "[EN]",
- }
- def transcribe_one(audio_path):
-     # load audio and pad/trim it to fit 30 seconds
-     audio = whisper.load_audio(audio_path)
-     audio = whisper.pad_or_trim(audio)
-
-     # make log-Mel spectrogram and move to the same device as the model
-     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-     # detect the spoken language
-     _, probs = model.detect_language(mel)
-     print(f"Detected language: {max(probs, key=probs.get)}")
-     lang = max(probs, key=probs.get)
-     # decode the audio
-     options = whisper.DecodingOptions()
-     result = whisper.decode(model, mel, options)
-
-     # print the recognized text
-     print(result.text)
-     return lang, result.text
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--languages", default="CJE")
-     parser.add_argument("--whisper_size", default="medium")
-     args = parser.parse_args()
-     if args.languages == "CJE":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-             "en": "[EN]",
-         }
-     elif args.languages == "CJ":
-         lang2token = {
-             'zh': "[ZH]",
-             'ja': "[JA]",
-         }
-     elif args.languages == "C":
-         lang2token = {
-             'zh': "[ZH]",
-         }
-     assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
-     model = whisper.load_model(args.whisper_size)
-     parent_dir = "./custom_character_voice/"
-     speaker_names = list(os.walk(parent_dir))[0][1]
-     speaker_annos = []
-     # resample audios
-     for speaker in speaker_names:
-         for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
-             # try to load file as audio
-             if wavfile.startswith("processed_"):
-                 continue
-             try:
-                 wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
-                                           channels_first=True)
-                 wav = wav.mean(dim=0).unsqueeze(0)
-                 if sr != 22050:
-                     wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
-                 if wav.shape[1] / sr > 20:
-                     print(f"{wavfile} too long, ignoring\n")
-                 save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
-                 torchaudio.save(save_path, wav, 22050, channels_first=True)
-                 # transcribe text
-                 lang, text = transcribe_one(save_path)
-                 if lang not in list(lang2token.keys()):
-                     print(f"{lang} not supported, ignoring\n")
-                     continue
-                 text = lang2token[lang] + text + lang2token[lang] + "\n"
-                 speaker_annos.append(save_path + "|" + speaker + "|" + text)
-             except:
-                 continue
-
-     # # clean annotation
-     # import argparse
-     # import text
-     # from utils import load_filepaths_and_text
-     # for i, line in enumerate(speaker_annos):
-     #     path, sid, txt = line.split("|")
-     #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
-     #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
-     #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
-     # write into annotation
-     if len(speaker_annos) == 0:
-         print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
-         print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
-     with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
-         for line in speaker_annos:
-             f.write(line)
-
-     # import json
-     # # generate new config
-     # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
-     #     hps = json.load(f)
-     # # modify n_speakers
-     # hps['data']["n_speakers"] = 1000 + len(speaker2id)
-     # # add speaker names
-     # for speaker in speaker_names:
-     #     hps['speakers'][speaker] = speaker2id[speaker]
-     # # save modified config
-     # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
-     #     json.dump(hps, f, indent=2)
-     # print("finished")
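Each accepted clip produces one pipe-delimited line in short_character_anno.txt, with the transcription wrapped in the detected language token. For example (speaker name and transcription are made up):

    lang2token = {'zh': "[ZH]", 'ja': "[JA]", "en": "[EN]"}
    lang, text, speaker = 'zh', "你好", "alice"          # hypothetical Whisper output and speaker folder
    save_path = "./custom_character_voice/alice/processed_0.wav"
    line = save_path + "|" + speaker + "|" + lang2token[lang] + text + lang2token[lang] + "\n"
    print(line)   # ./custom_character_voice/alice/processed_0.wav|alice|[ZH]你好[ZH]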
 
VITS-fast-fine-tuning/text/LICENSE DELETED
@@ -1,19 +0,0 @@
- Copyright (c) 2017 Keith Ito
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
 
VITS-fast-fine-tuning/text/__init__.py DELETED
@@ -1,60 +0,0 @@
- """ from https://github.com/keithito/tacotron """
- from text import cleaners
- from text.symbols import symbols
-
-
- # Mappings from symbol to numeric ID and vice versa:
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
- def text_to_sequence(text, symbols, cleaner_names):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-         cleaner_names: names of the cleaner functions to run the text through
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = []
-     symbol_to_id = {s: i for i, s in enumerate(symbols)}
-     clean_text = _clean_text(text, cleaner_names)
-     print(clean_text)
-     print(f" length:{len(clean_text)}")
-     for symbol in clean_text:
-         if symbol not in symbol_to_id.keys():
-             continue
-         symbol_id = symbol_to_id[symbol]
-         sequence += [symbol_id]
-     print(f" length:{len(sequence)}")
-     return sequence
-
-
- def cleaned_text_to_sequence(cleaned_text, symbols):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     symbol_to_id = {s: i for i, s in enumerate(symbols)}
-     sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
-     return sequence
-
-
- def sequence_to_text(sequence):
-     '''Converts a sequence of IDs back to a string'''
-     result = ''
-     for symbol_id in sequence:
-         s = _id_to_symbol[symbol_id]
-         result += s
-     return result
-
-
- def _clean_text(text, cleaner_names):
-     for name in cleaner_names:
-         cleaner = getattr(cleaners, name)
-         if not cleaner:
-             raise Exception('Unknown cleaner: %s' % name)
-         text = cleaner(text)
-     return text
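cleaned_text_to_sequence is the lookup used once the cleaners have already run. A minimal round-trip with a toy symbol list (the real list comes from text/symbols.py, and importing the text package needs the cleaner dependencies from requirements.txt):

    from text import cleaned_text_to_sequence

    toy_symbols = ["_", "a", "b", "c", " "]                  # hypothetical symbol inventory
    print(cleaned_text_to_sequence("ab cab", toy_symbols))   # [1, 2, 4, 3, 1, 2]; unknown symbols are dropped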
 
VITS-fast-fine-tuning/text/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (2.34 kB)
 
VITS-fast-fine-tuning/text/__pycache__/cleaners.cpython-37.pyc DELETED
Binary file (5.45 kB)
 
VITS-fast-fine-tuning/text/__pycache__/english.cpython-37.pyc DELETED
Binary file (4.93 kB)
 
VITS-fast-fine-tuning/text/__pycache__/japanese.cpython-37.pyc DELETED
Binary file (4.6 kB)
 
VITS-fast-fine-tuning/text/__pycache__/korean.cpython-37.pyc DELETED
Binary file (5.75 kB)
 
VITS-fast-fine-tuning/text/__pycache__/mandarin.cpython-37.pyc DELETED
Binary file (7.51 kB)
 
VITS-fast-fine-tuning/text/__pycache__/sanskrit.cpython-37.pyc DELETED
Binary file (1.63 kB)
 
VITS-fast-fine-tuning/text/__pycache__/symbols.cpython-37.pyc DELETED
Binary file (417 Bytes)