nevreal commited on
Commit
ecfa0da
1 Parent(s): b1351ab

Upload Complited files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/workflows/code_formatter.yml +51 -0
  2. .gitignore +166 -0
  3. LICENSE.md +25 -0
  4. README.md +43 -3
  5. assets/config.json +6 -0
  6. assets/i18n/i18n.py +52 -0
  7. assets/i18n/languages/en_US.json +89 -0
  8. assets/i18n/languages/pt_BR.json +89 -0
  9. assets/i18n/scan.py +71 -0
  10. core.py +1023 -0
  11. logs/.gitkeep +0 -0
  12. main.py +53 -0
  13. programs/applio_code/rvc/configs/config.py +192 -0
  14. programs/applio_code/rvc/configs/v1/32000.json +47 -0
  15. programs/applio_code/rvc/configs/v1/40000.json +47 -0
  16. programs/applio_code/rvc/configs/v1/48000.json +47 -0
  17. programs/applio_code/rvc/configs/v2/32000.json +43 -0
  18. programs/applio_code/rvc/configs/v2/40000.json +43 -0
  19. programs/applio_code/rvc/configs/v2/48000.json +43 -0
  20. programs/applio_code/rvc/infer/infer.py +470 -0
  21. programs/applio_code/rvc/infer/pipeline.py +701 -0
  22. programs/applio_code/rvc/lib/algorithm/__init__.py +0 -0
  23. programs/applio_code/rvc/lib/algorithm/attentions.py +292 -0
  24. programs/applio_code/rvc/lib/algorithm/commons.py +225 -0
  25. programs/applio_code/rvc/lib/algorithm/discriminators.py +199 -0
  26. programs/applio_code/rvc/lib/algorithm/encoders.py +219 -0
  27. programs/applio_code/rvc/lib/algorithm/generators.py +199 -0
  28. programs/applio_code/rvc/lib/algorithm/modules.py +130 -0
  29. programs/applio_code/rvc/lib/algorithm/normalization.py +31 -0
  30. programs/applio_code/rvc/lib/algorithm/nsf.py +200 -0
  31. programs/applio_code/rvc/lib/algorithm/residuals.py +309 -0
  32. programs/applio_code/rvc/lib/algorithm/synthesizers.py +243 -0
  33. programs/applio_code/rvc/lib/predictors/F0Extractor.py +107 -0
  34. programs/applio_code/rvc/lib/predictors/FCPE.py +920 -0
  35. programs/applio_code/rvc/lib/predictors/RMVPE.py +569 -0
  36. programs/applio_code/rvc/lib/tools/analyzer.py +76 -0
  37. programs/applio_code/rvc/lib/tools/gdown.py +354 -0
  38. programs/applio_code/rvc/lib/tools/launch_tensorboard.py +21 -0
  39. programs/applio_code/rvc/lib/tools/model_download.py +385 -0
  40. programs/applio_code/rvc/lib/tools/prerequisites_download.py +164 -0
  41. programs/applio_code/rvc/lib/tools/pretrained_selector.py +63 -0
  42. programs/applio_code/rvc/lib/tools/split_audio.py +107 -0
  43. programs/applio_code/rvc/lib/tools/tts.py +20 -0
  44. programs/applio_code/rvc/lib/tools/tts_voices.json +0 -0
  45. programs/applio_code/rvc/lib/utils.py +116 -0
  46. programs/applio_code/rvc/models/embedders/contentvec/config.json +71 -0
  47. programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin +3 -0
  48. programs/applio_code/rvc/models/predictors/fcpe.pt +3 -0
  49. programs/applio_code/rvc/models/predictors/rmvpe.pt +3 -0
  50. programs/music_separation_code/ensemble.py +183 -0
.github/workflows/code_formatter.yml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Code Formatter
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ push_format:
10
+ runs-on: ubuntu-latest
11
+
12
+ permissions:
13
+ contents: write
14
+ pull-requests: write
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ with:
19
+ ref: ${{github.ref_name}}
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+
26
+ - name: Install Black
27
+ run: pip install "black[jupyter]"
28
+
29
+ - name: Run Black
30
+ # run: black $(git ls-files '*.py')
31
+ run: black . --exclude=".*\.ipynb$"
32
+
33
+ - name: Commit Back
34
+ continue-on-error: true
35
+ id: commitback
36
+ run: |
37
+ git config --local user.email "github-actions[bot]@users.noreply.github.com"
38
+ git config --local user.name "github-actions[bot]"
39
+ git add --all
40
+ git commit -m "chore(format): run black on ${{github.ref_name}}"
41
+
42
+ - name: Create Pull Request
43
+ if: steps.commitback.outcome == 'success'
44
+ continue-on-error: true
45
+ uses: peter-evans/create-pull-request@v5
46
+ with:
47
+ delete-branch: true
48
+ body: "Automatically apply code formatter change"
49
+ title: "chore(format): run black on ${{github.ref_name}}"
50
+ commit-message: "chore(format): run black on ${{github.ref_name}}"
51
+ branch: formatter/${{github.ref_name}}
.gitignore ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ share/python-wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+
73
+ # PyBuilder
74
+ .pybuilder/
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ # For a library or package, you might want to ignore these files since the code is
86
+ # intended to run in multiple environments; otherwise, check them in:
87
+ # .python-version
88
+
89
+ # pipenv
90
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
92
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
93
+ # install all needed dependencies.
94
+ #Pipfile.lock
95
+
96
+ # poetry
97
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
98
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
99
+ # commonly ignored for libraries.
100
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
101
+ #poetry.lock
102
+
103
+ # pdm
104
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
105
+ #pdm.lock
106
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
107
+ # in version control.
108
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
109
+ .pdm.toml
110
+ .pdm-python
111
+ .pdm-build/
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
162
+
163
+ # mine
164
+ .flac
165
+ .pth
166
+ .pt
LICENSE.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ## KindaHex Non-Commercial Use License (HNCU)
4
+
5
+
6
+ This repository is licensed under the **KindaHex Non-Commercial Use License (HNCU)**. By using, modifying, or distributing any content from this repository, you agree to the terms outlined below.
7
+
8
+ ### Terms of Use:
9
+ 1. **Non-Commercial Use Only**: You are permitted to use, modify, and distribute the contents of this repository **only for non-commercial purposes**. Commercial use, including selling, licensing, or distributing for profit, is strictly prohibited.
10
+
11
+ 2. **Modification and Derivative Works**: You may modify the contents of this repository and create derivative works. However, any modification or derivative work must also adhere to the non-commercial restriction and be subject to the terms of this license.
12
+
13
+ 3. **Attribution**: When using or distributing the content (either as-is or modified), you must provide proper attribution to the original creator (blane187gt) in a manner that is reasonable and customary for the medium.
14
+
15
+ 4. **No Warranty**: The content in this repository is provided "as-is," without any warranty, express or implied, including but not limited to warranties of merchantability or fitness for a particular purpose.
16
+
17
+ 5. **Compliance with Laws**: You are responsible for ensuring that your use of the content complies with all applicable laws and regulations.
18
+
19
+ 6. **Termination**: If you violate any of the terms of this license, your rights to use the repository’s content will be automatically terminated. You must cease all use and distribution of the content immediately upon termination.
20
+
21
+ ### Restrictions:
22
+ - You may **not** use this repository's content for commercial gain, including but not limited to creating products, services, or tools that are sold or monetized.
23
+ - You may **not** sublicense or transfer rights to third parties for commercial purposes.
24
+ - You may not use the content in any manner that competes with the original repository or its creator.
25
+
README.md CHANGED
@@ -1,3 +1,43 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HexGen RVC
2
+
3
+
4
+ https://colab.research.google.com/drive/1dmGS0vEWuX55Z1w1tSRD6lJDV8s2deY0?usp=sharing
5
+
6
+ HexGen RVC is a tool designed for generating high-quality AI vocal covers using advanced source separation, vocal modeling, and audio processing techniques. This project builds on several community-driven efforts, integrating the best tools and frameworks available for music and vocal manipulation.
7
+
8
+ ## Features
9
+ - **AI-Driven Vocal Cover Generation**: Produce custom vocal covers with ease.
10
+ - **Source Separation**: Isolate instrumentals and vocals from any track.
11
+ - **Efficient Workflow**: Streamlined integration with popular tools for music processing.
12
+ - **Colab Support**: Easily deploy and test models in Google Colab environments.
13
+
14
+ ## Installation
15
+ 1. Clone the repository:
16
+ ```bash
17
+ git clone https://github.com/blane187gt/hexGen-RVC.git
18
+ cd hexGen-RVC
19
+ ```
20
+ 2. Follow specific setup instructions provided in the [documentation](https://github.com/blane187gt/hexGen-RVC/wiki) (if available) or in the code comments.
21
+
22
+ ## Usage
23
+ 1. Prepare your audio input file(s) and place them in the appropriate folder.
24
+ 2. Run the script or Colab notebook as per the instructions.
25
+ 3. Customize the output by tweaking the parameters and models used.
26
+
27
+ ## Credits
28
+ This project would not have been possible without the contributions and support of the following tools and creators:
29
+
30
+ - [Audio Separator](https://github.com/karaokenerds/python-audio-separator) by [Andrew Beveridge](https://github.com/beveradb)
31
+ - [Applio](https://github.com/IAHispano/Applio) by [IAHispano](https://github.com/IAHispano)
32
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp)
33
+ - [Ultimate Vocal Remover GUI](https://github.com/Anjok07/ultimatevocalremovergui) by [Anjok07](https://github.com/Anjok07)
34
+ - [Music Source Separation Universal Training Code](https://github.com/ZFTurbo/Music-Source-Separation-Training) by [ZFTurbo](https://github.com/ZFTurbo)
35
+ - [AICoverGen](https://github.com/SociallyIneptWeeb/AICoverGen) by [SociallyIneptWeeb](https://github.com/SociallyIneptWeeb)
36
+ - [FullmatheusBallZ](https://www.youtube.com/@FullmatheusBallZ) for testing the Colab scripts.
37
+ - [Shirou](https://github.com/ShiromiyaG), the original project inspiration.
38
+
39
+ ## Contributing
40
+ Feel free to submit pull requests or create issues for any improvements or bugs you encounter. Contributions are always welcome!
41
+
42
+ ## License
43
+ This project is licensed under the terms specified in the `LICENSE` file. Ensure compliance with third-party dependencies when using or modifying this project.
assets/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "lang": {
3
+ "override": false,
4
+ "selected_lang": "en_US"
5
+ }
6
+ }
assets/i18n/i18n.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import json
3
+ from pathlib import Path
4
+ from locale import getdefaultlocale
5
+
6
+ now_dir = os.getcwd()
7
+ sys.path.append(now_dir)
8
+
9
+
10
+ class I18nAuto:
11
+ LANGUAGE_PATH = os.path.join(now_dir, "assets", "i18n", "languages")
12
+
13
+ def __init__(self, language=None):
14
+ with open(
15
+ os.path.join(now_dir, "assets", "config.json"), "r", encoding="utf8"
16
+ ) as file:
17
+ config = json.load(file)
18
+ override = config["lang"]["override"]
19
+ lang_prefix = config["lang"]["selected_lang"]
20
+
21
+ self.language = lang_prefix
22
+
23
+ if override == False:
24
+ language = language or getdefaultlocale()[0]
25
+ lang_prefix = language[:2] if language is not None else "en"
26
+ available_languages = self._get_available_languages()
27
+ matching_languages = [
28
+ lang for lang in available_languages if lang.startswith(lang_prefix)
29
+ ]
30
+ self.language = matching_languages[0] if matching_languages else "en_US"
31
+
32
+ self.language_map = self._load_language_list()
33
+
34
+ def _load_language_list(self):
35
+ try:
36
+ file_path = Path(self.LANGUAGE_PATH) / f"{self.language}.json"
37
+ with open(file_path, "r", encoding="utf-8") as file:
38
+ return json.load(file)
39
+ except FileNotFoundError:
40
+ raise FileNotFoundError(
41
+ f"Failed to load language file for {self.language}. Check if the correct .json file exists."
42
+ )
43
+
44
+ def _get_available_languages(self):
45
+ language_files = [path.stem for path in Path(self.LANGUAGE_PATH).glob("*.json")]
46
+ return language_files
47
+
48
+ def _language_exists(self, language):
49
+ return (Path(self.LANGUAGE_PATH) / f"{language}.json").exists()
50
+
51
+ def __call__(self, key):
52
+ return self.language_map.get(key, key)
assets/i18n/languages/en_US.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Voice Model": "Voice Model",
3
+ "Select the voice model to use for the conversion.": "Select the voice model to use for the conversion.",
4
+ "Index File": "Index File",
5
+ "Select the index file to use for the conversion.": "Select the index file to use for the conversion.",
6
+ "Refresh": "Refresh",
7
+ "Unload Voice": "Unload Voice",
8
+ "Upload Audio": "Upload Audio",
9
+ "Select Audio": "Select Audio",
10
+ "Select the audio to convert.": "Select the audio to convert.",
11
+ "Advanced Settings": "Advanced Settings",
12
+ "RVC Settings": "RVC Settings",
13
+ "Output Path": "Output Path",
14
+ "Enter output path": "Enter output path",
15
+ "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "The path where the output audio will be saved, by default in audio_files/rvc/output.wav",
16
+ "Clear Outputs (Deletes all audios in assets/audios)": "Clear Outputs (Deletes all audios in assets/audios)",
17
+ "Export Format": "Export Format",
18
+ "Select the format to export the audio.": "Select the format to export the audio.",
19
+ "Split Audio": "Split Audio",
20
+ "Split the audio into chunks for inference to obtain better results in some cases.": "Split the audio into chunks for inference to obtain better results in some cases.",
21
+ "Pitch Extractor": "Pitch Extractor",
22
+ "Pitch extract Algorith.": "Pitch extract Algorith.",
23
+ "Hop Length": "Hop Length",
24
+ "Hop length for pitch extraction.": "Hop length for pitch extraction.",
25
+ "Embedder Model": "Embedder Model",
26
+ "Model used for learning speaker embedding.": "Model used for learning speaker embedding.",
27
+ "Autotune": "Autotune",
28
+ "Apply a soft autotune to your inferences, recommended for singing conversions.": "Apply a soft autotune to your inferences, recommended for singing conversions.",
29
+ "Pitch": "Pitch",
30
+ "Adjust the pitch of the audio.": "Adjust the pitch of the audio.",
31
+ "Filter Radius": "Filter Radius",
32
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.",
33
+ "Search Feature Ratio": "Search Feature Ratio",
34
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.",
35
+ "Volume Envelope": "Volume Envelope",
36
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.": "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.",
37
+ "Protect Voiceless Consonants": "Protect Voiceless Consonants",
38
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.",
39
+ "Audio Separation Settings": "Audio Separation Settings",
40
+ "Use TTA": "Use TTA",
41
+ "Use Test Time Augmentation.": "Use Test Time Augmentation.",
42
+ "Batch Size": "Batch Size",
43
+ "Set the batch size for the separation.": "Set the batch size for the separation.",
44
+ "Vocals Model": "Vocals Model",
45
+ "Select the vocals model to use for the separation.": "Select the vocals model to use for the separation.",
46
+ "Karaoke Model": "Karaoke Model",
47
+ "Select the karaoke model to use for the separation.": "Select the karaoke model to use for the separation.",
48
+ "Dereverb Model": "Dereverb Model",
49
+ "Select the dereverb model to use for the separation.": "Select the dereverb model to use for the separation.",
50
+ "Deeecho": "Deeecho",
51
+ "Apply deeecho to the audio.": "Apply deeecho to the audio.",
52
+ "Deeecho Model": "Deeecho Model",
53
+ "Select the deeecho model to use for the separation.": "Select the deeecho model to use for the separation.",
54
+ "Denoise": "Denoise",
55
+ "Apply denoise to the audio.": "Apply denoise to the audio.",
56
+ "Denoise Model": "Denoise Model",
57
+ "Select the denoise model to use for the separation.": "Select the denoise model to use for the separation.",
58
+ "Audio post-process Settings": "Audio post-process Settings",
59
+ "Delete Audios": "Delete Audios",
60
+ "Delete the audios after the conversion.": "Delete the audios after the conversion.",
61
+ "Reverb": "Reverb",
62
+ "Apply reverb to the audio.": "Apply reverb to the audio.",
63
+ "Reverb Room Size": "Reverb Room Size",
64
+ "Set the room size of the reverb.": "Set the room size of the reverb.",
65
+ "Reverb Damping": "Reverb Damping",
66
+ "Set the damping of the reverb.": "Set the damping of the reverb.",
67
+ "Reverb Wet Gain": "Reverb Wet Gain",
68
+ "Set the wet gain of the reverb.": "Set the wet gain of the reverb.",
69
+ "Reverb Dry Gain": "Reverb Dry Gain",
70
+ "Set the dry gain of the reverb.": "Set the dry gain of the reverb.",
71
+ "Reverb Width": "Reverb Width",
72
+ "Set the width of the reverb.": "Set the width of the reverb.",
73
+ "Vocals Volume": "Vocals Volume",
74
+ "Adjust the volume of the vocals.": "Adjust the volume of the vocals.",
75
+ "Instrumentals Volume": "Instrumentals Volume",
76
+ "Adjust the volume of the Instrumentals.": "Adjust the volume of the Instrumentals.",
77
+ "Backing Vocals Volume": "Backing Vocals Volume",
78
+ "Adjust the volume of the backing vocals.": "Adjust the volume of the backing vocals.",
79
+ "Device Settings": "Device Settings",
80
+ "Device": "Device",
81
+ "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -",
82
+ "Convert": "Convert",
83
+ "Output Information": "Output Information",
84
+ "The output information will be displayed here.": "The output information will be displayed here.",
85
+ "Export Audio": "Export Audio",
86
+ "Music URL": "Music URL",
87
+ "Download": "Download",
88
+ "Model URL": "Model URL"
89
+ }
assets/i18n/languages/pt_BR.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Voice Model": "Modelo de Voz",
3
+ "Select the voice model to use for the conversion.": "Selecione o modelo de voz a ser usado para a conversão.",
4
+ "Index File": "Arquivo Index",
5
+ "Select the index file to use for the conversion.": "Selecione o arquivo Index a ser usado para a conversão.",
6
+ "Refresh": "Atualizar",
7
+ "Unload Voice": "Descarregar Voz",
8
+ "Upload Audio": "Carregar Áudio",
9
+ "Select Audio": "Selecionar Áudio",
10
+ "Select the audio to convert.": "Selecione o áudio a ser convertido.",
11
+ "Advanced Settings": "Configurações Avançadas",
12
+ "RVC Settings": "Configurações RVC",
13
+ "Output Path": "Caminho de Saída",
14
+ "Enter output path": "Insira o caminho de saída",
15
+ "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "O caminho onde o áudio de saída será salvo, por padrão em audio_files/rvc/output.wav",
16
+ "Clear Outputs (Deletes all audios in assets/audios)": "Limpar Saídas (Exclui todos os áudios em assets/audios)",
17
+ "Export Format": "Formato de Exportação",
18
+ "Select the format to export the audio.": "Selecione o formato para exportar o áudio.",
19
+ "Split Audio": "Dividir Áudio",
20
+ "Split the audio into chunks for inference to obtain better results in some cases.": "Divida o áudio em partes para inferência para obter melhores resultados em alguns casos.",
21
+ "Pitch Extractor": "Extrator de Pitch",
22
+ "Pitch extract Algorith.": "Algoritmo de Extração de Pitch",
23
+ "Hop Length": "Hop Length",
24
+ "Hop length for pitch extraction.": "Hop Length para extração de pitch.",
25
+ "Embedder Model": "Modelo de Embedding",
26
+ "Model used for learning speaker embedding.": "Modelo usado para aprendizado de embedding de locutor.",
27
+ "Autotune": "Autotune",
28
+ "Apply a soft autotune to your inferences, recommended for singing conversions.": "Aplique um autotune suave às suas inferências, recomendado para conversões de canto.",
29
+ "Pitch": "Pitch",
30
+ "Adjust the pitch of the audio.": "Ajuste o pitch do áudio.",
31
+ "Filter Radius": "Raio do Filtro",
32
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "Se o número for maior ou igual a três, o uso de filtragem mediana nos resultados de tom coletados tem o potencial de diminuir a respiração.",
33
+ "Search Feature Ratio": "Proporção da Função de Busca",
34
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influência exercida pelo arquivo de índice; um valor mais alto corresponde a maior influência. No entanto, optar por valores mais baixos pode ajudar a mitigar artefatos presentes no áudio.",
35
+ "Volume Envelope": "Envelope de Volume",
36
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.": "Substitua ou misture com o envelope de volume da saída. Quanto mais próximo o valor estiver de 1, mais o envelope de saída será empregado.",
37
+ "Protect Voiceless Consonants": "Proteger Consoantes Surdas",
38
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Proteja consoantes distintas e sons de respiração para evitar rasgos eletroacústicos e outros artefatos. Ajustar o parâmetro para seu valor máximo de 0,5 oferece proteção abrangente. No entanto, reduzir esse valor pode diminuir a extensão da proteção enquanto potencialmente mitiga o efeito de indexação.",
39
+ "Audio Separation Settings": "Configurações de Separação de Áudio",
40
+ "Use TTA": "Usar TTA",
41
+ "Use Test Time Augmentation.": "Usar Aumento de Tempo de Teste.",
42
+ "Batch Size": "Batch Size",
43
+ "Set the batch size for the separation.": "Defina o Batch Size para a separação.",
44
+ "Vocals Model": "Modelo de Vocais",
45
+ "Select the vocals model to use for the separation.": "Selecione o modelo de vocais a ser usado para a separação.",
46
+ "Karaoke Model": "Modelo de Karaokê",
47
+ "Select the karaoke model to use for the separation.": "Selecione o modelo de karaokê a ser usado para a separação.",
48
+ "Dereverb Model": "Modelo de Dereverb",
49
+ "Select the dereverb model to use for the separation.": "Selecione o modelo de dereverb a ser usado para a separação.",
50
+ "Deeecho": "Deeecho",
51
+ "Apply deeecho to the audio.": "Aplicar deeecho ao áudio.",
52
+ "Deeecho Model": "Modelo de Deeecho",
53
+ "Select the deeecho model to use for the separation.": "Selecione o modelo de deeecho a ser usado para a separação.",
54
+ "Denoise": "Redução de Ruído",
55
+ "Apply denoise to the audio.": "Aplicar redução de ruído ao áudio.",
56
+ "Denoise Model": "Modelo de Redução de Ruído",
57
+ "Select the denoise model to use for the separation.": "Selecione o modelo de redução de ruído a ser usado para a separação.",
58
+ "Audio post-process Settings": "Configurações de Pós-processamento de Áudio",
59
+ "Delete Audios": "Excluir Áudios",
60
+ "Delete the audios after the conversion.": "Excluir os áudios após a conversão.",
61
+ "Reverb": "Reverberação",
62
+ "Apply reverb to the audio.": "Aplicar reverberação ao áudio.",
63
+ "Reverb Room Size": "Tamanho da Sala de Reverberação",
64
+ "Set the room size of the reverb.": "Definir o tamanho da sala de reverberação.",
65
+ "Reverb Damping": "Amortecimento da Reverberação",
66
+ "Set the damping of the reverb.": "Definir o amortecimento da reverberação.",
67
+ "Reverb Wet Gain": "Ganho Molhado da Reverberação",
68
+ "Set the wet gain of the reverb.": "Definir o ganho molhado da reverberação.",
69
+ "Reverb Dry Gain": "Ganho Seco da Reverberação",
70
+ "Set the dry gain of the reverb.": "Definir o ganho seco da reverberação.",
71
+ "Reverb Width": "Largura da Reverberação",
72
+ "Set the width of the reverb.": "Definir a largura da reverberação.",
73
+ "Vocals Volume": "Volume dos Vocais",
74
+ "Adjust the volume of the vocals.": "Ajustar o volume dos vocais.",
75
+ "Instrumentals Volume": "Volume dos Instrumentais",
76
+ "Adjust the volume of the Instrumentals.": "Ajustar o volume dos instrumentais.",
77
+ "Backing Vocals Volume": "Volume dos Vocais de Apoio",
78
+ "Adjust the volume of the backing vocals.": "Ajustar o volume dos vocais de apoio.",
79
+ "Device Settings": "Configurações do Dispositivo",
80
+ "Device": "Dispositivo",
81
+ "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Selecione o dispositivo a ser usado para a conversão. 0 a ∞ separados por - e para CPU deixe apenas um -",
82
+ "Convert": "Converter",
83
+ "Output Information": "Informações de Saída",
84
+ "The output information will be displayed here.": "As informações de saída serão exibidas aqui.",
85
+ "Export Audio": "Exportar Áudio",
86
+ "Music URL": "URL da Música",
87
+ "Download": "Baixar",
88
+ "Model URL": "URL do Modelo"
89
+ }
assets/i18n/scan.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import json
3
+ from pathlib import Path
4
+ from collections import OrderedDict
5
+
6
+
7
+ def extract_i18n_strings(node):
8
+ i18n_strings = []
9
+
10
+ if (
11
+ isinstance(node, ast.Call)
12
+ and isinstance(node.func, ast.Name)
13
+ and node.func.id == "i18n"
14
+ ):
15
+ for arg in node.args:
16
+ if isinstance(arg, ast.Str):
17
+ i18n_strings.append(arg.s)
18
+
19
+ for child_node in ast.iter_child_nodes(node):
20
+ i18n_strings.extend(extract_i18n_strings(child_node))
21
+
22
+ return i18n_strings
23
+
24
+
25
+ def process_file(file_path):
26
+ with open(file_path, "r", encoding="utf8") as file:
27
+ code = file.read()
28
+ if "I18nAuto" in code:
29
+ tree = ast.parse(code)
30
+ i18n_strings = extract_i18n_strings(tree)
31
+ print(file_path, len(i18n_strings))
32
+ return i18n_strings
33
+ return []
34
+
35
+
36
+ # Use pathlib for file handling
37
+ py_files = Path(".").rglob("*.py")
38
+
39
+ # Use a set to store unique strings
40
+ code_keys = set()
41
+
42
+ for py_file in py_files:
43
+ strings = process_file(py_file)
44
+ code_keys.update(strings)
45
+
46
+ print()
47
+ print("Total unique:", len(code_keys))
48
+
49
+ standard_file = "languages/en_US.json"
50
+ with open(standard_file, "r", encoding="utf-8") as file:
51
+ standard_data = json.load(file, object_pairs_hook=OrderedDict)
52
+ standard_keys = set(standard_data.keys())
53
+
54
+ # Combine unused and missing keys sections
55
+ unused_keys = standard_keys - code_keys
56
+ missing_keys = code_keys - standard_keys
57
+
58
+ print("Unused keys:", len(unused_keys))
59
+ for unused_key in unused_keys:
60
+ print("\t", unused_key)
61
+
62
+ print("Missing keys:", len(missing_keys))
63
+ for missing_key in missing_keys:
64
+ print("\t", missing_key)
65
+
66
+ code_keys_dict = OrderedDict((s, s) for s in code_keys)
67
+
68
+ # Use context manager for writing back to the file
69
+ with open(standard_file, "w", encoding="utf-8") as file:
70
+ json.dump(code_keys_dict, file, ensure_ascii=False, indent=4, sort_keys=True)
71
+ file.write("\n")
core.py ADDED
@@ -0,0 +1,1023 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, os
2
+ import subprocess
3
+ import torch
4
+ from functools import lru_cache
5
+ import shutil
6
+ from pedalboard import Pedalboard, Reverb
7
+ from pedalboard.io import AudioFile
8
+ from pydub import AudioSegment
9
+ from audio_separator.separator import Separator
10
+ import logging
11
+ import yaml
12
+
13
+ now_dir = os.getcwd()
14
+ sys.path.append(now_dir)
15
+ from programs.applio_code.rvc.infer.infer import VoiceConverter
16
+ from programs.applio_code.rvc.lib.tools.model_download import model_download_pipeline
17
+ from programs.music_separation_code.inference import proc_file
18
+
19
+ models_vocals = [
20
+ {
21
+ "name": "Mel-Roformer by KimberleyJSN",
22
+ "path": os.path.join(now_dir, "models", "mel-vocals"),
23
+ "model": os.path.join(now_dir, "models", "mel-vocals", "model.ckpt"),
24
+ "config": os.path.join(now_dir, "models", "mel-vocals", "config.yaml"),
25
+ "type": "mel_band_roformer",
26
+ "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml",
27
+ "model_url": "https://huggingface.co/KimberleyJSN/melbandroformer/resolve/main/MelBandRoformer.ckpt",
28
+ },
29
+ {
30
+ "name": "BS-Roformer by ViperX",
31
+ "path": os.path.join(now_dir, "models", "bs-vocals"),
32
+ "model": os.path.join(now_dir, "models", "bs-vocals", "model.ckpt"),
33
+ "config": os.path.join(now_dir, "models", "bs-vocals", "config.yaml"),
34
+ "type": "bs_roformer",
35
+ "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml",
36
+ "model_url": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
37
+ },
38
+ {
39
+ "name": "MDX23C",
40
+ "path": os.path.join(now_dir, "models", "mdx23c-vocals"),
41
+ "model": os.path.join(now_dir, "models", "mdx23c-vocals", "model.ckpt"),
42
+ "config": os.path.join(now_dir, "models", "mdx23c-vocals", "config.yaml"),
43
+ "type": "mdx23c",
44
+ "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/config_vocals_mdx23c.yaml",
45
+ "model_url": "https://github.com/ZFTurbo/Music-Source-Separation-Training/releases/download/v1.0.0/model_vocals_mdx23c_sdr_10.17.ckpt",
46
+ },
47
+ ]
48
+
49
+ karaoke_models = [
50
+ {
51
+ "name": "Mel-Roformer Karaoke by aufr33 and viperx",
52
+ "path": os.path.join(now_dir, "models", "mel-kara"),
53
+ "model": os.path.join(now_dir, "models", "mel-kara", "model.ckpt"),
54
+ "config": os.path.join(now_dir, "models", "mel-kara", "config.yaml"),
55
+ "type": "mel_band_roformer",
56
+ "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/config_mel_band_roformer_karaoke.yaml",
57
+ "model_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
58
+ },
59
+ {
60
+ "name": "UVR-BVE",
61
+ "full_name": "UVR-BVE-4B_SN-44100-1.pth",
62
+ "arch": "vr",
63
+ },
64
+ ]
65
+
66
+ denoise_models = [
67
+ {
68
+ "name": "Mel-Roformer Denoise Normal by aufr33",
69
+ "path": os.path.join(now_dir, "models", "mel-denoise"),
70
+ "model": os.path.join(now_dir, "models", "mel-denoise", "model.ckpt"),
71
+ "config": os.path.join(now_dir, "models", "mel-denoise", "config.yaml"),
72
+ "type": "mel_band_roformer",
73
+ "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml",
74
+ "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt",
75
+ },
76
+ {
77
+ "name": "Mel-Roformer Denoise Aggressive by aufr33",
78
+ "path": os.path.join(now_dir, "models", "mel-denoise-aggr"),
79
+ "model": os.path.join(now_dir, "models", "mel-denoise-aggr", "model.ckpt"),
80
+ "config": os.path.join(now_dir, "models", "mel-denoise-aggr", "config.yaml"),
81
+ "type": "mel_band_roformer",
82
+ "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml",
83
+ "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt",
84
+ },
85
+ {
86
+ "name": "UVR Denoise",
87
+ "full_name": "UVR-DeNoise.pth",
88
+ "arch": "vr",
89
+ },
90
+ ]
91
+
92
+ dereverb_models = [
93
+ {
94
+ "name": "MDX23C DeReverb by aufr33 and jarredou",
95
+ "path": os.path.join(now_dir, "models", "mdx23c-dereveb"),
96
+ "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"),
97
+ "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"),
98
+ "type": "mdx23c",
99
+ "config_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/config_dereverb_mdx23c.yaml",
100
+ "model_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/dereverb_mdx23c_sdr_6.9096.ckpt",
101
+ },
102
+ {
103
+ "name": "BS-Roformer Dereverb by anvuew",
104
+ "path": os.path.join(now_dir, "models", "mdx23c-dereveb"),
105
+ "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"),
106
+ "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"),
107
+ "type": "bs_roformer",
108
+ "config_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.yaml",
109
+ "model_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.ckpt",
110
+ },
111
+ {
112
+ "name": "UVR-Deecho-Dereverb",
113
+ "full_name": "UVR-DeEcho-DeReverb.pth",
114
+ "arch": "vr",
115
+ },
116
+ {
117
+ "name": "MDX Reverb HQ by FoxJoy",
118
+ "full_name": "Reverb_HQ_By_FoxJoy.onnx",
119
+ "arch": "mdx",
120
+ },
121
+ ]
122
+
123
+ deecho_models = [
124
+ {
125
+ "name": "UVR-Deecho-Normal",
126
+ "full_name": "UVR-De-Echo-Normal.pth",
127
+ "arch": "vr",
128
+ },
129
+ {
130
+ "name": "UVR-Deecho-Agggressive",
131
+ "full_name": "UVR-De-Echo-Aggressive.pth",
132
+ "arch": "vr",
133
+ },
134
+ ]
135
+
136
+
137
+ @lru_cache(maxsize=None)
138
+ def import_voice_converter():
139
+ from programs.applio_code.rvc.infer.infer import VoiceConverter
140
+
141
+ return VoiceConverter()
142
+
143
+
144
+ @lru_cache(maxsize=1)
145
+ def get_config():
146
+ from programs.applio_code.rvc.configs.config import Config
147
+
148
+ return Config()
149
+
150
+
151
+ def download_file(url, path, filename):
152
+ os.makedirs(path, exist_ok=True)
153
+ file_path = os.path.join(path, filename)
154
+
155
+ if os.path.exists(file_path):
156
+ print(f"File '{filename}' already exists at '{path}'.")
157
+ return
158
+
159
+ try:
160
+ response = torch.hub.download_url_to_file(url, file_path)
161
+ print(f"File '{filename}' downloaded successfully")
162
+ except Exception as e:
163
+ print(f"Error downloading file '{filename}' from '{url}': {e}")
164
+
165
+
166
+ def get_model_info_by_name(model_name):
167
+ all_models = (
168
+ models_vocals
169
+ + karaoke_models
170
+ + dereverb_models
171
+ + deecho_models
172
+ + denoise_models
173
+ )
174
+ for model in all_models:
175
+ if model["name"] == model_name:
176
+ return model
177
+ return None
178
+
179
+
180
+ def get_last_modified_file(pasta):
181
+ if not os.path.isdir(pasta):
182
+ raise NotADirectoryError(f"{pasta} is not a valid directory.")
183
+ arquivos = [f for f in os.listdir(pasta) if os.path.isfile(os.path.join(pasta, f))]
184
+ if not arquivos:
185
+ return None
186
+ return max(arquivos, key=lambda x: os.path.getmtime(os.path.join(pasta, x)))
187
+
188
+
189
+ def search_with_word(folder, word):
190
+ if not os.path.isdir(folder):
191
+ raise NotADirectoryError(f"{folder} is not a valid directory.")
192
+ file_with_word = [file for file in os.listdir(folder) if word in file]
193
+ if not file_with_word:
194
+ return None
195
+ most_recent_file = max(
196
+ file_with_word, key=lambda file: os.path.getmtime(os.path.join(folder, file))
197
+ )
198
+ return most_recent_file
199
+
200
+
201
+ def search_with_two_words(folder, word1, word2):
202
+ if not os.path.isdir(folder):
203
+ raise NotADirectoryError(f"{folder} is not a valid directory.")
204
+ file_with_words = [
205
+ file for file in os.listdir(folder) if word1 in file and word2 in file
206
+ ]
207
+ if not file_with_words:
208
+ return None
209
+ most_recent_file = max(
210
+ file_with_words, key=lambda file: os.path.getmtime(os.path.join(folder, file))
211
+ )
212
+ return most_recent_file
213
+
214
+
215
+ def get_last_modified_folder(path):
216
+ directories = [
217
+ os.path.join(path, d)
218
+ for d in os.listdir(path)
219
+ if os.path.isdir(os.path.join(path, d))
220
+ ]
221
+ if not directories:
222
+ return None
223
+ last_modified_folder = max(directories, key=os.path.getmtime)
224
+ return last_modified_folder
225
+
226
+
227
+ def add_audio_effects(
228
+ audio_path,
229
+ reverb_size,
230
+ reverb_wet,
231
+ reverb_dry,
232
+ reverb_damping,
233
+ reverb_width,
234
+ output_path,
235
+ ):
236
+ board = Pedalboard([])
237
+ board.append(
238
+ Reverb(
239
+ room_size=reverb_size,
240
+ dry_level=reverb_dry,
241
+ wet_level=reverb_wet,
242
+ damping=reverb_damping,
243
+ width=reverb_width,
244
+ )
245
+ )
246
+ with AudioFile(audio_path) as f:
247
+ with AudioFile(output_path, "w", f.samplerate, f.num_channels) as o:
248
+ while f.tell() < f.frames:
249
+ chunk = f.read(int(f.samplerate))
250
+ effected = board(chunk, f.samplerate, reset=False)
251
+ o.write(effected)
252
+ return output_path
253
+
254
+
255
+ def merge_audios(
256
+ vocals_path,
257
+ inst_path,
258
+ backing_path,
259
+ output_path,
260
+ main_gain,
261
+ inst_gain,
262
+ backing_Vol,
263
+ output_format,
264
+ ):
265
+ main_vocal_audio = AudioSegment.from_file(vocals_path, format="flac") + main_gain
266
+ instrumental_audio = AudioSegment.from_file(inst_path, format="flac") + inst_gain
267
+ backing_vocal_audio = (
268
+ AudioSegment.from_file(backing_path, format="flac") + backing_Vol
269
+ )
270
+ combined_audio = main_vocal_audio.overlay(
271
+ instrumental_audio.overlay(backing_vocal_audio)
272
+ )
273
+ combined_audio.export(output_path, format=output_format)
274
+ return output_path
275
+
276
+
277
+ def check_fp16_support(device):
278
+ i_device = int(str(device).split(":")[-1])
279
+ gpu_name = torch.cuda.get_device_name(i_device)
280
+ low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
281
+ if any(gpu in gpu_name for gpu in low_end_gpus) and "V100" not in gpu_name.upper():
282
+ print(f"Your GPU {gpu_name} not support FP16 inference. Using FP32 instead.")
283
+ return False
284
+ return True
285
+
286
+
287
+ def full_inference_program(
288
+ model_path,
289
+ index_path,
290
+ input_audio_path,
291
+ output_path,
292
+ export_format_rvc,
293
+ split_audio,
294
+ autotune,
295
+ vocal_model,
296
+ karaoke_model,
297
+ dereverb_model,
298
+ deecho,
299
+ deecho_model,
300
+ denoise,
301
+ denoise_model,
302
+ reverb,
303
+ vocals_volume,
304
+ instrumentals_volume,
305
+ backing_vocals_volume,
306
+ export_format_final,
307
+ devices,
308
+ pitch,
309
+ filter_radius,
310
+ index_rate,
311
+ rms_mix_rate,
312
+ protect,
313
+ pitch_extract,
314
+ hop_lenght,
315
+ reverb_room_size,
316
+ reverb_damping,
317
+ reverb_wet_gain,
318
+ reverb_dry_gain,
319
+ reverb_width,
320
+ embedder_model,
321
+ delete_audios,
322
+ use_tta,
323
+ batch_size,
324
+ infer_backing_vocals,
325
+ infer_backing_vocals_model,
326
+ infer_backing_vocals_index,
327
+ change_inst_pitch,
328
+ pitch_back,
329
+ filter_radius_back,
330
+ index_rate_back,
331
+ rms_mix_rate_back,
332
+ protect_back,
333
+ pitch_extract_back,
334
+ hop_length_back,
335
+ export_format_rvc_back,
336
+ split_audio_back,
337
+ autotune_back,
338
+ embedder_model_back,
339
+ ):
340
+ if torch.cuda.is_available():
341
+ n_gpu = torch.cuda.device_count()
342
+ devices = devices.replace("-", " ")
343
+ print(f"Number of GPUs available: {n_gpu}")
344
+ first_device = devices.split()[0]
345
+ fp16 = check_fp16_support(first_device)
346
+ else:
347
+ devices = "cpu"
348
+ print("Using CPU")
349
+ fp16 = False
350
+
351
+ music_folder = os.path.splitext(os.path.basename(input_audio_path))[0]
352
+
353
+ # Vocals Separation
354
+ model_info = get_model_info_by_name(vocal_model)
355
+ model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
356
+ if not os.path.exists(model_ckpt_path):
357
+ download_file(
358
+ model_info["model_url"],
359
+ model_info["path"],
360
+ "model.ckpt",
361
+ )
362
+ config_json_path = os.path.join(model_info["path"], "config.yaml")
363
+ if not os.path.exists(config_json_path):
364
+ download_file(
365
+ model_info["config_url"],
366
+ model_info["path"],
367
+ "config.yaml",
368
+ )
369
+ if not fp16:
370
+ with open(model_info["config"], "r") as file:
371
+ config = yaml.safe_load(file)
372
+
373
+ config["training"]["use_amp"] = False
374
+
375
+ with open(model_info["config"], "w") as file:
376
+ yaml.safe_dump(config, file)
377
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "vocals")
378
+ inst_dir = os.path.join(now_dir, "audio_files", music_folder, "instrumentals")
379
+ os.makedirs(store_dir, exist_ok=True)
380
+ os.makedirs(inst_dir, exist_ok=True)
381
+ input_audio_basename = os.path.splitext(os.path.basename(input_audio_path))[0]
382
+ search_result = search_with_word(store_dir, "vocals")
383
+ if search_result:
384
+ print("Vocals already separated"),
385
+ else:
386
+ print("Separating vocals")
387
+ command = [
388
+ "python",
389
+ os.path.join(now_dir, "programs", "music_separation_code", "inference.py"),
390
+ "--model_type",
391
+ model_info["type"],
392
+ "--config_path",
393
+ model_info["config"],
394
+ "--start_check_point",
395
+ model_info["model"],
396
+ "--input_file",
397
+ input_audio_path,
398
+ "--store_dir",
399
+ store_dir,
400
+ "--flac_file",
401
+ "--pcm_type",
402
+ "PCM_16",
403
+ "--extract_instrumental",
404
+ ]
405
+
406
+ if devices == "cpu":
407
+ command.append("--force_cpu")
408
+ else:
409
+ device_ids = [str(int(device)) for device in devices.split()]
410
+ command.extend(["--device_ids"] + device_ids)
411
+
412
+ subprocess.run(command)
413
+ os.rename(
414
+ os.path.join(
415
+ store_dir,
416
+ search_with_two_words(
417
+ store_dir,
418
+ os.path.basename(input_audio_path).split(".")[0],
419
+ "instrumental",
420
+ ),
421
+ ),
422
+ os.path.join(
423
+ inst_dir,
424
+ f"{os.path.basename(input_audio_path).split('.')[0]}_instrumentals.flac",
425
+ ),
426
+ )
427
+ inst_file = os.path.join(
428
+ inst_dir,
429
+ search_with_two_words(
430
+ inst_dir, os.path.basename(input_audio_path).split(".")[0], "instrumentals"
431
+ ),
432
+ )
433
+
434
+ # karaoke separation
435
+ model_info = get_model_info_by_name(karaoke_model)
436
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
437
+ os.makedirs(store_dir, exist_ok=True)
438
+ vocals_path = os.path.join(now_dir, "audio_files", music_folder, "vocals")
439
+ input_file = search_with_word(vocals_path, "vocals")
440
+ karaoke_exists = search_with_word(store_dir, "karaoke") is not None
441
+
442
+ if karaoke_exists:
443
+ print("Backing vocals already separated")
444
+ else:
445
+ if input_file:
446
+ input_file = os.path.join(vocals_path, input_file)
447
+ print("Separating Backing vocals")
448
+ if model_info["name"] == "Mel-Roformer Karaoke by aufr33 and viperx":
449
+ model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
450
+ if not os.path.exists(model_ckpt_path):
451
+ download_file(
452
+ model_info["model_url"],
453
+ model_info["path"],
454
+ "model.ckpt",
455
+ )
456
+ config_json_path = os.path.join(model_info["path"], "config.yaml")
457
+ if not os.path.exists(config_json_path):
458
+ download_file(
459
+ model_info["config_url"],
460
+ model_info["path"],
461
+ "config.yaml",
462
+ )
463
+ if not fp16:
464
+ with open(model_info["config"], "r") as file:
465
+ config = yaml.safe_load(file)
466
+
467
+ config["training"]["use_amp"] = False
468
+
469
+ with open(model_info["config"], "w") as file:
470
+ yaml.safe_dump(config, file)
471
+
472
+ command = [
473
+ "python",
474
+ os.path.join(
475
+ now_dir, "programs", "music_separation_code", "inference.py"
476
+ ),
477
+ "--model_type",
478
+ model_info["type"],
479
+ "--config_path",
480
+ model_info["config"],
481
+ "--start_check_point",
482
+ model_info["model"],
483
+ "--input_file",
484
+ input_file,
485
+ "--store_dir",
486
+ store_dir,
487
+ "--flac_file",
488
+ "--pcm_type",
489
+ "PCM_16",
490
+ "--extract_instrumental",
491
+ ]
492
+
493
+ if devices == "cpu":
494
+ command.append("--force_cpu")
495
+ else:
496
+ device_ids = [str(int(device)) for device in devices.split()]
497
+ command.extend(["--device_ids"] + device_ids)
498
+
499
+ subprocess.run(command)
500
+ else:
501
+ separator = Separator(
502
+ model_file_dir=os.path.join(now_dir, "models", "karaoke"),
503
+ log_level=logging.WARNING,
504
+ normalization_threshold=1.0,
505
+ output_format="flac",
506
+ output_dir=store_dir,
507
+ vr_params={
508
+ "batch_size": batch_size,
509
+ "enable_tta": use_tta,
510
+ },
511
+ )
512
+ separator.load_model(model_filename=model_info["full_name"])
513
+ separator.separate(input_file)
514
+ karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
515
+ vocals_result = search_with_two_words(
516
+ karaoke_path,
517
+ os.path.basename(input_audio_path).split(".")[0],
518
+ "Vocals",
519
+ )
520
+ instrumental_result = search_with_two_words(
521
+ karaoke_path,
522
+ os.path.basename(input_audio_path).split(".")[0],
523
+ "Instrumental",
524
+ )
525
+ if "UVR-BVE-4B_SN-44100-1" in os.path.basename(vocals_result):
526
+ os.rename(
527
+ os.path.join(karaoke_path, vocals_result),
528
+ os.path.join(
529
+ karaoke_path,
530
+ f"{os.path.basename(input_audio_path).split('.')[0]}_karaoke.flac",
531
+ ),
532
+ )
533
+ if "UVR-BVE-4B_SN-44100-1" in os.path.basename(instrumental_result):
534
+ os.rename(
535
+ os.path.join(karaoke_path, instrumental_result),
536
+ os.path.join(
537
+ karaoke_path,
538
+ f"{os.path.basename(input_audio_path).split('.')[0]}_instrumental.flac",
539
+ ),
540
+ )
541
+
542
+ # dereverb
543
+ model_info = get_model_info_by_name(dereverb_model)
544
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "dereverb")
545
+ os.makedirs(store_dir, exist_ok=True)
546
+ karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
547
+ input_file = search_with_word(karaoke_path, "karaoke")
548
+ noreverb_exists = search_with_word(store_dir, "noreverb") is not None
549
+ if noreverb_exists:
550
+ print("Reverb already removed")
551
+ else:
552
+ if input_file:
553
+ input_file = os.path.join(karaoke_path, input_file)
554
+ print("Removing reverb")
555
+ if (
556
+ model_info["name"] == "BS-Roformer Dereverb by anvuew"
557
+ or model_info["name"] == "MDX23C DeReverb by aufr33 and jarredou"
558
+ ):
559
+ model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
560
+ if not os.path.exists(model_ckpt_path):
561
+ download_file(
562
+ model_info["model_url"],
563
+ model_info["path"],
564
+ "model.ckpt",
565
+ )
566
+ config_json_path = os.path.join(model_info["path"], "config.yaml")
567
+ if not os.path.exists(config_json_path):
568
+ download_file(
569
+ model_info["config_url"],
570
+ model_info["path"],
571
+ "config.yaml",
572
+ )
573
+ if not fp16:
574
+ with open(model_info["config"], "r") as file:
575
+ config = yaml.safe_load(file)
576
+
577
+ config["training"]["use_amp"] = False
578
+
579
+ with open(model_info["config"], "w") as file:
580
+ yaml.safe_dump(config, file)
581
+ command = [
582
+ "python",
583
+ os.path.join(
584
+ now_dir, "programs", "music_separation_code", "inference.py"
585
+ ),
586
+ "--model_type",
587
+ model_info["type"],
588
+ "--config_path",
589
+ model_info["config"],
590
+ "--start_check_point",
591
+ model_info["model"],
592
+ "--input_file",
593
+ input_file,
594
+ "--store_dir",
595
+ store_dir,
596
+ "--flac_file",
597
+ "--pcm_type",
598
+ "PCM_16",
599
+ ]
600
+
601
+ if devices == "cpu":
602
+ command.append("--force_cpu")
603
+ else:
604
+ device_ids = [str(int(device)) for device in devices.split()]
605
+ command.extend(["--device_ids"] + device_ids)
606
+
607
+ subprocess.run(command)
608
+ else:
609
+ if model_info["arch"] == "vr":
610
+ separator = Separator(
611
+ model_file_dir=os.path.join(now_dir, "models", "dereverb"),
612
+ log_level=logging.WARNING,
613
+ normalization_threshold=1.0,
614
+ output_format="flac",
615
+ output_dir=store_dir,
616
+ output_single_stem="No Reverb",
617
+ vr_params={
618
+ "batch_size": batch_size,
619
+ "enable_tta": use_tta,
620
+ },
621
+ )
622
+ else:
623
+ separator = Separator(
624
+ model_file_dir=os.path.join(now_dir, "models", "dereverb"),
625
+ log_level=logging.WARNING,
626
+ normalization_threshold=1.0,
627
+ output_format="flac",
628
+ output_dir=store_dir,
629
+ output_single_stem="No Reverb",
630
+ )
631
+ separator.load_model(model_filename=model_info["full_name"])
632
+ separator.separate(input_file)
633
+ dereverb_path = os.path.join(
634
+ now_dir, "audio_files", music_folder, "dereverb"
635
+ )
636
+ search_result = search_with_two_words(
637
+ dereverb_path,
638
+ os.path.basename(input_audio_path).split(".")[0],
639
+ "No Reverb",
640
+ )
641
+ if "UVR-DeEcho-DeReverb" in os.path.basename(
642
+ search_result
643
+ ) or "MDX Reverb HQ by FoxJoy" in os.path.basename(search_result):
644
+ os.rename(
645
+ os.path.join(dereverb_path, search_result),
646
+ os.path.join(
647
+ dereverb_path,
648
+ f"{os.path.basename(input_audio_path).split('.')[0]}_noreverb.flac",
649
+ ),
650
+ )
651
+
652
+ # deecho
653
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "deecho")
654
+ os.makedirs(store_dir, exist_ok=True)
655
+ if deecho:
656
+ no_echo_exists = search_with_word(store_dir, "noecho") is not None
657
+ if no_echo_exists:
658
+ print("Echo already removed")
659
+ else:
660
+ print("Removing echo")
661
+ model_info = get_model_info_by_name(deecho_model)
662
+
663
+ dereverb_path = os.path.join(
664
+ now_dir, "audio_files", music_folder, "dereverb"
665
+ )
666
+ noreverb_file = search_with_word(dereverb_path, "noreverb")
667
+
668
+ input_file = os.path.join(dereverb_path, noreverb_file)
669
+
670
+ separator = Separator(
671
+ model_file_dir=os.path.join(now_dir, "models", "deecho"),
672
+ log_level=logging.WARNING,
673
+ normalization_threshold=1.0,
674
+ output_format="flac",
675
+ output_dir=store_dir,
676
+ output_single_stem="No Echo",
677
+ vr_params={
678
+ "batch_size": batch_size,
679
+ "enable_tta": use_tta,
680
+ },
681
+ )
682
+ separator.load_model(model_filename=model_info["full_name"])
683
+ separator.separate(input_file)
684
+ deecho_path = os.path.join(now_dir, "audio_files", music_folder, "deecho")
685
+ search_result = search_with_two_words(
686
+ deecho_path,
687
+ os.path.basename(input_audio_path).split(".")[0],
688
+ "No Echo",
689
+ )
690
+ if "UVR-De-Echo-Normal" in os.path.basename(
691
+ search_result
692
+ ) or "UVR-De-Echo-Aggressive" in os.path.basename(search_result):
693
+ os.rename(
694
+ os.path.join(deecho_path, search_result),
695
+ os.path.join(
696
+ deecho_path,
697
+ f"{os.path.basename(input_audio_path).split('.')[0]}_noecho.flac",
698
+ ),
699
+ )
700
+
701
+ # denoise
702
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "denoise")
703
+ os.makedirs(store_dir, exist_ok=True)
704
+ if denoise:
705
+ no_noise_exists = search_with_word(store_dir, "dry") is not None
706
+ if no_noise_exists:
707
+ print("Noise already removed")
708
+ else:
709
+ model_info = get_model_info_by_name(denoise_model)
710
+ print("Removing noise")
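+            # Denoise the most processed vocal available: the de-echoed file when deecho
+            # ran, otherwise the de-reverbed one.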
711
+ input_file = (
712
+ os.path.join(
713
+ now_dir,
714
+ "audio_files",
715
+ music_folder,
716
+ "deecho",
717
+ search_with_word(
718
+ os.path.join(now_dir, "audio_files", music_folder, "deecho"),
719
+ "noecho",
720
+ ),
721
+ )
722
+ if deecho
723
+ else os.path.join(
724
+ now_dir,
725
+ "audio_files",
726
+ music_folder,
727
+ "dereverb",
728
+ search_with_word(
729
+ os.path.join(now_dir, "audio_files", music_folder, "dereverb"),
730
+ "noreverb",
731
+ ),
732
+ )
733
+ )
734
+
735
+ if (
736
+ model_info["name"] == "Mel-Roformer Denoise Normal by aufr33"
737
+ or model_info["name"] == "Mel-Roformer Denoise Aggressive by aufr33"
738
+ ):
739
+ model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
740
+ if not os.path.exists(model_ckpt_path):
741
+ download_file(
742
+ model_info["model_url"],
743
+ model_info["path"],
744
+ "model.ckpt",
745
+ )
746
+ config_json_path = os.path.join(model_info["path"], "config.yaml")
747
+ if not os.path.exists(config_json_path):
748
+ download_file(
749
+ model_info["config_url"], model_info["path"], "config.yaml"
750
+ )
751
+ if not fp16:
752
+ with open(model_info["config"], "r") as file:
753
+ config = yaml.safe_load(file)
754
+
755
+ config["training"]["use_amp"] = False
756
+
757
+ with open(model_info["config"], "w") as file:
758
+ yaml.safe_dump(config, file)
759
+ command = [
760
+ "python",
761
+ os.path.join(
762
+ now_dir, "programs", "music_separation_code", "inference.py"
763
+ ),
764
+ "--model_type",
765
+ model_info["type"],
766
+ "--config_path",
767
+ model_info["config"],
768
+ "--start_check_point",
769
+ model_info["model"],
770
+ "--input_file",
771
+ input_file,
772
+ "--store_dir",
773
+ store_dir,
774
+ "--flac_file",
775
+ "--pcm_type",
776
+ "PCM_16",
777
+ ]
778
+
779
+ if devices == "cpu":
780
+ command.append("--force_cpu")
781
+ else:
782
+ device_ids = [str(int(device)) for device in devices.split()]
783
+ command.extend(["--device_ids"] + device_ids)
784
+
785
+ subprocess.run(command)
786
+ else:
787
+ separator = Separator(
788
+ model_file_dir=os.path.join(now_dir, "models", "denoise"),
789
+ log_level=logging.WARNING,
790
+ normalization_threshold=1.0,
791
+ output_format="flac",
792
+ output_dir=store_dir,
793
+ output_single_stem="No Noise",
794
+ vr_params={
795
+ "batch_size": batch_size,
796
+ "enable_tta": use_tta,
797
+ },
798
+ )
799
+ separator.load_model(model_filename=model_info["full_name"])
800
+ separator.separate(input_file)
801
+ search_result = search_with_two_words(
802
+ store_dir,
803
+ os.path.basename(input_audio_path).split(".")[0],
804
+ "No Noise",
805
+ )
806
+ if "UVR Denoise" in os.path.basename(search_result):
807
+ os.rename(
808
+ os.path.join(store_dir, search_result),
809
+ os.path.join(
810
+ store_dir,
811
+ f"{os.path.basename(input_audio_path).split('.')[0]}_dry.flac",
812
+ ),
813
+ )
814
+
815
+ # RVC
816
+ denoise_path = os.path.join(now_dir, "audio_files", music_folder, "denoise")
817
+ deecho_path = os.path.join(now_dir, "audio_files", music_folder, "deecho")
818
+ dereverb_path = os.path.join(now_dir, "audio_files", music_folder, "dereverb")
819
+
820
+ denoise_audio = search_with_two_words(
821
+ denoise_path, os.path.basename(input_audio_path).split(".")[0], "dry"
822
+ )
823
+ deecho_audio = search_with_two_words(
824
+ deecho_path, os.path.basename(input_audio_path).split(".")[0], "noecho"
825
+ )
826
+ dereverb = search_with_two_words(
827
+ dereverb_path, os.path.basename(input_audio_path).split(".")[0], "noreverb"
828
+ )
829
+
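+    # Pick the cleanest vocal stem for RVC inference, in order of processing depth:
+    # denoised > de-echoed > de-reverbed.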
830
+ if denoise_audio:
831
+ final_path = os.path.join(
832
+ now_dir, "audio_files", music_folder, "denoise", denoise_audio
833
+ )
834
+ elif deecho_audio:
835
+ final_path = os.path.join(
836
+ now_dir, "audio_files", music_folder, "deecho", deecho_audio
837
+ )
838
+ elif dereverb:
839
+ final_path = os.path.join(
840
+ now_dir, "audio_files", music_folder, "dereverb", dereverb
841
+ )
842
+ else:
843
+ final_path = None
844
+
845
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "rvc")
846
+ os.makedirs(store_dir, exist_ok=True)
847
+ print("Running RVC inference")
848
+ output_rvc = os.path.join(
849
+ now_dir,
850
+ "audio_files",
851
+ music_folder,
852
+ "rvc",
853
+ f"{os.path.basename(input_audio_path).split('.')[0]}_rvc.wav",
854
+ )
855
+ inference_vc = import_voice_converter()
856
+ inference_vc.convert_audio(
857
+ audio_input_path=final_path,
858
+ audio_output_path=output_rvc,
859
+ model_path=model_path,
860
+ index_path=index_path,
861
+ embedder_model=embedder_model,
862
+ pitch=pitch,
863
+ f0_file=None,
864
+ f0_method=pitch_extract,
865
+ filter_radius=filter_radius,
866
+ index_rate=index_rate,
867
+ volume_envelope=rms_mix_rate,
868
+ protect=protect,
869
+ split_audio=split_audio,
870
+ f0_autotune=autotune,
871
+ hop_length=hop_lenght,
872
+ export_format=export_format_rvc,
873
+ embedder_model_custom=None,
874
+ )
875
+ backing_vocals = os.path.join(
876
+ karaoke_path, search_with_word(karaoke_path, "instrumental")
877
+ )
878
+
879
+ if infer_backing_vocals:
880
+ print("Inferring backing vocals")
881
+ karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
882
+ instrumental_file = search_with_word(karaoke_path, "instrumental")
883
+ backing_vocals = os.path.join(karaoke_path, instrumental_file)
884
+ output_backing_vocals = os.path.join(
885
+ karaoke_path, f"{input_audio_basename}_instrumental_output.wav"
886
+ )
887
+ inference_vc.convert_audio(
888
+ audio_input_path=backing_vocals,
889
+ audio_output_path=output_backing_vocals,
890
+ model_path=infer_backing_vocals_model,
891
+ index_path=infer_backing_vocals_index,
892
+ embedder_model=embedder_model_back,
893
+ pitch=pitch_back,
894
+ f0_file=None,
895
+ f0_method=pitch_extract_back,
896
+ filter_radius=filter_radius_back,
897
+ index_rate=index_rate_back,
898
+ volume_envelope=rms_mix_rate_back,
899
+ protect=protect_back,
900
+ split_audio=split_audio_back,
901
+ f0_autotune=autotune_back,
902
+ hop_length=hop_length_back,
903
+ export_format=export_format_rvc_back,
904
+ embedder_model_custom=None,
905
+ )
906
+ backing_vocals = output_backing_vocals
907
+
908
+ # post process
909
+ if reverb:
910
+ add_audio_effects(
911
+ os.path.join(
912
+ now_dir,
913
+ "audio_files",
914
+ music_folder,
915
+ "rvc",
916
+ get_last_modified_file(
917
+ os.path.join(now_dir, "audio_files", music_folder, "rvc")
918
+ ),
919
+ ),
920
+ reverb_room_size,
921
+ reverb_wet_gain,
922
+ reverb_dry_gain,
923
+ reverb_damping,
924
+ reverb_width,
925
+ os.path.join(
926
+ now_dir,
927
+ "audio_files",
928
+ music_folder,
929
+ "rvc",
930
+ os.path.basename(input_audio_path),
931
+ ),
932
+ )
933
+ if change_inst_pitch != 0:
934
+ print("Changing instrumental pitch")
935
+ inst_path = os.path.join(
936
+ now_dir,
937
+ "audio_files",
938
+ music_folder,
939
+ "instrumentals",
940
+ search_with_word(
941
+ os.path.join(now_dir, "audio_files", music_folder, "instrumentals"),
942
+ "instrumentals",
943
+ ),
944
+ )
945
+ audio = AudioSegment.from_file(inst_path)
+ original_frame_rate = audio.frame_rate
946
+
947
+ factor = 2 ** (change_inst_pitch / 12)
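+        # Equal-temperament semitone ratio: each semitone scales frequency by 2^(1/12),
+        # e.g. change_inst_pitch = 2 gives a factor of about 1.122.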
948
+
949
+ new_frame_rate = int(audio.frame_rate * factor)
950
+ audio = audio._spawn(audio.raw_data, overrides={"frame_rate": new_frame_rate})
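+        # Re-declaring the frame rate plays the same samples faster or slower, shifting
+        # pitch and tempo together (a simple resample-based pitch shift).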
951
+
952
+ audio = audio.set_frame_rate(original_frame_rate)
953
+ output_dir_pitch = os.path.join(
954
+ now_dir, "audio_files", music_folder, "instrumentals"
955
+ )
956
+ output_path_pitch = os.path.join(
957
+ output_dir_pitch, "inst_with_changed_pitch.flac"
958
+ )
959
+ audio.export(output_path_pitch, format="flac")
960
+
961
+ # merge audios
962
+ store_dir = os.path.join(now_dir, "audio_files", music_folder, "final")
963
+ os.makedirs(store_dir, exist_ok=True)
964
+
965
+ vocals_path = os.path.join(now_dir, "audio_files", music_folder, "rvc")
966
+ vocals_file = get_last_modified_file(
967
+ os.path.join(now_dir, "audio_files", music_folder, "rvc")
968
+ )
969
+ vocals_file = os.path.join(vocals_path, vocals_file)
970
+
971
+ karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
972
+ karaoke_file = search_with_word(karaoke_path, "Instrumental") or search_with_word(
973
+ karaoke_path, "instrumental"
974
+ )
975
+ karaoke_file = os.path.join(karaoke_path, karaoke_file)
976
+ final_output_path = os.path.join(
977
+ now_dir,
978
+ "audio_files",
979
+ music_folder,
980
+ "final",
981
+ f"{os.path.basename(input_audio_path).split('.')[0]}_final.{export_format_final.lower()}",
982
+ )
983
+ print("Merging audios")
984
+ result = merge_audios(
985
+ vocals_file,
986
+ inst_file,
987
+ backing_vocals,
988
+ final_output_path,
989
+ vocals_volume,
990
+ instrumentals_volume,
991
+ backing_vocals_volume,
992
+ export_format_final,
993
+ )
994
+ print("Audios merged!")
995
+ if delete_audios:
996
+ main_directory = os.path.join(now_dir, "audio_files", music_folder)
997
+ folder_to_keep = "final"
998
+ for folder_name in os.listdir(main_directory):
999
+ folder_path = os.path.join(main_directory, folder_name)
1000
+ if os.path.isdir(folder_path) and folder_name != folder_to_keep:
1001
+ shutil.rmtree(folder_path)
1002
+ return (
1003
+ f"Audio file {os.path.basename(input_audio_path).split('.')[0]} converted successfully",
1004
+ result,
1005
+ )
1006
+
1007
+
1008
+ def download_model(link):
1009
+ model_download_pipeline(link)
1010
+ return "Model downloaded successfully"
1011
+
1012
+
1013
+ def download_music(link):
1014
+ os.makedirs(os.path.join(now_dir, "audio_files", "original_files"), exist_ok=True)
1015
+ command = [
1016
+ "yt-dlp",
1017
+ "-x",
1018
+ "--output",
1019
+ os.path.join(now_dir, "audio_files", "original_files", "%(title)s.%(ext)s"),
1020
+ link,
1021
+ ]
1022
+ subprocess.run(command)
1023
+ return "Music downloaded successfully"
logs/.gitkeep ADDED
File without changes
main.py ADDED
@@ -0,0 +1,53 @@
1
+ import gradio as gr
2
+ import sys, os
3
+ from tabs.full_inference import full_inference_tab
4
+ from tabs.download_model import download_model_tab
5
+
6
+ now_dir = os.getcwd()
7
+ sys.path.append(now_dir)
8
+ DEFAULT_PORT = 7755
9
+ MAX_PORT_ATTEMPTS = 10
10
+
11
+ from assets.i18n.i18n import I18nAuto
12
+
13
+ i18n = I18nAuto()
14
+
15
+
16
+ with gr.Blocks(title="hexGen-RVC", css="footer{display:none !important}") as app:
17
+ gr.Markdown("# hexGen RVC")
18
+ with gr.Tab(i18n("Full Inference")):
19
+ full_inference_tab()
20
+ with gr.Tab(i18n("Download Model")):
21
+ download_model_tab()
22
+
23
+
24
+ def launch(port):
25
+ app.launch(
26
+ share="--share" in sys.argv,
27
+ inbrowser="--open" in sys.argv,
28
+ server_port=port,
29
+ )
30
+
31
+
32
+ def get_port_from_args():
33
+ if "--port" in sys.argv:
34
+ port_index = sys.argv.index("--port") + 1
35
+ if port_index < len(sys.argv):
36
+ return int(sys.argv[port_index])
37
+ return DEFAULT_PORT
38
+
39
+
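+ # Typical invocations (all flags optional):
+ #   python main.py --port 7755
+ #   python main.py --open --share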
40
+ if __name__ == "__main__":
41
+ port = get_port_from_args()
42
+ for _ in range(MAX_PORT_ATTEMPTS):
43
+ try:
44
+ launch(port)
45
+ break
46
+ except OSError:
47
+ print(
48
+ f"Failed to launch on port {port}, trying again on port {port - 1}..."
49
+ )
50
+ port -= 1
51
+ except Exception as error:
52
+ print(f"An error occurred launching Gradio: {error}")
53
+ break
programs/applio_code/rvc/configs/config.py ADDED
@@ -0,0 +1,192 @@
1
+ import torch
2
+ import json
3
+ import os
4
+
5
+
6
+ version_config_paths = [
7
+ os.path.join("v1", "32000.json"),
8
+ os.path.join("v1", "40000.json"),
9
+ os.path.join("v1", "48000.json"),
10
+ os.path.join("v2", "48000.json"),
11
+ os.path.join("v2", "40000.json"),
12
+ os.path.join("v2", "32000.json"),
13
+ ]
14
+
15
+
16
+ def singleton(cls):
17
+ instances = {}
18
+
19
+ def get_instance(*args, **kwargs):
20
+ if cls not in instances:
21
+ instances[cls] = cls(*args, **kwargs)
22
+ return instances[cls]
23
+
24
+ return get_instance
25
+
26
+
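+ # Config below is wrapped with @singleton, so every Config() call returns the same
+ # shared instance (one device/precision setup per process).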
27
+ @singleton
28
+ class Config:
29
+ def __init__(self):
30
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
31
+ self.is_half = self.device != "cpu"
32
+ self.gpu_name = (
33
+ torch.cuda.get_device_name(int(self.device.split(":")[-1]))
34
+ if self.device.startswith("cuda")
35
+ else None
36
+ )
37
+ self.json_config = self.load_config_json()
38
+ self.gpu_mem = None
39
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
40
+
41
+ def load_config_json(self) -> dict:
42
+ configs = {}
43
+ for config_file in version_config_paths:
44
+ config_path = os.path.join(
45
+ "programs", "applio_code", "rvc", "configs", config_file
46
+ )
47
+ with open(config_path, "r") as f:
48
+ configs[config_file] = json.load(f)
49
+ return configs
50
+
51
+ def has_mps(self) -> bool:
52
+ # Check if Metal Performance Shaders are available - for macOS 12.3+.
53
+ return torch.backends.mps.is_available()
54
+
55
+ def has_xpu(self) -> bool:
56
+ # Check if XPU is available.
57
+ return hasattr(torch, "xpu") and torch.xpu.is_available()
58
+
59
+ def set_precision(self, precision):
60
+ if precision not in ["fp32", "fp16"]:
61
+ raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.")
62
+
63
+ fp16_run_value = precision == "fp16"
64
+ preprocess_target_version = "3.7" if precision == "fp16" else "3.0"
65
+ preprocess_path = os.path.join(
66
+ os.path.dirname(__file__),
67
+ os.pardir,
68
+ "rvc",
69
+ "train",
70
+ "preprocess",
71
+ "preprocess.py",
72
+ )
73
+
74
+ for config_path in version_config_paths:
75
+ full_config_path = os.path.join(
76
+ "programs", "applio_code", "rvc", "configs", config_path
77
+ )
78
+ try:
79
+ with open(full_config_path, "r") as f:
80
+ config = json.load(f)
81
+ config["train"]["fp16_run"] = fp16_run_value
82
+ with open(full_config_path, "w") as f:
83
+ json.dump(config, f, indent=4)
84
+ except FileNotFoundError:
85
+ print(f"File not found: {full_config_path}")
86
+
87
+ if os.path.exists(preprocess_path):
88
+ with open(preprocess_path, "r") as f:
89
+ preprocess_content = f.read()
90
+ preprocess_content = preprocess_content.replace(
91
+ "3.0" if precision == "fp16" else "3.7", preprocess_target_version
92
+ )
93
+ with open(preprocess_path, "w") as f:
94
+ f.write(preprocess_content)
95
+
96
+ return f"Overwritten preprocess and config.json to use {precision}."
97
+
98
+ def get_precision(self):
99
+ if not version_config_paths:
100
+ raise FileNotFoundError("No configuration paths provided.")
101
+
102
+ full_config_path = os.path.join(
103
+ "programs", "applio_code", "rvc", "configs", version_config_paths[0]
104
+ )
105
+ try:
106
+ with open(full_config_path, "r") as f:
107
+ config = json.load(f)
108
+ fp16_run_value = config["train"].get("fp16_run", False)
109
+ precision = "fp16" if fp16_run_value else "fp32"
110
+ return precision
111
+ except FileNotFoundError:
112
+ print(f"File not found: {full_config_path}")
113
+ return None
114
+
115
+ def device_config(self) -> tuple:
116
+ if self.device.startswith("cuda"):
117
+ self.set_cuda_config()
118
+ elif self.has_mps():
119
+ self.device = "mps"
120
+ self.is_half = False
121
+ self.set_precision("fp32")
122
+ else:
123
+ self.device = "cpu"
124
+ self.is_half = False
125
+ self.set_precision("fp32")
126
+
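+        # x_pad / x_query / x_center / x_max are window sizes in seconds used by the
+        # inference pipeline for padding and chunking long audio (see Pipeline.__init__).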
127
+ # Configuration for 6GB GPU memory
128
+ x_pad, x_query, x_center, x_max = (
129
+ (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
130
+ )
131
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
132
+ # Configuration for 5GB GPU memory
133
+ x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
134
+
135
+ return x_pad, x_query, x_center, x_max
136
+
137
+ def set_cuda_config(self):
138
+ i_device = int(self.device.split(":")[-1])
139
+ self.gpu_name = torch.cuda.get_device_name(i_device)
140
+ # Zluda
141
+ if self.gpu_name.endswith("[ZLUDA]"):
142
+ print("Zluda compatibility enabled, experimental feature.")
143
+ torch.backends.cudnn.enabled = False
144
+ torch.backends.cuda.enable_flash_sdp(False)
145
+ torch.backends.cuda.enable_math_sdp(True)
146
+ torch.backends.cuda.enable_mem_efficient_sdp(False)
147
+ low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
148
+ if (
149
+ any(gpu in self.gpu_name for gpu in low_end_gpus)
150
+ and "V100" not in self.gpu_name.upper()
151
+ ):
152
+ self.is_half = False
153
+ self.set_precision("fp32")
154
+
155
+ self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
156
+ 1024**3
157
+ )
158
+
159
+
160
+ def max_vram_gpu(gpu):
161
+ if torch.cuda.is_available():
162
+ gpu_properties = torch.cuda.get_device_properties(gpu)
163
+ total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
164
+ return total_memory_gb
165
+ else:
166
+ return "0"
167
+
168
+
169
+ def get_gpu_info():
170
+ ngpu = torch.cuda.device_count()
171
+ gpu_infos = []
172
+ if torch.cuda.is_available() or ngpu != 0:
173
+ for i in range(ngpu):
174
+ gpu_name = torch.cuda.get_device_name(i)
175
+ mem = int(
176
+ torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
177
+ + 0.4
178
+ )
179
+ gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
180
+ if len(gpu_infos) > 0:
181
+ gpu_info = "\n".join(gpu_infos)
182
+ else:
183
+ gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
184
+ return gpu_info
185
+
186
+
187
+ def get_number_of_gpus():
188
+ if torch.cuda.is_available():
189
+ num_gpus = torch.cuda.device_count()
190
+ return "-".join(map(str, range(num_gpus)))
191
+ else:
192
+ return "-"
programs/applio_code/rvc/configs/v1/32000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,4,2,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
programs/applio_code/rvc/configs/v1/40000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,10,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
programs/applio_code/rvc/configs/v1/48000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,6,2,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
programs/applio_code/rvc/configs/v2/32000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 12800,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 32000,
17
+ "filter_length": 1024,
18
+ "hop_length": 320,
19
+ "win_length": 1024,
20
+ "n_mel_channels": 80,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [10,8,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [20,16,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
programs/applio_code/rvc/configs/v2/40000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 12800,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 40000,
17
+ "filter_length": 2048,
18
+ "hop_length": 400,
19
+ "win_length": 2048,
20
+ "n_mel_channels": 125,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [10,10,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [16,16,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
programs/applio_code/rvc/configs/v2/48000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 17280,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 48000,
17
+ "filter_length": 2048,
18
+ "hop_length": 480,
19
+ "win_length": 2048,
20
+ "n_mel_channels": 128,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [12,10,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [24,20,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
programs/applio_code/rvc/infer/infer.py ADDED
@@ -0,0 +1,470 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import torch
5
+ import librosa
6
+ import logging
7
+ import traceback
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
+ from scipy.io import wavfile
12
+
13
+ now_dir = os.getcwd()
14
+ sys.path.append(now_dir)
15
+
16
+ from programs.applio_code.rvc.infer.pipeline import Pipeline as VC
17
+ from programs.applio_code.rvc.lib.utils import load_audio_infer, load_embedding
18
+ from programs.applio_code.rvc.lib.tools.split_audio import process_audio, merge_audio
19
+ from programs.applio_code.rvc.lib.algorithm.synthesizers import Synthesizer
20
+ from programs.applio_code.rvc.configs.config import Config
21
+
22
+ logging.getLogger("httpx").setLevel(logging.WARNING)
23
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
24
+ logging.getLogger("faiss").setLevel(logging.WARNING)
25
+ logging.getLogger("faiss.loader").setLevel(logging.WARNING)
26
+
27
+
28
+ class VoiceConverter:
29
+ """
30
+ A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
31
+ """
32
+
33
+ def __init__(self):
34
+ """
35
+ Initializes the VoiceConverter with default configuration, and sets up models and parameters.
36
+ """
37
+ self.config = Config() # Load RVC configuration
38
+ self.hubert_model = (
39
+ None # Initialize the Hubert model (for embedding extraction)
40
+ )
41
+ self.last_embedder_model = None # Last used embedder model
42
+ self.tgt_sr = None # Target sampling rate for the output audio
43
+ self.net_g = None # Generator network for voice conversion
44
+ self.vc = None # Voice conversion pipeline instance
45
+ self.cpt = None # Checkpoint for loading model weights
46
+ self.version = None # Model version
47
+ self.n_spk = None # Number of speakers in the model
48
+ self.use_f0 = None # Whether the model uses F0
49
+
50
+ def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
51
+ """
52
+ Loads the HuBERT model for speaker embedding extraction.
53
+ """
54
+ self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
55
+ self.hubert_model.to(self.config.device)
56
+ self.hubert_model = (
57
+ self.hubert_model.half()
58
+ if self.config.is_half
59
+ else self.hubert_model.float()
60
+ )
61
+ self.hubert_model.eval()
62
+
63
+ @staticmethod
64
+ def convert_audio_format(input_path, output_path, output_format):
65
+ """
66
+ Converts an audio file to a specified output format.
67
+ """
68
+ try:
69
+ if output_format != "WAV":
70
+ print(f"Converting audio to {output_format} format...")
71
+ audio, sample_rate = librosa.load(input_path, sr=None)
72
+ common_sample_rates = [
73
+ 8000,
74
+ 11025,
75
+ 12000,
76
+ 16000,
77
+ 22050,
78
+ 24000,
79
+ 32000,
80
+ 44100,
81
+ 48000,
82
+ ]
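+                # Resample to the nearest common sample rate so the encoder for the
+                # requested output format receives a rate it supports.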
83
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
84
+ audio = librosa.resample(
85
+ audio, orig_sr=sample_rate, target_sr=target_sr
86
+ )
87
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
88
+ return output_path
89
+ except Exception as error:
90
+ print(f"An error occurred converting the audio format: {error}")
91
+
92
+ def convert_audio(
93
+ self,
94
+ audio_input_path: str,
95
+ audio_output_path: str,
96
+ model_path: str,
97
+ index_path: str,
98
+ embedder_model: str,
99
+ pitch: int,
100
+ f0_file: str,
101
+ f0_method: str,
102
+ index_rate: float,
103
+ volume_envelope: int,
104
+ protect: float,
105
+ hop_length: int,
106
+ split_audio: bool,
107
+ f0_autotune: bool,
108
+ filter_radius: int,
109
+ embedder_model_custom: str,
110
+ export_format: str,
111
+ resample_sr: int = 0,
112
+ sid: int = 0,
113
+ ):
114
+ """
115
+ Performs voice conversion on the input audio.
116
+ """
117
+ self.get_vc(model_path, sid)
118
+
119
+ try:
120
+ start_time = time.time()
121
+ print(f"Converting audio '{audio_input_path}'...")
122
+ audio = load_audio_infer(
123
+ audio_input_path,
124
+ 16000,
125
+ )
126
+ audio_max = np.abs(audio).max() / 0.95
127
+
128
+ if audio_max > 1:
129
+ audio /= audio_max
130
+
131
+ if not self.hubert_model or embedder_model != self.last_embedder_model:
132
+ self.load_hubert(embedder_model, embedder_model_custom)
133
+ self.last_embedder_model = embedder_model
134
+
135
+ file_index = (
136
+ index_path.strip()
137
+ .strip('"')
138
+ .strip("\n")
139
+ .strip('"')
140
+ .strip()
141
+ .replace("trained", "added")
142
+ )
143
+
144
+ if self.tgt_sr != resample_sr >= 16000:
145
+ self.tgt_sr = resample_sr
146
+
147
+ if split_audio:
148
+ result, new_dir_path = process_audio(audio_input_path)
149
+ if result == "Error":
150
+ return "Error with Split Audio", None
151
+
152
+ dir_path = (
153
+ new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
154
+ )
155
+ if dir_path:
156
+ paths = [
157
+ os.path.join(root, name)
158
+ for root, _, files in os.walk(dir_path, topdown=False)
159
+ for name in files
160
+ if name.endswith(".wav") and root == dir_path
161
+ ]
162
+ try:
163
+ for path in paths:
164
+ self.convert_audio(
165
+ audio_input_path=path,
166
+ audio_output_path=path,
167
+ model_path=model_path,
168
+ index_path=index_path,
169
+ sid=sid,
170
+ pitch=pitch,
171
+ f0_file=None,
172
+ f0_method=f0_method,
173
+ index_rate=index_rate,
174
+ resample_sr=resample_sr,
175
+ volume_envelope=volume_envelope,
176
+ protect=protect,
177
+ hop_length=hop_length,
178
+ split_audio=False,
179
+ f0_autotune=f0_autotune,
180
+ filter_radius=filter_radius,
181
+ export_format=export_format,
182
+ embedder_model=embedder_model,
183
+ embedder_model_custom=embedder_model_custom,
184
+ )
185
+ except Exception as error:
186
+ print(f"An error occurred processing the segmented audio: {error}")
187
+ print(traceback.format_exc())
188
+ return f"Error {error}"
189
+ print("Finished processing segmented audio, now merging audio...")
190
+ merge_timestamps_file = os.path.join(
191
+ os.path.dirname(new_dir_path),
192
+ f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
193
+ )
194
+ self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
195
+ os.remove(merge_timestamps_file)
196
+ sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
197
+ else:
198
+ audio_opt = self.vc.pipeline(
199
+ model=self.hubert_model,
200
+ net_g=self.net_g,
201
+ sid=sid,
202
+ audio=audio,
203
+ input_audio_path=audio_input_path,
204
+ pitch=pitch,
205
+ f0_method=f0_method,
206
+ file_index=file_index,
207
+ index_rate=index_rate,
208
+ pitch_guidance=self.use_f0,
209
+ filter_radius=filter_radius,
210
+ tgt_sr=self.tgt_sr,
211
+ resample_sr=resample_sr,
212
+ volume_envelope=volume_envelope,
213
+ version=self.version,
214
+ protect=protect,
215
+ hop_length=hop_length,
216
+ f0_autotune=f0_autotune,
217
+ f0_file=f0_file,
218
+ )
219
+
220
+ if audio_output_path:
221
+ sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
222
+ output_path_format = audio_output_path.replace(
223
+ ".wav", f".{export_format.lower()}"
224
+ )
225
+ audio_output_path = self.convert_audio_format(
226
+ audio_output_path, output_path_format, export_format
227
+ )
228
+
229
+ elapsed_time = time.time() - start_time
230
+ print(
231
+ f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
232
+ )
233
+
234
+ except Exception as error:
235
+ print(f"An error occurred during audio conversion: {error}")
236
+ print(traceback.format_exc())
237
+
238
+ def convert_audio_batch(
239
+ self,
240
+ audio_input_paths: str,
241
+ audio_output_path: str,
242
+ model_path: str,
243
+ index_path: str,
244
+ embedder_model: str,
245
+ pitch: int,
246
+ f0_file: str,
247
+ f0_method: str,
248
+ index_rate: float,
249
+ volume_envelope: int,
250
+ protect: float,
251
+ hop_length: int,
252
+ split_audio: bool,
253
+ f0_autotune: bool,
254
+ filter_radius: int,
255
+ embedder_model_custom: str,
256
+ export_format: str,
257
+ resample_sr: int = 0,
258
+ sid: int = 0,
259
+ pid_file_path: str = None,
260
+ ):
261
+ """
262
+ Performs voice conversion on a batch of input audio files.
263
+ """
264
+ pid = os.getpid()
265
+ with open(pid_file_path, "w") as pid_file:
266
+ pid_file.write(str(pid))
267
+ try:
268
+ if not self.hubert_model or embedder_model != self.last_embedder_model:
269
+ self.load_hubert(embedder_model, embedder_model_custom)
270
+ self.last_embedder_model = embedder_model
271
+ self.get_vc(model_path, sid)
272
+ file_index = (
273
+ index_path.strip()
274
+ .strip('"')
275
+ .strip("\n")
276
+ .strip('"')
277
+ .strip()
278
+ .replace("trained", "added")
279
+ )
280
+ start_time = time.time()
281
+ print(f"Converting audio batch '{audio_input_paths}'...")
282
+ audio_files = [
283
+ f
284
+ for f in os.listdir(audio_input_paths)
285
+ if f.endswith((".mp3", ".wav", ".flac", ".m4a", ".ogg", ".opus"))
286
+ ]
287
+ print(f"Detected {len(audio_files)} audio files for inference.")
288
+ for i, audio_input_path in enumerate(audio_files):
289
+ audio_output_paths = os.path.join(
290
+ audio_output_path,
291
+ f"{os.path.splitext(os.path.basename(audio_input_path))[0]}_output.{export_format.lower()}",
292
+ )
293
+ if os.path.exists(audio_output_paths):
294
+ continue
295
+ print(f"Converting audio '{audio_input_path}'...")
296
+ audio_input_path = os.path.join(audio_input_paths, audio_input_path)
297
+
298
+ audio = load_audio_infer(
299
+ audio_input_path,
300
+ 16000,
301
+ )
302
+ audio_max = np.abs(audio).max() / 0.95
303
+
304
+ if audio_max > 1:
305
+ audio /= audio_max
306
+
307
+ if self.tgt_sr != resample_sr >= 16000:
308
+ self.tgt_sr = resample_sr
309
+
310
+ if split_audio:
311
+ result, new_dir_path = process_audio(audio_input_path)
312
+ if result == "Error":
313
+ return "Error with Split Audio", None
314
+
315
+ dir_path = (
316
+ new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
317
+ )
318
+ if dir_path:
319
+ paths = [
320
+ os.path.join(root, name)
321
+ for root, _, files in os.walk(dir_path, topdown=False)
322
+ for name in files
323
+ if name.endswith(".wav") and root == dir_path
324
+ ]
325
+ try:
326
+ for path in paths:
327
+ self.convert_audio(
328
+ audio_input_path=path,
329
+ audio_output_path=path,
330
+ model_path=model_path,
331
+ index_path=index_path,
332
+ sid=sid,
333
+ pitch=pitch,
334
+ f0_file=None,
335
+ f0_method=f0_method,
336
+ index_rate=index_rate,
337
+ resample_sr=resample_sr,
338
+ volume_envelope=volume_envelope,
339
+ protect=protect,
340
+ hop_length=hop_length,
341
+ split_audio=False,
342
+ f0_autotune=f0_autotune,
343
+ filter_radius=filter_radius,
344
+ export_format=export_format,
345
+ embedder_model=embedder_model,
346
+ embedder_model_custom=embedder_model_custom,
347
+ )
348
+ except Exception as error:
349
+ print(
350
+ f"An error occurred processing the segmented audio: {error}"
351
+ )
352
+ print(traceback.format_exc())
353
+ return f"Error {error}"
354
+ print("Finished processing segmented audio, now merging audio...")
355
+ merge_timestamps_file = os.path.join(
356
+ os.path.dirname(new_dir_path),
357
+ f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
358
+ )
359
+ self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
360
+ os.remove(merge_timestamps_file)
361
+ else:
362
+ audio_opt = self.vc.pipeline(
363
+ model=self.hubert_model,
364
+ net_g=self.net_g,
365
+ sid=sid,
366
+ audio=audio,
367
+ input_audio_path=audio_input_path,
368
+ pitch=pitch,
369
+ f0_method=f0_method,
370
+ file_index=file_index,
371
+ index_rate=index_rate,
372
+ pitch_guidance=self.use_f0,
373
+ filter_radius=filter_radius,
374
+ tgt_sr=self.tgt_sr,
375
+ resample_sr=resample_sr,
376
+ volume_envelope=volume_envelope,
377
+ version=self.version,
378
+ protect=protect,
379
+ hop_length=hop_length,
380
+ f0_autotune=f0_autotune,
381
+ f0_file=f0_file,
382
+ )
383
+
384
+ if audio_output_paths:
385
+ sf.write(audio_output_paths, audio_opt, self.tgt_sr, format="WAV")
386
+ output_path_format = audio_output_paths.replace(
387
+ ".wav", f".{export_format.lower()}"
388
+ )
389
+ audio_output_paths = self.convert_audio_format(
390
+ audio_output_paths, output_path_format, export_format
391
+ )
392
+ print(f"Conversion completed at '{audio_output_paths}'.")
393
+ elapsed_time = time.time() - start_time
394
+ print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
395
+ os.remove(pid_file_path)
396
+ except Exception as error:
397
+ print(f"An error occurred during audio conversion: {error}")
398
+ print(traceback.format_exc())
399
+
400
+ def get_vc(self, weight_root, sid):
401
+ """
402
+ Loads the voice conversion model and sets up the pipeline.
403
+ """
404
+ if sid == "" or sid == []:
405
+ self.cleanup_model()
406
+ if torch.cuda.is_available():
407
+ torch.cuda.empty_cache()
408
+
409
+ self.load_model(weight_root)
410
+
411
+ if self.cpt is not None:
412
+ self.setup_network()
413
+ self.setup_vc_instance()
414
+
415
+ def cleanup_model(self):
416
+ """
417
+ Cleans up the model and releases resources.
418
+ """
419
+ if self.hubert_model is not None:
420
+ del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
421
+ self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
422
+ if torch.cuda.is_available():
423
+ torch.cuda.empty_cache()
424
+
425
+ del self.net_g, self.cpt
426
+ if torch.cuda.is_available():
427
+ torch.cuda.empty_cache()
428
+ self.cpt = None
429
+
430
+ def load_model(self, weight_root):
431
+ """
432
+ Loads the model weights from the specified path.
433
+ """
434
+ self.cpt = (
435
+ torch.load(weight_root, map_location="cpu")
436
+ if os.path.isfile(weight_root)
437
+ else None
438
+ )
439
+
440
+ def setup_network(self):
441
+ """
442
+ Sets up the network configuration based on the loaded checkpoint.
443
+ """
444
+ if self.cpt is not None:
445
+ self.tgt_sr = self.cpt["config"][-1]
446
+ self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
447
+ self.use_f0 = self.cpt.get("f0", 1)
448
+
449
+ self.version = self.cpt.get("version", "v1")
450
+ self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
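+            # v1 checkpoints expect the 256-dim projected HuBERT features (final_proj),
+            # while v2 consumes the raw 768-dim hidden states.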
451
+ self.net_g = Synthesizer(
452
+ *self.cpt["config"],
453
+ use_f0=self.use_f0,
454
+ text_enc_hidden_dim=self.text_enc_hidden_dim,
455
+ is_half=self.config.is_half,
456
+ )
457
+ del self.net_g.enc_q
458
+ self.net_g.load_state_dict(self.cpt["weight"], strict=False)
459
+ self.net_g.eval().to(self.config.device)
460
+ self.net_g = (
461
+ self.net_g.half() if self.config.is_half else self.net_g.float()
462
+ )
463
+
464
+ def setup_vc_instance(self):
465
+ """
466
+ Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
467
+ """
468
+ if self.cpt is not None:
469
+ self.vc = VC(self.tgt_sr, self.config)
470
+ self.n_spk = self.cpt["config"][-3]
programs/applio_code/rvc/infer/pipeline.py ADDED
@@ -0,0 +1,701 @@
1
+ import os
2
+ import gc
3
+ import re
4
+ import sys
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchcrepe
8
+ import faiss
9
+ import librosa
10
+ import numpy as np
11
+ from scipy import signal
12
+ from torch import Tensor
13
+
14
+ now_dir = os.getcwd()
15
+ sys.path.append(now_dir)
16
+
17
+ from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor
18
+ from programs.applio_code.rvc.lib.predictors.FCPE import FCPEF0Predictor
19
+
20
+ import logging
21
+
22
+ logging.getLogger("faiss").setLevel(logging.WARNING)
23
+
24
+ # Constants for high-pass filter
25
+ FILTER_ORDER = 5
26
+ CUTOFF_FREQUENCY = 48 # Hz
27
+ SAMPLE_RATE = 16000 # Hz
28
+ bh, ah = signal.butter(
29
+ N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
30
+ )
31
+
32
+ input_audio_path2wav = {}
33
+
34
+
35
+ class AudioProcessor:
36
+ """
37
+ A class for processing audio signals, specifically for adjusting RMS levels.
38
+ """
39
+
40
+ @staticmethod
+ def change_rms(
41
+ source_audio: np.ndarray,
42
+ source_rate: int,
43
+ target_audio: np.ndarray,
44
+ target_rate: int,
45
+ rate: float,
46
+ ) -> np.ndarray:
47
+ """
48
+ Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.
49
+
50
+ Args:
51
+ source_audio: The source audio signal as a NumPy array.
52
+ source_rate: The sampling rate of the source audio.
53
+ target_audio: The target audio signal to adjust.
54
+ target_rate: The sampling rate of the target audio.
55
+ rate: The blending rate between the source and target RMS levels.
56
+ """
57
+ # Calculate RMS of both audio data
58
+ rms1 = librosa.feature.rms(
59
+ y=source_audio,
60
+ frame_length=source_rate // 2 * 2,
61
+ hop_length=source_rate // 2,
62
+ )
63
+ rms2 = librosa.feature.rms(
64
+ y=target_audio,
65
+ frame_length=target_rate // 2 * 2,
66
+ hop_length=target_rate // 2,
67
+ )
68
+
69
+ # Interpolate RMS to match target audio length
70
+ rms1 = F.interpolate(
71
+ torch.from_numpy(rms1).float().unsqueeze(0),
72
+ size=target_audio.shape[0],
73
+ mode="linear",
74
+ ).squeeze()
75
+ rms2 = F.interpolate(
76
+ torch.from_numpy(rms2).float().unsqueeze(0),
77
+ size=target_audio.shape[0],
78
+ mode="linear",
79
+ ).squeeze()
80
+ rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
81
+
82
+ # Adjust target audio RMS based on the source audio RMS
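+        # Per-frame gain is (rms1 / rms2) ** (1 - rate): rate = 0 fully matches the
+        # source loudness, rate = 1 leaves the target untouched.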
83
+ adjusted_audio = (
84
+ target_audio
85
+ * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
86
+ )
87
+ return adjusted_audio
88
+
89
+
90
+ class Autotune:
91
+ """
92
+ A class for applying autotune to a given fundamental frequency (F0) contour.
93
+ """
94
+
95
+ def __init__(self, ref_freqs):
96
+ """
97
+ Initializes the Autotune class with a set of reference frequencies.
98
+
99
+ Args:
100
+ ref_freqs: A list of reference frequencies representing musical notes.
101
+ """
102
+ self.ref_freqs = ref_freqs
103
+ self.note_dict = self.generate_interpolated_frequencies()
104
+
105
+ def generate_interpolated_frequencies(self):
106
+ """
107
+ Generates a list of frequencies interpolated between consecutive reference notes.
108
+ """
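+        # linspace with num=10 and endpoint=False yields ten grid points per interval
+        # (including the lower note); the final reference frequency is appended at the end.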
109
+ note_dict = []
110
+ for i in range(len(self.ref_freqs) - 1):
111
+ freq_low = self.ref_freqs[i]
112
+ freq_high = self.ref_freqs[i + 1]
113
+ interpolated_freqs = np.linspace(
114
+ freq_low, freq_high, num=10, endpoint=False
115
+ )
116
+ note_dict.extend(interpolated_freqs)
117
+ note_dict.append(self.ref_freqs[-1])
118
+ return note_dict
119
+
120
+ def autotune_f0(self, f0):
121
+ """
122
+ Autotunes a given F0 contour by snapping each frame to the closest frequency in the note grid.
123
+
124
+ Args:
125
+ f0: The input F0 contour as a NumPy array.
126
+ """
127
+ autotuned_f0 = np.zeros_like(f0)
128
+ for i, freq in enumerate(f0):
129
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
130
+ autotuned_f0[i] = closest_note
131
+ return autotuned_f0
132
+
133
+
134
+ class Pipeline:
135
+ """
136
+ The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
137
+ voice conversion using a model, and post-processing.
138
+ """
139
+
140
+ def __init__(self, tgt_sr, config):
141
+ """
142
+ Initializes the Pipeline class with target sampling rate and configuration parameters.
143
+
144
+ Args:
145
+ tgt_sr: The target sampling rate for the output audio.
146
+ config: A configuration object containing various parameters for the pipeline.
147
+ """
148
+ self.x_pad = config.x_pad
149
+ self.x_query = config.x_query
150
+ self.x_center = config.x_center
151
+ self.x_max = config.x_max
152
+ self.is_half = config.is_half
153
+ self.sample_rate = 16000
154
+ self.window = 160
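+        # 160 samples at 16 kHz = 10 ms per F0 frame.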
155
+ self.t_pad = self.sample_rate * self.x_pad
156
+ self.t_pad_tgt = tgt_sr * self.x_pad
157
+ self.t_pad2 = self.t_pad * 2
158
+ self.t_query = self.sample_rate * self.x_query
159
+ self.t_center = self.sample_rate * self.x_center
160
+ self.t_max = self.sample_rate * self.x_max
161
+ self.time_step = self.window / self.sample_rate * 1000
162
+ self.f0_min = 50
163
+ self.f0_max = 1100
164
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
165
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
166
+ self.device = config.device
167
+ self.ref_freqs = [
168
+ 65.41,
169
+ 82.41,
170
+ 110.00,
171
+ 146.83,
172
+ 196.00,
173
+ 246.94,
174
+ 329.63,
175
+ 440.00,
176
+ 587.33,
177
+ 783.99,
178
+ 1046.50,
179
+ ]
180
+ self.autotune = Autotune(self.ref_freqs)
181
+ self.note_dict = self.autotune.note_dict
182
+
183
+ def get_f0_crepe(
184
+ self,
185
+ x,
186
+ f0_min,
187
+ f0_max,
188
+ p_len,
189
+ hop_length,
190
+ model="full",
191
+ ):
192
+ """
193
+ Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.
194
+
195
+ Args:
196
+ x: The input audio signal as a NumPy array.
197
+ f0_min: Minimum F0 value to consider.
198
+ f0_max: Maximum F0 value to consider.
199
+ p_len: Desired length of the F0 output.
200
+ hop_length: Hop length for the Crepe model.
201
+ model: Crepe model size to use ("full" or "tiny").
202
+ """
203
+ x = x.astype(np.float32)
204
+ x /= np.quantile(np.abs(x), 0.999)
205
+ audio = torch.from_numpy(x).to(self.device, copy=True)
206
+ audio = torch.unsqueeze(audio, dim=0)
207
+ if audio.ndim == 2 and audio.shape[0] > 1:
208
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
209
+ audio = audio.detach()
210
+ pitch: Tensor = torchcrepe.predict(
211
+ audio,
212
+ self.sample_rate,
213
+ hop_length,
214
+ f0_min,
215
+ f0_max,
216
+ model,
217
+ batch_size=hop_length * 2,
218
+ device=self.device,
219
+ pad=True,
220
+ )
221
+ p_len = p_len or x.shape[0] // hop_length
222
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
223
+ source[source < 0.001] = np.nan
224
+ target = np.interp(
225
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
226
+ np.arange(0, len(source)),
227
+ source,
228
+ )
229
+ f0 = np.nan_to_num(target)
230
+ return f0
231
+
232
+ def get_f0_hybrid(
233
+ self,
234
+ methods_str,
235
+ x,
236
+ f0_min,
237
+ f0_max,
238
+ p_len,
239
+ hop_length,
240
+ ):
241
+ """
242
+ Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.
243
+
244
+ Args:
245
+ methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
246
+ x: The input audio signal as a NumPy array.
247
+ f0_min: Minimum F0 value to consider.
248
+ f0_max: Maximum F0 value to consider.
249
+ p_len: Desired length of the F0 output.
250
+ hop_length: Hop length for F0 estimation methods.
251
+ """
252
+ methods_str = re.search(r"hybrid\[(.+)\]", methods_str)
253
+ if methods_str:
254
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
255
+ f0_computation_stack = []
256
+ print(f"Calculating f0 pitch estimations for methods {str(methods)}")
257
+ x = x.astype(np.float32)
258
+ x /= np.quantile(np.abs(x), 0.999)
259
+ for method in methods:
260
+ f0 = None
261
+ if method == "crepe":
262
+ f0 = self.get_f0_crepe(
263
+ x, f0_min, f0_max, p_len, int(hop_length)
264
+ )
265
+ elif method == "rmvpe":
266
+ self.model_rmvpe = RMVPE0Predictor(
267
+ os.path.join(
268
+ "programs",
269
+ "applio_code",
270
+ "rvc",
271
+ "models",
272
+ "predictors",
273
+ "rmvpe.pt",
274
+ ),
275
+ is_half=self.is_half,
276
+ device=self.device,
277
+ )
278
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
279
+ f0 = f0[1:]
280
+ elif method == "fcpe":
281
+ self.model_fcpe = FCPEF0Predictor(
282
+ os.path.join(
283
+ "programs",
284
+ "applio_code",
285
+ "rvc",
286
+ "models",
287
+ "predictors",
288
+ "fcpe.pt",
289
+ ),
290
+ f0_min=int(f0_min),
291
+ f0_max=int(f0_max),
292
+ dtype=torch.float32,
293
+ device=self.device,
294
+ sample_rate=self.sample_rate,
295
+ threshold=0.03,
296
+ )
297
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
298
+ del self.model_fcpe
299
+ gc.collect()
300
+ f0_computation_stack.append(f0)
301
+
302
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
303
+ f0_median_hybrid = None
304
+ if len(f0_computation_stack) == 1:
305
+ f0_median_hybrid = f0_computation_stack[0]
306
+ else:
307
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
308
+ return f0_median_hybrid
309
+
310
+ def get_f0(
311
+ self,
312
+ input_audio_path,
313
+ x,
314
+ p_len,
315
+ pitch,
316
+ f0_method,
317
+ filter_radius,
318
+ hop_length,
319
+ f0_autotune,
320
+ inp_f0=None,
321
+ ):
322
+ """
323
+ Estimates the fundamental frequency (F0) of a given audio signal using various methods.
324
+
325
+ Args:
326
+ input_audio_path: Path to the input audio file.
327
+ x: The input audio signal as a NumPy array.
328
+ p_len: Desired length of the F0 output.
329
+ pitch: Key to adjust the pitch of the F0 contour.
330
+ f0_method: Method to use for F0 estimation (e.g., "crepe").
331
+ filter_radius: Radius for median filtering the F0 contour.
332
+ hop_length: Hop length for F0 estimation methods.
333
+ f0_autotune: Whether to apply autotune to the F0 contour.
334
+ inp_f0: Optional input F0 contour to use instead of estimating.
335
+ """
336
+ global input_audio_path2wav
337
+ if f0_method == "crepe":
338
+ f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
339
+ elif f0_method == "crepe-tiny":
340
+ f0 = self.get_f0_crepe(
341
+ x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
342
+ )
343
+ elif f0_method == "rmvpe":
344
+ self.model_rmvpe = RMVPE0Predictor(
345
+ os.path.join(
346
+ "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt"
347
+ ),
348
+ is_half=self.is_half,
349
+ device=self.device,
350
+ )
351
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
352
+ elif f0_method == "fcpe":
353
+ self.model_fcpe = FCPEF0Predictor(
354
+ os.path.join(
355
+ "programs", "applio_code", "rvc", "models", "predictors", "fcpe.pt"
356
+ ),
357
+ f0_min=int(self.f0_min),
358
+ f0_max=int(self.f0_max),
359
+ dtype=torch.float32,
360
+ device=self.device,
361
+ sample_rate=self.sample_rate,
362
+ threshold=0.03,
363
+ )
364
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
365
+ del self.model_fcpe
366
+ gc.collect()
367
+ elif "hybrid" in f0_method:
368
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
369
+ f0 = self.get_f0_hybrid(
370
+ f0_method,
371
+ x,
372
+ self.f0_min,
373
+ self.f0_max,
374
+ p_len,
375
+ hop_length,
376
+ )
377
+
378
+ if f0_autotune:
379
+ f0 = self.autotune.autotune_f0(f0)
380
+
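+        # Transpose by `pitch` semitones: multiply F0 by 2^(pitch / 12).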
381
+ f0 *= pow(2, pitch / 12)
382
+ tf0 = self.sample_rate // self.window
383
+ if inp_f0 is not None:
384
+ delta_t = np.round(
385
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
386
+ ).astype("int16")
387
+ replace_f0 = np.interp(
388
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
389
+ )
390
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
391
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
392
+ :shape
393
+ ]
394
+ f0bak = f0.copy()
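+        # Quantize F0 into 255 coarse bins on the mel scale (mel = 1127 * ln(1 + f / 700));
+        # the continuous contour is kept in f0bak for the synthesizer's pitchf input.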
395
+ f0_mel = 1127 * np.log(1 + f0 / 700)
396
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
397
+ self.f0_mel_max - self.f0_mel_min
398
+ ) + 1
399
+ f0_mel[f0_mel <= 1] = 1
400
+ f0_mel[f0_mel > 255] = 255
401
+ f0_coarse = np.rint(f0_mel).astype(int)
402
+
403
+ return f0_coarse, f0bak
404
+
405
+ def voice_conversion(
406
+ self,
407
+ model,
408
+ net_g,
409
+ sid,
410
+ audio0,
411
+ pitch,
412
+ pitchf,
413
+ index,
414
+ big_npy,
415
+ index_rate,
416
+ version,
417
+ protect,
418
+ ):
419
+ """
420
+ Performs voice conversion on a given audio segment.
421
+
422
+ Args:
423
+ model: The feature extractor model.
424
+ net_g: The generative model for synthesizing speech.
425
+ sid: Speaker ID for the target voice.
426
+ audio0: The input audio segment.
427
+ pitch: Quantized F0 contour for pitch guidance.
428
+ pitchf: Original F0 contour for pitch guidance.
429
+ index: FAISS index for speaker embedding retrieval.
430
+ big_npy: Speaker embeddings stored in a NumPy array.
431
+ index_rate: Blending rate for speaker embedding retrieval.
432
+ version: Model version ("v1" or "v2").
433
+ protect: Protection level for preserving the original pitch.
434
+ """
435
+ feats = torch.from_numpy(audio0)
436
+ if self.is_half:
437
+ feats = feats.half()
438
+ else:
439
+ feats = feats.float()
440
+ if feats.dim() == 2:
441
+ feats = feats.mean(-1)
442
+ assert feats.dim() == 1, feats.dim()
443
+ feats = feats.view(1, -1)
444
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
445
+
446
+ with torch.no_grad():
447
+ feats = model(feats.to(self.device))["last_hidden_state"]
448
+ feats = (
449
+ model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
450
+ )
451
+ if protect < 0.5 and pitch != None and pitchf != None:
452
+ feats0 = feats.clone()
453
+ if (
454
+ isinstance(index, type(None)) == False
455
+ and isinstance(big_npy, type(None)) == False
456
+ and index_rate != 0
457
+ ):
458
+ npy = feats[0].cpu().numpy()
459
+ if self.is_half:
460
+ npy = npy.astype("float32")
461
+
462
+ score, ix = index.search(npy, k=8)
463
+ weight = np.square(1 / score)
464
+ weight /= weight.sum(axis=1, keepdims=True)
465
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
466
+
467
+ if self.is_half:
468
+ npy = npy.astype("float16")
469
+ feats = (
470
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
471
+ + (1 - index_rate) * feats
472
+ )
473
+
474
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
475
+ if protect < 0.5 and pitch != None and pitchf != None:
476
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
477
+ 0, 2, 1
478
+ )
479
+ p_len = audio0.shape[0] // self.window
480
+ if feats.shape[1] < p_len:
481
+ p_len = feats.shape[1]
482
+ if pitch != None and pitchf != None:
483
+ pitch = pitch[:, :p_len]
484
+ pitchf = pitchf[:, :p_len]
485
+
486
+ if protect < 0.5 and pitch != None and pitchf != None:
487
+ pitchff = pitchf.clone()
488
+ pitchff[pitchf > 0] = 1
489
+ pitchff[pitchf < 1] = protect
490
+ pitchff = pitchff.unsqueeze(-1)
491
+ feats = feats * pitchff + feats0 * (1 - pitchff)
492
+ feats = feats.to(feats0.dtype)
493
+ p_len = torch.tensor([p_len], device=self.device).long()
494
+ with torch.no_grad():
495
+ if pitch != None and pitchf != None:
496
+ audio1 = (
497
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
498
+ .data.cpu()
499
+ .float()
500
+ .numpy()
501
+ )
502
+ else:
503
+ audio1 = (
504
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
505
+ )
506
+ del feats, p_len, padding_mask
507
+ if torch.cuda.is_available():
508
+ torch.cuda.empty_cache()
509
+ return audio1
510
+
511
+ def pipeline(
512
+ self,
513
+ model,
514
+ net_g,
515
+ sid,
516
+ audio,
517
+ input_audio_path,
518
+ pitch,
519
+ f0_method,
520
+ file_index,
521
+ index_rate,
522
+ pitch_guidance,
523
+ filter_radius,
524
+ tgt_sr,
525
+ resample_sr,
526
+ volume_envelope,
527
+ version,
528
+ protect,
529
+ hop_length,
530
+ f0_autotune,
531
+ f0_file,
532
+ ):
533
+ """
534
+ The main pipeline function for performing voice conversion.
535
+
536
+ Args:
537
+ model: The feature extractor model.
538
+ net_g: The generative model for synthesizing speech.
539
+ sid: Speaker ID for the target voice.
540
+ audio: The input audio signal.
541
+ input_audio_path: Path to the input audio file.
542
+ pitch: Key to adjust the pitch of the F0 contour.
543
+ f0_method: Method to use for F0 estimation.
544
+ file_index: Path to the FAISS index file for speaker embedding retrieval.
545
+ index_rate: Blending rate for speaker embedding retrieval.
546
+ pitch_guidance: Whether to use pitch guidance during voice conversion.
547
+ filter_radius: Radius for median filtering the F0 contour.
548
+ tgt_sr: Target sampling rate for the output audio.
549
+ resample_sr: Resampling rate for the output audio.
550
+ volume_envelope: Blending rate for adjusting the RMS level of the output audio.
551
+ version: Model version.
552
+ protect: Protection level for preserving the original pitch.
553
+ hop_length: Hop length for F0 estimation methods.
554
+ f0_autotune: Whether to apply autotune to the F0 contour.
555
+ f0_file: Path to a file containing an F0 contour to use.
556
+ """
557
+ if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
558
+ try:
559
+ index = faiss.read_index(file_index)
560
+ big_npy = index.reconstruct_n(0, index.ntotal)
561
+ except Exception as error:
562
+ print(f"An error occurred reading the FAISS index: {error}")
563
+ index = big_npy = None
564
+ else:
565
+ index = big_npy = None
566
+ audio = signal.filtfilt(bh, ah, audio)
567
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
568
+ opt_ts = []
569
+ if audio_pad.shape[0] > self.t_max:
570
+ audio_sum = np.zeros_like(audio)
571
+ for i in range(self.window):
572
+ audio_sum += audio_pad[i : i - self.window]
573
+ for t in range(self.t_center, audio.shape[0], self.t_center):
574
+ opt_ts.append(
575
+ t
576
+ - self.t_query
577
+ + np.where(
578
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
579
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
580
+ )[0][0]
581
+ )
582
+ s = 0
583
+ audio_opt = []
584
+ t = None
585
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
586
+ p_len = audio_pad.shape[0] // self.window
587
+ inp_f0 = None
588
+ if hasattr(f0_file, "name") == True:
589
+ try:
590
+ with open(f0_file.name, "r") as f:
591
+ lines = f.read().strip("\n").split("\n")
592
+ inp_f0 = []
593
+ for line in lines:
594
+ inp_f0.append([float(i) for i in line.split(",")])
595
+ inp_f0 = np.array(inp_f0, dtype="float32")
596
+ except Exception as error:
597
+ print(f"An error occurred reading the F0 file: {error}")
598
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
599
+ if pitch_guidance == True:
600
+ pitch, pitchf = self.get_f0(
601
+ input_audio_path,
602
+ audio_pad,
603
+ p_len,
604
+ pitch,
605
+ f0_method,
606
+ filter_radius,
607
+ hop_length,
608
+ f0_autotune,
609
+ inp_f0,
610
+ )
611
+ pitch = pitch[:p_len]
612
+ pitchf = pitchf[:p_len]
613
+ if self.device == "mps":
614
+ pitchf = pitchf.astype(np.float32)
615
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
616
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
617
+ for t in opt_ts:
618
+ t = t // self.window * self.window
619
+ if pitch_guidance == True:
620
+ audio_opt.append(
621
+ self.voice_conversion(
622
+ model,
623
+ net_g,
624
+ sid,
625
+ audio_pad[s : t + self.t_pad2 + self.window],
626
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
627
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
628
+ index,
629
+ big_npy,
630
+ index_rate,
631
+ version,
632
+ protect,
633
+ )[self.t_pad_tgt : -self.t_pad_tgt]
634
+ )
635
+ else:
636
+ audio_opt.append(
637
+ self.voice_conversion(
638
+ model,
639
+ net_g,
640
+ sid,
641
+ audio_pad[s : t + self.t_pad2 + self.window],
642
+ None,
643
+ None,
644
+ index,
645
+ big_npy,
646
+ index_rate,
647
+ version,
648
+ protect,
649
+ )[self.t_pad_tgt : -self.t_pad_tgt]
650
+ )
651
+ s = t
652
+ if pitch_guidance == True:
653
+ audio_opt.append(
654
+ self.voice_conversion(
655
+ model,
656
+ net_g,
657
+ sid,
658
+ audio_pad[t:],
659
+ pitch[:, t // self.window :] if t is not None else pitch,
660
+ pitchf[:, t // self.window :] if t is not None else pitchf,
661
+ index,
662
+ big_npy,
663
+ index_rate,
664
+ version,
665
+ protect,
666
+ )[self.t_pad_tgt : -self.t_pad_tgt]
667
+ )
668
+ else:
669
+ audio_opt.append(
670
+ self.voice_conversion(
671
+ model,
672
+ net_g,
673
+ sid,
674
+ audio_pad[t:],
675
+ None,
676
+ None,
677
+ index,
678
+ big_npy,
679
+ index_rate,
680
+ version,
681
+ protect,
682
+ )[self.t_pad_tgt : -self.t_pad_tgt]
683
+ )
684
+ audio_opt = np.concatenate(audio_opt)
685
+ if volume_envelope != 1:
686
+ audio_opt = AudioProcessor.change_rms(
687
+ audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope
688
+ )
689
+ if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
690
+ audio_opt = librosa.resample(
691
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
692
+ )
693
+ audio_max = np.abs(audio_opt).max() / 0.99
694
+ max_int16 = 32768
695
+ if audio_max > 1:
696
+ max_int16 /= audio_max
697
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
698
+ if pitch_guidance == True:
+ del pitch, pitchf  # pitchf is only bound when pitch guidance is enabled
+ del sid
699
+ if torch.cuda.is_available():
700
+ torch.cuda.empty_cache()
701
+ return audio_opt
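
Note on the coarse-F0 mapping above: get_f0 converts the (pitch-shifted) contour to the mel scale and rescales it onto 255 integer bins before handing it to the synthesizer. The snippet below is a minimal, self-contained sketch of that same mapping, assuming the usual RVC bounds of f0_min = 50 Hz and f0_max = 1100 Hz (in the real pipeline these come from the Pipeline instance):

import numpy as np

# Assumed bounds; the pipeline reads these from self.f0_min / self.f0_max.
f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

def quantize_f0(f0: np.ndarray) -> np.ndarray:
    """Map an F0 contour in Hz onto 1..255 integer bins (unvoiced frames land on bin 1)."""
    f0_mel = 1127 * np.log(1 + f0 / 700)                     # Hz -> mel
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1                                                     # rescale voiced frames to 1..255
    f0_mel = np.clip(f0_mel, 1, 255)                          # clamp, unvoiced (0 Hz) -> 1
    return np.rint(f0_mel).astype(np.int64)

f0 = np.array([0.0, 110.0, 220.0, 440.0, 880.0])
print(quantize_f0(f0))                                        # five integer bins in 1..255
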
programs/applio_code/rvc/lib/algorithm/__init__.py ADDED
File without changes
programs/applio_code/rvc/lib/algorithm/attentions.py ADDED
@@ -0,0 +1,292 @@
1
+ import math
2
+ import torch
3
+
4
+ from programs.applio_code.rvc.lib.algorithm.commons import convert_pad_shape
5
+
6
+
7
+ class MultiHeadAttention(torch.nn.Module):
8
+ """
9
+ Multi-head attention module with optional relative positional encoding and proximal bias.
10
+
11
+ Args:
12
+ channels (int): Number of input channels.
13
+ out_channels (int): Number of output channels.
14
+ n_heads (int): Number of attention heads.
15
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
16
+ window_size (int, optional): Window size for relative positional encoding. Defaults to None.
17
+ heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
18
+ block_length (int, optional): Block length for local attention. Defaults to None.
19
+ proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
20
+ proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ channels,
26
+ out_channels,
27
+ n_heads,
28
+ p_dropout=0.0,
29
+ window_size=None,
30
+ heads_share=True,
31
+ block_length=None,
32
+ proximal_bias=False,
33
+ proximal_init=False,
34
+ ):
35
+ super().__init__()
36
+ assert channels % n_heads == 0
37
+
38
+ self.channels = channels
39
+ self.out_channels = out_channels
40
+ self.n_heads = n_heads
41
+ self.p_dropout = p_dropout
42
+ self.window_size = window_size
43
+ self.heads_share = heads_share
44
+ self.block_length = block_length
45
+ self.proximal_bias = proximal_bias
46
+ self.proximal_init = proximal_init
47
+ self.attn = None
48
+
49
+ self.k_channels = channels // n_heads
50
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
51
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
52
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
53
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
54
+ self.drop = torch.nn.Dropout(p_dropout)
55
+
56
+ if window_size is not None:
57
+ n_heads_rel = 1 if heads_share else n_heads
58
+ rel_stddev = self.k_channels**-0.5
59
+ self.emb_rel_k = torch.nn.Parameter(
60
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
61
+ * rel_stddev
62
+ )
63
+ self.emb_rel_v = torch.nn.Parameter(
64
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
65
+ * rel_stddev
66
+ )
67
+
68
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
69
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
70
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
71
+ if proximal_init:
72
+ with torch.no_grad():
73
+ self.conv_k.weight.copy_(self.conv_q.weight)
74
+ self.conv_k.bias.copy_(self.conv_q.bias)
75
+
76
+ def forward(self, x, c, attn_mask=None):
77
+ q = self.conv_q(x)
78
+ k = self.conv_k(c)
79
+ v = self.conv_v(c)
80
+
81
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
82
+
83
+ x = self.conv_o(x)
84
+ return x
85
+
86
+ def attention(self, query, key, value, mask=None):
87
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
88
+ b, d, t_s, t_t = (*key.size(), query.size(2))
89
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
90
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
91
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
92
+
93
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
94
+ if self.window_size is not None:
95
+ assert (
96
+ t_s == t_t
97
+ ), "Relative attention is only available for self-attention."
98
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
99
+ rel_logits = self._matmul_with_relative_keys(
100
+ query / math.sqrt(self.k_channels), key_relative_embeddings
101
+ )
102
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
103
+ scores = scores + scores_local
104
+ if self.proximal_bias:
105
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
106
+ scores = scores + self._attention_bias_proximal(t_s).to(
107
+ device=scores.device, dtype=scores.dtype
108
+ )
109
+ if mask is not None:
110
+ scores = scores.masked_fill(mask == 0, -1e4)
111
+ if self.block_length is not None:
112
+ assert (
113
+ t_s == t_t
114
+ ), "Local attention is only available for self-attention."
115
+ block_mask = (
116
+ torch.ones_like(scores)
117
+ .triu(-self.block_length)
118
+ .tril(self.block_length)
119
+ )
120
+ scores = scores.masked_fill(block_mask == 0, -1e4)
121
+ p_attn = torch.nn.functional.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
122
+ p_attn = self.drop(p_attn)
123
+ output = torch.matmul(p_attn, value)
124
+ if self.window_size is not None:
125
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
126
+ value_relative_embeddings = self._get_relative_embeddings(
127
+ self.emb_rel_v, t_s
128
+ )
129
+ output = output + self._matmul_with_relative_values(
130
+ relative_weights, value_relative_embeddings
131
+ )
132
+ output = (
133
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
134
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
135
+ return output, p_attn
136
+
137
+ def _matmul_with_relative_values(self, x, y):
138
+ """
139
+ x: [b, h, l, m]
140
+ y: [h or 1, m, d]
141
+ ret: [b, h, l, d]
142
+ """
143
+ ret = torch.matmul(x, y.unsqueeze(0))
144
+ return ret
145
+
146
+ def _matmul_with_relative_keys(self, x, y):
147
+ """
148
+ x: [b, h, l, d]
149
+ y: [h or 1, m, d]
150
+ ret: [b, h, l, m]
151
+ """
152
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
153
+ return ret
154
+
155
+ def _get_relative_embeddings(self, relative_embeddings, length):
156
+ # Pad first before slice to avoid using cond ops.
157
+ pad_length = max(length - (self.window_size + 1), 0)
158
+ slice_start_position = max((self.window_size + 1) - length, 0)
159
+ slice_end_position = slice_start_position + 2 * length - 1
160
+ if pad_length > 0:
161
+ padded_relative_embeddings = torch.nn.functional.pad(
162
+ relative_embeddings,
163
+ convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
164
+ )
165
+ else:
166
+ padded_relative_embeddings = relative_embeddings
167
+ used_relative_embeddings = padded_relative_embeddings[
168
+ :, slice_start_position:slice_end_position
169
+ ]
170
+ return used_relative_embeddings
171
+
172
+ def _relative_position_to_absolute_position(self, x):
173
+ """
174
+ x: [b, h, l, 2*l-1]
175
+ ret: [b, h, l, l]
176
+ """
177
+ batch, heads, length, _ = x.size()
178
+
179
+ # Concat columns of pad to shift from relative to absolute indexing.
180
+ x = torch.nn.functional.pad(
181
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
182
+ )
183
+
184
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
185
+ x_flat = x.view([batch, heads, length * 2 * length])
186
+ x_flat = torch.nn.functional.pad(
187
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
188
+ )
189
+
190
+ # Reshape and slice out the padded elements.
191
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
192
+ :, :, :length, length - 1 :
193
+ ]
194
+ return x_final
195
+
196
+ def _absolute_position_to_relative_position(self, x):
197
+ """
198
+ x: [b, h, l, l]
199
+ ret: [b, h, l, 2*l-1]
200
+ """
201
+ batch, heads, length, _ = x.size()
202
+ # padd along column
203
+ x = torch.nn.functional.pad(
204
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
205
+ )
206
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
207
+ # add 0's in the beginning that will skew the elements after reshape
208
+ x_flat = torch.nn.functional.pad(
209
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
210
+ )
211
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
212
+ return x_final
213
+
214
+ def _attention_bias_proximal(self, length):
215
+ """Bias for self-attention to encourage attention to close positions.
216
+ Args:
217
+ length: an integer scalar.
218
+ """
219
+ r = torch.arange(length, dtype=torch.float32)
220
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
221
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
222
+
223
+
224
+ class FFN(torch.nn.Module):
225
+ """
226
+ Feed-forward network module.
227
+
228
+ Args:
229
+ in_channels (int): Number of input channels.
230
+ out_channels (int): Number of output channels.
231
+ filter_channels (int): Number of filter channels in the convolution layers.
232
+ kernel_size (int): Kernel size of the convolution layers.
233
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
234
+ activation (str, optional): Activation function to use. Defaults to None.
235
+ causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
236
+ """
237
+
238
+ def __init__(
239
+ self,
240
+ in_channels,
241
+ out_channels,
242
+ filter_channels,
243
+ kernel_size,
244
+ p_dropout=0.0,
245
+ activation=None,
246
+ causal=False,
247
+ ):
248
+ super().__init__()
249
+ self.in_channels = in_channels
250
+ self.out_channels = out_channels
251
+ self.filter_channels = filter_channels
252
+ self.kernel_size = kernel_size
253
+ self.p_dropout = p_dropout
254
+ self.activation = activation
255
+ self.causal = causal
256
+
257
+ if causal:
258
+ self.padding = self._causal_padding
259
+ else:
260
+ self.padding = self._same_padding
261
+
262
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
263
+ self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
264
+ self.drop = torch.nn.Dropout(p_dropout)
265
+
266
+ def forward(self, x, x_mask):
267
+ x = self.conv_1(self.padding(x * x_mask))
268
+ if self.activation == "gelu":
269
+ x = x * torch.sigmoid(1.702 * x)
270
+ else:
271
+ x = torch.relu(x)
272
+ x = self.drop(x)
273
+ x = self.conv_2(self.padding(x * x_mask))
274
+ return x * x_mask
275
+
276
+ def _causal_padding(self, x):
277
+ if self.kernel_size == 1:
278
+ return x
279
+ pad_l = self.kernel_size - 1
280
+ pad_r = 0
281
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
282
+ x = torch.nn.functional.pad(x, convert_pad_shape(padding))
283
+ return x
284
+
285
+ def _same_padding(self, x):
286
+ if self.kernel_size == 1:
287
+ return x
288
+ pad_l = (self.kernel_size - 1) // 2
289
+ pad_r = self.kernel_size // 2
290
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
291
+ x = torch.nn.functional.pad(x, convert_pad_shape(padding))
292
+ return x
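
A small shape check makes the conventions of these blocks concrete: both MultiHeadAttention and FFN operate on [batch, channels, time] tensors and expect a [batch, 1, time] validity mask. The snippet below is a hypothetical smoke test rather than part of the upload; the import path simply mirrors this repository's layout and the sizes are arbitrary.

import torch
from programs.applio_code.rvc.lib.algorithm.attentions import MultiHeadAttention, FFN

b, d, t = 2, 192, 100                                 # batch, channels, time
x = torch.randn(b, d, t)
mask = torch.ones(b, 1, t)                            # all positions valid
attn_mask = mask.unsqueeze(2) * mask.unsqueeze(-1)    # [b, 1, t, t], as built in Encoder

attn = MultiHeadAttention(d, d, n_heads=2, p_dropout=0.1, window_size=10)
y = attn(x, x, attn_mask=attn_mask)                   # self-attention keeps the [b, d, t] shape
ffn = FFN(d, d, filter_channels=768, kernel_size=3, p_dropout=0.1)
z = ffn(y, mask)                                      # masked feed-forward, still [b, d, t]
print(y.shape, z.shape)                               # torch.Size([2, 192, 100]) twice
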
programs/applio_code/rvc/lib/algorithm/commons.py ADDED
@@ -0,0 +1,225 @@
1
+ import math
2
+ import torch
3
+ from typing import List, Optional
4
+
5
+
6
+ def init_weights(m, mean=0.0, std=0.01):
7
+ """
8
+ Initialize the weights of a module.
9
+
10
+ Args:
11
+ m: The module to initialize.
12
+ mean: The mean of the normal distribution.
13
+ std: The standard deviation of the normal distribution.
14
+ """
15
+ classname = m.__class__.__name__
16
+ if classname.find("Conv") != -1:
17
+ m.weight.data.normal_(mean, std)
18
+
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ """
22
+ Calculate the padding needed for a convolution.
23
+
24
+ Args:
25
+ kernel_size: The size of the kernel.
26
+ dilation: The dilation of the convolution.
27
+ """
28
+ return int((kernel_size * dilation - dilation) / 2)
29
+
30
+
31
+ def convert_pad_shape(pad_shape):
32
+ """
33
+ Convert the pad shape to a list of integers.
34
+
35
+ Args:
36
+ pad_shape: The pad shape.
37
+ """
38
+ l = pad_shape[::-1]
39
+ pad_shape = [item for sublist in l for item in sublist]
40
+ return pad_shape
41
+
42
+
43
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
44
+ """
45
+ Calculate the KL divergence between two distributions.
46
+
47
+ Args:
48
+ m_p: The mean of the first distribution.
49
+ logs_p: The log of the standard deviation of the first distribution.
50
+ m_q: The mean of the second distribution.
51
+ logs_q: The log of the standard deviation of the second distribution.
52
+ """
53
+ kl = (logs_q - logs_p) - 0.5
54
+ kl += (
55
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
56
+ )
57
+ return kl
58
+
59
+
60
+ def slice_segments(
61
+ x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
62
+ ):
63
+ """
64
+ Slice segments from a tensor, handling tensors with different numbers of dimensions.
65
+
66
+ Args:
67
+ x (torch.Tensor): The tensor to slice.
68
+ ids_str (torch.Tensor): The starting indices of the segments.
69
+ segment_size (int, optional): The size of each segment. Defaults to 4.
70
+ dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
71
+ """
72
+ if dim == 2:
73
+ ret = torch.zeros_like(x[:, :segment_size])
74
+ elif dim == 3:
75
+ ret = torch.zeros_like(x[:, :, :segment_size])
76
+
77
+ for i in range(x.size(0)):
78
+ idx_str = ids_str[i].item()
79
+ idx_end = idx_str + segment_size
80
+ if dim == 2:
81
+ ret[i] = x[i, idx_str:idx_end]
82
+ else:
83
+ ret[i] = x[i, :, idx_str:idx_end]
84
+
85
+ return ret
86
+
87
+
88
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
89
+ """
90
+ Randomly slice segments from a tensor.
91
+
92
+ Args:
93
+ x: The tensor to slice.
94
+ x_lengths: The lengths of the sequences.
95
+ segment_size: The size of each segment.
96
+ """
97
+ b, d, t = x.size()
98
+ if x_lengths is None:
99
+ x_lengths = t
100
+ ids_str_max = x_lengths - segment_size + 1
101
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
102
+ ret = slice_segments(x, ids_str, segment_size, dim=3)
103
+ return ret, ids_str
104
+
105
+
106
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
107
+ """
108
+ Generate a 1D timing signal.
109
+
110
+ Args:
111
+ length: The length of the signal.
112
+ channels: The number of channels of the signal.
113
+ min_timescale: The minimum timescale.
114
+ max_timescale: The maximum timescale.
115
+ """
116
+ position = torch.arange(length, dtype=torch.float)
117
+ num_timescales = channels // 2
118
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
119
+ num_timescales - 1
120
+ )
121
+ inv_timescales = min_timescale * torch.exp(
122
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
123
+ )
124
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
125
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
126
+ signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2])
127
+ signal = signal.view(1, channels, length)
128
+ return signal
129
+
130
+
131
+ def subsequent_mask(length):
132
+ """
133
+ Generate a subsequent mask.
134
+
135
+ Args:
136
+ length: The length of the sequence.
137
+ """
138
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
139
+ return mask
140
+
141
+
142
+ @torch.jit.script
143
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
144
+ """
145
+ Fused add tanh sigmoid multiply operation.
146
+
147
+ Args:
148
+ input_a: The first input tensor.
149
+ input_b: The second input tensor.
150
+ n_channels: The number of channels.
151
+ """
152
+ n_channels_int = n_channels[0]
153
+ in_act = input_a + input_b
154
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
155
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
156
+ acts = t_act * s_act
157
+ return acts
158
+
159
+
160
+ # Zluda, same as previous, but without jit.script
161
+ def fused_add_tanh_sigmoid_multiply_no_jit(input_a, input_b, n_channels):
162
+ """
163
+ Fused add tanh sigmoid multiply operation.
164
+
165
+ Args:
166
+ input_a: The first input tensor.
167
+ input_b: The second input tensor.
168
+ n_channels: The number of channels.
169
+ """
170
+ n_channels_int = n_channels[0]
171
+ in_act = input_a + input_b
172
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
173
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
174
+ acts = t_act * s_act
175
+ return acts
176
+
177
+
178
+ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
179
+ """
180
+ Convert the pad shape to a list of integers.
181
+
182
+ Args:
183
+ pad_shape: The pad shape.
184
+ """
185
+ return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
186
+
187
+
188
+ def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
189
+ """
190
+ Generate a sequence mask.
191
+
192
+ Args:
193
+ length: The lengths of the sequences.
194
+ max_length: The maximum length of the sequences.
195
+ """
196
+ if max_length is None:
197
+ max_length = length.max()
198
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
199
+ return x.unsqueeze(0) < length.unsqueeze(1)
200
+
201
+
202
+ def clip_grad_value(parameters, clip_value, norm_type=2):
203
+ """
204
+ Clip the gradients of a list of parameters.
205
+
206
+ Args:
207
+ parameters: The list of parameters to clip.
208
+ clip_value: The maximum value of the gradients.
209
+ norm_type: The type of norm to use for clipping.
210
+ """
211
+ if isinstance(parameters, torch.Tensor):
212
+ parameters = [parameters]
213
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
214
+ norm_type = float(norm_type)
215
+ if clip_value is not None:
216
+ clip_value = float(clip_value)
217
+
218
+ total_norm = 0
219
+ for p in parameters:
220
+ param_norm = p.grad.data.norm(norm_type)
221
+ total_norm += param_norm.item() ** norm_type
222
+ if clip_value is not None:
223
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
224
+ total_norm = total_norm ** (1.0 / norm_type)
225
+ return total_norm
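
The slicing and masking helpers above are easiest to understand from their shapes: sequence_mask turns a vector of lengths into a boolean [batch, max_len] mask, and rand_slice_segments cuts one fixed-size window per batch element for the training loop. A hypothetical usage sketch follows; the import path is assumed to mirror this repository's layout and the sizes are arbitrary.

import torch
from programs.applio_code.rvc.lib.algorithm.commons import (
    sequence_mask,
    rand_slice_segments,
)

lengths = torch.tensor([6, 3, 5])
mask = sequence_mask(lengths)                         # [3, 6] boolean mask, True inside each length
print(mask.int())

x = torch.randn(3, 192, 40)                           # [batch, channels, frames]
segments, ids_str = rand_slice_segments(
    x, x_lengths=torch.tensor([40, 30, 20]), segment_size=8
)
print(segments.shape, ids_str)                        # torch.Size([3, 192, 8]) and the 3 start frames
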
programs/applio_code/rvc/lib/algorithm/discriminators.py ADDED
@@ -0,0 +1,199 @@
1
+ import torch
2
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
3
+
4
+ from programs.applio_code.rvc.lib.algorithm.commons import get_padding
5
+ from programs.applio_code.rvc.lib.algorithm.residuals import LRELU_SLOPE
6
+
7
+
8
+ class MultiPeriodDiscriminator(torch.nn.Module):
9
+ """
10
+ Multi-period discriminator.
11
+
12
+ This class implements a multi-period discriminator, which is used to
13
+ discriminate between real and fake audio signals. The discriminator
14
+ is composed of a series of convolutional layers that are applied to
15
+ the input signal at different periods.
16
+
17
+ Args:
18
+ use_spectral_norm (bool): Whether to use spectral normalization.
19
+ Defaults to False.
20
+ """
21
+
22
+ def __init__(self, use_spectral_norm=False):
23
+ super(MultiPeriodDiscriminator, self).__init__()
24
+ periods = [2, 3, 5, 7, 11, 17]
25
+ self.discriminators = torch.nn.ModuleList(
26
+ [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
27
+ + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
28
+ )
29
+
30
+ def forward(self, y, y_hat):
31
+ """
32
+ Forward pass of the multi-period discriminator.
33
+
34
+ Args:
35
+ y (torch.Tensor): Real audio signal.
36
+ y_hat (torch.Tensor): Fake audio signal.
37
+ """
38
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
39
+ for d in self.discriminators:
40
+ y_d_r, fmap_r = d(y)
41
+ y_d_g, fmap_g = d(y_hat)
42
+ y_d_rs.append(y_d_r)
43
+ y_d_gs.append(y_d_g)
44
+ fmap_rs.append(fmap_r)
45
+ fmap_gs.append(fmap_g)
46
+
47
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
48
+
49
+
50
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
51
+ """
52
+ Multi-period discriminator V2.
53
+
54
+ This class implements a multi-period discriminator V2, which is used
55
+ to discriminate between real and fake audio signals. The discriminator
56
+ is composed of a series of convolutional layers that are applied to
57
+ the input signal at different periods.
58
+
59
+ Args:
60
+ use_spectral_norm (bool): Whether to use spectral normalization.
61
+ Defaults to False.
62
+ """
63
+
64
+ def __init__(self, use_spectral_norm=False):
65
+ super(MultiPeriodDiscriminatorV2, self).__init__()
66
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
67
+ self.discriminators = torch.nn.ModuleList(
68
+ [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
69
+ + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
70
+ )
71
+
72
+ def forward(self, y, y_hat):
73
+ """
74
+ Forward pass of the multi-period discriminator V2.
75
+
76
+ Args:
77
+ y (torch.Tensor): Real audio signal.
78
+ y_hat (torch.Tensor): Fake audio signal.
79
+ """
80
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
81
+ for d in self.discriminators:
82
+ y_d_r, fmap_r = d(y)
83
+ y_d_g, fmap_g = d(y_hat)
84
+ y_d_rs.append(y_d_r)
85
+ y_d_gs.append(y_d_g)
86
+ fmap_rs.append(fmap_r)
87
+ fmap_gs.append(fmap_g)
88
+
89
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
90
+
91
+
92
+ class DiscriminatorS(torch.nn.Module):
93
+ """
94
+ Discriminator for the short-term component.
95
+
96
+ This class implements a discriminator for the short-term component
97
+ of the audio signal. The discriminator is composed of a series of
98
+ convolutional layers that are applied to the input signal.
99
+ """
100
+
101
+ def __init__(self, use_spectral_norm=False):
102
+ super(DiscriminatorS, self).__init__()
103
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
104
+ self.convs = torch.nn.ModuleList(
105
+ [
106
+ norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
107
+ norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
108
+ norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
109
+ norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
110
+ norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
111
+ norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
112
+ ]
113
+ )
114
+ self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
115
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
116
+
117
+ def forward(self, x):
118
+ """
119
+ Forward pass of the discriminator.
120
+
121
+ Args:
122
+ x (torch.Tensor): Input audio signal.
123
+ """
124
+ fmap = []
125
+ for conv in self.convs:
126
+ x = self.lrelu(conv(x))
127
+ fmap.append(x)
128
+ x = self.conv_post(x)
129
+ fmap.append(x)
130
+ x = torch.flatten(x, 1, -1)
131
+ return x, fmap
132
+
133
+
134
+ class DiscriminatorP(torch.nn.Module):
135
+ """
136
+ Discriminator for the long-term component.
137
+
138
+ This class implements a discriminator for the long-term component
139
+ of the audio signal. The discriminator is composed of a series of
140
+ convolutional layers that are applied to the input signal at a given
141
+ period.
142
+
143
+ Args:
144
+ period (int): Period of the discriminator.
145
+ kernel_size (int): Kernel size of the convolutional layers.
146
+ Defaults to 5.
147
+ stride (int): Stride of the convolutional layers. Defaults to 3.
148
+ use_spectral_norm (bool): Whether to use spectral normalization.
149
+ Defaults to False.
150
+ """
151
+
152
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
153
+ super(DiscriminatorP, self).__init__()
154
+ self.period = period
155
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
156
+
157
+ in_channels = [1, 32, 128, 512, 1024]
158
+ out_channels = [32, 128, 512, 1024, 1024]
159
+
160
+ self.convs = torch.nn.ModuleList(
161
+ [
162
+ norm_f(
163
+ torch.nn.Conv2d(
164
+ in_ch,
165
+ out_ch,
166
+ (kernel_size, 1),
167
+ (stride, 1),
168
+ padding=(get_padding(kernel_size, 1), 0),
169
+ )
170
+ )
171
+ for in_ch, out_ch in zip(in_channels, out_channels)
172
+ ]
173
+ )
174
+
175
+ self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
176
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
177
+
178
+ def forward(self, x):
179
+ """
180
+ Forward pass of the discriminator.
181
+
182
+ Args:
183
+ x (torch.Tensor): Input audio signal.
184
+ """
185
+ fmap = []
186
+ b, c, t = x.shape
187
+ if t % self.period != 0:
188
+ n_pad = self.period - (t % self.period)
189
+ x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
190
+ x = x.view(b, c, -1, self.period)
191
+
192
+ for conv in self.convs:
193
+ x = self.lrelu(conv(x))
194
+ fmap.append(x)
195
+
196
+ x = self.conv_post(x)
197
+ fmap.append(x)
198
+ x = torch.flatten(x, 1, -1)
199
+ return x, fmap
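
Each sub-discriminator returns both its flattened logits and the per-layer feature maps, and the multi-period wrappers return one entry per branch (DiscriminatorS plus one DiscriminatorP per period). A hypothetical shape check, assuming this repository's import layout and an arbitrary one-second clip:

import torch
from programs.applio_code.rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator

disc = MultiPeriodDiscriminator()
real = torch.randn(1, 1, 16000)                       # [batch, 1, samples]
fake = torch.randn(1, 1, 16000)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = disc(real, fake)
# 1 DiscriminatorS + 6 DiscriminatorP branches -> 7 entries per list;
# DiscriminatorS contributes 7 feature maps (6 convs + conv_post).
print(len(y_d_rs), len(fmap_rs[0]))                   # 7 7
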
programs/applio_code/rvc/lib/algorithm/encoders.py ADDED
@@ -0,0 +1,219 @@
1
+ import math
2
+ import torch
3
+ from typing import Optional
4
+
5
+ from programs.applio_code.rvc.lib.algorithm.commons import sequence_mask
6
+ from programs.applio_code.rvc.lib.algorithm.modules import WaveNet
7
+ from programs.applio_code.rvc.lib.algorithm.normalization import LayerNorm
8
+ from programs.applio_code.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
9
+
10
+
11
+ class Encoder(torch.nn.Module):
12
+ """
13
+ Encoder module for the Transformer model.
14
+
15
+ Args:
16
+ hidden_channels (int): Number of hidden channels in the encoder.
17
+ filter_channels (int): Number of filter channels in the feed-forward network.
18
+ n_heads (int): Number of attention heads.
19
+ n_layers (int): Number of encoder layers.
20
+ kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
21
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
22
+ window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ hidden_channels,
28
+ filter_channels,
29
+ n_heads,
30
+ n_layers,
31
+ kernel_size=1,
32
+ p_dropout=0.0,
33
+ window_size=10,
34
+ **kwargs
35
+ ):
36
+ super().__init__()
37
+ self.hidden_channels = hidden_channels
38
+ self.filter_channels = filter_channels
39
+ self.n_heads = n_heads
40
+ self.n_layers = n_layers
41
+ self.kernel_size = kernel_size
42
+ self.p_dropout = p_dropout
43
+ self.window_size = window_size
44
+
45
+ self.drop = torch.nn.Dropout(p_dropout)
46
+ self.attn_layers = torch.nn.ModuleList()
47
+ self.norm_layers_1 = torch.nn.ModuleList()
48
+ self.ffn_layers = torch.nn.ModuleList()
49
+ self.norm_layers_2 = torch.nn.ModuleList()
50
+ for i in range(self.n_layers):
51
+ self.attn_layers.append(
52
+ MultiHeadAttention(
53
+ hidden_channels,
54
+ hidden_channels,
55
+ n_heads,
56
+ p_dropout=p_dropout,
57
+ window_size=window_size,
58
+ )
59
+ )
60
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
61
+ self.ffn_layers.append(
62
+ FFN(
63
+ hidden_channels,
64
+ hidden_channels,
65
+ filter_channels,
66
+ kernel_size,
67
+ p_dropout=p_dropout,
68
+ )
69
+ )
70
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
71
+
72
+ def forward(self, x, x_mask):
73
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
74
+ x = x * x_mask
75
+ for i in range(self.n_layers):
76
+ y = self.attn_layers[i](x, x, attn_mask)
77
+ y = self.drop(y)
78
+ x = self.norm_layers_1[i](x + y)
79
+
80
+ y = self.ffn_layers[i](x, x_mask)
81
+ y = self.drop(y)
82
+ x = self.norm_layers_2[i](x + y)
83
+ x = x * x_mask
84
+ return x
85
+
86
+
87
+ class TextEncoder(torch.nn.Module):
88
+ """Text Encoder with configurable embedding dimension.
89
+
90
+ Args:
91
+ out_channels (int): Output channels of the encoder.
92
+ hidden_channels (int): Hidden channels of the encoder.
93
+ filter_channels (int): Filter channels of the encoder.
94
+ n_heads (int): Number of attention heads.
95
+ n_layers (int): Number of encoder layers.
96
+ kernel_size (int): Kernel size of the convolutional layers.
97
+ p_dropout (float): Dropout probability.
98
+ embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
99
+ f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
100
+ """
101
+
102
+ def __init__(
103
+ self,
104
+ out_channels,
105
+ hidden_channels,
106
+ filter_channels,
107
+ n_heads,
108
+ n_layers,
109
+ kernel_size,
110
+ p_dropout,
111
+ embedding_dim,
112
+ f0=True,
113
+ ):
114
+ super(TextEncoder, self).__init__()
115
+ self.out_channels = out_channels
116
+ self.hidden_channels = hidden_channels
117
+ self.filter_channels = filter_channels
118
+ self.n_heads = n_heads
119
+ self.n_layers = n_layers
120
+ self.kernel_size = kernel_size
121
+ self.p_dropout = float(p_dropout)
122
+ self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
123
+ self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
124
+ if f0:
125
+ self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
126
+ self.encoder = Encoder(
127
+ hidden_channels,
128
+ filter_channels,
129
+ n_heads,
130
+ n_layers,
131
+ kernel_size,
132
+ float(p_dropout),
133
+ )
134
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
135
+
136
+ def forward(
137
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
138
+ ):
139
+ if pitch is None:
140
+ x = self.emb_phone(phone)
141
+ else:
142
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
143
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
144
+ x = self.lrelu(x)
145
+ x = torch.transpose(x, 1, -1) # [b, h, t]
146
+ x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
147
+ x = self.encoder(x * x_mask, x_mask)
148
+ stats = self.proj(x) * x_mask
149
+
150
+ m, logs = torch.split(stats, self.out_channels, dim=1)
151
+ return m, logs, x_mask
152
+
153
+
154
+ class PosteriorEncoder(torch.nn.Module):
155
+ """Posterior Encoder for inferring latent representation.
156
+
157
+ Args:
158
+ in_channels (int): Number of channels in the input.
159
+ out_channels (int): Number of channels in the output.
160
+ hidden_channels (int): Number of hidden channels in the encoder.
161
+ kernel_size (int): Kernel size of the convolutional layers.
162
+ dilation_rate (int): Dilation rate of the convolutional layers.
163
+ n_layers (int): Number of layers in the encoder.
164
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
165
+ """
166
+
167
+ def __init__(
168
+ self,
169
+ in_channels,
170
+ out_channels,
171
+ hidden_channels,
172
+ kernel_size,
173
+ dilation_rate,
174
+ n_layers,
175
+ gin_channels=0,
176
+ ):
177
+ super(PosteriorEncoder, self).__init__()
178
+ self.in_channels = in_channels
179
+ self.out_channels = out_channels
180
+ self.hidden_channels = hidden_channels
181
+ self.kernel_size = kernel_size
182
+ self.dilation_rate = dilation_rate
183
+ self.n_layers = n_layers
184
+ self.gin_channels = gin_channels
185
+
186
+ self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
187
+ self.enc = WaveNet(
188
+ hidden_channels,
189
+ kernel_size,
190
+ dilation_rate,
191
+ n_layers,
192
+ gin_channels=gin_channels,
193
+ )
194
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
195
+
196
+ def forward(
197
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
198
+ ):
199
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
200
+ x = self.pre(x) * x_mask
201
+ x = self.enc(x, x_mask, g=g)
202
+ stats = self.proj(x) * x_mask
203
+ m, logs = torch.split(stats, self.out_channels, dim=1)
204
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
205
+ return z, m, logs, x_mask
206
+
207
+ def remove_weight_norm(self):
208
+ """Removes weight normalization from the encoder."""
209
+ self.enc.remove_weight_norm()
210
+
211
+ def __prepare_scriptable__(self):
212
+ """Prepares the module for scripting."""
213
+ for hook in self.enc._forward_pre_hooks.values():
214
+ if (
215
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
216
+ and hook.__class__.__name__ == "WeightNorm"
217
+ ):
218
+ torch.nn.utils.remove_weight_norm(self.enc)
219
+ return self
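
To see how the two encoders fit together: TextEncoder maps per-frame phone features (plus optional coarse F0 bins) to a prior (m, logs), while PosteriorEncoder maps linear-spectrogram frames to a sampled latent z under speaker conditioning g. The sketch below is hypothetical; the import path follows this repository's layout, the 768-dim features correspond to the v2 embedder, and the other sizes are arbitrary but valid.

import torch
from programs.applio_code.rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder

b, t = 2, 50
phone = torch.randn(b, t, 768)                        # ContentVec-style features, [b, t, emb]
pitch = torch.randint(0, 256, (b, t))                 # coarse F0 bins from get_f0
lengths = torch.tensor([50, 40])

enc_p = TextEncoder(192, 192, 768, n_heads=2, n_layers=6,
                    kernel_size=3, p_dropout=0.0, embedding_dim=768)
m, logs, x_mask = enc_p(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)              # [2, 192, 50], [2, 192, 50], [2, 1, 50]

enc_q = PosteriorEncoder(1025, 192, 192, kernel_size=5,
                         dilation_rate=1, n_layers=16, gin_channels=256)
spec = torch.randn(b, 1025, t)                        # linear spectrogram frames
g = torch.randn(b, 256, 1)                            # speaker embedding
z, m_q, logs_q, spec_mask = enc_q(spec, lengths, g=g)
print(z.shape)                                        # torch.Size([2, 192, 50])
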
programs/applio_code/rvc/lib/algorithm/generators.py ADDED
@@ -0,0 +1,199 @@
1
+ import torch
2
+ from torch.nn.utils import remove_weight_norm
3
+ from torch.nn.utils.parametrizations import weight_norm
4
+ from typing import Optional
5
+
6
+ from programs.applio_code.rvc.lib.algorithm.residuals import (
7
+ LRELU_SLOPE,
8
+ ResBlock1,
9
+ ResBlock2,
10
+ )
11
+ from programs.applio_code.rvc.lib.algorithm.commons import init_weights
12
+
13
+
14
+ class Generator(torch.nn.Module):
15
+ """Generator for synthesizing audio. Optimized for performance and quality.
16
+
17
+ Args:
18
+ initial_channel (int): Number of channels in the initial convolutional layer.
19
+ resblock (str): Type of residual block to use (1 or 2).
20
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
21
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
22
+ upsample_rates (list): Upsampling rates.
23
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
24
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
25
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ initial_channel,
31
+ resblock,
32
+ resblock_kernel_sizes,
33
+ resblock_dilation_sizes,
34
+ upsample_rates,
35
+ upsample_initial_channel,
36
+ upsample_kernel_sizes,
37
+ gin_channels=0,
38
+ ):
39
+ super(Generator, self).__init__()
40
+ self.num_kernels = len(resblock_kernel_sizes)
41
+ self.num_upsamples = len(upsample_rates)
42
+ self.conv_pre = torch.nn.Conv1d(
43
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
44
+ )
45
+ resblock = ResBlock1 if resblock == "1" else ResBlock2
46
+
47
+ self.ups_and_resblocks = torch.nn.ModuleList()
48
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
49
+ self.ups_and_resblocks.append(
50
+ weight_norm(
51
+ torch.nn.ConvTranspose1d(
52
+ upsample_initial_channel // (2**i),
53
+ upsample_initial_channel // (2 ** (i + 1)),
54
+ k,
55
+ u,
56
+ padding=(k - u) // 2,
57
+ )
58
+ )
59
+ )
60
+ ch = upsample_initial_channel // (2 ** (i + 1))
61
+ for j, (k, d) in enumerate(
62
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
63
+ ):
64
+ self.ups_and_resblocks.append(resblock(ch, k, d))
65
+
66
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
67
+ self.ups_and_resblocks.apply(init_weights)
68
+
69
+ if gin_channels != 0:
70
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
71
+
72
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
73
+ x = self.conv_pre(x)
74
+ if g is not None:
75
+ x = x + self.cond(g)
76
+
77
+ resblock_idx = 0
78
+ for _ in range(self.num_upsamples):
79
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
80
+ x = self.ups_and_resblocks[resblock_idx](x)
81
+ resblock_idx += 1
82
+ xs = 0
83
+ for _ in range(self.num_kernels):
84
+ xs += self.ups_and_resblocks[resblock_idx](x)
85
+ resblock_idx += 1
86
+ x = xs / self.num_kernels
87
+
88
+ x = torch.nn.functional.leaky_relu(x)
89
+ x = self.conv_post(x)
90
+ x = torch.tanh(x)
91
+
92
+ return x
93
+
94
+ def __prepare_scriptable__(self):
95
+ """Prepares the module for scripting."""
96
+ for l in self.ups_and_resblocks:
97
+ for hook in l._forward_pre_hooks.values():
98
+ if (
99
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
100
+ and hook.__class__.__name__ == "WeightNorm"
101
+ ):
102
+ torch.nn.utils.remove_weight_norm(l)
103
+ return self
104
+
105
+ def remove_weight_norm(self):
106
+ """Removes weight normalization from the upsampling and residual blocks."""
107
+ for l in self.ups_and_resblocks:
108
+ remove_weight_norm(l)
109
+
110
+
111
+ class SineGen(torch.nn.Module):
112
+ """Sine wave generator.
113
+
114
+ Args:
115
+ samp_rate (int): Sampling rate in Hz.
116
+ harmonic_num (int, optional): Number of harmonic overtones. Defaults to 0.
117
+ sine_amp (float, optional): Amplitude of sine waveform. Defaults to 0.1.
118
+ noise_std (float, optional): Standard deviation of Gaussian noise. Defaults to 0.003.
119
+ voiced_threshold (float, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0.
120
+ flag_for_pulse (bool, optional): Whether this SineGen is used inside PulseGen. Defaults to False.
121
+ """
122
+
123
+ def __init__(
124
+ self,
125
+ samp_rate,
126
+ harmonic_num=0,
127
+ sine_amp=0.1,
128
+ noise_std=0.003,
129
+ voiced_threshold=0,
130
+ flag_for_pulse=False,
131
+ ):
132
+ super(SineGen, self).__init__()
133
+ self.sine_amp = sine_amp
134
+ self.noise_std = noise_std
135
+ self.harmonic_num = harmonic_num
136
+ self.dim = self.harmonic_num + 1
137
+ self.sample_rate = samp_rate
138
+ self.voiced_threshold = voiced_threshold
139
+
140
+ def _f02uv(self, f0):
141
+ """Converts F0 to voiced/unvoiced signal.
142
+
143
+ Args:
144
+ f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).
145
+ """
146
+ uv = torch.ones_like(f0)
147
+ uv = uv * (f0 > self.voiced_threshold)
148
+ return uv
149
+
150
+ def forward(self, f0: torch.Tensor, upp: int):
151
+ """Generates sine waves.
152
+
153
+ Args:
154
+ f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).
155
+ upp (int): Upsampling factor.
156
+ """
157
+ with torch.no_grad():
158
+ f0 = f0[:, None].transpose(1, 2)
159
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
160
+ f0_buf[:, :, 0] = f0[:, :, 0]
161
+ f0_buf[:, :, 1:] = (
162
+ f0_buf[:, :, 0:1]
163
+ * torch.arange(2, self.harmonic_num + 2, device=f0.device)[
164
+ None, None, :
165
+ ]
166
+ )
167
+ rad_values = (f0_buf / float(self.sample_rate)) % 1
168
+ rand_ini = torch.rand(
169
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
170
+ )
171
+ rand_ini[:, 0] = 0
172
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
173
+ tmp_over_one = torch.cumsum(rad_values, 1)
174
+ tmp_over_one *= upp
175
+ tmp_over_one = torch.nn.functional.interpolate(
176
+ tmp_over_one.transpose(2, 1),
177
+ scale_factor=float(upp),
178
+ mode="linear",
179
+ align_corners=True,
180
+ ).transpose(2, 1)
181
+ rad_values = torch.nn.functional.interpolate(
182
+ rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
183
+ ).transpose(2, 1)
184
+ tmp_over_one %= 1
185
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
186
+ cumsum_shift = torch.zeros_like(rad_values)
187
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
188
+ sine_waves = torch.sin(
189
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
190
+ )
191
+ sine_waves = sine_waves * self.sine_amp
192
+ uv = self._f02uv(f0)
193
+ uv = torch.nn.functional.interpolate(
194
+ uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
195
+ ).transpose(2, 1)
196
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
197
+ noise = noise_amp * torch.randn_like(sine_waves)
198
+ sine_waves = sine_waves * uv + noise
199
+ return sine_waves, uv, noise
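
SineGen takes a frame-rate F0 contour and an upsampling factor and returns an audio-rate harmonic stack (fundamental plus harmonic_num overtones), the voiced/unvoiced mask, and the added noise. A hypothetical sketch, assuming this repository's import layout, a 48 kHz rate, and an arbitrary constant 220 Hz contour:

import torch
from programs.applio_code.rvc.lib.algorithm.generators import SineGen

sine_gen = SineGen(samp_rate=48000, harmonic_num=8)
f0 = torch.full((1, 100), 220.0)                      # 100 frames of a 220 Hz contour, [b, frames]
upp = 480                                             # samples per frame (hop length at 48 kHz)
sine_waves, uv, noise = sine_gen(f0, upp)
# Fundamental + 8 harmonics, upsampled to audio rate; uv marks voiced samples.
print(sine_waves.shape, uv.shape)                     # [1, 48000, 9] and [1, 48000, 1]
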
programs/applio_code/rvc/lib/algorithm/modules.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ from programs.applio_code.rvc.lib.algorithm.commons import (
3
+ fused_add_tanh_sigmoid_multiply_no_jit,
4
+ fused_add_tanh_sigmoid_multiply,
5
+ )
6
+
7
+
8
+ class WaveNet(torch.nn.Module):
9
+ """WaveNet residual blocks as used in WaveGlow
10
+
11
+ Args:
12
+ hidden_channels (int): Number of hidden channels.
13
+ kernel_size (int): Size of the convolutional kernel.
14
+ dilation_rate (int): Dilation rate of the convolution.
15
+ n_layers (int): Number of convolutional layers.
16
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
17
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ hidden_channels,
23
+ kernel_size,
24
+ dilation_rate,
25
+ n_layers,
26
+ gin_channels=0,
27
+ p_dropout=0,
28
+ ):
29
+ super(WaveNet, self).__init__()
30
+ assert kernel_size % 2 == 1
31
+ self.hidden_channels = hidden_channels
32
+ self.kernel_size = (kernel_size,)
33
+ self.dilation_rate = dilation_rate
34
+ self.n_layers = n_layers
35
+ self.gin_channels = gin_channels
36
+ self.p_dropout = p_dropout
37
+
38
+ self.in_layers = torch.nn.ModuleList()
39
+ self.res_skip_layers = torch.nn.ModuleList()
40
+ self.drop = torch.nn.Dropout(p_dropout)
41
+
42
+ if gin_channels != 0:
43
+ cond_layer = torch.nn.Conv1d(
44
+ gin_channels, 2 * hidden_channels * n_layers, 1
45
+ )
46
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
47
+ cond_layer, name="weight"
48
+ )
49
+
50
+ dilations = [dilation_rate**i for i in range(n_layers)]
51
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
52
+
53
+ for i in range(n_layers):
54
+ in_layer = torch.nn.Conv1d(
55
+ hidden_channels,
56
+ 2 * hidden_channels,
57
+ kernel_size,
58
+ dilation=dilations[i],
59
+ padding=paddings[i],
60
+ )
61
+ in_layer = torch.nn.utils.parametrizations.weight_norm(
62
+ in_layer, name="weight"
63
+ )
64
+ self.in_layers.append(in_layer)
65
+
66
+ res_skip_channels = (
67
+ hidden_channels if i == n_layers - 1 else 2 * hidden_channels
68
+ )
69
+
70
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
71
+ res_skip_layer = torch.nn.utils.parametrizations.weight_norm(
72
+ res_skip_layer, name="weight"
73
+ )
74
+ self.res_skip_layers.append(res_skip_layer)
75
+
76
+ def forward(self, x, x_mask, g=None, **kwargs):
77
+ """Forward pass.
78
+
79
+ Args:
80
+ x (torch.Tensor): Input tensor of shape (batch_size, hidden_channels, time_steps).
81
+ x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
82
+ g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
83
+ Defaults to None.
84
+ """
85
+ output = torch.zeros_like(x)
86
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
87
+
88
+ if g is not None:
89
+ g = self.cond_layer(g)
90
+
91
+ # Zluda
92
+ is_zluda = x.device.type == "cuda" and torch.cuda.get_device_name().endswith(
93
+ "[ZLUDA]"
94
+ )
95
+
96
+ for i in range(self.n_layers):
97
+ x_in = self.in_layers[i](x)
98
+ if g is not None:
99
+ cond_offset = i * 2 * self.hidden_channels
100
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
101
+ else:
102
+ g_l = torch.zeros_like(x_in)
103
+
104
+ # Preventing HIP crash by not using jit-decorated function
105
+ if is_zluda:
106
+ acts = fused_add_tanh_sigmoid_multiply_no_jit(
107
+ x_in, g_l, n_channels_tensor
108
+ )
109
+ else:
110
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
111
+
112
+ acts = self.drop(acts)
113
+
114
+ res_skip_acts = self.res_skip_layers[i](acts)
115
+ if i < self.n_layers - 1:
116
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
117
+ x = (x + res_acts) * x_mask
118
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
119
+ else:
120
+ output = output + res_skip_acts
121
+ return output * x_mask
122
+
123
+ def remove_weight_norm(self):
124
+ """Remove weight normalization from the module."""
125
+ if self.gin_channels != 0:
126
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
127
+ for l in self.in_layers:
128
+ torch.nn.utils.remove_weight_norm(l)
129
+ for l in self.res_skip_layers:
130
+ torch.nn.utils.remove_weight_norm(l)
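
Each WaveNet layer applies a dilated convolution, the fused tanh-sigmoid gate (optionally conditioned on g), and then splits the result into a residual path added back to x and a skip path accumulated into the output. A hypothetical smoke test, assuming this repository's import layout and arbitrary but valid channel/layer counts:

import torch
from programs.applio_code.rvc.lib.algorithm.modules import WaveNet

wn = WaveNet(hidden_channels=192, kernel_size=5, dilation_rate=1,
             n_layers=4, gin_channels=256)
x = torch.randn(2, 192, 60)                           # [batch, hidden, frames]
x_mask = torch.ones(2, 1, 60)
g = torch.randn(2, 256, 1)                            # global speaker conditioning
out = wn(x, x_mask, g=g)                              # gated residual/skip stack, masked output
print(out.shape)                                      # torch.Size([2, 192, 60])
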
programs/applio_code/rvc/lib/algorithm/normalization.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+
3
+
4
+ class LayerNorm(torch.nn.Module):
5
+ """Layer normalization module.
6
+
7
+ Args:
8
+ channels (int): Number of channels.
9
+ eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
10
+ """
11
+
12
+ def __init__(self, channels, eps=1e-5):
13
+ super().__init__()
14
+ self.eps = eps
15
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
16
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
17
+
18
+ def forward(self, x):
19
+ """Forward pass.
20
+
21
+ Args:
22
+ x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
23
+
24
+ """
25
+ # Transpose to (batch_size, time_steps, channels) for layer_norm
26
+ x = x.transpose(1, -1)
27
+ x = torch.nn.functional.layer_norm(
28
+ x, (x.size(-1),), self.gamma, self.beta, self.eps
29
+ )
30
+ # Transpose back to (batch_size, channels, time_steps)
31
+ return x.transpose(1, -1)
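
This LayerNorm normalizes over the channel dimension of a [batch, channels, time] tensor by transposing around torch.nn.functional.layer_norm, so with default initialization it should match PyTorch's built-in LayerNorm applied over channels. A hypothetical equivalence check, assuming this repository's import layout:

import torch
from programs.applio_code.rvc.lib.algorithm.normalization import LayerNorm

ln = LayerNorm(192)
x = torch.randn(2, 192, 50)                           # [batch, channels, frames]
ref = torch.nn.LayerNorm(192)                         # same eps (1e-5), weight=1, bias=0
y = ln(x)
y_ref = ref(x.transpose(1, -1)).transpose(1, -1)
print(y.shape, torch.allclose(y, y_ref, atol=1e-6))   # torch.Size([2, 192, 50]) True
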
programs/applio_code/rvc/lib/algorithm/nsf.py ADDED
@@ -0,0 +1,200 @@
1
+ import math
2
+ import torch
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+ from typing import Optional
6
+
7
+ from programs.applio_code.rvc.lib.algorithm.generators import SineGen
8
+ from programs.applio_code.rvc.lib.algorithm.residuals import (
9
+ LRELU_SLOPE,
10
+ ResBlock1,
11
+ ResBlock2,
12
+ )
13
+ from programs.applio_code.rvc.lib.algorithm.commons import init_weights
14
+
15
+
16
+ class SourceModuleHnNSF(torch.nn.Module):
17
+ """
18
+ Source Module for harmonic-plus-noise excitation.
19
+
20
+ Args:
21
+ sample_rate (int): Sampling rate in Hz.
22
+ harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
23
+ sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
24
+ add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
25
+ voiced_threshod (float, optional): F0 threshold above which a frame is treated as voiced. Defaults to 0.
26
+ is_half (bool, optional): Whether to use half precision. Defaults to True.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ sample_rate,
32
+ harmonic_num=0,
33
+ sine_amp=0.1,
34
+ add_noise_std=0.003,
35
+ voiced_threshod=0,
36
+ is_half=True,
37
+ ):
38
+ super(SourceModuleHnNSF, self).__init__()
39
+
40
+ self.sine_amp = sine_amp
41
+ self.noise_std = add_noise_std
42
+ self.is_half = is_half
43
+
44
+ self.l_sin_gen = SineGen(
45
+ sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
46
+ )
47
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
48
+ self.l_tanh = torch.nn.Tanh()
49
+
50
+ def forward(self, x: torch.Tensor, upsample_factor: int = 1):
51
+ sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
52
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
53
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
54
+ return sine_merge, None, None
55
+
56
+
57
+ class GeneratorNSF(torch.nn.Module):
58
+ """
59
+ Generator for synthesizing audio using the NSF (Neural Source Filter) approach.
60
+
61
+ Args:
62
+ initial_channel (int): Number of channels in the initial convolutional layer.
63
+ resblock (str): Type of residual block to use ("1" or "2").
64
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
65
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
66
+ upsample_rates (list): Upsampling rates.
67
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
68
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
69
+ gin_channels (int): Number of channels for the global conditioning input.
70
+ sr (int): Sampling rate.
71
+ is_half (bool, optional): Whether to use half precision. Defaults to False.
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ initial_channel,
77
+ resblock,
78
+ resblock_kernel_sizes,
79
+ resblock_dilation_sizes,
80
+ upsample_rates,
81
+ upsample_initial_channel,
82
+ upsample_kernel_sizes,
83
+ gin_channels,
84
+ sr,
85
+ is_half=False,
86
+ ):
87
+ super(GeneratorNSF, self).__init__()
88
+
89
+ self.num_kernels = len(resblock_kernel_sizes)
90
+ self.num_upsamples = len(upsample_rates)
91
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
92
+ self.m_source = SourceModuleHnNSF(
93
+ sample_rate=sr, harmonic_num=0, is_half=is_half
94
+ )
95
+
96
+ self.conv_pre = torch.nn.Conv1d(
97
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
98
+ )
99
+ resblock_cls = ResBlock1 if resblock == "1" else ResBlock2
100
+
101
+ self.ups = torch.nn.ModuleList()
102
+ self.noise_convs = torch.nn.ModuleList()
103
+
104
+ channels = [
105
+ upsample_initial_channel // (2 ** (i + 1))
106
+ for i in range(len(upsample_rates))
107
+ ]
108
+ stride_f0s = [
109
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
110
+ for i in range(len(upsample_rates))
111
+ ]
112
+
113
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
114
+ self.ups.append(
115
+ weight_norm(
116
+ torch.nn.ConvTranspose1d(
117
+ upsample_initial_channel // (2**i),
118
+ channels[i],
119
+ k,
120
+ u,
121
+ padding=(k - u) // 2,
122
+ )
123
+ )
124
+ )
125
+
126
+ self.noise_convs.append(
127
+ torch.nn.Conv1d(
128
+ 1,
129
+ channels[i],
130
+ kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1),
131
+ stride=stride_f0s[i],
132
+ padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0),
133
+ )
134
+ )
135
+
136
+ self.resblocks = torch.nn.ModuleList(
137
+ [
138
+ resblock_cls(channels[i], k, d)
139
+ for i in range(len(self.ups))
140
+ for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
141
+ ]
142
+ )
143
+
144
+ self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
145
+ self.ups.apply(init_weights)
146
+
147
+ if gin_channels != 0:
148
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
149
+
150
+ self.upp = math.prod(upsample_rates)
151
+ self.lrelu_slope = LRELU_SLOPE
152
+
153
+ def forward(self, x, f0, g: Optional[torch.Tensor] = None):
154
+ har_source, _, _ = self.m_source(f0, self.upp)
155
+ har_source = har_source.transpose(1, 2)
156
+ x = self.conv_pre(x)
157
+
158
+ if g is not None:
159
+ x = x + self.cond(g)
160
+
161
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
162
+ x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
163
+ x = ups(x)
164
+ x = x + noise_convs(har_source)
165
+
166
+ xs = sum(
167
+ [
168
+ resblock(x)
169
+ for j, resblock in enumerate(self.resblocks)
170
+ if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
171
+ ]
172
+ )
173
+ x = xs / self.num_kernels
174
+
175
+ x = torch.nn.functional.leaky_relu(x)
176
+ x = torch.tanh(self.conv_post(x))
177
+ return x
178
+
179
+ def remove_weight_norm(self):
180
+ for l in self.ups:
181
+ remove_weight_norm(l)
182
+ for l in self.resblocks:
183
+ l.remove_weight_norm()
184
+
185
+ def __prepare_scriptable__(self):
186
+ for l in self.ups:
187
+ for hook in l._forward_pre_hooks.values():
188
+ if (
189
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
190
+ and hook.__class__.__name__ == "WeightNorm"
191
+ ):
192
+ remove_weight_norm(l)
193
+ for l in self.resblocks:
194
+ for hook in l._forward_pre_hooks.values():
195
+ if (
196
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
197
+ and hook.__class__.__name__ == "WeightNorm"
198
+ ):
199
+ remove_weight_norm(l)
200
+ return self
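Note: a small sketch of the stride arithmetic used for noise_convs above, showing how the audio-rate harmonic source is aligned with each upsampling stage; the upsample rates here are illustrative, not taken from the shipped config files.

import math

upsample_rates = [10, 10, 2, 2]                 # illustrative values
upp = math.prod(upsample_rates)                 # audio samples generated per input frame (400)
stride_f0s = [
    math.prod(upsample_rates[i + 1:]) if i + 1 < len(upsample_rates) else 1
    for i in range(len(upsample_rates))
]
print(upp, stride_f0s)                          # 400 [40, 4, 2, 1]
# After stage i the features run at upp / stride_f0s[i] samples per frame, so striding
# har_source by stride_f0s[i] in noise_convs[i] brings it to the same temporal resolution.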
programs/applio_code/rvc/lib/algorithm/residuals.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import torch
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+
6
+ from programs.applio_code.rvc.lib.algorithm.modules import WaveNet
7
+ from programs.applio_code.rvc.lib.algorithm.commons import get_padding, init_weights
8
+
9
+ LRELU_SLOPE = 0.1
10
+
11
+
12
+ # Helper functions
13
+ def create_conv1d_layer(channels, kernel_size, dilation):
14
+ return weight_norm(
15
+ torch.nn.Conv1d(
16
+ channels,
17
+ channels,
18
+ kernel_size,
19
+ 1,
20
+ dilation=dilation,
21
+ padding=get_padding(kernel_size, dilation),
22
+ )
23
+ )
24
+
25
+
26
+ def apply_mask(tensor, mask):
27
+ return tensor * mask if mask is not None else tensor
28
+
29
+
30
+ class ResBlockBase(torch.nn.Module):
31
+ def __init__(self, channels, kernel_size, dilations):
32
+ super(ResBlockBase, self).__init__()
33
+ self.convs1 = torch.nn.ModuleList(
34
+ [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
35
+ )
36
+ self.convs1.apply(init_weights)
37
+
38
+ self.convs2 = torch.nn.ModuleList(
39
+ [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]
40
+ )
41
+ self.convs2.apply(init_weights)
42
+
43
+ def forward(self, x, x_mask=None):
44
+ for c1, c2 in zip(self.convs1, self.convs2):
45
+ xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
46
+ xt = apply_mask(xt, x_mask)
47
+ xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE)
48
+ xt = apply_mask(xt, x_mask)
49
+ xt = c2(xt)
50
+ x = xt + x
51
+ return apply_mask(x, x_mask)
52
+
53
+ def remove_weight_norm(self):
54
+ for conv in self.convs1 + self.convs2:
55
+ remove_weight_norm(conv)
56
+
57
+
58
+ class ResBlock1(ResBlockBase):
59
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
60
+ super(ResBlock1, self).__init__(channels, kernel_size, dilation)
61
+
62
+
63
+ class ResBlock2(ResBlockBase):
64
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
65
+ super(ResBlock2, self).__init__(channels, kernel_size, dilation)
66
+
67
+
68
+ class Log(torch.nn.Module):
69
+ """Logarithm module for flow-based models.
70
+
71
+ This module computes the logarithm of the input and its log determinant.
72
+ During reverse, it computes the exponential of the input.
73
+ """
74
+
75
+ def forward(self, x, x_mask, reverse=False, **kwargs):
76
+ """Forward pass.
77
+
78
+ Args:
79
+ x (torch.Tensor): Input tensor.
80
+ x_mask (torch.Tensor): Mask tensor.
81
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
82
+ """
83
+ if not reverse:
84
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
85
+ logdet = torch.sum(-y, [1, 2])
86
+ return y, logdet
87
+ else:
88
+ x = torch.exp(x) * x_mask
89
+ return x
90
+
91
+
92
+ class Flip(torch.nn.Module):
93
+ """Flip module for flow-based models.
94
+
95
+ This module flips the input along the channel dimension (dim 1).
96
+ """
97
+
98
+ def forward(self, x, *args, reverse=False, **kwargs):
99
+ """Forward pass.
100
+
101
+ Args:
102
+ x (torch.Tensor): Input tensor.
103
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
104
+ """
105
+ x = torch.flip(x, [1])
106
+ if not reverse:
107
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
108
+ return x, logdet
109
+ else:
110
+ return x
111
+
112
+
113
+ class ElementwiseAffine(torch.nn.Module):
114
+ """Elementwise affine transformation module for flow-based models.
115
+
116
+ This module performs an elementwise affine transformation on the input.
117
+
118
+ Args:
119
+ channels (int): Number of channels.
120
+
121
+ """
122
+
123
+ def __init__(self, channels):
124
+ super().__init__()
125
+ self.channels = channels
126
+ self.m = torch.nn.Parameter(torch.zeros(channels, 1))
127
+ self.logs = torch.nn.Parameter(torch.zeros(channels, 1))
128
+
129
+ def forward(self, x, x_mask, reverse=False, **kwargs):
130
+ """Forward pass.
131
+
132
+ Args:
133
+ x (torch.Tensor): Input tensor.
134
+ x_mask (torch.Tensor): Mask tensor.
135
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
136
+ """
137
+ if not reverse:
138
+ y = self.m + torch.exp(self.logs) * x
139
+ y = y * x_mask
140
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
141
+ return y, logdet
142
+ else:
143
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
144
+ return x
145
+
146
+
147
+ class ResidualCouplingBlock(torch.nn.Module):
148
+ """Residual Coupling Block for normalizing flow.
149
+
150
+ Args:
151
+ channels (int): Number of channels in the input.
152
+ hidden_channels (int): Number of hidden channels in the coupling layer.
153
+ kernel_size (int): Kernel size of the convolutional layers.
154
+ dilation_rate (int): Dilation rate of the convolutional layers.
155
+ n_layers (int): Number of layers in the coupling layer.
156
+ n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
157
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
158
+ """
159
+
160
+ def __init__(
161
+ self,
162
+ channels,
163
+ hidden_channels,
164
+ kernel_size,
165
+ dilation_rate,
166
+ n_layers,
167
+ n_flows=4,
168
+ gin_channels=0,
169
+ ):
170
+ super(ResidualCouplingBlock, self).__init__()
171
+ self.channels = channels
172
+ self.hidden_channels = hidden_channels
173
+ self.kernel_size = kernel_size
174
+ self.dilation_rate = dilation_rate
175
+ self.n_layers = n_layers
176
+ self.n_flows = n_flows
177
+ self.gin_channels = gin_channels
178
+
179
+ self.flows = torch.nn.ModuleList()
180
+ for i in range(n_flows):
181
+ self.flows.append(
182
+ ResidualCouplingLayer(
183
+ channels,
184
+ hidden_channels,
185
+ kernel_size,
186
+ dilation_rate,
187
+ n_layers,
188
+ gin_channels=gin_channels,
189
+ mean_only=True,
190
+ )
191
+ )
192
+ self.flows.append(Flip())
193
+
194
+ def forward(
195
+ self,
196
+ x: torch.Tensor,
197
+ x_mask: torch.Tensor,
198
+ g: Optional[torch.Tensor] = None,
199
+ reverse: bool = False,
200
+ ):
201
+ if not reverse:
202
+ for flow in self.flows:
203
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
204
+ else:
205
+ for flow in reversed(self.flows):
206
+ x = flow.forward(x, x_mask, g=g, reverse=reverse)
207
+ return x
208
+
209
+ def remove_weight_norm(self):
210
+ """Removes weight normalization from the coupling layers."""
211
+ for i in range(self.n_flows):
212
+ self.flows[i * 2].remove_weight_norm()
213
+
214
+ def __prepare_scriptable__(self):
215
+ """Prepares the module for scripting."""
216
+ for i in range(self.n_flows):
217
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
218
+ if (
219
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
220
+ and hook.__class__.__name__ == "WeightNorm"
221
+ ):
222
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
223
+
224
+ return self
225
+
226
+
227
+ class ResidualCouplingLayer(torch.nn.Module):
228
+ """Residual coupling layer for flow-based models.
229
+
230
+ Args:
231
+ channels (int): Number of channels.
232
+ hidden_channels (int): Number of hidden channels.
233
+ kernel_size (int): Size of the convolutional kernel.
234
+ dilation_rate (int): Dilation rate of the convolution.
235
+ n_layers (int): Number of convolutional layers.
236
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
237
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
238
+ mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
239
+ """
240
+
241
+ def __init__(
242
+ self,
243
+ channels,
244
+ hidden_channels,
245
+ kernel_size,
246
+ dilation_rate,
247
+ n_layers,
248
+ p_dropout=0,
249
+ gin_channels=0,
250
+ mean_only=False,
251
+ ):
252
+ assert channels % 2 == 0, "channels should be divisible by 2"
253
+ super().__init__()
254
+ self.channels = channels
255
+ self.hidden_channels = hidden_channels
256
+ self.kernel_size = kernel_size
257
+ self.dilation_rate = dilation_rate
258
+ self.n_layers = n_layers
259
+ self.half_channels = channels // 2
260
+ self.mean_only = mean_only
261
+
262
+ self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
263
+ self.enc = WaveNet(
264
+ hidden_channels,
265
+ kernel_size,
266
+ dilation_rate,
267
+ n_layers,
268
+ p_dropout=p_dropout,
269
+ gin_channels=gin_channels,
270
+ )
271
+ self.post = torch.nn.Conv1d(
272
+ hidden_channels, self.half_channels * (2 - mean_only), 1
273
+ )
274
+ self.post.weight.data.zero_()
275
+ self.post.bias.data.zero_()
276
+
277
+ def forward(self, x, x_mask, g=None, reverse=False):
278
+ """Forward pass.
279
+
280
+ Args:
281
+ x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
282
+ x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
283
+ g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
284
+ Defaults to None.
285
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
286
+ """
287
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
288
+ h = self.pre(x0) * x_mask
289
+ h = self.enc(h, x_mask, g=g)
290
+ stats = self.post(h) * x_mask
291
+ if not self.mean_only:
292
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
293
+ else:
294
+ m = stats
295
+ logs = torch.zeros_like(m)
296
+
297
+ if not reverse:
298
+ x1 = m + x1 * torch.exp(logs) * x_mask
299
+ x = torch.cat([x0, x1], 1)
300
+ logdet = torch.sum(logs, [1, 2])
301
+ return x, logdet
302
+ else:
303
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
304
+ x = torch.cat([x0, x1], 1)
305
+ return x
306
+
307
+ def remove_weight_norm(self):
308
+ """Remove weight normalization from the module."""
309
+ self.enc.remove_weight_norm()
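Note: a minimal invertibility check for ResidualCouplingLayer above; the reverse pass should reconstruct the input up to floating-point error. The layer sizes are illustrative (channels must be even).

import torch
from programs.applio_code.rvc.lib.algorithm.residuals import ResidualCouplingLayer

layer = ResidualCouplingLayer(4, 8, 5, 1, 2, mean_only=True).eval()
x = torch.randn(1, 4, 16)               # (batch, channels, time)
mask = torch.ones(1, 1, 16)
y, logdet = layer(x, mask)              # forward direction: (output, log-determinant)
x_rec = layer(y, mask, reverse=True)    # reverse direction undoes the affine coupling
print(torch.allclose(x, x_rec, atol=1e-5))   # True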
programs/applio_code/rvc/lib/algorithm/synthesizers.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Optional
3
+
4
+ from programs.applio_code.rvc.lib.algorithm.nsf import GeneratorNSF
5
+ from programs.applio_code.rvc.lib.algorithm.generators import Generator
6
+ from programs.applio_code.rvc.lib.algorithm.commons import (
7
+ slice_segments,
8
+ rand_slice_segments,
9
+ )
10
+ from programs.applio_code.rvc.lib.algorithm.residuals import ResidualCouplingBlock
11
+ from programs.applio_code.rvc.lib.algorithm.encoders import (
12
+ TextEncoder,
13
+ PosteriorEncoder,
14
+ )
15
+
16
+
17
+ class Synthesizer(torch.nn.Module):
18
+ """
19
+ Base Synthesizer model.
20
+
21
+ Args:
22
+ spec_channels (int): Number of channels in the spectrogram.
23
+ segment_size (int): Size of the audio segment.
24
+ inter_channels (int): Number of channels in the intermediate layers.
25
+ hidden_channels (int): Number of channels in the hidden layers.
26
+ filter_channels (int): Number of channels in the filter layers.
27
+ n_heads (int): Number of attention heads.
28
+ n_layers (int): Number of layers in the encoder.
29
+ kernel_size (int): Size of the convolution kernel.
30
+ p_dropout (float): Dropout probability.
31
+ resblock (str): Type of residual block ("1" or "2").
32
+ resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
33
+ resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
34
+ upsample_rates (list): Upsampling rates for the decoder.
35
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
36
+ upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
37
+ spk_embed_dim (int): Dimension of the speaker embedding.
38
+ gin_channels (int): Number of channels in the global conditioning vector.
39
+ sr (int): Sampling rate of the audio.
40
+ use_f0 (bool): Whether to use F0 information.
41
+ text_enc_hidden_dim (int): Hidden dimension for the text encoder.
42
+ kwargs: Additional keyword arguments.
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ spec_channels,
48
+ segment_size,
49
+ inter_channels,
50
+ hidden_channels,
51
+ filter_channels,
52
+ n_heads,
53
+ n_layers,
54
+ kernel_size,
55
+ p_dropout,
56
+ resblock,
57
+ resblock_kernel_sizes,
58
+ resblock_dilation_sizes,
59
+ upsample_rates,
60
+ upsample_initial_channel,
61
+ upsample_kernel_sizes,
62
+ spk_embed_dim,
63
+ gin_channels,
64
+ sr,
65
+ use_f0,
66
+ text_enc_hidden_dim=768,
67
+ **kwargs
68
+ ):
69
+ super(Synthesizer, self).__init__()
70
+ self.spec_channels = spec_channels
71
+ self.inter_channels = inter_channels
72
+ self.hidden_channels = hidden_channels
73
+ self.filter_channels = filter_channels
74
+ self.n_heads = n_heads
75
+ self.n_layers = n_layers
76
+ self.kernel_size = kernel_size
77
+ self.p_dropout = float(p_dropout)
78
+ self.resblock = resblock
79
+ self.resblock_kernel_sizes = resblock_kernel_sizes
80
+ self.resblock_dilation_sizes = resblock_dilation_sizes
81
+ self.upsample_rates = upsample_rates
82
+ self.upsample_initial_channel = upsample_initial_channel
83
+ self.upsample_kernel_sizes = upsample_kernel_sizes
84
+ self.segment_size = segment_size
85
+ self.gin_channels = gin_channels
86
+ self.spk_embed_dim = spk_embed_dim
87
+ self.use_f0 = use_f0
88
+
89
+ self.enc_p = TextEncoder(
90
+ inter_channels,
91
+ hidden_channels,
92
+ filter_channels,
93
+ n_heads,
94
+ n_layers,
95
+ kernel_size,
96
+ float(p_dropout),
97
+ text_enc_hidden_dim,
98
+ f0=use_f0,
99
+ )
100
+
101
+ if use_f0:
102
+ self.dec = GeneratorNSF(
103
+ inter_channels,
104
+ resblock,
105
+ resblock_kernel_sizes,
106
+ resblock_dilation_sizes,
107
+ upsample_rates,
108
+ upsample_initial_channel,
109
+ upsample_kernel_sizes,
110
+ gin_channels=gin_channels,
111
+ sr=sr,
112
+ is_half=kwargs["is_half"],
113
+ )
114
+ else:
115
+ self.dec = Generator(
116
+ inter_channels,
117
+ resblock,
118
+ resblock_kernel_sizes,
119
+ resblock_dilation_sizes,
120
+ upsample_rates,
121
+ upsample_initial_channel,
122
+ upsample_kernel_sizes,
123
+ gin_channels=gin_channels,
124
+ )
125
+
126
+ self.enc_q = PosteriorEncoder(
127
+ spec_channels,
128
+ inter_channels,
129
+ hidden_channels,
130
+ 5,
131
+ 1,
132
+ 16,
133
+ gin_channels=gin_channels,
134
+ )
135
+ self.flow = ResidualCouplingBlock(
136
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
137
+ )
138
+ self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)
139
+
140
+ def remove_weight_norm(self):
141
+ """Removes weight normalization from the model."""
142
+ self.dec.remove_weight_norm()
143
+ self.flow.remove_weight_norm()
144
+ self.enc_q.remove_weight_norm()
145
+
146
+ def __prepare_scriptable__(self):
147
+ for hook in self.dec._forward_pre_hooks.values():
148
+ if (
149
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
150
+ and hook.__class__.__name__ == "WeightNorm"
151
+ ):
152
+ torch.nn.utils.remove_weight_norm(self.dec)
153
+ for hook in self.flow._forward_pre_hooks.values():
154
+ if (
155
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
156
+ and hook.__class__.__name__ == "WeightNorm"
157
+ ):
158
+ torch.nn.utils.remove_weight_norm(self.flow)
159
+ if hasattr(self, "enc_q"):
160
+ for hook in self.enc_q._forward_pre_hooks.values():
161
+ if (
162
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
163
+ and hook.__class__.__name__ == "WeightNorm"
164
+ ):
165
+ torch.nn.utils.remove_weight_norm(self.enc_q)
166
+ return self
167
+
168
+ @torch.jit.ignore
169
+ def forward(
170
+ self,
171
+ phone: torch.Tensor,
172
+ phone_lengths: torch.Tensor,
173
+ pitch: Optional[torch.Tensor] = None,
174
+ pitchf: Optional[torch.Tensor] = None,
175
+ y: torch.Tensor = None,
176
+ y_lengths: torch.Tensor = None,
177
+ ds: Optional[torch.Tensor] = None,
178
+ ):
179
+ """
180
+ Forward pass of the model.
181
+
182
+ Args:
183
+ phone (torch.Tensor): Phoneme sequence.
184
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
185
+ pitch (torch.Tensor, optional): Pitch sequence.
186
+ pitchf (torch.Tensor, optional): Fine-grained pitch sequence.
187
+ y (torch.Tensor, optional): Target spectrogram.
188
+ y_lengths (torch.Tensor, optional): Lengths of the target spectrograms.
189
+ ds (torch.Tensor, optional): Speaker embedding. Defaults to None.
190
+ """
191
+ g = self.emb_g(ds).unsqueeze(-1)
192
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
193
+ if y is not None:
194
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
195
+ z_p = self.flow(z, y_mask, g=g)
196
+ z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
197
+ if self.use_f0:
198
+ pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
199
+ o = self.dec(z_slice, pitchf, g=g)
200
+ else:
201
+ o = self.dec(z_slice, g=g)
202
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
203
+ else:
204
+ return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)
205
+
206
+ @torch.jit.export
207
+ def infer(
208
+ self,
209
+ phone: torch.Tensor,
210
+ phone_lengths: torch.Tensor,
211
+ pitch: Optional[torch.Tensor] = None,
212
+ nsff0: Optional[torch.Tensor] = None,
213
+ sid: torch.Tensor = None,
214
+ rate: Optional[torch.Tensor] = None,
215
+ ):
216
+ """
217
+ Inference of the model.
218
+
219
+ Args:
220
+ phone (torch.Tensor): Phoneme sequence.
221
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
222
+ pitch (torch.Tensor, optional): Pitch sequence.
223
+ nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
224
+ sid (torch.Tensor): Speaker embedding.
225
+ rate (torch.Tensor, optional): Fraction of trailing frames to decode; the leading (1 - rate) portion is skipped. Defaults to None.
226
+ """
227
+ g = self.emb_g(sid).unsqueeze(-1)
228
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
229
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
230
+ if rate is not None:
231
+ assert isinstance(rate, torch.Tensor)
232
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
233
+ z_p = z_p[:, :, head:]
234
+ x_mask = x_mask[:, :, head:]
235
+ if self.use_f0:
236
+ nsff0 = nsff0[:, head:]
237
+ if self.use_f0:
238
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
239
+ o = self.dec(z * x_mask, nsff0, g=g)
240
+ else:
241
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
242
+ o = self.dec(z * x_mask, g=g)
243
+ return o, x_mask, (z, z_p, m_p, logs_p)
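Note: a short illustration of the optional rate argument handled in Synthesizer.infer above; it drops the leading frames and decodes only the trailing fraction of the utterance. The tensor sizes are illustrative.

import torch

z_p = torch.randn(1, 192, 100)          # (batch, inter_channels, frames)
rate = torch.tensor(0.25)
head = int(z_p.shape[2] * (1.0 - rate.item()))
print(head, z_p[:, :, head:].shape)     # 75 torch.Size([1, 192, 25])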
programs/applio_code/rvc/lib/predictors/F0Extractor.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import pathlib
3
+ import libf0
4
+ import librosa
5
+ import numpy as np
6
+ import resampy
7
+ import torch
8
+ import torchcrepe
9
+ import torchfcpe
10
+ import os
11
+
12
+ # from tools.anyf0.rmvpe import RMVPE
13
+ from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor
14
+ from programs.applio_code.rvc.configs.config import Config
15
+
16
+ config = Config()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class F0Extractor:
21
+ wav_path: pathlib.Path
22
+ sample_rate: int = 44100
23
+ hop_length: int = 512
24
+ f0_min: int = 50
25
+ f0_max: int = 1600
26
+ method: str = "rmvpe"
27
+ x: np.ndarray = dataclasses.field(init=False)
28
+
29
+ def __post_init__(self):
30
+ self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
31
+
32
+ @property
33
+ def hop_size(self) -> float:
34
+ return self.hop_length / self.sample_rate
35
+
36
+ @property
37
+ def wav16k(self) -> np.ndarray:
38
+ return resampy.resample(self.x, self.sample_rate, 16000)
39
+
40
+ def extract_f0(self) -> np.ndarray:
41
+ f0 = None
42
+ method = self.method
43
+ # Fall back to CPU for ZLUDA, as these methods rely on cuFFT
44
+ device = (
45
+ "cpu"
46
+ if "cuda" in config.device
47
+ and torch.cuda.get_device_name().endswith("[ZLUDA]")
48
+ else config.device
49
+ )
50
+
51
+ if method == "crepe":
52
+ wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device)
53
+ f0 = torchcrepe.predict(
54
+ wav16k_torch,
55
+ sample_rate=16000,
56
+ hop_length=160,
57
+ batch_size=512,
58
+ fmin=self.f0_min,
59
+ fmax=self.f0_max,
60
+ device=device,
61
+ )
62
+ f0 = f0[0].cpu().numpy()
63
+ elif method == "fcpe":
64
+ audio = librosa.to_mono(self.x)
65
+ audio_length = len(audio)
66
+ f0_target_length = (audio_length // self.hop_length) + 1
67
+ audio = (
68
+ torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device)
69
+ )
70
+ model = torchfcpe.spawn_bundled_infer_model(device=device)
71
+
72
+ f0 = model.infer(
73
+ audio,
74
+ sr=self.sample_rate,
75
+ decoder_mode="local_argmax",
76
+ threshold=0.006,
77
+ f0_min=self.f0_min,
78
+ f0_max=self.f0_max,
79
+ interp_uv=False,
80
+ output_interp_target_length=f0_target_length,
81
+ )
82
+ f0 = f0.squeeze().cpu().numpy()
83
+ elif method == "rmvpe":
84
+ is_half = False if device == "cpu" else config.is_half
85
+ model_rmvpe = RMVPE0Predictor(
86
+ os.path.join(
87
+ "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt"
88
+ ),
89
+ is_half=is_half,
90
+ device=device,
91
+ # hop_length=80
92
+ )
93
+ f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
94
+
95
+ else:
96
+ raise ValueError(f"Unknown method: {self.method}")
97
+ return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
98
+
99
+ def plot_f0(self, f0):
100
+ from matplotlib import pyplot as plt
101
+
102
+ plt.figure(figsize=(10, 4))
103
+ plt.plot(f0)
104
+ plt.title(self.method)
105
+ plt.xlabel("Time (frames)")
106
+ plt.ylabel("F0 (cents)")
107
+ plt.show()
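Note: a usage sketch for F0Extractor; "vocals.wav" is a placeholder path, and method may be any of the branches handled above ("crepe", "fcpe", "rmvpe").

from programs.applio_code.rvc.lib.predictors.F0Extractor import F0Extractor

extractor = F0Extractor("vocals.wav", sample_rate=44100, hop_length=512, method="rmvpe")
f0_cents = extractor.extract_f0()   # F0 contour as a numpy array, expressed in cents
extractor.plot_f0(f0_cents)         # optional matplotlib visualization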
programs/applio_code/rvc/lib/predictors/FCPE.py ADDED
@@ -0,0 +1,920 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torchaudio.transforms import Resample
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import torch.utils.data
13
+ from librosa.filters import mel as librosa_mel_fn
14
+ import math
15
+ from functools import partial
16
+
17
+ from einops import rearrange, repeat
18
+ from local_attention import LocalAttention
19
+ from torch import nn
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+
24
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
25
+ """Loads wav file to torch tensor."""
26
+ try:
27
+ data, sample_rate = sf.read(full_path, always_2d=True)
28
+ except Exception as error:
29
+ print(f"An error occurred loading {full_path}: {error}")
30
+ if return_empty_on_exception:
31
+ return [], sample_rate or target_sr or 48000
32
+ else:
33
+ raise
34
+
35
+ data = data[:, 0] if len(data.shape) > 1 else data
36
+ assert len(data) > 2
37
+
38
+ # Normalize data
39
+ max_mag = (
40
+ -np.iinfo(data.dtype).min
41
+ if np.issubdtype(data.dtype, np.integer)
42
+ else max(np.amax(data), -np.amin(data))
43
+ )
44
+ max_mag = (
45
+ (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
46
+ )
47
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
48
+
49
+ # Handle exceptions and resample
50
+ if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
51
+ return [], sample_rate or target_sr or 48000
52
+ if target_sr is not None and sample_rate != target_sr:
53
+ data = torch.from_numpy(
54
+ librosa.core.resample(
55
+ data.numpy(), orig_sr=sample_rate, target_sr=target_sr
56
+ )
57
+ )
58
+ sample_rate = target_sr
59
+
60
+ return data, sample_rate
61
+
62
+
63
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
64
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
65
+
66
+
67
+ def dynamic_range_decompression(x, C=1):
68
+ return np.exp(x) / C
69
+
70
+
71
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
72
+ return torch.log(torch.clamp(x, min=clip_val) * C)
73
+
74
+
75
+ def dynamic_range_decompression_torch(x, C=1):
76
+ return torch.exp(x) / C
77
+
78
+
79
+ class STFT:
80
+ def __init__(
81
+ self,
82
+ sr=22050,
83
+ n_mels=80,
84
+ n_fft=1024,
85
+ win_size=1024,
86
+ hop_length=256,
87
+ fmin=20,
88
+ fmax=11025,
89
+ clip_val=1e-5,
90
+ ):
91
+ self.target_sr = sr
92
+ self.n_mels = n_mels
93
+ self.n_fft = n_fft
94
+ self.win_size = win_size
95
+ self.hop_length = hop_length
96
+ self.fmin = fmin
97
+ self.fmax = fmax
98
+ self.clip_val = clip_val
99
+ self.mel_basis = {}
100
+ self.hann_window = {}
101
+
102
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
103
+ sample_rate = self.target_sr
104
+ n_mels = self.n_mels
105
+ n_fft = self.n_fft
106
+ win_size = self.win_size
107
+ hop_length = self.hop_length
108
+ fmin = self.fmin
109
+ fmax = self.fmax
110
+ clip_val = self.clip_val
111
+
112
+ factor = 2 ** (keyshift / 12)
113
+ n_fft_new = int(np.round(n_fft * factor))
114
+ win_size_new = int(np.round(win_size * factor))
115
+ hop_length_new = int(np.round(hop_length * speed))
116
+
117
+ # Optimize mel_basis and hann_window caching
118
+ mel_basis = self.mel_basis if not train else {}
119
+ hann_window = self.hann_window if not train else {}
120
+
121
+ mel_basis_key = str(fmax) + "_" + str(y.device)
122
+ if mel_basis_key not in mel_basis:
123
+ mel = librosa_mel_fn(
124
+ sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
125
+ )
126
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
127
+
128
+ keyshift_key = str(keyshift) + "_" + str(y.device)
129
+ if keyshift_key not in hann_window:
130
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
131
+
132
+ # Padding and STFT
133
+ pad_left = (win_size_new - hop_length_new) // 2
134
+ pad_right = max(
135
+ (win_size_new - hop_length_new + 1) // 2,
136
+ win_size_new - y.size(-1) - pad_left,
137
+ )
138
+ mode = "reflect" if pad_right < y.size(-1) else "constant"
139
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
140
+ y = y.squeeze(1)
141
+
142
+ spec = torch.stft(
143
+ y,
144
+ n_fft_new,
145
+ hop_length=hop_length_new,
146
+ win_length=win_size_new,
147
+ window=hann_window[keyshift_key],
148
+ center=center,
149
+ pad_mode="reflect",
150
+ normalized=False,
151
+ onesided=True,
152
+ return_complex=True,
153
+ )
154
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
155
+
156
+ # Handle keyshift and mel conversion
157
+ if keyshift != 0:
158
+ size = n_fft // 2 + 1
159
+ resize = spec.size(1)
160
+ spec = (
161
+ F.pad(spec, (0, 0, 0, size - resize))
162
+ if resize < size
163
+ else spec[:, :size, :]
164
+ )
165
+ spec = spec * win_size / win_size_new
166
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
167
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
168
+ return spec
169
+
170
+ def __call__(self, audiopath):
171
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
172
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
173
+ return spect
174
+
175
+
176
+ stft = STFT()
177
+
178
+
179
+ def softmax_kernel(
180
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
181
+ ):
182
+ b, h, *_ = data.shape
183
+
184
+ # Normalize data
185
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
186
+
187
+ # Project data
188
+ ratio = projection_matrix.shape[0] ** -0.5
189
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
190
+ projection = projection.type_as(data)
191
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
192
+
193
+ # Calculate diagonal data
194
+ diag_data = data**2
195
+ diag_data = torch.sum(diag_data, dim=-1)
196
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
197
+ diag_data = diag_data.unsqueeze(dim=-1)
198
+
199
+ # Apply softmax
200
+ if is_query:
201
+ data_dash = ratio * (
202
+ torch.exp(
203
+ data_dash
204
+ - diag_data
205
+ - torch.max(data_dash, dim=-1, keepdim=True).values
206
+ )
207
+ + eps
208
+ )
209
+ else:
210
+ data_dash = ratio * (torch.exp(data_dash - diag_data + eps))
211
+
212
+ return data_dash.type_as(data)
213
+
214
+
215
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
216
+ unstructured_block = torch.randn((cols, cols), device=device)
217
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
218
+ q, r = map(lambda t: t.to(device), (q, r))
219
+
220
+ if qr_uniform_q:
221
+ d = torch.diag(r, 0)
222
+ q *= d.sign()
223
+ return q.t()
224
+
225
+
226
+ def exists(val):
227
+ return val is not None
228
+
229
+
230
+ def empty(tensor):
231
+ return tensor.numel() == 0
232
+
233
+
234
+ def default(val, d):
235
+ return val if exists(val) else d
236
+
237
+
238
+ def cast_tuple(val):
239
+ return (val,) if not isinstance(val, tuple) else val
240
+
241
+
242
+ class PCmer(nn.Module):
243
+ def __init__(
244
+ self,
245
+ num_layers,
246
+ num_heads,
247
+ dim_model,
248
+ dim_keys,
249
+ dim_values,
250
+ residual_dropout,
251
+ attention_dropout,
252
+ ):
253
+ super().__init__()
254
+ self.num_layers = num_layers
255
+ self.num_heads = num_heads
256
+ self.dim_model = dim_model
257
+ self.dim_values = dim_values
258
+ self.dim_keys = dim_keys
259
+ self.residual_dropout = residual_dropout
260
+ self.attention_dropout = attention_dropout
261
+
262
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
263
+
264
+ def forward(self, phone, mask=None):
265
+ for layer in self._layers:
266
+ phone = layer(phone, mask)
267
+ return phone
268
+
269
+
270
+ class _EncoderLayer(nn.Module):
271
+ def __init__(self, parent: PCmer):
272
+ super().__init__()
273
+ self.conformer = ConformerConvModule(parent.dim_model)
274
+ self.norm = nn.LayerNorm(parent.dim_model)
275
+ self.dropout = nn.Dropout(parent.residual_dropout)
276
+ self.attn = SelfAttention(
277
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
278
+ )
279
+
280
+ def forward(self, phone, mask=None):
281
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
282
+ phone = phone + (self.conformer(phone))
283
+ return phone
284
+
285
+
286
+ def calc_same_padding(kernel_size):
287
+ pad = kernel_size // 2
288
+ return (pad, pad - (kernel_size + 1) % 2)
289
+
290
+
291
+ class Swish(nn.Module):
292
+ def forward(self, x):
293
+ return x * x.sigmoid()
294
+
295
+
296
+ class Transpose(nn.Module):
297
+ def __init__(self, dims):
298
+ super().__init__()
299
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
300
+ self.dims = dims
301
+
302
+ def forward(self, x):
303
+ return x.transpose(*self.dims)
304
+
305
+
306
+ class GLU(nn.Module):
307
+ def __init__(self, dim):
308
+ super().__init__()
309
+ self.dim = dim
310
+
311
+ def forward(self, x):
312
+ out, gate = x.chunk(2, dim=self.dim)
313
+ return out * gate.sigmoid()
314
+
315
+
316
+ class DepthWiseConv1d(nn.Module):
317
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
318
+ super().__init__()
319
+ self.padding = padding
320
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
321
+
322
+ def forward(self, x):
323
+ x = F.pad(x, self.padding)
324
+ return self.conv(x)
325
+
326
+
327
+ class ConformerConvModule(nn.Module):
328
+ def __init__(
329
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
330
+ ):
331
+ super().__init__()
332
+
333
+ inner_dim = dim * expansion_factor
334
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
335
+
336
+ self.net = nn.Sequential(
337
+ nn.LayerNorm(dim),
338
+ Transpose((1, 2)),
339
+ nn.Conv1d(dim, inner_dim * 2, 1),
340
+ GLU(dim=1),
341
+ DepthWiseConv1d(
342
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
343
+ ),
344
+ Swish(),
345
+ nn.Conv1d(inner_dim, dim, 1),
346
+ Transpose((1, 2)),
347
+ nn.Dropout(dropout),
348
+ )
349
+
350
+ def forward(self, x):
351
+ return self.net(x)
352
+
353
+
354
+ def linear_attention(q, k, v):
355
+ if v is None:
356
+ out = torch.einsum("...ed,...nd->...ne", k, q)
357
+ return out
358
+ else:
359
+ k_cumsum = k.sum(dim=-2)
360
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
361
+ context = torch.einsum("...nd,...ne->...de", k, v)
362
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
363
+ return out
364
+
365
+
366
+ def gaussian_orthogonal_random_matrix(
367
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
368
+ ):
369
+ nb_full_blocks = int(nb_rows / nb_columns)
370
+ block_list = []
371
+
372
+ for _ in range(nb_full_blocks):
373
+ q = orthogonal_matrix_chunk(
374
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
375
+ )
376
+ block_list.append(q)
377
+
378
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
379
+ if remaining_rows > 0:
380
+ q = orthogonal_matrix_chunk(
381
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
382
+ )
383
+ block_list.append(q[:remaining_rows])
384
+
385
+ final_matrix = torch.cat(block_list)
386
+
387
+ if scaling == 0:
388
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
389
+ elif scaling == 1:
390
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
391
+ (nb_rows,), device=device
392
+ )
393
+ else:
394
+ raise ValueError(f"Invalid scaling {scaling}")
395
+
396
+ return torch.diag(multiplier) @ final_matrix
397
+
398
+
399
+ class FastAttention(nn.Module):
400
+ def __init__(
401
+ self,
402
+ dim_heads,
403
+ nb_features=None,
404
+ ortho_scaling=0,
405
+ causal=False,
406
+ generalized_attention=False,
407
+ kernel_fn=nn.ReLU(),
408
+ qr_uniform_q=False,
409
+ no_projection=False,
410
+ ):
411
+ super().__init__()
412
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
413
+
414
+ self.dim_heads = dim_heads
415
+ self.nb_features = nb_features
416
+ self.ortho_scaling = ortho_scaling
417
+
418
+ self.create_projection = partial(
419
+ gaussian_orthogonal_random_matrix,
420
+ nb_rows=self.nb_features,
421
+ nb_columns=dim_heads,
422
+ scaling=ortho_scaling,
423
+ qr_uniform_q=qr_uniform_q,
424
+ )
425
+ projection_matrix = self.create_projection()
426
+ self.register_buffer("projection_matrix", projection_matrix)
427
+
428
+ self.generalized_attention = generalized_attention
429
+ self.kernel_fn = kernel_fn
430
+ self.no_projection = no_projection
431
+ self.causal = causal
432
+
433
+ @torch.no_grad()
434
+ def redraw_projection_matrix(self):
435
+ projections = self.create_projection()
436
+ self.projection_matrix.copy_(projections)
437
+ del projections
438
+
439
+ def forward(self, q, k, v):
440
+ device = q.device
441
+
442
+ if self.no_projection:
443
+ q = q.softmax(dim=-1)
444
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
445
+ else:
446
+ create_kernel = partial(
447
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
448
+ )
449
+ q = create_kernel(q, is_query=True)
450
+ k = create_kernel(k, is_query=False)
451
+
452
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
453
+
454
+ if v is None:
455
+ out = attn_fn(q, k, None)
456
+ return out
457
+ else:
458
+ out = attn_fn(q, k, v)
459
+ return out
460
+
461
+
462
+ class SelfAttention(nn.Module):
463
+ def __init__(
464
+ self,
465
+ dim,
466
+ causal=False,
467
+ heads=8,
468
+ dim_head=64,
469
+ local_heads=0,
470
+ local_window_size=256,
471
+ nb_features=None,
472
+ feature_redraw_interval=1000,
473
+ generalized_attention=False,
474
+ kernel_fn=nn.ReLU(),
475
+ qr_uniform_q=False,
476
+ dropout=0.0,
477
+ no_projection=False,
478
+ ):
479
+ super().__init__()
480
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
481
+ dim_head = default(dim_head, dim // heads)
482
+ inner_dim = dim_head * heads
483
+ self.fast_attention = FastAttention(
484
+ dim_head,
485
+ nb_features,
486
+ causal=causal,
487
+ generalized_attention=generalized_attention,
488
+ kernel_fn=kernel_fn,
489
+ qr_uniform_q=qr_uniform_q,
490
+ no_projection=no_projection,
491
+ )
492
+
493
+ self.heads = heads
494
+ self.global_heads = heads - local_heads
495
+ self.local_attn = (
496
+ LocalAttention(
497
+ window_size=local_window_size,
498
+ causal=causal,
499
+ autopad=True,
500
+ dropout=dropout,
501
+ look_forward=int(not causal),
502
+ rel_pos_emb_config=(dim_head, local_heads),
503
+ )
504
+ if local_heads > 0
505
+ else None
506
+ )
507
+
508
+ self.to_q = nn.Linear(dim, inner_dim)
509
+ self.to_k = nn.Linear(dim, inner_dim)
510
+ self.to_v = nn.Linear(dim, inner_dim)
511
+ self.to_out = nn.Linear(inner_dim, dim)
512
+ self.dropout = nn.Dropout(dropout)
513
+
514
+ @torch.no_grad()
515
+ def redraw_projection_matrix(self):
516
+ self.fast_attention.redraw_projection_matrix()
517
+
518
+ def forward(
519
+ self,
520
+ x,
521
+ context=None,
522
+ mask=None,
523
+ context_mask=None,
524
+ name=None,
525
+ inference=False,
526
+ **kwargs,
527
+ ):
528
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
529
+
530
+ cross_attend = exists(context)
531
+ context = default(context, x)
532
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
533
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
534
+
535
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
536
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
537
+
538
+ attn_outs = []
539
+ if not empty(q):
540
+ if exists(context_mask):
541
+ global_mask = context_mask[:, None, :, None]
542
+ v.masked_fill_(~global_mask, 0.0)
543
+ if cross_attend:
544
+ pass # TODO: Implement cross-attention
545
+ else:
546
+ out = self.fast_attention(q, k, v)
547
+ attn_outs.append(out)
548
+
549
+ if not empty(lq):
550
+ assert (
551
+ not cross_attend
552
+ ), "local attention is not compatible with cross attention"
553
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
554
+ attn_outs.append(out)
555
+
556
+ out = torch.cat(attn_outs, dim=1)
557
+ out = rearrange(out, "b h n d -> b n (h d)")
558
+ out = self.to_out(out)
559
+ return self.dropout(out)
560
+
561
+
562
+ def l2_regularization(model, l2_alpha):
563
+ l2_loss = []
564
+ for module in model.modules():
565
+ if type(module) is nn.Conv2d:
566
+ l2_loss.append((module.weight**2).sum() / 2.0)
567
+ return l2_alpha * sum(l2_loss)
568
+
569
+
570
+ class FCPE(nn.Module):
571
+ def __init__(
572
+ self,
573
+ input_channel=128,
574
+ out_dims=360,
575
+ n_layers=12,
576
+ n_chans=512,
577
+ use_siren=False,
578
+ use_full=False,
579
+ loss_mse_scale=10,
580
+ loss_l2_regularization=False,
581
+ loss_l2_regularization_scale=1,
582
+ loss_grad1_mse=False,
583
+ loss_grad1_mse_scale=1,
584
+ f0_max=1975.5,
585
+ f0_min=32.70,
586
+ confidence=False,
587
+ threshold=0.05,
588
+ use_input_conv=True,
589
+ ):
590
+ super().__init__()
591
+ if use_siren is True:
592
+ raise ValueError("Siren is not supported yet.")
593
+ if use_full is True:
594
+ raise ValueError("Full model is not supported yet.")
595
+
596
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
597
+ self.loss_l2_regularization = (
598
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
599
+ )
600
+ self.loss_l2_regularization_scale = (
601
+ loss_l2_regularization_scale
602
+ if (loss_l2_regularization_scale is not None)
603
+ else 1
604
+ )
605
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
606
+ self.loss_grad1_mse_scale = (
607
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
608
+ )
609
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
610
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
611
+ self.confidence = confidence if (confidence is not None) else False
612
+ self.threshold = threshold if (threshold is not None) else 0.05
613
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
614
+
615
+ self.cent_table_b = torch.Tensor(
616
+ np.linspace(
617
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
618
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
619
+ out_dims,
620
+ )
621
+ )
622
+ self.register_buffer("cent_table", self.cent_table_b)
623
+
624
+ # conv in stack
625
+ _leaky = nn.LeakyReLU()
626
+ self.stack = nn.Sequential(
627
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
628
+ nn.GroupNorm(4, n_chans),
629
+ _leaky,
630
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
631
+ )
632
+
633
+ # transformer
634
+ self.decoder = PCmer(
635
+ num_layers=n_layers,
636
+ num_heads=8,
637
+ dim_model=n_chans,
638
+ dim_keys=n_chans,
639
+ dim_values=n_chans,
640
+ residual_dropout=0.1,
641
+ attention_dropout=0.1,
642
+ )
643
+ self.norm = nn.LayerNorm(n_chans)
644
+
645
+ # out
646
+ self.n_out = out_dims
647
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
648
+
649
+ def forward(
650
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
651
+ ):
652
+ if cdecoder == "argmax":
653
+ self.cdecoder = self.cents_decoder
654
+ elif cdecoder == "local_argmax":
655
+ self.cdecoder = self.cents_local_decoder
656
+
657
+ x = (
658
+ self.stack(mel.transpose(1, 2)).transpose(1, 2)
659
+ if self.use_input_conv
660
+ else mel
661
+ )
662
+ x = self.decoder(x)
663
+ x = self.norm(x)
664
+ x = self.dense_out(x)
665
+ x = torch.sigmoid(x)
666
+
667
+ if not infer:
668
+ gt_cent_f0 = self.f0_to_cent(gt_f0)
669
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
670
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
671
+ if self.loss_l2_regularization:
672
+ loss_all = loss_all + l2_regularization(
673
+ model=self, l2_alpha=self.loss_l2_regularization_scale
674
+ )
675
+ x = loss_all
676
+ if infer:
677
+ x = self.cdecoder(x)
678
+ x = self.cent_to_f0(x)
679
+ x = (1 + x / 700).log() if not return_hz_f0 else x
680
+
681
+ return x
682
+
683
+ def cents_decoder(self, y, mask=True):
684
+ B, N, _ = y.size()
685
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
686
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
687
+ y, dim=-1, keepdim=True
688
+ )
689
+ if mask:
690
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
691
+ confident_mask = torch.ones_like(confident)
692
+ confident_mask[confident <= self.threshold] = float("-INF")
693
+ rtn = rtn * confident_mask
694
+ return (rtn, confident) if self.confidence else rtn
695
+
696
+ def cents_local_decoder(self, y, mask=True):
697
+ B, N, _ = y.size()
698
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
699
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
700
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
701
+ local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
702
+ ci_l = torch.gather(ci, -1, local_argmax_index)
703
+ y_l = torch.gather(y, -1, local_argmax_index)
704
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
705
+ y_l, dim=-1, keepdim=True
706
+ )
707
+ if mask:
708
+ confident_mask = torch.ones_like(confident)
709
+ confident_mask[confident <= self.threshold] = float("-INF")
710
+ rtn = rtn * confident_mask
711
+ return (rtn, confident) if self.confidence else rtn
712
+
713
+ def cent_to_f0(self, cent):
714
+ return 10.0 * 2 ** (cent / 1200.0)
715
+
716
+ def f0_to_cent(self, f0):
717
+ return 1200.0 * torch.log2(f0 / 10.0)
718
+
719
+ def gaussian_blurred_cent(self, cents):
720
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
721
+ B, N, _ = cents.size()
722
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
723
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
724
+
725
+
726
+ class FCPEInfer:
727
+ def __init__(self, model_path, device=None, dtype=torch.float32):
728
+ if device is None:
729
+ device = "cuda" if torch.cuda.is_available() else "cpu"
730
+ self.device = device
731
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
732
+ self.args = DotDict(ckpt["config"])
733
+ self.dtype = dtype
734
+ model = FCPE(
735
+ input_channel=self.args.model.input_channel,
736
+ out_dims=self.args.model.out_dims,
737
+ n_layers=self.args.model.n_layers,
738
+ n_chans=self.args.model.n_chans,
739
+ use_siren=self.args.model.use_siren,
740
+ use_full=self.args.model.use_full,
741
+ loss_mse_scale=self.args.loss.loss_mse_scale,
742
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
743
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
744
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
745
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
746
+ f0_max=self.args.model.f0_max,
747
+ f0_min=self.args.model.f0_min,
748
+ confidence=self.args.model.confidence,
749
+ )
750
+ model.to(self.device).to(self.dtype)
751
+ model.load_state_dict(ckpt["model"])
752
+ model.eval()
753
+ self.model = model
754
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
755
+
756
+ @torch.no_grad()
757
+ def __call__(self, audio, sr, threshold=0.05):
758
+ self.model.threshold = threshold
759
+ audio = audio[None, :]
760
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
761
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
762
+ return f0
763
+
764
+
765
+ class Wav2Mel:
766
+ def __init__(self, args, device=None, dtype=torch.float32):
767
+ self.sample_rate = args.mel.sampling_rate
768
+ self.hop_size = args.mel.hop_size
769
+ if device is None:
770
+ device = "cuda" if torch.cuda.is_available() else "cpu"
771
+ self.device = device
772
+ self.dtype = dtype
773
+ self.stft = STFT(
774
+ args.mel.sampling_rate,
775
+ args.mel.num_mels,
776
+ args.mel.n_fft,
777
+ args.mel.win_size,
778
+ args.mel.hop_size,
779
+ args.mel.fmin,
780
+ args.mel.fmax,
781
+ )
782
+ self.resample_kernel = {}
783
+
784
+ def extract_nvstft(self, audio, keyshift=0, train=False):
785
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
786
+ return mel
787
+
788
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
789
+ audio = audio.to(self.dtype).to(self.device)
790
+ if sample_rate == self.sample_rate:
791
+ audio_res = audio
792
+ else:
793
+ key_str = str(sample_rate)
794
+ if key_str not in self.resample_kernel:
795
+ self.resample_kernel[key_str] = Resample(
796
+ sample_rate, self.sample_rate, lowpass_filter_width=128
797
+ )
798
+ self.resample_kernel[key_str] = (
799
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
800
+ )
801
+ audio_res = self.resample_kernel[key_str](audio)
802
+
803
+ mel = self.extract_nvstft(
804
+ audio_res, keyshift=keyshift, train=train
805
+ ) # B, n_frames, bins
806
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
807
+ mel = (
808
+ torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
809
+ )
810
+ mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
811
+ return mel
812
+
813
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
814
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
815
+
816
+
817
+ class DotDict(dict):
818
+ def __getattr__(*args):
819
+ val = dict.get(*args)
820
+ return DotDict(val) if type(val) is dict else val
821
+
822
+ __setattr__ = dict.__setitem__
823
+ __delattr__ = dict.__delitem__
824
+
825
+
826
+ class F0Predictor(object):
827
+ def compute_f0(self, wav, p_len):
828
+ pass
829
+
830
+ def compute_f0_uv(self, wav, p_len):
831
+ pass
832
+
833
+
834
+ class FCPEF0Predictor(F0Predictor):
835
+ def __init__(
836
+ self,
837
+ model_path,
838
+ hop_length=512,
839
+ f0_min=50,
840
+ f0_max=1100,
841
+ dtype=torch.float32,
842
+ device=None,
843
+ sample_rate=44100,
844
+ threshold=0.05,
845
+ ):
846
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
847
+ self.hop_length = hop_length
848
+ self.f0_min = f0_min
849
+ self.f0_max = f0_max
850
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
851
+ self.threshold = threshold
852
+ self.sample_rate = sample_rate
853
+ self.dtype = dtype
854
+ self.name = "fcpe"
855
+
856
+ def repeat_expand(
857
+ self,
858
+ content: Union[torch.Tensor, np.ndarray],
859
+ target_len: int,
860
+ mode: str = "nearest",
861
+ ):
862
+ ndim = content.ndim
863
+ content = (
864
+ content[None, None]
865
+ if ndim == 1
866
+ else content[None] if ndim == 2 else content
867
+ )
868
+ assert content.ndim == 3
869
+ is_np = isinstance(content, np.ndarray)
870
+ content = torch.from_numpy(content) if is_np else content
871
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
872
+ results = results.numpy() if is_np else results
873
+ return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results
874
+
875
+ def post_process(self, x, sample_rate, f0, pad_to):
876
+ f0 = (
877
+ torch.from_numpy(f0).float().to(x.device)
878
+ if isinstance(f0, np.ndarray)
879
+ else f0
880
+ )
881
+ f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0
882
+
883
+ vuv_vector = torch.zeros_like(f0)
884
+ vuv_vector[f0 > 0.0] = 1.0
885
+ vuv_vector[f0 <= 0.0] = 0.0
886
+
887
+ nzindex = torch.nonzero(f0).squeeze()
888
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
889
+ time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
890
+ time_frame = np.arange(pad_to) * self.hop_length / sample_rate
891
+
892
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
893
+
894
+ if f0.shape[0] <= 0:
895
+ return np.zeros(pad_to), vuv_vector.cpu().numpy()
896
+ if f0.shape[0] == 1:
897
+ return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()
898
+
899
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
900
+ return f0, vuv_vector.cpu().numpy()
901
+
902
+ def compute_f0(self, wav, p_len=None):
903
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
904
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
905
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
906
+ if torch.all(f0 == 0):
907
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
908
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
909
+ )
910
+ return self.post_process(x, self.sample_rate, f0, p_len)[0]
911
+
912
+ def compute_f0_uv(self, wav, p_len=None):
913
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
914
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
915
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
916
+ if torch.all(f0 == 0):
917
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
918
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
919
+ )
920
+ return self.post_process(x, self.sample_rate, f0, p_len)
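Note: the snippet below is an illustrative usage sketch, not part of the uploaded file; it assumes the fcpe.pt checkpoint fetched into programs/applio_code/rvc/models/predictors/ and a hypothetical voice.wav input.

# Usage sketch for FCPEF0Predictor (illustrative only, not in the diff).
import librosa
from programs.applio_code.rvc.lib.predictors.FCPE import FCPEF0Predictor

# Load audio at the predictor's default rate; compute_f0_uv returns per-frame F0 in Hz
# plus a voiced/unvoiced mask derived from the zero/non-zero frames.
wav, _ = librosa.load("voice.wav", sr=44100, mono=True)
predictor = FCPEF0Predictor(
    "programs/applio_code/rvc/models/predictors/fcpe.pt", device="cpu"
)
f0, voiced = predictor.compute_f0_uv(wav)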
programs/applio_code/rvc/lib/predictors/RMVPE.py ADDED
@@ -0,0 +1,569 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ from librosa.filters import mel
7
+ from typing import List
8
+
9
+ # Constants for readability
10
+ N_MELS = 128
11
+ N_CLASS = 360
12
+
13
+
14
+ # Define a helper function for creating convolutional blocks
15
+ class ConvBlockRes(nn.Module):
16
+ """
17
+ A convolutional block with residual connection.
18
+
19
+ Args:
20
+ in_channels (int): Number of input channels.
21
+ out_channels (int): Number of output channels.
22
+ momentum (float): Momentum for batch normalization.
23
+ """
24
+
25
+ def __init__(self, in_channels, out_channels, momentum=0.01):
26
+ super(ConvBlockRes, self).__init__()
27
+ self.conv = nn.Sequential(
28
+ nn.Conv2d(
29
+ in_channels=in_channels,
30
+ out_channels=out_channels,
31
+ kernel_size=(3, 3),
32
+ stride=(1, 1),
33
+ padding=(1, 1),
34
+ bias=False,
35
+ ),
36
+ nn.BatchNorm2d(out_channels, momentum=momentum),
37
+ nn.ReLU(),
38
+ nn.Conv2d(
39
+ in_channels=out_channels,
40
+ out_channels=out_channels,
41
+ kernel_size=(3, 3),
42
+ stride=(1, 1),
43
+ padding=(1, 1),
44
+ bias=False,
45
+ ),
46
+ nn.BatchNorm2d(out_channels, momentum=momentum),
47
+ nn.ReLU(),
48
+ )
49
+ if in_channels != out_channels:
50
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
51
+ self.is_shortcut = True
52
+ else:
53
+ self.is_shortcut = False
54
+
55
+ def forward(self, x):
56
+ if self.is_shortcut:
57
+ return self.conv(x) + self.shortcut(x)
58
+ else:
59
+ return self.conv(x) + x
60
+
61
+
62
+ # Define a class for residual encoder blocks
63
+ class ResEncoderBlock(nn.Module):
64
+ """
65
+ A residual encoder block.
66
+
67
+ Args:
68
+ in_channels (int): Number of input channels.
69
+ out_channels (int): Number of output channels.
70
+ kernel_size (tuple): Size of the average pooling kernel.
71
+ n_blocks (int): Number of convolutional blocks in the block.
72
+ momentum (float): Momentum for batch normalization.
73
+ """
74
+
75
+ def __init__(
76
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
77
+ ):
78
+ super(ResEncoderBlock, self).__init__()
79
+ self.n_blocks = n_blocks
80
+ self.conv = nn.ModuleList()
81
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
82
+ for _ in range(n_blocks - 1):
83
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
84
+ self.kernel_size = kernel_size
85
+ if self.kernel_size is not None:
86
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
87
+
88
+ def forward(self, x):
89
+ for i in range(self.n_blocks):
90
+ x = self.conv[i](x)
91
+ if self.kernel_size is not None:
92
+ return x, self.pool(x)
93
+ else:
94
+ return x
95
+
96
+
97
+ # Define a class for the encoder
98
+ class Encoder(nn.Module):
99
+ """
100
+ The encoder part of the DeepUnet.
101
+
102
+ Args:
103
+ in_channels (int): Number of input channels.
104
+ in_size (int): Size of the input tensor.
105
+ n_encoders (int): Number of encoder blocks.
106
+ kernel_size (tuple): Size of the average pooling kernel.
107
+ n_blocks (int): Number of convolutional blocks in each encoder block.
108
+ out_channels (int): Number of output channels for the first encoder block.
109
+ momentum (float): Momentum for batch normalization.
110
+ """
111
+
112
+ def __init__(
113
+ self,
114
+ in_channels,
115
+ in_size,
116
+ n_encoders,
117
+ kernel_size,
118
+ n_blocks,
119
+ out_channels=16,
120
+ momentum=0.01,
121
+ ):
122
+ super(Encoder, self).__init__()
123
+ self.n_encoders = n_encoders
124
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
125
+ self.layers = nn.ModuleList()
126
+ self.latent_channels = []
127
+ for i in range(self.n_encoders):
128
+ self.layers.append(
129
+ ResEncoderBlock(
130
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
131
+ )
132
+ )
133
+ self.latent_channels.append([out_channels, in_size])
134
+ in_channels = out_channels
135
+ out_channels *= 2
136
+ in_size //= 2
137
+ self.out_size = in_size
138
+ self.out_channel = out_channels
139
+
140
+ def forward(self, x: torch.Tensor):
141
+ concat_tensors: List[torch.Tensor] = []
142
+ x = self.bn(x)
143
+ for i in range(self.n_encoders):
144
+ t, x = self.layers[i](x)
145
+ concat_tensors.append(t)
146
+ return x, concat_tensors
147
+
148
+
149
+ # Define a class for the intermediate layer
150
+ class Intermediate(nn.Module):
151
+ """
152
+ The intermediate layer of the DeepUnet.
153
+
154
+ Args:
155
+ in_channels (int): Number of input channels.
156
+ out_channels (int): Number of output channels.
157
+ n_inters (int): Number of convolutional blocks in the intermediate layer.
158
+ n_blocks (int): Number of convolutional blocks in each intermediate block.
159
+ momentum (float): Momentum for batch normalization.
160
+ """
161
+
162
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
163
+ super(Intermediate, self).__init__()
164
+ self.n_inters = n_inters
165
+ self.layers = nn.ModuleList()
166
+ self.layers.append(
167
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
168
+ )
169
+ for _ in range(self.n_inters - 1):
170
+ self.layers.append(
171
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
172
+ )
173
+
174
+ def forward(self, x):
175
+ for i in range(self.n_inters):
176
+ x = self.layers[i](x)
177
+ return x
178
+
179
+
180
+ # Define a class for residual decoder blocks
181
+ class ResDecoderBlock(nn.Module):
182
+ """
183
+ A residual decoder block.
184
+
185
+ Args:
186
+ in_channels (int): Number of input channels.
187
+ out_channels (int): Number of output channels.
188
+ stride (tuple): Stride for transposed convolution.
189
+ n_blocks (int): Number of convolutional blocks in the block.
190
+ momentum (float): Momentum for batch normalization.
191
+ """
192
+
193
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
194
+ super(ResDecoderBlock, self).__init__()
195
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
196
+ self.n_blocks = n_blocks
197
+ self.conv1 = nn.Sequential(
198
+ nn.ConvTranspose2d(
199
+ in_channels=in_channels,
200
+ out_channels=out_channels,
201
+ kernel_size=(3, 3),
202
+ stride=stride,
203
+ padding=(1, 1),
204
+ output_padding=out_padding,
205
+ bias=False,
206
+ ),
207
+ nn.BatchNorm2d(out_channels, momentum=momentum),
208
+ nn.ReLU(),
209
+ )
210
+ self.conv2 = nn.ModuleList()
211
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
212
+ for _ in range(n_blocks - 1):
213
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
214
+
215
+ def forward(self, x, concat_tensor):
216
+ x = self.conv1(x)
217
+ x = torch.cat((x, concat_tensor), dim=1)
218
+ for i in range(self.n_blocks):
219
+ x = self.conv2[i](x)
220
+ return x
221
+
222
+
223
+ # Define a class for the decoder
224
+ class Decoder(nn.Module):
225
+ """
226
+ The decoder part of the DeepUnet.
227
+
228
+ Args:
229
+ in_channels (int): Number of input channels.
230
+ n_decoders (int): Number of decoder blocks.
231
+ stride (tuple): Stride for transposed convolution.
232
+ n_blocks (int): Number of convolutional blocks in each decoder block.
233
+ momentum (float): Momentum for batch normalization.
234
+ """
235
+
236
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
237
+ super(Decoder, self).__init__()
238
+ self.layers = nn.ModuleList()
239
+ self.n_decoders = n_decoders
240
+ for _ in range(self.n_decoders):
241
+ out_channels = in_channels // 2
242
+ self.layers.append(
243
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
244
+ )
245
+ in_channels = out_channels
246
+
247
+ def forward(self, x, concat_tensors):
248
+ for i in range(self.n_decoders):
249
+ x = self.layers[i](x, concat_tensors[-1 - i])
250
+ return x
251
+
252
+
253
+ # Define a class for the DeepUnet architecture
254
+ class DeepUnet(nn.Module):
255
+ """
256
+ The DeepUnet architecture.
257
+
258
+ Args:
259
+ kernel_size (tuple): Size of the average pooling kernel.
260
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
261
+ en_de_layers (int): Number of encoder/decoder layers.
262
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
263
+ in_channels (int): Number of input channels.
264
+ en_out_channels (int): Number of output channels for the first encoder block.
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ kernel_size,
270
+ n_blocks,
271
+ en_de_layers=5,
272
+ inter_layers=4,
273
+ in_channels=1,
274
+ en_out_channels=16,
275
+ ):
276
+ super(DeepUnet, self).__init__()
277
+ self.encoder = Encoder(
278
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
279
+ )
280
+ self.intermediate = Intermediate(
281
+ self.encoder.out_channel // 2,
282
+ self.encoder.out_channel,
283
+ inter_layers,
284
+ n_blocks,
285
+ )
286
+ self.decoder = Decoder(
287
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
288
+ )
289
+
290
+ def forward(self, x):
291
+ x, concat_tensors = self.encoder(x)
292
+ x = self.intermediate(x)
293
+ x = self.decoder(x, concat_tensors)
294
+ return x
295
+
296
+
297
+ # Define a class for the end-to-end model
298
+ class E2E(nn.Module):
299
+ """
300
+ The end-to-end model.
301
+
302
+ Args:
303
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
304
+ n_gru (int): Number of GRU layers.
305
+ kernel_size (tuple): Size of the average pooling kernel.
306
+ en_de_layers (int): Number of encoder/decoder layers.
307
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
308
+ in_channels (int): Number of input channels.
309
+ en_out_channels (int): Number of output channels for the first encoder block.
310
+ """
311
+
312
+ def __init__(
313
+ self,
314
+ n_blocks,
315
+ n_gru,
316
+ kernel_size,
317
+ en_de_layers=5,
318
+ inter_layers=4,
319
+ in_channels=1,
320
+ en_out_channels=16,
321
+ ):
322
+ super(E2E, self).__init__()
323
+ self.unet = DeepUnet(
324
+ kernel_size,
325
+ n_blocks,
326
+ en_de_layers,
327
+ inter_layers,
328
+ in_channels,
329
+ en_out_channels,
330
+ )
331
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
332
+ if n_gru:
333
+ self.fc = nn.Sequential(
334
+ BiGRU(3 * 128, 256, n_gru),
335
+ nn.Linear(512, N_CLASS),
336
+ nn.Dropout(0.25),
337
+ nn.Sigmoid(),
338
+ )
339
+ else:
340
+ self.fc = nn.Sequential(
341
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
342
+ )
343
+
344
+ def forward(self, mel):
345
+ mel = mel.transpose(-1, -2).unsqueeze(1)
346
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
347
+ x = self.fc(x)
348
+ return x
349
+
350
+
351
+ # Define a class for the MelSpectrogram extractor
352
+ class MelSpectrogram(torch.nn.Module):
353
+ """
354
+ Extracts Mel-spectrogram features from audio.
355
+
356
+ Args:
357
+ is_half (bool): Whether to use half-precision floating-point numbers.
358
+ n_mel_channels (int): Number of Mel-frequency bands.
359
+ sample_rate (int): Sampling rate of the audio.
360
+ win_length (int): Length of the window function in samples.
361
+ hop_length (int): Hop size between frames in samples.
362
+ n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
363
+ mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
364
+ mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
365
+ clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
366
+ """
367
+
368
+ def __init__(
369
+ self,
370
+ is_half,
371
+ n_mel_channels,
372
+ sample_rate,
373
+ win_length,
374
+ hop_length,
375
+ n_fft=None,
376
+ mel_fmin=0,
377
+ mel_fmax=None,
378
+ clamp=1e-5,
379
+ ):
380
+ super().__init__()
381
+ n_fft = win_length if n_fft is None else n_fft
382
+ self.hann_window = {}
383
+ mel_basis = mel(
384
+ sr=sample_rate,
385
+ n_fft=n_fft,
386
+ n_mels=n_mel_channels,
387
+ fmin=mel_fmin,
388
+ fmax=mel_fmax,
389
+ htk=True,
390
+ )
391
+ mel_basis = torch.from_numpy(mel_basis).float()
392
+ self.register_buffer("mel_basis", mel_basis)
393
+ self.n_fft = win_length if n_fft is None else n_fft
394
+ self.hop_length = hop_length
395
+ self.win_length = win_length
396
+ self.sample_rate = sample_rate
397
+ self.n_mel_channels = n_mel_channels
398
+ self.clamp = clamp
399
+ self.is_half = is_half
400
+
401
+ def forward(self, audio, keyshift=0, speed=1, center=True):
402
+ factor = 2 ** (keyshift / 12)
403
+ n_fft_new = int(np.round(self.n_fft * factor))
404
+ win_length_new = int(np.round(self.win_length * factor))
405
+ hop_length_new = int(np.round(self.hop_length * speed))
406
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
407
+ if keyshift_key not in self.hann_window:
408
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
409
+ audio.device
410
+ )
411
+
412
+ # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative
413
+ source_device = audio.device
414
+ if audio.device.type == "cuda" and torch.cuda.get_device_name().endswith(
415
+ "[ZLUDA]"
416
+ ):
417
+ audio = audio.to("cpu")
418
+ self.hann_window[keyshift_key] = self.hann_window[keyshift_key].to("cpu")
419
+
420
+ fft = torch.stft(
421
+ audio,
422
+ n_fft=n_fft_new,
423
+ hop_length=hop_length_new,
424
+ win_length=win_length_new,
425
+ window=self.hann_window[keyshift_key],
426
+ center=center,
427
+ return_complex=True,
428
+ ).to(source_device)
429
+
430
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
431
+ if keyshift != 0:
432
+ size = self.n_fft // 2 + 1
433
+ resize = magnitude.size(1)
434
+ if resize < size:
435
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
436
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
437
+ mel_output = torch.matmul(self.mel_basis, magnitude)
438
+ if self.is_half:
439
+ mel_output = mel_output.half()
440
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
441
+ return log_mel_spec
442
+
443
+
444
+ # Define a class for the RMVPE0 predictor
445
+ class RMVPE0Predictor:
446
+ """
447
+ A predictor for fundamental frequency (F0) based on the RMVPE0 model.
448
+
449
+ Args:
450
+ model_path (str): Path to the RMVPE0 model file.
451
+ is_half (bool): Whether to use half-precision floating-point numbers.
452
+ device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
453
+ """
454
+
455
+ def __init__(self, model_path, is_half, device=None):
456
+ self.resample_kernel = {}
457
+ model = E2E(4, 1, (2, 2))
458
+ ckpt = torch.load(model_path, map_location="cpu")
459
+ model.load_state_dict(ckpt)
460
+ model.eval()
461
+ if is_half:
462
+ model = model.half()
463
+ self.model = model
464
+ self.resample_kernel = {}
465
+ self.is_half = is_half
466
+ self.device = device
467
+ self.mel_extractor = MelSpectrogram(
468
+ is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
469
+ ).to(device)
470
+ self.model = self.model.to(device)
471
+ cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
472
+ self.cents_mapping = np.pad(cents_mapping, (4, 4))
473
+
474
+ def mel2hidden(self, mel):
475
+ """
476
+ Converts Mel-spectrogram features to hidden representation.
477
+
478
+ Args:
479
+ mel (torch.Tensor): Mel-spectrogram features.
480
+ """
481
+ with torch.no_grad():
482
+ n_frames = mel.shape[-1]
483
+ mel = F.pad(
484
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
485
+ )
486
+ hidden = self.model(mel)
487
+ return hidden[:, :n_frames]
488
+
489
+ def decode(self, hidden, thred=0.03):
490
+ """
491
+ Decodes hidden representation to F0.
492
+
493
+ Args:
494
+ hidden (np.ndarray): Hidden representation.
495
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
496
+ """
497
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
498
+ f0 = 10 * (2 ** (cents_pred / 1200))
499
+ f0[f0 == 10] = 0
500
+ return f0
501
+
502
+ def infer_from_audio(self, audio, thred=0.03):
503
+ """
504
+ Infers F0 from audio.
505
+
506
+ Args:
507
+ audio (np.ndarray): Audio signal.
508
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
509
+ """
510
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
511
+ mel = self.mel_extractor(audio, center=True)
512
+ hidden = self.mel2hidden(mel)
513
+ hidden = hidden.squeeze(0).cpu().numpy()
514
+ if self.is_half:
515
+ hidden = hidden.astype("float32")
516
+ f0 = self.decode(hidden, thred=thred)
517
+ return f0
518
+
519
+ def to_local_average_cents(self, salience, thred=0.05):
520
+ """
521
+ Converts salience to local average cents.
522
+
523
+ Args:
524
+ salience (np.ndarray): Salience values.
525
+ thred (float, optional): Threshold for salience. Defaults to 0.05.
526
+ """
527
+ center = np.argmax(salience, axis=1)
528
+ salience = np.pad(salience, ((0, 0), (4, 4)))
529
+ center += 4
530
+ todo_salience = []
531
+ todo_cents_mapping = []
532
+ starts = center - 4
533
+ ends = center + 5
534
+ for idx in range(salience.shape[0]):
535
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
536
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
537
+ todo_salience = np.array(todo_salience)
538
+ todo_cents_mapping = np.array(todo_cents_mapping)
539
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
540
+ weight_sum = np.sum(todo_salience, 1)
541
+ divided = product_sum / weight_sum
542
+ maxx = np.max(salience, axis=1)
543
+ divided[maxx <= thred] = 0
544
+ return divided
545
+
546
+
547
+ # Define a class for BiGRU (bidirectional GRU)
548
+ class BiGRU(nn.Module):
549
+ """
550
+ A bidirectional GRU layer.
551
+
552
+ Args:
553
+ input_features (int): Number of input features.
554
+ hidden_features (int): Number of hidden features.
555
+ num_layers (int): Number of GRU layers.
556
+ """
557
+
558
+ def __init__(self, input_features, hidden_features, num_layers):
559
+ super(BiGRU, self).__init__()
560
+ self.gru = nn.GRU(
561
+ input_features,
562
+ hidden_features,
563
+ num_layers=num_layers,
564
+ batch_first=True,
565
+ bidirectional=True,
566
+ )
567
+
568
+ def forward(self, x):
569
+ return self.gru(x)[0]
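Note: decode() maps the salience argmax to cents and then to Hz via f0 = 10 * 2**(cents / 1200), so 10 Hz acts as the silence sentinel that gets zeroed. A minimal usage sketch follows (illustrative, not part of the uploaded file; assumes the rmvpe.pt checkpoint and a hypothetical voice.wav):

# Usage sketch for RMVPE0Predictor (illustrative only, not in the diff).
import librosa
from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor

# The internal mel extractor is configured for 16 kHz with hop 160 (one frame per 10 ms).
wav, _ = librosa.load("voice.wav", sr=16000, mono=True)
rmvpe = RMVPE0Predictor(
    "programs/applio_code/rvc/models/predictors/rmvpe.pt", is_half=False, device="cpu"
)
f0 = rmvpe.infer_from_audio(wav, thred=0.03)  # ndarray of F0 in Hz, 0 where unvoiced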
programs/applio_code/rvc/lib/tools/analyzer.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import librosa.display
4
+ import librosa
5
+
6
+
7
+ def calculate_features(y, sr):
8
+ stft = np.abs(librosa.stft(y))
9
+ duration = librosa.get_duration(y=y, sr=sr)
10
+ cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
11
+ bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
12
+ rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
13
+ return stft, duration, cent, bw, rolloff
14
+
15
+
16
+ def plot_title(title):
17
+ plt.suptitle(title, fontsize=16, fontweight="bold")
18
+
19
+
20
+ def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
21
+ plt.subplot(3, 1, 1)
22
+ plt.imshow(
23
+ librosa.amplitude_to_db(stft, ref=np.max),
24
+ origin="lower",
25
+ extent=[0, duration, 0, sr / 1000],
26
+ aspect="auto",
27
+ cmap=cmap, # Change the colormap here
28
+ )
29
+ plt.colorbar(format="%+2.0f dB")
30
+ plt.xlabel("Time (s)")
31
+ plt.ylabel("Frequency (kHz)")
32
+ plt.title("Spectrogram")
33
+
34
+
35
+ def plot_waveform(y, sr, duration):
36
+ plt.subplot(3, 1, 2)
37
+ librosa.display.waveshow(y, sr=sr)
38
+ plt.xlabel("Time (s)")
39
+ plt.ylabel("Amplitude")
40
+ plt.title("Waveform")
41
+
42
+
43
+ def plot_features(times, cent, bw, rolloff, duration):
44
+ plt.subplot(3, 1, 3)
45
+ plt.plot(times, cent, label="Spectral Centroid (Hz)", color="b")
46
+ plt.plot(times, bw, label="Spectral Bandwidth (Hz)", color="g")
47
+ plt.plot(times, rolloff, label="Spectral Rolloff (Hz)", color="r")
48
+ plt.xlabel("Time (s)")
49
+ plt.title("Spectral Features")
50
+ plt.legend()
51
+
52
+
53
+ def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
54
+ y, sr = librosa.load(audio_file)
55
+ stft, duration, cent, bw, rolloff = calculate_features(y, sr)
56
+
57
+ plt.figure(figsize=(12, 10))
58
+
59
+ plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
60
+ plot_spectrogram(y, sr, stft, duration)
61
+ plot_waveform(y, sr, duration)
62
+ plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
63
+
64
+ plt.tight_layout()
65
+
66
+ if save_plot_path:
67
+ plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
68
+ plt.close()
69
+
70
+ audio_info = f"""Sample Rate: {sr}\nDuration: {(
71
+ str(round(duration, 2)) + " seconds"
72
+ if duration < 60
73
+ else str(round(duration / 60, 2)) + " minutes"
74
+ )}\nNumber of Samples: {len(y)}\nNative Sample Rate: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
75
+
76
+ return audio_info, save_plot_path
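Note: a minimal usage sketch for the analyzer (illustrative, not part of the uploaded file; input.wav is a hypothetical file):

# Usage sketch for analyze_audio (illustrative only, not in the diff).
from programs.applio_code.rvc.lib.tools.analyzer import analyze_audio

info, plot_path = analyze_audio("input.wav", save_plot_path="logs/audio_analysis.png")
print(info)       # sample rate, duration, sample count, channel layout
print(plot_path)  # PNG combining spectrogram, waveform and spectral features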
programs/applio_code/rvc/lib/tools/gdown.py ADDED
@@ -0,0 +1,354 @@
1
+ import os
2
+ import re
3
+ import six
4
+ import sys
5
+ import json
6
+ import tqdm
7
+ import time
8
+ import shutil
9
+ import warnings
10
+ import tempfile
11
+ import textwrap
12
+ import requests
13
+ from six.moves import urllib_parse
14
+
15
+
16
+ def indent(text, prefix):
17
+ """Indent each non-empty line of text with the given prefix."""
18
+ return "".join(
19
+ (prefix + line if line.strip() else line) for line in text.splitlines(True)
20
+ )
21
+
22
+
23
+ class FileURLRetrievalError(Exception):
24
+ pass
25
+
26
+
27
+ class FolderContentsMaximumLimitError(Exception):
28
+ pass
29
+
30
+
31
+ def parse_url(url, warning=True):
32
+ """Parse URLs especially for Google Drive links.
33
+
34
+ Args:
35
+ url: URL to parse.
36
+ warning: Whether to warn if the URL is not a download link.
37
+
38
+ Returns:
39
+ A tuple (file_id, is_download_link), where file_id is the ID of the
40
+ file on Google Drive, and is_download_link is a flag indicating
41
+ whether the URL is a download link.
42
+ """
43
+ parsed = urllib_parse.urlparse(url)
44
+ query = urllib_parse.parse_qs(parsed.query)
45
+ is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com")
46
+ is_download_link = parsed.path.endswith("/uc")
47
+
48
+ if not is_gdrive:
49
+ return None, is_download_link
50
+
51
+ file_id = query.get("id", [None])[0]
52
+ if file_id is None:
53
+ for pattern in (
54
+ r"^/file/d/(.*?)/(edit|view)$",
55
+ r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$",
56
+ r"^/document/d/(.*?)/(edit|htmlview|view)$",
57
+ r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
58
+ r"^/presentation/d/(.*?)/(edit|htmlview|view)$",
59
+ r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
60
+ r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$",
61
+ r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
62
+ ):
63
+ match = re.match(pattern, parsed.path)
64
+ if match:
65
+ file_id = match.group(1)
66
+ break
67
+
68
+ if warning and not is_download_link:
69
+ warnings.warn(
70
+ "You specified a Google Drive link that is not the correct link "
71
+ "to download a file. You might want to try `--fuzzy` option "
72
+ f"or the following url: https://drive.google.com/uc?id={file_id}"
73
+ )
74
+
75
+ return file_id, is_download_link
76
+
77
+
78
+ CHUNK_SIZE = 512 * 1024 # 512KB
79
+ HOME = os.path.expanduser("~")
80
+
81
+
82
+ def get_url_from_gdrive_confirmation(contents):
83
+ """Extract the download URL from a Google Drive confirmation page."""
84
+ for pattern in (
85
+ r'href="(\/uc\?export=download[^"]+)',
86
+ r'href="/open\?id=([^"]+)"',
87
+ r'"downloadUrl":"([^"]+)',
88
+ ):
89
+ match = re.search(pattern, contents)
90
+ if match:
91
+ url = match.group(1)
92
+ if pattern == r'href="/open\?id=([^"]+)"':
93
+ uuid = re.search(
94
+ r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
95
+ contents,
96
+ ).group(1)
97
+ url = (
98
+ "https://drive.usercontent.google.com/download?id="
99
+ + url
100
+ + "&confirm=t&uuid="
101
+ + uuid
102
+ )
103
+ elif pattern == r'"downloadUrl":"([^"]+)':
104
+ url = url.replace("\\u003d", "=").replace("\\u0026", "&")
105
+ else:
106
+ url = "https://docs.google.com" + url.replace("&amp;", "&")
107
+ return url
108
+
109
+ match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
110
+ if match:
111
+ error = match.group(1)
112
+ raise FileURLRetrievalError(error)
113
+
114
+ raise FileURLRetrievalError(
115
+ "Cannot retrieve the public link of the file. "
116
+ "You may need to change the permission to "
117
+ "'Anyone with the link', or have had many accesses."
118
+ )
119
+
120
+
121
+ def _get_session(proxy, use_cookies, return_cookies_file=False):
122
+ """Create a requests session with optional proxy and cookie handling."""
123
+ sess = requests.session()
124
+ sess.headers.update(
125
+ {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
126
+ )
127
+
128
+ if proxy is not None:
129
+ sess.proxies = {"http": proxy, "https": proxy}
130
+ print("Using proxy:", proxy, file=sys.stderr)
131
+
132
+ cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
133
+ if os.path.exists(cookies_file) and use_cookies:
134
+ with open(cookies_file) as f:
135
+ cookies = json.load(f)
136
+ for k, v in cookies:
137
+ sess.cookies[k] = v
138
+
139
+ return (sess, cookies_file) if return_cookies_file else sess
140
+
141
+
142
+ def download(
143
+ url=None,
144
+ output=None,
145
+ quiet=False,
146
+ proxy=None,
147
+ speed=None,
148
+ use_cookies=True,
149
+ verify=True,
150
+ id=None,
151
+ fuzzy=True,
152
+ resume=False,
153
+ format=None,
154
+ ):
155
+ """Download file from URL.
156
+
157
+ Parameters
158
+ ----------
159
+ url: str
160
+ URL. Google Drive URL is also supported.
161
+ output: str
162
+ Output filename. Default is basename of URL.
163
+ quiet: bool
164
+ Suppress terminal output. Default is False.
165
+ proxy: str
166
+ Proxy.
167
+ speed: float
168
+ Download byte size per second (e.g., 256KB/s = 256 * 1024).
169
+ use_cookies: bool
170
+ Flag to use cookies. Default is True.
171
+ verify: bool or string
172
+ Either a bool, in which case it controls whether the server's TLS
173
+ certificate is verified, or a string, in which case it must be a path
174
+ to a CA bundle to use. Default is True.
175
+ id: str
176
+ Google Drive's file ID.
177
+ fuzzy: bool
178
+ Fuzzy extraction of Google Drive's file Id. Default is False.
179
+ resume: bool
180
+ Resume the download from existing tmp file if possible.
181
+ Default is False.
182
+ format: str, optional
183
+ Format of Google Docs, Spreadsheets and Slides. Default is:
184
+ - Google Docs: 'docx'
185
+ - Google Spreadsheet: 'xlsx'
186
+ - Google Slides: 'pptx'
187
+
188
+ Returns
189
+ -------
190
+ output: str
191
+ Output filename.
192
+ """
193
+ if not (id is None) ^ (url is None):
194
+ raise ValueError("Either url or id has to be specified")
195
+ if id is not None:
196
+ url = f"https://drive.google.com/uc?id={id}"
197
+
198
+ url_origin = url
199
+
200
+ sess, cookies_file = _get_session(
201
+ proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
202
+ )
203
+
204
+ gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy)
205
+
206
+ if fuzzy and gdrive_file_id:
207
+ # overwrite the url with fuzzy match of a file id
208
+ url = f"https://drive.google.com/uc?id={gdrive_file_id}"
209
+ url_origin = url
210
+ is_gdrive_download_link = True
211
+
212
+ while True:
213
+ res = sess.get(url, stream=True, verify=verify)
214
+
215
+ if url == url_origin and res.status_code == 500:
216
+ # The file could be Google Docs or Spreadsheets.
217
+ url = f"https://drive.google.com/open?id={gdrive_file_id}"
218
+ continue
219
+
220
+ if res.headers["Content-Type"].startswith("text/html"):
221
+ title = re.search("<title>(.+)</title>", res.text)
222
+ if title:
223
+ title = title.group(1)
224
+ if title.endswith(" - Google Docs"):
225
+ url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}"
226
+ continue
227
+ if title.endswith(" - Google Sheets"):
228
+ url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}"
229
+ continue
230
+ if title.endswith(" - Google Slides"):
231
+ url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
232
+ continue
233
+ elif (
234
+ "Content-Disposition" in res.headers
235
+ and res.headers["Content-Disposition"].endswith("pptx")
236
+ and format not in (None, "pptx")
237
+ ):
238
+ url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
239
+ continue
240
+
241
+ if use_cookies:
242
+ os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
243
+ with open(cookies_file, "w") as f:
244
+ cookies = [
245
+ (k, v)
246
+ for k, v in sess.cookies.items()
247
+ if not k.startswith("download_warning_")
248
+ ]
249
+ json.dump(cookies, f, indent=2)
250
+
251
+ if "Content-Disposition" in res.headers:
252
+ # This is the file
253
+ break
254
+ if not (gdrive_file_id and is_gdrive_download_link):
255
+ break
256
+
257
+ # Need to redirect with confirmation
258
+ try:
259
+ url = get_url_from_gdrive_confirmation(res.text)
260
+ except FileURLRetrievalError as e:
261
+ message = (
262
+ "Failed to retrieve file url:\n\n"
263
+ "{}\n\n"
264
+ "You may still be able to access the file from the browser:"
265
+ f"\n\n\t{url_origin}\n\n"
266
+ "but Gdown can't. Please check connections and permissions."
267
+ ).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t"))
268
+ raise FileURLRetrievalError(message)
269
+
270
+ if gdrive_file_id and is_gdrive_download_link:
271
+ content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"])
272
+ filename_from_url = (
273
+ re.search(r"filename\*=UTF-8''(.*)", content_disposition)
274
+ or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
275
+ ).group(1)
276
+ filename_from_url = filename_from_url.replace(os.path.sep, "_")
277
+ else:
278
+ filename_from_url = os.path.basename(url)
279
+
280
+ output = output or filename_from_url
281
+
282
+ output_is_path = isinstance(output, six.string_types)
283
+ if output_is_path and output.endswith(os.path.sep):
284
+ os.makedirs(output, exist_ok=True)
285
+ output = os.path.join(output, filename_from_url)
286
+
287
+ if output_is_path:
288
+ temp_dir = os.path.dirname(output) or "."
289
+ prefix = os.path.basename(output)
290
+ existing_tmp_files = [
291
+ os.path.join(temp_dir, file)
292
+ for file in os.listdir(temp_dir)
293
+ if file.startswith(prefix)
294
+ ]
295
+ if resume and existing_tmp_files:
296
+ if len(existing_tmp_files) > 1:
297
+ print(
298
+ "There are multiple temporary files to resume:",
299
+ file=sys.stderr,
300
+ )
301
+ for file in existing_tmp_files:
302
+ print(f"\t{file}", file=sys.stderr)
303
+ print(
304
+ "Please remove them except one to resume downloading.",
305
+ file=sys.stderr,
306
+ )
307
+ return
308
+ tmp_file = existing_tmp_files[0]
309
+ else:
310
+ resume = False
311
+ tmp_file = tempfile.mktemp(
312
+ suffix=tempfile.template, prefix=prefix, dir=temp_dir
313
+ )
314
+ f = open(tmp_file, "ab")
315
+ else:
316
+ tmp_file = None
317
+ f = output
318
+
319
+ if tmp_file is not None and f.tell() != 0:
320
+ headers = {"Range": f"bytes={f.tell()}-"}
321
+ res = sess.get(url, headers=headers, stream=True, verify=verify)
322
+
323
+ if not quiet:
324
+ if resume:
325
+ print("Resume:", tmp_file, file=sys.stderr)
326
+ print(
327
+ "To:",
328
+ os.path.abspath(output) if output_is_path else output,
329
+ file=sys.stderr,
330
+ )
331
+
332
+ try:
333
+ total = int(res.headers.get("Content-Length", 0))
334
+ if not quiet:
335
+ pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True)
336
+ t_start = time.time()
337
+ for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
338
+ f.write(chunk)
339
+ if not quiet:
340
+ pbar.update(len(chunk))
341
+ if speed is not None:
342
+ elapsed_time_expected = 1.0 * pbar.n / speed
343
+ elapsed_time = time.time() - t_start
344
+ if elapsed_time < elapsed_time_expected:
345
+ time.sleep(elapsed_time_expected - elapsed_time)
346
+ if not quiet:
347
+ pbar.close()
348
+ if tmp_file:
349
+ f.close()
350
+ shutil.move(tmp_file, output)
351
+ finally:
352
+ sess.close()
353
+
354
+ return output
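Note: a minimal usage sketch for this vendored gdown module (illustrative, not part of the uploaded file; the file id is a placeholder):

# Usage sketch for gdown.download (illustrative only, not in the diff).
from programs.applio_code.rvc.lib.tools import gdown

# Pass either url= or id= (exactly one); output=None keeps the filename reported
# by the server, and fuzzy=True extracts the id from most Drive URL formats.
saved_path = gdown.download(id="<GOOGLE_DRIVE_FILE_ID>", output=None, quiet=False)
print("saved to", saved_path)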
programs/applio_code/rvc/lib/tools/launch_tensorboard.py ADDED
@@ -0,0 +1,21 @@
1
+ import time
2
+ import logging
3
+ from tensorboard import program
4
+
5
+ log_path = "logs"
6
+
7
+
8
+ def launch_tensorboard_pipeline():
9
+ logging.getLogger("root").setLevel(logging.WARNING)
10
+ logging.getLogger("tensorboard").setLevel(logging.WARNING)
11
+
12
+ tb = program.TensorBoard()
13
+ tb.configure(argv=[None, "--logdir", log_path])
14
+ url = tb.launch()
15
+
16
+ print(
17
+ f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
18
+ )
19
+
20
+ while True:
21
+ time.sleep(600)
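Note: launch_tensorboard_pipeline blocks in an endless sleep loop, so a caller that still has work to do would typically run it on a daemon thread; a hedged sketch:

# Sketch: run the TensorBoard launcher without blocking the caller (illustrative only).
import threading
from programs.applio_code.rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

threading.Thread(target=launch_tensorboard_pipeline, daemon=True).start()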
programs/applio_code/rvc/lib/tools/model_download.py ADDED
@@ -0,0 +1,385 @@
1
+ import os
2
+ import re
3
+ import six
4
+ import sys
5
+ import wget
6
+ import shutil
7
+ import zipfile
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+ from urllib.parse import unquote, urlencode, parse_qs, urlparse
11
+
12
+ now_dir = os.getcwd()
13
+ sys.path.append(now_dir)
14
+
15
+ from programs.applio_code.rvc.lib.utils import format_title
16
+ from programs.applio_code.rvc.lib.tools import gdown
17
+
18
+
19
+ def find_folder_parent(search_dir, folder_name):
20
+ for dirpath, dirnames, _ in os.walk(search_dir):
21
+ if folder_name in dirnames:
22
+ return os.path.abspath(dirpath)
23
+ return None
24
+
25
+
26
+ file_path = find_folder_parent(now_dir, "logs")
27
+ zips_path = os.path.join(file_path, "zips")
28
+
29
+
30
+ def search_pth_index(folder):
31
+ pth_paths = [
32
+ os.path.join(folder, file)
33
+ for file in os.listdir(folder)
34
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth")
35
+ ]
36
+ index_paths = [
37
+ os.path.join(folder, file)
38
+ for file in os.listdir(folder)
39
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index")
40
+ ]
41
+
42
+ return pth_paths, index_paths
43
+
44
+
45
+ def get_mediafire_download_link(url):
46
+ response = requests.get(url)
47
+ response.raise_for_status()
48
+ soup = BeautifulSoup(response.text, "html.parser")
49
+ download_button = soup.find(
50
+ "a", {"class": "input popsok", "aria-label": "Download file"}
51
+ )
52
+ if download_button:
53
+ download_link = download_button.get("href")
54
+ return download_link
55
+ else:
56
+ return None
57
+
58
+
59
+ def download_from_url(url):
60
+ os.makedirs(zips_path, exist_ok=True)
61
+ if url != "":
62
+ if "drive.google.com" in url:
63
+ if "file/d/" in url:
64
+ file_id = url.split("file/d/")[1].split("/")[0]
65
+ elif "id=" in url:
66
+ file_id = url.split("id=")[1].split("&")[0]
67
+ else:
68
+ return None
69
+
70
+ if file_id:
71
+ os.chdir(zips_path)
72
+ try:
73
+ gdown.download(
74
+ f"https://drive.google.com/uc?id={file_id}",
75
+ quiet=True,
76
+ fuzzy=True,
77
+ )
78
+ except Exception as error:
79
+ error_message = str(
80
+ f"An error occurred downloading the file: {error}"
81
+ )
82
+ if (
83
+ "Too many users have viewed or downloaded this file recently"
84
+ in error_message
85
+ ):
86
+ os.chdir(now_dir)
87
+ return "too much use"
88
+ elif (
89
+ "Cannot retrieve the public link of the file." in error_message
90
+ ):
91
+ os.chdir(now_dir)
92
+ return "private link"
93
+ else:
94
+ print(error_message)
95
+ os.chdir(now_dir)
96
+ return None
97
+ elif "disk.yandex.ru" in url:
98
+ base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
99
+ public_key = url
100
+ final_url = base_url + urlencode(dict(public_key=public_key))
101
+ response = requests.get(final_url)
102
+ download_url = response.json()["href"]
103
+ download_response = requests.get(download_url)
104
+
105
+ if download_response.status_code == 200:
106
+ filename = parse_qs(urlparse(unquote(download_url)).query).get(
107
+ "filename", [""]
108
+ )[0]
109
+ if filename:
110
+ os.chdir(zips_path)
111
+ with open(filename, "wb") as f:
112
+ f.write(download_response.content)
113
+ else:
114
+ print("Failed to get filename from URL.")
115
+ return None
116
+
117
+ elif "pixeldrain.com" in url:
118
+ try:
119
+ file_id = url.split("pixeldrain.com/u/")[1]
120
+ os.chdir(zips_path)
121
+ print(file_id)
122
+ response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
123
+ if response.status_code == 200:
124
+ file_name = (
125
+ response.headers.get("Content-Disposition")
126
+ .split("filename=")[-1]
127
+ .strip('";')
128
+ )
129
+ os.makedirs(zips_path, exist_ok=True)
130
+ with open(os.path.join(zips_path, file_name), "wb") as newfile:
131
+ newfile.write(response.content)
132
+ os.chdir(file_path)
133
+ return "downloaded"
134
+ else:
135
+ os.chdir(file_path)
136
+ return None
137
+ except Exception as error:
138
+ print(f"An error occurred downloading the file: {error}")
139
+ os.chdir(file_path)
140
+ return None
141
+
142
+ elif "cdn.discordapp.com" in url:
143
+ file = requests.get(url)
144
+ os.chdir(zips_path)
145
+ if file.status_code == 200:
146
+ name = url.split("/")
147
+ with open(os.path.join(name[-1]), "wb") as newfile:
148
+ newfile.write(file.content)
149
+ else:
150
+ return None
151
+ elif "/blob/" in url or "/resolve/" in url:
152
+ os.chdir(zips_path)
153
+ if "/blob/" in url:
154
+ url = url.replace("/blob/", "/resolve/")
155
+
156
+ response = requests.get(url, stream=True)
157
+ if response.status_code == 200:
158
+ content_disposition = six.moves.urllib_parse.unquote(
159
+ response.headers["Content-Disposition"]
160
+ )
161
+ m = re.search(r'filename="([^"]+)"', content_disposition)
162
+ file_name = m.groups()[0]
163
+ file_name = file_name.replace(os.path.sep, "_")
164
+ total_size_in_bytes = int(response.headers.get("content-length", 0))
165
+ block_size = 1024
166
+ progress_bar_length = 50
167
+ progress = 0
168
+
169
+ with open(os.path.join(zips_path, file_name), "wb") as file:
170
+ for data in response.iter_content(block_size):
171
+ file.write(data)
172
+ progress += len(data)
173
+ progress_percent = int((progress / total_size_in_bytes) * 100)
174
+ num_dots = int(
175
+ (progress / total_size_in_bytes) * progress_bar_length
176
+ )
177
+ progress_bar = (
178
+ "["
179
+ + "." * num_dots
180
+ + " " * (progress_bar_length - num_dots)
181
+ + "]"
182
+ )
183
+ print(
184
+ f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ",
185
+ end="\r",
186
+ )
187
+ if progress_percent == 100:
188
+ print("\n")
189
+
190
+ else:
191
+ os.chdir(now_dir)
192
+ return None
193
+ elif "/tree/main" in url:
194
+ os.chdir(zips_path)
195
+ response = requests.get(url)
196
+ soup = BeautifulSoup(response.content, "html.parser")
197
+ temp_url = ""
198
+ for link in soup.find_all("a", href=True):
199
+ if link["href"].endswith(".zip"):
200
+ temp_url = link["href"]
201
+ break
202
+ if temp_url:
203
+ url = temp_url
204
+ url = url.replace("blob", "resolve")
205
+ if "huggingface.co" not in url:
206
+ url = "https://huggingface.co" + url
207
+
208
+ wget.download(url)
209
+ else:
210
+ os.chdir(now_dir)
211
+ return None
212
+ elif "applio.org" in url:
213
+ parts = url.split("/")
214
+ id_with_query = parts[-1]
215
+ id_parts = id_with_query.split("?")
216
+ id_number = id_parts[0]
217
+
218
+ url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
219
+ headers = {
220
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
221
+ }
222
+
223
+ params = {"id": f"eq.{id_number}"}
224
+ response = requests.get(url, headers=headers, params=params)
225
+ if response.status_code == 200:
226
+ json_response = response.json()
227
+ print(json_response)
228
+ if json_response:
229
+ link = json_response[0]["link"]
230
+ verify = download_from_url(link)
231
+ if verify == "downloaded":
232
+ return "downloaded"
233
+ else:
234
+ return None
235
+ else:
236
+ return None
237
+ else:
238
+ try:
239
+ os.chdir(zips_path)
240
+ wget.download(url)
241
+ except Exception as error:
242
+ os.chdir(now_dir)
243
+ print(f"An error occurred downloading the file: {error}")
244
+ return None
245
+
246
+ for currentPath, _, zipFiles in os.walk(zips_path):
247
+ for Files in zipFiles:
248
+ filePart = Files.split(".")
249
+ extensionFile = filePart[len(filePart) - 1]
250
+ filePart.pop()
251
+ nameFile = "_".join(filePart)
252
+ realPath = os.path.join(currentPath, Files)
253
+ os.rename(realPath, nameFile + "." + extensionFile)
254
+
255
+ os.chdir(now_dir)
256
+ return "downloaded"
257
+
258
+ os.chdir(now_dir)
259
+ return None
260
+
261
+
262
+ def extract_and_show_progress(zipfile_path, unzips_path):
263
+ try:
264
+ with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
265
+ for file_info in zip_ref.infolist():
266
+ zip_ref.extract(file_info, unzips_path)
267
+ os.remove(zipfile_path)
268
+ return True
269
+ except Exception as error:
270
+ print(f"An error occurred extracting the zip file: {error}")
271
+ return False
272
+
273
+
274
+ def unzip_file(zip_path, zip_file_name):
275
+ zip_file_path = os.path.join(zip_path, zip_file_name + ".zip")
276
+ extract_path = os.path.join(file_path, zip_file_name)
277
+ with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
278
+ zip_ref.extractall(extract_path)
279
+ os.remove(zip_file_path)
280
+
281
+
282
+ def model_download_pipeline(url: str):
283
+ try:
284
+ verify = download_from_url(url)
285
+ if verify == "downloaded":
286
+ extract_folder_path = ""
287
+ for filename in os.listdir(zips_path):
288
+ if filename.endswith(".zip"):
289
+ zipfile_path = os.path.join(zips_path, filename)
290
+ print("Proceeding with the extraction...")
291
+
292
+ model_zip = os.path.basename(zipfile_path)
293
+ model_name = format_title(model_zip.split(".zip")[0])
294
+ extract_folder_path = os.path.join(
295
+ "logs",
296
+ os.path.normpath(model_name),
297
+ )
298
+ success = extract_and_show_progress(
299
+ zipfile_path, extract_folder_path
300
+ )
301
+
302
+ macosx_path = os.path.join(extract_folder_path, "__MACOSX")
303
+ if os.path.exists(macosx_path):
304
+ shutil.rmtree(macosx_path)
305
+
306
+ subfolders = [
307
+ f
308
+ for f in os.listdir(extract_folder_path)
309
+ if os.path.isdir(os.path.join(extract_folder_path, f))
310
+ ]
311
+ if len(subfolders) == 1:
312
+ subfolder_path = os.path.join(
313
+ extract_folder_path, subfolders[0]
314
+ )
315
+ for item in os.listdir(subfolder_path):
316
+ s = os.path.join(subfolder_path, item)
317
+ d = os.path.join(extract_folder_path, item)
318
+ shutil.move(s, d)
319
+ os.rmdir(subfolder_path)
320
+
321
+ for item in os.listdir(extract_folder_path):
322
+ if ".pth" in item:
323
+ file_name = item.split(".pth")[0]
324
+ if file_name != model_name:
325
+ os.rename(
326
+ os.path.join(extract_folder_path, item),
327
+ os.path.join(
328
+ extract_folder_path, model_name + ".pth"
329
+ ),
330
+ )
331
+ else:
332
+ if "v2" not in item:
333
+ if "_nprobe_1_" in item and "_v1" in item:
334
+ file_name = item.split("_nprobe_1_")[1].split(
335
+ "_v1"
336
+ )[0]
337
+ if file_name != model_name:
338
+ new_file_name = (
339
+ item.split("_nprobe_1_")[0]
340
+ + "_nprobe_1_"
341
+ + model_name
342
+ + "_v1"
343
+ )
344
+ os.rename(
345
+ os.path.join(extract_folder_path, item),
346
+ os.path.join(
347
+ extract_folder_path,
348
+ new_file_name + ".index",
349
+ ),
350
+ )
351
+ else:
352
+ if "_nprobe_1_" in item and "_v2" in item:
353
+ file_name = item.split("_nprobe_1_")[1].split(
354
+ "_v2"
355
+ )[0]
356
+ if file_name != model_name:
357
+ new_file_name = (
358
+ item.split("_nprobe_1_")[0]
359
+ + "_nprobe_1_"
360
+ + model_name
361
+ + "_v2"
362
+ )
363
+ os.rename(
364
+ os.path.join(extract_folder_path, item),
365
+ os.path.join(
366
+ extract_folder_path,
367
+ new_file_name + ".index",
368
+ ),
369
+ )
370
+
371
+ if success:
372
+ print(f"Model {model_name} downloaded!")
373
+ else:
374
+ print(f"Error downloading {model_name}")
375
+ return "Error"
376
+ if extract_folder_path == "":
377
+ print("Zip file was not found.")
378
+ return "Error"
379
+ result = search_pth_index(extract_folder_path)
380
+ return result
381
+ else:
382
+ return "Error"
383
+ except Exception as error:
384
+ print(f"An unexpected error occurred: {error}")
385
+ return "Error"
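Note: a minimal usage sketch for the download pipeline (illustrative, not part of the uploaded file; the URL is a placeholder):

# Usage sketch for model_download_pipeline (illustrative only, not in the diff).
from programs.applio_code.rvc.lib.tools.model_download import model_download_pipeline

result = model_download_pipeline("https://huggingface.co/<user>/<repo>/resolve/main/model.zip")
if result == "Error":
    print("download or extraction failed")
else:
    pth_paths, index_paths = result  # lists produced by search_pth_index()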
programs/applio_code/rvc/lib/tools/prerequisites_download.py ADDED
@@ -0,0 +1,164 @@
1
+ import os
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from tqdm import tqdm
4
+ import requests
5
+
6
+ url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
7
+
8
+ pretraineds_v1_list = [
9
+ (
10
+ "pretrained_v1/",
11
+ [
12
+ "D32k.pth",
13
+ "D40k.pth",
14
+ "D48k.pth",
15
+ "G32k.pth",
16
+ "G40k.pth",
17
+ "G48k.pth",
18
+ "f0D32k.pth",
19
+ "f0D40k.pth",
20
+ "f0D48k.pth",
21
+ "f0G32k.pth",
22
+ "f0G40k.pth",
23
+ "f0G48k.pth",
24
+ ],
25
+ )
26
+ ]
27
+ pretraineds_v2_list = [
28
+ (
29
+ "pretrained_v2/",
30
+ [
31
+ "D32k.pth",
32
+ "D40k.pth",
33
+ "D48k.pth",
34
+ "G32k.pth",
35
+ "G40k.pth",
36
+ "G48k.pth",
37
+ "f0D32k.pth",
38
+ "f0D40k.pth",
39
+ "f0D48k.pth",
40
+ "f0G32k.pth",
41
+ "f0G40k.pth",
42
+ "f0G48k.pth",
43
+ ],
44
+ )
45
+ ]
46
+ models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
47
+ embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])]
48
+ linux_executables_list = [("formant/", ["stftpitchshift"])]
49
+ executables_list = [
50
+ ("", ["ffmpeg.exe", "ffprobe.exe"]),
51
+ ("formant/", ["stftpitchshift.exe"]),
52
+ ]
53
+
54
+ folder_mapping_list = {
55
+ "pretrained_v1/": "programs/applio_code/rvc/models/pretraineds/pretrained_v1/",
56
+ "pretrained_v2/": "programs/applio_code/rvc/models/pretraineds/pretrained_v2/",
57
+ "embedders/contentvec/": "programs/applio_code/rvc/models/embedders/contentvec/",
58
+ "predictors/": "programs/applio_code/rvc/models/predictors/",
59
+ "formant/": "programs/applio_code/rvc/models/formant/",
60
+ }
61
+
62
+
63
+ def get_file_size_if_missing(file_list):
64
+ """
65
+ Calculate the total size of files to be downloaded only if they do not exist locally.
66
+ """
67
+ total_size = 0
68
+ for remote_folder, files in file_list:
69
+ local_folder = folder_mapping_list.get(remote_folder, "")
70
+ for file in files:
71
+ destination_path = os.path.join(local_folder, file)
72
+ if not os.path.exists(destination_path):
73
+ url = f"{url_base}/{remote_folder}{file}"
74
+ response = requests.head(url)
75
+ total_size += int(response.headers.get("content-length", 0))
76
+ return total_size
77
+
78
+
79
+ def download_file(url, destination_path, global_bar):
80
+ """
81
+ Download a file from the given URL to the specified destination path,
82
+ updating the global progress bar as data is downloaded.
83
+ """
84
+
85
+ dir_name = os.path.dirname(destination_path)
86
+ if dir_name:
87
+ os.makedirs(dir_name, exist_ok=True)
88
+ response = requests.get(url, stream=True)
89
+ block_size = 1024
90
+ with open(destination_path, "wb") as file:
91
+ for data in response.iter_content(block_size):
92
+ file.write(data)
93
+ global_bar.update(len(data))
94
+
95
+
96
+ def download_mapping_files(file_mapping_list, global_bar):
97
+ """
98
+ Download all files in the provided file mapping list using a thread pool executor,
99
+ and update the global progress bar as downloads progress.
100
+ """
101
+ with ThreadPoolExecutor() as executor:
102
+ futures = []
103
+ for remote_folder, file_list in file_mapping_list:
104
+ local_folder = folder_mapping_list.get(remote_folder, "")
105
+ for file in file_list:
106
+ destination_path = os.path.join(local_folder, file)
107
+ if not os.path.exists(destination_path):
108
+ url = f"{url_base}/{remote_folder}{file}"
109
+ futures.append(
110
+ executor.submit(
111
+ download_file, url, destination_path, global_bar
112
+ )
113
+ )
114
+ for future in futures:
115
+ future.result()
116
+
117
+
118
+ def calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe):
119
+ """
120
+ Calculate the total size of all files to be downloaded based on selected categories.
121
+ """
122
+ total_size = 0
123
+ if models:
124
+ total_size += get_file_size_if_missing(models_list)
125
+ total_size += get_file_size_if_missing(embedders_list)
126
+ if exe:
127
+ total_size += get_file_size_if_missing(
128
+ executables_list if os.name == "nt" else linux_executables_list
129
+ )
130
+ if pretraineds_v1:
131
+ total_size += get_file_size_if_missing(pretraineds_v1_list)
132
+ if pretraineds_v2:
133
+ total_size += get_file_size_if_missing(pretraineds_v2_list)
134
+ return total_size
135
+
136
+
137
+ def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe):
138
+ """
139
+ Manage the download pipeline for different categories of files.
140
+ """
141
+ total_size = calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe)
142
+
143
+ if total_size > 0:
144
+ with tqdm(
145
+ total=total_size, unit="iB", unit_scale=True, desc="Downloading all files"
146
+ ) as global_bar:
147
+ if models:
148
+ download_mapping_files(models_list, global_bar)
149
+ download_mapping_files(embedders_list, global_bar)
150
+ if exe:
151
+ download_mapping_files(
152
+ executables_list if os.name == "nt" else linux_executables_list,
153
+ global_bar,
154
+ )
155
+ if pretraineds_v1:
156
+ download_mapping_files(pretraineds_v1_list, global_bar)
157
+ if pretraineds_v2:
158
+ download_mapping_files(pretraineds_v2_list, global_bar)
159
+ else:
160
+ pass
161
+
162
+
163
+ if __name__ == "__main__":
164
+ prequisites_download_pipeline(False, False, True, False)
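Note: the positional flags select which asset groups to fetch: pretraineds_v1, pretraineds_v2, models (predictors plus the contentvec embedder) and exe (ffmpeg/stftpitchshift binaries). A hedged call sketch:

# Sketch: fetch only the predictor/embedder models, skipping pretrained weights and executables.
from programs.applio_code.rvc.lib.tools.prerequisites_download import (
    prequisites_download_pipeline,
)

prequisites_download_pipeline(False, False, True, False)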
programs/applio_code/rvc/lib/tools/pretrained_selector.py ADDED
@@ -0,0 +1,63 @@
1
+ def pretrained_selector(pitch_guidance):
2
+ if pitch_guidance:
3
+ return {
4
+ "v1": {
5
+ 32000: (
6
+ "rvc/models/pretraineds/pretrained_v1/f0G32k.pth",
7
+ "rvc/models/pretraineds/pretrained_v1/f0D32k.pth",
8
+ ),
9
+ 40000: (
10
+ "rvc/models/pretraineds/pretrained_v1/f0G40k.pth",
11
+ "rvc/models/pretraineds/pretrained_v1/f0D40k.pth",
12
+ ),
13
+ 48000: (
14
+ "rvc/models/pretraineds/pretrained_v1/f0G48k.pth",
15
+ "rvc/models/pretraineds/pretrained_v1/f0D48k.pth",
16
+ ),
17
+ },
18
+ "v2": {
19
+ 32000: (
20
+ "rvc/models/pretraineds/pretrained_v2/f0G32k.pth",
21
+ "rvc/models/pretraineds/pretrained_v2/f0D32k.pth",
22
+ ),
23
+ 40000: (
24
+ "rvc/models/pretraineds/pretrained_v2/f0G40k.pth",
25
+ "rvc/models/pretraineds/pretrained_v2/f0D40k.pth",
26
+ ),
27
+ 48000: (
28
+ "rvc/models/pretraineds/pretrained_v2/f0G48k.pth",
29
+ "rvc/models/pretraineds/pretrained_v2/f0D48k.pth",
30
+ ),
31
+ },
32
+ }
33
+ elif pitch_guidance == False:
34
+ return {
35
+ "v1": {
36
+ 32000: (
37
+ "rvc/models/pretraineds/pretrained_v1/G32k.pth",
38
+ "rvc/models/pretraineds/pretrained_v1/D32k.pth",
39
+ ),
40
+ 40000: (
41
+ "rvc/models/pretraineds/pretrained_v1/G40k.pth",
42
+ "rvc/models/pretraineds/pretrained_v1/D40k.pth",
43
+ ),
44
+ 48000: (
45
+ "rvc/models/pretraineds/pretrained_v1/G48k.pth",
46
+ "rvc/models/pretraineds/pretrained_v1/D48k.pth",
47
+ ),
48
+ },
49
+ "v2": {
50
+ 32000: (
51
+ "rvc/models/pretraineds/pretrained_v2/G32k.pth",
52
+ "rvc/models/pretraineds/pretrained_v2/D32k.pth",
53
+ ),
54
+ 40000: (
55
+ "rvc/models/pretraineds/pretrained_v2/G40k.pth",
56
+ "rvc/models/pretraineds/pretrained_v2/D40k.pth",
57
+ ),
58
+ 48000: (
59
+ "rvc/models/pretraineds/pretrained_v2/G48k.pth",
60
+ "rvc/models/pretraineds/pretrained_v2/D48k.pth",
61
+ ),
62
+ },
63
+ }
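
A brief usage sketch of the selector above: the return value is a nested dict keyed by model version and sample rate, so a lookup might read as follows (import path assumed from this commit's layout):

    # Illustrative usage only; not part of this commit.
    from programs.applio_code.rvc.lib.tools.pretrained_selector import (
        pretrained_selector,
    )

    # Generator/discriminator pair for v2 pretraineds at 40 kHz with pitch guidance.
    pretrained_G, pretrained_D = pretrained_selector(True)["v2"][40000]
    print(pretrained_G)  # rvc/models/pretraineds/pretrained_v2/f0G40k.pth
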
programs/applio_code/rvc/lib/tools/split_audio.py ADDED
@@ -0,0 +1,107 @@
+ from pydub.silence import detect_nonsilent
+ from pydub import AudioSegment
+ import numpy as np
+ import re
+ import os
+
+ from programs.applio_code.rvc.lib.utils import format_title
+
+
+ def process_audio(file_path):
+     try:
+         # load audio file
+         song = AudioSegment.from_file(file_path)
+
+         # set silence threshold and duration
+         silence_thresh = -70  # dB
+         min_silence_len = 750  # ms, adjust as needed
+
+         # detect nonsilent parts
+         nonsilent_parts = detect_nonsilent(
+             song, min_silence_len=min_silence_len, silence_thresh=silence_thresh
+         )
+
+         # Create a new directory to store chunks
+         file_dir = os.path.dirname(file_path)
+         file_name = os.path.basename(file_path).split(".")[0]
+         file_name = format_title(file_name)
+         new_dir_path = os.path.join(file_dir, file_name)
+         os.makedirs(new_dir_path, exist_ok=True)
+
+         # Check if timestamps file exists; if so, delete it
+         timestamps_file = os.path.join(file_dir, f"{file_name}_timestamps.txt")
+         if os.path.isfile(timestamps_file):
+             os.remove(timestamps_file)
+
+         # export chunks and save start times
+         segment_count = 0
+         for i, (start_i, end_i) in enumerate(nonsilent_parts):
+             chunk = song[start_i:end_i]
+             chunk_file_path = os.path.join(new_dir_path, f"chunk{i}.wav")
+             chunk.export(chunk_file_path, format="wav")
+
+             print(f"Segment {i} created!")
+             segment_count += 1
+
+             # write start times to file
+             with open(timestamps_file, "a", encoding="utf-8") as f:
+                 f.write(f"{chunk_file_path} starts at {start_i} ms\n")
+
+         print(f"Total segments created: {segment_count}")
+         print(f"Split all chunks for {file_path} successfully!")
+
+         return "Finish", new_dir_path
+
+     except Exception as error:
+         print(f"An error occurred splitting the audio: {error}")
+         return "Error", None
+
+
+ def merge_audio(timestamps_file):
+     try:
+         # Extract prefix from the timestamps filename
+         prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "")
+         timestamps_dir = os.path.dirname(timestamps_file)
+
+         # Open the timestamps file
+         with open(timestamps_file, "r", encoding="utf-8") as f:
+             lines = f.readlines()
+
+         # Initialize empty list to hold audio segments
+         audio_segments = []
+         last_end_time = 0
+
+         print(f"Processing file: {timestamps_file}")
+
+         for line in lines:
+             # Extract filename and start time from line
+             match = re.search(r"(chunk\d+\.wav) starts at (\d+) ms", line)
+             if match:
+                 filename, start_time = match.groups()
+                 start_time = int(start_time)
+
+                 # Construct the complete path to the chunk file
+                 chunk_file = os.path.join(timestamps_dir, prefix, filename)
+
+                 # Add silence from last_end_time to start_time
+                 silence_duration = max(start_time - last_end_time, 0)
+                 silence = AudioSegment.silent(duration=silence_duration)
+                 audio_segments.append(silence)
+
+                 # Load audio file and append to list
+                 audio = AudioSegment.from_wav(chunk_file)
+                 audio_segments.append(audio)
+
+                 # Update last_end_time
+                 last_end_time = start_time + len(audio)
+
+                 print(f"Processed chunk: {chunk_file}")
+
+         # Concatenate all audio_segments and return raw samples with the frame rate
+         merged_audio = sum(audio_segments)
+         merged_audio_np = np.array(merged_audio.get_array_of_samples())
+         return merged_audio.frame_rate, merged_audio_np
+
+     except Exception as error:
+         print(f"An error occurred merging the audio: {error}")
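
Taken together, process_audio splits a file on silence and records chunk start times, and merge_audio rebuilds the waveform from that timestamps file. A minimal usage sketch, with a hypothetical input path and the import path assumed from this commit's layout:

    # Illustrative usage only; not part of this commit.
    from programs.applio_code.rvc.lib.tools.split_audio import process_audio, merge_audio

    status, chunks_dir = process_audio("input/song.wav")  # hypothetical input file
    if status == "Finish":
        # process_audio writes <name>_timestamps.txt next to the input file
        sample_rate, merged_samples = merge_audio("input/song_timestamps.txt")
        print(sample_rate, merged_samples.shape)
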
programs/applio_code/rvc/lib/tools/tts.py ADDED
@@ -0,0 +1,20 @@
+ import sys
+ import asyncio
+ import edge_tts
+
+
+ async def main():
+     # Parse command line arguments
+     text = str(sys.argv[1])
+     voice = str(sys.argv[2])
+     rate = int(sys.argv[3])
+     output_file = str(sys.argv[4])
+
+     rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
+
+     await edge_tts.Communicate(text, voice, rate=rates).save(output_file)
+     print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
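
This script is driven purely by positional command-line arguments (text, voice, rate in percent, output path), so an invocation might look like the line below; the voice name and output path are placeholders:

    # Illustrative invocation only; not part of this commit.
    # python programs/applio_code/rvc/lib/tools/tts.py "Hello world" en-US-AriaNeural 0 logs/tts_output.wav
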
programs/applio_code/rvc/lib/tools/tts_voices.json ADDED
The diff for this file is too large to render. See raw diff
 
programs/applio_code/rvc/lib/utils.py ADDED
@@ -0,0 +1,116 @@
+ import os, sys
+ import librosa
+ import soundfile as sf
+ import re
+ import unicodedata
+ import wget
+ from torch import nn
+
+ import logging
+ from transformers import HubertModel
+ import warnings
+
+ # Remove this to see warnings about transformers models
+ warnings.filterwarnings("ignore")
+
+ logging.getLogger("fairseq").setLevel(logging.ERROR)
+ logging.getLogger("faiss.loader").setLevel(logging.ERROR)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+ logging.getLogger("torch").setLevel(logging.ERROR)
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift")
+ stft = base_path + ".exe" if sys.platform == "win32" else base_path
+
+
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+ def load_audio(file, sample_rate):
+     try:
+         file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         audio, sr = sf.read(file)
+         if len(audio.shape) > 1:
+             audio = librosa.to_mono(audio.T)
+         if sr != sample_rate:
+             audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
+     except Exception as error:
+         raise RuntimeError(f"An error occurred loading the audio: {error}")
+
+     return audio.flatten()
+
+
+ def load_audio_infer(file, sample_rate):
+     file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+     if not os.path.isfile(file):
+         raise FileNotFoundError(f"File not found: {file}")
+     audio, sr = sf.read(file)
+     if len(audio.shape) > 1:
+         audio = librosa.to_mono(audio.T)
+     if sr != sample_rate:
+         audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
+     return audio.flatten()
+
+
+ def format_title(title):
+     formatted_title = (
+         unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
+     )
+     formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
+     formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
+     formatted_title = re.sub(r"\s+", "_", formatted_title)
+     return formatted_title
+
+
+ def load_embedding(embedder_model, custom_embedder=None):
+     embedder_root = os.path.join(
+         now_dir, "programs", "applio_code", "rvc", "models", "embedders"
+     )
+     embedding_list = {
+         "contentvec": os.path.join(embedder_root, "contentvec"),
+         "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
+         "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
+         "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
+     }
+
+     online_embedders = {
+         "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
+         "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
+         "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
+         "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
+     }
+
+     config_files = {
+         "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
+         "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
+         "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
+         "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
+     }
+
+     if embedder_model == "custom":
+         if os.path.exists(custom_embedder):
+             model_path = custom_embedder
+         else:
+             print(f"Custom embedder not found: {custom_embedder}, using contentvec")
+             model_path = embedding_list["contentvec"]
+     else:
+         model_path = embedding_list[embedder_model]
+         bin_file = os.path.join(model_path, "pytorch_model.bin")
+         json_file = os.path.join(model_path, "config.json")
+         os.makedirs(model_path, exist_ok=True)
+         if not os.path.exists(bin_file):
+             url = online_embedders[embedder_model]
+             print(f"Downloading {url} to {model_path}...")
+             wget.download(url, out=bin_file)
+         if not os.path.exists(json_file):
+             url = config_files[embedder_model]
+             print(f"Downloading {url} to {model_path}...")
+             wget.download(url, out=json_file)
+
+     models = HubertModelWithFinalProj.from_pretrained(model_path)
+     return models
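
A short usage sketch of the helpers above: load_audio returns a mono float waveform resampled to the requested rate, and load_embedding returns the chosen HuBERT-style embedder, downloading its weights on first use if they are missing. The audio path below is hypothetical:

    # Illustrative usage only; not part of this commit.
    from programs.applio_code.rvc.lib.utils import load_audio, load_embedding

    audio = load_audio("samples/voice.wav", 16000)  # hypothetical input file
    hubert = load_embedding("contentvec")           # uses the bundled contentvec model
    print(audio.shape, hubert.config.hidden_size)
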
programs/applio_code/rvc/models/embedders/contentvec/config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "activation_dropout": 0.1,
+   "apply_spec_augment": true,
+   "architectures": [
+     "HubertModelWithFinalProj"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.0,
+   "feat_proj_layer_norm": true,
+   "final_dropout": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "hubert",
+   "num_attention_heads": 12,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.3",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32
+ }
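
This is a standard transformers HuBERT configuration (768-dim hidden states with a 256-dim final projection), so the directory added in this commit can be sanity-checked by loading it with HubertConfig:

    # Illustrative check only; not part of this commit.
    from transformers import HubertConfig

    cfg = HubertConfig.from_pretrained(
        "programs/applio_code/rvc/models/embedders/contentvec"
    )
    print(cfg.hidden_size, cfg.classifier_proj_size)  # 768 256
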
programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
+ size 378342945
programs/applio_code/rvc/models/predictors/fcpe.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3a8dd2dbd51baf19ed295006f2ac25dba6dd60adc7ec578ae5fbd94970951da
+ size 69005189
programs/applio_code/rvc/models/predictors/rmvpe.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d62215f4306e3ca278246188607209f09af3dc77ed4232efdd069798c4ec193
+ size 181184272
programs/music_separation_code/ensemble.py ADDED
@@ -0,0 +1,183 @@
+ # coding: utf-8
+ __author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/"
+
+ import os
+ import librosa
+ import soundfile as sf
+ import numpy as np
+ import argparse
+
+
+ def stft(wave, nfft, hl):
+     wave_left = np.asfortranarray(wave[0])
+     wave_right = np.asfortranarray(wave[1])
+     spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
+     spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
+     spec = np.asfortranarray([spec_left, spec_right])
+     return spec
+
+
+ def istft(spec, hl, length):
+     spec_left = np.asfortranarray(spec[0])
+     spec_right = np.asfortranarray(spec[1])
+     wave_left = librosa.istft(spec_left, hop_length=hl, length=length)
+     wave_right = librosa.istft(spec_right, hop_length=hl, length=length)
+     wave = np.asfortranarray([wave_left, wave_right])
+     return wave
+
+
+ def absmax(a, *, axis):
+     dims = list(a.shape)
+     dims.pop(axis)
+     indices = np.ogrid[tuple(slice(0, d) for d in dims)]
+     argmax = np.abs(a).argmax(axis=axis)
+     indices.insert((len(a.shape) + axis) % len(a.shape), argmax)
+     return a[tuple(indices)]
+
+
+ def absmin(a, *, axis):
+     dims = list(a.shape)
+     dims.pop(axis)
+     indices = np.ogrid[tuple(slice(0, d) for d in dims)]
+     argmax = np.abs(a).argmin(axis=axis)
+     indices.insert((len(a.shape) + axis) % len(a.shape), argmax)
+     return a[tuple(indices)]
+
+
+ def lambda_max(arr, axis=None, key=None, keepdims=False):
+     idxs = np.argmax(key(arr), axis)
+     if axis is not None:
+         idxs = np.expand_dims(idxs, axis)
+         result = np.take_along_axis(arr, idxs, axis)
+         if not keepdims:
+             result = np.squeeze(result, axis=axis)
+         return result
+     else:
+         return arr.flatten()[idxs]
+
+
+ def lambda_min(arr, axis=None, key=None, keepdims=False):
+     idxs = np.argmin(key(arr), axis)
+     if axis is not None:
+         idxs = np.expand_dims(idxs, axis)
+         result = np.take_along_axis(arr, idxs, axis)
+         if not keepdims:
+             result = np.squeeze(result, axis=axis)
+         return result
+     else:
+         return arr.flatten()[idxs]
+
+
+ def average_waveforms(pred_track, weights, algorithm):
+     """
+     :param pred_track: shape = (num, channels, length)
+     :param weights: shape = (num, )
+     :param algorithm: One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft
+     :return: averaged waveform in shape (channels, length)
+     """
+
+     pred_track = np.array(pred_track)
+     final_length = pred_track.shape[-1]
+
+     mod_track = []
+     for i in range(pred_track.shape[0]):
+         if algorithm == "avg_wave":
+             mod_track.append(pred_track[i] * weights[i])
+         elif algorithm in ["median_wave", "min_wave", "max_wave"]:
+             mod_track.append(pred_track[i])
+         elif algorithm in ["avg_fft", "min_fft", "max_fft", "median_fft"]:
+             spec = stft(pred_track[i], nfft=2048, hl=1024)
+             if algorithm in ["avg_fft"]:
+                 mod_track.append(spec * weights[i])
+             else:
+                 mod_track.append(spec)
+     pred_track = np.array(mod_track)
+
+     if algorithm in ["avg_wave"]:
+         pred_track = pred_track.sum(axis=0)
+         pred_track /= np.array(weights).sum().T
+     elif algorithm in ["median_wave"]:
+         pred_track = np.median(pred_track, axis=0)
+     elif algorithm in ["min_wave"]:
+         pred_track = np.array(pred_track)
+         pred_track = lambda_min(pred_track, axis=0, key=np.abs)
+     elif algorithm in ["max_wave"]:
+         pred_track = np.array(pred_track)
+         pred_track = lambda_max(pred_track, axis=0, key=np.abs)
+     elif algorithm in ["avg_fft"]:
+         pred_track = pred_track.sum(axis=0)
+         pred_track /= np.array(weights).sum()
+         pred_track = istft(pred_track, 1024, final_length)
+     elif algorithm in ["min_fft"]:
+         pred_track = np.array(pred_track)
+         pred_track = lambda_min(pred_track, axis=0, key=np.abs)
+         pred_track = istft(pred_track, 1024, final_length)
+     elif algorithm in ["max_fft"]:
+         pred_track = np.array(pred_track)
+         pred_track = absmax(pred_track, axis=0)
+         pred_track = istft(pred_track, 1024, final_length)
+     elif algorithm in ["median_fft"]:
+         pred_track = np.array(pred_track)
+         pred_track = np.median(pred_track, axis=0)
+         pred_track = istft(pred_track, 1024, final_length)
+     return pred_track
+
+
+ def ensemble_files(args):
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--files",
+         type=str,
+         required=True,
+         nargs="+",
+         help="Path to all audio-files to ensemble",
+     )
+     parser.add_argument(
+         "--type",
+         type=str,
+         default="avg_wave",
+         help="One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft",
+     )
+     parser.add_argument(
+         "--weights",
+         type=float,
+         nargs="+",
+         help="Weights to create ensemble. Number of weights must be equal to number of files",
+     )
+     parser.add_argument(
+         "--output",
+         default="res.wav",
+         type=str,
+         help="Path to wav file where ensemble result will be stored",
+     )
+     if args is None:
+         args = parser.parse_args()
+     else:
+         args = parser.parse_args(args)
+
+     print("Ensemble type: {}".format(args.type))
+     print("Number of input files: {}".format(len(args.files)))
+     if args.weights is not None:
+         weights = args.weights
+     else:
+         weights = np.ones(len(args.files))
+     print("Weights: {}".format(weights))
+     print("Output file: {}".format(args.output))
+     data = []
+     for f in args.files:
+         if not os.path.isfile(f):
+             print("Error. Can't find file: {}. Check paths.".format(f))
+             exit()
+         print("Reading file: {}".format(f))
+         wav, sr = librosa.load(f, sr=None, mono=False)
+         # wav, sr = sf.read(f)
+         print("Waveform shape: {} sample rate: {}".format(wav.shape, sr))
+         data.append(wav)
+     data = np.array(data)
+     res = average_waveforms(data, weights, args.type)
+     print("Result shape: {}".format(res.shape))
+     sf.write(args.output, res.T, sr, "FLOAT")
+
+
+ if __name__ == "__main__":
+     ensemble_files(None)
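
ensemble_files accepts either None (to read sys.argv) or an explicit argument list, so it can also be called programmatically, as sketched below; the audio file names are placeholders:

    # Illustrative usage only; not part of this commit.
    from programs.music_separation_code.ensemble import ensemble_files

    ensemble_files([
        "--files", "vocals_model_a.wav", "vocals_model_b.wav",  # hypothetical inputs
        "--type", "avg_wave",
        "--weights", "1.0", "2.0",
        "--output", "vocals_ensemble.wav",
    ])
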