Upload completed files
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
- .github/workflows/code_formatter.yml +51 -0
- .gitignore +166 -0
- LICENSE.md +25 -0
- README.md +43 -3
- assets/config.json +6 -0
- assets/i18n/i18n.py +52 -0
- assets/i18n/languages/en_US.json +89 -0
- assets/i18n/languages/pt_BR.json +89 -0
- assets/i18n/scan.py +71 -0
- core.py +1023 -0
- logs/.gitkeep +0 -0
- main.py +53 -0
- programs/applio_code/rvc/configs/config.py +192 -0
- programs/applio_code/rvc/configs/v1/32000.json +47 -0
- programs/applio_code/rvc/configs/v1/40000.json +47 -0
- programs/applio_code/rvc/configs/v1/48000.json +47 -0
- programs/applio_code/rvc/configs/v2/32000.json +43 -0
- programs/applio_code/rvc/configs/v2/40000.json +43 -0
- programs/applio_code/rvc/configs/v2/48000.json +43 -0
- programs/applio_code/rvc/infer/infer.py +470 -0
- programs/applio_code/rvc/infer/pipeline.py +701 -0
- programs/applio_code/rvc/lib/algorithm/__init__.py +0 -0
- programs/applio_code/rvc/lib/algorithm/attentions.py +292 -0
- programs/applio_code/rvc/lib/algorithm/commons.py +225 -0
- programs/applio_code/rvc/lib/algorithm/discriminators.py +199 -0
- programs/applio_code/rvc/lib/algorithm/encoders.py +219 -0
- programs/applio_code/rvc/lib/algorithm/generators.py +199 -0
- programs/applio_code/rvc/lib/algorithm/modules.py +130 -0
- programs/applio_code/rvc/lib/algorithm/normalization.py +31 -0
- programs/applio_code/rvc/lib/algorithm/nsf.py +200 -0
- programs/applio_code/rvc/lib/algorithm/residuals.py +309 -0
- programs/applio_code/rvc/lib/algorithm/synthesizers.py +243 -0
- programs/applio_code/rvc/lib/predictors/F0Extractor.py +107 -0
- programs/applio_code/rvc/lib/predictors/FCPE.py +920 -0
- programs/applio_code/rvc/lib/predictors/RMVPE.py +569 -0
- programs/applio_code/rvc/lib/tools/analyzer.py +76 -0
- programs/applio_code/rvc/lib/tools/gdown.py +354 -0
- programs/applio_code/rvc/lib/tools/launch_tensorboard.py +21 -0
- programs/applio_code/rvc/lib/tools/model_download.py +385 -0
- programs/applio_code/rvc/lib/tools/prerequisites_download.py +164 -0
- programs/applio_code/rvc/lib/tools/pretrained_selector.py +63 -0
- programs/applio_code/rvc/lib/tools/split_audio.py +107 -0
- programs/applio_code/rvc/lib/tools/tts.py +20 -0
- programs/applio_code/rvc/lib/tools/tts_voices.json +0 -0
- programs/applio_code/rvc/lib/utils.py +116 -0
- programs/applio_code/rvc/models/embedders/contentvec/config.json +71 -0
- programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin +3 -0
- programs/applio_code/rvc/models/predictors/fcpe.pt +3 -0
- programs/applio_code/rvc/models/predictors/rmvpe.pt +3 -0
- programs/music_separation_code/ensemble.py +183 -0
.github/workflows/code_formatter.yml
ADDED
@@ -0,0 +1,51 @@
+name: Code Formatter
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  push_format:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{github.ref_name}}
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Black
+        run: pip install "black[jupyter]"
+
+      - name: Run Black
+        # run: black $(git ls-files '*.py')
+        run: black . --exclude=".*\.ipynb$"
+
+      - name: Commit Back
+        continue-on-error: true
+        id: commitback
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add --all
+          git commit -m "chore(format): run black on ${{github.ref_name}}"
+
+      - name: Create Pull Request
+        if: steps.commitback.outcome == 'success'
+        continue-on-error: true
+        uses: peter-evans/create-pull-request@v5
+        with:
+          delete-branch: true
+          body: "Automatically apply code formatter change"
+          title: "chore(format): run black on ${{github.ref_name}}"
+          commit-message: "chore(format): run black on ${{github.ref_name}}"
+          branch: formatter/${{github.ref_name}}
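Since the workflow only formats code after it lands on main, contributors may want to run the same check locally before pushing. A minimal sketch of the equivalent steps, driven from Python for consistency with the rest of the repo (assumes `pip` and `black` are on `PATH`; nothing here is pinned by the workflow itself):

```python
# Sketch: reproduce the "Install Black" and "Run Black" steps locally.
import subprocess

# Same package extra the workflow installs.
subprocess.run(["pip", "install", "black[jupyter]"], check=True)
# Same invocation as the workflow: format everything except notebooks.
subprocess.run(["black", ".", "--exclude", r".*\.ipynb$"], check=True)
```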
.gitignore
ADDED
@@ -0,0 +1,166 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+#   .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# mine
+.flac
+.pth
+.pt
LICENSE.md
ADDED
@@ -0,0 +1,25 @@
+
+
+## KindaHex Non-Commercial Use License (HNCU)
+
+
+This repository is licensed under the **KindaHex Non-Commercial Use License (HNCU)**. By using, modifying, or distributing any content from this repository, you agree to the terms outlined below.
+
+### Terms of Use:
+1. **Non-Commercial Use Only**: You are permitted to use, modify, and distribute the contents of this repository **only for non-commercial purposes**. Commercial use, including selling, licensing, or distributing for profit, is strictly prohibited.
+
+2. **Modification and Derivative Works**: You may modify the contents of this repository and create derivative works. However, any modification or derivative work must also adhere to the non-commercial restriction and be subject to the terms of this license.
+
+3. **Attribution**: When using or distributing the content (either as-is or modified), you must provide proper attribution to the original creator (blane187gt) in a manner that is reasonable and customary for the medium.
+
+4. **No Warranty**: The content in this repository is provided "as-is," without any warranty, express or implied, including but not limited to warranties of merchantability or fitness for a particular purpose.
+
+5. **Compliance with Laws**: You are responsible for ensuring that your use of the content complies with all applicable laws and regulations.
+
+6. **Termination**: If you violate any of the terms of this license, your rights to use the repository's content will be automatically terminated. You must cease all use and distribution of the content immediately upon termination.
+
+### Restrictions:
+- You may **not** use this repository's content for commercial gain, including but not limited to creating products, services, or tools that are sold or monetized.
+- You may **not** sublicense or transfer rights to third parties for commercial purposes.
+- You may not use the content in any manner that competes with the original repository or its creator.
+
README.md
CHANGED
@@ -1,3 +1,43 @@
-
-
-
+# HexGen RVC
+
+
+https://colab.research.google.com/drive/1dmGS0vEWuX55Z1w1tSRD6lJDV8s2deY0?usp=sharing
+
+HexGen RVC is a tool designed for generating high-quality AI vocal covers using advanced source separation, vocal modeling, and audio processing techniques. This project builds on several community-driven efforts, integrating the best tools and frameworks available for music and vocal manipulation.
+
+## Features
+- **AI-Driven Vocal Cover Generation**: Produce custom vocal covers with ease.
+- **Source Separation**: Isolate instrumentals and vocals from any track.
+- **Efficient Workflow**: Streamlined integration with popular tools for music processing.
+- **Colab Support**: Easily deploy and test models in Google Colab environments.
+
+## Installation
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/blane187gt/hexGen-RVC.git
+   cd hexGen-RVC
+   ```
+2. Follow specific setup instructions provided in the [documentation](https://github.com/blane187gt/hexGen-RVC/wiki) (if available) or in the code comments.
+
+## Usage
+1. Prepare your audio input file(s) and place them in the appropriate folder.
+2. Run the script or Colab notebook as per the instructions.
+3. Customize the output by tweaking the parameters and models used.
+
+## Credits
+This project would not have been possible without the contributions and support of the following tools and creators:
+
+- [Audio Separator](https://github.com/karaokenerds/python-audio-separator) by [Andrew Beveridge](https://github.com/beveradb)
+- [Applio](https://github.com/IAHispano/Applio) by [IAHispano](https://github.com/IAHispano)
+- [yt-dlp](https://github.com/yt-dlp/yt-dlp)
+- [Ultimate Vocal Remover GUI](https://github.com/Anjok07/ultimatevocalremovergui) by [Anjok07](https://github.com/Anjok07)
+- [Music Source Separation Universal Training Code](https://github.com/ZFTurbo/Music-Source-Separation-Training) by [ZFTurbo](https://github.com/ZFTurbo)
+- [AICoverGen](https://github.com/SociallyIneptWeeb/AICoverGen) by [SociallyIneptWeeb](https://github.com/SociallyIneptWeeb)
+- [FullmatheusBallZ](https://www.youtube.com/@FullmatheusBallZ) for testing the Colab scripts.
+- [Shirou](https://github.com/ShiromiyaG), the original project inspiration.
+
+## Contributing
+Feel free to submit pull requests or create issues for any improvements or bugs you encounter. Contributions are always welcome!
+
+## License
+This project is licensed under the terms specified in the `LICENSE` file. Ensure compliance with third-party dependencies when using or modifying this project.
assets/config.json
ADDED
@@ -0,0 +1,6 @@
+{
+    "lang": {
+        "override": false,
+        "selected_lang": "en_US"
+    }
+}
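The loader in `assets/i18n/i18n.py` (next file) treats this file as follows: `"selected_lang"` only takes effect when `"override"` is true; otherwise the system locale is auto-detected. A minimal sketch of pinning the UI to pt_BR by editing the file programmatically (run from the repo root; the schema is exactly the one above):

```python
# Sketch: pin the UI language to pt_BR by enabling the override flag.
import json

with open("assets/config.json", "r", encoding="utf8") as f:
    config = json.load(f)

config["lang"]["override"] = True          # skip locale auto-detection
config["lang"]["selected_lang"] = "pt_BR"  # must match a file in assets/i18n/languages/

with open("assets/config.json", "w", encoding="utf8") as f:
    json.dump(config, f, indent=4)
```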
assets/i18n/i18n.py
ADDED
@@ -0,0 +1,52 @@
+import os, sys
+import json
+from pathlib import Path
+from locale import getdefaultlocale
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+
+class I18nAuto:
+    LANGUAGE_PATH = os.path.join(now_dir, "assets", "i18n", "languages")
+
+    def __init__(self, language=None):
+        with open(
+            os.path.join(now_dir, "assets", "config.json"), "r", encoding="utf8"
+        ) as file:
+            config = json.load(file)
+            override = config["lang"]["override"]
+            lang_prefix = config["lang"]["selected_lang"]
+
+        self.language = lang_prefix
+
+        if override == False:
+            language = language or getdefaultlocale()[0]
+            lang_prefix = language[:2] if language is not None else "en"
+            available_languages = self._get_available_languages()
+            matching_languages = [
+                lang for lang in available_languages if lang.startswith(lang_prefix)
+            ]
+            self.language = matching_languages[0] if matching_languages else "en_US"
+
+        self.language_map = self._load_language_list()
+
+    def _load_language_list(self):
+        try:
+            file_path = Path(self.LANGUAGE_PATH) / f"{self.language}.json"
+            with open(file_path, "r", encoding="utf-8") as file:
+                return json.load(file)
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"Failed to load language file for {self.language}. Check if the correct .json file exists."
+            )
+
+    def _get_available_languages(self):
+        language_files = [path.stem for path in Path(self.LANGUAGE_PATH).glob("*.json")]
+        return language_files
+
+    def _language_exists(self, language):
+        return (Path(self.LANGUAGE_PATH) / f"{language}.json").exists()
+
+    def __call__(self, key):
+        return self.language_map.get(key, key)
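A short usage sketch of the class above (the import path is assumed from the repo layout): instantiate once, then call the instance like a function. Because `__call__` uses `dict.get(key, key)`, unknown keys fall back to the untranslated string rather than raising.

```python
# Sketch: how the app is expected to consume I18nAuto.
from assets.i18n.i18n import I18nAuto

i18n = I18nAuto()  # language resolved from assets/config.json or the system locale
print(i18n("Voice Model"))            # translated if the key exists in the language file
print(i18n("Some brand-new string"))  # missing keys echo back unchanged
```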
assets/i18n/languages/en_US.json
ADDED
@@ -0,0 +1,89 @@
+{
+    "Voice Model": "Voice Model",
+    "Select the voice model to use for the conversion.": "Select the voice model to use for the conversion.",
+    "Index File": "Index File",
+    "Select the index file to use for the conversion.": "Select the index file to use for the conversion.",
+    "Refresh": "Refresh",
+    "Unload Voice": "Unload Voice",
+    "Upload Audio": "Upload Audio",
+    "Select Audio": "Select Audio",
+    "Select the audio to convert.": "Select the audio to convert.",
+    "Advanced Settings": "Advanced Settings",
+    "RVC Settings": "RVC Settings",
+    "Output Path": "Output Path",
+    "Enter output path": "Enter output path",
+    "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "The path where the output audio will be saved, by default in audio_files/rvc/output.wav",
+    "Clear Outputs (Deletes all audios in assets/audios)": "Clear Outputs (Deletes all audios in assets/audios)",
+    "Export Format": "Export Format",
+    "Select the format to export the audio.": "Select the format to export the audio.",
+    "Split Audio": "Split Audio",
+    "Split the audio into chunks for inference to obtain better results in some cases.": "Split the audio into chunks for inference to obtain better results in some cases.",
+    "Pitch Extractor": "Pitch Extractor",
+    "Pitch extract Algorith.": "Pitch extract Algorith.",
+    "Hop Length": "Hop Length",
+    "Hop length for pitch extraction.": "Hop length for pitch extraction.",
+    "Embedder Model": "Embedder Model",
+    "Model used for learning speaker embedding.": "Model used for learning speaker embedding.",
+    "Autotune": "Autotune",
+    "Apply a soft autotune to your inferences, recommended for singing conversions.": "Apply a soft autotune to your inferences, recommended for singing conversions.",
+    "Pitch": "Pitch",
+    "Adjust the pitch of the audio.": "Adjust the pitch of the audio.",
+    "Filter Radius": "Filter Radius",
+    "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.",
+    "Search Feature Ratio": "Search Feature Ratio",
+    "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.",
+    "Volume Envelope": "Volume Envelope",
+    "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.": "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.",
+    "Protect Voiceless Consonants": "Protect Voiceless Consonants",
+    "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.",
+    "Audio Separation Settings": "Audio Separation Settings",
+    "Use TTA": "Use TTA",
+    "Use Test Time Augmentation.": "Use Test Time Augmentation.",
+    "Batch Size": "Batch Size",
+    "Set the batch size for the separation.": "Set the batch size for the separation.",
+    "Vocals Model": "Vocals Model",
+    "Select the vocals model to use for the separation.": "Select the vocals model to use for the separation.",
+    "Karaoke Model": "Karaoke Model",
+    "Select the karaoke model to use for the separation.": "Select the karaoke model to use for the separation.",
+    "Dereverb Model": "Dereverb Model",
+    "Select the dereverb model to use for the separation.": "Select the dereverb model to use for the separation.",
+    "Deeecho": "Deeecho",
+    "Apply deeecho to the audio.": "Apply deeecho to the audio.",
+    "Deeecho Model": "Deeecho Model",
+    "Select the deeecho model to use for the separation.": "Select the deeecho model to use for the separation.",
+    "Denoise": "Denoise",
+    "Apply denoise to the audio.": "Apply denoise to the audio.",
+    "Denoise Model": "Denoise Model",
+    "Select the denoise model to use for the separation.": "Select the denoise model to use for the separation.",
+    "Audio post-process Settings": "Audio post-process Settings",
+    "Delete Audios": "Delete Audios",
+    "Delete the audios after the conversion.": "Delete the audios after the conversion.",
+    "Reverb": "Reverb",
+    "Apply reverb to the audio.": "Apply reverb to the audio.",
+    "Reverb Room Size": "Reverb Room Size",
+    "Set the room size of the reverb.": "Set the room size of the reverb.",
+    "Reverb Damping": "Reverb Damping",
+    "Set the damping of the reverb.": "Set the damping of the reverb.",
+    "Reverb Wet Gain": "Reverb Wet Gain",
+    "Set the wet gain of the reverb.": "Set the wet gain of the reverb.",
+    "Reverb Dry Gain": "Reverb Dry Gain",
+    "Set the dry gain of the reverb.": "Set the dry gain of the reverb.",
+    "Reverb Width": "Reverb Width",
+    "Set the width of the reverb.": "Set the width of the reverb.",
+    "Vocals Volume": "Vocals Volume",
+    "Adjust the volume of the vocals.": "Adjust the volume of the vocals.",
+    "Instrumentals Volume": "Instrumentals Volume",
+    "Adjust the volume of the Instrumentals.": "Adjust the volume of the Instrumentals.",
+    "Backing Vocals Volume": "Backing Vocals Volume",
+    "Adjust the volume of the backing vocals.": "Adjust the volume of the backing vocals.",
+    "Device Settings": "Device Settings",
+    "Device": "Device",
+    "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -",
+    "Convert": "Convert",
+    "Output Information": "Output Information",
+    "The output information will be displayed here.": "The output information will be displayed here.",
+    "Export Audio": "Export Audio",
+    "Music URL": "Music URL",
+    "Download": "Download",
+    "Model URL": "Model URL"
+}
assets/i18n/languages/pt_BR.json
ADDED
@@ -0,0 +1,89 @@
+{
+    "Voice Model": "Modelo de Voz",
+    "Select the voice model to use for the conversion.": "Selecione o modelo de voz a ser usado para a conversão.",
+    "Index File": "Arquivo Index",
+    "Select the index file to use for the conversion.": "Selecione o arquivo Index a ser usado para a conversão.",
+    "Refresh": "Atualizar",
+    "Unload Voice": "Descarregar Voz",
+    "Upload Audio": "Carregar Áudio",
+    "Select Audio": "Selecionar Áudio",
+    "Select the audio to convert.": "Selecione o áudio a ser convertido.",
+    "Advanced Settings": "Configurações Avançadas",
+    "RVC Settings": "Configurações RVC",
+    "Output Path": "Caminho de Saída",
+    "Enter output path": "Insira o caminho de saída",
+    "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "O caminho onde o áudio de saída será salvo, por padrão em audio_files/rvc/output.wav",
+    "Clear Outputs (Deletes all audios in assets/audios)": "Limpar Saídas (Exclui todos os áudios em assets/audios)",
+    "Export Format": "Formato de Exportação",
+    "Select the format to export the audio.": "Selecione o formato para exportar o áudio.",
+    "Split Audio": "Dividir Áudio",
+    "Split the audio into chunks for inference to obtain better results in some cases.": "Divida o áudio em partes para inferência para obter melhores resultados em alguns casos.",
+    "Pitch Extractor": "Extrator de Pitch",
+    "Pitch extract Algorith.": "Algoritmo de Extração de Pitch",
+    "Hop Length": "Hop Length",
+    "Hop length for pitch extraction.": "Hop Length para extração de pitch.",
+    "Embedder Model": "Modelo de Embedding",
+    "Model used for learning speaker embedding.": "Modelo usado para aprendizado de embedding de locutor.",
+    "Autotune": "Autotune",
+    "Apply a soft autotune to your inferences, recommended for singing conversions.": "Aplique um autotune suave às suas inferências, recomendado para conversões de canto.",
+    "Pitch": "Pitch",
+    "Adjust the pitch of the audio.": "Ajuste o pitch do áudio.",
+    "Filter Radius": "Raio do Filtro",
+    "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "Se o número for maior ou igual a três, o uso de filtragem mediana nos resultados de tom coletados tem o potencial de diminuir a respiração.",
+    "Search Feature Ratio": "Proporção da Função de Busca",
+    "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influência exercida pelo arquivo de índice; um valor mais alto corresponde a maior influência. No entanto, optar por valores mais baixos pode ajudar a mitigar artefatos presentes no áudio.",
+    "Volume Envelope": "Envelope de Volume",
+    "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.": "Substitua ou misture com o envelope de volume da saída. Quanto mais próximo o valor estiver de 1, mais o envelope de saída será empregado.",
+    "Protect Voiceless Consonants": "Proteger Consoantes Surdas",
+    "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Proteja consoantes distintas e sons de respiração para evitar rasgos eletroacústicos e outros artefatos. Ajustar o parâmetro para seu valor máximo de 0,5 oferece proteção abrangente. No entanto, reduzir esse valor pode diminuir a extensão da proteção enquanto potencialmente mitiga o efeito de indexação.",
+    "Audio Separation Settings": "Configurações de Separação de Áudio",
+    "Use TTA": "Usar TTA",
+    "Use Test Time Augmentation.": "Usar Aumento de Tempo de Teste.",
+    "Batch Size": "Batch Size",
+    "Set the batch size for the separation.": "Defina o Batch Size para a separação.",
+    "Vocals Model": "Modelo de Vocais",
+    "Select the vocals model to use for the separation.": "Selecione o modelo de vocais a ser usado para a separação.",
+    "Karaoke Model": "Modelo de Karaokê",
+    "Select the karaoke model to use for the separation.": "Selecione o modelo de karaokê a ser usado para a separação.",
+    "Dereverb Model": "Modelo de Dereverb",
+    "Select the dereverb model to use for the separation.": "Selecione o modelo de dereverb a ser usado para a separação.",
+    "Deeecho": "Deeecho",
+    "Apply deeecho to the audio.": "Aplicar deeecho ao áudio.",
+    "Deeecho Model": "Modelo de Deeecho",
+    "Select the deeecho model to use for the separation.": "Selecione o modelo de deeecho a ser usado para a separação.",
+    "Denoise": "Redução de Ruído",
+    "Apply denoise to the audio.": "Aplicar redução de ruído ao áudio.",
+    "Denoise Model": "Modelo de Redução de Ruído",
+    "Select the denoise model to use for the separation.": "Selecione o modelo de redução de ruído a ser usado para a separação.",
+    "Audio post-process Settings": "Configurações de Pós-processamento de Áudio",
+    "Delete Audios": "Excluir Áudios",
+    "Delete the audios after the conversion.": "Excluir os áudios após a conversão.",
+    "Reverb": "Reverberação",
+    "Apply reverb to the audio.": "Aplicar reverberação ao áudio.",
+    "Reverb Room Size": "Tamanho da Sala de Reverberação",
+    "Set the room size of the reverb.": "Definir o tamanho da sala de reverberação.",
+    "Reverb Damping": "Amortecimento da Reverberação",
+    "Set the damping of the reverb.": "Definir o amortecimento da reverberação.",
+    "Reverb Wet Gain": "Ganho Molhado da Reverberação",
+    "Set the wet gain of the reverb.": "Definir o ganho molhado da reverberação.",
+    "Reverb Dry Gain": "Ganho Seco da Reverberação",
+    "Set the dry gain of the reverb.": "Definir o ganho seco da reverberação.",
+    "Reverb Width": "Largura da Reverberação",
+    "Set the width of the reverb.": "Definir a largura da reverberação.",
+    "Vocals Volume": "Volume dos Vocais",
+    "Adjust the volume of the vocals.": "Ajustar o volume dos vocais.",
+    "Instrumentals Volume": "Volume dos Instrumentais",
+    "Adjust the volume of the Instrumentals.": "Ajustar o volume dos instrumentais.",
+    "Backing Vocals Volume": "Volume dos Vocais de Apoio",
+    "Adjust the volume of the backing vocals.": "Ajustar o volume dos vocais de apoio.",
+    "Device Settings": "Configurações do Dispositivo",
+    "Device": "Dispositivo",
+    "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Selecione o dispositivo a ser usado para a conversão. 0 a ∞ separados por - e para CPU deixe apenas um -",
+    "Convert": "Converter",
+    "Output Information": "Informações de Saída",
+    "The output information will be displayed here.": "As informações de saída serão exibidas aqui.",
+    "Export Audio": "Exportar Áudio",
+    "Music URL": "URL da Música",
+    "Download": "Baixar",
+    "Model URL": "URL do Modelo"
+}
assets/i18n/scan.py
ADDED
@@ -0,0 +1,71 @@
+import ast
+import json
+from pathlib import Path
+from collections import OrderedDict
+
+
+def extract_i18n_strings(node):
+    i18n_strings = []
+
+    if (
+        isinstance(node, ast.Call)
+        and isinstance(node.func, ast.Name)
+        and node.func.id == "i18n"
+    ):
+        for arg in node.args:
+            if isinstance(arg, ast.Str):
+                i18n_strings.append(arg.s)
+
+    for child_node in ast.iter_child_nodes(node):
+        i18n_strings.extend(extract_i18n_strings(child_node))
+
+    return i18n_strings
+
+
+def process_file(file_path):
+    with open(file_path, "r", encoding="utf8") as file:
+        code = file.read()
+        if "I18nAuto" in code:
+            tree = ast.parse(code)
+            i18n_strings = extract_i18n_strings(tree)
+            print(file_path, len(i18n_strings))
+            return i18n_strings
+    return []
+
+
+# Use pathlib for file handling
+py_files = Path(".").rglob("*.py")
+
+# Use a set to store unique strings
+code_keys = set()
+
+for py_file in py_files:
+    strings = process_file(py_file)
+    code_keys.update(strings)
+
+print()
+print("Total unique:", len(code_keys))
+
+standard_file = "languages/en_US.json"
+with open(standard_file, "r", encoding="utf-8") as file:
+    standard_data = json.load(file, object_pairs_hook=OrderedDict)
+standard_keys = set(standard_data.keys())
+
+# Combine unused and missing keys sections
+unused_keys = standard_keys - code_keys
+missing_keys = code_keys - standard_keys
+
+print("Unused keys:", len(unused_keys))
+for unused_key in unused_keys:
+    print("\t", unused_key)
+
+print("Missing keys:", len(missing_keys))
+for missing_key in missing_keys:
+    print("\t", missing_key)
+
+code_keys_dict = OrderedDict((s, s) for s in code_keys)
+
+# Use context manager for writing back to the file
+with open(standard_file, "w", encoding="utf-8") as file:
+    json.dump(code_keys_dict, file, ensure_ascii=False, indent=4, sort_keys=True)
+    file.write("\n")
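Two practical notes on the scanner above. It only collects *string literals* passed to calls literally named `i18n`, and it only scans files whose source mentions `I18nAuto`; it then rewrites `languages/en_US.json` to exactly the collected key set, so manual additions to that file are reported (and dropped) as "unused". Also, `ast.Str` is deprecated since Python 3.8 and removed in 3.12, so the script needs an older interpreter or a switch to `ast.Constant`. A sketch of which call sites are and are not detected:

```python
# Sketch: call sites scan.py's AST walk will and will not pick up.
def i18n(key):  # stand-in so this file mentions I18nAuto and runs on its own
    return key

i18n("Voice Model")  # detected: string literal passed to a call named "i18n"
label = "Index File"
i18n(label)          # NOT detected: the argument is a variable, not a literal
```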
core.py
ADDED
@@ -0,0 +1,1023 @@
+import sys, os
+import subprocess
+import torch
+from functools import lru_cache
+import shutil
+from pedalboard import Pedalboard, Reverb
+from pedalboard.io import AudioFile
+from pydub import AudioSegment
+from audio_separator.separator import Separator
+import logging
+import yaml
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from programs.applio_code.rvc.infer.infer import VoiceConverter
+from programs.applio_code.rvc.lib.tools.model_download import model_download_pipeline
+from programs.music_separation_code.inference import proc_file
+
+models_vocals = [
+    {
+        "name": "Mel-Roformer by KimberleyJSN",
+        "path": os.path.join(now_dir, "models", "mel-vocals"),
+        "model": os.path.join(now_dir, "models", "mel-vocals", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mel-vocals", "config.yaml"),
+        "type": "mel_band_roformer",
+        "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml",
+        "model_url": "https://huggingface.co/KimberleyJSN/melbandroformer/resolve/main/MelBandRoformer.ckpt",
+    },
+    {
+        "name": "BS-Roformer by ViperX",
+        "path": os.path.join(now_dir, "models", "bs-vocals"),
+        "model": os.path.join(now_dir, "models", "bs-vocals", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "bs-vocals", "config.yaml"),
+        "type": "bs_roformer",
+        "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml",
+        "model_url": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
+    },
+    {
+        "name": "MDX23C",
+        "path": os.path.join(now_dir, "models", "mdx23c-vocals"),
+        "model": os.path.join(now_dir, "models", "mdx23c-vocals", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mdx23c-vocals", "config.yaml"),
+        "type": "mdx23c",
+        "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/config_vocals_mdx23c.yaml",
+        "model_url": "https://github.com/ZFTurbo/Music-Source-Separation-Training/releases/download/v1.0.0/model_vocals_mdx23c_sdr_10.17.ckpt",
+    },
+]
+
+karaoke_models = [
+    {
+        "name": "Mel-Roformer Karaoke by aufr33 and viperx",
+        "path": os.path.join(now_dir, "models", "mel-kara"),
+        "model": os.path.join(now_dir, "models", "mel-kara", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mel-kara", "config.yaml"),
+        "type": "mel_band_roformer",
+        "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/config_mel_band_roformer_karaoke.yaml",
+        "model_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
+    },
+    {
+        "name": "UVR-BVE",
+        "full_name": "UVR-BVE-4B_SN-44100-1.pth",
+        "arch": "vr",
+    },
+]
+
+denoise_models = [
+    {
+        "name": "Mel-Roformer Denoise Normal by aufr33",
+        "path": os.path.join(now_dir, "models", "mel-denoise"),
+        "model": os.path.join(now_dir, "models", "mel-denoise", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mel-denoise", "config.yaml"),
+        "type": "mel_band_roformer",
+        "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml",
+        "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt",
+    },
+    {
+        "name": "Mel-Roformer Denoise Aggressive by aufr33",
+        "path": os.path.join(now_dir, "models", "mel-denoise-aggr"),
+        "model": os.path.join(now_dir, "models", "mel-denoise-aggr", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mel-denoise-aggr", "config.yaml"),
+        "type": "mel_band_roformer",
+        "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml",
+        "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt",
+    },
+    {
+        "name": "UVR Denoise",
+        "full_name": "UVR-DeNoise.pth",
+        "arch": "vr",
+    },
+]
+
+dereverb_models = [
+    {
+        "name": "MDX23C DeReverb by aufr33 and jarredou",
+        "path": os.path.join(now_dir, "models", "mdx23c-dereveb"),
+        "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"),
+        "type": "mdx23c",
+        "config_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/config_dereverb_mdx23c.yaml",
+        "model_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/dereverb_mdx23c_sdr_6.9096.ckpt",
+    },
+    {
+        "name": "BS-Roformer Dereverb by anvuew",
+        "path": os.path.join(now_dir, "models", "mdx23c-dereveb"),
+        "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"),
+        "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"),
+        "type": "bs_roformer",
+        "config_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.yaml",
+        "model_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.ckpt",
+    },
+    {
+        "name": "UVR-Deecho-Dereverb",
+        "full_name": "UVR-DeEcho-DeReverb.pth",
+        "arch": "vr",
+    },
+    {
+        "name": "MDX Reverb HQ by FoxJoy",
+        "full_name": "Reverb_HQ_By_FoxJoy.onnx",
+        "arch": "mdx",
+    },
+]
+
+deecho_models = [
+    {
+        "name": "UVR-Deecho-Normal",
+        "full_name": "UVR-De-Echo-Normal.pth",
+        "arch": "vr",
+    },
+    {
+        "name": "UVR-Deecho-Agggressive",
+        "full_name": "UVR-De-Echo-Aggressive.pth",
+        "arch": "vr",
+    },
+]
+
+
+@lru_cache(maxsize=None)
+def import_voice_converter():
+    from programs.applio_code.rvc.infer.infer import VoiceConverter
+
+    return VoiceConverter()
+
+
+@lru_cache(maxsize=1)
+def get_config():
+    from programs.applio_code.rvc.configs.config import Config
+
+    return Config()
+
+
+def download_file(url, path, filename):
+    os.makedirs(path, exist_ok=True)
+    file_path = os.path.join(path, filename)
+
+    if os.path.exists(file_path):
+        print(f"File '{filename}' already exists at '{path}'.")
+        return
+
+    try:
+        response = torch.hub.download_url_to_file(url, file_path)
+        print(f"File '{filename}' downloaded successfully")
+    except Exception as e:
+        print(f"Error downloading file '{filename}' from '{url}': {e}")
+
+
+def get_model_info_by_name(model_name):
+    all_models = (
+        models_vocals
+        + karaoke_models
+        + dereverb_models
+        + deecho_models
+        + denoise_models
+    )
+    for model in all_models:
+        if model["name"] == model_name:
+            return model
+    return None
+
+
+def get_last_modified_file(pasta):
+    if not os.path.isdir(pasta):
+        raise NotADirectoryError(f"{pasta} is not a valid directory.")
+    arquivos = [f for f in os.listdir(pasta) if os.path.isfile(os.path.join(pasta, f))]
+    if not arquivos:
+        return None
+    return max(arquivos, key=lambda x: os.path.getmtime(os.path.join(pasta, x)))
+
+
+def search_with_word(folder, word):
+    if not os.path.isdir(folder):
+        raise NotADirectoryError(f"{folder} is not a valid directory.")
+    file_with_word = [file for file in os.listdir(folder) if word in file]
+    if not file_with_word:
+        return None
+    most_recent_file = max(
+        file_with_word, key=lambda file: os.path.getmtime(os.path.join(folder, file))
+    )
+    return most_recent_file
+
+
+def search_with_two_words(folder, word1, word2):
+    if not os.path.isdir(folder):
+        raise NotADirectoryError(f"{folder} is not a valid directory.")
+    file_with_words = [
+        file for file in os.listdir(folder) if word1 in file and word2 in file
+    ]
+    if not file_with_words:
+        return None
+    most_recent_file = max(
+        file_with_words, key=lambda file: os.path.getmtime(os.path.join(folder, file))
+    )
+    return most_recent_file
+
+
+def get_last_modified_folder(path):
+    directories = [
+        os.path.join(path, d)
+        for d in os.listdir(path)
+        if os.path.isdir(os.path.join(path, d))
+    ]
+    if not directories:
+        return None
+    last_modified_folder = max(directories, key=os.path.getmtime)
+    return last_modified_folder
+
+
+def add_audio_effects(
+    audio_path,
+    reverb_size,
+    reverb_wet,
+    reverb_dry,
+    reverb_damping,
+    reverb_width,
+    output_path,
+):
+    board = Pedalboard([])
+    board.append(
+        Reverb(
+            room_size=reverb_size,
+            dry_level=reverb_dry,
+            wet_level=reverb_wet,
+            damping=reverb_damping,
+            width=reverb_width,
+        )
+    )
+    with AudioFile(audio_path) as f:
+        with AudioFile(output_path, "w", f.samplerate, f.num_channels) as o:
+            while f.tell() < f.frames:
+                chunk = f.read(int(f.samplerate))
+                effected = board(chunk, f.samplerate, reset=False)
+                o.write(effected)
+    return output_path
+
+
+def merge_audios(
+    vocals_path,
+    inst_path,
+    backing_path,
+    output_path,
+    main_gain,
+    inst_gain,
+    backing_Vol,
+    output_format,
+):
+    main_vocal_audio = AudioSegment.from_file(vocals_path, format="flac") + main_gain
+    instrumental_audio = AudioSegment.from_file(inst_path, format="flac") + inst_gain
+    backing_vocal_audio = (
+        AudioSegment.from_file(backing_path, format="flac") + backing_Vol
+    )
+    combined_audio = main_vocal_audio.overlay(
+        instrumental_audio.overlay(backing_vocal_audio)
+    )
+    combined_audio.export(output_path, format=output_format)
+    return output_path
+
+
+def check_fp16_support(device):
+    i_device = int(str(device).split(":")[-1])
+    gpu_name = torch.cuda.get_device_name(i_device)
+    low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
+    if any(gpu in gpu_name for gpu in low_end_gpus) and "V100" not in gpu_name.upper():
+        print(f"Your GPU {gpu_name} does not support FP16 inference. Using FP32 instead.")
+        return False
+    return True
+
+
+def full_inference_program(
+    model_path,
+    index_path,
+    input_audio_path,
+    output_path,
+    export_format_rvc,
+    split_audio,
+    autotune,
+    vocal_model,
+    karaoke_model,
+    dereverb_model,
+    deecho,
+    deecho_model,
+    denoise,
+    denoise_model,
+    reverb,
+    vocals_volume,
+    instrumentals_volume,
+    backing_vocals_volume,
+    export_format_final,
+    devices,
+    pitch,
+    filter_radius,
+    index_rate,
+    rms_mix_rate,
+    protect,
+    pitch_extract,
+    hop_lenght,
+    reverb_room_size,
+    reverb_damping,
+    reverb_wet_gain,
+    reverb_dry_gain,
+    reverb_width,
+    embedder_model,
+    delete_audios,
+    use_tta,
+    batch_size,
+    infer_backing_vocals,
+    infer_backing_vocals_model,
+    infer_backing_vocals_index,
+    change_inst_pitch,
+    pitch_back,
+    filter_radius_back,
+    index_rate_back,
+    rms_mix_rate_back,
+    protect_back,
+    pitch_extract_back,
+    hop_length_back,
+    export_format_rvc_back,
+    split_audio_back,
+    autotune_back,
+    embedder_model_back,
+):
+    if torch.cuda.is_available():
+        n_gpu = torch.cuda.device_count()
+        devices = devices.replace("-", " ")
+        print(f"Number of GPUs available: {n_gpu}")
+        first_device = devices.split()[0]
+        fp16 = check_fp16_support(first_device)
+    else:
+        devices = "cpu"
+        print("Using CPU")
+        fp16 = False
+
+    music_folder = os.path.splitext(os.path.basename(input_audio_path))[0]
+
+    # Vocals Separation
+    model_info = get_model_info_by_name(vocal_model)
+    model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
+    if not os.path.exists(model_ckpt_path):
+        download_file(
+            model_info["model_url"],
+            model_info["path"],
+            "model.ckpt",
+        )
+    config_json_path = os.path.join(model_info["path"], "config.yaml")
+    if not os.path.exists(config_json_path):
+        download_file(
+            model_info["config_url"],
+            model_info["path"],
+            "config.yaml",
+        )
+    if not fp16:
+        with open(model_info["config"], "r") as file:
+            config = yaml.safe_load(file)
+
+        config["training"]["use_amp"] = False
+
+        with open(model_info["config"], "w") as file:
+            yaml.safe_dump(config, file)
+    store_dir = os.path.join(now_dir, "audio_files", music_folder, "vocals")
+    inst_dir = os.path.join(now_dir, "audio_files", music_folder, "instrumentals")
+    os.makedirs(store_dir, exist_ok=True)
+    os.makedirs(inst_dir, exist_ok=True)
+    input_audio_basename = os.path.splitext(os.path.basename(input_audio_path))[0]
+    search_result = search_with_word(store_dir, "vocals")
+    if search_result:
+        print("Vocals already separated")
+    else:
+        print("Separating vocals")
+        command = [
+            "python",
+            os.path.join(now_dir, "programs", "music_separation_code", "inference.py"),
+            "--model_type",
+            model_info["type"],
+            "--config_path",
+            model_info["config"],
+            "--start_check_point",
+            model_info["model"],
+            "--input_file",
+            input_audio_path,
+            "--store_dir",
+            store_dir,
+            "--flac_file",
+            "--pcm_type",
+            "PCM_16",
+            "--extract_instrumental",
+        ]
+
+        if devices == "cpu":
+            command.append("--force_cpu")
+        else:
+            device_ids = [str(int(device)) for device in devices.split()]
+            command.extend(["--device_ids"] + device_ids)
+
+        subprocess.run(command)
+        os.rename(
+            os.path.join(
+                store_dir,
+                search_with_two_words(
+                    store_dir,
+                    os.path.basename(input_audio_path).split(".")[0],
+                    "instrumental",
+                ),
+            ),
+            os.path.join(
+                inst_dir,
+                f"{os.path.basename(input_audio_path).split('.')[0]}_instrumentals.flac",
+            ),
+        )
+    inst_file = os.path.join(
+        inst_dir,
+        search_with_two_words(
+            inst_dir, os.path.basename(input_audio_path).split(".")[0], "instrumentals"
+        ),
+    )
+
+    # karaoke separation
+    model_info = get_model_info_by_name(karaoke_model)
+    store_dir = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
+    os.makedirs(store_dir, exist_ok=True)
+    vocals_path = os.path.join(now_dir, "audio_files", music_folder, "vocals")
+    input_file = search_with_word(vocals_path, "vocals")
+    karaoke_exists = search_with_word(store_dir, "karaoke") is not None
+
+    if karaoke_exists:
+        print("Backing vocals already separated")
+    else:
+        if input_file:
+            input_file = os.path.join(vocals_path, input_file)
+            print("Separating Backing vocals")
+            if model_info["name"] == "Mel-Roformer Karaoke by aufr33 and viperx":
+                model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
+                if not os.path.exists(model_ckpt_path):
+                    download_file(
+                        model_info["model_url"],
+                        model_info["path"],
+                        "model.ckpt",
+                    )
+                config_json_path = os.path.join(model_info["path"], "config.yaml")
+                if not os.path.exists(config_json_path):
+                    download_file(
+                        model_info["config_url"],
+                        model_info["path"],
+                        "config.yaml",
+                    )
+                if not fp16:
+                    with open(model_info["config"], "r") as file:
+                        config = yaml.safe_load(file)
+
+                    config["training"]["use_amp"] = False
+
+                    with open(model_info["config"], "w") as file:
+                        yaml.safe_dump(config, file)
+
+                command = [
+                    "python",
+                    os.path.join(
+                        now_dir, "programs", "music_separation_code", "inference.py"
+                    ),
+                    "--model_type",
+                    model_info["type"],
+                    "--config_path",
+                    model_info["config"],
+                    "--start_check_point",
+                    model_info["model"],
+                    "--input_file",
+                    input_file,
+                    "--store_dir",
+                    store_dir,
+                    "--flac_file",
+                    "--pcm_type",
+                    "PCM_16",
+                    "--extract_instrumental",
+                ]
+
+                if devices == "cpu":
+                    command.append("--force_cpu")
+                else:
+                    device_ids = [str(int(device)) for device in devices.split()]
+                    command.extend(["--device_ids"] + device_ids)
+
+                subprocess.run(command)
+            else:
+                separator = Separator(
+                    model_file_dir=os.path.join(now_dir, "models", "karaoke"),
+                    log_level=logging.WARNING,
+                    normalization_threshold=1.0,
+                    output_format="flac",
+                    output_dir=store_dir,
+                    vr_params={
+                        "batch_size": batch_size,
+                        "enable_tta": use_tta,
+                    },
+                )
+                separator.load_model(model_filename=model_info["full_name"])
+                separator.separate(input_file)
+                karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
+                vocals_result = search_with_two_words(
+                    karaoke_path,
+                    os.path.basename(input_audio_path).split(".")[0],
+                    "Vocals",
+                )
+                instrumental_result = search_with_two_words(
+                    karaoke_path,
+                    os.path.basename(input_audio_path).split(".")[0],
+                    "Instrumental",
+                )
+                if "UVR-BVE-4B_SN-44100-1" in os.path.basename(vocals_result):
+                    os.rename(
+                        os.path.join(karaoke_path, vocals_result),
+                        os.path.join(
+                            karaoke_path,
+                            f"{os.path.basename(input_audio_path).split('.')[0]}_karaoke.flac",
+                        ),
+                    )
+                if "UVR-BVE-4B_SN-44100-1" in os.path.basename(instrumental_result):
+                    os.rename(
+                        os.path.join(karaoke_path, instrumental_result),
|
536 |
+
os.path.join(
|
537 |
+
karaoke_path,
|
538 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_instrumental.flac",
|
539 |
+
),
|
540 |
+
)
|
541 |
+
|
542 |
+
# dereverb
|
543 |
+
model_info = get_model_info_by_name(dereverb_model)
|
544 |
+
store_dir = os.path.join(now_dir, "audio_files", music_folder, "dereverb")
|
545 |
+
os.makedirs(store_dir, exist_ok=True)
|
546 |
+
karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
|
547 |
+
input_file = search_with_word(karaoke_path, "karaoke")
|
548 |
+
noreverb_exists = search_with_word(store_dir, "noreverb") is not None
|
549 |
+
if noreverb_exists:
|
550 |
+
print("Reverb already removed")
|
551 |
+
else:
|
552 |
+
if input_file:
|
553 |
+
input_file = os.path.join(karaoke_path, input_file)
|
554 |
+
print("Removing reverb")
|
555 |
+
if (
|
556 |
+
model_info["name"] == "BS-Roformer Dereverb by anvuew"
|
557 |
+
or model_info["name"] == "MDX23C DeReverb by aufr33 and jarredou"
|
558 |
+
):
|
559 |
+
model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
|
560 |
+
if not os.path.exists(model_ckpt_path):
|
561 |
+
download_file(
|
562 |
+
model_info["model_url"],
|
563 |
+
model_info["path"],
|
564 |
+
"model.ckpt",
|
565 |
+
)
|
566 |
+
config_json_path = os.path.join(model_info["path"], "config.yaml")
|
567 |
+
if not os.path.exists(config_json_path):
|
568 |
+
download_file(
|
569 |
+
model_info["config_url"],
|
570 |
+
model_info["path"],
|
571 |
+
"config.yaml",
|
572 |
+
)
|
573 |
+
if not fp16:
|
574 |
+
with open(model_info["config"], "r") as file:
|
575 |
+
config = yaml.safe_load(file)
|
576 |
+
|
577 |
+
config["training"]["use_amp"] = False
|
578 |
+
|
579 |
+
with open(model_info["config"], "w") as file:
|
580 |
+
yaml.safe_dump(config, file)
|
581 |
+
command = [
|
582 |
+
"python",
|
583 |
+
os.path.join(
|
584 |
+
now_dir, "programs", "music_separation_code", "inference.py"
|
585 |
+
),
|
586 |
+
"--model_type",
|
587 |
+
model_info["type"],
|
588 |
+
"--config_path",
|
589 |
+
model_info["config"],
|
590 |
+
"--start_check_point",
|
591 |
+
model_info["model"],
|
592 |
+
"--input_file",
|
593 |
+
input_file,
|
594 |
+
"--store_dir",
|
595 |
+
store_dir,
|
596 |
+
"--flac_file",
|
597 |
+
"--pcm_type",
|
598 |
+
"PCM_16",
|
599 |
+
]
|
600 |
+
|
601 |
+
if devices == "cpu":
|
602 |
+
command.append("--force_cpu")
|
603 |
+
else:
|
604 |
+
device_ids = [str(int(device)) for device in devices.split()]
|
605 |
+
command.extend(["--device_ids"] + device_ids)
|
606 |
+
|
607 |
+
subprocess.run(command)
|
608 |
+
else:
|
609 |
+
if model_info["arch"] == "vr":
|
610 |
+
separator = Separator(
|
611 |
+
model_file_dir=os.path.join(now_dir, "models", "dereverb"),
|
612 |
+
log_level=logging.WARNING,
|
613 |
+
normalization_threshold=1.0,
|
614 |
+
output_format="flac",
|
615 |
+
output_dir=store_dir,
|
616 |
+
output_single_stem="No Reverb",
|
617 |
+
vr_params={
|
618 |
+
"batch_size": batch_size,
|
619 |
+
"enable_tta": use_tta,
|
620 |
+
},
|
621 |
+
)
|
622 |
+
else:
|
623 |
+
separator = Separator(
|
624 |
+
model_file_dir=os.path.join(now_dir, "models", "dereverb"),
|
625 |
+
log_level=logging.WARNING,
|
626 |
+
normalization_threshold=1.0,
|
627 |
+
output_format="flac",
|
628 |
+
output_dir=store_dir,
|
629 |
+
output_single_stem="No Reverb",
|
630 |
+
)
|
631 |
+
separator.load_model(model_filename=model_info["full_name"])
|
632 |
+
separator.separate(input_file)
|
633 |
+
dereverb_path = os.path.join(
|
634 |
+
now_dir, "audio_files", music_folder, "dereverb"
|
635 |
+
)
|
636 |
+
search_result = search_with_two_words(
|
637 |
+
dereverb_path,
|
638 |
+
os.path.basename(input_audio_path).split(".")[0],
|
639 |
+
"No Reverb",
|
640 |
+
)
|
641 |
+
if "UVR-DeEcho-DeReverb" in os.path.basename(
|
642 |
+
search_result
|
643 |
+
) or "MDX Reverb HQ by FoxJoy" in os.path.basename(search_result):
|
644 |
+
os.rename(
|
645 |
+
os.path.join(dereverb_path, search_result),
|
646 |
+
os.path.join(
|
647 |
+
dereverb_path,
|
648 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_noreverb.flac",
|
649 |
+
),
|
650 |
+
)
|
651 |
+
|
652 |
+
# deecho
|
653 |
+
store_dir = os.path.join(now_dir, "audio_files", music_folder, "deecho")
|
654 |
+
os.makedirs(store_dir, exist_ok=True)
|
655 |
+
if deecho:
|
656 |
+
no_echo_exists = search_with_word(store_dir, "noecho") is not None
|
657 |
+
if no_echo_exists:
|
658 |
+
print("Echo already removed")
|
659 |
+
else:
|
660 |
+
print("Removing echo")
|
661 |
+
model_info = get_model_info_by_name(deecho_model)
|
662 |
+
|
663 |
+
dereverb_path = os.path.join(
|
664 |
+
now_dir, "audio_files", music_folder, "dereverb"
|
665 |
+
)
|
666 |
+
noreverb_file = search_with_word(dereverb_path, "noreverb")
|
667 |
+
|
668 |
+
input_file = os.path.join(dereverb_path, noreverb_file)
|
669 |
+
|
670 |
+
separator = Separator(
|
671 |
+
model_file_dir=os.path.join(now_dir, "models", "deecho"),
|
672 |
+
log_level=logging.WARNING,
|
673 |
+
normalization_threshold=1.0,
|
674 |
+
output_format="flac",
|
675 |
+
output_dir=store_dir,
|
676 |
+
output_single_stem="No Echo",
|
677 |
+
vr_params={
|
678 |
+
"batch_size": batch_size,
|
679 |
+
"enable_tta": use_tta,
|
680 |
+
},
|
681 |
+
)
|
682 |
+
separator.load_model(model_filename=model_info["full_name"])
|
683 |
+
separator.separate(input_file)
|
684 |
+
deecho_path = os.path.join(now_dir, "audio_files", music_folder, "deecho")
|
685 |
+
search_result = search_with_two_words(
|
686 |
+
deecho_path,
|
687 |
+
os.path.basename(input_audio_path).split(".")[0],
|
688 |
+
"No Echo",
|
689 |
+
)
|
690 |
+
if "UVR-De-Echo-Normal" in os.path.basename(
|
691 |
+
search_result
|
692 |
+
) or "UVR-Deecho-Agggressive" in os.path.basename(search_result):
|
693 |
+
os.rename(
|
694 |
+
os.path.join(deecho_path, search_result),
|
695 |
+
os.path.join(
|
696 |
+
deecho_path,
|
697 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_noecho.flac",
|
698 |
+
),
|
699 |
+
)
|
700 |
+
|
701 |
+
# denoise
|
702 |
+
store_dir = os.path.join(now_dir, "audio_files", music_folder, "denoise")
|
703 |
+
os.makedirs(store_dir, exist_ok=True)
|
704 |
+
if denoise:
|
705 |
+
no_noise_exists = search_with_word(store_dir, "dry") is not None
|
706 |
+
if no_noise_exists:
|
707 |
+
print("Noise already removed")
|
708 |
+
else:
|
709 |
+
model_info = get_model_info_by_name(denoise_model)
|
710 |
+
print("Removing noise")
|
711 |
+
input_file = (
|
712 |
+
os.path.join(
|
713 |
+
now_dir,
|
714 |
+
"audio_files",
|
715 |
+
music_folder,
|
716 |
+
"deecho",
|
717 |
+
search_with_word(
|
718 |
+
os.path.join(now_dir, "audio_files", music_folder, "deecho"),
|
719 |
+
"noecho",
|
720 |
+
),
|
721 |
+
)
|
722 |
+
if deecho
|
723 |
+
else os.path.join(
|
724 |
+
now_dir,
|
725 |
+
"audio_files",
|
726 |
+
music_folder,
|
727 |
+
"dereverb",
|
728 |
+
search_with_word(
|
729 |
+
os.path.join(now_dir, "audio_files", music_folder, "dereverb"),
|
730 |
+
"noreverb",
|
731 |
+
),
|
732 |
+
)
|
733 |
+
)
|
734 |
+
|
735 |
+
if (
|
736 |
+
model_info["name"] == "Mel-Roformer Denoise Normal by aufr33"
|
737 |
+
or model_info["name"] == "Mel-Roformer Denoise Aggressive by aufr33"
|
738 |
+
):
|
739 |
+
model_ckpt_path = os.path.join(model_info["path"], "model.ckpt")
|
740 |
+
if not os.path.exists(model_ckpt_path):
|
741 |
+
download_file(
|
742 |
+
model_info["model_url"],
|
743 |
+
model_info["path"],
|
744 |
+
"model.ckpt",
|
745 |
+
)
|
746 |
+
config_json_path = os.path.join(model_info["path"], "config.yaml")
|
747 |
+
if not os.path.exists(config_json_path):
|
748 |
+
download_file(
|
749 |
+
model_info["config_url"], model_info["path"], "config.yaml"
|
750 |
+
)
|
751 |
+
if not fp16:
|
752 |
+
with open(model_info["config"], "r") as file:
|
753 |
+
config = yaml.safe_load(file)
|
754 |
+
|
755 |
+
config["training"]["use_amp"] = False
|
756 |
+
|
757 |
+
with open(model_info["config"], "w") as file:
|
758 |
+
yaml.safe_dump(config, file)
|
759 |
+
command = [
|
760 |
+
"python",
|
761 |
+
os.path.join(
|
762 |
+
now_dir, "programs", "music_separation_code", "inference.py"
|
763 |
+
),
|
764 |
+
"--model_type",
|
765 |
+
model_info["type"],
|
766 |
+
"--config_path",
|
767 |
+
model_info["config"],
|
768 |
+
"--start_check_point",
|
769 |
+
model_info["model"],
|
770 |
+
"--input_file",
|
771 |
+
input_file,
|
772 |
+
"--store_dir",
|
773 |
+
store_dir,
|
774 |
+
"--flac_file",
|
775 |
+
"--pcm_type",
|
776 |
+
"PCM_16",
|
777 |
+
]
|
778 |
+
|
779 |
+
if devices == "cpu":
|
780 |
+
command.append("--force_cpu")
|
781 |
+
else:
|
782 |
+
device_ids = [str(int(device)) for device in devices.split()]
|
783 |
+
command.extend(["--device_ids"] + device_ids)
|
784 |
+
|
785 |
+
subprocess.run(command)
|
786 |
+
else:
|
787 |
+
separator = Separator(
|
788 |
+
model_file_dir=os.path.join(now_dir, "models", "denoise"),
|
789 |
+
log_level=logging.WARNING,
|
790 |
+
normalization_threshold=1.0,
|
791 |
+
output_format="flac",
|
792 |
+
output_dir=store_dir,
|
793 |
+
output_single_stem="No Noise",
|
794 |
+
vr_params={
|
795 |
+
"batch_size": batch_size,
|
796 |
+
"enable_tta": use_tta,
|
797 |
+
},
|
798 |
+
)
|
799 |
+
separator.load_model(model_filename=model_info["full_name"])
|
800 |
+
separator.separate(input_file)
|
801 |
+
search_result = search_with_two_words(
|
802 |
+
deecho_path,
|
803 |
+
os.path.basename(input_audio_path).split(".")[0],
|
804 |
+
"No Noise",
|
805 |
+
)
|
806 |
+
if "UVR Denoise" in os.path.basename(search_result):
|
807 |
+
os.rename(
|
808 |
+
os.path.join(deecho_path, search_result),
|
809 |
+
os.path.join(
|
810 |
+
deecho_path,
|
811 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_dry.flac",
|
812 |
+
),
|
813 |
+
)
|
814 |
+
|
815 |
+
# RVC
|
816 |
+
denoise_path = os.path.join(now_dir, "audio_files", music_folder, "denoise")
|
817 |
+
deecho_path = os.path.join(now_dir, "audio_files", music_folder, "deecho")
|
818 |
+
dereverb_path = os.path.join(now_dir, "audio_files", music_folder, "dereverb")
|
819 |
+
|
820 |
+
denoise_audio = search_with_two_words(
|
821 |
+
denoise_path, os.path.basename(input_audio_path).split(".")[0], "dry"
|
822 |
+
)
|
823 |
+
deecho_audio = search_with_two_words(
|
824 |
+
deecho_path, os.path.basename(input_audio_path).split(".")[0], "noecho"
|
825 |
+
)
|
826 |
+
dereverb = search_with_two_words(
|
827 |
+
dereverb_path, os.path.basename(input_audio_path).split(".")[0], "noreverb"
|
828 |
+
)
|
829 |
+
|
830 |
+
if denoise_audio:
|
831 |
+
final_path = os.path.join(
|
832 |
+
now_dir, "audio_files", music_folder, "denoise", denoise_audio
|
833 |
+
)
|
834 |
+
elif deecho_audio:
|
835 |
+
final_path = os.path.join(
|
836 |
+
now_dir, "audio_files", music_folder, "deecho", deecho_audio
|
837 |
+
)
|
838 |
+
elif dereverb:
|
839 |
+
final_path = os.path.join(
|
840 |
+
now_dir, "audio_files", music_folder, "dereverb", dereverb
|
841 |
+
)
|
842 |
+
else:
|
843 |
+
final_path = None
|
844 |
+
|
845 |
+
store_dir = os.path.join(now_dir, "audio_files", music_folder, "rvc")
|
846 |
+
os.makedirs(store_dir, exist_ok=True)
|
847 |
+
print("Making RVC inference")
|
848 |
+
output_rvc = os.path.join(
|
849 |
+
now_dir,
|
850 |
+
"audio_files",
|
851 |
+
music_folder,
|
852 |
+
"rvc",
|
853 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_rvc.wav",
|
854 |
+
)
|
855 |
+
inference_vc = import_voice_converter()
|
856 |
+
inference_vc.convert_audio(
|
857 |
+
audio_input_path=final_path,
|
858 |
+
audio_output_path=output_rvc,
|
859 |
+
model_path=model_path,
|
860 |
+
index_path=index_path,
|
861 |
+
embedder_model=embedder_model,
|
862 |
+
pitch=pitch,
|
863 |
+
f0_file=None,
|
864 |
+
f0_method=pitch_extract,
|
865 |
+
filter_radius=filter_radius,
|
866 |
+
index_rate=index_rate,
|
867 |
+
volume_envelope=rms_mix_rate,
|
868 |
+
protect=protect,
|
869 |
+
split_audio=split_audio,
|
870 |
+
f0_autotune=autotune,
|
871 |
+
hop_length=hop_lenght,
|
872 |
+
export_format=export_format_rvc,
|
873 |
+
embedder_model_custom=None,
|
874 |
+
)
|
875 |
+
backing_vocals = os.path.join(
|
876 |
+
karaoke_path, search_with_word(karaoke_path, "instrumental")
|
877 |
+
)
|
878 |
+
|
879 |
+
if infer_backing_vocals:
|
880 |
+
print("Infering backing vocals")
|
881 |
+
karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
|
882 |
+
instrumental_file = search_with_word(karaoke_path, "instrumental")
|
883 |
+
backing_vocals = os.path.join(karaoke_path, instrumental_file)
|
884 |
+
output_backing_vocals = os.path.join(
|
885 |
+
karaoke_path, f"{input_audio_basename}_instrumental_output.wav"
|
886 |
+
)
|
887 |
+
inference_vc.convert_audio(
|
888 |
+
audio_input_path=backing_vocals,
|
889 |
+
audio_output_path=output_backing_vocals,
|
890 |
+
model_path=infer_backing_vocals_model,
|
891 |
+
index_path=infer_backing_vocals_index,
|
892 |
+
embedder_model=embedder_model_back,
|
893 |
+
pitch=pitch_back,
|
894 |
+
f0_file=None,
|
895 |
+
f0_method=pitch_extract_back,
|
896 |
+
filter_radius=filter_radius_back,
|
897 |
+
index_rate=index_rate_back,
|
898 |
+
volume_envelope=rms_mix_rate_back,
|
899 |
+
protect=protect_back,
|
900 |
+
split_audio=split_audio_back,
|
901 |
+
f0_autotune=autotune_back,
|
902 |
+
hop_length=hop_length_back,
|
903 |
+
export_format=export_format_rvc_back,
|
904 |
+
embedder_model_custom=None,
|
905 |
+
)
|
906 |
+
backing_vocals = output_backing_vocals
|
907 |
+
|
908 |
+
# post process
|
909 |
+
if reverb:
|
910 |
+
add_audio_effects(
|
911 |
+
os.path.join(
|
912 |
+
now_dir,
|
913 |
+
"audio_files",
|
914 |
+
music_folder,
|
915 |
+
"rvc",
|
916 |
+
get_last_modified_file(
|
917 |
+
os.path.join(now_dir, "audio_files", music_folder, "rvc")
|
918 |
+
),
|
919 |
+
),
|
920 |
+
reverb_room_size,
|
921 |
+
reverb_wet_gain,
|
922 |
+
reverb_dry_gain,
|
923 |
+
reverb_damping,
|
924 |
+
reverb_width,
|
925 |
+
os.path.join(
|
926 |
+
now_dir,
|
927 |
+
"audio_files",
|
928 |
+
music_folder,
|
929 |
+
"rvc",
|
930 |
+
os.path.basename(input_audio_path),
|
931 |
+
),
|
932 |
+
)
|
933 |
+
if change_inst_pitch != 0:
|
934 |
+
print("Changing instrumental pitch")
|
935 |
+
inst_path = os.path.join(
|
936 |
+
now_dir,
|
937 |
+
"audio_files",
|
938 |
+
music_folder,
|
939 |
+
"instrumentals",
|
940 |
+
search_with_word(
|
941 |
+
os.path.join(now_dir, "audio_files", music_folder, "instrumentals"),
|
942 |
+
"instrumentals",
|
943 |
+
),
|
944 |
+
)
|
945 |
+
audio = AudioSegment.from_file(inst_path)
|
946 |
+
|
947 |
+
factor = 2 ** (change_inst_pitch / 12)
|
948 |
+
|
949 |
+
new_frame_rate = int(audio.frame_rate * factor)
|
950 |
+
audio = audio._spawn(audio.raw_data, overrides={"frame_rate": new_frame_rate})
|
951 |
+
|
952 |
+
audio = audio.set_frame_rate(audio.frame_rate)
|
953 |
+
output_dir_pitch = os.path.join(
|
954 |
+
now_dir, "audio_files", music_folder, "instrumentals"
|
955 |
+
)
|
956 |
+
output_path_pitch = os.path.join(
|
957 |
+
output_dir_pitch, "inst_with_changed_pitch.flac"
|
958 |
+
)
|
959 |
+
audio.export(output_path_pitch, format="flac")
|
960 |
+
|
961 |
+
# merge audios
|
962 |
+
store_dir = os.path.join(now_dir, "audio_files", music_folder, "final")
|
963 |
+
os.makedirs(store_dir, exist_ok=True)
|
964 |
+
|
965 |
+
vocals_path = os.path.join(now_dir, "audio_files", music_folder, "rvc")
|
966 |
+
vocals_file = get_last_modified_file(
|
967 |
+
os.path.join(now_dir, "audio_files", music_folder, "rvc")
|
968 |
+
)
|
969 |
+
vocals_file = os.path.join(vocals_path, vocals_file)
|
970 |
+
|
971 |
+
karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke")
|
972 |
+
karaoke_file = search_with_word(karaoke_path, "Instrumental") or search_with_word(
|
973 |
+
karaoke_path, "instrumental"
|
974 |
+
)
|
975 |
+
karaoke_file = os.path.join(karaoke_path, karaoke_file)
|
976 |
+
final_output_path = os.path.join(
|
977 |
+
now_dir,
|
978 |
+
"audio_files",
|
979 |
+
music_folder,
|
980 |
+
"final",
|
981 |
+
f"{os.path.basename(input_audio_path).split('.')[0]}_final.{export_format_final.lower()}",
|
982 |
+
)
|
983 |
+
print("Merging audios")
|
984 |
+
result = merge_audios(
|
985 |
+
vocals_file,
|
986 |
+
inst_file,
|
987 |
+
backing_vocals,
|
988 |
+
final_output_path,
|
989 |
+
vocals_volume,
|
990 |
+
instrumentals_volume,
|
991 |
+
backing_vocals_volume,
|
992 |
+
export_format_final,
|
993 |
+
)
|
994 |
+
print("Audios merged!")
|
995 |
+
if delete_audios:
|
996 |
+
main_directory = os.path.join(now_dir, "audio_files", music_folder)
|
997 |
+
folder_to_keep = "final"
|
998 |
+
for folder_name in os.listdir(main_directory):
|
999 |
+
folder_path = os.path.join(main_directory, folder_name)
|
1000 |
+
if os.path.isdir(folder_path) and folder_name != folder_to_keep:
|
1001 |
+
shutil.rmtree(folder_path)
|
1002 |
+
return (
|
1003 |
+
f"Audio file {os.path.basename(input_audio_path).split('.')[0]} converted with success",
|
1004 |
+
result,
|
1005 |
+
)
|
1006 |
+
|
1007 |
+
|
1008 |
+
def download_model(link):
|
1009 |
+
model_download_pipeline(link)
|
1010 |
+
return "Model downloaded with success"
|
1011 |
+
|
1012 |
+
|
1013 |
+
def download_music(link):
|
1014 |
+
os.makedirs(os.path.join(now_dir, "audio_files", "original_files"), exist_ok=True)
|
1015 |
+
command = [
|
1016 |
+
"yt-dlp",
|
1017 |
+
"-x",
|
1018 |
+
"--output",
|
1019 |
+
os.path.join(now_dir, "audio_files", "original_files", "%(title)s.%(ext)s"),
|
1020 |
+
link,
|
1021 |
+
]
|
1022 |
+
subprocess.run(command)
|
1023 |
+
return "Music downloaded with success"
|
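The pitch-change step above leans on one identity: shifting by n semitones multiplies frequency by 2 ** (n / 12), so pydub fakes the shift by re-tagging the raw samples with a scaled frame rate and then resampling back. A standalone check of the numbers (the 44100 Hz rate is only an example, not taken from the code):

for semitones in (-12, -3, 0, 3, 12):
    factor = 2 ** (semitones / 12)
    print(semitones, round(factor, 4), int(44100 * factor))
# -12 -> 0.5    -> 22050  (one octave down)
#  -3 -> 0.8409 -> 37083
#   3 -> 1.1892 -> 52444
#  12 -> 2.0    -> 88200  (one octave up)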
logs/.gitkeep
ADDED
File without changes
main.py
ADDED
@@ -0,0 +1,53 @@

import gradio as gr
import sys, os
from tabs.full_inference import full_inference_tab
from tabs.download_model import download_model_tab

now_dir = os.getcwd()
sys.path.append(now_dir)
DEFAULT_PORT = 7755
MAX_PORT_ATTEMPTS = 10

from assets.i18n.i18n import I18nAuto

i18n = I18nAuto()


with gr.Blocks(title="hexGen-RVC", css="footer{display:none !important}") as app:
    gr.Markdown("# hexGen RVC")
    with gr.Tab(i18n("Full Inference")):
        full_inference_tab()
    with gr.Tab(i18n("Download Model")):
        download_model_tab()


def launch(port):
    app.launch(
        share="--share" in sys.argv,
        inbrowser="--open" in sys.argv,
        server_port=port,
    )


def get_port_from_args():
    if "--port" in sys.argv:
        port_index = sys.argv.index("--port") + 1
        if port_index < len(sys.argv):
            return int(sys.argv[port_index])
    return DEFAULT_PORT


if __name__ == "__main__":
    port = get_port_from_args()
    for _ in range(MAX_PORT_ATTEMPTS):
        try:
            launch(port)
            break
        except OSError:
            print(
                f"Failed to launch on port {port}, trying again on port {port - 1}..."
            )
            port -= 1
        except Exception as error:
            print(f"An error occurred launching Gradio: {error}")
            break
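main.py reads its flags straight from sys.argv rather than through argparse. A self-contained sketch of the same port lookup, runnable without Gradio (names mirror the file above; the argv lists are made up for the demo):

import sys

DEFAULT_PORT = 7755

def get_port(argv):
    # Same logic as get_port_from_args(), but taking argv as a parameter.
    if "--port" in argv:
        idx = argv.index("--port") + 1
        if idx < len(argv):
            return int(argv[idx])
    return DEFAULT_PORT

print(get_port(["main.py", "--port", "7860"]))  # 7860
print(get_port(["main.py"]))                    # 7755

When the chosen port is busy, the launcher above walks downward (7755, 7754, ...) for up to MAX_PORT_ATTEMPTS tries.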
programs/applio_code/rvc/configs/config.py
ADDED
@@ -0,0 +1,192 @@

import torch
import json
import os


version_config_paths = [
    os.path.join("v1", "32000.json"),
    os.path.join("v1", "40000.json"),
    os.path.join("v1", "48000.json"),
    os.path.join("v2", "48000.json"),
    os.path.join("v2", "40000.json"),
    os.path.join("v2", "32000.json"),
]


def singleton(cls):
    instances = {}

    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance


@singleton
class Config:
    def __init__(self):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.is_half = self.device != "cpu"
        self.gpu_name = (
            torch.cuda.get_device_name(int(self.device.split(":")[-1]))
            if self.device.startswith("cuda")
            else None
        )
        self.json_config = self.load_config_json()
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def load_config_json(self) -> dict:
        configs = {}
        for config_file in version_config_paths:
            config_path = os.path.join(
                "programs", "applio_code", "rvc", "configs", config_file
            )
            with open(config_path, "r") as f:
                configs[config_file] = json.load(f)
        return configs

    def has_mps(self) -> bool:
        # Check if Metal Performance Shaders are available - for macOS 12.3+.
        return torch.backends.mps.is_available()

    def has_xpu(self) -> bool:
        # Check if XPU is available.
        return hasattr(torch, "xpu") and torch.xpu.is_available()

    def set_precision(self, precision):
        if precision not in ["fp32", "fp16"]:
            raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.")

        fp16_run_value = precision == "fp16"
        preprocess_target_version = "3.7" if precision == "fp16" else "3.0"
        preprocess_path = os.path.join(
            os.path.dirname(__file__),
            os.pardir,
            "rvc",
            "train",
            "preprocess",
            "preprocess.py",
        )

        for config_path in version_config_paths:
            full_config_path = os.path.join(
                "programs", "applio_code", "rvc", "configs", config_path
            )
            try:
                with open(full_config_path, "r") as f:
                    config = json.load(f)
                config["train"]["fp16_run"] = fp16_run_value
                with open(full_config_path, "w") as f:
                    json.dump(config, f, indent=4)
            except FileNotFoundError:
                print(f"File not found: {full_config_path}")

        if os.path.exists(preprocess_path):
            with open(preprocess_path, "r") as f:
                preprocess_content = f.read()
            preprocess_content = preprocess_content.replace(
                "3.0" if precision == "fp16" else "3.7", preprocess_target_version
            )
            with open(preprocess_path, "w") as f:
                f.write(preprocess_content)

        return f"Overwritten preprocess and config.json to use {precision}."

    def get_precision(self):
        if not version_config_paths:
            raise FileNotFoundError("No configuration paths provided.")

        full_config_path = os.path.join(
            "programs", "applio_code", "rvc", "configs", version_config_paths[0]
        )
        try:
            with open(full_config_path, "r") as f:
                config = json.load(f)
            fp16_run_value = config["train"].get("fp16_run", False)
            precision = "fp16" if fp16_run_value else "fp32"
            return precision
        except FileNotFoundError:
            print(f"File not found: {full_config_path}")
            return None

    def device_config(self) -> tuple:
        if self.device.startswith("cuda"):
            self.set_cuda_config()
        elif self.has_mps():
            self.device = "mps"
            self.is_half = False
            self.set_precision("fp32")
        else:
            self.device = "cpu"
            self.is_half = False
            self.set_precision("fp32")

        # Configuration for 6GB GPU memory
        x_pad, x_query, x_center, x_max = (
            (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
        )
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            # Configuration for 5GB GPU memory
            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)

        return x_pad, x_query, x_center, x_max

    def set_cuda_config(self):
        i_device = int(self.device.split(":")[-1])
        self.gpu_name = torch.cuda.get_device_name(i_device)
        # Zluda
        if self.gpu_name.endswith("[ZLUDA]"):
            print("Zluda compatibility enabled, experimental feature.")
            torch.backends.cudnn.enabled = False
            torch.backends.cuda.enable_flash_sdp(False)
            torch.backends.cuda.enable_math_sdp(True)
            torch.backends.cuda.enable_mem_efficient_sdp(False)
        low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
        if (
            any(gpu in self.gpu_name for gpu in low_end_gpus)
            and "V100" not in self.gpu_name.upper()
        ):
            self.is_half = False
            self.set_precision("fp32")

        self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
            1024**3
        )


def max_vram_gpu(gpu):
    if torch.cuda.is_available():
        gpu_properties = torch.cuda.get_device_properties(gpu)
        total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
        return total_memory_gb
    else:
        return "0"


def get_gpu_info():
    ngpu = torch.cuda.device_count()
    gpu_infos = []
    if torch.cuda.is_available() or ngpu != 0:
        for i in range(ngpu):
            gpu_name = torch.cuda.get_device_name(i)
            mem = int(
                torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
                + 0.4
            )
            gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
    if len(gpu_infos) > 0:
        gpu_info = "\n".join(gpu_infos)
    else:
        gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
    return gpu_info


def get_number_of_gpus():
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        return "-".join(map(str, range(num_gpus)))
    else:
        return "-"
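The @singleton decorator above means every Config() call in the codebase shares one instance (and therefore one device/precision decision). A minimal standalone demonstration of that behavior; Config itself needs the JSON files on disk, so a stand-in class is used here:

def singleton(cls):
    instances = {}

    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance


@singleton
class Settings:
    def __init__(self):
        self.device = "cpu"


a = Settings()
b = Settings()
print(a is b)  # True: the constructor ran once; both names share the cached instance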
programs/applio_code/rvc/configs/v1/32000.json
ADDED
@@ -0,0 +1,47 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "epochs": 20000,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 4,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 12800,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 32000,
    "filter_length": 1024,
    "hop_length": 320,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 256,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [10,4,2,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
programs/applio_code/rvc/configs/v1/40000.json
ADDED
@@ -0,0 +1,47 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "epochs": 20000,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 4,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 12800,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 40000,
    "filter_length": 2048,
    "hop_length": 400,
    "win_length": 2048,
    "n_mel_channels": 125,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 256,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [10,10,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
programs/applio_code/rvc/configs/v1/48000.json
ADDED
@@ -0,0 +1,47 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "epochs": 20000,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 4,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 11520,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 48000,
    "filter_length": 2048,
    "hop_length": 480,
    "win_length": 2048,
    "n_mel_channels": 128,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 256,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [10,6,2,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
programs/applio_code/rvc/configs/v2/32000.json
ADDED
@@ -0,0 +1,43 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 12800,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 32000,
    "filter_length": 1024,
    "hop_length": 320,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [10,8,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [20,16,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
programs/applio_code/rvc/configs/v2/40000.json
ADDED
@@ -0,0 +1,43 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 12800,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 40000,
    "filter_length": 2048,
    "hop_length": 400,
    "win_length": 2048,
    "n_mel_channels": 125,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [10,10,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
programs/applio_code/rvc/configs/v2/48000.json
ADDED
@@ -0,0 +1,43 @@

{
  "train": {
    "log_interval": 200,
    "seed": 1234,
    "learning_rate": 1e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 17280,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "max_wav_value": 32768.0,
    "sample_rate": 48000,
    "filter_length": 2048,
    "hop_length": 480,
    "win_length": 2048,
    "n_mel_channels": 128,
    "mel_fmin": 0.0,
    "mel_fmax": null
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "text_enc_hidden_dim": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [12,10,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [24,20,4,4],
    "use_spectral_norm": false,
    "gin_channels": 256,
    "spk_embed_dim": 109
  }
}
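A relationship worth checking across all six configs: assuming the generator emits one hop of waveform samples per mel frame (the usual contract for this HiFi-GAN-style vocoder), the product of "upsample_rates" must equal "hop_length". A quick standalone check, with the values copied from the files above:

configs = {
    "v1/32000": (320, [10, 4, 2, 2, 2]),
    "v1/40000": (400, [10, 10, 2, 2]),
    "v1/48000": (480, [10, 6, 2, 2, 2]),
    "v2/32000": (320, [10, 8, 2, 2]),
    "v2/40000": (400, [10, 10, 2, 2]),
    "v2/48000": (480, [12, 10, 2, 2]),
}
for name, (hop, rates) in configs.items():
    product = 1
    for r in rates:
        product *= r
    assert product == hop, f"{name}: {product} != {hop}"
print("every upsample_rates product matches its hop_length")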
programs/applio_code/rvc/infer/infer.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import torch
|
5 |
+
import librosa
|
6 |
+
import logging
|
7 |
+
import traceback
|
8 |
+
import numpy as np
|
9 |
+
import soundfile as sf
|
10 |
+
|
11 |
+
from scipy.io import wavfile
|
12 |
+
|
13 |
+
now_dir = os.getcwd()
|
14 |
+
sys.path.append(now_dir)
|
15 |
+
|
16 |
+
from programs.applio_code.rvc.infer.pipeline import Pipeline as VC
|
17 |
+
from programs.applio_code.rvc.lib.utils import load_audio_infer, load_embedding
|
18 |
+
from programs.applio_code.rvc.lib.tools.split_audio import process_audio, merge_audio
|
19 |
+
from programs.applio_code.rvc.lib.algorithm.synthesizers import Synthesizer
|
20 |
+
from programs.applio_code.rvc.configs.config import Config
|
21 |
+
|
22 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
23 |
+
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
24 |
+
logging.getLogger("faiss").setLevel(logging.WARNING)
|
25 |
+
logging.getLogger("faiss.loader").setLevel(logging.WARNING)
|
26 |
+
|
27 |
+
|
28 |
+
class VoiceConverter:
|
29 |
+
"""
|
30 |
+
A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
|
31 |
+
"""
|
32 |
+
|
33 |
+
def __init__(self):
|
34 |
+
"""
|
35 |
+
Initializes the VoiceConverter with default configuration, and sets up models and parameters.
|
36 |
+
"""
|
37 |
+
self.config = Config() # Load RVC configuration
|
38 |
+
self.hubert_model = (
|
39 |
+
None # Initialize the Hubert model (for embedding extraction)
|
40 |
+
)
|
41 |
+
self.last_embedder_model = None # Last used embedder model
|
42 |
+
self.tgt_sr = None # Target sampling rate for the output audio
|
43 |
+
self.net_g = None # Generator network for voice conversion
|
44 |
+
self.vc = None # Voice conversion pipeline instance
|
45 |
+
self.cpt = None # Checkpoint for loading model weights
|
46 |
+
self.version = None # Model version
|
47 |
+
self.n_spk = None # Number of speakers in the model
|
48 |
+
self.use_f0 = None # Whether the model uses F0
|
49 |
+
|
50 |
+
def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
|
51 |
+
"""
|
52 |
+
Loads the HuBERT model for speaker embedding extraction.
|
53 |
+
"""
|
54 |
+
self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
|
55 |
+
self.hubert_model.to(self.config.device)
|
56 |
+
self.hubert_model = (
|
57 |
+
self.hubert_model.half()
|
58 |
+
if self.config.is_half
|
59 |
+
else self.hubert_model.float()
|
60 |
+
)
|
61 |
+
self.hubert_model.eval()
|
62 |
+
|
63 |
+
@staticmethod
|
64 |
+
def convert_audio_format(input_path, output_path, output_format):
|
65 |
+
"""
|
66 |
+
Converts an audio file to a specified output format.
|
67 |
+
"""
|
68 |
+
try:
|
69 |
+
if output_format != "WAV":
|
70 |
+
print(f"Converting audio to {output_format} format...")
|
71 |
+
audio, sample_rate = librosa.load(input_path, sr=None)
|
72 |
+
common_sample_rates = [
|
73 |
+
8000,
|
74 |
+
11025,
|
75 |
+
12000,
|
76 |
+
16000,
|
77 |
+
22050,
|
78 |
+
24000,
|
79 |
+
32000,
|
80 |
+
44100,
|
81 |
+
48000,
|
82 |
+
]
|
83 |
+
target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
|
84 |
+
audio = librosa.resample(
|
85 |
+
audio, orig_sr=sample_rate, target_sr=target_sr
|
86 |
+
)
|
87 |
+
sf.write(output_path, audio, target_sr, format=output_format.lower())
|
88 |
+
return output_path
|
89 |
+
except Exception as error:
|
90 |
+
print(f"An error occurred converting the audio format: {error}")
|
91 |
+
|
92 |
+
def convert_audio(
|
93 |
+
self,
|
94 |
+
audio_input_path: str,
|
95 |
+
audio_output_path: str,
|
96 |
+
model_path: str,
|
97 |
+
index_path: str,
|
98 |
+
embedder_model: str,
|
99 |
+
pitch: int,
|
100 |
+
f0_file: str,
|
101 |
+
f0_method: str,
|
102 |
+
index_rate: float,
|
103 |
+
volume_envelope: int,
|
104 |
+
protect: float,
|
105 |
+
hop_length: int,
|
106 |
+
split_audio: bool,
|
107 |
+
f0_autotune: bool,
|
108 |
+
filter_radius: int,
|
109 |
+
embedder_model_custom: str,
|
110 |
+
export_format: str,
|
111 |
+
resample_sr: int = 0,
|
112 |
+
sid: int = 0,
|
113 |
+
):
|
114 |
+
"""
|
115 |
+
Performs voice conversion on the input audio.
|
116 |
+
"""
|
117 |
+
self.get_vc(model_path, sid)
|
118 |
+
|
119 |
+
try:
|
120 |
+
start_time = time.time()
|
121 |
+
print(f"Converting audio '{audio_input_path}'...")
|
122 |
+
audio = load_audio_infer(
|
123 |
+
audio_input_path,
|
124 |
+
16000,
|
125 |
+
)
|
126 |
+
audio_max = np.abs(audio).max() / 0.95
|
127 |
+
|
128 |
+
if audio_max > 1:
|
129 |
+
audio /= audio_max
|
130 |
+
|
131 |
+
if not self.hubert_model or embedder_model != self.last_embedder_model:
|
132 |
+
self.load_hubert(embedder_model, embedder_model_custom)
|
133 |
+
self.last_embedder_model = embedder_model
|
134 |
+
|
135 |
+
file_index = (
|
136 |
+
index_path.strip()
|
137 |
+
.strip('"')
|
138 |
+
.strip("\n")
|
139 |
+
.strip('"')
|
140 |
+
.strip()
|
141 |
+
.replace("trained", "added")
|
142 |
+
)
|
143 |
+
|
144 |
+
if self.tgt_sr != resample_sr >= 16000:
|
145 |
+
self.tgt_sr = resample_sr
|
146 |
+
|
147 |
+
if split_audio:
|
148 |
+
result, new_dir_path = process_audio(audio_input_path)
|
149 |
+
if result == "Error":
|
150 |
+
return "Error with Split Audio", None
|
151 |
+
|
152 |
+
dir_path = (
|
153 |
+
new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
|
154 |
+
)
|
155 |
+
if dir_path:
|
156 |
+
paths = [
|
157 |
+
os.path.join(root, name)
|
158 |
+
for root, _, files in os.walk(dir_path, topdown=False)
|
159 |
+
for name in files
|
160 |
+
if name.endswith(".wav") and root == dir_path
|
161 |
+
]
|
162 |
+
try:
|
163 |
+
for path in paths:
|
164 |
+
self.convert_audio(
|
165 |
+
audio_input_path=path,
|
166 |
+
audio_output_path=path,
|
167 |
+
model_path=model_path,
|
168 |
+
index_path=index_path,
|
169 |
+
sid=sid,
|
170 |
+
pitch=pitch,
|
171 |
+
f0_file=None,
|
172 |
+
f0_method=f0_method,
|
173 |
+
index_rate=index_rate,
|
174 |
+
resample_sr=resample_sr,
|
175 |
+
volume_envelope=volume_envelope,
|
176 |
+
protect=protect,
|
177 |
+
hop_length=hop_length,
|
178 |
+
split_audio=False,
|
179 |
+
f0_autotune=f0_autotune,
|
180 |
+
filter_radius=filter_radius,
|
181 |
+
export_format=export_format,
|
182 |
+
embedder_model=embedder_model,
|
183 |
+
embedder_model_custom=embedder_model_custom,
|
184 |
+
)
|
185 |
+
except Exception as error:
|
186 |
+
print(f"An error occurred processing the segmented audio: {error}")
|
187 |
+
print(traceback.format_exc())
|
188 |
+
return f"Error {error}"
|
189 |
+
print("Finished processing segmented audio, now merging audio...")
|
190 |
+
merge_timestamps_file = os.path.join(
|
191 |
+
os.path.dirname(new_dir_path),
|
192 |
+
f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
|
193 |
+
)
|
194 |
+
self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
|
195 |
+
os.remove(merge_timestamps_file)
|
196 |
+
sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
|
197 |
+
else:
|
198 |
+
audio_opt = self.vc.pipeline(
|
199 |
+
model=self.hubert_model,
|
200 |
+
net_g=self.net_g,
|
201 |
+
sid=sid,
|
202 |
+
audio=audio,
|
203 |
+
input_audio_path=audio_input_path,
|
204 |
+
pitch=pitch,
|
205 |
+
f0_method=f0_method,
|
206 |
+
file_index=file_index,
|
207 |
+
index_rate=index_rate,
|
208 |
+
pitch_guidance=self.use_f0,
|
209 |
+
filter_radius=filter_radius,
|
210 |
+
tgt_sr=self.tgt_sr,
|
211 |
+
resample_sr=resample_sr,
|
212 |
+
volume_envelope=volume_envelope,
|
213 |
+
version=self.version,
|
214 |
+
protect=protect,
|
215 |
+
hop_length=hop_length,
|
216 |
+
f0_autotune=f0_autotune,
|
217 |
+
f0_file=f0_file,
|
218 |
+
)
|
219 |
+
|
220 |
+
if audio_output_path:
|
221 |
+
sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
|
222 |
+
output_path_format = audio_output_path.replace(
|
223 |
+
".wav", f".{export_format.lower()}"
|
224 |
+
)
|
225 |
+
audio_output_path = self.convert_audio_format(
|
226 |
+
audio_output_path, output_path_format, export_format
|
227 |
+
)
|
228 |
+
|
229 |
+
elapsed_time = time.time() - start_time
|
230 |
+
print(
|
231 |
+
f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
|
232 |
+
)
|
233 |
+
|
234 |
+
except Exception as error:
|
235 |
+
print(f"An error occurred during audio conversion: {error}")
|
236 |
+
print(traceback.format_exc())
|
237 |
+
|
238 |
+
def convert_audio_batch(
|
239 |
+
self,
|
240 |
+
audio_input_paths: str,
|
241 |
+
audio_output_path: str,
|
242 |
+
model_path: str,
|
243 |
+
index_path: str,
|
244 |
+
embedder_model: str,
|
245 |
+
pitch: int,
|
246 |
+
f0_file: str,
|
247 |
+
f0_method: str,
|
248 |
+
index_rate: float,
|
249 |
+
volume_envelope: int,
|
250 |
+
protect: float,
|
251 |
+
hop_length: int,
|
252 |
+
split_audio: bool,
|
253 |
+
f0_autotune: bool,
|
254 |
+
filter_radius: int,
|
255 |
+
embedder_model_custom: str,
|
256 |
+
export_format: str,
|
257 |
+
resample_sr: int = 0,
|
258 |
+
sid: int = 0,
|
259 |
+
pid_file_path: str = None,
|
260 |
+
):
|
261 |
+
"""
|
262 |
+
Performs voice conversion on a batch of input audio files.
|
263 |
+
"""
|
264 |
+
pid = os.getpid()
|
265 |
+
with open(pid_file_path, "w") as pid_file:
|
266 |
+
pid_file.write(str(pid))
|
267 |
+
try:
|
268 |
+
if not self.hubert_model or embedder_model != self.last_embedder_model:
|
269 |
+
self.load_hubert(embedder_model, embedder_model_custom)
|
270 |
+
self.last_embedder_model = embedder_model
|
271 |
+
self.get_vc(model_path, sid)
|
272 |
+
file_index = (
|
273 |
+
index_path.strip()
|
274 |
+
.strip('"')
|
275 |
+
.strip("\n")
|
276 |
+
.strip('"')
|
277 |
+
.strip()
|
278 |
+
.replace("trained", "added")
|
279 |
+
)
|
280 |
+
start_time = time.time()
|
281 |
+
print(f"Converting audio batch '{audio_input_paths}'...")
|
282 |
+
audio_files = [
|
283 |
+
f
|
284 |
+
for f in os.listdir(audio_input_paths)
|
285 |
+
if f.endswith((".mp3", ".wav", ".flac", ".m4a", ".ogg", ".opus"))
|
286 |
+
]
|
287 |
+
print(f"Detected {len(audio_files)} audio files for inference.")
|
288 |
+
for i, audio_input_path in enumerate(audio_files):
|
289 |
+
audio_output_paths = os.path.join(
|
290 |
+
audio_output_path,
|
291 |
+
f"{os.path.splitext(os.path.basename(audio_input_path))[0]}_output.{export_format.lower()}",
|
292 |
+
)
|
293 |
+
if os.path.exists(audio_output_paths):
|
294 |
+
continue
|
295 |
+
print(f"Converting audio '{audio_input_path}'...")
|
296 |
+
audio_input_path = os.path.join(audio_input_paths, audio_input_path)
|
297 |
+
|
298 |
+
audio = load_audio_infer(
|
299 |
+
audio_input_path,
|
300 |
+
16000,
|
301 |
+
)
|
302 |
+
audio_max = np.abs(audio).max() / 0.95
|
303 |
+
|
304 |
+
if audio_max > 1:
|
305 |
+
audio /= audio_max
|
306 |
+
|
307 |
+
if self.tgt_sr != resample_sr >= 16000:
|
308 |
+
self.tgt_sr = resample_sr
|
309 |
+
|
310 |
+
if split_audio:
|
311 |
+
result, new_dir_path = process_audio(audio_input_path)
|
312 |
+
if result == "Error":
|
313 |
+
return "Error with Split Audio", None
|
314 |
+
|
315 |
+
dir_path = (
|
316 |
+
new_dir_path.strip().strip('"').strip("\n").strip('"').strip()
|
317 |
+
)
|
318 |
+
if dir_path:
|
319 |
+
paths = [
|
320 |
+
os.path.join(root, name)
|
321 |
+
for root, _, files in os.walk(dir_path, topdown=False)
|
322 |
+
for name in files
|
323 |
+
if name.endswith(".wav") and root == dir_path
|
324 |
+
]
|
325 |
+
try:
|
326 |
+
for path in paths:
|
327 |
+
self.convert_audio(
|
328 |
+
audio_input_path=path,
|
329 |
+
audio_output_path=path,
|
330 |
+
model_path=model_path,
|
331 |
+
index_path=index_path,
|
332 |
+
sid=sid,
|
333 |
+
pitch=pitch,
|
334 |
+
f0_file=None,
|
335 |
+
f0_method=f0_method,
|
336 |
+
                            index_rate=index_rate,
                            resample_sr=resample_sr,
                            volume_envelope=volume_envelope,
                            protect=protect,
                            hop_length=hop_length,
                            split_audio=False,
                            f0_autotune=f0_autotune,
                            filter_radius=filter_radius,
                            export_format=export_format,
                            embedder_model=embedder_model,
                            embedder_model_custom=embedder_model_custom,
                        )
                    except Exception as error:
                        print(
                            f"An error occurred processing the segmented audio: {error}"
                        )
                        print(traceback.format_exc())
                        return f"Error {error}"
                print("Finished processing segmented audio, now merging audio...")
                merge_timestamps_file = os.path.join(
                    os.path.dirname(new_dir_path),
                    f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt",
                )
                self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
                os.remove(merge_timestamps_file)
            else:
                audio_opt = self.vc.pipeline(
                    model=self.hubert_model,
                    net_g=self.net_g,
                    sid=sid,
                    audio=audio,
                    input_audio_path=audio_input_path,
                    pitch=pitch,
                    f0_method=f0_method,
                    file_index=file_index,
                    index_rate=index_rate,
                    pitch_guidance=self.use_f0,
                    filter_radius=filter_radius,
                    tgt_sr=self.tgt_sr,
                    resample_sr=resample_sr,
                    volume_envelope=volume_envelope,
                    version=self.version,
                    protect=protect,
                    hop_length=hop_length,
                    f0_autotune=f0_autotune,
                    f0_file=f0_file,
                )

            if audio_output_paths:
                sf.write(audio_output_paths, audio_opt, self.tgt_sr, format="WAV")
                output_path_format = audio_output_paths.replace(
                    ".wav", f".{export_format.lower()}"
                )
                audio_output_paths = self.convert_audio_format(
                    audio_output_paths, output_path_format, export_format
                )
                print(f"Conversion completed at '{audio_output_paths}'.")
            elapsed_time = time.time() - start_time
            print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
            os.remove(pid_file_path)
        except Exception as error:
            print(f"An error occurred during audio conversion: {error}")
            print(traceback.format_exc())

    def get_vc(self, weight_root, sid):
        """
        Loads the voice conversion model and sets up the pipeline.
        """
        if sid == "" or sid == []:
            self.cleanup_model()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        self.load_model(weight_root)

        if self.cpt is not None:
            self.setup_network()
            self.setup_vc_instance()

    def cleanup_model(self):
        """
        Cleans up the model and releases resources.
        """
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        del self.net_g, self.cpt
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        self.cpt = None

    def load_model(self, weight_root):
        """
        Loads the model weights from the specified path.
        """
        self.cpt = (
            torch.load(weight_root, map_location="cpu")
            if os.path.isfile(weight_root)
            else None
        )

    def setup_network(self):
        """
        Sets up the network configuration based on the loaded checkpoint.
        """
        if self.cpt is not None:
            self.tgt_sr = self.cpt["config"][-1]
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
            self.use_f0 = self.cpt.get("f0", 1)

            self.version = self.cpt.get("version", "v1")
            self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
            self.net_g = Synthesizer(
                *self.cpt["config"],
                use_f0=self.use_f0,
                text_enc_hidden_dim=self.text_enc_hidden_dim,
                is_half=self.config.is_half,
            )
            del self.net_g.enc_q
            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.config.device)
            self.net_g = (
                self.net_g.half() if self.config.is_half else self.net_g.float()
            )

    def setup_vc_instance(self):
        """
        Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
        """
        if self.cpt is not None:
            self.vc = VC(self.tgt_sr, self.config)
            self.n_spk = self.cpt["config"][-3]
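For orientation, a minimal sketch (with a hypothetical path) of the checkpoint layout that load_model and setup_network above rely on; only the keys this class actually reads are assumed to exist:

import torch

ckpt_path = "logs/my_voice/my_voice.pth"  # hypothetical model path
cpt = torch.load(ckpt_path, map_location="cpu")

tgt_sr = cpt["config"][-1]                      # target sampling rate
n_spk = cpt["weight"]["emb_g.weight"].shape[0]  # speaker count, recovered from the embedding table
use_f0 = cpt.get("f0", 1)                       # 1 if the model was trained with pitch guidance
version = cpt.get("version", "v1")              # "v1" (256-dim features) or "v2" (768-dim)
print(tgt_sr, n_spk, use_f0, version)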
programs/applio_code/rvc/infer/pipeline.py
ADDED
@@ -0,0 +1,701 @@
import os
import gc
import re
import sys
import torch
import torch.nn.functional as F
import torchcrepe
import faiss
import librosa
import numpy as np
from scipy import signal
from torch import Tensor

now_dir = os.getcwd()
sys.path.append(now_dir)

from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor
from programs.applio_code.rvc.lib.predictors.FCPE import FCPEF0Predictor

import logging

logging.getLogger("faiss").setLevel(logging.WARNING)

# Constants for high-pass filter
FILTER_ORDER = 5
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz
bh, ah = signal.butter(
    N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
)

input_audio_path2wav = {}


class AudioProcessor:
    """
    A class for processing audio signals, specifically for adjusting RMS levels.
    """

    @staticmethod
    def change_rms(
        source_audio: np.ndarray,
        source_rate: int,
        target_audio: np.ndarray,
        target_rate: int,
        rate: float,
    ) -> np.ndarray:
        """
        Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.

        Args:
            source_audio: The source audio signal as a NumPy array.
            source_rate: The sampling rate of the source audio.
            target_audio: The target audio signal to adjust.
            target_rate: The sampling rate of the target audio.
            rate: The blending rate between the source and target RMS levels.
        """
        # Calculate RMS of both audio signals
        rms1 = librosa.feature.rms(
            y=source_audio,
            frame_length=source_rate // 2 * 2,
            hop_length=source_rate // 2,
        )
        rms2 = librosa.feature.rms(
            y=target_audio,
            frame_length=target_rate // 2 * 2,
            hop_length=target_rate // 2,
        )

        # Interpolate RMS to match target audio length
        rms1 = F.interpolate(
            torch.from_numpy(rms1).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        rms2 = F.interpolate(
            torch.from_numpy(rms2).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)

        # Adjust target audio RMS based on the source audio RMS
        adjusted_audio = (
            target_audio
            * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
        )
        return adjusted_audio
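

def _sketch_rms_blend():
    """
    Standalone sketch (not called anywhere) of the RMS-envelope blend that
    AudioProcessor.change_rms applies. The tensors are toy per-frame envelopes
    standing in for the interpolated rms1/rms2 above: rate=0.0 rescales the
    output fully to the source envelope (gain = rms1 / rms2), while rate=1.0
    leaves the output envelope untouched (gain = 1).
    """
    rms_source = torch.tensor([0.50, 0.25, 0.10])
    rms_target = torch.tensor([0.10, 0.10, 0.10])
    target = np.ones(3, dtype=np.float32)
    for rate in (0.0, 0.5, 1.0):
        gain = torch.pow(rms_source, 1 - rate) * torch.pow(rms_target, rate - 1)
        print(rate, (target * gain.numpy()).round(3))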


class Autotune:
    """
    A class for applying autotune to a given fundamental frequency (F0) contour.
    """

    def __init__(self, ref_freqs):
        """
        Initializes the Autotune class with a set of reference frequencies.

        Args:
            ref_freqs: A list of reference frequencies representing musical notes.
        """
        self.ref_freqs = ref_freqs
        self.note_dict = self.generate_interpolated_frequencies()

    def generate_interpolated_frequencies(self):
        """
        Generates a list of interpolated frequencies between consecutive reference frequencies.
        """
        note_dict = []
        for i in range(len(self.ref_freqs) - 1):
            freq_low = self.ref_freqs[i]
            freq_high = self.ref_freqs[i + 1]
            interpolated_freqs = np.linspace(
                freq_low, freq_high, num=10, endpoint=False
            )
            note_dict.extend(interpolated_freqs)
        note_dict.append(self.ref_freqs[-1])
        return note_dict

    def autotune_f0(self, f0):
        """
        Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.

        Args:
            f0: The input F0 contour as a NumPy array.
        """
        autotuned_f0 = np.zeros_like(f0)
        for i, freq in enumerate(f0):
            closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
            autotuned_f0[i] = closest_note
        return autotuned_f0
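

def _sketch_autotune_snapping():
    """
    Standalone sketch (not called anywhere) of the snapping behaviour, using
    the same reference pitches the Pipeline constructor passes in below. Note
    that an unvoiced frame (f0 == 0.0) also snaps to the lowest grid entry.
    """
    ref_freqs = [65.41, 82.41, 110.00, 146.83, 196.00, 246.94,
                 329.63, 440.00, 587.33, 783.99, 1046.50]
    autotune = Autotune(ref_freqs)
    f0 = np.array([101.3, 438.2, 0.0])  # a slightly flat A2, a near-A4, an unvoiced frame
    return autotune.autotune_f0(f0)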


class Pipeline:
    """
    The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
    voice conversion using a model, and post-processing.
    """

    def __init__(self, tgt_sr, config):
        """
        Initializes the Pipeline class with target sampling rate and configuration parameters.

        Args:
            tgt_sr: The target sampling rate for the output audio.
            config: A configuration object containing various parameters for the pipeline.
        """
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.is_half = config.is_half
        self.sample_rate = 16000
        self.window = 160
        self.t_pad = self.sample_rate * self.x_pad
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query
        self.t_center = self.sample_rate * self.x_center
        self.t_max = self.sample_rate * self.x_max
        self.time_step = self.window / self.sample_rate * 1000
        self.f0_min = 50
        self.f0_max = 1100
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = config.device
        self.ref_freqs = [
            65.41,
            82.41,
            110.00,
            146.83,
            196.00,
            246.94,
            329.63,
            440.00,
            587.33,
            783.99,
            1046.50,
        ]
        self.autotune = Autotune(self.ref_freqs)
        self.note_dict = self.autotune.note_dict

    def get_f0_crepe(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length,
        model="full",
    ):
        """
        Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.

        Args:
            x: The input audio signal as a NumPy array.
            f0_min: Minimum F0 value to consider.
            f0_max: Maximum F0 value to consider.
            p_len: Desired length of the F0 output.
            hop_length: Hop length for the Crepe model.
            model: Crepe model size to use ("full" or "tiny").
        """
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        audio = torch.from_numpy(x).to(self.device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sample_rate,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=self.device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)
        return f0

    def get_f0_hybrid(
        self,
        methods_str,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length,
    ):
        """
        Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.

        Args:
            methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
            x: The input audio signal as a NumPy array.
            f0_min: Minimum F0 value to consider.
            f0_max: Maximum F0 value to consider.
            p_len: Desired length of the F0 output.
            hop_length: Hop length for F0 estimation methods.
        """
        methods_str = re.search(r"hybrid\[(.+)\]", methods_str)
        if methods_str:
            methods = [method.strip() for method in methods_str.group(1).split("+")]
        f0_computation_stack = []
        print(f"Calculating f0 pitch estimations for methods {str(methods)}")
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        for method in methods:
            f0 = None
            if method == "crepe":
                f0 = self.get_f0_crepe(
                    x, f0_min, f0_max, p_len, int(hop_length)
                )
            elif method == "rmvpe":
                self.model_rmvpe = RMVPE0Predictor(
                    os.path.join(
                        "programs",
                        "applio_code",
                        "rvc",
                        "models",
                        "predictors",
                        "rmvpe.pt",
                    ),
                    is_half=self.is_half,
                    device=self.device,
                )
                f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
                f0 = f0[1:]
            elif method == "fcpe":
                self.model_fcpe = FCPEF0Predictor(
                    os.path.join(
                        "programs",
                        "applio_code",
                        "rvc",
                        "models",
                        "predictors",
                        "fcpe.pt",
                    ),
                    f0_min=int(f0_min),
                    f0_max=int(f0_max),
                    dtype=torch.float32,
                    device=self.device,
                    sample_rate=self.sample_rate,
                    threshold=0.03,
                )
                f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
                del self.model_fcpe
                gc.collect()
            f0_computation_stack.append(f0)

        f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
        f0_median_hybrid = None
        if len(f0_computation_stack) == 1:
            f0_median_hybrid = f0_computation_stack[0]
        else:
            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
        return f0_median_hybrid

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        pitch,
        f0_method,
        filter_radius,
        hop_length,
        f0_autotune,
        inp_f0=None,
    ):
        """
        Estimates the fundamental frequency (F0) of a given audio signal using various methods.

        Args:
            input_audio_path: Path to the input audio file.
            x: The input audio signal as a NumPy array.
            p_len: Desired length of the F0 output.
            pitch: Key to adjust the pitch of the F0 contour.
            f0_method: Method to use for F0 estimation (e.g., "crepe").
            filter_radius: Radius for median filtering the F0 contour.
            hop_length: Hop length for F0 estimation methods.
            f0_autotune: Whether to apply autotune to the F0 contour.
            inp_f0: Optional input F0 contour to use instead of estimating.
        """
        global input_audio_path2wav
        if f0_method == "crepe":
            f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_crepe(
                x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
            )
        elif f0_method == "rmvpe":
            self.model_rmvpe = RMVPE0Predictor(
                os.path.join(
                    "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt"
                ),
                is_half=self.is_half,
                device=self.device,
            )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        elif f0_method == "fcpe":
            self.model_fcpe = FCPEF0Predictor(
                os.path.join(
                    "programs", "applio_code", "rvc", "models", "predictors", "fcpe.pt"
                ),
                f0_min=int(self.f0_min),
                f0_max=int(self.f0_max),
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03,
            )
            f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
            del self.model_fcpe
            gc.collect()
        elif "hybrid" in f0_method:
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self.get_f0_hybrid(
                f0_method,
                x,
                self.f0_min,
                self.f0_max,
                p_len,
                hop_length,
            )

        if f0_autotune == "True":
            f0 = self.autotune.autotune_f0(f0)

        f0 *= pow(2, pitch / 12)
        tf0 = self.sample_rate // self.window
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        f0bak = f0.copy()
        # Map F0 onto a mel axis and quantize into the 1..255 bins the synthesizer expects.
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
            self.f0_mel_max - self.f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int64)

        return f0_coarse, f0bak
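
    @staticmethod
    def _sketch_coarse_f0_quantization():
        """
        Standalone sketch (not called by the pipeline) of the coarse-pitch
        quantization at the end of get_f0: F0 in Hz is warped onto a mel axis
        and mapped into the 1..255 bins the synthesizer's pitch embedding
        expects. Constants mirror f0_min=50 and f0_max=1100 above; bin 1
        doubles as "unvoiced".
        """
        mel_min = 1127 * np.log(1 + 50 / 700)
        mel_max = 1127 * np.log(1 + 1100 / 700)
        f0 = np.array([0.0, 50.0, 220.0, 440.0, 1100.0])  # 0 Hz marks an unvoiced frame
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - mel_min) * 254 / (mel_max - mel_min) + 1
        f0_mel = np.clip(f0_mel, 1, 255)
        # 0 Hz and 50 Hz collapse into bin 1; 1100 Hz hits bin 255.
        return np.rint(f0_mel).astype(np.int64)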

    def voice_conversion(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """
        Performs voice conversion on a given audio segment.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio0: The input audio segment.
            pitch: Quantized F0 contour for pitch guidance.
            pitchf: Original F0 contour for pitch guidance.
            index: FAISS index for speaker embedding retrieval.
            big_npy: Speaker embeddings stored in a NumPy array.
            index_rate: Blending rate for speaker embedding retrieval.
            version: Model version ("v1" or "v2").
            protect: Protection level for preserving the original pitch.
        """
        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        with torch.no_grad():
            feats = model(feats.to(self.device))["last_hidden_state"]
            feats = (
                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
            )
        if protect < 0.5 and pitch is not None and pitchf is not None:
            feats0 = feats.clone()
        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # Retrieve the 8 nearest training features per frame and blend them
            # in, weighted by inverse squared distance.
            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch is not None and pitchf is not None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch is not None and pitchf is not None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if protect < 0.5 and pitch is not None and pitchf is not None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch is not None and pitchf is not None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio1
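
    @staticmethod
    def _sketch_index_retrieval_blend():
        """
        Standalone sketch (not called by the pipeline) of the retrieval step in
        voice_conversion: a brute-force nearest-neighbour search stands in for
        the FAISS index, with toy 8-dim rows instead of 256/768-dim HuBERT
        features. Here `score` plays the role of the squared-L2 distances
        returned by a flat FAISS index.
        """
        rng = np.random.default_rng(0)
        big_npy = rng.normal(size=(1000, 8)).astype(np.float32)  # stored training features
        feats = rng.normal(size=(25, 8)).astype(np.float32)      # current segment features
        index_rate = 0.75

        d2 = ((feats[:, None, :] - big_npy[None, :, :]) ** 2).sum(-1)
        ix = np.argsort(d2, axis=1)[:, :8]                 # k=8 nearest rows per frame
        score = np.take_along_axis(d2, ix, axis=1)

        weight = np.square(1 / score)                      # inverse-square distance weights
        weight /= weight.sum(axis=1, keepdims=True)
        retrieved = (big_npy[ix] * weight[..., None]).sum(axis=1)

        # Same convex blend as above: features pulled toward the training distribution.
        return index_rate * retrieved + (1 - index_rate) * feats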

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        pitch,
        f0_method,
        file_index,
        index_rate,
        pitch_guidance,
        filter_radius,
        tgt_sr,
        resample_sr,
        volume_envelope,
        version,
        protect,
        hop_length,
        f0_autotune,
        f0_file,
    ):
        """
        The main pipeline function for performing voice conversion.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio: The input audio signal.
            input_audio_path: Path to the input audio file.
            pitch: Key to adjust the pitch of the F0 contour.
            f0_method: Method to use for F0 estimation.
            file_index: Path to the FAISS index file for speaker embedding retrieval.
            index_rate: Blending rate for speaker embedding retrieval.
            pitch_guidance: Whether to use pitch guidance during voice conversion.
            filter_radius: Radius for median filtering the F0 contour.
            tgt_sr: Target sampling rate for the output audio.
            resample_sr: Resampling rate for the output audio.
            volume_envelope: Blending rate for adjusting the RMS level of the output audio.
            version: Model version.
            protect: Protection level for preserving the original pitch.
            hop_length: Hop length for F0 estimation methods.
            f0_autotune: Whether to apply autotune to the F0 contour.
            f0_file: Path to a file containing an F0 contour to use.
        """
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception as error:
                print(f"An error occurred reading the FAISS index: {error}")
                index = big_npy = None
        else:
            index = big_npy = None
        # High-pass filter out rumble below the F0 search range.
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Long input: pick split points near local energy minima so cuts
            # land in quiet regions rather than mid-phoneme.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except Exception as error:
                print(f"An error occurred reading the F0 file: {error}")
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        if pitch_guidance:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                pitch,
                f0_method,
                filter_radius,
                hop_length,
                f0_autotune,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        for t in opt_ts:
            t = t // self.window * self.window
            if pitch_guidance:
                audio_opt.append(
                    self.voice_conversion(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.voice_conversion(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if pitch_guidance:
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.voice_conversion(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if volume_envelope != 1:
            audio_opt = AudioProcessor.change_rms(
                audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope
            )
        if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        # Peak-normalize and convert to 16-bit PCM.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del sid
        if pitch_guidance:
            del pitch, pitchf  # pitchf only exists when pitch guidance ran
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
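The least obvious step in pipeline() above is the split-point selection for long inputs: a sliding-window sum approximates local energy, and each cut is nudged to the flattest point within t_query samples either side of its nominal boundary, so segments are cut in quiet regions. A toy-sized sketch of the same logic (the real code uses window=160 and second-scale t_center/t_query values):

import numpy as np

window, t_center, t_query = 4, 40, 8
rng = np.random.default_rng(1)
audio = rng.normal(size=120) * np.where(np.arange(120) % 40 < 30, 1.0, 0.01)
audio_pad = np.pad(audio, (window // 2, window // 2), mode="reflect")

audio_sum = np.zeros_like(audio)
for i in range(window):
    audio_sum += audio_pad[i : i - window]  # sliding sum over `window` samples

cuts = []
for t in range(t_center, audio.shape[0], t_center):
    seg = np.abs(audio_sum[t - t_query : t + t_query])
    cuts.append(t - t_query + int(np.argmin(seg)))
print(cuts)  # boundaries pulled toward the quiet stretches of the signal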
programs/applio_code/rvc/lib/algorithm/__init__.py
ADDED
File without changes
programs/applio_code/rvc/lib/algorithm/attentions.py
ADDED
@@ -0,0 +1,292 @@
import math
import torch

from programs.applio_code.rvc.lib.algorithm.commons import convert_pad_shape


class MultiHeadAttention(torch.nn.Module):
    """
    Multi-head attention module with optional relative positional encoding and proximal bias.

    Args:
        channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_heads (int): Number of attention heads.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to None.
        heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
        block_length (int, optional): Block length for local attention. Defaults to None.
        proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
        proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
        self.drop = torch.nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = torch.nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        torch.nn.init.xavier_uniform_(self.conv_q.weight)
        torch.nn.init.xavier_uniform_(self.conv_k.weight)
        torch.nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = torch.nn.functional.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = torch.nn.functional.pad(
                relative_embeddings,
                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()

        # Concat columns of pad to shift from relative to absolute indexing.
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
        )

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along column.
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # Add zeros at the beginning that will skew the elements after reshape.
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
        )
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.

        Args:
            length: an integer scalar.
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
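

def _sketch_relative_to_absolute():
    """
    Standalone sketch (not called anywhere) of the pad-and-reshape trick in
    MultiHeadAttention._relative_position_to_absolute_position: a
    [b, h, l, 2*l-1] tensor of relative logits becomes [b, h, l, l] absolute
    positions. Entry [i, j] ends up holding relative offset (j - i), so the
    diagonal is constant (offset 0, index l-1 of the input's last dimension).
    """
    l = 4
    x = torch.arange(2 * l - 1, dtype=torch.float32).repeat(1, 1, l, 1)
    x = torch.nn.functional.pad(x, (0, 1))                # one pad column per row
    x_flat = x.reshape(1, 1, l * 2 * l)
    x_flat = torch.nn.functional.pad(x_flat, (0, l - 1))  # tail pad so rows realign
    return x_flat.reshape(1, 1, l + 1, 2 * l - 1)[:, :, :l, l - 1 :]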


class FFN(torch.nn.Module):
    """
    Feed-forward network module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        filter_channels (int): Number of filter channels in the convolution layers.
        kernel_size (int): Kernel size of the convolution layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        activation (str, optional): Activation function to use. Defaults to None.
        causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = torch.nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            x = x * torch.sigmoid(1.702 * x)  # fast sigmoid approximation of GELU
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = torch.nn.functional.pad(x, convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = torch.nn.functional.pad(x, convert_pad_shape(padding))
        return x
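A quick smoke test of the attention module, assuming the repository root is on sys.path; shapes follow the [batch, channels, time] convention used throughout these modules:

import torch
from programs.applio_code.rvc.lib.algorithm.attentions import MultiHeadAttention

attn = MultiHeadAttention(channels=192, out_channels=192, n_heads=2, window_size=10)
x = torch.randn(1, 192, 50)  # [b, d, t]
out = attn(x, x)             # self-attention: same tensor as query and key/value source
print(out.shape)             # torch.Size([1, 192, 50])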
programs/applio_code/rvc/lib/algorithm/commons.py
ADDED
@@ -0,0 +1,225 @@
import math
import torch
from typing import List, Optional


def init_weights(m, mean=0.0, std=0.01):
    """
    Initialize the weights of a module.

    Args:
        m: The module to initialize.
        mean: The mean of the normal distribution.
        std: The standard deviation of the normal distribution.
    """
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """
    Calculate the padding needed for a convolution.

    Args:
        kernel_size: The size of the kernel.
        dilation: The dilation of the convolution.
    """
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    """
    Convert the pad shape to a flat list of integers.

    Args:
        pad_shape: The pad shape.
    """
    # NOTE: shadowed by the typed convert_pad_shape defined later in this module.
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """
    Calculate the KL divergence between two distributions.

    Args:
        m_p: The mean of the first distribution.
        logs_p: The log of the standard deviation of the first distribution.
        m_q: The mean of the second distribution.
        logs_q: The log of the standard deviation of the second distribution.
    """
    kl = (logs_q - logs_p) - 0.5
    kl += (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return kl
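

def _sketch_kl_check():
    """
    Standalone sketch (not called anywhere): kl_divergence above is the
    closed-form KL(P || Q) between Gaussians parameterized by mean and log
    standard deviation. This checks a scalar case against torch.distributions;
    the two values agree up to float precision.
    """
    from torch.distributions import Normal, kl_divergence as torch_kl

    m_p, logs_p = torch.tensor(0.3), torch.tensor(-0.2)
    m_q, logs_q = torch.tensor(-0.1), torch.tensor(0.4)
    ours = kl_divergence(m_p, logs_p, m_q, logs_q)
    ref = torch_kl(Normal(m_p, torch.exp(logs_p)), Normal(m_q, torch.exp(logs_q)))
    return ours.item(), ref.item()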


def slice_segments(
    x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
):
    """
    Slice segments from a tensor, handling tensors with different numbers of dimensions.

    Args:
        x (torch.Tensor): The tensor to slice.
        ids_str (torch.Tensor): The starting indices of the segments.
        segment_size (int, optional): The size of each segment. Defaults to 4.
        dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
    """
    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    elif dim == 3:
        ret = torch.zeros_like(x[:, :, :segment_size])

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size
        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """
    Randomly slice segments from a tensor.

    Args:
        x: The tensor to slice.
        x_lengths: The lengths of the sequences.
        segment_size: The size of each segment.
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size, dim=3)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """
    Generate a 1D timing signal.

    Args:
        length: The length of the signal.
        channels: The number of channels of the signal.
        min_timescale: The minimum timescale.
        max_timescale: The maximum timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def subsequent_mask(length):
    """
    Generate a subsequent mask.

    Args:
        length: The length of the sequence.
    """
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """
    Fused add tanh sigmoid multiply operation.

    Args:
        input_a: The first input tensor.
        input_b: The second input tensor.
        n_channels: The number of channels.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


# Zluda, same as previous, but without jit.script
def fused_add_tanh_sigmoid_multiply_no_jit(input_a, input_b, n_channels):
    """
    Fused add tanh sigmoid multiply operation.

    Args:
        input_a: The first input tensor.
        input_b: The second input tensor.
        n_channels: The number of channels.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
    """
    Convert the pad shape to a list of integers.

    Args:
        pad_shape: The pad shape.
    """
    return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()


def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    """
    Generate a sequence mask.

    Args:
        length: The lengths of the sequences.
        max_length: The maximum length of the sequences.
    """
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def clip_grad_value(parameters, clip_value, norm_type=2):
    """
    Clip the gradients of a list of parameters.

    Args:
        parameters: The list of parameters to clip.
        clip_value: The maximum value of the gradients.
        norm_type: The type of norm to use for clipping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
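Two of the helpers above do most of the work during training; a short demonstration, again assuming the repository root is importable:

import torch
from programs.applio_code.rvc.lib.algorithm.commons import (
    sequence_mask,
    rand_slice_segments,
)

lengths = torch.tensor([3, 5])
print(sequence_mask(lengths, max_length=6).int())
# tensor([[1, 1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1, 0]], dtype=torch.int32)

x = torch.randn(2, 8, 20)                # [batch, channels, time]
seg, ids = rand_slice_segments(x, segment_size=4)
print(seg.shape, ids)                    # torch.Size([2, 8, 4]) plus random start frames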
programs/applio_code/rvc/lib/algorithm/discriminators.py
ADDED
@@ -0,0 +1,199 @@
1 |
+
import torch
|
2 |
+
from torch.nn.utils.parametrizations import spectral_norm, weight_norm
|
3 |
+
|
4 |
+
from programs.applio_code.rvc.lib.algorithm.commons import get_padding
|
5 |
+
from programs.applio_code.rvc.lib.algorithm.residuals import LRELU_SLOPE
|
6 |
+
|
7 |
+
|
8 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
9 |
+
"""
|
10 |
+
Multi-period discriminator.
|
11 |
+
|
12 |
+
This class implements a multi-period discriminator, which is used to
|
13 |
+
discriminate between real and fake audio signals. The discriminator
|
14 |
+
is composed of a series of convolutional layers that are applied to
|
15 |
+
the input signal at different periods.
|
16 |
+
|
17 |
+
Args:
|
18 |
+
use_spectral_norm (bool): Whether to use spectral normalization.
|
19 |
+
Defaults to False.
|
20 |
+
"""
|
21 |
+
|
22 |
+
def __init__(self, use_spectral_norm=False):
|
23 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
24 |
+
periods = [2, 3, 5, 7, 11, 17]
|
25 |
+
self.discriminators = torch.nn.ModuleList(
|
26 |
+
[DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
27 |
+
+ [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
|
28 |
+
)
|
29 |
+
|
30 |
+
def forward(self, y, y_hat):
|
31 |
+
"""
|
32 |
+
Forward pass of the multi-period discriminator.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
y (torch.Tensor): Real audio signal.
|
36 |
+
y_hat (torch.Tensor): Fake audio signal.
|
37 |
+
"""
|
38 |
+
y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
|
39 |
+
for d in self.discriminators:
|
40 |
+
y_d_r, fmap_r = d(y)
|
41 |
+
y_d_g, fmap_g = d(y_hat)
|
42 |
+
y_d_rs.append(y_d_r)
|
43 |
+
y_d_gs.append(y_d_g)
|
44 |
+
fmap_rs.append(fmap_r)
|
45 |
+
fmap_gs.append(fmap_g)
|
46 |
+
|
47 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
48 |
+
|
49 |
+
|
50 |
+
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """
    Multi-period discriminator V2.

    This class implements a multi-period discriminator V2, which is used
    to discriminate between real and fake audio signals. The discriminator
    is composed of a series of convolutional layers that are applied to
    the input signal at different periods.

    Args:
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
    """

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]
        self.discriminators = torch.nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
        )

    def forward(self, y, y_hat):
        """
        Forward pass of the multi-period discriminator V2.

        Args:
            y (torch.Tensor): Real audio signal.
            y_hat (torch.Tensor): Fake audio signal.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    """
    Discriminator for the short-term component.

    This class implements a discriminator for the short-term component
    of the audio signal. The discriminator is composed of a series of
    convolutional layers that are applied to the input signal.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = torch.nn.ModuleList(
            [
                norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """
        Forward pass of the discriminator.

        Args:
            x (torch.Tensor): Input audio signal.
        """
        fmap = []
        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap


class DiscriminatorP(torch.nn.Module):
    """
    Discriminator for the long-term component.

    This class implements a discriminator for the long-term component
    of the audio signal. The discriminator is composed of a series of
    convolutional layers that are applied to the input signal at a given
    period.

    Args:
        period (int): Period of the discriminator.
        kernel_size (int): Kernel size of the convolutional layers.
            Defaults to 5.
        stride (int): Stride of the convolutional layers. Defaults to 3.
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        in_channels = [1, 32, 128, 512, 1024]
        out_channels = [32, 128, 512, 1024, 1024]

        self.convs = torch.nn.ModuleList(
            [
                norm_f(
                    torch.nn.Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                )
                for in_ch, out_ch in zip(in_channels, out_channels)
            ]
        )

        self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """
        Forward pass of the discriminator.

        Args:
            x (torch.Tensor): Input audio signal.
        """
        fmap = []
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
        x = x.view(b, c, -1, self.period)

        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)

        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
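For orientation, a minimal smoke test of the discriminator stack above (a sketch only; the module path follows this upload's layout and the waveform shapes are illustrative):

import torch
from programs.applio_code.rvc.lib.algorithm.discriminators import (
    MultiPeriodDiscriminatorV2,
)

disc = MultiPeriodDiscriminatorV2(use_spectral_norm=False)
real = torch.randn(2, 1, 16384)  # (batch, channels, samples)
fake = torch.randn(2, 1, 16384)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = disc(real, fake)
# One score tensor and one feature-map list per sub-discriminator:
# DiscriminatorS plus one DiscriminatorP per period in [2, 3, 5, 7, 11, 17, 23, 37].
assert len(y_d_rs) == 1 + 8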
programs/applio_code/rvc/lib/algorithm/encoders.py
ADDED
@@ -0,0 +1,219 @@
import math
import torch
from typing import Optional

from programs.applio_code.rvc.lib.algorithm.commons import sequence_mask
from programs.applio_code.rvc.lib.algorithm.modules import WaveNet
from programs.applio_code.rvc.lib.algorithm.normalization import LayerNorm
from programs.applio_code.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention


class Encoder(torch.nn.Module):
    """
    Encoder module for the Transformer model.

    Args:
        hidden_channels (int): Number of hidden channels in the encoder.
        filter_channels (int): Number of filter channels in the feed-forward network.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = torch.nn.Dropout(p_dropout)
        self.attn_layers = torch.nn.ModuleList()
        self.norm_layers_1 = torch.nn.ModuleList()
        self.ffn_layers = torch.nn.ModuleList()
        self.norm_layers_2 = torch.nn.ModuleList()
        for i in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class TextEncoder(torch.nn.Module):
    """Text Encoder with configurable embedding dimension.

    Args:
        out_channels (int): Output channels of the encoder.
        hidden_channels (int): Hidden channels of the encoder.
        filter_channels (int): Filter channels of the encoder.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int): Kernel size of the convolutional layers.
        p_dropout (float): Dropout probability.
        embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
        f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        embedding_dim,
        f0=True,
    ):
        super(TextEncoder, self).__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
        self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
        self.encoder = Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
    ):
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


class PosteriorEncoder(torch.nn.Module):
    """Posterior Encoder for inferring latent representation.

    Args:
        in_channels (int): Number of channels in the input.
        out_channels (int): Number of channels in the output.
        hidden_channels (int): Number of hidden channels in the encoder.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the encoder.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super(PosteriorEncoder, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
    ):
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Removes weight normalization from the encoder."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for hook in self.enc._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
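A quick shape check for TextEncoder (a sketch; embedding_dim=768 follows the v2 convention named in the docstring, and the remaining dimensions are illustrative assumptions):

import torch
from programs.applio_code.rvc.lib.algorithm.encoders import TextEncoder

enc = TextEncoder(
    out_channels=192,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.0,
    embedding_dim=768,  # v2 phone embeddings per the docstring
    f0=True,
)
phone = torch.randn(1, 100, 768)         # (batch, frames, embedding_dim)
pitch = torch.randint(0, 256, (1, 100))  # coarse pitch indices for emb_pitch
lengths = torch.tensor([100])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)  # (1, 192, 100), (1, 192, 100), (1, 1, 100)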
programs/applio_code/rvc/lib/algorithm/generators.py
ADDED
@@ -0,0 +1,199 @@
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
from typing import Optional

from programs.applio_code.rvc.lib.algorithm.residuals import (
    LRELU_SLOPE,
    ResBlock1,
    ResBlock2,
)
from programs.applio_code.rvc.lib.algorithm.commons import init_weights


class Generator(torch.nn.Module):
    """Generator for synthesizing audio. Optimized for performance and quality.

    Args:
        initial_channel (int): Number of channels in the initial convolutional layer.
        resblock (str): Type of residual block to use (1 or 2).
        resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
        resblock_dilation_sizes (list): Dilation rates of the residual blocks.
        upsample_rates (list): Upsampling rates.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = torch.nn.Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = ResBlock1 if resblock == "1" else ResBlock2

        self.ups_and_resblocks = torch.nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups_and_resblocks.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.ups_and_resblocks.append(resblock(ch, k, d))

        self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups_and_resblocks.apply(init_weights)

        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        resblock_idx = 0
        for _ in range(self.num_upsamples):
            x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            x = self.ups_and_resblocks[resblock_idx](x)
            resblock_idx += 1
            xs = 0
            for _ in range(self.num_kernels):
                xs += self.ups_and_resblocks[resblock_idx](x)
                resblock_idx += 1
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for l in self.ups_and_resblocks:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(l)
        return self

    def remove_weight_norm(self):
        """Removes weight normalization from the upsampling and residual blocks."""
        for l in self.ups_and_resblocks:
            remove_weight_norm(l)


class SineGen(torch.nn.Module):
    """Sine wave generator.

    Args:
        samp_rate (int): Sampling rate in Hz.
        harmonic_num (int, optional): Number of harmonic overtones. Defaults to 0.
        sine_amp (float, optional): Amplitude of sine waveform. Defaults to 0.1.
        noise_std (float, optional): Standard deviation of Gaussian noise. Defaults to 0.003.
        voiced_threshold (float, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0.
        flag_for_pulse (bool, optional): Whether this SineGen is used inside PulseGen. Defaults to False.
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sample_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Converts F0 to a voiced/unvoiced signal.

        Args:
            f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).
        """
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0: torch.Tensor, upp: int):
        """Generates sine waves.

        Args:
            f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).
            upp (int): Upsampling factor.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            f0_buf[:, :, 0] = f0[:, :, 0]
            f0_buf[:, :, 1:] = (
                f0_buf[:, :, 0:1]
                * torch.arange(2, self.harmonic_num + 2, device=f0.device)[
                    None, None, :
                ]
            )
            rad_values = (f0_buf / float(self.sample_rate)) % 1
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = torch.nn.functional.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=float(upp),
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = torch.nn.functional.interpolate(
                rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(2, 1)
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = torch.nn.functional.interpolate(
                uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
            return sine_waves, uv, noise
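SineGen turns a frame-rate F0 contour into a sample-rate excitation signal; a small sketch (the sample rate and upsampling factor here are illustrative, not prescribed by this file):

import torch
from programs.applio_code.rvc.lib.algorithm.generators import SineGen

sine_gen = SineGen(samp_rate=40000, harmonic_num=0)
f0 = torch.full((1, 50), 220.0)  # 50 frames of a 220 Hz contour, shape (batch, frames)
sine_waves, uv, noise = sine_gen(f0, upp=400)  # 400x upsampling: 50 frames -> 20000 samples
print(sine_waves.shape, uv.shape)  # torch.Size([1, 20000, 1]) for both
# uv is 1 where f0 > voiced_threshold; unvoiced stretches carry only noise.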
programs/applio_code/rvc/lib/algorithm/modules.py
ADDED
@@ -0,0 +1,130 @@
import torch
from programs.applio_code.rvc.lib.algorithm.commons import (
    fused_add_tanh_sigmoid_multiply_no_jit,
    fused_add_tanh_sigmoid_multiply,
)


class WaveNet(torch.nn.Module):
    """WaveNet residual blocks as used in WaveGlow.

    Args:
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super(WaveNet, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = torch.nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
                cond_layer, name="weight"
            )

        dilations = [dilation_rate**i for i in range(n_layers)]
        paddings = [(kernel_size * d - d) // 2 for d in dilations]

        for i in range(n_layers):
            in_layer = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilations[i],
                padding=paddings[i],
            )
            in_layer = torch.nn.utils.parametrizations.weight_norm(
                in_layer, name="weight"
            )
            self.in_layers.append(in_layer)

            res_skip_channels = (
                hidden_channels if i == n_layers - 1 else 2 * hidden_channels
            )

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.parametrizations.weight_norm(
                res_skip_layer, name="weight"
            )
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, hidden_channels, time_steps).
            x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
            g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
                Defaults to None.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        # Zluda
        is_zluda = x.device.type == "cuda" and torch.cuda.get_device_name().endswith(
            "[ZLUDA]"
        )

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            # Preventing HIP crash by not using jit-decorated function
            if is_zluda:
                acts = fused_add_tanh_sigmoid_multiply_no_jit(
                    x_in, g_l, n_channels_tensor
                )
            else:
                acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)

            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Remove weight normalization from the module."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
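The gated-convolution stack can be exercised directly; a sketch using the dimensions of the PosteriorEncoder call site (kernel 5, dilation 1, 16 layers come from that call, the rest is illustrative):

import torch
from programs.applio_code.rvc.lib.algorithm.modules import WaveNet

wn = WaveNet(hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16, gin_channels=256)
x = torch.randn(1, 192, 120)   # (batch, hidden_channels, frames)
x_mask = torch.ones(1, 1, 120)
g = torch.randn(1, 256, 1)     # global conditioning, broadcast over time
out = wn(x, x_mask, g=g)
print(out.shape)               # torch.Size([1, 192, 120]): per-layer skip outputs summed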
programs/applio_code/rvc/lib/algorithm/normalization.py
ADDED
@@ -0,0 +1,31 @@
import torch


class LayerNorm(torch.nn.Module):
    """Layer normalization module.

    Args:
        channels (int): Number of channels.
        eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(channels))
        self.beta = torch.nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
        """
        # Transpose to (batch_size, time_steps, channels) for layer_norm
        x = x.transpose(1, -1)
        x = torch.nn.functional.layer_norm(
            x, (x.size(-1),), self.gamma, self.beta, self.eps
        )
        # Transpose back to (batch_size, channels, time_steps)
        return x.transpose(1, -1)
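Unlike torch.nn.LayerNorm applied to channel-last tensors, this wrapper accepts the channel-first layout used throughout these modules; a sketch (shapes are illustrative):

import torch
from programs.applio_code.rvc.lib.algorithm.normalization import LayerNorm

ln = LayerNorm(channels=192)
x = torch.randn(4, 192, 50)  # (batch, channels, time)
y = ln(x)
# Statistics are taken over the channel axis at every time step,
# so each position ends up ~zero-mean / unit-std across channels.
print(y.shape, y.mean(dim=1).abs().max())  # torch.Size([4, 192, 50]), ~0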
programs/applio_code/rvc/lib/algorithm/nsf.py
ADDED
@@ -0,0 +1,200 @@
import math
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
from typing import Optional

from programs.applio_code.rvc.lib.algorithm.generators import SineGen
from programs.applio_code.rvc.lib.algorithm.residuals import (
    LRELU_SLOPE,
    ResBlock1,
    ResBlock2,
)
from programs.applio_code.rvc.lib.algorithm.commons import init_weights


class SourceModuleHnNSF(torch.nn.Module):
    """
    Source Module for harmonic-plus-noise excitation.

    Args:
        sample_rate (int): Sampling rate in Hz.
        harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
        sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
        add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
        voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.
        is_half (bool, optional): Whether to use half precision. Defaults to True.
    """

    def __init__(
        self,
        sample_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        self.l_sin_gen = SineGen(
            sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x: torch.Tensor, upsample_factor: int = 1):
        sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None


class GeneratorNSF(torch.nn.Module):
    """
    Generator for synthesizing audio using the NSF (Neural Source Filter) approach.

    Args:
        initial_channel (int): Number of channels in the initial convolutional layer.
        resblock (str): Type of residual block to use (1 or 2).
        resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
        resblock_dilation_sizes (list): Dilation rates of the residual blocks.
        upsample_rates (list): Upsampling rates.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
        gin_channels (int): Number of channels for the global conditioning input.
        sr (int): Sampling rate.
        is_half (bool, optional): Whether to use half precision. Defaults to False.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sample_rate=sr, harmonic_num=0, is_half=is_half
        )

        self.conv_pre = torch.nn.Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = ResBlock1 if resblock == "1" else ResBlock2

        self.ups = torch.nn.ModuleList()
        self.noise_convs = torch.nn.ModuleList()

        channels = [
            upsample_initial_channel // (2 ** (i + 1))
            for i in range(len(upsample_rates))
        ]
        stride_f0s = [
            math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
            for i in range(len(upsample_rates))
        ]

        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        channels[i],
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

            self.noise_convs.append(
                torch.nn.Conv1d(
                    1,
                    channels[i],
                    kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1),
                    stride=stride_f0s[i],
                    padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0),
                )
            )

        self.resblocks = torch.nn.ModuleList(
            [
                resblock_cls(channels[i], k, d)
                for i in range(len(self.ups))
                for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ]
        )

        self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = math.prod(upsample_rates)
        self.lrelu_slope = LRELU_SLOPE

    def forward(self, x, f0, g: Optional[torch.Tensor] = None):
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)

        if g is not None:
            x = x + self.cond(g)

        for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
            x = ups(x)
            x = x + noise_convs(har_source)

            xs = sum(
                [
                    resblock(x)
                    for j, resblock in enumerate(self.resblocks)
                    if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
                ]
            )
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = torch.tanh(self.conv_post(x))
        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()

    def __prepare_scriptable__(self):
        for l in self.ups:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    remove_weight_norm(l)
        for l in self.resblocks:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    remove_weight_norm(l)
        return self
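End to end, GeneratorNSF maps frame-rate latents plus an F0 contour to a waveform; a sketch using a common 40 kHz configuration (these hyperparameters are assumptions borrowed from typical RVC configs, not values fixed by this file):

import torch
from programs.applio_code.rvc.lib.algorithm.nsf import GeneratorNSF

gen = GeneratorNSF(
    initial_channel=192,
    resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 10, 2, 2],      # prod = 400 samples per frame
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    gin_channels=256,
    sr=40000,
)
z = torch.randn(1, 192, 50)      # latent frames
f0 = torch.full((1, 50), 220.0)  # one F0 value per frame
g = torch.randn(1, 256, 1)       # speaker conditioning
audio = gen(z, f0, g=g)
print(audio.shape)               # torch.Size([1, 1, 20000]) = 50 frames x 400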
programs/applio_code/rvc/lib/algorithm/residuals.py
ADDED
@@ -0,0 +1,309 @@
from typing import Optional
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm

from programs.applio_code.rvc.lib.algorithm.modules import WaveNet
from programs.applio_code.rvc.lib.algorithm.commons import get_padding, init_weights

LRELU_SLOPE = 0.1


# Helper functions
def create_conv1d_layer(channels, kernel_size, dilation):
    return weight_norm(
        torch.nn.Conv1d(
            channels,
            channels,
            kernel_size,
            1,
            dilation=dilation,
            padding=get_padding(kernel_size, dilation),
        )
    )


def apply_mask(tensor, mask):
    return tensor * mask if mask is not None else tensor


class ResBlockBase(torch.nn.Module):
    def __init__(self, channels, kernel_size, dilations):
        super(ResBlockBase, self).__init__()
        self.convs1 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
        )
        self.convs1.apply(init_weights)

        self.convs2 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = c2(xt)
            x = xt + x
        return apply_mask(x, x_mask)

    def remove_weight_norm(self):
        for conv in self.convs1 + self.convs2:
            remove_weight_norm(conv)


class ResBlock1(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__(channels, kernel_size, dilation)


class ResBlock2(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__(channels, kernel_size, dilation)


class Log(torch.nn.Module):
    """Logarithm module for flow-based models.

    This module computes the logarithm of the input and its log determinant.
    During reverse, it computes the exponential of the input.
    """

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor.
            x_mask (torch.Tensor): Mask tensor.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(torch.nn.Module):
    """Flip module for flow-based models.

    This module flips the input along the channel dimension (dim 1),
    so that successive coupling layers transform alternating halves.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(torch.nn.Module):
    """Elementwise affine transformation module for flow-based models.

    This module performs an elementwise affine transformation on the input.

    Args:
        channels (int): Number of channels.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = torch.nn.Parameter(torch.zeros(channels, 1))
        self.logs = torch.nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor.
            x_mask (torch.Tensor): Mask tensor.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingBlock(torch.nn.Module):
    """Residual Coupling Block for normalizing flow.

    Args:
        channels (int): Number of channels in the input.
        hidden_channels (int): Number of hidden channels in the coupling layer.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the coupling layer.
        n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super(ResidualCouplingBlock, self).__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = torch.nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(Flip())

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: Optional[torch.Tensor] = None,
        reverse: bool = False,
    ):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow.forward(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Removes weight normalization from the coupling layers."""
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for i in range(self.n_flows):
            for hook in self.flows[i * 2]._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.flows[i * 2])

        return self


class ResidualCouplingLayer(torch.nn.Module):
    """Residual coupling layer for flow-based models.

    Args:
        channels (int): Number of channels.
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        self.post = torch.nn.Conv1d(
            hidden_channels, self.half_channels * (2 - mean_only), 1
        )
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
            x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
            g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
                Defaults to None.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x

    def remove_weight_norm(self):
        """Remove weight normalization from the module."""
        self.enc.remove_weight_norm()
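The coupling block is exactly invertible, which is what lets inference run the flow in reverse; a sketch verifying round-trip reconstruction (dimensions are illustrative; kernel 5, dilation 1, 3 layers mirror the flow construction in synthesizers.py):

import torch
from programs.applio_code.rvc.lib.algorithm.residuals import ResidualCouplingBlock

flow = ResidualCouplingBlock(
    channels=192, hidden_channels=192, kernel_size=5,
    dilation_rate=1, n_layers=3, gin_channels=256,
)
z = torch.randn(1, 192, 40)
x_mask = torch.ones(1, 1, 40)
g = torch.randn(1, 256, 1)
z_p = flow(z, x_mask, g=g)                    # analysis direction
z_rec = flow(z_p, x_mask, g=g, reverse=True)  # synthesis direction
print(torch.allclose(z, z_rec, atol=1e-4))    # True: with mean_only=True each layer only shifts half the channels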
programs/applio_code/rvc/lib/algorithm/synthesizers.py
ADDED
@@ -0,0 +1,243 @@
1 |
+
import torch
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
from programs.applio_code.rvc.lib.algorithm.nsf import GeneratorNSF
|
5 |
+
from programs.applio_code.rvc.lib.algorithm.generators import Generator
|
6 |
+
from programs.applio_code.rvc.lib.algorithm.commons import (
|
7 |
+
slice_segments,
|
8 |
+
rand_slice_segments,
|
9 |
+
)
|
10 |
+
from programs.applio_code.rvc.lib.algorithm.residuals import ResidualCouplingBlock
|
11 |
+
from programs.applio_code.rvc.lib.algorithm.encoders import (
|
12 |
+
TextEncoder,
|
13 |
+
PosteriorEncoder,
|
14 |
+
)
|
15 |
+
|
16 |
+
|
17 |
+
class Synthesizer(torch.nn.Module):
|
18 |
+
"""
|
19 |
+
Base Synthesizer model.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
spec_channels (int): Number of channels in the spectrogram.
|
23 |
+
segment_size (int): Size of the audio segment.
|
24 |
+
inter_channels (int): Number of channels in the intermediate layers.
|
25 |
+
hidden_channels (int): Number of channels in the hidden layers.
|
26 |
+
filter_channels (int): Number of channels in the filter layers.
|
27 |
+
n_heads (int): Number of attention heads.
|
28 |
+
n_layers (int): Number of layers in the encoder.
|
29 |
+
kernel_size (int): Size of the convolution kernel.
|
30 |
+
p_dropout (float): Dropout probability.
|
31 |
+
resblock (str): Type of residual block.
|
32 |
+
resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
|
33 |
+
resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
|
34 |
+
upsample_rates (list): Upsampling rates for the decoder.
|
35 |
+
upsample_initial_channel (int): Number of channels in the initial upsampling layer.
|
36 |
+
upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
|
37 |
+
spk_embed_dim (int): Dimension of the speaker embedding.
|
38 |
+
gin_channels (int): Number of channels in the global conditioning vector.
|
39 |
+
sr (int): Sampling rate of the audio.
|
40 |
+
use_f0 (bool): Whether to use F0 information.
|
41 |
+
text_enc_hidden_dim (int): Hidden dimension for the text encoder.
|
42 |
+
kwargs: Additional keyword arguments.
|
43 |
+
"""
|
44 |
+
|
45 |
+
def __init__(
|
46 |
+
self,
|
47 |
+
spec_channels,
|
48 |
+
segment_size,
|
49 |
+
inter_channels,
|
50 |
+
hidden_channels,
|
51 |
+
filter_channels,
|
52 |
+
n_heads,
|
53 |
+
n_layers,
|
54 |
+
kernel_size,
|
55 |
+
p_dropout,
|
56 |
+
resblock,
|
57 |
+
resblock_kernel_sizes,
|
58 |
+
resblock_dilation_sizes,
|
59 |
+
upsample_rates,
|
60 |
+
upsample_initial_channel,
|
61 |
+
upsample_kernel_sizes,
|
62 |
+
spk_embed_dim,
|
63 |
+
gin_channels,
|
64 |
+
sr,
|
65 |
+
use_f0,
|
66 |
+
text_enc_hidden_dim=768,
|
67 |
+
**kwargs
|
68 |
+
):
|
69 |
+
super(Synthesizer, self).__init__()
|
70 |
+
self.spec_channels = spec_channels
|
71 |
+
self.inter_channels = inter_channels
|
72 |
+
self.hidden_channels = hidden_channels
|
73 |
+
self.filter_channels = filter_channels
|
74 |
+
self.n_heads = n_heads
|
75 |
+
self.n_layers = n_layers
|
76 |
+
self.kernel_size = kernel_size
|
77 |
+
self.p_dropout = float(p_dropout)
|
78 |
+
self.resblock = resblock
|
79 |
+
self.resblock_kernel_sizes = resblock_kernel_sizes
|
80 |
+
self.resblock_dilation_sizes = resblock_dilation_sizes
|
81 |
+
self.upsample_rates = upsample_rates
|
82 |
+
self.upsample_initial_channel = upsample_initial_channel
|
83 |
+
self.upsample_kernel_sizes = upsample_kernel_sizes
|
84 |
+
self.segment_size = segment_size
|
85 |
+
self.gin_channels = gin_channels
|
86 |
+
self.spk_embed_dim = spk_embed_dim
|
87 |
+
self.use_f0 = use_f0
|
88 |
+
|
89 |
+
self.enc_p = TextEncoder(
|
90 |
+
inter_channels,
|
91 |
+
hidden_channels,
|
92 |
+
filter_channels,
|
93 |
+
n_heads,
|
94 |
+
n_layers,
|
95 |
+
kernel_size,
|
96 |
+
float(p_dropout),
|
97 |
+
text_enc_hidden_dim,
|
98 |
+
f0=use_f0,
|
99 |
+
)
|
100 |
+
|
101 |
+
if use_f0:
|
102 |
+
self.dec = GeneratorNSF(
|
103 |
+
inter_channels,
|
104 |
+
resblock,
|
105 |
+
resblock_kernel_sizes,
|
106 |
+
resblock_dilation_sizes,
|
107 |
+
upsample_rates,
|
108 |
+
upsample_initial_channel,
|
109 |
+
upsample_kernel_sizes,
|
110 |
+
gin_channels=gin_channels,
|
111 |
+
sr=sr,
|
112 |
+
is_half=kwargs["is_half"],
|
113 |
+
)
|
114 |
+
else:
|
115 |
+
self.dec = Generator(
|
116 |
+
inter_channels,
|
117 |
+
resblock,
|
118 |
+
resblock_kernel_sizes,
|
119 |
+
resblock_dilation_sizes,
|
120 |
+
upsample_rates,
|
121 |
+
upsample_initial_channel,
|
122 |
+
upsample_kernel_sizes,
|
123 |
+
gin_channels=gin_channels,
|
124 |
+
)
|
125 |
+
|
126 |
+
self.enc_q = PosteriorEncoder(
|
127 |
+
spec_channels,
|
128 |
+
inter_channels,
|
129 |
+
hidden_channels,
|
130 |
+
5,
|
131 |
+
1,
|
132 |
+
16,
|
133 |
+
gin_channels=gin_channels,
|
134 |
+
)
|
135 |
+
self.flow = ResidualCouplingBlock(
|
136 |
+
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
|
137 |
+
)
|
138 |
+
self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)
|
139 |
+
|
140 |
+
def remove_weight_norm(self):
|
141 |
+
"""Removes weight normalization from the model."""
|
142 |
+
self.dec.remove_weight_norm()
|
143 |
+
self.flow.remove_weight_norm()
|
144 |
+
self.enc_q.remove_weight_norm()
|
145 |
+
|
146 |
+
def __prepare_scriptable__(self):
|
147 |
+
for hook in self.dec._forward_pre_hooks.values():
|
148 |
+
if (
|
149 |
+
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
|
150 |
+
and hook.__class__.__name__ == "WeightNorm"
|
151 |
+
):
|
152 |
+
torch.nn.utils.remove_weight_norm(self.dec)
|
153 |
+
for hook in self.flow._forward_pre_hooks.values():
|
154 |
+
if (
|
155 |
+
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
|
156 |
+
and hook.__class__.__name__ == "WeightNorm"
|
157 |
+
):
|
                torch.nn.utils.remove_weight_norm(self.flow)
        if hasattr(self, "enc_q"):
            for hook in self.enc_q._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.enc_q)
        return self

    @torch.jit.ignore
    def forward(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        pitchf: Optional[torch.Tensor] = None,
        y: torch.Tensor = None,
        y_lengths: torch.Tensor = None,
        ds: Optional[torch.Tensor] = None,
    ):
        """
        Forward pass of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            pitchf (torch.Tensor, optional): Fine-grained pitch sequence.
            y (torch.Tensor, optional): Target spectrogram.
            y_lengths (torch.Tensor, optional): Lengths of the target spectrograms.
            ds (torch.Tensor, optional): Speaker embedding. Defaults to None.
        """
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        if y is not None:
            z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
            z_p = self.flow(z, y_mask, g=g)
            z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
            if self.use_f0:
                pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
                o = self.dec(z_slice, pitchf, g=g)
            else:
                o = self.dec(z_slice, g=g)
            return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
        else:
            return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)

    @torch.jit.export
    def infer(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        nsff0: Optional[torch.Tensor] = None,
        sid: torch.Tensor = None,
        rate: Optional[torch.Tensor] = None,
    ):
        """
        Inference of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
            sid (torch.Tensor): Speaker embedding.
            rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate is not None:
            assert isinstance(rate, torch.Tensor)
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p = z_p[:, :, head:]
            x_mask = x_mask[:, :, head:]
            if self.use_f0:
                nsff0 = nsff0[:, head:]
        if self.use_f0:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, nsff0, g=g)
        else:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
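Editorial note: the `rate` argument of `infer()` above implements partial re-synthesis by dropping the leading frames of the prior `z_p` (and of the mask and F0 contour) and decoding only the trailing fraction. A self-contained sketch of that trimming arithmetic, with made-up shapes (192 latent channels, 100 frames are illustrative, not values taken from the repository):

# Illustration of the rate-based head trimming in Synthesizer.infer(); shapes are made up.
import torch

z_p = torch.randn(1, 192, 100)              # (batch, channels, frames) latent prior
rate = torch.tensor(0.25)                   # keep only the trailing 25% of frames
head = int(z_p.shape[2] * (1.0 - rate.item()))
z_p_tail = z_p[:, :, head:]                 # the decoder only sees these frames
print(head, z_p_tail.shape)                 # 75 torch.Size([1, 192, 25])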
programs/applio_code/rvc/lib/predictors/F0Extractor.py
ADDED
@@ -0,0 +1,107 @@
import dataclasses
import pathlib
import libf0
import librosa
import numpy as np
import resampy
import torch
import torchcrepe
import torchfcpe
import os

# from tools.anyf0.rmvpe import RMVPE
from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor
from programs.applio_code.rvc.configs.config import Config

config = Config()


@dataclasses.dataclass
class F0Extractor:
    wav_path: pathlib.Path
    sample_rate: int = 44100
    hop_length: int = 512
    f0_min: int = 50
    f0_max: int = 1600
    method: str = "rmvpe"
    x: np.ndarray = dataclasses.field(init=False)

    def __post_init__(self):
        self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)

    @property
    def hop_size(self) -> float:
        return self.hop_length / self.sample_rate

    @property
    def wav16k(self) -> np.ndarray:
        return resampy.resample(self.x, self.sample_rate, 16000)

    def extract_f0(self) -> np.ndarray:
        f0 = None
        method = self.method
        # Fall back to CPU for ZLUDA as these methods use cuFFT
        device = (
            "cpu"
            if "cuda" in config.device
            and torch.cuda.get_device_name().endswith("[ZLUDA]")
            else config.device
        )

        if method == "crepe":
            wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device)
            f0 = torchcrepe.predict(
                wav16k_torch,
                sample_rate=16000,
                hop_length=160,
                batch_size=512,
                fmin=self.f0_min,
                fmax=self.f0_max,
                device=device,
            )
            f0 = f0[0].cpu().numpy()
        elif method == "fcpe":
            audio = librosa.to_mono(self.x)
            audio_length = len(audio)
            f0_target_length = (audio_length // self.hop_length) + 1
            audio = (
                torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device)
            )
            model = torchfcpe.spawn_bundled_infer_model(device=device)

            f0 = model.infer(
                audio,
                sr=self.sample_rate,
                decoder_mode="local_argmax",
                threshold=0.006,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                interp_uv=False,
                output_interp_target_length=f0_target_length,
            )
            f0 = f0.squeeze().cpu().numpy()
        elif method == "rmvpe":
            is_half = False if device == "cpu" else config.is_half
            model_rmvpe = RMVPE0Predictor(
                os.path.join(
                    "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt"
                ),
                is_half=is_half,
                device=device,
                # hop_length=80
            )
            f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)

        else:
            raise ValueError(f"Unknown method: {self.method}")
        return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))

    def plot_f0(self, f0):
        from matplotlib import pyplot as plt

        plt.figure(figsize=(10, 4))
        plt.plot(f0)
        plt.title(self.method)
        plt.xlabel("Time (frames)")
        plt.ylabel("F0 (cents)")
        plt.show()
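Editorial note: F0Extractor is a dataclass, so constructing it loads the audio immediately in __post_init__, and extract_f0() returns the contour in cents relative to MIDI note 0 (librosa.midi_to_hz(0), about 8.18 Hz). A minimal usage sketch; "song.wav" is a placeholder path, not a file in the repository:

# Hypothetical usage of F0Extractor; the input file path is a placeholder.
from programs.applio_code.rvc.lib.predictors.F0Extractor import F0Extractor

extractor = F0Extractor("song.wav", sample_rate=44100, method="rmvpe")
f0_cents = extractor.extract_f0()   # np.ndarray of F0 in cents (MIDI note 0 reference)
extractor.plot_f0(f0_cents)         # optional matplotlib plot of the contour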
programs/applio_code/rvc/lib/predictors/FCPE.py
ADDED
@@ -0,0 +1,920 @@
from typing import Union

import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import weight_norm
from torchaudio.transforms import Resample
import os
import librosa
import soundfile as sf
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
import math
from functools import partial

from einops import rearrange, repeat
from local_attention import LocalAttention
from torch import nn

os.environ["LRU_CACHE_CAPACITY"] = "3"


def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """Loads wav file to torch tensor."""
    try:
        data, sample_rate = sf.read(full_path, always_2d=True)
    except Exception as error:
        print(f"An error occurred loading {full_path}: {error}")
        if return_empty_on_exception:
            return [], sample_rate or target_sr or 48000
        else:
            raise

    data = data[:, 0] if len(data.shape) > 1 else data
    assert len(data) > 2

    # Normalize data
    max_mag = (
        -np.iinfo(data.dtype).min
        if np.issubdtype(data.dtype, np.integer)
        else max(np.amax(data), -np.amin(data))
    )
    max_mag = (
        (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
    )
    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Handle exceptions and resample
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sample_rate or target_sr or 48000
    if target_sr is not None and sample_rate != target_sr:
        data = torch.from_numpy(
            librosa.core.resample(
                data.numpy(), orig_sr=sample_rate, target_sr=target_sr
            )
        )
        sample_rate = target_sr

    return data, sample_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


class STFT:
    def __init__(
        self,
        sr=22050,
        n_mels=80,
        n_fft=1024,
        win_size=1024,
        hop_length=256,
        fmin=20,
        fmax=11025,
        clip_val=1e-5,
    ):
        self.target_sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
        sample_rate = self.target_sr
        n_mels = self.n_mels
        n_fft = self.n_fft
        win_size = self.win_size
        hop_length = self.hop_length
        fmin = self.fmin
        fmax = self.fmax
        clip_val = self.clip_val

        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(n_fft * factor))
        win_size_new = int(np.round(win_size * factor))
        hop_length_new = int(np.round(hop_length * speed))

        # Optimize mel_basis and hann_window caching
        mel_basis = self.mel_basis if not train else {}
        hann_window = self.hann_window if not train else {}

        mel_basis_key = str(fmax) + "_" + str(y.device)
        if mel_basis_key not in mel_basis:
            mel = librosa_mel_fn(
                sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
            )
            mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)

        keyshift_key = str(keyshift) + "_" + str(y.device)
        if keyshift_key not in hann_window:
            hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)

        # Padding and STFT
        pad_left = (win_size_new - hop_length_new) // 2
        pad_right = max(
            (win_size_new - hop_length_new + 1) // 2,
            win_size_new - y.size(-1) - pad_left,
        )
        mode = "reflect" if pad_right < y.size(-1) else "constant"
        y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
        y = y.squeeze(1)

        spec = torch.stft(
            y,
            n_fft_new,
            hop_length=hop_length_new,
            win_length=win_size_new,
            window=hann_window[keyshift_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))

        # Handle keyshift and mel conversion
        if keyshift != 0:
            size = n_fft // 2 + 1
            resize = spec.size(1)
            spec = (
                F.pad(spec, (0, 0, 0, size - resize))
                if resize < size
                else spec[:, :size, :]
            )
            spec = spec * win_size / win_size_new
        spec = torch.matmul(mel_basis[mel_basis_key], spec)
        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
        return spec

    def __call__(self, audiopath):
        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
        return spect


stft = STFT()


def softmax_kernel(
    data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
):
    b, h, *_ = data.shape

    # Normalize data
    data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0

    # Project data
    ratio = projection_matrix.shape[0] ** -0.5
    projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
    projection = projection.type_as(data)
    data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)

    # Calculate diagonal data
    diag_data = data**2
    diag_data = torch.sum(diag_data, dim=-1)
    diag_data = (diag_data / 2.0) * (data_normalizer**2)
    diag_data = diag_data.unsqueeze(dim=-1)

    # Apply softmax
    if is_query:
        data_dash = ratio * (
            torch.exp(
                data_dash
                - diag_data
                - torch.max(data_dash, dim=-1, keepdim=True).values
            )
            + eps
        )
    else:
        data_dash = ratio * (torch.exp(data_dash - diag_data + eps))

    return data_dash.type_as(data)


def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    unstructured_block = torch.randn((cols, cols), device=device)
    q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
    q, r = map(lambda t: t.to(device), (q, r))

    if qr_uniform_q:
        d = torch.diag(r, 0)
        q *= d.sign()
    return q.t()


def exists(val):
    return val is not None


def empty(tensor):
    return tensor.numel() == 0


def default(val, d):
    return val if exists(val) else d


def cast_tuple(val):
    return (val,) if not isinstance(val, tuple) else val


class PCmer(nn.Module):
    def __init__(
        self,
        num_layers,
        num_heads,
        dim_model,
        dim_keys,
        dim_values,
        residual_dropout,
        attention_dropout,
    ):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.dim_values = dim_values
        self.dim_keys = dim_keys
        self.residual_dropout = residual_dropout
        self.attention_dropout = attention_dropout

        self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])

    def forward(self, phone, mask=None):
        for layer in self._layers:
            phone = layer(phone, mask)
        return phone


class _EncoderLayer(nn.Module):
    def __init__(self, parent: PCmer):
        super().__init__()
        self.conformer = ConformerConvModule(parent.dim_model)
        self.norm = nn.LayerNorm(parent.dim_model)
        self.dropout = nn.Dropout(parent.residual_dropout)
        self.attn = SelfAttention(
            dim=parent.dim_model, heads=parent.num_heads, causal=False
        )

    def forward(self, phone, mask=None):
        phone = phone + (self.attn(self.norm(phone), mask=mask))
        phone = phone + (self.conformer(phone))
        return phone


def calc_same_padding(kernel_size):
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)


class Swish(nn.Module):
    def forward(self, x):
        return x * x.sigmoid()


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, "dims must be a tuple of two dimensions"
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class GLU(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        out, gate = x.chunk(2, dim=self.dim)
        return out * gate.sigmoid()


class DepthWiseConv1d(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        x = F.pad(x, self.padding)
        return self.conv(x)


class ConformerConvModule(nn.Module):
    def __init__(
        self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
    ):
        super().__init__()

        inner_dim = dim * expansion_factor
        padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)

        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(
                inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
            ),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


def linear_attention(q, k, v):
    if v is None:
        out = torch.einsum("...ed,...nd->...ne", k, q)
        return out
    else:
        k_cumsum = k.sum(dim=-2)
        D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
        context = torch.einsum("...nd,...ne->...de", k, v)
        out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
        return out


def gaussian_orthogonal_random_matrix(
    nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
):
    nb_full_blocks = int(nb_rows / nb_columns)
    block_list = []

    for _ in range(nb_full_blocks):
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )
        block_list.append(q)

    remaining_rows = nb_rows - nb_full_blocks * nb_columns
    if remaining_rows > 0:
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )
        block_list.append(q[:remaining_rows])

    final_matrix = torch.cat(block_list)

    if scaling == 0:
        multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1:
        multiplier = math.sqrt((float(nb_columns))) * torch.ones(
            (nb_rows,), device=device
        )
    else:
        raise ValueError(f"Invalid scaling {scaling}")

    return torch.diag(multiplier) @ final_matrix


class FastAttention(nn.Module):
    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        device = q.device

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)
        else:
            create_kernel = partial(
                softmax_kernel, projection_matrix=self.projection_matrix, device=device
            )
            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        attn_fn = linear_attention if not self.causal else self.causal_linear_fn

        if v is None:
            out = attn_fn(q, k, None)
            return out
        else:
            out = attn_fn(q, k, v)
            return out


class SelfAttention(nn.Module):
    def __init__(
        self,
        dim,
        causal=False,
        heads=8,
        dim_head=64,
        local_heads=0,
        local_window_size=256,
        nb_features=None,
        feature_redraw_interval=1000,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        dropout=0.0,
        no_projection=False,
    ):
        super().__init__()
        assert dim % heads == 0, "dimension must be divisible by number of heads"
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(
            dim_head,
            nb_features,
            causal=causal,
            generalized_attention=generalized_attention,
            kernel_fn=kernel_fn,
            qr_uniform_q=qr_uniform_q,
            no_projection=no_projection,
        )

        self.heads = heads
        self.global_heads = heads - local_heads
        self.local_attn = (
            LocalAttention(
                window_size=local_window_size,
                causal=causal,
                autopad=True,
                dropout=dropout,
                look_forward=int(not causal),
                rel_pos_emb_config=(dim_head, local_heads),
            )
            if local_heads > 0
            else None
        )

        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        self.fast_attention.redraw_projection_matrix()

    def forward(
        self,
        x,
        context=None,
        mask=None,
        context_mask=None,
        name=None,
        inference=False,
        **kwargs,
    ):
        _, _, _, h, gh = *x.shape, self.heads, self.global_heads

        cross_attend = exists(context)
        context = default(context, x)
        context_mask = default(context_mask, mask) if not cross_attend else context_mask
        q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)

        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
        (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))

        attn_outs = []
        if not empty(q):
            if exists(context_mask):
                global_mask = context_mask[:, None, :, None]
                v.masked_fill_(~global_mask, 0.0)
            if cross_attend:
                pass  # TODO: Implement cross-attention
            else:
                out = self.fast_attention(q, k, v)
            attn_outs.append(out)

        if not empty(lq):
            assert (
                not cross_attend
            ), "local attention is not compatible with cross attention"
            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        out = torch.cat(attn_outs, dim=1)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        return self.dropout(out)


def l2_regularization(model, l2_alpha):
    l2_loss = []
    for module in model.modules():
        if type(module) is nn.Conv2d:
            l2_loss.append((module.weight**2).sum() / 2.0)
    return l2_alpha * sum(l2_loss)


class FCPE(nn.Module):
    def __init__(
        self,
        input_channel=128,
        out_dims=360,
        n_layers=12,
        n_chans=512,
        use_siren=False,
        use_full=False,
        loss_mse_scale=10,
        loss_l2_regularization=False,
        loss_l2_regularization_scale=1,
        loss_grad1_mse=False,
        loss_grad1_mse_scale=1,
        f0_max=1975.5,
        f0_min=32.70,
        confidence=False,
        threshold=0.05,
        use_input_conv=True,
    ):
        super().__init__()
        if use_siren is True:
            raise ValueError("Siren is not supported yet.")
        if use_full is True:
            raise ValueError("Full model is not supported yet.")

        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
        self.loss_l2_regularization = (
            loss_l2_regularization if (loss_l2_regularization is not None) else False
        )
        self.loss_l2_regularization_scale = (
            loss_l2_regularization_scale
            if (loss_l2_regularization_scale is not None)
            else 1
        )
        self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
        self.loss_grad1_mse_scale = (
            loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
        )
        self.f0_max = f0_max if (f0_max is not None) else 1975.5
        self.f0_min = f0_min if (f0_min is not None) else 32.70
        self.confidence = confidence if (confidence is not None) else False
        self.threshold = threshold if (threshold is not None) else 0.05
        self.use_input_conv = use_input_conv if (use_input_conv is not None) else True

        self.cent_table_b = torch.Tensor(
            np.linspace(
                self.f0_to_cent(torch.Tensor([f0_min]))[0],
                self.f0_to_cent(torch.Tensor([f0_max]))[0],
                out_dims,
            )
        )
        self.register_buffer("cent_table", self.cent_table_b)

        # conv in stack
        _leaky = nn.LeakyReLU()
        self.stack = nn.Sequential(
            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
            nn.GroupNorm(4, n_chans),
            _leaky,
            nn.Conv1d(n_chans, n_chans, 3, 1, 1),
        )

        # transformer
        self.decoder = PCmer(
            num_layers=n_layers,
            num_heads=8,
            dim_model=n_chans,
            dim_keys=n_chans,
            dim_values=n_chans,
            residual_dropout=0.1,
            attention_dropout=0.1,
        )
        self.norm = nn.LayerNorm(n_chans)

        # out
        self.n_out = out_dims
        self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))

    def forward(
        self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
    ):
        if cdecoder == "argmax":
            self.cdecoder = self.cents_decoder
        elif cdecoder == "local_argmax":
            self.cdecoder = self.cents_local_decoder

        x = (
            self.stack(mel.transpose(1, 2)).transpose(1, 2)
            if self.use_input_conv
            else mel
        )
        x = self.decoder(x)
        x = self.norm(x)
        x = self.dense_out(x)
        x = torch.sigmoid(x)

        if not infer:
            gt_cent_f0 = self.f0_to_cent(gt_f0)
            gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
            loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
            if self.loss_l2_regularization:
                loss_all = loss_all + l2_regularization(
                    model=self, l2_alpha=self.loss_l2_regularization_scale
                )
            x = loss_all
        if infer:
            x = self.cdecoder(x)
            x = self.cent_to_f0(x)
            x = (1 + x / 700).log() if not return_hz_f0 else x

        return x

    def cents_decoder(self, y, mask=True):
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
            y, dim=-1, keepdim=True
        )
        if mask:
            confident = torch.max(y, dim=-1, keepdim=True)[0]
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cents_local_decoder(self, y, mask=True):
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        confident, max_index = torch.max(y, dim=-1, keepdim=True)
        local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
        local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
        ci_l = torch.gather(ci, -1, local_argmax_index)
        y_l = torch.gather(y, -1, local_argmax_index)
        rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
            y_l, dim=-1, keepdim=True
        )
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cent_to_f0(self, cent):
        return 10.0 * 2 ** (cent / 1200.0)

    def f0_to_cent(self, f0):
        return 1200.0 * torch.log2(f0 / 10.0)

    def gaussian_blurred_cent(self, cents):
        mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
        B, N, _ = cents.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()


class FCPEInfer:
    def __init__(self, model_path, device=None, dtype=torch.float32):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        ckpt = torch.load(model_path, map_location=torch.device(self.device))
        self.args = DotDict(ckpt["config"])
        self.dtype = dtype
        model = FCPE(
            input_channel=self.args.model.input_channel,
            out_dims=self.args.model.out_dims,
            n_layers=self.args.model.n_layers,
            n_chans=self.args.model.n_chans,
            use_siren=self.args.model.use_siren,
            use_full=self.args.model.use_full,
            loss_mse_scale=self.args.loss.loss_mse_scale,
            loss_l2_regularization=self.args.loss.loss_l2_regularization,
            loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
            loss_grad1_mse=self.args.loss.loss_grad1_mse,
            loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
            f0_max=self.args.model.f0_max,
            f0_min=self.args.model.f0_min,
            confidence=self.args.model.confidence,
        )
        model.to(self.device).to(self.dtype)
        model.load_state_dict(ckpt["model"])
        model.eval()
        self.model = model
        self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05):
        self.model.threshold = threshold
        audio = audio[None, :]
        mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
        f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
        return f0


class Wav2Mel:
    def __init__(self, args, device=None, dtype=torch.float32):
        self.sample_rate = args.mel.sampling_rate
        self.hop_size = args.mel.hop_size
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype
        self.stft = STFT(
            args.mel.sampling_rate,
            args.mel.num_mels,
            args.mel.n_fft,
            args.mel.win_size,
            args.mel.hop_size,
            args.mel.fmin,
            args.mel.fmax,
        )
        self.resample_kernel = {}

    def extract_nvstft(self, audio, keyshift=0, train=False):
        mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
        return mel

    def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
        audio = audio.to(self.dtype).to(self.device)
        if sample_rate == self.sample_rate:
            audio_res = audio
        else:
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(
                    sample_rate, self.sample_rate, lowpass_filter_width=128
                )
            self.resample_kernel[key_str] = (
                self.resample_kernel[key_str].to(self.dtype).to(self.device)
            )
            audio_res = self.resample_kernel[key_str](audio)

        mel = self.extract_nvstft(
            audio_res, keyshift=keyshift, train=train
        )  # B, n_frames, bins
        n_frames = int(audio.shape[1] // self.hop_size) + 1
        mel = (
            torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
        )
        mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
        return mel

    def __call__(self, audio, sample_rate, keyshift=0, train=False):
        return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)


class DotDict(dict):
    def __getattr__(*args):
        val = dict.get(*args)
        return DotDict(val) if type(val) is dict else val

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


class F0Predictor(object):
    def compute_f0(self, wav, p_len):
        pass

    def compute_f0_uv(self, wav, p_len):
        pass


class FCPEF0Predictor(F0Predictor):
    def __init__(
        self,
        model_path,
        hop_length=512,
        f0_min=50,
        f0_max=1100,
        dtype=torch.float32,
        device=None,
        sample_rate=44100,
        threshold=0.05,
    ):
        self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.threshold = threshold
        self.sample_rate = sample_rate
        self.dtype = dtype
        self.name = "fcpe"

    def repeat_expand(
        self,
        content: Union[torch.Tensor, np.ndarray],
        target_len: int,
        mode: str = "nearest",
    ):
        ndim = content.ndim
        content = (
            content[None, None]
            if ndim == 1
            else content[None] if ndim == 2 else content
        )
        assert content.ndim == 3
        is_np = isinstance(content, np.ndarray)
        content = torch.from_numpy(content) if is_np else content
        results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
        results = results.numpy() if is_np else results
        return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results

    def post_process(self, x, sample_rate, f0, pad_to):
        f0 = (
            torch.from_numpy(f0).float().to(x.device)
            if isinstance(f0, np.ndarray)
            else f0
        )
        f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0

        vuv_vector = torch.zeros_like(f0)
        vuv_vector[f0 > 0.0] = 1.0
        vuv_vector[f0 <= 0.0] = 0.0

        nzindex = torch.nonzero(f0).squeeze()
        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
        time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
        time_frame = np.arange(pad_to) * self.hop_length / sample_rate

        vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]

        if f0.shape[0] <= 0:
            return np.zeros(pad_to), vuv_vector.cpu().numpy()
        if f0.shape[0] == 1:
            return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()

        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
        return f0, vuv_vector.cpu().numpy()

    def compute_f0(self, wav, p_len=None):
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        p_len = x.shape[0] // self.hop_length if p_len is None else p_len
        f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
        if torch.all(f0 == 0):
            return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
                f0.cpu().numpy() if p_len is None else np.zeros(p_len)
            )
        return self.post_process(x, self.sample_rate, f0, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        p_len = x.shape[0] // self.hop_length if p_len is None else p_len
        f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
        if torch.all(f0 == 0):
            return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
                f0.cpu().numpy() if p_len is None else np.zeros(p_len)
            )
        return self.post_process(x, self.sample_rate, f0, p_len)
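Editorial note: cent_to_f0 above maps the model's cent scale to Hz as f0 = 10 · 2^(cent/1200), so 0 cents corresponds to 10 Hz and the 360 output bins span the default f0_min=32.70 Hz to f0_max=1975.5 Hz. A minimal sketch of driving the high-level FCPEF0Predictor; "fcpe.pt" is a placeholder checkpoint path, and the 16 kHz mono input with hop_length=160 is an assumption for illustration, not a value dictated by this file:

# Hypothetical usage of FCPEF0Predictor; checkpoint path and audio are placeholders.
import numpy as np
from programs.applio_code.rvc.lib.predictors.FCPE import FCPEF0Predictor

predictor = FCPEF0Predictor(
    "fcpe.pt", hop_length=160, f0_min=50, f0_max=1100, sample_rate=16000
)
wav = np.random.randn(16000).astype(np.float32)  # one second of dummy mono audio
f0, uv = predictor.compute_f0_uv(wav)            # per-frame F0 in Hz, voiced/unvoiced flags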
programs/applio_code/rvc/lib/predictors/RMVPE.py
ADDED
@@ -0,0 +1,569 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from librosa.filters import mel
from typing import List

# Constants for readability
N_MELS = 128
N_CLASS = 360


# Define a helper function for creating convolutional blocks
class ConvBlockRes(nn.Module):
    """
    A convolutional block with residual connection.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super(ConvBlockRes, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding=(1, 1),
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
            self.is_shortcut = True
        else:
            self.is_shortcut = False

    def forward(self, x):
        if self.is_shortcut:
            return self.conv(x) + self.shortcut(x)
        else:
            return self.conv(x) + x


# Define a class for residual encoder blocks
class ResEncoderBlock(nn.Module):
    """
    A residual encoder block.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (tuple): Size of the average pooling kernel.
        n_blocks (int): Number of convolutional blocks in the block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super(ResEncoderBlock, self).__init__()
        self.n_blocks = n_blocks
        self.conv = nn.ModuleList()
        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
        for _ in range(n_blocks - 1):
            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for i in range(self.n_blocks):
            x = self.conv[i](x)
        if self.kernel_size is not None:
            return x, self.pool(x)
        else:
            return x


# Define a class for the encoder
class Encoder(nn.Module):
    """
    The encoder part of the DeepUnet.

    Args:
        in_channels (int): Number of input channels.
        in_size (int): Size of the input tensor.
        n_encoders (int): Number of encoder blocks.
        kernel_size (tuple): Size of the average pooling kernel.
        n_blocks (int): Number of convolutional blocks in each encoder block.
        out_channels (int): Number of output channels for the first encoder block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super(Encoder, self).__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []
        for i in range(self.n_encoders):
            self.layers.append(
                ResEncoderBlock(
                    in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
                )
            )
            self.latent_channels.append([out_channels, in_size])
            in_channels = out_channels
            out_channels *= 2
            in_size //= 2
        self.out_size = in_size
        self.out_channel = out_channels

    def forward(self, x: torch.Tensor):
        concat_tensors: List[torch.Tensor] = []
        x = self.bn(x)
        for i in range(self.n_encoders):
            t, x = self.layers[i](x)
            concat_tensors.append(t)
        return x, concat_tensors


# Define a class for the intermediate layer
class Intermediate(nn.Module):
    """
    The intermediate layer of the DeepUnet.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_inters (int): Number of convolutional blocks in the intermediate layer.
        n_blocks (int): Number of convolutional blocks in each intermediate block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super(Intermediate, self).__init__()
        self.n_inters = n_inters
        self.layers = nn.ModuleList()
        self.layers.append(
            ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
        )
        for _ in range(self.n_inters - 1):
            self.layers.append(
                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
            )

    def forward(self, x):
        for i in range(self.n_inters):
            x = self.layers[i](x)
        return x


# Define a class for residual decoder blocks
class ResDecoderBlock(nn.Module):
    """
    A residual decoder block.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks in the block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super(ResDecoderBlock, self).__init__()
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.n_blocks = n_blocks
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                stride=stride,
                padding=(1, 1),
                output_padding=out_padding,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        self.conv2 = nn.ModuleList()
        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
        for _ in range(n_blocks - 1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x, concat_tensor):
        x = self.conv1(x)
        x = torch.cat((x, concat_tensor), dim=1)
        for i in range(self.n_blocks):
            x = self.conv2[i](x)
        return x


# Define a class for the decoder
class Decoder(nn.Module):
    """
    The decoder part of the DeepUnet.

    Args:
        in_channels (int): Number of input channels.
        n_decoders (int): Number of decoder blocks.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks in each decoder block.
        momentum (float): Momentum for batch normalization.
    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        for _ in range(self.n_decoders):
            out_channels = in_channels // 2
            self.layers.append(
                ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
            )
            in_channels = out_channels

    def forward(self, x, concat_tensors):
        for i in range(self.n_decoders):
            x = self.layers[i](x, concat_tensors[-1 - i])
        return x


# Define a class for the DeepUnet architecture
class DeepUnet(nn.Module):
    """
    The DeepUnet architecture.

    Args:
        kernel_size (tuple): Size of the average pooling kernel.
        n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
        en_de_layers (int): Number of encoder/decoder layers.
        inter_layers (int): Number of convolutional blocks in the intermediate layer.
        in_channels (int): Number of input channels.
        en_out_channels (int): Number of output channels for the first encoder block.
    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(DeepUnet, self).__init__()
        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        self.intermediate = Intermediate(
            self.encoder.out_channel // 2,
            self.encoder.out_channel,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(
            self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
        )

    def forward(self, x):
        x, concat_tensors = self.encoder(x)
        x = self.intermediate(x)
        x = self.decoder(x, concat_tensors)
        return x


# Define a class for the end-to-end model
class E2E(nn.Module):
    """
    The end-to-end model.

    Args:
        n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
        n_gru (int): Number of GRU layers.
        kernel_size (tuple): Size of the average pooling kernel.
        en_de_layers (int): Number of encoder/decoder layers.
        inter_layers (int): Number of convolutional blocks in the intermediate layer.
        in_channels (int): Number of input channels.
        en_out_channels (int): Number of output channels for the first encoder block.
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(E2E, self).__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * 128, 256, n_gru),
                nn.Linear(512, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
            )

    def forward(self, mel):
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x


# Define a class for the MelSpectrogram extractor
class MelSpectrogram(torch.nn.Module):
    """
    Extracts Mel-spectrogram features from audio.

    Args:
        is_half (bool): Whether to use half-precision floating-point numbers.
        n_mel_channels (int): Number of Mel-frequency bands.
        sample_rate (int): Sampling rate of the audio.
        win_length (int): Length of the window function in samples.
        hop_length (int): Hop size between frames in samples.
        n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
        mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
        mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
        clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sample_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        self.hann_window = {}
        mel_basis = mel(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.n_fft = win_length if n_fft is None else n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sample_rate = sample_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))
        keyshift_key = str(keyshift) + "_" + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
                audio.device
            )

        # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative
        source_device = audio.device
        if audio.device.type == "cuda" and torch.cuda.get_device_name().endswith(
            "[ZLUDA]"
        ):
            audio = audio.to("cpu")
            self.hann_window[keyshift_key] = self.hann_window[keyshift_key].to("cpu")

        fft = torch.stft(
            audio,
            n_fft=n_fft_new,
            hop_length=hop_length_new,
            win_length=win_length_new,
            window=self.hann_window[keyshift_key],
            center=center,
            return_complex=True,
        ).to(source_device)

        magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
        if keyshift != 0:
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half:
            mel_output = mel_output.half()
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec


# Define a class for the RMVPE0 predictor
class RMVPE0Predictor:
    """
    A predictor for fundamental frequency (F0) based on the RMVPE0 model.

    Args:
        model_path (str): Path to the RMVPE0 model file.
        is_half (bool): Whether to use half-precision floating-point numbers.
        device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
    """

    def __init__(self, model_path, is_half, device=None):
        self.resample_kernel = {}
        model = E2E(4, 1, (2, 2))
        ckpt = torch.load(model_path, map_location="cpu")
        model.load_state_dict(ckpt)
        model.eval()
        if is_half:
            model = model.half()
        self.model = model
        self.resample_kernel = {}
        self.is_half = is_half
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        self.model = self.model.to(device)
        cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel):
        """
        Converts Mel-spectrogram features to hidden representation.

        Args:
            mel (torch.Tensor): Mel-spectrogram features.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            mel = F.pad(
                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
            )
            hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
|
490 |
+
"""
|
491 |
+
Decodes hidden representation to F0.
|
492 |
+
|
493 |
+
Args:
|
494 |
+
hidden (np.ndarray): Hidden representation.
|
495 |
+
thred (float, optional): Threshold for salience. Defaults to 0.03.
|
496 |
+
"""
|
497 |
+
cents_pred = self.to_local_average_cents(hidden, thred=thred)
|
498 |
+
f0 = 10 * (2 ** (cents_pred / 1200))
|
499 |
+
f0[f0 == 10] = 0
|
500 |
+
return f0
|
501 |
+
|
502 |
+
def infer_from_audio(self, audio, thred=0.03):
|
503 |
+
"""
|
504 |
+
Infers F0 from audio.
|
505 |
+
|
506 |
+
Args:
|
507 |
+
audio (np.ndarray): Audio signal.
|
508 |
+
thred (float, optional): Threshold for salience. Defaults to 0.03.
|
509 |
+
"""
|
510 |
+
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
|
511 |
+
mel = self.mel_extractor(audio, center=True)
|
512 |
+
hidden = self.mel2hidden(mel)
|
513 |
+
hidden = hidden.squeeze(0).cpu().numpy()
|
514 |
+
if self.is_half == True:
|
515 |
+
hidden = hidden.astype("float32")
|
516 |
+
f0 = self.decode(hidden, thred=thred)
|
517 |
+
return f0
|
518 |
+
|
519 |
+
def to_local_average_cents(self, salience, thred=0.05):
|
520 |
+
"""
|
521 |
+
Converts salience to local average cents.
|
522 |
+
|
523 |
+
Args:
|
524 |
+
salience (np.ndarray): Salience values.
|
525 |
+
thred (float, optional): Threshold for salience. Defaults to 0.05.
|
526 |
+
"""
|
527 |
+
center = np.argmax(salience, axis=1)
|
528 |
+
salience = np.pad(salience, ((0, 0), (4, 4)))
|
529 |
+
center += 4
|
530 |
+
todo_salience = []
|
531 |
+
todo_cents_mapping = []
|
532 |
+
starts = center - 4
|
533 |
+
ends = center + 5
|
534 |
+
for idx in range(salience.shape[0]):
|
535 |
+
todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
|
536 |
+
todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
|
537 |
+
todo_salience = np.array(todo_salience)
|
538 |
+
todo_cents_mapping = np.array(todo_cents_mapping)
|
539 |
+
product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
|
540 |
+
weight_sum = np.sum(todo_salience, 1)
|
541 |
+
devided = product_sum / weight_sum
|
542 |
+
maxx = np.max(salience, axis=1)
|
543 |
+
devided[maxx <= thred] = 0
|
544 |
+
return devided
|
545 |
+
|
546 |
+
|
547 |
+
# Define a class for BiGRU (bidirectional GRU)
|
548 |
+
class BiGRU(nn.Module):
|
549 |
+
"""
|
550 |
+
A bidirectional GRU layer.
|
551 |
+
|
552 |
+
Args:
|
553 |
+
input_features (int): Number of input features.
|
554 |
+
hidden_features (int): Number of hidden features.
|
555 |
+
num_layers (int): Number of GRU layers.
|
556 |
+
"""
|
557 |
+
|
558 |
+
def __init__(self, input_features, hidden_features, num_layers):
|
559 |
+
super(BiGRU, self).__init__()
|
560 |
+
self.gru = nn.GRU(
|
561 |
+
input_features,
|
562 |
+
hidden_features,
|
563 |
+
num_layers=num_layers,
|
564 |
+
batch_first=True,
|
565 |
+
bidirectional=True,
|
566 |
+
)
|
567 |
+
|
568 |
+
def forward(self, x):
|
569 |
+
return self.gru(x)[0]
|
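For orientation, a minimal usage sketch of the predictor above. The test signal is an assumption; the checkpoint path matches the rmvpe.pt bundled in this repo, and the mel extractor is hard-coded to 16 kHz input:

import numpy as np

predictor = RMVPE0Predictor(
    "programs/applio_code/rvc/models/predictors/rmvpe.pt", is_half=False, device="cpu"
)
sr = 16000  # MelSpectrogram above is constructed with sample_rate=16000
t = np.arange(sr, dtype=np.float32) / sr
audio = 0.5 * np.sin(2 * np.pi * 220.0 * t)  # one second of a 220 Hz test tone
f0 = predictor.infer_from_audio(audio, thred=0.03)  # frame-wise F0 in Hz; 0 = unvoiced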
programs/applio_code/rvc/lib/tools/analyzer.py
ADDED
@@ -0,0 +1,76 @@
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import librosa


def calculate_features(y, sr):
    stft = np.abs(librosa.stft(y))
    duration = librosa.get_duration(y=y, sr=sr)
    cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
    bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
    return stft, duration, cent, bw, rolloff


def plot_title(title):
    plt.suptitle(title, fontsize=16, fontweight="bold")


def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
    plt.subplot(3, 1, 1)
    plt.imshow(
        librosa.amplitude_to_db(stft, ref=np.max),
        origin="lower",
        extent=[0, duration, 0, sr / 1000],
        aspect="auto",
        cmap=cmap,  # Change the colormap here
    )
    plt.colorbar(format="%+2.0f dB")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectrogram")


def plot_waveform(y, sr, duration):
    plt.subplot(3, 1, 2)
    librosa.display.waveshow(y, sr=sr)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title("Waveform")


def plot_features(times, cent, bw, rolloff, duration):
    plt.subplot(3, 1, 3)
    plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
    plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
    plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
    plt.xlabel("Time (s)")
    plt.title("Spectral Features")
    plt.legend()


def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
    y, sr = librosa.load(audio_file)
    stft, duration, cent, bw, rolloff = calculate_features(y, sr)

    plt.figure(figsize=(12, 10))

    plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
    plot_spectrogram(y, sr, stft, duration)
    plot_waveform(y, sr, duration)
    plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)

    plt.tight_layout()

    if save_plot_path:
        plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
    plt.close()

    audio_info = f"""Sample Rate: {sr}\nDuration: {(
        str(round(duration, 2)) + " seconds"
        if duration < 60
        else str(round(duration / 60, 2)) + " minutes"
    )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""

    return audio_info, save_plot_path
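A minimal sketch of calling the analyzer above; the input path is an assumption, and the default plot destination matches the logs/ directory tracked in this repo:

info, plot_path = analyze_audio("sample.wav")  # hypothetical input file
print(info)       # sample rate, duration, sample count, channel layout
print(plot_path)  # defaults to logs/audio_analysis.png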
programs/applio_code/rvc/lib/tools/gdown.py
ADDED
@@ -0,0 +1,354 @@
import os
import re
import six
import sys
import json
import tqdm
import time
import shutil
import warnings
import tempfile
import textwrap
import requests
from six.moves import urllib_parse


def indent(text, prefix):
    """Indent each non-empty line of text with the given prefix."""
    return "".join(
        (prefix + line if line.strip() else line) for line in text.splitlines(True)
    )


class FileURLRetrievalError(Exception):
    pass


class FolderContentsMaximumLimitError(Exception):
    pass


def parse_url(url, warning=True):
    """Parse URLs especially for Google Drive links.

    Args:
        url: URL to parse.
        warning: Whether to warn if the URL is not a download link.

    Returns:
        A tuple (file_id, is_download_link), where file_id is the ID of the
        file on Google Drive, and is_download_link is a flag indicating
        whether the URL is a download link.
    """
    parsed = urllib_parse.urlparse(url)
    query = urllib_parse.parse_qs(parsed.query)
    is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com")
    is_download_link = parsed.path.endswith("/uc")

    if not is_gdrive:
        return None, is_download_link

    file_id = query.get("id", [None])[0]
    if file_id is None:
        for pattern in (
            r"^/file/d/(.*?)/(edit|view)$",
            r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$",
            r"^/document/d/(.*?)/(edit|htmlview|view)$",
            r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
            r"^/presentation/d/(.*?)/(edit|htmlview|view)$",
            r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
            r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$",
            r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
        ):
            match = re.match(pattern, parsed.path)
            if match:
                file_id = match.group(1)
                break

    if warning and not is_download_link:
        warnings.warn(
            "You specified a Google Drive link that is not the correct link "
            "to download a file. You might want to try `--fuzzy` option "
            f"or the following url: https://drive.google.com/uc?id={file_id}"
        )

    return file_id, is_download_link


CHUNK_SIZE = 512 * 1024  # 512KB
HOME = os.path.expanduser("~")


def get_url_from_gdrive_confirmation(contents):
    """Extract the download URL from a Google Drive confirmation page."""
    for pattern in (
        r'href="(\/uc\?export=download[^"]+)',
        r'href="/open\?id=([^"]+)"',
        r'"downloadUrl":"([^"]+)',
    ):
        match = re.search(pattern, contents)
        if match:
            url = match.group(1)
            if pattern == r'href="/open\?id=([^"]+)"':
                uuid = re.search(
                    r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
                    contents,
                ).group(1)
                url = (
                    "https://drive.usercontent.google.com/download?id="
                    + url
                    + "&confirm=t&uuid="
                    + uuid
                )
            elif pattern == r'"downloadUrl":"([^"]+)':
                url = url.replace("\\u003d", "=").replace("\\u0026", "&")
            else:
                url = "https://docs.google.com" + url.replace("&amp;", "&")
            return url

    match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
    if match:
        error = match.group(1)
        raise FileURLRetrievalError(error)

    raise FileURLRetrievalError(
        "Cannot retrieve the public link of the file. "
        "You may need to change the permission to "
        "'Anyone with the link', or have had many accesses."
    )


def _get_session(proxy, use_cookies, return_cookies_file=False):
    """Create a requests session with optional proxy and cookie handling."""
    sess = requests.session()
    sess.headers.update(
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
    )

    if proxy is not None:
        sess.proxies = {"http": proxy, "https": proxy}
        print("Using proxy:", proxy, file=sys.stderr)

    cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
    if os.path.exists(cookies_file) and use_cookies:
        with open(cookies_file) as f:
            cookies = json.load(f)
        for k, v in cookies:
            sess.cookies[k] = v

    return (sess, cookies_file) if return_cookies_file else sess


def download(
    url=None,
    output=None,
    quiet=False,
    proxy=None,
    speed=None,
    use_cookies=True,
    verify=True,
    id=None,
    fuzzy=True,
    resume=False,
    format=None,
):
    """Download file from URL.

    Parameters
    ----------
    url: str
        URL. Google Drive URL is also supported.
    output: str
        Output filename. Default is basename of URL.
    quiet: bool
        Suppress terminal output. Default is False.
    proxy: str
        Proxy.
    speed: float
        Download byte size per second (e.g., 256KB/s = 256 * 1024).
    use_cookies: bool
        Flag to use cookies. Default is True.
    verify: bool or string
        Either a bool, in which case it controls whether the server's TLS
        certificate is verified, or a string, in which case it must be a path
        to a CA bundle to use. Default is True.
    id: str
        Google Drive's file ID.
    fuzzy: bool
        Fuzzy extraction of Google Drive's file Id. Default is True.
    resume: bool
        Resume the download from existing tmp file if possible.
        Default is False.
    format: str, optional
        Format of Google Docs, Spreadsheets and Slides. Default is:
        - Google Docs: 'docx'
        - Google Spreadsheet: 'xlsx'
        - Google Slides: 'pptx'

    Returns
    -------
    output: str
        Output filename.
    """
    if not (id is None) ^ (url is None):
        raise ValueError("Either url or id has to be specified")
    if id is not None:
        url = f"https://drive.google.com/uc?id={id}"

    url_origin = url

    sess, cookies_file = _get_session(
        proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
    )

    gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy)

    if fuzzy and gdrive_file_id:
        # overwrite the url with fuzzy match of a file id
        url = f"https://drive.google.com/uc?id={gdrive_file_id}"
        url_origin = url
        is_gdrive_download_link = True

    while True:
        res = sess.get(url, stream=True, verify=verify)

        if url == url_origin and res.status_code == 500:
            # The file could be Google Docs or Spreadsheets.
            url = f"https://drive.google.com/open?id={gdrive_file_id}"
            continue

        if res.headers["Content-Type"].startswith("text/html"):
            title = re.search("<title>(.+)</title>", res.text)
            if title:
                title = title.group(1)
                if title.endswith(" - Google Docs"):
                    url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}"
                    continue
                if title.endswith(" - Google Sheets"):
                    url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}"
                    continue
                if title.endswith(" - Google Slides"):
                    url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
                    continue
        elif (
            "Content-Disposition" in res.headers
            and res.headers["Content-Disposition"].endswith("pptx")
            and format not in (None, "pptx")
        ):
            url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
            continue

        if use_cookies:
            os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
            with open(cookies_file, "w") as f:
                cookies = [
                    (k, v)
                    for k, v in sess.cookies.items()
                    if not k.startswith("download_warning_")
                ]
                json.dump(cookies, f, indent=2)

        if "Content-Disposition" in res.headers:
            # This is the file
            break
        if not (gdrive_file_id and is_gdrive_download_link):
            break

        # Need to redirect with confirmation
        try:
            url = get_url_from_gdrive_confirmation(res.text)
        except FileURLRetrievalError as e:
            message = (
                "Failed to retrieve file url:\n\n"
                "{}\n\n"
                "You may still be able to access the file from the browser:"
                f"\n\n\t{url_origin}\n\n"
                "but Gdown can't. Please check connections and permissions."
            ).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t"))
            raise FileURLRetrievalError(message)

    if gdrive_file_id and is_gdrive_download_link:
        content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"])
        filename_from_url = (
            re.search(r"filename\*=UTF-8''(.*)", content_disposition)
            or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
        ).group(1)
        filename_from_url = filename_from_url.replace(os.path.sep, "_")
    else:
        filename_from_url = os.path.basename(url)

    output = output or filename_from_url

    output_is_path = isinstance(output, six.string_types)
    if output_is_path and output.endswith(os.path.sep):
        os.makedirs(output, exist_ok=True)
        output = os.path.join(output, filename_from_url)

    if output_is_path:
        temp_dir = os.path.dirname(output) or "."
        prefix = os.path.basename(output)
        existing_tmp_files = [
            os.path.join(temp_dir, file)
            for file in os.listdir(temp_dir)
            if file.startswith(prefix)
        ]
        if resume and existing_tmp_files:
            if len(existing_tmp_files) > 1:
                print(
                    "There are multiple temporary files to resume:",
                    file=sys.stderr,
                )
                for file in existing_tmp_files:
                    print(f"\t{file}", file=sys.stderr)
                print(
                    "Please remove them except one to resume downloading.",
                    file=sys.stderr,
                )
                return
            tmp_file = existing_tmp_files[0]
        else:
            resume = False
            tmp_file = tempfile.mktemp(
                suffix=tempfile.template, prefix=prefix, dir=temp_dir
            )
        f = open(tmp_file, "ab")
    else:
        tmp_file = None
        f = output

    if tmp_file is not None and f.tell() != 0:
        headers = {"Range": f"bytes={f.tell()}-"}
        res = sess.get(url, headers=headers, stream=True, verify=verify)

    if not quiet:
        if resume:
            print("Resume:", tmp_file, file=sys.stderr)
        print(
            "To:",
            os.path.abspath(output) if output_is_path else output,
            file=sys.stderr,
        )

    try:
        total = int(res.headers.get("Content-Length", 0))
        if not quiet:
            pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True)
        t_start = time.time()
        for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
            f.write(chunk)
            if not quiet:
                pbar.update(len(chunk))
            if speed is not None:
                elapsed_time_expected = 1.0 * pbar.n / speed
                elapsed_time = time.time() - t_start
                if elapsed_time < elapsed_time_expected:
                    time.sleep(elapsed_time_expected - elapsed_time)
        if not quiet:
            pbar.close()
        if tmp_file:
            f.close()
            shutil.move(tmp_file, output)
    finally:
        sess.close()

    return output
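A minimal sketch of the download entry point above, as model_download.py below uses it; the file id is a placeholder for any publicly shared Drive file:

output = download(
    "https://drive.google.com/uc?id=YOUR_FILE_ID",  # placeholder id
    output="model.zip",
    quiet=False,
    fuzzy=True,
)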
programs/applio_code/rvc/lib/tools/launch_tensorboard.py
ADDED
@@ -0,0 +1,21 @@
import time
import logging
from tensorboard import program

log_path = "logs"


def launch_tensorboard_pipeline():
    logging.getLogger("root").setLevel(logging.WARNING)
    logging.getLogger("tensorboard").setLevel(logging.WARNING)

    tb = program.TensorBoard()
    tb.configure(argv=[None, "--logdir", log_path])
    url = tb.launch()

    print(
        f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
    )

    while True:
        time.sleep(600)
programs/applio_code/rvc/lib/tools/model_download.py
ADDED
@@ -0,0 +1,385 @@
import os
import re
import six
import sys
import wget
import shutil
import zipfile
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote, urlencode, parse_qs, urlparse

now_dir = os.getcwd()
sys.path.append(now_dir)

from programs.applio_code.rvc.lib.utils import format_title
from programs.applio_code.rvc.lib.tools import gdown


def find_folder_parent(search_dir, folder_name):
    for dirpath, dirnames, _ in os.walk(search_dir):
        if folder_name in dirnames:
            return os.path.abspath(dirpath)
    return None


file_path = find_folder_parent(now_dir, "logs")
zips_path = os.path.join(file_path, "zips")


def search_pth_index(folder):
    pth_paths = [
        os.path.join(folder, file)
        for file in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth")
    ]
    index_paths = [
        os.path.join(folder, file)
        for file in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index")
    ]

    return pth_paths, index_paths


def get_mediafire_download_link(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    download_button = soup.find(
        "a", {"class": "input popsok", "aria-label": "Download file"}
    )
    if download_button:
        download_link = download_button.get("href")
        return download_link
    else:
        return None


def download_from_url(url):
    os.makedirs(zips_path, exist_ok=True)
    if url != "":
        if "drive.google.com" in url:
            if "file/d/" in url:
                file_id = url.split("file/d/")[1].split("/")[0]
            elif "id=" in url:
                file_id = url.split("id=")[1].split("&")[0]
            else:
                return None

            if file_id:
                os.chdir(zips_path)
                try:
                    gdown.download(
                        f"https://drive.google.com/uc?id={file_id}",
                        quiet=True,
                        fuzzy=True,
                    )
                except Exception as error:
                    error_message = str(
                        f"An error occurred downloading the file: {error}"
                    )
                    if (
                        "Too many users have viewed or downloaded this file recently"
                        in error_message
                    ):
                        os.chdir(now_dir)
                        return "too much use"
                    elif (
                        "Cannot retrieve the public link of the file." in error_message
                    ):
                        os.chdir(now_dir)
                        return "private link"
                    else:
                        print(error_message)
                        os.chdir(now_dir)
                        return None
        elif "disk.yandex.ru" in url:
            base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
            public_key = url
            final_url = base_url + urlencode(dict(public_key=public_key))
            response = requests.get(final_url)
            download_url = response.json()["href"]
            download_response = requests.get(download_url)

            if download_response.status_code == 200:
                filename = parse_qs(urlparse(unquote(download_url)).query).get(
                    "filename", [""]
                )[0]
                if filename:
                    os.chdir(zips_path)
                    with open(filename, "wb") as f:
                        f.write(download_response.content)
            else:
                print("Failed to get filename from URL.")
                return None

        elif "pixeldrain.com" in url:
            try:
                file_id = url.split("pixeldrain.com/u/")[1]
                os.chdir(zips_path)
                print(file_id)
                response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
                if response.status_code == 200:
                    file_name = (
                        response.headers.get("Content-Disposition")
                        .split("filename=")[-1]
                        .strip('";')
                    )
                    os.makedirs(zips_path, exist_ok=True)
                    with open(os.path.join(zips_path, file_name), "wb") as newfile:
                        newfile.write(response.content)
                    os.chdir(file_path)
                    return "downloaded"
                else:
                    os.chdir(file_path)
                    return None
            except Exception as error:
                print(f"An error occurred downloading the file: {error}")
                os.chdir(file_path)
                return None

        elif "cdn.discordapp.com" in url:
            file = requests.get(url)
            os.chdir(zips_path)
            if file.status_code == 200:
                name = url.split("/")
                with open(os.path.join(name[-1]), "wb") as newfile:
                    newfile.write(file.content)
            else:
                return None
        elif "/blob/" in url or "/resolve/" in url:
            os.chdir(zips_path)
            if "/blob/" in url:
                url = url.replace("/blob/", "/resolve/")

            response = requests.get(url, stream=True)
            if response.status_code == 200:
                content_disposition = six.moves.urllib_parse.unquote(
                    response.headers["Content-Disposition"]
                )
                m = re.search(r'filename="([^"]+)"', content_disposition)
                file_name = m.groups()[0]
                file_name = file_name.replace(os.path.sep, "_")
                total_size_in_bytes = int(response.headers.get("content-length", 0))
                block_size = 1024
                progress_bar_length = 50
                progress = 0

                with open(os.path.join(zips_path, file_name), "wb") as file:
                    for data in response.iter_content(block_size):
                        file.write(data)
                        progress += len(data)
                        progress_percent = int((progress / total_size_in_bytes) * 100)
                        num_dots = int(
                            (progress / total_size_in_bytes) * progress_bar_length
                        )
                        progress_bar = (
                            "["
                            + "." * num_dots
                            + " " * (progress_bar_length - num_dots)
                            + "]"
                        )
                        print(
                            f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ",
                            end="\r",
                        )
                        if progress_percent == 100:
                            print("\n")

            else:
                os.chdir(now_dir)
                return None
        elif "/tree/main" in url:
            os.chdir(zips_path)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            temp_url = ""
            for link in soup.find_all("a", href=True):
                if link["href"].endswith(".zip"):
                    temp_url = link["href"]
                    break
            if temp_url:
                url = temp_url
                url = url.replace("blob", "resolve")
                if "huggingface.co" not in url:
                    url = "https://huggingface.co" + url

                wget.download(url)
            else:
                os.chdir(now_dir)
                return None
        elif "applio.org" in url:
            parts = url.split("/")
            id_with_query = parts[-1]
            id_parts = id_with_query.split("?")
            id_number = id_parts[0]

            url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
            headers = {
                "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
            }

            params = {"id": f"eq.{id_number}"}
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                json_response = response.json()
                print(json_response)
                if json_response:
                    link = json_response[0]["link"]
                    verify = download_from_url(link)
                    if verify == "downloaded":
                        return "downloaded"
                    else:
                        return None
            else:
                return None
        else:
            try:
                os.chdir(zips_path)
                wget.download(url)
            except Exception as error:
                os.chdir(now_dir)
                print(f"An error occurred downloading the file: {error}")
                return None

        for currentPath, _, zipFiles in os.walk(zips_path):
            for Files in zipFiles:
                filePart = Files.split(".")
                extensionFile = filePart[len(filePart) - 1]
                filePart.pop()
                nameFile = "_".join(filePart)
                realPath = os.path.join(currentPath, Files)
                os.rename(realPath, nameFile + "." + extensionFile)

        os.chdir(now_dir)
        return "downloaded"

    os.chdir(now_dir)
    return None


def extract_and_show_progress(zipfile_path, unzips_path):
    try:
        with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                zip_ref.extract(file_info, unzips_path)
        os.remove(zipfile_path)
        return True
    except Exception as error:
        print(f"An error occurred extracting the zip file: {error}")
        return False


def unzip_file(zip_path, zip_file_name):
    zip_file_path = os.path.join(zip_path, zip_file_name + ".zip")
    extract_path = os.path.join(file_path, zip_file_name)
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)
    os.remove(zip_file_path)


def model_download_pipeline(url: str):
    try:
        verify = download_from_url(url)
        if verify == "downloaded":
            extract_folder_path = ""
            for filename in os.listdir(zips_path):
                if filename.endswith(".zip"):
                    zipfile_path = os.path.join(zips_path, filename)
                    print("Proceeding with the extraction...")

                    model_zip = os.path.basename(zipfile_path)
                    model_name = format_title(model_zip.split(".zip")[0])
                    extract_folder_path = os.path.join(
                        "logs",
                        os.path.normpath(model_name),
                    )
                    success = extract_and_show_progress(
                        zipfile_path, extract_folder_path
                    )

                    macosx_path = os.path.join(extract_folder_path, "__MACOSX")
                    if os.path.exists(macosx_path):
                        shutil.rmtree(macosx_path)

                    subfolders = [
                        f
                        for f in os.listdir(extract_folder_path)
                        if os.path.isdir(os.path.join(extract_folder_path, f))
                    ]
                    if len(subfolders) == 1:
                        subfolder_path = os.path.join(
                            extract_folder_path, subfolders[0]
                        )
                        for item in os.listdir(subfolder_path):
                            s = os.path.join(subfolder_path, item)
                            d = os.path.join(extract_folder_path, item)
                            shutil.move(s, d)
                        os.rmdir(subfolder_path)

                    for item in os.listdir(extract_folder_path):
                        if ".pth" in item:
                            file_name = item.split(".pth")[0]
                            if file_name != model_name:
                                os.rename(
                                    os.path.join(extract_folder_path, item),
                                    os.path.join(
                                        extract_folder_path, model_name + ".pth"
                                    ),
                                )
                        else:
                            if "v2" not in item:
                                if "_nprobe_1_" in item and "_v1" in item:
                                    file_name = item.split("_nprobe_1_")[1].split(
                                        "_v1"
                                    )[0]
                                    if file_name != model_name:
                                        new_file_name = (
                                            item.split("_nprobe_1_")[0]
                                            + "_nprobe_1_"
                                            + model_name
                                            + "_v1"
                                        )
                                        os.rename(
                                            os.path.join(extract_folder_path, item),
                                            os.path.join(
                                                extract_folder_path,
                                                new_file_name + ".index",
                                            ),
                                        )
                            else:
                                if "_nprobe_1_" in item and "_v2" in item:
                                    file_name = item.split("_nprobe_1_")[1].split(
                                        "_v2"
                                    )[0]
                                    if file_name != model_name:
                                        new_file_name = (
                                            item.split("_nprobe_1_")[0]
                                            + "_nprobe_1_"
                                            + model_name
                                            + "_v2"
                                        )
                                        os.rename(
                                            os.path.join(extract_folder_path, item),
                                            os.path.join(
                                                extract_folder_path,
                                                new_file_name + ".index",
                                            ),
                                        )

                    if success:
                        print(f"Model {model_name} downloaded!")
                    else:
                        print(f"Error downloading {model_name}")
                        return "Error"
            if extract_folder_path == "":
                print("Zip file was not found.")
                return "Error"
            result = search_pth_index(extract_folder_path)
            return result
        else:
            return "Error"
    except Exception as error:
        print(f"An unexpected error occurred: {error}")
        return "Error"
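A minimal sketch of the pipeline above; the URL is a placeholder. On success it returns the (pth_paths, index_paths) tuple from search_pth_index, otherwise the string "Error":

result = model_download_pipeline(
    "https://huggingface.co/user/repo/resolve/main/Model.zip"  # placeholder URL
)
if result != "Error":
    pth_paths, index_paths = result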
programs/applio_code/rvc/lib/tools/prerequisites_download.py
ADDED
@@ -0,0 +1,164 @@
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import requests

url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"

pretraineds_v1_list = [
    (
        "pretrained_v1/",
        [
            "D32k.pth",
            "D40k.pth",
            "D48k.pth",
            "G32k.pth",
            "G40k.pth",
            "G48k.pth",
            "f0D32k.pth",
            "f0D40k.pth",
            "f0D48k.pth",
            "f0G32k.pth",
            "f0G40k.pth",
            "f0G48k.pth",
        ],
    )
]
pretraineds_v2_list = [
    (
        "pretrained_v2/",
        [
            "D32k.pth",
            "D40k.pth",
            "D48k.pth",
            "G32k.pth",
            "G40k.pth",
            "G48k.pth",
            "f0D32k.pth",
            "f0D40k.pth",
            "f0D48k.pth",
            "f0G32k.pth",
            "f0G40k.pth",
            "f0G48k.pth",
        ],
    )
]
models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])]
linux_executables_list = [("formant/", ["stftpitchshift"])]
executables_list = [
    ("", ["ffmpeg.exe", "ffprobe.exe"]),
    ("formant/", ["stftpitchshift.exe"]),
]

folder_mapping_list = {
    "pretrained_v1/": "programs/applio_code/rvc/models/pretraineds/pretrained_v1/",
    "pretrained_v2/": "programs/applio_code/rvc/models/pretraineds/pretrained_v2/",
    "embedders/contentvec/": "programs/applio_code/rvc/models/embedders/contentvec/",
    "predictors/": "programs/applio_code/rvc/models/predictors/",
    "formant/": "programs/applio_code/rvc/models/formant/",
}


def get_file_size_if_missing(file_list):
    """
    Calculate the total size of files to be downloaded only if they do not exist locally.
    """
    total_size = 0
    for remote_folder, files in file_list:
        local_folder = folder_mapping_list.get(remote_folder, "")
        for file in files:
            destination_path = os.path.join(local_folder, file)
            if not os.path.exists(destination_path):
                url = f"{url_base}/{remote_folder}{file}"
                response = requests.head(url)
                total_size += int(response.headers.get("content-length", 0))
    return total_size


def download_file(url, destination_path, global_bar):
    """
    Download a file from the given URL to the specified destination path,
    updating the global progress bar as data is downloaded.
    """

    dir_name = os.path.dirname(destination_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    response = requests.get(url, stream=True)
    block_size = 1024
    with open(destination_path, "wb") as file:
        for data in response.iter_content(block_size):
            file.write(data)
            global_bar.update(len(data))


def download_mapping_files(file_mapping_list, global_bar):
    """
    Download all files in the provided file mapping list using a thread pool executor,
    and update the global progress bar as downloads progress.
    """
    with ThreadPoolExecutor() as executor:
        futures = []
        for remote_folder, file_list in file_mapping_list:
            local_folder = folder_mapping_list.get(remote_folder, "")
            for file in file_list:
                destination_path = os.path.join(local_folder, file)
                if not os.path.exists(destination_path):
                    url = f"{url_base}/{remote_folder}{file}"
                    futures.append(
                        executor.submit(
                            download_file, url, destination_path, global_bar
                        )
                    )
        for future in futures:
            future.result()


def calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe):
    """
    Calculate the total size of all files to be downloaded based on selected categories.
    """
    total_size = 0
    if models:
        total_size += get_file_size_if_missing(models_list)
        total_size += get_file_size_if_missing(embedders_list)
    if exe:
        total_size += get_file_size_if_missing(
            executables_list if os.name == "nt" else linux_executables_list
        )
    if pretraineds_v1:
        total_size += get_file_size_if_missing(pretraineds_v1_list)
    if pretraineds_v2:
        total_size += get_file_size_if_missing(pretraineds_v2_list)
    return total_size


def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe):
    """
    Manage the download pipeline for different categories of files.
    """
    total_size = calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe)

    if total_size > 0:
        with tqdm(
            total=total_size, unit="iB", unit_scale=True, desc="Downloading all files"
        ) as global_bar:
            if models:
                download_mapping_files(models_list, global_bar)
                download_mapping_files(embedders_list, global_bar)
            if exe:
                download_mapping_files(
                    executables_list if os.name == "nt" else linux_executables_list,
                    global_bar,
                )
            if pretraineds_v1:
                download_mapping_files(pretraineds_v1_list, global_bar)
            if pretraineds_v2:
                download_mapping_files(pretraineds_v2_list, global_bar)


if __name__ == "__main__":
    prequisites_download_pipeline(False, False, True, False)
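The same invocation as the __main__ guard above, spelled out with keyword arguments for clarity; only the predictor and embedder files that are missing locally get fetched:

prequisites_download_pipeline(
    pretraineds_v1=False, pretraineds_v2=False, models=True, exe=False
)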
programs/applio_code/rvc/lib/tools/pretrained_selector.py
ADDED
@@ -0,0 +1,63 @@
def pretrained_selector(pitch_guidance):
    if pitch_guidance:
        return {
            "v1": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G32k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G40k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G48k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D48k.pth",
                ),
            },
            "v2": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G32k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G40k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G48k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D48k.pth",
                ),
            },
        }
    else:
        return {
            "v1": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v1/G32k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v1/G40k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v1/G48k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D48k.pth",
                ),
            },
            "v2": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v2/G32k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v2/G40k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v2/G48k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D48k.pth",
                ),
            },
        }
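A minimal lookup sketch: the selector returns a nested dict keyed by model version and sample rate, each value a (generator, discriminator) checkpoint pair:

paths = pretrained_selector(True)    # with pitch guidance (f0 pretrains)
g_path, d_path = paths["v2"][48000]  # v2 pretrains at 48 kHz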
programs/applio_code/rvc/lib/tools/split_audio.py
ADDED
@@ -0,0 +1,107 @@
from pydub.silence import detect_nonsilent
from pydub import AudioSegment
import numpy as np
import re
import os

from programs.applio_code.rvc.lib.utils import format_title


def process_audio(file_path):
    try:
        # load audio file
        song = AudioSegment.from_file(file_path)

        # set silence threshold and duration
        silence_thresh = -70  # dB
        min_silence_len = 750  # ms, adjust as needed

        # detect nonsilent parts
        nonsilent_parts = detect_nonsilent(
            song, min_silence_len=min_silence_len, silence_thresh=silence_thresh
        )

        # Create a new directory to store chunks
        file_dir = os.path.dirname(file_path)
        file_name = os.path.basename(file_path).split(".")[0]
        file_name = format_title(file_name)
        new_dir_path = os.path.join(file_dir, file_name)
        os.makedirs(new_dir_path, exist_ok=True)

        # Check if timestamps file exists; if so, delete it
        timestamps_file = os.path.join(file_dir, f"{file_name}_timestamps.txt")
        if os.path.isfile(timestamps_file):
            os.remove(timestamps_file)

        # export chunks and save start times
        segment_count = 0
        for i, (start_i, end_i) in enumerate(nonsilent_parts):
            chunk = song[start_i:end_i]
            chunk_file_path = os.path.join(new_dir_path, f"chunk{i}.wav")
            chunk.export(chunk_file_path, format="wav")

            print(f"Segment {i} created!")
            segment_count += 1

            # write start times to file
            with open(timestamps_file, "a", encoding="utf-8") as f:
                f.write(f"{chunk_file_path} starts at {start_i} ms\n")

        print(f"Total segments created: {segment_count}")
        print(f"Split all chunks for {file_path} successfully!")

        return "Finish", new_dir_path

    except Exception as error:
        print(f"An error occurred splitting the audio: {error}")
        return "Error", None


def merge_audio(timestamps_file):
    try:
        # Extract prefix from the timestamps filename
        prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "")
        timestamps_dir = os.path.dirname(timestamps_file)

        # Open the timestamps file
        with open(timestamps_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Initialize empty list to hold audio segments
        audio_segments = []
        last_end_time = 0

        print(f"Processing file: {timestamps_file}")

        for line in lines:
            # Extract filename and start time from line
            match = re.search(r"(chunk\d+\.wav) starts at (\d+) ms", line)
            if match:
                filename, start_time = match.groups()
                start_time = int(start_time)

                # Construct the complete path to the chunk file
                chunk_file = os.path.join(timestamps_dir, prefix, filename)

                # Add silence from last_end_time to start_time
                silence_duration = max(start_time - last_end_time, 0)
                silence = AudioSegment.silent(duration=silence_duration)
                audio_segments.append(silence)

                # Load audio file and append to list
                audio = AudioSegment.from_wav(chunk_file)
                audio_segments.append(audio)

                # Update last_end_time
                last_end_time = start_time + len(audio)

                print(f"Processed chunk: {chunk_file}")

        # Concatenate all audio_segments and export
        merged_audio = sum(audio_segments)
        merged_audio_np = np.array(merged_audio.get_array_of_samples())
        return merged_audio.frame_rate, merged_audio_np

    except Exception as error:
        print(f"An error occurred merging the audio: {error}")
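A minimal round-trip sketch, assuming a hypothetical input file: process_audio writes chunkN.wav files plus a <name>_timestamps.txt next to the input, which merge_audio then reassembles with the original silence gaps restored:

status, chunks_dir = process_audio("audios/vocals.wav")  # hypothetical path
if status == "Finish":
    sample_rate, samples = merge_audio("audios/vocals_timestamps.txt")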
programs/applio_code/rvc/lib/tools/tts.py
ADDED
@@ -0,0 +1,20 @@
import sys
import asyncio
import edge_tts


async def main():
    # Parse command line arguments
    text = str(sys.argv[1])
    voice = str(sys.argv[2])
    rate = int(sys.argv[3])
    output_file = str(sys.argv[4])

    rates = f"+{rate}%" if rate >= 0 else f"{rate}%"

    await edge_tts.Communicate(text, voice, rate=rates).save(output_file)
    print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")


if __name__ == "__main__":
    asyncio.run(main())
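The script above is invoked as a CLI with four positional arguments (text, voice, rate offset in percent, output path); a minimal sketch of launching it from Python, assuming en-US-AriaNeural is one of the voices listed in tts_voices.json:

import sys
import subprocess

subprocess.run(
    [
        sys.executable,
        "programs/applio_code/rvc/lib/tools/tts.py",
        "Hello world",       # text
        "en-US-AriaNeural",  # voice name (assumed present in tts_voices.json)
        "0",                 # rate offset in percent
        "output.wav",        # output file
    ],
    check=True,
)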
programs/applio_code/rvc/lib/tools/tts_voices.json
ADDED
The diff for this file is too large to render. See raw diff.
programs/applio_code/rvc/lib/utils.py
ADDED
@@ -0,0 +1,116 @@
+import os, sys
+import librosa
+import soundfile as sf
+import re
+import unicodedata
+import wget
+from torch import nn
+
+import logging
+from transformers import HubertModel
+import warnings
+
+# Remove this to see warnings about transformers models
+warnings.filterwarnings("ignore")
+
+logging.getLogger("fairseq").setLevel(logging.ERROR)
+logging.getLogger("faiss.loader").setLevel(logging.ERROR)
+logging.getLogger("transformers").setLevel(logging.ERROR)
+logging.getLogger("torch").setLevel(logging.ERROR)
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift")
+stft = base_path + ".exe" if sys.platform == "win32" else base_path
+
+
+class HubertModelWithFinalProj(HubertModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+def load_audio(file, sample_rate):
+    try:
+        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        audio, sr = sf.read(file)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.T)
+        if sr != sample_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
+    except Exception as error:
+        raise RuntimeError(f"An error occurred loading the audio: {error}")
+
+    return audio.flatten()
+
+
+def load_audio_infer(file, sample_rate):
+    file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+    if not os.path.isfile(file):
+        raise FileNotFoundError(f"File not found: {file}")
+    audio, sr = sf.read(file)
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.T)
+    if sr != sample_rate:
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
+    return audio.flatten()
+
+
+def format_title(title):
+    formatted_title = (
+        unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
+    )
+    formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
+    formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
+    formatted_title = re.sub(r"\s+", "_", formatted_title)
+    return formatted_title
+
+
+def load_embedding(embedder_model, custom_embedder=None):
+    embedder_root = os.path.join(
+        now_dir, "programs", "applio_code", "rvc", "models", "embedders"
+    )
+    embedding_list = {
+        "contentvec": os.path.join(embedder_root, "contentvec"),
+        "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
+        "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
+        "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
+    }
+
+    online_embedders = {
+        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
+        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
+        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
+        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
+    }
+
+    config_files = {
+        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
+        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
+        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
+        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
+    }
+
+    if embedder_model == "custom":
+        if os.path.exists(custom_embedder):
+            model_path = custom_embedder
+        else:
+            print(f"Custom embedder not found: {custom_embedder}, using contentvec")
+            model_path = embedding_list["contentvec"]
+    else:
+        model_path = embedding_list[embedder_model]
+        bin_file = os.path.join(model_path, "pytorch_model.bin")
+        json_file = os.path.join(model_path, "config.json")
+        os.makedirs(model_path, exist_ok=True)
+        if not os.path.exists(bin_file):
+            url = online_embedders[embedder_model]
+            print(f"Downloading {url} to {model_path}...")
+            wget.download(url, out=bin_file)
+        if not os.path.exists(json_file):
+            url = config_files[embedder_model]
+            print(f"Downloading {url} to {model_path}...")
+            wget.download(url, out=json_file)
+
+    models = HubertModelWithFinalProj.from_pretrained(model_path)
+    return models
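A quick usage sketch for these helpers, assuming the repository root is on sys.path so the package imports resolve, and with "example.wav" standing in for a real input file:

from programs.applio_code.rvc.lib.utils import load_audio, load_embedding

audio = load_audio("example.wav", 16000)  # flattened mono float array at 16 kHz
model = load_embedding("contentvec")      # downloads weights/config on first use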
programs/applio_code/rvc/models/embedders/contentvec/config.json
ADDED
@@ -0,0 +1,71 @@
+{
+  "activation_dropout": 0.1,
+  "apply_spec_augment": true,
+  "architectures": [
+    "HubertModelWithFinalProj"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "conv_bias": false,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "do_stable_layer_norm": false,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_norm": "group",
+  "feat_proj_dropout": 0.0,
+  "feat_proj_layer_norm": true,
+  "final_dropout": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "model_type": "hubert",
+  "num_attention_heads": 12,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.3",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32
+}
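Note that hidden_size (768) and classifier_proj_size (256) are exactly the dimensions used for the final_proj layer that HubertModelWithFinalProj adds in utils.py above. A minimal consistency check, assuming the embedder directory has been populated:

from transformers import HubertConfig

cfg = HubertConfig.from_pretrained(
    "programs/applio_code/rvc/models/embedders/contentvec"
)
assert (cfg.hidden_size, cfg.classifier_proj_size) == (768, 256)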
programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
+size 378342945
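This and the two predictor checkpoints below are Git LFS pointer files: the repository tracks only the SHA-256 oid and the size in bytes, and the actual binary weights are fetched from LFS storage at checkout time.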
programs/applio_code/rvc/models/predictors/fcpe.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3a8dd2dbd51baf19ed295006f2ac25dba6dd60adc7ec578ae5fbd94970951da
+size 69005189
programs/applio_code/rvc/models/predictors/rmvpe.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d62215f4306e3ca278246188607209f09af3dc77ed4232efdd069798c4ec193
+size 181184272
programs/music_separation_code/ensemble.py
ADDED
@@ -0,0 +1,183 @@
+# coding: utf-8
+__author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/"
+
+import os
+import librosa
+import soundfile as sf
+import numpy as np
+import argparse
+
+
+def stft(wave, nfft, hl):
+    wave_left = np.asfortranarray(wave[0])
+    wave_right = np.asfortranarray(wave[1])
+    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
+    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
+    spec = np.asfortranarray([spec_left, spec_right])
+    return spec
+
+
+def istft(spec, hl, length):
+    spec_left = np.asfortranarray(spec[0])
+    spec_right = np.asfortranarray(spec[1])
+    wave_left = librosa.istft(spec_left, hop_length=hl, length=length)
+    wave_right = librosa.istft(spec_right, hop_length=hl, length=length)
+    wave = np.asfortranarray([wave_left, wave_right])
+    return wave
+
+
+def absmax(a, *, axis):
+    dims = list(a.shape)
+    dims.pop(axis)
+    indices = np.ogrid[tuple(slice(0, d) for d in dims)]
+    argmax = np.abs(a).argmax(axis=axis)
+    indices.insert((len(a.shape) + axis) % len(a.shape), argmax)
+    return a[tuple(indices)]
+
+
+def absmin(a, *, axis):
+    dims = list(a.shape)
+    dims.pop(axis)
+    indices = np.ogrid[tuple(slice(0, d) for d in dims)]
+    argmax = np.abs(a).argmin(axis=axis)
+    indices.insert((len(a.shape) + axis) % len(a.shape), argmax)
+    return a[tuple(indices)]
+
+
+def lambda_max(arr, axis=None, key=None, keepdims=False):
+    idxs = np.argmax(key(arr), axis)
+    if axis is not None:
+        idxs = np.expand_dims(idxs, axis)
+        result = np.take_along_axis(arr, idxs, axis)
+        if not keepdims:
+            result = np.squeeze(result, axis=axis)
+        return result
+    else:
+        return arr.flatten()[idxs]
+
+
+def lambda_min(arr, axis=None, key=None, keepdims=False):
+    idxs = np.argmin(key(arr), axis)
+    if axis is not None:
+        idxs = np.expand_dims(idxs, axis)
+        result = np.take_along_axis(arr, idxs, axis)
+        if not keepdims:
+            result = np.squeeze(result, axis=axis)
+        return result
+    else:
+        return arr.flatten()[idxs]
+
+
+def average_waveforms(pred_track, weights, algorithm):
+    """
+    :param pred_track: shape = (num, channels, length)
+    :param weights: shape = (num, )
+    :param algorithm: One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft
+    :return: averaged waveform in shape (channels, length)
+    """
+
+    pred_track = np.array(pred_track)
+    final_length = pred_track.shape[-1]
+
+    mod_track = []
+    for i in range(pred_track.shape[0]):
+        if algorithm == "avg_wave":
+            mod_track.append(pred_track[i] * weights[i])
+        elif algorithm in ["median_wave", "min_wave", "max_wave"]:
+            mod_track.append(pred_track[i])
+        elif algorithm in ["avg_fft", "min_fft", "max_fft", "median_fft"]:
+            spec = stft(pred_track[i], nfft=2048, hl=1024)
+            if algorithm in ["avg_fft"]:
+                mod_track.append(spec * weights[i])
+            else:
+                mod_track.append(spec)
+    pred_track = np.array(mod_track)
+
+    if algorithm in ["avg_wave"]:
+        pred_track = pred_track.sum(axis=0)
+        pred_track /= np.array(weights).sum()
+    elif algorithm in ["median_wave"]:
+        pred_track = np.median(pred_track, axis=0)
+    elif algorithm in ["min_wave"]:
+        pred_track = np.array(pred_track)
+        pred_track = lambda_min(pred_track, axis=0, key=np.abs)
+    elif algorithm in ["max_wave"]:
+        pred_track = np.array(pred_track)
+        pred_track = lambda_max(pred_track, axis=0, key=np.abs)
+    elif algorithm in ["avg_fft"]:
+        pred_track = pred_track.sum(axis=0)
+        pred_track /= np.array(weights).sum()
+        pred_track = istft(pred_track, 1024, final_length)
+    elif algorithm in ["min_fft"]:
+        pred_track = np.array(pred_track)
+        pred_track = lambda_min(pred_track, axis=0, key=np.abs)
+        pred_track = istft(pred_track, 1024, final_length)
+    elif algorithm in ["max_fft"]:
+        pred_track = np.array(pred_track)
+        pred_track = absmax(pred_track, axis=0)
+        pred_track = istft(pred_track, 1024, final_length)
+    elif algorithm in ["median_fft"]:
+        pred_track = np.array(pred_track)
+        pred_track = np.median(pred_track, axis=0)
+        pred_track = istft(pred_track, 1024, final_length)
+    return pred_track
+
+
+def ensemble_files(args):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--files",
+        type=str,
+        required=True,
+        nargs="+",
+        help="Path to all audio-files to ensemble",
+    )
+    parser.add_argument(
+        "--type",
+        type=str,
+        default="avg_wave",
+        help="One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft",
+    )
+    parser.add_argument(
+        "--weights",
+        type=float,
+        nargs="+",
+        help="Weights to create ensemble. Number of weights must be equal to number of files",
+    )
+    parser.add_argument(
+        "--output",
+        default="res.wav",
+        type=str,
+        help="Path to wav file where ensemble result will be stored",
+    )
+    if args is None:
+        args = parser.parse_args()
+    else:
+        args = parser.parse_args(args)
+
+    print("Ensemble type: {}".format(args.type))
+    print("Number of input files: {}".format(len(args.files)))
+    if args.weights is not None:
+        weights = args.weights
+    else:
+        weights = np.ones(len(args.files))
+    print("Weights: {}".format(weights))
+    print("Output file: {}".format(args.output))
+    data = []
+    for f in args.files:
+        if not os.path.isfile(f):
+            print("Error. Can't find file: {}. Check paths.".format(f))
+            exit()
+        print("Reading file: {}".format(f))
+        wav, sr = librosa.load(f, sr=None, mono=False)
+        # wav, sr = sf.read(f)
+        print("Waveform shape: {} sample rate: {}".format(wav.shape, sr))
+        data.append(wav)
+    data = np.array(data)
+    res = average_waveforms(data, weights, args.type)
+    print("Result shape: {}".format(res.shape))
+    sf.write(args.output, res.T, sr, "FLOAT")
+
+
+if __name__ == "__main__":
+    ensemble_files(None)
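Since ensemble_files() accepts an argument list, it can be exercised programmatically as well as from the CLI; a sketch with placeholder file names:

from programs.music_separation_code.ensemble import ensemble_files

ensemble_files([
    "--files", "vocals_model_a.wav", "vocals_model_b.wav",
    "--type", "avg_wave",
    "--weights", "2", "1",
    "--output", "vocals_ensembled.wav",
])

With avg_wave the output is the weighted mean sum_i(w_i * x_i) / sum_i(w_i) of the aligned input waveforms; the *_fft variants apply the same reductions to 2048-point STFT spectrograms and invert back to a waveform of the original length.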