diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..0ea47296b5ff8dd013e09b0c5cb336ab1af8f18f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/Matcha-TTS"] + path = third_party/Matcha-TTS + url = https://github.com/shivammehta25/Matcha-TTS diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem new file mode 100644 index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3 --- /dev/null +++ b/.gradio/certificate.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- +MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 +ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA +mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d +emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= +-----END CERTIFICATE----- diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ec8f5895574bc1e0c78b3ca95cefeeb484f10106 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 GLM-4-Voice Model Team @ Zhipu AI + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 86d107e1729a6bddadb5a5d8bdb874fc2a012ea4..1cbc3ec19cab182eb283787f7f98e2a65b6e66c0 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,111 @@ --- -title: GLM 4 Voice -emoji: 💻 -colorFrom: red -colorTo: purple +title: GLM-4-Voice +app_file: web_demo.py sdk: gradio -sdk_version: 5.5.0 -app_file: app.py -pinned: false +sdk_version: 5.3.0 --- +# GLM-4-Voice +Read this in [English](./README_en.md) -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够遵循用户的指令要求改变语音的情感、语调、语速、方言等属性。 + +## Model Architecture +![Model Architecture](./resources/architecture.jpeg) + +GLM-4-Voice 由三个部分组成: +* GLM-4-Voice-Tokenizer: 通过在 [Whisper](https://github.com/openai/whisper) 的 Encoder 部分增加 Vector Quantization 并在 ASR 数据上有监督训练,将连续的语音输入转化为离散的 token。每秒音频平均只需要用 12.5 个离散 token 表示。 +* GLM-4-Voice-Decoder: 基于 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 的 Flow Matching 模型结构训练的支持流式推理的语音解码器,将离散化的语音 token 转化为连续的语音输出。最少只需要 10 个语音 token 即可开始生成,降低端到端对话延迟。 +* GLM-4-Voice-9B: 在 [GLM-4-9B](https://github.com/THUDM/GLM-4) 的基础上进行语音模态的预训练和对齐,从而能够理解和生成离散化的语音 token。 + +预训练方面,为了攻克模型在语音模态下的智商和合成表现力两个难关,我们将 Speech2Speech 任务解耦合为“根据用户音频做出文本回复”和“根据文本回复和用户语音合成回复语音”两个任务,并设计两种预训练目标,分别基于文本预训练数据和无监督音频数据合成语音-文本交错数据以适配这两种任务形式。GLM-4-Voice-9B 在 GLM-4-9B 的基座模型基础之上,经过了数百万小时音频和数千亿 token 的音频文本交错数据预训练,拥有很强的音频理解和建模能力。 + +对齐方面,为了支持高质量的语音对话,我们设计了一套流式思考架构:根据用户语音,GLM-4-Voice 可以流式交替输出文本和语音两个模态的内容,其中语音模态以文本作为参照保证回复内容的高质量,并根据用户的语音指令要求做出相应的声音变化,在最大程度保留语言模型智商的情况下仍然具有端到端建模的能力,同时具备低延迟性,最低只需要输出 20 个 token 便可以合成语音。 + +更详细的技术报告将在之后公布。 + +## Model List + +| Model | Type | Download | +|:---------------------:| :---: |:------------------------------------------------------------------------------------------------------------------------------------------------:| +| GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-tokenizer) | +| GLM-4-Voice-9B | Chat Model | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-9b) +| GLM-4-Voice-Decoder | Speech Decoder | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-decoder) + +## Usage +我们提供了可以直接启动的 Web Demo。用户可以输入语音或文本,模型会同时给出语音和文字回复。 + +![](resources/web_demo.png) + +### Preparation +首先下载仓库 +```shell +git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice +cd GLM-4-Voice +``` +然后安装依赖。 +```shell +pip install -r requirements.txt +``` +由于 Decoder 模型不支持通过 `transformers` 初始化,因此 checkpoint 需要单独下载。 + +```shell +# git 模型下载,请确保已安装 git-lfs +git clone https://huggingface.co/THUDM/glm-4-voice-decoder +``` + +### Launch Web Demo +首先启动模型服务 +```shell +python model_server.py --model-path glm-4-voice-9b +``` +此命令会自动下载 `glm-4-voice-9b`。如果网络条件不好,也手动下载之后通过 `--model-path` 指定本地的路径。 + +然后启动 web 服务 +```shell +python web_demo.py +``` +即可在 http://127.0.0.1:8888 访问 web demo。此命令会自动下载 `glm-4-voice-tokenizer` 和 `glm-4-voice-9b`。如果网络条件不好,也可以手动下载之后通过 `--tokenizer-path` 和 `--model-path` 指定本地的路径。 + +### Known Issues +* Gradio 的流式音频播放效果不稳定。在生成完成后点击对话框中的音频质量会更高。 + +## Cases +我们提供了 GLM-4-Voice 的部分对话案例,包括控制情绪、改变语速、生成方言等。 + +* 用轻柔的声音引导我放松 + +https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0 + +* 用激动的声音解说足球比赛 + +https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b + +* 用哀怨的声音讲一个鬼故事 + +https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a + +* 用东北话介绍一下冬天有多冷 + +https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037 + +* 用重庆话念“吃葡萄不吐葡萄皮” + +https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b + +* 用北京话念一句绕口令 + +https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651 + + * 加快语速 + +https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f + + * 再快一点 + +https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d + +## Acknowledge +本项目的部分代码来自: +* [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) +* [transformers](https://github.com/huggingface/transformers) +* [GLM-4](https://github.com/THUDM/GLM-4) diff --git a/README_en.md b/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..073edd7b0fb31a8594ca62f95b7b3a40565e2cfb --- /dev/null +++ b/README_en.md @@ -0,0 +1,97 @@ +# GLM-4-Voice +GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions. + +## Model Architecture + +![Model Architecture](./resources/architecture.jpeg) +We provide the three components of GLM-4-Voice: +* GLM-4-Voice-Tokenizer: Trained by adding vector quantization to the encoder part of [Whisper](https://github.com/openai/whisper), converting continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens. +* GLM-4-Voice-9B: Pre-trained and aligned on speech modality based on [GLM-4-9B](https://github.com/THUDM/GLM-4), enabling understanding and generation of discretized speech. +* GLM-4-Voice-Decoder: A speech decoder supporting streaming inference, retrained based on [CosyVoice](https://github.com/FunAudioLLM/CosyVoice), converting discrete speech tokens into continuous speech output. Generation can start with as few as 10 audio tokens, reducing conversation latency. + +A more detailed technical report will be published later. + +## Model List +| Model | Type | Download | +|:---------------------:| :---: |:------------------:| +| GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) | +| GLM-4-Voice-9B | Chat Model | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b) +| GLM-4-Voice-Decoder | Speech Decoder | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder) + +## Usage +We provide a Web Demo that can be launched directly. Users can input speech or text, and the model will respond with both speech and text. + +![](resources/web_demo.png) + +### Preparation +First, download the repository +```shell +git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice +cd GLM-4-Voice +``` +Then, install the dependencies. +```shell +pip install -r requirements.txt +``` +Since the Decoder model does not support initialization via `transformers`, the checkpoint needs to be downloaded separately. + +```shell +# Git model download, please ensure git-lfs is installed +git clone https://huggingface.co/THUDM/glm-4-voice-decoder +``` + +### Launch Web Demo +First, start the model service +```shell +python model_server.py --model-path glm-4-voice-9b +``` + +Then, start the web service +```shell +python web_demo.py +``` +You can then access the web demo at http://127.0.0.1:8888. + +### Known Issues +* Gradio’s streaming audio playback can be unstable. The audio quality will be higher when clicking on the audio in the dialogue box after generation is complete. + +## Examples +We provide some dialogue cases for GLM-4-Voice, including emotion control, speech rate alteration, dialect generation, etc. (The examples are in Chinese.) + +* Use a gentle voice to guide me to relax + +https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0 + +* Use an excited voice to commentate a football match + +https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b + +* Tell a ghost story with a mournful voice + +https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a + +* Introduce how cold winter is with a Northeastern dialect + +https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037 + +* Say "Eat grapes without spitting out the skins" in Chongqing dialect + +https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b + +* Recite a tongue twister with a Beijing accent + +https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651 + + * Increase the speech rate + +https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f + + * Even faster + +https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d + +## Acknowledge +Some code in this project is from: +* [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) +* [transformers](https://github.com/huggingface/transformers) +* [GLM-4](https://github.com/THUDM/GLM-4) diff --git a/__pycache__/flow_inference.cpython-310.pyc b/__pycache__/flow_inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c82c4c67329c3d930bf18ba7aa83a4982ba12cd0 Binary files /dev/null and b/__pycache__/flow_inference.cpython-310.pyc differ diff --git a/cosyvoice/__init__.py b/cosyvoice/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosyvoice/__pycache__/__init__.cpython-310.pyc b/cosyvoice/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2ee5506faf9509b806ae0300ffc30bef1636b52 Binary files /dev/null and b/cosyvoice/__pycache__/__init__.cpython-310.pyc differ diff --git a/cosyvoice/bin/inference.py b/cosyvoice/bin/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..6b777fa1cba925f9786db60b7efa15dcd189adeb --- /dev/null +++ b/cosyvoice/bin/inference.py @@ -0,0 +1,114 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import logging +logging.getLogger('matplotlib').setLevel(logging.WARNING) +import os + +import torch +from torch.utils.data import DataLoader +import torchaudio +from hyperpyyaml import load_hyperpyyaml +from tqdm import tqdm +from cosyvoice.cli.model import CosyVoiceModel + +from cosyvoice.dataset.dataset import Dataset + +def get_args(): + parser = argparse.ArgumentParser(description='inference with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--prompt_data', required=True, help='prompt data file') + parser.add_argument('--prompt_utt2data', required=True, help='prompt data file') + parser.add_argument('--tts_text', required=True, help='tts input file') + parser.add_argument('--llm_model', required=True, help='llm model file') + parser.add_argument('--flow_model', required=True, help='flow model file') + parser.add_argument('--hifigan_model', required=True, help='hifigan model file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--mode', + default='sft', + choices=['sft', 'zero_shot'], + help='inference mode') + parser.add_argument('--result_dir', required=True, help='asr result file') + args = parser.parse_args() + print(args) + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + # Init cosyvoice models from configs + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + with open(args.config, 'r') as f: + configs = load_hyperpyyaml(f) + + model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) + model.load(args.llm_model, args.flow_model, args.hifigan_model) + + test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data) + test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) + + del configs + os.makedirs(args.result_dir, exist_ok=True) + fn = os.path.join(args.result_dir, 'wav.scp') + f = open(fn, 'w') + with torch.no_grad(): + for batch_idx, batch in tqdm(enumerate(test_data_loader)): + utts = batch["utts"] + assert len(utts) == 1, "inference mode only support batchsize 1" + text = batch["text"] + text_token = batch["text_token"].to(device) + text_token_len = batch["text_token_len"].to(device) + tts_text = batch["tts_text"] + tts_index = batch["tts_index"] + tts_text_token = batch["tts_text_token"].to(device) + tts_text_token_len = batch["tts_text_token_len"].to(device) + speech_token = batch["speech_token"].to(device) + speech_token_len = batch["speech_token_len"].to(device) + speech_feat = batch["speech_feat"].to(device) + speech_feat_len = batch["speech_feat_len"].to(device) + utt_embedding = batch["utt_embedding"].to(device) + spk_embedding = batch["spk_embedding"].to(device) + if args.mode == 'sft': + model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, + 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding} + else: + model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, + 'prompt_text': text_token, 'prompt_text_len': text_token_len, + 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, + 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, + 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, + 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding} + model_output = model.inference(**model_input) + tts_key = '{}_{}'.format(utts[0], tts_index[0]) + tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key)) + torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050) + f.write('{} {}\n'.format(tts_key, tts_fn)) + f.flush() + f.close() + logging.info('Result wav.scp saved in {}'.format(fn)) + + +if __name__ == '__main__': + main() diff --git a/cosyvoice/bin/train.py b/cosyvoice/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4c9fee8415823f26ae7d11a3b81ee24b6f31ea --- /dev/null +++ b/cosyvoice/bin/train.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import datetime +import logging +logging.getLogger('matplotlib').setLevel(logging.WARNING) +from copy import deepcopy +import torch +import torch.distributed as dist +# import deepspeed +import pdb +from hyperpyyaml import load_hyperpyyaml + +from torch.distributed.elastic.multiprocessing.errors import record + +from cosyvoice.utils.executor import Executor +from cosyvoice.utils.train_utils import ( + init_distributed, + init_dataset_and_dataloader, + init_optimizer_and_scheduler, + init_summarywriter, save_model, + wrap_cuda_model, check_modify_and_save_config) + + +def get_args(): + parser = argparse.ArgumentParser(description='training your network') + parser.add_argument('--train_engine', + default='torch_ddp', + choices=['torch_ddp', 'deepspeed'], + help='Engine for paralleled training') + parser.add_argument('--model', required=True, help='model which will be trained') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--train_data', required=True, help='train data file') + parser.add_argument('--cv_data', required=True, help='cv data file') + parser.add_argument('--checkpoint', help='checkpoint model') + parser.add_argument('--model_dir', required=True, help='save model dir') + parser.add_argument('--tensorboard_dir', + default='tensorboard', + help='tensorboard log dir') + parser.add_argument('--ddp.dist_backend', + dest='dist_backend', + default='nccl', + choices=['nccl', 'gloo'], + help='distributed backend') + parser.add_argument('--num_workers', + default=0, + type=int, + help='num of subprocess workers for reading') + parser.add_argument('--prefetch', + default=100, + type=int, + help='prefetch number') + parser.add_argument('--pin_memory', + action='store_true', + default=False, + help='Use pinned memory buffers used for reading') + parser.add_argument('--deepspeed.save_states', + dest='save_states', + default='model_only', + choices=['model_only', 'model+optimizer'], + help='save model/optimizer states') + parser.add_argument('--timeout', + default=30, + type=int, + help='timeout (in seconds) of cosyvoice_join.') + # parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + return args + + +@record +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model} + with open(args.config, 'r') as f: + configs = load_hyperpyyaml(f, overrides=override_dict) + configs['train_conf'].update(vars(args)) + + # Init env for ddp + init_distributed(args) + + # Get dataset & dataloader + train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ + init_dataset_and_dataloader(args, configs) + + # Do some sanity checks and save config to arsg.model_dir + configs = check_modify_and_save_config(args, configs) + + # Tensorboard summary + writer = init_summarywriter(args) + + # load checkpoint + model = configs[args.model] + if args.checkpoint is not None: + model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')) + + # Dispatch model from cpu to gpu + model = wrap_cuda_model(args, model) + + # Get optimizer & scheduler + model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model) + # pdb.set_trace() + # Save init checkpoints + info_dict = deepcopy(configs['train_conf']) + save_model(model, 'init', info_dict) + + # Get executor + executor = Executor() + + # Start training loop + for epoch in range(info_dict['max_epoch']): + executor.epoch = epoch + train_dataset.set_epoch(epoch) + dist.barrier() + # try: + # dist.barrier() + # except RuntimeError as e: + # logging.info('except RuntimeError as e: {}'.format(e)) + group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) + executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) + dist.destroy_process_group(group_join) + +if __name__ == '__main__': + main() diff --git a/cosyvoice/cli/__init__.py b/cosyvoice/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py new file mode 100644 index 0000000000000000000000000000000000000000..ea8c4482891a62df6cbac39faa88972c81f5412f --- /dev/null +++ b/cosyvoice/cli/cosyvoice.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +from hyperpyyaml import load_hyperpyyaml +from modelscope import snapshot_download +from cosyvoice.cli.frontend import CosyVoiceFrontEnd +from cosyvoice.cli.model import CosyVoiceModel + +class CosyVoice: + + def __init__(self, model_dir): + instruct = True if '-Instruct' in model_dir else False + self.model_dir = model_dir + if not os.path.exists(model_dir): + model_dir = snapshot_download(model_dir) + with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f: + configs = load_hyperpyyaml(f) + self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'], + configs['feat_extractor'], + '{}/campplus.onnx'.format(model_dir), + '{}/speech_tokenizer_v1.onnx'.format(model_dir), + '{}/spk2info.pt'.format(model_dir), + instruct, + configs['allowed_special']) + self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift']) + self.model.load('{}/llm.pt'.format(model_dir), + '{}/flow.pt'.format(model_dir), + '{}/hift.pt'.format(model_dir)) + del configs + + def list_avaliable_spks(self): + spks = list(self.frontend.spk2info.keys()) + return spks + + def inference_sft(self, tts_text, spk_id): + tts_speeches = [] + for i in self.frontend.text_normalize(tts_text, split=True): + model_input = self.frontend.frontend_sft(i, spk_id) + model_output = self.model.inference(**model_input) + tts_speeches.append(model_output['tts_speech']) + return {'tts_speech': torch.concat(tts_speeches, dim=1)} + + def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): + prompt_text = self.frontend.text_normalize(prompt_text, split=False) + tts_speeches = [] + for i in self.frontend.text_normalize(tts_text, split=True): + model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k) + model_output = self.model.inference(**model_input) + tts_speeches.append(model_output['tts_speech']) + return {'tts_speech': torch.concat(tts_speeches, dim=1)} + + def inference_cross_lingual(self, tts_text, prompt_speech_16k): + if self.frontend.instruct is True: + raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir)) + tts_speeches = [] + for i in self.frontend.text_normalize(tts_text, split=True): + model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k) + model_output = self.model.inference(**model_input) + tts_speeches.append(model_output['tts_speech']) + return {'tts_speech': torch.concat(tts_speeches, dim=1)} + + def inference_instruct(self, tts_text, spk_id, instruct_text): + if self.frontend.instruct is False: + raise ValueError('{} do not support instruct inference'.format(self.model_dir)) + instruct_text = self.frontend.text_normalize(instruct_text, split=False) + tts_speeches = [] + for i in self.frontend.text_normalize(tts_text, split=True): + model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text) + model_output = self.model.inference(**model_input) + tts_speeches.append(model_output['tts_speech']) + return {'tts_speech': torch.concat(tts_speeches, dim=1)} diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed85500cd3ab65f8f4f7540c084adb0c648186f --- /dev/null +++ b/cosyvoice/cli/frontend.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import partial +import onnxruntime +import torch +import numpy as np +import whisper +from typing import Callable +import torchaudio.compliance.kaldi as kaldi +import torchaudio +import os +import re +import inflect +try: + import ttsfrd + use_ttsfrd = True +except ImportError: + print("failed to import ttsfrd, use WeTextProcessing instead") + from tn.chinese.normalizer import Normalizer as ZhNormalizer + from tn.english.normalizer import Normalizer as EnNormalizer + use_ttsfrd = False +from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph + + +class CosyVoiceFrontEnd: + + def __init__(self, + get_tokenizer: Callable, + feat_extractor: Callable, + campplus_model: str, + speech_tokenizer_model: str, + spk2info: str = '', + instruct: bool = False, + allowed_special: str = 'all'): + self.tokenizer = get_tokenizer() + self.feat_extractor = feat_extractor + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + option = onnxruntime.SessionOptions() + option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + option.intra_op_num_threads = 1 + self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"]) + self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=["CUDAExecutionProvider"if torch.cuda.is_available() else "CPUExecutionProvider"]) + if os.path.exists(spk2info): + self.spk2info = torch.load(spk2info, map_location=self.device) + self.instruct = instruct + self.allowed_special = allowed_special + self.inflect_parser = inflect.engine() + self.use_ttsfrd = use_ttsfrd + if self.use_ttsfrd: + self.frd = ttsfrd.TtsFrontendEngine() + ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, 'failed to initialize ttsfrd resource' + self.frd.set_lang_type('pinyin') + self.frd.enable_pinyin_mix(True) + self.frd.set_breakmodel_index(1) + else: + self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False) + self.en_tn_model = EnNormalizer() + + def _extract_text_token(self, text): + text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special) + text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device) + text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device) + return text_token, text_token_len + + def _extract_speech_token(self, speech): + feat = whisper.log_mel_spectrogram(speech, n_mels=128) + speech_token = self.speech_tokenizer_session.run(None, {self.speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(), + self.speech_tokenizer_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist() + speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device) + speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device) + return speech_token, speech_token_len + + def _extract_spk_embedding(self, speech): + feat = kaldi.fbank(speech, + num_mel_bins=80, + dither=0, + sample_frequency=16000) + feat = feat - feat.mean(dim=0, keepdim=True) + embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist() + embedding = torch.tensor([embedding]).to(self.device) + return embedding + + def _extract_speech_feat(self, speech): + speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device) + speech_feat = speech_feat.unsqueeze(dim=0) + speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device) + return speech_feat, speech_feat_len + + def text_normalize(self, text, split=True): + text = text.strip() + if contains_chinese(text): + if self.use_ttsfrd: + text = self.frd.get_frd_extra_info(text, 'input') + else: + text = self.zh_tn_model.normalize(text) + text = text.replace("\n", "") + text = replace_blank(text) + text = replace_corner_mark(text) + text = text.replace(".", "、") + text = text.replace(" - ", ",") + text = remove_bracket(text) + text = re.sub(r'[,,]+$', '。', text) + texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80, + token_min_n=60, merge_len=20, + comma_split=False)] + else: + if self.use_ttsfrd: + text = self.frd.get_frd_extra_info(text, 'input') + else: + text = self.en_tn_model.normalize(text) + text = spell_out_number(text, self.inflect_parser) + texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80, + token_min_n=60, merge_len=20, + comma_split=False)] + if split is False: + return text + return texts + + def frontend_sft(self, tts_text, spk_id): + tts_text_token, tts_text_token_len = self._extract_text_token(tts_text) + embedding = self.spk2info[spk_id]['embedding'] + model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding} + return model_input + + def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k): + tts_text_token, tts_text_token_len = self._extract_text_token(tts_text) + prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text) + prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k) + speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050) + speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k) + embedding = self._extract_spk_embedding(prompt_speech_16k) + model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, + 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len, + 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len, + 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len, + 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len, + 'llm_embedding': embedding, 'flow_embedding': embedding} + return model_input + + def frontend_cross_lingual(self, tts_text, prompt_speech_16k): + model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k) + # in cross lingual mode, we remove prompt in llm + del model_input['prompt_text'] + del model_input['prompt_text_len'] + del model_input['llm_prompt_speech_token'] + del model_input['llm_prompt_speech_token_len'] + return model_input + + def frontend_instruct(self, tts_text, spk_id, instruct_text): + model_input = self.frontend_sft(tts_text, spk_id) + # in instruct mode, we remove spk_embedding in llm due to information leakage + del model_input['llm_embedding'] + instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '') + model_input['prompt_text'] = instruct_text_token + model_input['prompt_text_len'] = instruct_text_token_len + return model_input diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py new file mode 100644 index 0000000000000000000000000000000000000000..446a84e079dcef74a018ef7fe6b2038709b97b0f --- /dev/null +++ b/cosyvoice/cli/model.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +class CosyVoiceModel: + + def __init__(self, + llm: torch.nn.Module, + flow: torch.nn.Module, + hift: torch.nn.Module): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.llm = llm + self.flow = flow + self.hift = hift + + def load(self, llm_model, flow_model, hift_model): + self.llm.load_state_dict(torch.load(llm_model, map_location=self.device)) + self.llm.to(self.device).eval() + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) + self.flow.to(self.device).eval() + self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) + self.hift.to(self.device).eval() + + def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), + prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), + llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): + tts_speech_token = self.llm.inference(text=text.to(self.device), + text_len=text_len.to(self.device), + prompt_text=prompt_text.to(self.device), + prompt_text_len=prompt_text_len.to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), + embedding=llm_embedding.to(self.device), + beam_size=1, + sampling=25, + max_token_text_ratio=30, + min_token_text_ratio=3) + tts_mel = self.flow.inference(token=tts_speech_token, + token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), + prompt_token=flow_prompt_speech_token.to(self.device), + prompt_token_len=flow_prompt_speech_token_len.to(self.device), + prompt_feat=prompt_speech_feat.to(self.device), + prompt_feat_len=prompt_speech_feat_len.to(self.device), + embedding=flow_embedding.to(self.device)) + tts_speech = self.hift.inference(mel=tts_mel).cpu() + torch.cuda.empty_cache() + return {'tts_speech': tts_speech} + + def text_to_token(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192), + prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), + llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): + tts_speech_token = self.llm.inference(text=text.to(self.device), + text_len=text_len.to(self.device), + prompt_text=prompt_text.to(self.device), + prompt_text_len=prompt_text_len.to(self.device), + prompt_speech_token=llm_prompt_speech_token.to(self.device), + prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device), + embedding=llm_embedding.to(self.device), + beam_size=1, + sampling=25, + max_token_text_ratio=30, + min_token_text_ratio=3) + return tts_speech_token + + def token_to_speech(self, tts_speech_token, flow_embedding, llm_embedding=torch.zeros(0, 192), + prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32), + llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)): + + tts_mel = self.flow.inference(token=tts_speech_token, + token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), + prompt_token=flow_prompt_speech_token.to(self.device), + prompt_token_len=flow_prompt_speech_token_len.to(self.device), + prompt_feat=prompt_speech_feat.to(self.device), + prompt_feat_len=prompt_speech_feat_len.to(self.device), + embedding=flow_embedding.to(self.device)) + tts_speech = self.hift.inference(mel=tts_mel).cpu() + torch.cuda.empty_cache() + return {'tts_speech': tts_speech} \ No newline at end of file diff --git a/cosyvoice/dataset/__init__.py b/cosyvoice/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosyvoice/dataset/dataset.py b/cosyvoice/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6681504383f73ac9ba1609b9d48bdec7aae23f28 --- /dev/null +++ b/cosyvoice/dataset/dataset.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import json +import math +from functools import partial + +import torch +import torch.distributed as dist +from torch.utils.data import IterableDataset +from cosyvoice.utils.file_utils import read_lists, read_json_lists + + +class Processor(IterableDataset): + + def __init__(self, source, f, *args, **kw): + assert callable(f) + self.source = source + self.f = f + self.args = args + self.kw = kw + + def set_epoch(self, epoch): + self.source.set_epoch(epoch) + + def __iter__(self): + """ Return an iterator over the source dataset processed by the + given processor. + """ + assert self.source is not None + assert callable(self.f) + return self.f(iter(self.source), *self.args, **self.kw) + + def apply(self, f): + assert callable(f) + return Processor(self, f, *self.args, **self.kw) + + +class DistributedSampler: + + def __init__(self, shuffle=True, partition=True): + self.epoch = -1 + self.update() + self.shuffle = shuffle + self.partition = partition + + def update(self): + assert dist.is_available() + if dist.is_initialized(): + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.rank = 0 + self.world_size = 1 + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: + self.worker_id = 0 + self.num_workers = 1 + else: + self.worker_id = worker_info.id + self.num_workers = worker_info.num_workers + return dict(rank=self.rank, + world_size=self.world_size, + worker_id=self.worker_id, + num_workers=self.num_workers) + + def set_epoch(self, epoch): + self.epoch = epoch + + def sample(self, data): + """ Sample data according to rank/world_size/num_workers + + Args: + data(List): input data list + + Returns: + List: data list after sample + """ + data = list(range(len(data))) + # force datalist even + if self.partition: + if self.shuffle: + random.Random(self.epoch).shuffle(data) + if len(data) < self.world_size: + data = data * math.ceil(self.world_size / len(data)) + data = data[:self.world_size] + data = data[self.rank::self.world_size] + if len(data) < self.num_workers: + data = data * math.ceil(self.num_workers / len(data)) + data = data[:self.num_workers] + data = data[self.worker_id::self.num_workers] + return data + + +class DataList(IterableDataset): + + def __init__(self, lists, shuffle=True, partition=True): + self.lists = lists + self.sampler = DistributedSampler(shuffle, partition) + + def set_epoch(self, epoch): + self.sampler.set_epoch(epoch) + + def __iter__(self): + sampler_info = self.sampler.update() + indexes = self.sampler.sample(self.lists) + for index in indexes: + data = dict(src=self.lists[index]) + data.update(sampler_info) + yield data + + +def Dataset(data_list_file, + data_pipeline, + mode='train', + shuffle=True, + partition=True, + tts_file='', + prompt_utt2data=''): + """ Construct dataset from arguments + + We have two shuffle stage in the Dataset. The first is global + shuffle at shards tar/raw file level. The second is global shuffle + at training samples level. + + Args: + data_type(str): raw/shard + tokenizer (BaseTokenizer): tokenizer to tokenize + partition(bool): whether to do data partition in terms of rank + """ + assert mode in ['train', 'inference'] + lists = read_lists(data_list_file) + # import pdb + # pdb.set_trace() + if mode == 'inference': + with open(tts_file) as f: + tts_data = json.load(f) + utt2lists = read_json_lists(prompt_utt2data) + # filter unnecessary file in inference mode + lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists])) + dataset = DataList(lists,shuffle=shuffle,partition=partition) + if mode == 'inference': + # map partial arg tts_data in inference mode + data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data) + for func in data_pipeline: + dataset = Processor(dataset, func, mode=mode) + return dataset diff --git a/cosyvoice/dataset/processor.py b/cosyvoice/dataset/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8c743fdbe03139a9703ba635e31ab74c459c67 --- /dev/null +++ b/cosyvoice/dataset/processor.py @@ -0,0 +1,965 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import random +import json +import tarfile +import json +import io +import pyarrow.parquet as pq +from io import BytesIO +import torch +import torchaudio +from torch.nn.utils.rnn import pad_sequence +import torch.nn.functional as F +import tarfile +import json +import io +import wave +import numpy as np +import torchaudio +import os +import sys +import json +import random +import pickle +import argparse +import itertools +import mmap +import struct +import collections + + + +import shutil +import multiprocessing as mp +from pathlib import Path + +from tqdm import tqdm +from collections import defaultdict +from copy import deepcopy +from datetime import datetime +import pickle + +from wids import wids +import math + +torchaudio.set_audio_backend('soundfile') + +AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) + +try: + MAIN_SPK_EMBEDDING=torch.load("/workspace/audio_checkpoints/flow_model/spk_embedding/0909/mean_embedding.pt") + GPT_SPK_EMBEDDING=torch.load("/workspace/audio_checkpoints/flow_model/spk_embedding/0909/spk_mean_embeddings.pt") +except: + MAIN_SPK_EMBEDDING=torch.zeros(1,192) + GPT_SPK_EMBEDDING=torch.zeros(1,192) + +def parquet_opener(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + url = sample['src'] + try: + df = pq.read_table(url).to_pandas() + for i in range(len(df)): + if mode == 'inference' and df.loc[i, 'utt'] not in tts_data: + continue + sample.update(dict(df.loc[i])) + if mode == 'train': + # NOTE do not return sample directly, must initialize a new dict + yield {**sample} + else: + for index, text in enumerate(tts_data[df.loc[i, 'utt']]): + yield {**sample, 'tts_index': index, 'tts_text': text} + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(url, ex)) + + + + +def parse_tar_header(header_bytes): + header = struct.unpack("!100s8s8s8s12s12s8s1s100s6s2s32s32s8s8s155s", header_bytes) + return TarHeader(*header) + +TarHeader = collections.namedtuple( + "TarHeader", + [ + "name", + "mode", + "uid", + "gid", + "size", + "mtime", + "chksum", + "typeflag", + "linkname", + "magic", + "version", + "uname", + "gname", + "devmajor", + "devminor", + "prefix", + ], +) + +class MMTar: + def __init__(self, file_path: Path | str): + self.stream = open(file_path, "rb") + self.mmap = mmap.mmap(self.stream.fileno(), 0, access=mmap.ACCESS_READ) + + def __del__(self): + try: + self.mmap.close() + self.stream.close() + except: # noqa + pass + + def get_at_offset(self, offset) -> tuple[str, bytes]: + header = parse_tar_header(self.mmap[offset : offset + 500]) + name = header.name.decode("utf-8").strip("\x00") + start = offset + 512 + end = start + int(header.size.decode("utf-8")[:-1], 8) + return name, self.mmap[start:end] + + +class Tar: + def __init__(self, path: Path): + self.tar = MMTar(path) + indices_path = path.with_suffix(".index") + self.index = pickle.loads(indices_path.read_bytes()) + self.name_mapping = {} + for name, offset, _ in self.index: + self.name_mapping[name] = offset + + def read(self, name: str) -> bytes: + return self.tar.get_at_offset(self.name_mapping[name])[1] + +def cosy_jsonl_opener(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + cosy_jsonl_path = sample['src'] + tar_file_path=cosy_jsonl_path.replace(".vq0907.jsonl",".tar") + try: + tar_data=Tar(Path(tar_file_path)) + with open(cosy_jsonl_path, 'r') as f: + for line in f: + item=json.loads(line) + cosy_token = item['cosy_token'] + sample['speech_token']=torch.tensor(cosy_token) + sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename']))) + # print(item['filename']) + yield {**sample} + + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex)) + + +def cosy_jsonl_opener_vq0918_nopool(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + cosy_jsonl_path = sample['src'] + tar_file_path=cosy_jsonl_path.replace(".vq0918-nopool.jsonl",".tar") + + + try: + tar_data=Tar(Path(tar_file_path)) + with open(cosy_jsonl_path, 'r') as f: + # cosy_data = [json.loads(line) for line in f] + for line in f: + item=json.loads(line) + cosy_token = item['cosy_token'] + sample['speech_token']=torch.tensor(cosy_token) + sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename']))) + # print(item['filename']) + yield {**sample} + + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex)) + + + +def cosy_jsonl_opener_vq0918_pool2(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + cosy_jsonl_path = sample['src'] + tar_file_path=cosy_jsonl_path.replace(".vq0918-pool2.jsonl",".tar") + + try: + tar_data=Tar(Path(tar_file_path)) + with open(cosy_jsonl_path, 'r') as f: + for line in f: + item=json.loads(line) + cosy_token = item['cosy_token'] + sample['speech_token']=torch.tensor(cosy_token) + sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename']))) + + yield {**sample} + + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex)) + + +def cosy_jsonl_opener_vq0918_pool4(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + cosy_jsonl_path = sample['src'] + tar_file_path=cosy_jsonl_path.replace(".vq0918-pool4.jsonl",".tar") + try: + tar_data=Tar(Path(tar_file_path)) + with open(cosy_jsonl_path, 'r') as f: + # cosy_data = [json.loads(line) for line in f] + for line in f: + item=json.loads(line) + cosy_token = item['cosy_token'] + sample['speech_token']=torch.tensor(cosy_token) + sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename']))) + # print(item['filename']) + yield {**sample} + + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex)) + + +def cosy_jsonl_opener_vq0918_pool8(data, mode='train', tts_data={}): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + cosy_jsonl_path = sample['src'] + tar_file_path=cosy_jsonl_path.replace(".vq0918-pool8.jsonl",".tar") + + try: + tar_data=Tar(Path(tar_file_path)) + with open(cosy_jsonl_path, 'r') as f: + # cosy_data = [json.loads(line) for line in f] + for line in f: + item=json.loads(line) + cosy_token = item['cosy_token'] + sample['speech_token']=torch.tensor(cosy_token) + sample['speech'], sample['sample_rate']= torchaudio.load(io.BytesIO(tar_data.read(item['filename']))) + # print(item['filename']) + yield {**sample} + + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(cosy_jsonl_path, ex)) + + + +def process_sft_vq0918_pool4(data, mode='train', tts_data={}): + for sample in data: + assert 'src' in sample + + token_npy_path = sample['src'] + wav_path=token_npy_path.replace(".vq0918-pool4.npy","") + + # wav_path,token_npy_path=sample['src'].split(' ') + try: + sample['speech_token']=torch.tensor(np.load(token_npy_path)) + sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + if sample['speech'].shape[0] > 1: + sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + yield {**sample} + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + + +def process_sft_vq0918_pool4_split(data, mode='train',split_token=25, tts_data={}): + for sample in data: + assert 'src' in sample + + token_npy_path = sample['src'] + wav_path=token_npy_path.replace(".vq0918-pool4.npy","") + + # wav_path,token_npy_path=sample['src'].split(' ') + try: + # sample['speech_token']=torch.tensor(np.load(token_npy_path)) + # sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + # if sample['speech'].shape[0] > 1: + # sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + + + # sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + + + speech_token=torch.tensor(np.load(token_npy_path)) + speech,sample_rate= torchaudio.load(wav_path) + # split_speech=int(split_token / 12.5 * sample_rate) + if speech.shape[0] > 1: + speech = speech.mean(dim=0, keepdim=True) + + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + sample['sample_rate']=sample_rate + + num_splits = (speech_token.size(0) + split_token - 1) // split_token + + for split_id in range(num_splits): + end_token_idx = min((split_id + 1) * split_token, speech_token.size(0)) + end_speech_idx=int(np.ceil(end_token_idx / 12.5 * sample_rate)) + sample['speech_token']=speech_token[:end_token_idx] + sample['speech']=speech[:,:end_speech_idx] + print(sample['speech_token'].size(),sample['speech'].size()) + yield {**sample} + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + + +def process_sft_vq0918_pool2(data, mode='train', tts_data={}): + for sample in data: + assert 'src' in sample + + token_npy_path = sample['src'].replace(".vq0918-pool4.npy",".vq0918-pool2.npy") + wav_path=token_npy_path.replace(".vq0918-pool2.npy","") + + # wav_path,token_npy_path=sample['src'].split(' ') + try: + sample['speech_token']=torch.tensor(np.load(token_npy_path)) + sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + if sample['speech'].shape[0] > 1: + sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + yield {**sample} + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + + +def process_sft_vq0918_pool2_split(data, mode='train',split_token=50, tts_data={}): + for sample in data: + assert 'src' in sample + + token_npy_path = sample['src'] + wav_path=token_npy_path.replace(".vq0918-pool2.npy","") + + # wav_path,token_npy_path=sample['src'].split(' ') + try: + # sample['speech_token']=torch.tensor(np.load(token_npy_path)) + # sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + # if sample['speech'].shape[0] > 1: + # sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + + + # sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + + + speech_token=torch.tensor(np.load(token_npy_path)) + speech,sample_rate= torchaudio.load(wav_path) + # split_speech=int(split_token / 12.5 * sample_rate) + if speech.shape[0] > 1: + speech = speech.mean(dim=0, keepdim=True) + + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + sample['sample_rate']=sample_rate + + num_splits = (speech_token.size(0) + split_token - 1) // split_token + + for split_id in range(num_splits): + end_token_idx = min((split_id + 1) * split_token, speech_token.size(0)) + end_speech_idx=int(np.ceil(end_token_idx / 25 * sample_rate)) + sample['speech_token']=speech_token[:end_token_idx] + sample['speech']=speech[:,:end_speech_idx] + print(sample['speech_token'].size(),sample['speech'].size()) + yield {**sample} + except Exception as ex: + logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + +def process_sft_vq0918_pool4_gpt(data, mode='train', tts_data={}): + for sample in data: + assert 'src' in sample + try: + entry=json.loads(sample['src']) + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + + for conv in entry["conversations"]: + if "response_wav" in conv: + wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}" + token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy") + sample['speech_token']=torch.tensor(np.load(token_npy_path)) + sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + if sample['speech'].shape[0] > 1: + sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + sample['spk_embedding']=spk_embedding + yield {**sample} + except Exception as ex: + # logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + + +def process_sft_vq0918_pool4_gpt_1010(data, mode='train', tts_data={}): + for sample in data: + assert 'src' in sample + try: + entry=json.loads(sample['src']) + sample['spk_embedding']=torch.zeros_like(MAIN_SPK_EMBEDDING) + + for conv in entry["conversations"]: + if "response_wav" in conv: + wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}" + token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy") + sample['speech_token']=torch.tensor(np.load(token_npy_path)) + sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + if sample['speech'].shape[0] > 1: + sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + sample['spk_embedding']=spk_embedding + yield {**sample} + if "prompt_wav" in conv: + wav_path = f"/workspace/audio_data/sft/{conv['response_wav']}" + token_npy_path=wav_path.replace(".wav",".wav.vq0918-pool4.npy") + sample['speech_token']=torch.tensor(np.load(token_npy_path)) + sample['speech'], sample['sample_rate']= torchaudio.load(wav_path) + if sample['speech'].shape[0] > 1: + sample['speech'] = sample['speech'].mean(dim=0, keepdim=True) + sample['spk_embedding']=spk_embedding + yield {**sample} + except Exception as ex: + # logging.warning('Failed to open {}, ex info {}'.format(wav_path, ex)) + logging.warning('Failed to open {}'.format(wav_path)) + + +def filter(data, + max_length=10240, + min_length=10, + token_max_length=200, + token_min_length=1, + min_output_input_ratio=0.0005, + max_output_input_ratio=1, + mode='train'): + """ Filter sample according to feature and label length + Inplace operation. + + Args:: + data: Iterable[{key, wav, label, sample_rate}] + max_length: drop utterance which is greater than max_length(10ms) + min_length: drop utterance which is less than min_length(10ms) + token_max_length: drop utterance which is greater than + token_max_length, especially when use char unit for + english modeling + token_min_length: drop utterance which is + less than token_max_length + min_output_input_ratio: minimal ration of + token_length / feats_length(10ms) + max_output_input_ratio: maximum ration of + token_length / feats_length(10ms) + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + # sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data'])) + # del sample['audio_data'] + # sample['wav'] is torch.Tensor, we have 100 frames every second + num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100 + if num_frames < min_length: + continue + if num_frames > max_length: + continue + if len(sample['text_token']) < token_min_length: + continue + if len(sample['text_token']) > token_max_length: + continue + if len(sample['speech_token']) == 0: + continue + if num_frames != 0: + if len(sample['text_token']) / num_frames < min_output_input_ratio: + continue + if len(sample['text_token']) / num_frames > max_output_input_ratio: + continue + yield sample + + +def filter_speech_token(data, + max_length=10240, + min_length=10, + token_max_length=5000, + token_min_length=1, + min_output_input_ratio=0.0005, + max_output_input_ratio=30, + mode='train'): + """ Filter sample according to feature and label length + Inplace operation. + + Args:: + data: Iterable[{key, wav, label, sample_rate}] + max_length: drop utterance which is greater than max_length(10ms) + min_length: drop utterance which is less than min_length(10ms) + token_max_length: drop utterance which is greater than + token_max_length, especially when use char unit for + english modeling + token_min_length: drop utterance which is + less than token_max_length + min_output_input_ratio: minimal ration of + token_length / feats_length(10ms) + max_output_input_ratio: maximum ration of + token_length / feats_length(10ms) + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + # sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data'])) + # del sample['audio_data'] + # sample['wav'] is torch.Tensor, we have 100 frames every second + num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100 + if num_frames < min_length: + continue + if num_frames > max_length: + continue + if len(sample['speech_token']) < token_min_length: + continue + if len(sample['speech_token']) > token_max_length: + continue + if len(sample['speech_token']) == 0: + continue + if num_frames != 0: + if len(sample['speech_token']) / num_frames < min_output_input_ratio: + continue + if len(sample['speech_token']) / num_frames > max_output_input_ratio: + continue + yield sample + + +def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'): + """ Resample data. + Inplace operation. + + Args: + data: Iterable[{key, wav, label, sample_rate}] + resample_rate: target resample rate + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + assert 'sample_rate' in sample + assert 'speech' in sample + sample_rate = sample['sample_rate'] + waveform = sample['speech'] + if sample_rate != resample_rate: + if sample_rate < min_sample_rate: + continue + sample['sample_rate'] = resample_rate + sample['speech'] = torchaudio.transforms.Resample( + orig_freq=sample_rate, new_freq=resample_rate)(waveform) + max_val = sample['speech'].abs().max() + if max_val > 1: + sample['speech'] /= max_val + yield sample + + +def compute_fbank(data, + feat_extractor, + mode='train'): + """ Extract fbank + + Args: + data: Iterable[{key, wav, label, sample_rate}] + + Returns: + Iterable[{key, feat, label}] + """ + for sample in data: + assert 'sample_rate' in sample + assert 'speech' in sample + # assert 'utt' in sample + # assert 'text_token' in sample + waveform = sample['speech'] + mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1) + sample['speech_feat'] = mat + del sample['speech'] + yield sample + + +def parse_embedding(data, normalize, mode='train'): + """ Parse utt_embedding/spk_embedding + + Args: + data: Iterable[{key, wav, label, sample_rate}] + + Returns: + Iterable[{key, feat, label}] + """ + for sample in data: + sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32) + sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32) + if normalize: + sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0) + sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0) + yield sample + + +def tokenize(data, get_tokenizer, allowed_special, mode='train'): + """ Decode text to chars or BPE + Inplace operation + + Args: + data: Iterable[{key, wav, txt, sample_rate}] + + Returns: + Iterable[{key, wav, txt, tokens, label, sample_rate}] + """ + tokenizer = get_tokenizer() + for sample in data: + assert 'text' in sample + sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special) + if mode == 'inference': + sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special) + yield sample + + +def shuffle(data, shuffle_size=10000, mode='train'): + """ Local shuffle the data + + Args: + data: Iterable[{key, feat, label}] + shuffle_size: buffer size for shuffle + + Returns: + Iterable[{key, feat, label}] + """ + buf = [] + for sample in data: + buf.append(sample) + if len(buf) >= shuffle_size: + random.shuffle(buf) + for x in buf: + yield x + buf = [] + # The sample left over + random.shuffle(buf) + for x in buf: + yield x + + +def sort(data, sort_size=500, mode='train'): + """ Sort the data by feature length. + Sort is used after shuffle and before batch, so we can group + utts with similar lengths into a batch, and `sort_size` should + be less than `shuffle_size` + + Args: + data: Iterable[{key, feat, label}] + sort_size: buffer size for sort + + Returns: + Iterable[{key, feat, label}] + """ + + buf = [] + for sample in data: + buf.append(sample) + if len(buf) >= sort_size: + buf.sort(key=lambda x: x['speech_feat'].size(0)) + for x in buf: + yield x + buf = [] + # The sample left over + buf.sort(key=lambda x: x['speech_feat'].size(0)) + for x in buf: + yield x + + +def static_batch(data, batch_size=16): + """ Static batch the data by `batch_size` + + Args: + data: Iterable[{key, feat, label}] + batch_size: batch size + + Returns: + Iterable[List[{key, feat, label}]] + """ + buf = [] + for sample in data: + buf.append(sample) + if len(buf) >= batch_size: + yield buf + buf = [] + if len(buf) > 0: + yield buf + + +def dynamic_batch(data, max_frames_in_batch=12000, mode='train'): + """ Dynamic batch the data until the total frames in batch + reach `max_frames_in_batch` + + Args: + data: Iterable[{key, feat, label}] + max_frames_in_batch: max_frames in one batch + + Returns: + Iterable[List[{key, feat, label}]] + """ + buf = [] + longest_frames = 0 + for sample in data: + assert 'speech_feat' in sample + assert isinstance(sample['speech_feat'], torch.Tensor) + new_sample_frames = sample['speech_feat'].size(0) + longest_frames = max(longest_frames, new_sample_frames) + frames_after_padding = longest_frames * (len(buf) + 1) + if frames_after_padding > max_frames_in_batch: + yield buf + buf = [sample] + longest_frames = new_sample_frames + else: + buf.append(sample) + if len(buf) > 0: + yield buf + + +def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'): + """ Wrapper for static/dynamic batch + """ + if mode == 'inference': + return static_batch(data, 1) + else: + if batch_type == 'static': + return static_batch(data, batch_size) + elif batch_type == 'dynamic': + return dynamic_batch(data, max_frames_in_batch) + else: + logging.fatal('Unsupported batch type {}'.format(batch_type)) + + +def padding(data, use_spk_embedding, mode='train'): + """ Padding the data into training data + + Args: + data: Iterable[List[{key, feat, label}]] + + Returns: + Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] + """ + for sample in data: + assert isinstance(sample, list) + speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample], + dtype=torch.int32) + order = torch.argsort(speech_feat_len, descending=True) + + utts = [sample[i]['utt'] for i in order] + speech_token = [torch.tensor(sample[i]['speech_token']) for i in order] + speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32) + speech_token = pad_sequence(speech_token, + batch_first=True, + padding_value=0) + speech_feat = [sample[i]['speech_feat'] for i in order] + speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32) + speech_feat = pad_sequence(speech_feat, + batch_first=True, + padding_value=0) + text = [sample[i]['text'] for i in order] + text_token = [torch.tensor(sample[i]['text_token']) for i in order] + text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32) + text_token = pad_sequence(text_token, batch_first=True, padding_value=0) + utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0) + spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0) + batch = { + "utts": utts, + "speech_token": speech_token, + "speech_token_len": speech_token_len, + "speech_feat": speech_feat, + "speech_feat_len": speech_feat_len, + "text": text, + "text_token": text_token, + "text_token_len": text_token_len, + "utt_embedding": utt_embedding, + "spk_embedding": spk_embedding, + } + if mode == 'inference': + tts_text = [sample[i]['tts_text'] for i in order] + tts_index = [sample[i]['tts_index'] for i in order] + tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order] + tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32) + tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1) + batch.update({'tts_text': tts_text, + 'tts_index': tts_index, + 'tts_text_token': tts_text_token, + 'tts_text_token_len': tts_text_token_len}) + if use_spk_embedding is True: + batch["embedding"] = batch["spk_embedding"] + else: + batch["embedding"] = batch["utt_embedding"] + yield batch + + + +def padding_speech_token(data, use_spk_embedding, mode='train'): + """ Padding the data into training data + + Args: + data: Iterable[List[{key, feat, label}]] + + Returns: + Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] + """ + for sample in data: + assert isinstance(sample, list) + speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample], + dtype=torch.int32) + order = torch.argsort(speech_feat_len, descending=True) + + # utts = [sample[i]['utt'] for i in order] + # speech_token = [torch.tensor(sample[i]['speech_token']) for i in order] + try: + speech_token = [sample[i]['speech_token'].clone().detach() for i in order] + speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32) + speech_token = pad_sequence(speech_token, + batch_first=True, + padding_value=0) + speech_feat = [sample[i]['speech_feat'] for i in order] + speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32) + speech_feat = pad_sequence(speech_feat, + batch_first=True, + padding_value=0) + batch = { + "speech_token": speech_token, + "speech_token_len": speech_token_len, + "speech_feat": speech_feat, + "speech_feat_len": speech_feat_len, + } + if mode == 'inference': + tts_text = [sample[i]['tts_text'] for i in order] + tts_index = [sample[i]['tts_index'] for i in order] + tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order] + tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32) + tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1) + batch.update({'tts_text': tts_text, + 'tts_index': tts_index, + 'tts_text_token': tts_text_token, + 'tts_text_token_len': tts_text_token_len}) + # if use_spk_embedding is True: + # batch["embedding"] = batch["spk_embedding"] + # else: + # batch["embedding"] = batch["utt_embedding"] + batch["embedding"]=torch.zeros((batch["speech_feat"].size(0),192),device=batch["speech_feat"].device) + yield batch + except Exception as ex: + logging.warning(' ex info {}'.format(ex)) + # assert False + + + +def padding_speech_token_spk(data, use_spk_embedding, mode='train'): + """ Padding the data into training data + + Args: + data: Iterable[List[{key, feat, label}]] + + Returns: + Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] + """ + for sample in data: + assert isinstance(sample, list) + speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample], + dtype=torch.int32) + order = torch.argsort(speech_feat_len, descending=True) + + # utts = [sample[i]['utt'] for i in order] + # speech_token = [torch.tensor(sample[i]['speech_token']) for i in order] + try: + speech_token = [sample[i]['speech_token'].clone().detach() for i in order] + speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32) + speech_token = pad_sequence(speech_token, + batch_first=True, + padding_value=0) + speech_feat = [sample[i]['speech_feat'] for i in order] + speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32) + speech_feat = pad_sequence(speech_feat, + batch_first=True, + padding_value=0) + spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0) + batch = { + "speech_token": speech_token, + "speech_token_len": speech_token_len, + "speech_feat": speech_feat, + "speech_feat_len": speech_feat_len, + "spk_embedding": spk_embedding, + } + if mode == 'inference': + tts_text = [sample[i]['tts_text'] for i in order] + tts_index = [sample[i]['tts_index'] for i in order] + tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order] + tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32) + tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1) + batch.update({'tts_text': tts_text, + 'tts_index': tts_index, + 'tts_text_token': tts_text_token, + 'tts_text_token_len': tts_text_token_len}) + # if use_spk_embedding is True: + # batch["embedding"] = batch["spk_embedding"] + # else: + # batch["embedding"] = batch["utt_embedding"] + # batch["embedding"]=torch.zeros((batch["speech_feat"].size(0),192),device=batch["speech_feat"].device) + batch["embedding"] = batch["spk_embedding"] + yield batch + except Exception as ex: + logging.warning(' ex info {}'.format(ex)) + # assert False \ No newline at end of file diff --git a/cosyvoice/flow/__pycache__/decoder.cpython-310.pyc b/cosyvoice/flow/__pycache__/decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2430081c4b6f4bc780096ee9256fe2e207bb4267 Binary files /dev/null and b/cosyvoice/flow/__pycache__/decoder.cpython-310.pyc differ diff --git a/cosyvoice/flow/__pycache__/flow.cpython-310.pyc b/cosyvoice/flow/__pycache__/flow.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24d50c38012133309895d2e881261fc29a084d2f Binary files /dev/null and b/cosyvoice/flow/__pycache__/flow.cpython-310.pyc differ diff --git a/cosyvoice/flow/__pycache__/flow_matching.cpython-310.pyc b/cosyvoice/flow/__pycache__/flow_matching.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12041b6bc67587d0440b8ea768915b24038ae955 Binary files /dev/null and b/cosyvoice/flow/__pycache__/flow_matching.cpython-310.pyc differ diff --git a/cosyvoice/flow/__pycache__/length_regulator.cpython-310.pyc b/cosyvoice/flow/__pycache__/length_regulator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1e664b83ed9e1a9a22e13f9c2630eda13b1a3ae Binary files /dev/null and b/cosyvoice/flow/__pycache__/length_regulator.cpython-310.pyc differ diff --git a/cosyvoice/flow/decoder.py b/cosyvoice/flow/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..43492799390b44a2843bc53604603842754799f9 --- /dev/null +++ b/cosyvoice/flow/decoder.py @@ -0,0 +1,222 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +from einops import pack, rearrange, repeat +from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D +from matcha.models.components.transformer import BasicTransformerBlock + + +class ConditionalDecoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + channels=(256, 256), + dropout=0.05, + attention_head_dim=64, + n_blocks=1, + num_mid_blocks=2, + num_heads=4, + act_fn="snake", + ): + """ + This decoder requires an input with the same shape of the target. So, if your text content + is shorter or longer than the outputs, please re-sampling it before feeding to the decoder. + """ + super().__init__() + channels = tuple(channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.time_embeddings = SinusoidalPosEmb(in_channels) + time_embed_dim = channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=in_channels, + time_embed_dim=time_embed_dim, + act_fn="silu", + ) + self.down_blocks = nn.ModuleList([]) + self.mid_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + output_channel = in_channels + for i in range(len(channels)): # pylint: disable=consider-using-enumerate + input_channel = output_channel + output_channel = channels[i] + is_last = i == len(channels) - 1 + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + downsample = ( + Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) + + for i in range(num_mid_blocks): + input_channel = channels[-1] + out_channels = channels[-1] + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + + self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) + + channels = channels[::-1] + (channels[0],) + for i in range(len(channels) - 1): + input_channel = channels[i] * 2 + output_channel = channels[i + 1] + is_last = i == len(channels) - 2 + resnet = ResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + upsample = ( + Upsample1D(output_channel, use_conv_transpose=True) + if not is_last + else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) + self.final_block = Block1D(channels[-1], channels[-1]) + self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) + self.initialize_weights() + + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x, mask, mu, t, spks=None, cond=None): + """Forward pass of the UNet1DConditional model. + + Args: + x (torch.Tensor): shape (batch_size, in_channels, time) + mask (_type_): shape (batch_size, 1, time) + t (_type_): shape (batch_size) + spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. + cond (_type_, optional): placeholder for future use. Defaults to None. + + Raises: + ValueError: _description_ + ValueError: _description_ + + Returns: + _type_: _description_ + """ + + t = self.time_embeddings(t) + t = self.time_mlp(t) + + x = pack([x, mu], "b * t")[0] + + if spks is not None: + spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) + x = pack([x, spks], "b * t")[0] + if cond is not None: + x = pack([x, cond], "b * t")[0] + + hiddens = [] + masks = [mask] + for resnet, transformer_blocks, downsample in self.down_blocks: + mask_down = masks[-1] + x = resnet(x, mask_down, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + hiddens.append(x) # Save hidden states for skip connections + x = downsample(x * mask_down) + masks.append(mask_down[:, :, ::2]) + masks = masks[:-1] + mask_mid = masks[-1] + + for resnet, transformer_blocks in self.mid_blocks: + x = resnet(x, mask_mid, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + + for resnet, transformer_blocks, upsample in self.up_blocks: + mask_up = masks.pop() + skip = hiddens.pop() + x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] + x = resnet(x, mask_up, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + x = upsample(x * mask_up) + x = self.final_block(x, mask_up) + output = self.final_proj(x * mask_up) + return output * mask diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py new file mode 100644 index 0000000000000000000000000000000000000000..415b2a98872c29f82a9a49b89fba7996c10c042d --- /dev/null +++ b/cosyvoice/flow/flow.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import random +from typing import Dict, Optional +import torch +import torch.nn as nn +from torch.nn import functional as F +from omegaconf import DictConfig +from cosyvoice.utils.mask import make_pad_mask + + +class MaskedDiffWithXvec(torch.nn.Module): + def __init__(self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + length_regulator: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, + mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) + self.encoder = encoder + self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) + self.decoder = decoder + self.length_regulator = length_regulator + self.only_mask_loss = only_mask_loss + + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + token = batch['speech_token'].to(device) + token_len = batch['speech_token_len'].to(device) + feat = batch['speech_feat'].to(device) + feat_len = batch['speech_feat_len'].to(device) + embedding = batch['embedding'].to(device) + + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + # embedding=None + + # concat text and prompt_text + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) + # print(token.max(),self.input_embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + h, h_lengths = self.length_regulator(h, feat_len) + + # get conditions + conds = torch.zeros(feat.shape, device=token.device) + for i, j in enumerate(feat_len): + if random.random() < 0.5: + continue + index = random.randint(0, int(0.8 * j)) + conds[i, :index] = feat[i, :index] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len)).to(h) + feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) + loss, _ = self.decoder.compute_loss( + feat.transpose(1, 2).contiguous(), + mask.unsqueeze(1), + h.transpose(1, 2).contiguous(), + embedding, + cond=conds + ) + return {'loss': loss} + + @torch.inference_mode() + def inference(self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding): + assert token.shape[0] == 1 + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # concat text and prompt_text + token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + feat_len = (token_len / self.input_frame_rate * 22050 / 256).int() + h, h_lengths = self.length_regulator(h, feat_len) + + # get conditions + conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) + if prompt_feat.shape[1] != 0: + for i, j in enumerate(prompt_feat_len): + conds[i, :j] = prompt_feat[i] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len)).to(h) + feat = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10 + ) + if prompt_feat.shape[1] != 0: + feat = feat[:, :, prompt_feat.shape[1]:] + return feat diff --git a/cosyvoice/flow/flow_gradtts.py b/cosyvoice/flow/flow_gradtts.py new file mode 100644 index 0000000000000000000000000000000000000000..8e558c0ff65a0c6befd1c5aa49c20464307e82b1 --- /dev/null +++ b/cosyvoice/flow/flow_gradtts.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import random +from typing import Dict, Optional +import torch +import torch.nn as nn +from torch.nn import functional as F +from omegaconf import DictConfig +from cosyvoice.utils.mask import make_pad_mask + + +class MaskedDiffWithXvec(torch.nn.Module): + def __init__(self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + length_regulator: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, + mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) + self.encoder = encoder + self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) + self.decoder = decoder + self.length_regulator = length_regulator + self.only_mask_loss = only_mask_loss + + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + token = batch['speech_token'].to(device) + token_len = batch['speech_token_len'].to(device) + feat = batch['speech_feat'].to(device) + feat_len = batch['speech_feat_len'].to(device) + embedding = batch['embedding'].to(device) + + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + # embedding=None + + # concat text and prompt_text + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + h, h_lengths = self.length_regulator(h, feat_len) + + # get conditions + conds = torch.zeros(feat.shape, device=token.device) + # for i, j in enumerate(feat_len): + # if random.random() < 0.5: + # continue + # index = random.randint(0, int(0.3 * j)) + # conds[i, :index] = feat[i, :index] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len)).to(h) + feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) + loss, _ = self.decoder.compute_loss( + feat.transpose(1, 2).contiguous(), + mask.unsqueeze(1), + h.transpose(1, 2).contiguous(), + embedding, + cond=conds + ) + return {'loss': loss} + + @torch.inference_mode() + def inference(self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding): + assert token.shape[0] == 1 + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # concat text and prompt_text + token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + feat_len = (token_len / self.input_frame_rate * 22050 / 256).int() + h, h_lengths = self.length_regulator(h, feat_len) + + # get conditions + conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device) + if prompt_feat.shape[1] != 0: + for i, j in enumerate(prompt_feat_len): + conds[i, :j] = prompt_feat[i] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len)).to(h) + feat = self.decoder( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=10 + ) + if prompt_feat.shape[1] != 0: + feat = feat[:, :, prompt_feat.shape[1]:] + return feat diff --git a/cosyvoice/flow/flow_matching.py b/cosyvoice/flow/flow_matching.py new file mode 100644 index 0000000000000000000000000000000000000000..ec487d7d8effbf9c7284624b839184e43df40b9c --- /dev/null +++ b/cosyvoice/flow/flow_matching.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn.functional as F +from matcha.models.components.flow_matching import BASECFM + +class ConditionalCFM(BASECFM): + def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): + super().__init__( + n_feats=in_channels, + cfm_params=cfm_params, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + self.t_scheduler = cfm_params.t_scheduler + self.training_cfg_rate = cfm_params.training_cfg_rate + self.inference_cfg_rate = cfm_params.inference_cfg_rate + in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) + # Just change the architecture of the estimator here + self.estimator = estimator + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + torch.manual_seed(42) + + z = torch.randn_like(mu) * temperature + + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) + if self.t_scheduler == 'cosine': + t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) + + def solve_euler(self, x, t_span, mu, mask, spks, cond): + """ + Fixed euler solver for ODEs. + Args: + x (torch.Tensor): random noise + t_span (torch.Tensor): n_timesteps interpolated + shape: (n_timesteps + 1,) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + """ + t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] + + # I am storing this because I can later plot it by putting a debugger here and saving it to a file + # Or in future might add like a return_all_steps flag + sol = [] + + for step in range(1, len(t_span)): + dphi_dt = self.estimator(x, mask, mu, t, spks, cond) + # Classifier-Free Guidance inference introduced in VoiceBox + if self.inference_cfg_rate > 0: + cfg_dphi_dt = self.estimator( + x, mask, + torch.zeros_like(mu), t, + torch.zeros_like(spks) if spks is not None else None, + torch.zeros_like(cond) + ) + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - + self.inference_cfg_rate * cfg_dphi_dt) + x = x + dt * dphi_dt + t = t + dt + + sol.append(x) + if step < len(t_span) - 1: + dt = t_span[step + 1] - t + + return sol[-1] + + def compute_loss(self, x1, mask, mu, spks=None, cond=None): + """Computes diffusion loss + + Args: + x1 (torch.Tensor): Target + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): target mask + shape: (batch_size, 1, mel_timesteps) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + spks (torch.Tensor, optional): speaker embedding. Defaults to None. + shape: (batch_size, spk_emb_dim) + + Returns: + loss: conditional flow matching loss + y: conditional flow + shape: (batch_size, n_feats, mel_timesteps) + """ + b, _, t = mu.shape + + # random timestep + t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t = 1 - torch.cos(t * 0.5 * torch.pi) + # sample noise p(x_0) + z = torch.randn_like(x1) + + y = (1 - (1 - self.sigma_min) * t) * z + t * x1 + u = x1 - (1 - self.sigma_min) * z + + # during training, we randomly drop condition to trade off mode coverage and sample fidelity + if self.training_cfg_rate > 0: + cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate + mu = mu * cfg_mask.view(-1, 1, 1) + spks = spks * cfg_mask.view(-1, 1) + cond = cond * cfg_mask.view(-1, 1, 1) + + pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) + loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) + return loss, y diff --git a/cosyvoice/flow/flow_matching_dit.py b/cosyvoice/flow/flow_matching_dit.py new file mode 100644 index 0000000000000000000000000000000000000000..abadcc218cc3b30bd5c8da829079974b60f09b47 --- /dev/null +++ b/cosyvoice/flow/flow_matching_dit.py @@ -0,0 +1,180 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pdb + +import torch +import torch.nn.functional as F +from matcha.models.components.flow_matching import BASECFM + + +class ConditionalCFM(BASECFM): + def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None): + super().__init__( + n_feats=in_channels, + cfm_params=cfm_params, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + self.t_scheduler = cfm_params.t_scheduler + self.training_cfg_rate = cfm_params.training_cfg_rate + self.inference_cfg_rate = cfm_params.inference_cfg_rate + in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) + # Just change the architecture of the estimator here + + io_channels = 80 + input_concat_dim = 80 + embed_dim = 768 + depth = 24 + num_heads = 24 + project_cond_tokens = False + transformer_type = "continuous_transformer" + self.estimator = estimator + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + z = torch.randn_like(mu) * temperature + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) + if self.t_scheduler == 'cosine': + t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) + + def solve_euler(self, x, t_span, mu, mask, spks, cond): + """ + Fixed euler solver for ODEs. + Args: + x (torch.Tensor): random noise torch.Size([1, 80, 621]) + t_span (torch.Tensor): n_timesteps interpolated + shape: (n_timesteps + 1,) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + """ + t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] + + # I am storing this because I can later plot it by putting a debugger here and saving it to a file + # Or in future might add like a return_all_steps flag + sol = [] + + cfg_dropout_prob = 0.1 + cfg_scale = 1.0 + + # cfg_dropout_prob = 0.0 + # cfg_scale = 3.0 + + for step in range(1, len(t_span)): + # dphi_dt = self.estimator(x, mask, mu, t, spks, cond) + # pdb.set_trace() + dphi_dt = self.estimator(x, # [bs, 80, 229] + t[None], # (bs,) + global_embed=spks, + input_concat_cond=mu, + mask=mask[0], # [bs, 229] + cfg_dropout_prob=cfg_dropout_prob, cfg_scale=cfg_scale) + + # Classifier-Free Guidance inference introduced in VoiceBox + if self.inference_cfg_rate > 0: + # cfg_dphi_dt = self.estimator( + # x, mask, + # torch.zeros_like(mu), t, + # torch.zeros_like(spks) if spks is not None else None, + # torch.zeros_like(cond) + # ) + cfg_dphi_dt = self.estimator(x, # [bs, 80, 229] + t[None], # (bs,) + global_embed=torch.zeros_like(spks) if spks is not None else None, + input_concat_cond=torch.zeros_like(mu), + mask=mask[0], # [bs, 229] + cfg_dropout_prob=cfg_dropout_prob, cfg_scale=cfg_scale) + + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - + self.inference_cfg_rate * cfg_dphi_dt) + x = x + dt * dphi_dt + t = t + dt + sol.append(x) + if step < len(t_span) - 1: + dt = t_span[step + 1] - t + + return sol[-1] + + def compute_loss(self, x1, mask, mu, spks=None, cond=None): + """Computes diffusion loss + + Args: + x1 (torch.Tensor): Target + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): target mask + shape: (batch_size, 1, mel_timesteps) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + spks (torch.Tensor, optional): speaker embedding. Defaults to None. + shape: (batch_size, spk_emb_dim) + + Returns: + loss: conditional flow matching loss + y: conditional flow + shape: (batch_size, n_feats, mel_timesteps) + """ + b, _, t = mu.shape + + # random timestep + t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t = 1 - torch.cos(t * 0.5 * torch.pi) + # sample noise p(x_0) + z = torch.randn_like(x1) + + y = (1 - (1 - self.sigma_min) * t) * z + t * x1 + u = x1 - (1 - self.sigma_min) * z + + # during training, we randomly drop condition to trade off mode coverage and sample fidelity + if self.training_cfg_rate > 0: + cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate + mu = mu * cfg_mask.view(-1, 1, 1) + spks = spks * cfg_mask.view(-1, 1) + cond = cond * cfg_mask.view(-1, 1, 1) + + # pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) + pred = self.estimator(y, # [bs, 80, 229] + t.squeeze(1, 2), # (bs,) + global_embed=spks, + input_concat_cond=mu, + mask=mask.squeeze(1), # [bs, 229] + cfg_dropout_prob=0.1) + + loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) + return loss, y + + # def estimator_trans(self): + # pass diff --git a/cosyvoice/flow/length_regulator.py b/cosyvoice/flow/length_regulator.py new file mode 100644 index 0000000000000000000000000000000000000000..622f29aaccc44d8e8cce23ecab7b086ebb853fde --- /dev/null +++ b/cosyvoice/flow/length_regulator.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Tuple +import torch.nn as nn +from torch.nn import functional as F +from cosyvoice.utils.mask import make_pad_mask + + +class InterpolateRegulator(nn.Module): + def __init__( + self, + channels: int, + sampling_ratios: Tuple, + out_channels: int = None, + groups: int = 1, + ): + super().__init__() + self.sampling_ratios = sampling_ratios + out_channels = out_channels or channels + model = nn.ModuleList([]) + if len(sampling_ratios) > 0: + for _ in sampling_ratios: + module = nn.Conv1d(channels, channels, 3, 1, 1) + norm = nn.GroupNorm(groups, channels) + act = nn.Mish() + model.extend([module, norm, act]) + model.append( + nn.Conv1d(channels, out_channels, 1, 1) + ) + self.model = nn.Sequential(*model) + + def forward(self, x, ylens=None): + # x in (B, T, D) + mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) + x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') + out = self.model(x).transpose(1, 2).contiguous() + olens = ylens + return out * mask, olens diff --git a/cosyvoice/flow/stable/adp.py b/cosyvoice/flow/stable/adp.py new file mode 100644 index 0000000000000000000000000000000000000000..a7ff72026df1c0bed73563d025d314dd2ccd4d19 --- /dev/null +++ b/cosyvoice/flow/stable/adp.py @@ -0,0 +1,1591 @@ +# Copied and modified from https://github.com/archinetai/audio-diffusion-pytorch/blob/v0.0.94/audio_diffusion_pytorch/modules.py under MIT License +# License can be found in LICENSES/LICENSE_ADP.txt + +import math +from inspect import isfunction +from math import ceil, floor, log, pi, log2 +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from packaging import version + +import torch +import torch.nn as nn +from einops import rearrange, reduce, repeat +from einops.layers.torch import Rearrange +from einops_exts import rearrange_many +from torch import Tensor, einsum +from torch.backends.cuda import sdp_kernel +from torch.nn import functional as F +from dac.nn.layers import Snake1d +import pdb +""" +Utils +""" + + +class ConditionedSequential(nn.Module): + def __init__(self, *modules): + super().__init__() + self.module_list = nn.ModuleList(*modules) + + def forward(self, x: Tensor, mapping: Optional[Tensor] = None): + for module in self.module_list: + x = module(x, mapping) + return x + +T = TypeVar("T") + +def default(val: Optional[T], d: Union[Callable[..., T], T]) -> T: + if exists(val): + return val + return d() if isfunction(d) else d + +def exists(val: Optional[T]) -> T: + return val is not None + +def closest_power_2(x: float) -> int: + exponent = log2(x) + distance_fn = lambda z: abs(x - 2 ** z) # noqa + exponent_closest = min((floor(exponent), ceil(exponent)), key=distance_fn) + return 2 ** int(exponent_closest) + +def group_dict_by_prefix(prefix: str, d: Dict) -> Tuple[Dict, Dict]: + return_dicts: Tuple[Dict, Dict] = ({}, {}) + for key in d.keys(): + no_prefix = int(not key.startswith(prefix)) + return_dicts[no_prefix][key] = d[key] + return return_dicts + +def groupby(prefix: str, d: Dict, keep_prefix: bool = False) -> Tuple[Dict, Dict]: + kwargs_with_prefix, kwargs = group_dict_by_prefix(prefix, d) + if keep_prefix: + return kwargs_with_prefix, kwargs + kwargs_no_prefix = {k[len(prefix) :]: v for k, v in kwargs_with_prefix.items()} + return kwargs_no_prefix, kwargs + +""" +Convolutional Blocks +""" +import typing as tp + +# Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py under MIT License +# License available in LICENSES/LICENSE_META.txt + +def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, + padding_total: int = 0) -> int: + """See `pad_for_conv1d`.""" + length = x.shape[-1] + n_frames = (length - kernel_size + padding_total) / stride + 1 + ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) + return ideal_length - length + + +def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0): + """Pad for a convolution to make sure that the last window is full. + Extra padding is added at the end. This is required to ensure that we can rebuild + an output of the same length, as otherwise, even with padding, some time steps + might get removed. + For instance, with total padding = 4, kernel size = 4, stride = 2: + 0 0 1 2 3 4 5 0 0 # (0s are padding) + 1 2 3 # (output frames of a convolution, last 0 is never used) + 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding) + 1 2 3 4 # once you removed padding, we are missing one time step ! + """ + extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total) + return F.pad(x, (0, extra_padding)) + + +def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.): + """Tiny wrapper around F.pad, just to allow for reflect padding on small input. + If this is the case, we insert extra 0 padding to the right before the reflection happen. + """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == 'reflect': + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]): + """Remove padding from x, handling properly zero padding. Only for 1d!""" + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + assert (padding_left + padding_right) <= x.shape[-1] + end = x.shape[-1] - padding_right + return x[..., padding_left: end] + + +class Conv1d(nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x: Tensor, causal=False) -> Tensor: + kernel_size = self.kernel_size[0] + stride = self.stride[0] + dilation = self.dilation[0] + kernel_size = (kernel_size - 1) * dilation + 1 # effective kernel size with dilations + padding_total = kernel_size - stride + extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total) + if causal: + # Left padding for causal + x = pad1d(x, (padding_total, extra_padding)) + else: + # Asymmetric padding required for odd strides + padding_right = padding_total // 2 + padding_left = padding_total - padding_right + x = pad1d(x, (padding_left, padding_right + extra_padding)) + return super().forward(x) + +class ConvTranspose1d(nn.ConvTranspose1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x: Tensor, causal=False) -> Tensor: + kernel_size = self.kernel_size[0] + stride = self.stride[0] + padding_total = kernel_size - stride + + y = super().forward(x) + + # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be + # removed at the very end, when keeping only the right length for the output, + # as removing it here would require also passing the length at the matching layer + # in the encoder. + if causal: + padding_right = ceil(padding_total) + padding_left = padding_total - padding_right + y = unpad1d(y, (padding_left, padding_right)) + else: + # Asymmetric padding required for odd strides + padding_right = padding_total // 2 + padding_left = padding_total - padding_right + y = unpad1d(y, (padding_left, padding_right)) + return y + + +def Downsample1d( + in_channels: int, out_channels: int, factor: int, kernel_multiplier: int = 2 +) -> nn.Module: + assert kernel_multiplier % 2 == 0, "Kernel multiplier must be even" + + return Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=factor * kernel_multiplier + 1, + stride=factor + ) + + +def Upsample1d( + in_channels: int, out_channels: int, factor: int, use_nearest: bool = False +) -> nn.Module: + + if factor == 1: + return Conv1d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3 + ) + + if use_nearest: + return nn.Sequential( + nn.Upsample(scale_factor=factor, mode="nearest"), + Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3 + ), + ) + else: + return ConvTranspose1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=factor * 2, + stride=factor + ) + + +class ConvBlock1d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + *, + kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + num_groups: int = 8, + use_norm: bool = True, + use_snake: bool = False + ) -> None: + super().__init__() + + self.groupnorm = ( + nn.GroupNorm(num_groups=num_groups, num_channels=in_channels) + if use_norm + else nn.Identity() + ) + + if use_snake: + self.activation = Snake1d(in_channels) + else: + self.activation = nn.SiLU() + + self.project = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + ) + + def forward( + self, x: Tensor, scale_shift: Optional[Tuple[Tensor, Tensor]] = None, causal=False + ) -> Tensor: + x = self.groupnorm(x) + if exists(scale_shift): + scale, shift = scale_shift + x = x * (scale + 1) + shift + x = self.activation(x) + return self.project(x, causal=causal) + + +class MappingToScaleShift(nn.Module): + def __init__( + self, + features: int, + channels: int, + ): + super().__init__() + + self.to_scale_shift = nn.Sequential( + nn.SiLU(), + nn.Linear(in_features=features, out_features=channels * 2), + ) + + def forward(self, mapping: Tensor) -> Tuple[Tensor, Tensor]: + scale_shift = self.to_scale_shift(mapping) + scale_shift = rearrange(scale_shift, "b c -> b c 1") + scale, shift = scale_shift.chunk(2, dim=1) + return scale, shift + + +class ResnetBlock1d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + *, + kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + use_norm: bool = True, + use_snake: bool = False, + num_groups: int = 8, + context_mapping_features: Optional[int] = None, + ) -> None: + super().__init__() + + self.use_mapping = exists(context_mapping_features) + + self.block1 = ConvBlock1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + use_norm=use_norm, + num_groups=num_groups, + use_snake=use_snake + ) + + if self.use_mapping: + assert exists(context_mapping_features) + self.to_scale_shift = MappingToScaleShift( + features=context_mapping_features, channels=out_channels + ) + + self.block2 = ConvBlock1d( + in_channels=out_channels, + out_channels=out_channels, + use_norm=use_norm, + num_groups=num_groups, + use_snake=use_snake + ) + + self.to_out = ( + Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) + if in_channels != out_channels + else nn.Identity() + ) + + def forward(self, x: Tensor, mapping: Optional[Tensor] = None, causal=False) -> Tensor: + assert_message = "context mapping required if context_mapping_features > 0" + assert not (self.use_mapping ^ exists(mapping)), assert_message + + h = self.block1(x, causal=causal) + + scale_shift = None + if self.use_mapping: + scale_shift = self.to_scale_shift(mapping) + + h = self.block2(h, scale_shift=scale_shift, causal=causal) + + return h + self.to_out(x) + + +class Patcher(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + patch_size: int, + context_mapping_features: Optional[int] = None, + use_snake: bool = False, + ): + super().__init__() + assert_message = f"out_channels must be divisible by patch_size ({patch_size})" + assert out_channels % patch_size == 0, assert_message + self.patch_size = patch_size + + self.block = ResnetBlock1d( + in_channels=in_channels, + out_channels=out_channels // patch_size, + num_groups=1, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + def forward(self, x: Tensor, mapping: Optional[Tensor] = None, causal=False) -> Tensor: + x = self.block(x, mapping, causal=causal) + x = rearrange(x, "b c (l p) -> b (c p) l", p=self.patch_size) + return x + + +class Unpatcher(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + patch_size: int, + context_mapping_features: Optional[int] = None, + use_snake: bool = False + ): + super().__init__() + assert_message = f"in_channels must be divisible by patch_size ({patch_size})" + assert in_channels % patch_size == 0, assert_message + self.patch_size = patch_size + + self.block = ResnetBlock1d( + in_channels=in_channels // patch_size, + out_channels=out_channels, + num_groups=1, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + def forward(self, x: Tensor, mapping: Optional[Tensor] = None, causal=False) -> Tensor: + x = rearrange(x, " b (c p) l -> b c (l p) ", p=self.patch_size) + x = self.block(x, mapping, causal=causal) + return x + + +""" +Attention Components +""" +def FeedForward(features: int, multiplier: int) -> nn.Module: + mid_features = features * multiplier + return nn.Sequential( + nn.Linear(in_features=features, out_features=mid_features), + nn.GELU(), + nn.Linear(in_features=mid_features, out_features=features), + ) + +def add_mask(sim: Tensor, mask: Tensor) -> Tensor: + b, ndim = sim.shape[0], mask.ndim + if ndim == 3: + mask = rearrange(mask, "b n m -> b 1 n m") + if ndim == 2: + mask = repeat(mask, "n m -> b 1 n m", b=b) + max_neg_value = -torch.finfo(sim.dtype).max + sim = sim.masked_fill(~mask, max_neg_value) + return sim + +def causal_mask(q: Tensor, k: Tensor) -> Tensor: + b, i, j, device = q.shape[0], q.shape[-2], k.shape[-2], q.device + mask = ~torch.ones((i, j), dtype=torch.bool, device=device).triu(j - i + 1) + mask = repeat(mask, "n m -> b n m", b=b) + return mask + +class AttentionBase(nn.Module): + def __init__( + self, + features: int, + *, + head_features: int, + num_heads: int, + out_features: Optional[int] = None, + ): + super().__init__() + self.scale = head_features**-0.5 + self.num_heads = num_heads + mid_features = head_features * num_heads + out_features = default(out_features, features) + + self.to_out = nn.Linear( + in_features=mid_features, out_features=out_features + ) + + self.use_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0') + + if not self.use_flash: + return + + device_properties = torch.cuda.get_device_properties(torch.device('cuda')) + + if device_properties.major == 8 and device_properties.minor == 0: + # Use flash attention for A100 GPUs + self.sdp_kernel_config = (True, False, False) + else: + # Don't use flash attention for other GPUs + self.sdp_kernel_config = (False, True, True) + + def forward( + self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None, is_causal: bool = False + ) -> Tensor: + # Split heads + q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=self.num_heads) + + if not self.use_flash: + if is_causal and not mask: + # Mask out future tokens for causal attention + mask = causal_mask(q, k) + + # Compute similarity matrix and add eventual mask + sim = einsum("... n d, ... m d -> ... n m", q, k) * self.scale + sim = add_mask(sim, mask) if exists(mask) else sim + + # Get attention matrix with softmax + attn = sim.softmax(dim=-1, dtype=torch.float32) + + # Compute values + out = einsum("... n m, ... m d -> ... n d", attn, v) + else: + with sdp_kernel(*self.sdp_kernel_config): + out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, is_causal=is_causal) + + out = rearrange(out, "b h n d -> b n (h d)") + return self.to_out(out) + +class Attention(nn.Module): + def __init__( + self, + features: int, + *, + head_features: int, + num_heads: int, + out_features: Optional[int] = None, + context_features: Optional[int] = None, + causal: bool = False, + ): + super().__init__() + self.context_features = context_features + self.causal = causal + mid_features = head_features * num_heads + context_features = default(context_features, features) + + self.norm = nn.LayerNorm(features) + self.norm_context = nn.LayerNorm(context_features) + self.to_q = nn.Linear( + in_features=features, out_features=mid_features, bias=False + ) + self.to_kv = nn.Linear( + in_features=context_features, out_features=mid_features * 2, bias=False + ) + self.attention = AttentionBase( + features, + num_heads=num_heads, + head_features=head_features, + out_features=out_features, + ) + + def forward( + self, + x: Tensor, # [b, n, c] + context: Optional[Tensor] = None, # [b, m, d] + context_mask: Optional[Tensor] = None, # [b, m], false is masked, + causal: Optional[bool] = False, + ) -> Tensor: + assert_message = "You must provide a context when using context_features" + assert not self.context_features or exists(context), assert_message + # Use context if provided + context = default(context, x) + # Normalize then compute q from input and k,v from context + x, context = self.norm(x), self.norm_context(context) + + q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1)) + + if exists(context_mask): + # Mask out cross-attention for padding tokens + mask = repeat(context_mask, "b m -> b m d", d=v.shape[-1]) + k, v = k * mask, v * mask + + # Compute and return attention + return self.attention(q, k, v, is_causal=self.causal or causal) + + +def FeedForward(features: int, multiplier: int) -> nn.Module: + mid_features = features * multiplier + return nn.Sequential( + nn.Linear(in_features=features, out_features=mid_features), + nn.GELU(), + nn.Linear(in_features=mid_features, out_features=features), + ) + +""" +Transformer Blocks +""" + + +class TransformerBlock(nn.Module): + def __init__( + self, + features: int, + num_heads: int, + head_features: int, + multiplier: int, + context_features: Optional[int] = None, + ): + super().__init__() + + self.use_cross_attention = exists(context_features) and context_features > 0 + + self.attention = Attention( + features=features, + num_heads=num_heads, + head_features=head_features + ) + + if self.use_cross_attention: + self.cross_attention = Attention( + features=features, + num_heads=num_heads, + head_features=head_features, + context_features=context_features + ) + + self.feed_forward = FeedForward(features=features, multiplier=multiplier) + + def forward(self, x: Tensor, *, context: Optional[Tensor] = None, context_mask: Optional[Tensor] = None, causal: Optional[bool] = False) -> Tensor: + x = self.attention(x, causal=causal) + x + if self.use_cross_attention: + x = self.cross_attention(x, context=context, context_mask=context_mask) + x + x = self.feed_forward(x) + x + return x + + +""" +Transformers +""" + + +class Transformer1d(nn.Module): + def __init__( + self, + num_layers: int, + channels: int, + num_heads: int, + head_features: int, + multiplier: int, + context_features: Optional[int] = None, + ): + super().__init__() + + self.to_in = nn.Sequential( + nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6, affine=True), + Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=1, + ), + Rearrange("b c t -> b t c"), + ) + + self.blocks = nn.ModuleList( + [ + TransformerBlock( + features=channels, + head_features=head_features, + num_heads=num_heads, + multiplier=multiplier, + context_features=context_features, + ) + for i in range(num_layers) + ] + ) + + self.to_out = nn.Sequential( + Rearrange("b t c -> b c t"), + Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=1, + ), + ) + + def forward(self, x: Tensor, *, context: Optional[Tensor] = None, context_mask: Optional[Tensor] = None, causal=False) -> Tensor: + x = self.to_in(x) + for block in self.blocks: + x = block(x, context=context, context_mask=context_mask, causal=causal) + x = self.to_out(x) + return x + + +""" +Time Embeddings +""" + + +class SinusoidalEmbedding(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.dim = dim + + def forward(self, x: Tensor) -> Tensor: + device, half_dim = x.device, self.dim // 2 + emb = torch.tensor(log(10000) / (half_dim - 1), device=device) + emb = torch.exp(torch.arange(half_dim, device=device) * -emb) + emb = rearrange(x, "i -> i 1") * rearrange(emb, "j -> 1 j") + return torch.cat((emb.sin(), emb.cos()), dim=-1) + + +class LearnedPositionalEmbedding(nn.Module): + """Used for continuous time""" + + def __init__(self, dim: int): + super().__init__() + assert (dim % 2) == 0 + half_dim = dim // 2 + self.weights = nn.Parameter(torch.randn(half_dim)) + + def forward(self, x: Tensor) -> Tensor: + x = rearrange(x, "b -> b 1") + freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi + fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1) + fouriered = torch.cat((x, fouriered), dim=-1) + return fouriered + + +def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module: + return nn.Sequential( + LearnedPositionalEmbedding(dim), + nn.Linear(in_features=dim + 1, out_features=out_features), + ) + + +""" +Encoder/Decoder Components +""" + + +class DownsampleBlock1d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + *, + factor: int, + num_groups: int, + num_layers: int, + kernel_multiplier: int = 2, + use_pre_downsample: bool = True, + use_skip: bool = False, + use_snake: bool = False, + extract_channels: int = 0, + context_channels: int = 0, + num_transformer_blocks: int = 0, + attention_heads: Optional[int] = None, + attention_features: Optional[int] = None, + attention_multiplier: Optional[int] = None, + context_mapping_features: Optional[int] = None, + context_embedding_features: Optional[int] = None, + ): + super().__init__() + self.use_pre_downsample = use_pre_downsample + self.use_skip = use_skip + self.use_transformer = num_transformer_blocks > 0 + self.use_extract = extract_channels > 0 + self.use_context = context_channels > 0 + + channels = out_channels if use_pre_downsample else in_channels + + self.downsample = Downsample1d( + in_channels=in_channels, + out_channels=out_channels, + factor=factor, + kernel_multiplier=kernel_multiplier, + ) + + self.blocks = nn.ModuleList( + [ + ResnetBlock1d( + in_channels=channels + context_channels if i == 0 else channels, + out_channels=channels, + num_groups=num_groups, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + for i in range(num_layers) + ] + ) + + if self.use_transformer: + assert ( + (exists(attention_heads) or exists(attention_features)) + and exists(attention_multiplier) + ) + + if attention_features is None and attention_heads is not None: + attention_features = channels // attention_heads + + if attention_heads is None and attention_features is not None: + attention_heads = channels // attention_features + + self.transformer = Transformer1d( + num_layers=num_transformer_blocks, + channels=channels, + num_heads=attention_heads, + head_features=attention_features, + multiplier=attention_multiplier, + context_features=context_embedding_features + ) + + if self.use_extract: + num_extract_groups = min(num_groups, extract_channels) + self.to_extracted = ResnetBlock1d( + in_channels=out_channels, + out_channels=extract_channels, + num_groups=num_extract_groups, + use_snake=use_snake + ) + + def forward( + self, + x: Tensor, + *, + mapping: Optional[Tensor] = None, + channels: Optional[Tensor] = None, + embedding: Optional[Tensor] = None, + embedding_mask: Optional[Tensor] = None, + causal: Optional[bool] = False + ) -> Union[Tuple[Tensor, List[Tensor]], Tensor]: + + if self.use_pre_downsample: + x = self.downsample(x) + + if self.use_context and exists(channels): + x = torch.cat([x, channels], dim=1) + + skips = [] + for block in self.blocks: + x = block(x, mapping=mapping, causal=causal) + skips += [x] if self.use_skip else [] + + if self.use_transformer: + x = self.transformer(x, context=embedding, context_mask=embedding_mask, causal=causal) + skips += [x] if self.use_skip else [] + + if not self.use_pre_downsample: + x = self.downsample(x) + + if self.use_extract: + extracted = self.to_extracted(x) + return x, extracted + + return (x, skips) if self.use_skip else x + + +class UpsampleBlock1d(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + *, + factor: int, + num_layers: int, + num_groups: int, + use_nearest: bool = False, + use_pre_upsample: bool = False, + use_skip: bool = False, + use_snake: bool = False, + skip_channels: int = 0, + use_skip_scale: bool = False, + extract_channels: int = 0, + num_transformer_blocks: int = 0, + attention_heads: Optional[int] = None, + attention_features: Optional[int] = None, + attention_multiplier: Optional[int] = None, + context_mapping_features: Optional[int] = None, + context_embedding_features: Optional[int] = None, + ): + super().__init__() + + self.use_extract = extract_channels > 0 + self.use_pre_upsample = use_pre_upsample + self.use_transformer = num_transformer_blocks > 0 + self.use_skip = use_skip + self.skip_scale = 2 ** -0.5 if use_skip_scale else 1.0 + + channels = out_channels if use_pre_upsample else in_channels + + self.blocks = nn.ModuleList( + [ + ResnetBlock1d( + in_channels=channels + skip_channels, + out_channels=channels, + num_groups=num_groups, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + for _ in range(num_layers) + ] + ) + + if self.use_transformer: + assert ( + (exists(attention_heads) or exists(attention_features)) + and exists(attention_multiplier) + ) + + if attention_features is None and attention_heads is not None: + attention_features = channels // attention_heads + + if attention_heads is None and attention_features is not None: + attention_heads = channels // attention_features + + self.transformer = Transformer1d( + num_layers=num_transformer_blocks, + channels=channels, + num_heads=attention_heads, + head_features=attention_features, + multiplier=attention_multiplier, + context_features=context_embedding_features, + ) + + self.upsample = Upsample1d( + in_channels=in_channels, + out_channels=out_channels, + factor=factor, + use_nearest=use_nearest, + ) + + if self.use_extract: + num_extract_groups = min(num_groups, extract_channels) + self.to_extracted = ResnetBlock1d( + in_channels=out_channels, + out_channels=extract_channels, + num_groups=num_extract_groups, + use_snake=use_snake + ) + + def add_skip(self, x: Tensor, skip: Tensor) -> Tensor: + return torch.cat([x, skip * self.skip_scale], dim=1) + + def forward( + self, + x: Tensor, + *, + skips: Optional[List[Tensor]] = None, + mapping: Optional[Tensor] = None, + embedding: Optional[Tensor] = None, + embedding_mask: Optional[Tensor] = None, + causal: Optional[bool] = False + ) -> Union[Tuple[Tensor, Tensor], Tensor]: + + if self.use_pre_upsample: + x = self.upsample(x) + + for block in self.blocks: + x = self.add_skip(x, skip=skips.pop()) if exists(skips) else x + x = block(x, mapping=mapping, causal=causal) + + if self.use_transformer: + x = self.transformer(x, context=embedding, context_mask=embedding_mask, causal=causal) + + if not self.use_pre_upsample: + x = self.upsample(x) + + if self.use_extract: + extracted = self.to_extracted(x) + return x, extracted + + return x + + +class BottleneckBlock1d(nn.Module): + def __init__( + self, + channels: int, + *, + num_groups: int, + num_transformer_blocks: int = 0, + attention_heads: Optional[int] = None, + attention_features: Optional[int] = None, + attention_multiplier: Optional[int] = None, + context_mapping_features: Optional[int] = None, + context_embedding_features: Optional[int] = None, + use_snake: bool = False, + ): + super().__init__() + self.use_transformer = num_transformer_blocks > 0 + + self.pre_block = ResnetBlock1d( + in_channels=channels, + out_channels=channels, + num_groups=num_groups, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + if self.use_transformer: + assert ( + (exists(attention_heads) or exists(attention_features)) + and exists(attention_multiplier) + ) + + if attention_features is None and attention_heads is not None: + attention_features = channels // attention_heads + + if attention_heads is None and attention_features is not None: + attention_heads = channels // attention_features + + self.transformer = Transformer1d( + num_layers=num_transformer_blocks, + channels=channels, + num_heads=attention_heads, + head_features=attention_features, + multiplier=attention_multiplier, + context_features=context_embedding_features, + ) + + self.post_block = ResnetBlock1d( + in_channels=channels, + out_channels=channels, + num_groups=num_groups, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + def forward( + self, + x: Tensor, + *, + mapping: Optional[Tensor] = None, + embedding: Optional[Tensor] = None, + embedding_mask: Optional[Tensor] = None, + causal: Optional[bool] = False + ) -> Tensor: + x = self.pre_block(x, mapping=mapping, causal=causal) + if self.use_transformer: + x = self.transformer(x, context=embedding, context_mask=embedding_mask, causal=causal) + x = self.post_block(x, mapping=mapping, causal=causal) + return x + + +""" +UNet +""" + + +class UNet1d(nn.Module): + def __init__( + self, + in_channels: int, + channels: int, + multipliers: Sequence[int], + factors: Sequence[int], + num_blocks: Sequence[int], + attentions: Sequence[int], + patch_size: int = 1, + resnet_groups: int = 8, + use_context_time: bool = True, + kernel_multiplier_downsample: int = 2, + use_nearest_upsample: bool = False, + use_skip_scale: bool = True, + use_snake: bool = False, + use_stft: bool = False, + use_stft_context: bool = False, + out_channels: Optional[int] = None, + context_features: Optional[int] = None, + context_features_multiplier: int = 4, + context_channels: Optional[Sequence[int]] = None, + context_embedding_features: Optional[int] = None, + **kwargs, + ): + super().__init__() + out_channels = default(out_channels, in_channels) + context_channels = list(default(context_channels, [])) + num_layers = len(multipliers) - 1 + use_context_features = exists(context_features) + use_context_channels = len(context_channels) > 0 + context_mapping_features = None + + attention_kwargs, kwargs = groupby("attention_", kwargs, keep_prefix=True) + + self.num_layers = num_layers + self.use_context_time = use_context_time + self.use_context_features = use_context_features + self.use_context_channels = use_context_channels + self.use_stft = use_stft + self.use_stft_context = use_stft_context + + self.context_features = context_features + context_channels_pad_length = num_layers + 1 - len(context_channels) + context_channels = context_channels + [0] * context_channels_pad_length + self.context_channels = context_channels + self.context_embedding_features = context_embedding_features + + if use_context_channels: + has_context = [c > 0 for c in context_channels] + self.has_context = has_context + self.channels_ids = [sum(has_context[:i]) for i in range(len(has_context))] + + assert ( + len(factors) == num_layers + and len(attentions) >= num_layers + and len(num_blocks) == num_layers + ) + + if use_context_time or use_context_features: + context_mapping_features = channels * context_features_multiplier + + self.to_mapping = nn.Sequential( + nn.Linear(context_mapping_features, context_mapping_features), + nn.GELU(), + nn.Linear(context_mapping_features, context_mapping_features), + nn.GELU(), + ) + + if use_context_time: + assert exists(context_mapping_features) + self.to_time = nn.Sequential( + TimePositionalEmbedding( + dim=channels, out_features=context_mapping_features + ), + nn.GELU(), + ) + + if use_context_features: + assert exists(context_features) and exists(context_mapping_features) + self.to_features = nn.Sequential( + nn.Linear( + in_features=context_features, out_features=context_mapping_features + ), + nn.GELU(), + ) + + if use_stft: + stft_kwargs, kwargs = groupby("stft_", kwargs) + assert "num_fft" in stft_kwargs, "stft_num_fft required if use_stft=True" + stft_channels = (stft_kwargs["num_fft"] // 2 + 1) * 2 + in_channels *= stft_channels + out_channels *= stft_channels + context_channels[0] *= stft_channels if use_stft_context else 1 + assert exists(in_channels) and exists(out_channels) + self.stft = STFT(**stft_kwargs) + + assert not kwargs, f"Unknown arguments: {', '.join(list(kwargs.keys()))}" + + self.to_in = Patcher( + in_channels=in_channels + context_channels[0], + out_channels=channels * multipliers[0], + patch_size=patch_size, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + self.downsamples = nn.ModuleList( + [ + DownsampleBlock1d( + in_channels=channels * multipliers[i], + out_channels=channels * multipliers[i + 1], + context_mapping_features=context_mapping_features, + context_channels=context_channels[i + 1], + context_embedding_features=context_embedding_features, + num_layers=num_blocks[i], + factor=factors[i], + kernel_multiplier=kernel_multiplier_downsample, + num_groups=resnet_groups, + use_pre_downsample=True, + use_skip=True, + use_snake=use_snake, + num_transformer_blocks=attentions[i], + **attention_kwargs, + ) + for i in range(num_layers) + ] + ) + + self.bottleneck = BottleneckBlock1d( + channels=channels * multipliers[-1], + context_mapping_features=context_mapping_features, + context_embedding_features=context_embedding_features, + num_groups=resnet_groups, + num_transformer_blocks=attentions[-1], + use_snake=use_snake, + **attention_kwargs, + ) + + self.upsamples = nn.ModuleList( + [ + UpsampleBlock1d( + in_channels=channels * multipliers[i + 1], + out_channels=channels * multipliers[i], + context_mapping_features=context_mapping_features, + context_embedding_features=context_embedding_features, + num_layers=num_blocks[i] + (1 if attentions[i] else 0), + factor=factors[i], + use_nearest=use_nearest_upsample, + num_groups=resnet_groups, + use_skip_scale=use_skip_scale, + use_pre_upsample=False, + use_skip=True, + use_snake=use_snake, + skip_channels=channels * multipliers[i + 1], + num_transformer_blocks=attentions[i], + **attention_kwargs, + ) + for i in reversed(range(num_layers)) + ] + ) + + self.to_out = Unpatcher( + in_channels=channels * multipliers[0], + out_channels=out_channels, + patch_size=patch_size, + context_mapping_features=context_mapping_features, + use_snake=use_snake + ) + + def get_channels( + self, channels_list: Optional[Sequence[Tensor]] = None, layer: int = 0 + ) -> Optional[Tensor]: + """Gets context channels at `layer` and checks that shape is correct""" + use_context_channels = self.use_context_channels and self.has_context[layer] + if not use_context_channels: + return None + assert exists(channels_list), "Missing context" + # Get channels index (skipping zero channel contexts) + channels_id = self.channels_ids[layer] + # Get channels + channels = channels_list[channels_id] + message = f"Missing context for layer {layer} at index {channels_id}" + assert exists(channels), message + # Check channels + num_channels = self.context_channels[layer] + message = f"Expected context with {num_channels} channels at idx {channels_id}" + assert channels.shape[1] == num_channels, message + # STFT channels if requested + channels = self.stft.encode1d(channels) if self.use_stft_context else channels # type: ignore # noqa + return channels + + def get_mapping( + self, time: Optional[Tensor] = None, features: Optional[Tensor] = None + ) -> Optional[Tensor]: + """Combines context time features and features into mapping""" + items, mapping = [], None + # Compute time features + if self.use_context_time: + assert_message = "use_context_time=True but no time features provided" + assert exists(time), assert_message + items += [self.to_time(time)] + # Compute features + if self.use_context_features: + assert_message = "context_features exists but no features provided" + assert exists(features), assert_message + items += [self.to_features(features)] + # Compute joint mapping + if self.use_context_time or self.use_context_features: + mapping = reduce(torch.stack(items), "n b m -> b m", "sum") + mapping = self.to_mapping(mapping) + return mapping + + def forward( + self, + x: Tensor, + time: Optional[Tensor] = None, + *, + features: Optional[Tensor] = None, + channels_list: Optional[Sequence[Tensor]] = None, + embedding: Optional[Tensor] = None, + embedding_mask: Optional[Tensor] = None, + causal: Optional[bool] = False, + ) -> Tensor: + channels = self.get_channels(channels_list, layer=0) + # Apply stft if required + print(x.shape) + x = self.stft.encode1d(x) if self.use_stft else x # type: ignore + print(x.shape) + # Concat context channels at layer 0 if provided + x = torch.cat([x, channels], dim=1) if exists(channels) else x + print(x.shape) + # Compute mapping from time and features + mapping = self.get_mapping(time, features) + x = self.to_in(x, mapping, causal=causal) + print(x.shape) + skips_list = [x] + + for i, downsample in enumerate(self.downsamples): + channels = self.get_channels(channels_list, layer=i + 1) + x, skips = downsample( + x, mapping=mapping, channels=channels, embedding=embedding, embedding_mask=embedding_mask, causal=causal + ) + skips_list += [skips] + + x = self.bottleneck(x, mapping=mapping, embedding=embedding, embedding_mask=embedding_mask, causal=causal) + for i, upsample in enumerate(self.upsamples): + skips = skips_list.pop() + x = upsample(x, skips=skips, mapping=mapping, embedding=embedding, embedding_mask=embedding_mask, causal=causal) + + x += skips_list.pop() + x = self.to_out(x, mapping, causal=causal) + x = self.stft.decode1d(x) if self.use_stft else x + + return x + + +""" Conditioning Modules """ + + +class FixedEmbedding(nn.Module): + def __init__(self, max_length: int, features: int): + super().__init__() + self.max_length = max_length + self.embedding = nn.Embedding(max_length, features) + + def forward(self, x: Tensor) -> Tensor: + batch_size, length, device = *x.shape[0:2], x.device + assert_message = "Input sequence length must be <= max_length" + assert length <= self.max_length, assert_message + position = torch.arange(length, device=device) + fixed_embedding = self.embedding(position) + fixed_embedding = repeat(fixed_embedding, "n d -> b n d", b=batch_size) + return fixed_embedding + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class UNetCFG1d(UNet1d): + + """UNet1d with Classifier-Free Guidance""" + + def __init__( + self, + context_embedding_max_length: int, + context_embedding_features: int, + use_xattn_time: bool = False, + **kwargs, + ): + super().__init__( + context_embedding_features=context_embedding_features, **kwargs + ) + + self.use_xattn_time = use_xattn_time + + if use_xattn_time: + assert exists(context_embedding_features) + self.to_time_embedding = nn.Sequential( + TimePositionalEmbedding( + dim=kwargs["channels"], out_features=context_embedding_features + ), + nn.GELU(), + ) + + context_embedding_max_length += 1 # Add one for time embedding + + self.fixed_embedding = FixedEmbedding( + max_length=context_embedding_max_length, features=context_embedding_features + ) + + def forward( # type: ignore + self, + x: Tensor, + time: Tensor, + *, + embedding: Tensor, + embedding_mask: Optional[Tensor] = None, + embedding_scale: float = 1.0, + embedding_mask_proba: float = 0.0, + batch_cfg: bool = False, + rescale_cfg: bool = False, + scale_phi: float = 0.4, + negative_embedding: Optional[Tensor] = None, + negative_embedding_mask: Optional[Tensor] = None, + **kwargs, + ) -> Tensor: + b, device = embedding.shape[0], embedding.device + + if self.use_xattn_time: + embedding = torch.cat([embedding, self.to_time_embedding(time).unsqueeze(1)], dim=1) + + if embedding_mask is not None: + embedding_mask = torch.cat([embedding_mask, torch.ones((b, 1), device=device)], dim=1) + + fixed_embedding = self.fixed_embedding(embedding) + + if embedding_mask_proba > 0.0: + # Randomly mask embedding + batch_mask = rand_bool( + shape=(b, 1, 1), proba=embedding_mask_proba, device=device + ) + embedding = torch.where(batch_mask, fixed_embedding, embedding) + + if embedding_scale != 1.0: + if batch_cfg: + batch_x = torch.cat([x, x], dim=0) + batch_time = torch.cat([time, time], dim=0) + + if negative_embedding is not None: + if negative_embedding_mask is not None: + negative_embedding_mask = negative_embedding_mask.to(torch.bool).unsqueeze(2) + + negative_embedding = torch.where(negative_embedding_mask, negative_embedding, fixed_embedding) + + batch_embed = torch.cat([embedding, negative_embedding], dim=0) + + else: + batch_embed = torch.cat([embedding, fixed_embedding], dim=0) + + batch_mask = None + if embedding_mask is not None: + batch_mask = torch.cat([embedding_mask, embedding_mask], dim=0) + + batch_features = None + features = kwargs.pop("features", None) + if self.use_context_features: + batch_features = torch.cat([features, features], dim=0) + + batch_channels = None + channels_list = kwargs.pop("channels_list", None) + if self.use_context_channels: + batch_channels = [] + for channels in channels_list: + batch_channels += [torch.cat([channels, channels], dim=0)] + + # Compute both normal and fixed embedding outputs + batch_out = super().forward(batch_x, batch_time, embedding=batch_embed, embedding_mask=batch_mask, features=batch_features, channels_list=batch_channels, **kwargs) + out, out_masked = batch_out.chunk(2, dim=0) + + else: + # Compute both normal and fixed embedding outputs + out = super().forward(x, time, embedding=embedding, embedding_mask=embedding_mask, **kwargs) + out_masked = super().forward(x, time, embedding=fixed_embedding, embedding_mask=embedding_mask, **kwargs) + + out_cfg = out_masked + (out - out_masked) * embedding_scale + + if rescale_cfg: + + out_std = out.std(dim=1, keepdim=True) + out_cfg_std = out_cfg.std(dim=1, keepdim=True) + + return scale_phi * (out_cfg * (out_std/out_cfg_std)) + (1-scale_phi) * out_cfg + + else: + + return out_cfg + + else: + return super().forward(x, time, embedding=embedding, embedding_mask=embedding_mask, **kwargs) + + +class UNetNCCA1d(UNet1d): + + """UNet1d with Noise Channel Conditioning Augmentation""" + + def __init__(self, context_features: int, **kwargs): + super().__init__(context_features=context_features, **kwargs) + self.embedder = NumberEmbedder(features=context_features) + + def expand(self, x: Any, shape: Tuple[int, ...]) -> Tensor: + x = x if torch.is_tensor(x) else torch.tensor(x) + return x.expand(shape) + + def forward( # type: ignore + self, + x: Tensor, + time: Tensor, + *, + channels_list: Sequence[Tensor], + channels_augmentation: Union[ + bool, Sequence[bool], Sequence[Sequence[bool]], Tensor + ] = False, + channels_scale: Union[ + float, Sequence[float], Sequence[Sequence[float]], Tensor + ] = 0, + **kwargs, + ) -> Tensor: + b, n = x.shape[0], len(channels_list) + channels_augmentation = self.expand(channels_augmentation, shape=(b, n)).to(x) + channels_scale = self.expand(channels_scale, shape=(b, n)).to(x) + + # Augmentation (for each channel list item) + for i in range(n): + scale = channels_scale[:, i] * channels_augmentation[:, i] + scale = rearrange(scale, "b -> b 1 1") + item = channels_list[i] + channels_list[i] = torch.randn_like(item) * scale + item * (1 - scale) # type: ignore # noqa + + # Scale embedding (sum reduction if more than one channel list item) + channels_scale_emb = self.embedder(channels_scale) + channels_scale_emb = reduce(channels_scale_emb, "b n d -> b d", "sum") + + return super().forward( + x=x, + time=time, + channels_list=channels_list, + features=channels_scale_emb, + **kwargs, + ) + + +class UNetAll1d(UNetCFG1d, UNetNCCA1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, *args, **kwargs): # type: ignore + return UNetCFG1d.forward(self, *args, **kwargs) + + +def XUNet1d(type: str = "base", **kwargs) -> UNet1d: + if type == "base": + return UNet1d(**kwargs) + elif type == "all": + return UNetAll1d(**kwargs) + elif type == "cfg": + return UNetCFG1d(**kwargs) + elif type == "ncca": + return UNetNCCA1d(**kwargs) + else: + raise ValueError(f"Unknown XUNet1d type: {type}") + +class NumberEmbedder(nn.Module): + def __init__( + self, + features: int, + dim: int = 256, + ): + super().__init__() + self.features = features + self.embedding = TimePositionalEmbedding(dim=dim, out_features=features) + + def forward(self, x: Union[List[float], Tensor]) -> Tensor: + if not torch.is_tensor(x): + device = next(self.embedding.parameters()).device + x = torch.tensor(x, device=device) + assert isinstance(x, Tensor) + shape = x.shape + x = rearrange(x, "... -> (...)") + embedding = self.embedding(x) + x = embedding.view(*shape, self.features) + return x # type: ignore + + +""" +Audio Transforms +""" + + +class STFT(nn.Module): + """Helper for torch stft and istft""" + + def __init__( + self, + num_fft: int = 1023, + hop_length: int = 256, + window_length: Optional[int] = None, + length: Optional[int] = None, + use_complex: bool = False, + ): + super().__init__() + self.num_fft = num_fft + self.hop_length = default(hop_length, floor(num_fft // 4)) + self.window_length = default(window_length, num_fft) + self.length = length + self.register_buffer("window", torch.hann_window(self.window_length)) + self.use_complex = use_complex + + def encode(self, wave: Tensor) -> Tuple[Tensor, Tensor]: + b = wave.shape[0] + wave = rearrange(wave, "b c t -> (b c) t") + + stft = torch.stft( + wave, + n_fft=self.num_fft, + hop_length=self.hop_length, + win_length=self.window_length, + window=self.window, # type: ignore + return_complex=True, + normalized=True, + ) + + if self.use_complex: + # Returns real and imaginary + stft_a, stft_b = stft.real, stft.imag + else: + # Returns magnitude and phase matrices + magnitude, phase = torch.abs(stft), torch.angle(stft) + stft_a, stft_b = magnitude, phase + + return rearrange_many((stft_a, stft_b), "(b c) f l -> b c f l", b=b) + + def decode(self, stft_a: Tensor, stft_b: Tensor) -> Tensor: + b, l = stft_a.shape[0], stft_a.shape[-1] # noqa + length = closest_power_2(l * self.hop_length) + + stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> (b c) f l") + + if self.use_complex: + real, imag = stft_a, stft_b + else: + magnitude, phase = stft_a, stft_b + real, imag = magnitude * torch.cos(phase), magnitude * torch.sin(phase) + + stft = torch.stack([real, imag], dim=-1) + + wave = torch.istft( + stft, + n_fft=self.num_fft, + hop_length=self.hop_length, + win_length=self.window_length, + window=self.window, # type: ignore + length=default(self.length, length), + normalized=True, + ) + + return rearrange(wave, "(b c) t -> b c t", b=b) + + def encode1d( + self, wave: Tensor, stacked: bool = True + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + stft_a, stft_b = self.encode(wave) + stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> b (c f) l") + return torch.cat((stft_a, stft_b), dim=1) if stacked else (stft_a, stft_b) + + def decode1d(self, stft_pair: Tensor) -> Tensor: + f = self.num_fft // 2 + 1 + stft_a, stft_b = stft_pair.chunk(chunks=2, dim=1) + stft_a, stft_b = rearrange_many((stft_a, stft_b), "b (c f) l -> b c f l", f=f) + return self.decode(stft_a, stft_b) diff --git a/cosyvoice/flow/stable/blocks.py b/cosyvoice/flow/stable/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..3c827fd2441e643717d123847236d3d6c003ef4f --- /dev/null +++ b/cosyvoice/flow/stable/blocks.py @@ -0,0 +1,339 @@ +from functools import reduce +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from torch.backends.cuda import sdp_kernel +from packaging import version + +from dac.nn.layers import Snake1d + +class ResidualBlock(nn.Module): + def __init__(self, main, skip=None): + super().__init__() + self.main = nn.Sequential(*main) + self.skip = skip if skip else nn.Identity() + + def forward(self, input): + return self.main(input) + self.skip(input) + +class ResConvBlock(ResidualBlock): + def __init__(self, c_in, c_mid, c_out, is_last=False, kernel_size=5, conv_bias=True, use_snake=False): + skip = None if c_in == c_out else nn.Conv1d(c_in, c_out, 1, bias=False) + super().__init__([ + nn.Conv1d(c_in, c_mid, kernel_size, padding=kernel_size//2, bias=conv_bias), + nn.GroupNorm(1, c_mid), + Snake1d(c_mid) if use_snake else nn.GELU(), + nn.Conv1d(c_mid, c_out, kernel_size, padding=kernel_size//2, bias=conv_bias), + nn.GroupNorm(1, c_out) if not is_last else nn.Identity(), + (Snake1d(c_out) if use_snake else nn.GELU()) if not is_last else nn.Identity(), + ], skip) + +class SelfAttention1d(nn.Module): + def __init__(self, c_in, n_head=1, dropout_rate=0.): + super().__init__() + assert c_in % n_head == 0 + self.norm = nn.GroupNorm(1, c_in) + self.n_head = n_head + self.qkv_proj = nn.Conv1d(c_in, c_in * 3, 1) + self.out_proj = nn.Conv1d(c_in, c_in, 1) + self.dropout = nn.Dropout(dropout_rate, inplace=True) + + self.use_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0') + + if not self.use_flash: + return + + device_properties = torch.cuda.get_device_properties(torch.device('cuda')) + + if device_properties.major == 8 and device_properties.minor == 0: + # Use flash attention for A100 GPUs + self.sdp_kernel_config = (True, False, False) + else: + # Don't use flash attention for other GPUs + self.sdp_kernel_config = (False, True, True) + + def forward(self, input): + n, c, s = input.shape + qkv = self.qkv_proj(self.norm(input)) + qkv = qkv.view( + [n, self.n_head * 3, c // self.n_head, s]).transpose(2, 3) + q, k, v = qkv.chunk(3, dim=1) + scale = k.shape[3]**-0.25 + + if self.use_flash: + with sdp_kernel(*self.sdp_kernel_config): + y = F.scaled_dot_product_attention(q, k, v, is_causal=False).contiguous().view([n, c, s]) + else: + att = ((q * scale) @ (k.transpose(2, 3) * scale)).softmax(3) + y = (att @ v).transpose(2, 3).contiguous().view([n, c, s]) + + + return input + self.dropout(self.out_proj(y)) + +class SkipBlock(nn.Module): + def __init__(self, *main): + super().__init__() + self.main = nn.Sequential(*main) + + def forward(self, input): + return torch.cat([self.main(input), input], dim=1) + +class FourierFeatures(nn.Module): + def __init__(self, in_features, out_features, std=1.): + super().__init__() + assert out_features % 2 == 0 + self.weight = nn.Parameter(torch.randn( + [out_features // 2, in_features]) * std) + + def forward(self, input): + f = 2 * math.pi * input @ self.weight.T + return torch.cat([f.cos(), f.sin()], dim=-1) + +def expand_to_planes(input, shape): + return input[..., None].repeat([1, 1, shape[2]]) + +_kernels = { + 'linear': + [1 / 8, 3 / 8, 3 / 8, 1 / 8], + 'cubic': + [-0.01171875, -0.03515625, 0.11328125, 0.43359375, + 0.43359375, 0.11328125, -0.03515625, -0.01171875], + 'lanczos3': + [0.003689131001010537, 0.015056144446134567, -0.03399861603975296, + -0.066637322306633, 0.13550527393817902, 0.44638532400131226, + 0.44638532400131226, 0.13550527393817902, -0.066637322306633, + -0.03399861603975296, 0.015056144446134567, 0.003689131001010537] +} + +class Downsample1d(nn.Module): + def __init__(self, kernel='linear', pad_mode='reflect', channels_last=False): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor(_kernels[kernel]) + self.pad = kernel_1d.shape[0] // 2 - 1 + self.register_buffer('kernel', kernel_1d) + self.channels_last = channels_last + + def forward(self, x): + if self.channels_last: + x = x.permute(0, 2, 1) + x = F.pad(x, (self.pad,) * 2, self.pad_mode) + weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]]) + indices = torch.arange(x.shape[1], device=x.device) + weight[indices, indices] = self.kernel.to(weight) + x = F.conv1d(x, weight, stride=2) + if self.channels_last: + x = x.permute(0, 2, 1) + return x + + +class Upsample1d(nn.Module): + def __init__(self, kernel='linear', pad_mode='reflect', channels_last=False): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor(_kernels[kernel]) * 2 + self.pad = kernel_1d.shape[0] // 2 - 1 + self.register_buffer('kernel', kernel_1d) + self.channels_last = channels_last + + def forward(self, x): + if self.channels_last: + x = x.permute(0, 2, 1) + x = F.pad(x, ((self.pad + 1) // 2,) * 2, self.pad_mode) + weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]]) + indices = torch.arange(x.shape[1], device=x.device) + weight[indices, indices] = self.kernel.to(weight) + x = F.conv_transpose1d(x, weight, stride=2, padding=self.pad * 2 + 1) + if self.channels_last: + x = x.permute(0, 2, 1) + return x + +def Downsample1d_2( + in_channels: int, out_channels: int, factor: int, kernel_multiplier: int = 2 +) -> nn.Module: + assert kernel_multiplier % 2 == 0, "Kernel multiplier must be even" + + return nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=factor * kernel_multiplier + 1, + stride=factor, + padding=factor * (kernel_multiplier // 2), + ) + + +def Upsample1d_2( + in_channels: int, out_channels: int, factor: int, use_nearest: bool = False +) -> nn.Module: + + if factor == 1: + return nn.Conv1d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1 + ) + + if use_nearest: + return nn.Sequential( + nn.Upsample(scale_factor=factor, mode="nearest"), + nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + ), + ) + else: + return nn.ConvTranspose1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=factor * 2, + stride=factor, + padding=factor // 2 + factor % 2, + output_padding=factor % 2, + ) + +def zero_init(layer): + nn.init.zeros_(layer.weight) + if layer.bias is not None: + nn.init.zeros_(layer.bias) + return layer + +def rms_norm(x, scale, eps): + dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32)) + mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True) + scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps) + return x * scale.to(x.dtype) + +#rms_norm = torch.compile(rms_norm) + +class AdaRMSNorm(nn.Module): + def __init__(self, features, cond_features, eps=1e-6): + super().__init__() + self.eps = eps + self.linear = zero_init(nn.Linear(cond_features, features, bias=False)) + + def extra_repr(self): + return f"eps={self.eps}," + + def forward(self, x, cond): + return rms_norm(x, self.linear(cond)[:, None, :] + 1, self.eps) + +def normalize(x, eps=1e-4): + dim = list(range(1, x.ndim)) + n = torch.linalg.vector_norm(x, dim=dim, keepdim=True) + alpha = np.sqrt(n.numel() / x.numel()) + return x / torch.add(eps, n, alpha=alpha) + +class ForcedWNConv1d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1): + super().__init__() + self.weight = nn.Parameter(torch.randn([out_channels, in_channels, kernel_size])) + + def forward(self, x): + if self.training: + with torch.no_grad(): + self.weight.copy_(normalize(self.weight)) + + fan_in = self.weight[0].numel() + + w = normalize(self.weight) / math.sqrt(fan_in) + + return F.conv1d(x, w, padding='same') + +# Kernels + +use_compile = True + +def compile(function, *args, **kwargs): + if not use_compile: + return function + try: + return torch.compile(function, *args, **kwargs) + except RuntimeError: + return function + + +@compile +def linear_geglu(x, weight, bias=None): + x = x @ weight.mT + if bias is not None: + x = x + bias + x, gate = x.chunk(2, dim=-1) + return x * F.gelu(gate) + + +@compile +def rms_norm(x, scale, eps): + dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32)) + mean_sq = torch.mean(x.to(dtype)**2, dim=-1, keepdim=True) + scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps) + return x * scale.to(x.dtype) + +# Layers + +class LinearGEGLU(nn.Linear): + def __init__(self, in_features, out_features, bias=True): + super().__init__(in_features, out_features * 2, bias=bias) + self.out_features = out_features + + def forward(self, x): + return linear_geglu(x, self.weight, self.bias) + + +class RMSNorm(nn.Module): + def __init__(self, shape, fix_scale = False, eps=1e-6): + super().__init__() + self.eps = eps + + if fix_scale: + self.register_buffer("scale", torch.ones(shape)) + else: + self.scale = nn.Parameter(torch.ones(shape)) + + def extra_repr(self): + return f"shape={tuple(self.scale.shape)}, eps={self.eps}" + + def forward(self, x): + return rms_norm(x, self.scale, self.eps) + +def snake_beta(x, alpha, beta): + return x + (1.0 / (beta + 0.000000001)) * pow(torch.sin(x * alpha), 2) + +# try: +# snake_beta = torch.compile(snake_beta) +# except RuntimeError: +# pass + +# Adapted from https://github.com/NVIDIA/BigVGAN/blob/main/activations.py under MIT license +# License available in LICENSES/LICENSE_NVIDIA.txt +class SnakeBeta(nn.Module): + + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True): + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = nn.Parameter(torch.zeros(in_features) * alpha) + self.beta = nn.Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = nn.Parameter(torch.ones(in_features) * alpha) + self.beta = nn.Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = snake_beta(x, alpha, beta) + + return x \ No newline at end of file diff --git a/cosyvoice/flow/stable/dit.py b/cosyvoice/flow/stable/dit.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd5efa0f9430ca550b9845b2e8c13ae32534c2c --- /dev/null +++ b/cosyvoice/flow/stable/dit.py @@ -0,0 +1,415 @@ +import typing as tp + +import torch + +from einops import rearrange +from torch import nn +from torch.nn import functional as F +from x_transformers import ContinuousTransformerWrapper, Encoder + +from .blocks import FourierFeatures +from .transformer import ContinuousTransformer +from .transformer_use_mask import ContinuousTransformer as ContinuousTransformer_mask + + +class DiffusionTransformer(nn.Module): + def __init__(self, + io_channels=32, + patch_size=1, + embed_dim=768, + cond_token_dim=0, + project_cond_tokens=True, + global_cond_dim=0, + project_global_cond=True, + input_concat_dim=0, + prepend_cond_dim=0, + depth=12, + num_heads=8, + transformer_type: tp.Literal["x-transformers", "continuous_transformer"] = "x-transformers", + global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend", + **kwargs): + + super().__init__() + + self.cond_token_dim = cond_token_dim + + # Timestep embeddings + timestep_features_dim = 256 + + self.timestep_features = FourierFeatures(1, timestep_features_dim) + + self.to_timestep_embed = nn.Sequential( + nn.Linear(timestep_features_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + + if cond_token_dim > 0: + # Conditioning tokens + + cond_embed_dim = cond_token_dim if not project_cond_tokens else embed_dim + self.to_cond_embed = nn.Sequential( + nn.Linear(cond_token_dim, cond_embed_dim, bias=False), + nn.SiLU(), + nn.Linear(cond_embed_dim, cond_embed_dim, bias=False) + ) + else: + cond_embed_dim = 0 + self.to_cond_embed = nn.Identity() + + if global_cond_dim > 0: + # Global conditioning + global_embed_dim = global_cond_dim if not project_global_cond else embed_dim + self.to_global_embed = nn.Sequential( + nn.Linear(global_cond_dim, global_embed_dim, bias=False), + nn.SiLU(), + nn.Linear(global_embed_dim, global_embed_dim, bias=False) + ) + + if prepend_cond_dim > 0: + # Prepend conditioning + self.to_prepend_embed = nn.Sequential( + nn.Linear(prepend_cond_dim, embed_dim, bias=False), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=False) + ) + + self.input_concat_dim = input_concat_dim + + dim_in = io_channels + self.input_concat_dim + + self.patch_size = patch_size + + # Transformer + + self.transformer_type = transformer_type + + self.global_cond_type = global_cond_type + + if self.transformer_type == "x-transformers": + self.transformer = ContinuousTransformerWrapper( + dim_in=dim_in * patch_size, + dim_out=io_channels * patch_size, + max_seq_len=0, # Not relevant without absolute positional embeds + attn_layers=Encoder( + dim=embed_dim, + depth=depth, + heads=num_heads, + attn_flash=True, + cross_attend=cond_token_dim > 0, + dim_context=None if cond_embed_dim == 0 else cond_embed_dim, + zero_init_branch_output=True, + use_abs_pos_emb=False, + rotary_pos_emb=True, + ff_swish=True, + ff_glu=True, + **kwargs + ) + ) + + elif self.transformer_type == "continuous_transformer": + + global_dim = None + + if self.global_cond_type == "adaLN": + # The global conditioning is projected to the embed_dim already at this point + global_dim = embed_dim + + self.transformer = ContinuousTransformer( + dim=embed_dim, + depth=depth, + dim_heads=embed_dim // num_heads, + dim_in=dim_in * patch_size, + dim_out=io_channels * patch_size, + cross_attend=cond_token_dim > 0, + cond_token_dim=cond_embed_dim, + global_cond_dim=global_dim, + **kwargs + ) + elif self.transformer_type == "continuous_transformer_with_mask": + + global_dim = None + + if self.global_cond_type == "adaLN": + # The global conditioning is projected to the embed_dim already at this point + global_dim = embed_dim + + self.transformer = ContinuousTransformer_mask( + dim=embed_dim, + depth=depth, + dim_heads=embed_dim // num_heads, + dim_in=dim_in * patch_size, + dim_out=io_channels * patch_size, + cross_attend=cond_token_dim > 0, + cond_token_dim=cond_embed_dim, + global_cond_dim=global_dim, + **kwargs + ) + + else: + raise ValueError(f"Unknown transformer type: {self.transformer_type}") + + self.preprocess_conv = nn.Conv1d(dim_in, dim_in, 1, bias=False) + nn.init.zeros_(self.preprocess_conv.weight) + self.postprocess_conv = nn.Conv1d(io_channels, io_channels, 1, bias=False) + nn.init.zeros_(self.postprocess_conv.weight) + + def _forward( + self, + x, + t, + mask=None, + cross_attn_cond=None, + cross_attn_cond_mask=None, + input_concat_cond=None, + global_embed=None, + prepend_cond=None, + prepend_cond_mask=None, + return_info=False, + **kwargs): + ### 1. 需要重新写过以适应不同长度的con + if cross_attn_cond is not None: + cross_attn_cond = self.to_cond_embed(cross_attn_cond) + + if global_embed is not None: + # Project the global conditioning to the embedding dimension + global_embed = self.to_global_embed(global_embed) + + prepend_inputs = None + prepend_mask = None + prepend_length = 0 + if prepend_cond is not None: + # Project the prepend conditioning to the embedding dimension + prepend_cond = self.to_prepend_embed(prepend_cond) + + prepend_inputs = prepend_cond + if prepend_cond_mask is not None: + prepend_mask = prepend_cond_mask + + if input_concat_cond is not None: + + # Interpolate input_concat_cond to the same length as x + if input_concat_cond.shape[2] != x.shape[2]: + input_concat_cond = F.interpolate(input_concat_cond, (x.shape[2],), mode='nearest') + + x = torch.cat([x, input_concat_cond], dim=1) + + # Get the batch of timestep embeddings + try: + timestep_embed = self.to_timestep_embed(self.timestep_features(t[:, None])) # (b, embed_dim) + except Exception as e: + print("t.shape:", t.shape, "x.shape", x.shape) + print("t:", t) + raise e + + # Timestep embedding is considered a global embedding. Add to the global conditioning if it exists + if global_embed is not None: + global_embed = global_embed + timestep_embed + else: + global_embed = timestep_embed + + # Add the global_embed to the prepend inputs if there is no global conditioning support in the transformer + if self.global_cond_type == "prepend": + if prepend_inputs is None: + # Prepend inputs are just the global embed, and the mask is all ones + prepend_inputs = global_embed.unsqueeze(1) + prepend_mask = torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool) + else: + # Prepend inputs are the prepend conditioning + the global embed + prepend_inputs = torch.cat([prepend_inputs, global_embed.unsqueeze(1)], dim=1) + prepend_mask = torch.cat([prepend_mask, torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)], + dim=1) + + prepend_length = prepend_inputs.shape[1] + + x = self.preprocess_conv(x) + x + + x = rearrange(x, "b c t -> b t c") + + extra_args = {} + + if self.global_cond_type == "adaLN": + extra_args["global_cond"] = global_embed + + if self.patch_size > 1: + x = rearrange(x, "b (t p) c -> b t (c p)", p=self.patch_size) + + if self.transformer_type == "x-transformers": + output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, + context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, + **extra_args, **kwargs) + elif self.transformer_type in ["continuous_transformer","continuous_transformer_with_mask"] : + output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, + context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, + return_info=return_info, **extra_args, **kwargs) + + if return_info: + output, info = output + elif self.transformer_type == "mm_transformer": + output = self.transformer(x, context=cross_attn_cond, mask=mask, context_mask=cross_attn_cond_mask, + **extra_args, **kwargs) + + output = rearrange(output, "b t c -> b c t")[:, :, prepend_length:] + + if self.patch_size > 1: + output = rearrange(output, "b (c p) t -> b c (t p)", p=self.patch_size) + + output = self.postprocess_conv(output) + output + + if return_info: + return output, info + + return output + + def forward( + self, + x, + t, + cross_attn_cond=None, + cross_attn_cond_mask=None, + negative_cross_attn_cond=None, + negative_cross_attn_mask=None, + input_concat_cond=None, + global_embed=None, + negative_global_embed=None, + prepend_cond=None, + prepend_cond_mask=None, + cfg_scale=1.0, + cfg_dropout_prob=0.0, + causal=False, + scale_phi=0.0, + mask=None, + return_info=False, + **kwargs): + + assert causal == False, "Causal mode is not supported for DiffusionTransformer" + + if cross_attn_cond_mask is not None: + cross_attn_cond_mask = cross_attn_cond_mask.bool() + + cross_attn_cond_mask = None # Temporarily disabling conditioning masks due to kernel issue for flash attention + + if prepend_cond_mask is not None: + prepend_cond_mask = prepend_cond_mask.bool() + + # CFG dropout + if cfg_dropout_prob > 0.0: + if cross_attn_cond is not None: + null_embed = torch.zeros_like(cross_attn_cond, device=cross_attn_cond.device) + dropout_mask = torch.bernoulli( + torch.full((cross_attn_cond.shape[0], 1, 1), cfg_dropout_prob, device=cross_attn_cond.device)).to( + torch.bool) + cross_attn_cond = torch.where(dropout_mask, null_embed, cross_attn_cond) + + if prepend_cond is not None: + null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device) + dropout_mask = torch.bernoulli( + torch.full((prepend_cond.shape[0], 1, 1), cfg_dropout_prob, device=prepend_cond.device)).to( + torch.bool) + prepend_cond = torch.where(dropout_mask, null_embed, prepend_cond) + + if cfg_scale != 1.0 and (cross_attn_cond is not None or prepend_cond is not None): + # Classifier-free guidance + # Concatenate conditioned and unconditioned inputs on the batch dimension + batch_inputs = torch.cat([x, x], dim=0) + batch_timestep = torch.cat([t, t], dim=0) + + if global_embed is not None: + batch_global_cond = torch.cat([global_embed, global_embed], dim=0) + else: + batch_global_cond = None + + if input_concat_cond is not None: + batch_input_concat_cond = torch.cat([input_concat_cond, input_concat_cond], dim=0) + else: + batch_input_concat_cond = None + + batch_cond = None + batch_cond_masks = None + + # Handle CFG for cross-attention conditioning + if cross_attn_cond is not None: + + null_embed = torch.zeros_like(cross_attn_cond, device=cross_attn_cond.device) + + # For negative cross-attention conditioning, replace the null embed with the negative cross-attention conditioning + if negative_cross_attn_cond is not None: + + # If there's a negative cross-attention mask, set the masked tokens to the null embed + if negative_cross_attn_mask is not None: + negative_cross_attn_mask = negative_cross_attn_mask.to(torch.bool).unsqueeze(2) + + negative_cross_attn_cond = torch.where(negative_cross_attn_mask, negative_cross_attn_cond, + null_embed) + + batch_cond = torch.cat([cross_attn_cond, negative_cross_attn_cond], dim=0) + + else: + batch_cond = torch.cat([cross_attn_cond, null_embed], dim=0) + + if cross_attn_cond_mask is not None: + batch_cond_masks = torch.cat([cross_attn_cond_mask, cross_attn_cond_mask], dim=0) + + batch_prepend_cond = None + batch_prepend_cond_mask = None + + if prepend_cond is not None: + + null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device) + + batch_prepend_cond = torch.cat([prepend_cond, null_embed], dim=0) + + if prepend_cond_mask is not None: + batch_prepend_cond_mask = torch.cat([prepend_cond_mask, prepend_cond_mask], dim=0) + + if mask is not None: + batch_masks = torch.cat([mask, mask], dim=0) + else: + batch_masks = None + + batch_output = self._forward( + batch_inputs, + batch_timestep, + cross_attn_cond=batch_cond, + cross_attn_cond_mask=batch_cond_masks, + mask=batch_masks, + input_concat_cond=batch_input_concat_cond, + global_embed=batch_global_cond, + prepend_cond=batch_prepend_cond, + prepend_cond_mask=batch_prepend_cond_mask, + return_info=return_info, + **kwargs) + + if return_info: + batch_output, info = batch_output + + cond_output, uncond_output = torch.chunk(batch_output, 2, dim=0) + cfg_output = uncond_output + (cond_output - uncond_output) * cfg_scale + + # CFG Rescale + if scale_phi != 0.0: + cond_out_std = cond_output.std(dim=1, keepdim=True) + out_cfg_std = cfg_output.std(dim=1, keepdim=True) + output = scale_phi * (cfg_output * (cond_out_std / out_cfg_std)) + (1 - scale_phi) * cfg_output + else: + output = cfg_output + + if return_info: + return output, info + + return output + + else: + return self._forward( + x, + t, + cross_attn_cond=cross_attn_cond, + cross_attn_cond_mask=cross_attn_cond_mask, + input_concat_cond=input_concat_cond, + global_embed=global_embed, + prepend_cond=prepend_cond, + prepend_cond_mask=prepend_cond_mask, + mask=mask, + return_info=return_info, + **kwargs + ) diff --git a/cosyvoice/flow/stable/dit_v2.py b/cosyvoice/flow/stable/dit_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..b4baad9e5a91561a1c994f2000d75618fa97ba60 --- /dev/null +++ b/cosyvoice/flow/stable/dit_v2.py @@ -0,0 +1,307 @@ +import typing as tp + +import torch + +from einops import rearrange +from torch import nn +from torch.nn import functional as F +from x_transformers import ContinuousTransformerWrapper, Encoder + +from .blocks import FourierFeatures +from .transformer import ContinuousTransformer +from model.stable import transformer_use_mask + + +class DiffusionTransformerV2(nn.Module): + def __init__(self, + io_channels=32, + patch_size=1, + embed_dim=768, + cond_token_dim=0, + project_cond_tokens=True, + global_cond_dim=0, + project_global_cond=True, + input_concat_dim=0, + prepend_cond_dim=0, + depth=12, + num_heads=8, + transformer_type: tp.Literal["x-transformers", "continuous_transformer"] = "x-transformers", + global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend", + **kwargs): + + super().__init__() + d_model = embed_dim + n_head = num_heads + n_layers = depth + encoder_layer = torch.nn.TransformerEncoderLayer(batch_first=True, + norm_first=True, + d_model=d_model, + nhead=n_head) + self.transformer = torch.nn.TransformerEncoder(encoder_layer, num_layers=n_layers) + + # ===================================== timestep embedding + timestep_features_dim = 256 + self.timestep_features = FourierFeatures(1, timestep_features_dim) + self.to_timestep_embed = nn.Sequential( + nn.Linear(timestep_features_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + + + def _forward( + self, + Xt_btd, + t, #(1d) + mu_btd, + ): + + timestep_embed = self.to_timestep_embed(self.timestep_features(t[:, None])) # (b, embed_dim) + cated_input = torch.cat([t,mu,x_t]) + + ### 1. 需要重新写过以适应不同长度的con + if cross_attn_cond is not None: + cross_attn_cond = self.to_cond_embed(cross_attn_cond) + + if global_embed is not None: + # Project the global conditioning to the embedding dimension + global_embed = self.to_global_embed(global_embed) + + prepend_inputs = None + prepend_mask = None + prepend_length = 0 + if prepend_cond is not None: + # Project the prepend conditioning to the embedding dimension + prepend_cond = self.to_prepend_embed(prepend_cond) + + prepend_inputs = prepend_cond + if prepend_cond_mask is not None: + prepend_mask = prepend_cond_mask + + if input_concat_cond is not None: + + # Interpolate input_concat_cond to the same length as x + if input_concat_cond.shape[2] != x.shape[2]: + input_concat_cond = F.interpolate(input_concat_cond, (x.shape[2],), mode='nearest') + + x = torch.cat([x, input_concat_cond], dim=1) + + # Get the batch of timestep embeddings + try: + timestep_embed = self.to_timestep_embed(self.timestep_features(t[:, None])) # (b, embed_dim) + except Exception as e: + print("t.shape:", t.shape, "x.shape", x.shape) + print("t:", t) + raise e + + # Timestep embedding is considered a global embedding. Add to the global conditioning if it exists + if global_embed is not None: + global_embed = global_embed + timestep_embed + else: + global_embed = timestep_embed + + # Add the global_embed to the prepend inputs if there is no global conditioning support in the transformer + if self.global_cond_type == "prepend": + if prepend_inputs is None: + # Prepend inputs are just the global embed, and the mask is all ones + prepend_inputs = global_embed.unsqueeze(1) + prepend_mask = torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool) + else: + # Prepend inputs are the prepend conditioning + the global embed + prepend_inputs = torch.cat([prepend_inputs, global_embed.unsqueeze(1)], dim=1) + prepend_mask = torch.cat([prepend_mask, torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)], + dim=1) + + prepend_length = prepend_inputs.shape[1] + + x = self.preprocess_conv(x) + x + + x = rearrange(x, "b c t -> b t c") + + extra_args = {} + + if self.global_cond_type == "adaLN": + extra_args["global_cond"] = global_embed + + if self.patch_size > 1: + x = rearrange(x, "b (t p) c -> b t (c p)", p=self.patch_size) + + if self.transformer_type == "x-transformers": + output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, + context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, + **extra_args, **kwargs) + elif self.transformer_type in ["continuous_transformer", "continuous_transformer_with_mask"]: + output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, + context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, + return_info=return_info, **extra_args, **kwargs) + + if return_info: + output, info = output + elif self.transformer_type == "mm_transformer": + output = self.transformer(x, context=cross_attn_cond, mask=mask, context_mask=cross_attn_cond_mask, + **extra_args, **kwargs) + + output = rearrange(output, "b t c -> b c t")[:, :, prepend_length:] + + if self.patch_size > 1: + output = rearrange(output, "b (c p) t -> b c (t p)", p=self.patch_size) + + output = self.postprocess_conv(output) + output + + if return_info: + return output, info + + return output + + def forward( + self, + x, + t, + cross_attn_cond=None, + cross_attn_cond_mask=None, + negative_cross_attn_cond=None, + negative_cross_attn_mask=None, + input_concat_cond=None, + global_embed=None, + negative_global_embed=None, + prepend_cond=None, + prepend_cond_mask=None, + cfg_scale=1.0, + cfg_dropout_prob=0.0, + causal=False, + scale_phi=0.0, + mask=None, + return_info=False, + **kwargs): + + assert causal == False, "Causal mode is not supported for DiffusionTransformer" + + if cross_attn_cond_mask is not None: + cross_attn_cond_mask = cross_attn_cond_mask.bool() + + cross_attn_cond_mask = None # Temporarily disabling conditioning masks due to kernel issue for flash attention + + if prepend_cond_mask is not None: + prepend_cond_mask = prepend_cond_mask.bool() + + # CFG dropout + if cfg_dropout_prob > 0.0: + if cross_attn_cond is not None: + null_embed = torch.zeros_like(cross_attn_cond, device=cross_attn_cond.device) + dropout_mask = torch.bernoulli( + torch.full((cross_attn_cond.shape[0], 1, 1), cfg_dropout_prob, device=cross_attn_cond.device)).to( + torch.bool) + cross_attn_cond = torch.where(dropout_mask, null_embed, cross_attn_cond) + + if prepend_cond is not None: + null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device) + dropout_mask = torch.bernoulli( + torch.full((prepend_cond.shape[0], 1, 1), cfg_dropout_prob, device=prepend_cond.device)).to( + torch.bool) + prepend_cond = torch.where(dropout_mask, null_embed, prepend_cond) + + if cfg_scale != 1.0 and (cross_attn_cond is not None or prepend_cond is not None): + # Classifier-free guidance + # Concatenate conditioned and unconditioned inputs on the batch dimension + batch_inputs = torch.cat([x, x], dim=0) + batch_timestep = torch.cat([t, t], dim=0) + + if global_embed is not None: + batch_global_cond = torch.cat([global_embed, global_embed], dim=0) + else: + batch_global_cond = None + + if input_concat_cond is not None: + batch_input_concat_cond = torch.cat([input_concat_cond, input_concat_cond], dim=0) + else: + batch_input_concat_cond = None + + batch_cond = None + batch_cond_masks = None + + # Handle CFG for cross-attention conditioning + if cross_attn_cond is not None: + + null_embed = torch.zeros_like(cross_attn_cond, device=cross_attn_cond.device) + + # For negative cross-attention conditioning, replace the null embed with the negative cross-attention conditioning + if negative_cross_attn_cond is not None: + + # If there's a negative cross-attention mask, set the masked tokens to the null embed + if negative_cross_attn_mask is not None: + negative_cross_attn_mask = negative_cross_attn_mask.to(torch.bool).unsqueeze(2) + + negative_cross_attn_cond = torch.where(negative_cross_attn_mask, negative_cross_attn_cond, + null_embed) + + batch_cond = torch.cat([cross_attn_cond, negative_cross_attn_cond], dim=0) + + else: + batch_cond = torch.cat([cross_attn_cond, null_embed], dim=0) + + if cross_attn_cond_mask is not None: + batch_cond_masks = torch.cat([cross_attn_cond_mask, cross_attn_cond_mask], dim=0) + + batch_prepend_cond = None + batch_prepend_cond_mask = None + + if prepend_cond is not None: + + null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device) + + batch_prepend_cond = torch.cat([prepend_cond, null_embed], dim=0) + + if prepend_cond_mask is not None: + batch_prepend_cond_mask = torch.cat([prepend_cond_mask, prepend_cond_mask], dim=0) + + if mask is not None: + batch_masks = torch.cat([mask, mask], dim=0) + else: + batch_masks = None + + batch_output = self._forward( + batch_inputs, + batch_timestep, + cross_attn_cond=batch_cond, + cross_attn_cond_mask=batch_cond_masks, + mask=batch_masks, + input_concat_cond=batch_input_concat_cond, + global_embed=batch_global_cond, + prepend_cond=batch_prepend_cond, + prepend_cond_mask=batch_prepend_cond_mask, + return_info=return_info, + **kwargs) + + if return_info: + batch_output, info = batch_output + + cond_output, uncond_output = torch.chunk(batch_output, 2, dim=0) + cfg_output = uncond_output + (cond_output - uncond_output) * cfg_scale + + # CFG Rescale + if scale_phi != 0.0: + cond_out_std = cond_output.std(dim=1, keepdim=True) + out_cfg_std = cfg_output.std(dim=1, keepdim=True) + output = scale_phi * (cfg_output * (cond_out_std / out_cfg_std)) + (1 - scale_phi) * cfg_output + else: + output = cfg_output + + if return_info: + return output, info + + return output + + else: + return self._forward( + x, + t, + cross_attn_cond=cross_attn_cond, + cross_attn_cond_mask=cross_attn_cond_mask, + input_concat_cond=input_concat_cond, + global_embed=global_embed, + prepend_cond=prepend_cond, + prepend_cond_mask=prepend_cond_mask, + mask=mask, + return_info=return_info, + **kwargs + ) diff --git a/cosyvoice/flow/stable/sampling.py b/cosyvoice/flow/stable/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..2229e5089e3407a367df2d382ae039ca6364c489 --- /dev/null +++ b/cosyvoice/flow/stable/sampling.py @@ -0,0 +1,232 @@ +import torch +import math +from tqdm import trange, tqdm + +import k_diffusion as K + +# Define the noise schedule and sampling loop +def get_alphas_sigmas(t): + """Returns the scaling factors for the clean image (alpha) and for the + noise (sigma), given a timestep.""" + return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + +def alpha_sigma_to_t(alpha, sigma): + """Returns a timestep, given the scaling factors for the clean image and for + the noise.""" + return torch.atan2(sigma, alpha) / math.pi * 2 + +def t_to_alpha_sigma(t): + """Returns the scaling factors for the clean image and for the noise, given + a timestep.""" + return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + + +@torch.no_grad() +def sample_discrete_euler(model, x, steps, sigma_max=1, **extra_args): + """Draws samples from a model given starting noise. Euler method""" + + # Make tensor of ones to broadcast the single t values + ts = x.new_ones([x.shape[0]]) + + # Create the noise schedule + t = torch.linspace(sigma_max, 0, steps + 1) + + #alphas, sigmas = 1-t, t + + for t_curr, t_prev in tqdm(zip(t[:-1], t[1:])): + # Broadcast the current timestep to the correct shape + t_curr_tensor = t_curr * torch.ones( + (x.shape[0],), dtype=x.dtype, device=x.device + ) + dt = t_prev - t_curr # we solve backwards in our formulation + x = x + dt * model(x, t_curr_tensor, **extra_args) #.denoise(x, denoiser, t_curr_tensor, cond, uc) + + # If we are on the last timestep, output the denoised image + return x + +@torch.no_grad() +def sample(model, x, steps, eta, **extra_args): + """Draws samples from a model given starting noise. v-diffusion""" + ts = x.new_ones([x.shape[0]]) + + # Create the noise schedule + t = torch.linspace(1, 0, steps + 1)[:-1] + + alphas, sigmas = get_alphas_sigmas(t) + + # The sampling loop + for i in trange(steps): + + # Get the model output (v, the predicted velocity) + with torch.cuda.amp.autocast(): + v = model(x, ts * t[i], **extra_args).float() + + # Predict the noise and the denoised image + pred = x * alphas[i] - v * sigmas[i] + eps = x * sigmas[i] + v * alphas[i] + + # If we are not on the last timestep, compute the noisy image for the + # next timestep. + if i < steps - 1: + # If eta > 0, adjust the scaling factor for the predicted noise + # downward according to the amount of additional noise to add + ddim_sigma = eta * (sigmas[i + 1]**2 / sigmas[i]**2).sqrt() * \ + (1 - alphas[i]**2 / alphas[i + 1]**2).sqrt() + adjusted_sigma = (sigmas[i + 1]**2 - ddim_sigma**2).sqrt() + + # Recombine the predicted noise and predicted denoised image in the + # correct proportions for the next step + x = pred * alphas[i + 1] + eps * adjusted_sigma + + # Add the correct amount of fresh noise + if eta: + x += torch.randn_like(x) * ddim_sigma + + # If we are on the last timestep, output the denoised image + return pred + +# Soft mask inpainting is just shrinking hard (binary) mask inpainting +# Given a float-valued soft mask (values between 0 and 1), get the binary mask for this particular step +def get_bmask(i, steps, mask): + strength = (i+1)/(steps) + # convert to binary mask + bmask = torch.where(mask<=strength,1,0) + return bmask + +def make_cond_model_fn(model, cond_fn): + def cond_model_fn(x, sigma, **kwargs): + with torch.enable_grad(): + x = x.detach().requires_grad_() + denoised = model(x, sigma, **kwargs) + cond_grad = cond_fn(x, sigma, denoised=denoised, **kwargs).detach() + cond_denoised = denoised.detach() + cond_grad * K.utils.append_dims(sigma**2, x.ndim) + return cond_denoised + return cond_model_fn + +# Uses k-diffusion from https://github.com/crowsonkb/k-diffusion +# init_data is init_audio as latents (if this is latent diffusion) +# For sampling, set both init_data and mask to None +# For variations, set init_data +# For inpainting, set both init_data & mask +def sample_k( + model_fn, + noise, + init_data=None, + mask=None, + steps=100, + sampler_type="dpmpp-2m-sde", + sigma_min=0.5, + sigma_max=50, + rho=1.0, device="cuda", + callback=None, + cond_fn=None, + **extra_args + ): + + denoiser = K.external.VDenoiser(model_fn) + + if cond_fn is not None: + denoiser = make_cond_model_fn(denoiser, cond_fn) + + # Make the list of sigmas. Sigma values are scalars related to the amount of noise each denoising step has + sigmas = K.sampling.get_sigmas_polyexponential(steps, sigma_min, sigma_max, rho, device=device) + # Scale the initial noise by sigma + noise = noise * sigmas[0] + + wrapped_callback = callback + + if mask is None and init_data is not None: + # VARIATION (no inpainting) + # set the initial latent to the init_data, and noise it with initial sigma + x = init_data + noise + elif mask is not None and init_data is not None: + # INPAINTING + bmask = get_bmask(0, steps, mask) + # initial noising + input_noised = init_data + noise + # set the initial latent to a mix of init_data and noise, based on step 0's binary mask + x = input_noised * bmask + noise * (1-bmask) + # define the inpainting callback function (Note: side effects, it mutates x) + # See https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L596C13-L596C105 + # callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + # This is called immediately after `denoised = model(x, sigmas[i] * s_in, **extra_args)` + def inpainting_callback(args): + i = args["i"] + x = args["x"] + sigma = args["sigma"] + #denoised = args["denoised"] + # noise the init_data input with this step's appropriate amount of noise + input_noised = init_data + torch.randn_like(init_data) * sigma + # shrinking hard mask + bmask = get_bmask(i, steps, mask) + # mix input_noise with x, using binary mask + new_x = input_noised * bmask + x * (1-bmask) + # mutate x + x[:,:,:] = new_x[:,:,:] + # wrap together the inpainting callback and the user-submitted callback. + if callback is None: + wrapped_callback = inpainting_callback + else: + wrapped_callback = lambda args: (inpainting_callback(args), callback(args)) + else: + # SAMPLING + # set the initial latent to noise + x = noise + + + with torch.cuda.amp.autocast(): + if sampler_type == "k-heun": + return K.sampling.sample_heun(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "k-lms": + return K.sampling.sample_lms(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "k-dpmpp-2s-ancestral": + return K.sampling.sample_dpmpp_2s_ancestral(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "k-dpm-2": + return K.sampling.sample_dpm_2(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "k-dpm-fast": + return K.sampling.sample_dpm_fast(denoiser, x, sigma_min, sigma_max, steps, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "k-dpm-adaptive": + return K.sampling.sample_dpm_adaptive(denoiser, x, sigma_min, sigma_max, rtol=0.01, atol=0.01, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "dpmpp-2m-sde": + return K.sampling.sample_dpmpp_2m_sde(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + elif sampler_type == "dpmpp-3m-sde": + return K.sampling.sample_dpmpp_3m_sde(denoiser, x, sigmas, disable=False, callback=wrapped_callback, extra_args=extra_args) + +# Uses discrete Euler sampling for rectified flow models +# init_data is init_audio as latents (if this is latent diffusion) +# For sampling, set both init_data and mask to None +# For variations, set init_data +# For inpainting, set both init_data & mask +def sample_rf( + model_fn, + noise, + init_data=None, + steps=100, + sigma_max=1, + device="cuda", + callback=None, + cond_fn=None, + **extra_args + ): + + if sigma_max > 1: + sigma_max = 1 + + if cond_fn is not None: + denoiser = make_cond_model_fn(denoiser, cond_fn) + + wrapped_callback = callback + + if init_data is not None: + # VARIATION (no inpainting) + # Interpolate the init data and the noise for init audio + x = init_data * (1 - sigma_max) + noise * sigma_max + else: + # SAMPLING + # set the initial latent to noise + x = noise + + with torch.cuda.amp.autocast(): + # TODO: Add callback support + #return sample_discrete_euler(model_fn, x, steps, sigma_max, callback=wrapped_callback, **extra_args) + return sample_discrete_euler(model_fn, x, steps, sigma_max, **extra_args) \ No newline at end of file diff --git a/cosyvoice/flow/stable/stable_diffusion.py b/cosyvoice/flow/stable/stable_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..732ef506c5e95ada6566176d9ffc3ea99a4cb7ab --- /dev/null +++ b/cosyvoice/flow/stable/stable_diffusion.py @@ -0,0 +1,109 @@ +import torch +from torch.nn import functional as F +from .dit import DiffusionTransformer +from .adp import UNet1d +from .sampling import sample +import math +from model.base import BaseModule +import pdb + +target_length = 1536 + + +def pad_and_create_mask(matrix, target_length): + T = matrix.shape[2] + if T > target_length: + raise ValueError("The third dimension length %s should not exceed %s" % (T, target_length)) + + padding_size = target_length - T + + padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) + + mask = torch.ones((1, target_length)) + mask[:, T:] = 0 # Set the padding part to 0 + + return padded_matrix.to(matrix.device), mask.to(matrix.device) + + +class Stable_Diffusion(BaseModule): + def __init__(self, io_channels, input_concat_dim=None, embed_dim=768, depth=24, num_heads=24, + project_cond_tokens=False, transformer_type="continuous_transformer"): + super(Stable_Diffusion, self).__init__() + self.diffusion = DiffusionTransformer( + io_channels=io_channels, + input_concat_dim=input_concat_dim, + embed_dim=embed_dim, + # cond_token_dim=target_length, + depth=depth, + num_heads=num_heads, + project_cond_tokens=project_cond_tokens, + transformer_type=transformer_type, + ) + # self.diffusion = UNet1d( + # in_channels=80, + # channels=256, + # resnet_groups=16, + # kernel_multiplier_downsample=2, + # multipliers=[4, 4, 4, 5, 5], + # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 + # num_blocks=[2, 2, 2, 2], + # attentions=[1, 3, 3, 3, 3], + # attention_heads=16, + # attention_multiplier=4, + # use_nearest_upsample=False, + # use_skip_scale=True, + # use_context_time=True + # ) + self.rng = torch.quasirandom.SobolEngine(1, scramble=True) + + @torch.no_grad() + def forward(self, mu, mask, n_timesteps): + # pdb.set_trace() + mask = mask.squeeze(1) + noise = torch.randn_like(mu).to(mu.device) + # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) + # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} + extra_args = {"input_concat_cond": mu, "mask": mask} + fakes = sample(self.diffusion, noise, n_timesteps, 0, **extra_args) + + return fakes + + def compute_loss(self, x0, mask, mu): + + # pdb.set_trace() + t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) + alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + + alphas = alphas[:, None, None] + sigmas = sigmas[:, None, None] + noise = torch.randn_like(x0) + noised_inputs = x0 * alphas + noise * sigmas + targets = noise * alphas - x0 * sigmas + mask = mask.squeeze(1) + # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) + # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, + # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) + # pdb.set_trace() + output = self.diffusion(noised_inputs, # [bs, 80, 229] + t, # (bs,) + input_concat_cond=mu, + mask=mask, # [bs, 229] + cfg_dropout_prob=0.1) + + return self.mse_loss(output, targets, mask), output + + def mse_loss(self, output, targets, mask): + + mse_loss = F.mse_loss(output, targets, reduction='none') + + if mask.ndim == 2 and mse_loss.ndim == 3: + mask = mask.unsqueeze(1) + + if mask.shape[1] != mse_loss.shape[1]: + mask = mask.repeat(1, mse_loss.shape[1], 1) + + mse_loss = mse_loss * mask + + mse_loss = mse_loss.mean() + + return mse_loss diff --git a/cosyvoice/flow/stable/stable_diffusion_test.py b/cosyvoice/flow/stable/stable_diffusion_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fb79da70d084ba6571a12fb51f15bb331ea8d7 --- /dev/null +++ b/cosyvoice/flow/stable/stable_diffusion_test.py @@ -0,0 +1,104 @@ +import torch +from torch.nn import functional as F +from .dit import DiffusionTransformer +from .adp import UNet1d +from .sampling import sample +import math +from model.base import BaseModule +import pdb + +target_length = 1536 +def pad_and_create_mask(matrix, target_length): + + T = matrix.shape[2] + if T > target_length: + raise ValueError("The third dimension length %s should not exceed %s"%(T, target_length)) + + padding_size = target_length - T + + padded_matrix = F.pad(matrix, (0, padding_size), "constant", 0) + + mask = torch.ones((1, target_length)) + mask[:, T:] = 0 # Set the padding part to 0 + + return padded_matrix.to(matrix.device), mask.to(matrix.device) + + +class Stable_Diffusion(BaseModule): + def __init__(self): + super(Stable_Diffusion, self).__init__() + self.diffusion = DiffusionTransformer( + io_channels=80, + # input_concat_dim=80, + embed_dim=768, + # cond_token_dim=target_length, + depth=24, + num_heads=24, + project_cond_tokens=False, + transformer_type="continuous_transformer", + ) + # self.diffusion = UNet1d( + # in_channels=80, + # channels=256, + # resnet_groups=16, + # kernel_multiplier_downsample=2, + # multipliers=[4, 4, 4, 5, 5], + # factors=[1, 2, 2, 4], # 输入长度不一致卷积缩短 + # num_blocks=[2, 2, 2, 2], + # attentions=[1, 3, 3, 3, 3], + # attention_heads=16, + # attention_multiplier=4, + # use_nearest_upsample=False, + # use_skip_scale=True, + # use_context_time=True + # ) + self.rng = torch.quasirandom.SobolEngine(1, scramble=True) + + @torch.no_grad() + def forward(self, mu, mask, n_timesteps): + # pdb.set_trace() + mask = mask.squeeze(1) + # noise = torch.randn_like(mu).to(mu.device) + # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) + # extra_args = {"cross_attn_cond": mu, "cross_attn_cond_mask": mask, "mask": mask} + extra_args = {"mask": mask} + fakes = sample(self.diffusion, mu, n_timesteps, 0, **extra_args) + + return fakes + + + def compute_loss(self, x0, mask, mu): + + # pdb.set_trace() + t = self.rng.draw(x0.shape[0])[:, 0].to(x0.device) + alphas, sigmas = torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + + alphas = alphas[:, None, None] + sigmas = sigmas[:, None, None] + noise = torch.randn_like(x0) + noised_inputs = x0 * alphas + noise * sigmas + targets = mu * alphas - x0 * sigmas + mask = mask.squeeze(1) + # mu_pad, mu_pad_mask = pad_and_create_mask(mu, target_length) + # output = self.diffusion(noised_inputs, t, cross_attn_cond=mu, + # cross_attn_cond_mask=mask, mask=mask, cfg_dropout_prob=0.1) + output = self.diffusion(noised_inputs, t, mask=mask, cfg_dropout_prob=0.1) + + return self.mse_loss(output, targets, mask), output + + + def mse_loss(self, output, targets, mask): + + mse_loss = F.mse_loss(output, targets, reduction='none') + + if mask.ndim == 2 and mse_loss.ndim == 3: + mask = mask.unsqueeze(1) + + if mask.shape[1] != mse_loss.shape[1]: + mask = mask.repeat(1, mse_loss.shape[1], 1) + + mse_loss = mse_loss[mask] + + mse_loss = mse_loss.mean() + + return mse_loss \ No newline at end of file diff --git a/cosyvoice/flow/stable/transformer.py b/cosyvoice/flow/stable/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..417beeb78dc70d69fcbb323acbc8d9d7f9e22a09 --- /dev/null +++ b/cosyvoice/flow/stable/transformer.py @@ -0,0 +1,816 @@ +import pdb +from functools import reduce, partial +from packaging import version + +from einops import rearrange, repeat +from einops.layers.torch import Rearrange +import torch +import torch.nn.functional as F +from torch import nn, einsum +from torch.cuda.amp import autocast +from typing import Callable, Literal + +try: + from flash_attn import flash_attn_func, flash_attn_kvpacked_func +except ImportError as e: + print(e) + print('flash_attn not installed, disabling Flash Attention') + flash_attn_kvpacked_func = None + flash_attn_func = None + +try: + import natten +except ImportError: + natten = None + +def checkpoint(function, *args, **kwargs): + kwargs.setdefault("use_reentrant", False) + return torch.utils.checkpoint.checkpoint(function, *args, **kwargs) + + +# Copied and modified from https://github.com/lucidrains/x-transformers/blob/main/x_transformers/attend.py under MIT License +# License can be found in LICENSES/LICENSE_XTRANSFORMERS.txt + +def create_causal_mask(i, j, device): + return torch.ones((i, j), device = device, dtype = torch.bool).triu(j - i + 1) + +def or_reduce(masks): + head, *body = masks + for rest in body: + head = head | rest + return head + +# positional embeddings + +class AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + self.scale = dim ** -0.5 + self.max_seq_len = max_seq_len + self.emb = nn.Embedding(max_seq_len, dim) + + def forward(self, x, pos = None, seq_start_pos = None): + seq_len, device = x.shape[1], x.device + assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}' + + if pos is None: + pos = torch.arange(seq_len, device = device) + + if seq_start_pos is not None: + pos = (pos - seq_start_pos[..., None]).clamp(min = 0) + + pos_emb = self.emb(pos) + pos_emb = pos_emb * self.scale + return pos_emb + +class ScaledSinusoidalEmbedding(nn.Module): + def __init__(self, dim, theta = 10000): + super().__init__() + assert (dim % 2) == 0, 'dimension must be divisible by 2' + self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5) + + half_dim = dim // 2 + freq_seq = torch.arange(half_dim).float() / half_dim + inv_freq = theta ** -freq_seq + self.register_buffer('inv_freq', inv_freq, persistent = False) + + def forward(self, x, pos = None, seq_start_pos = None): + seq_len, device = x.shape[1], x.device + + if pos is None: + pos = torch.arange(seq_len, device = device) + + if seq_start_pos is not None: + pos = pos - seq_start_pos[..., None] + + emb = einsum('i, j -> i j', pos, self.inv_freq) + emb = torch.cat((emb.sin(), emb.cos()), dim = -1) + return emb * self.scale + +class RotaryEmbedding(nn.Module): + def __init__( + self, + dim, + use_xpos = False, + scale_base = 512, + interpolation_factor = 1., + base = 10000, + base_rescale_factor = 1. + ): + super().__init__() + # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning + # has some connection to NTK literature + # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + base *= base_rescale_factor ** (dim / (dim - 2)) + + inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + assert interpolation_factor >= 1. + self.interpolation_factor = interpolation_factor + + if not use_xpos: + self.register_buffer('scale', None) + return + + scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) + + self.scale_base = scale_base + self.register_buffer('scale', scale) + + def forward_from_seq_len(self, seq_len): + device = self.inv_freq.device + + t = torch.arange(seq_len, device = device) + return self.forward(t) + + @autocast(enabled = False) + def forward(self, t): + device = self.inv_freq.device + + t = t.to(torch.float32) + + t = t / self.interpolation_factor + + freqs = torch.einsum('i , j -> i j', t, self.inv_freq) + freqs = torch.cat((freqs, freqs), dim = -1) + + if self.scale is None: + return freqs, 1. + + power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base + scale = self.scale ** rearrange(power, 'n -> n 1') + scale = torch.cat((scale, scale), dim = -1) + + return freqs, scale + +def rotate_half(x): + x = rearrange(x, '... (j d) -> ... j d', j = 2) + x1, x2 = x.unbind(dim = -2) + return torch.cat((-x2, x1), dim = -1) + +@autocast(enabled = False) +def apply_rotary_pos_emb(t, freqs, scale = 1): + out_dtype = t.dtype + + # cast to float32 if necessary for numerical stability + dtype = reduce(torch.promote_types, (t.dtype, freqs.dtype, torch.float32)) + rot_dim, seq_len = freqs.shape[-1], t.shape[-2] + freqs, t = freqs.to(dtype), t.to(dtype) + freqs = freqs[-seq_len:, :] + + if t.ndim == 4 and freqs.ndim == 3: + freqs = rearrange(freqs, 'b n d -> b 1 n d') + + # partial rotary embeddings, Wang et al. GPT-J + t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:] + t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) + + t, t_unrotated = t.to(out_dtype), t_unrotated.to(out_dtype) + + return torch.cat((t, t_unrotated), dim = -1) + +# norms +class LayerNorm(nn.Module): + def __init__(self, dim, bias=False, fix_scale=False): + """ + bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less + """ + super().__init__() + + if fix_scale: + self.register_buffer("gamma", torch.ones(dim)) + else: + self.gamma = nn.Parameter(torch.ones(dim)) + + if bias: + self.beta = nn.Parameter(torch.zeros(dim)) + else: + self.register_buffer("beta", torch.zeros(dim)) + + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], weight=self.gamma, bias=self.beta) + +# feedforward + +class GLU(nn.Module): + def __init__( + self, + dim_in, + dim_out, + activation: Callable, + use_conv = False, + conv_kernel_size = 3, + ): + super().__init__() + self.act = activation + self.proj = nn.Linear(dim_in, dim_out * 2) if not use_conv else nn.Conv1d(dim_in, dim_out * 2, conv_kernel_size, padding = (conv_kernel_size // 2)) + self.use_conv = use_conv + + def forward(self, x): + if self.use_conv: + x = rearrange(x, 'b n d -> b d n') + x = self.proj(x) + x = rearrange(x, 'b d n -> b n d') + else: + x = self.proj(x) + + x, gate = x.chunk(2, dim = -1) + return x * self.act(gate) + +class FeedForward(nn.Module): + def __init__( + self, + dim, + dim_out = None, + mult = 4, + no_bias = False, + glu = True, + use_conv = False, + conv_kernel_size = 3, + zero_init_output = True, + ): + super().__init__() + inner_dim = int(dim * mult) + + # Default to SwiGLU + + activation = nn.SiLU() + + dim_out = dim if dim_out is None else dim_out + + if glu: + linear_in = GLU(dim, inner_dim, activation) + else: + linear_in = nn.Sequential( + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + nn.Linear(dim, inner_dim, bias = not no_bias) if not use_conv else nn.Conv1d(dim, inner_dim, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias), + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + activation + ) + + linear_out = nn.Linear(inner_dim, dim_out, bias = not no_bias) if not use_conv else nn.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias) + + # init last linear layer to 0 + if zero_init_output: + nn.init.zeros_(linear_out.weight) + if not no_bias: + nn.init.zeros_(linear_out.bias) + + + self.ff = nn.Sequential( + linear_in, + Rearrange('b d n -> b n d') if use_conv else nn.Identity(), + linear_out, + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + ) + + def forward(self, x): + return self.ff(x) + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_heads = 64, + dim_context = None, + causal = False, + zero_init_output=True, + qk_norm: Literal['l2', 'ln', 'none'] = 'none', + natten_kernel_size = None + ): + super().__init__() + self.dim = dim + self.dim_heads = dim_heads + self.causal = causal + + dim_kv = dim_context if dim_context is not None else dim + + self.num_heads = dim // dim_heads + self.kv_heads = dim_kv // dim_heads + + if dim_context is not None: + self.to_q = nn.Linear(dim, dim, bias=False) + self.to_kv = nn.Linear(dim_kv, dim_kv * 2, bias=False) + else: + self.to_qkv = nn.Linear(dim, dim * 3, bias=False) + + self.to_out = nn.Linear(dim, dim, bias=False) + + if zero_init_output: + nn.init.zeros_(self.to_out.weight) + + self.qk_norm = qk_norm + + if self.qk_norm == "ln": + self.q_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6) + self.k_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6) + + # Using 1d neighborhood attention + self.natten_kernel_size = natten_kernel_size + if natten_kernel_size is not None: + return + + self.use_pt_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0') + + self.use_fa_flash = torch.cuda.is_available() and flash_attn_func is not None + # pdb.set_trace() + self.use_fa_flash = False + + self.sdp_kwargs = dict( + enable_flash = True, + enable_math = True, + enable_mem_efficient = True + ) + + def flash_attn( + self, + q, + k, + v, + mask = None, + causal = None + ): + batch, heads, q_len, _, k_len, device = *q.shape, k.shape[-2], q.device + kv_heads = k.shape[1] + # Recommended for multi-query single-key-value attention by Tri Dao + # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64]) + + if heads != kv_heads: + # Repeat interleave kv_heads to match q_heads + heads_per_kv_head = heads // kv_heads + k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v)) + + if k.ndim == 3: + k = rearrange(k, 'b ... -> b 1 ...').expand_as(q) + + if v.ndim == 3: + v = rearrange(v, 'b ... -> b 1 ...').expand_as(q) + + causal = self.causal if causal is None else causal + + if q_len == 1 and causal: + causal = False + + if mask is not None: + assert mask.ndim == 4 + mask = mask.expand(batch, heads, q_len, k_len) + + # handle kv cache - this should be bypassable in updated flash attention 2 + + if k_len > q_len and causal: + causal_mask = self.create_causal_mask(q_len, k_len, device = device) + if mask is None: + mask = ~causal_mask + else: + mask = mask & ~causal_mask + causal = False + + # manually handle causal mask, if another mask was given + + row_is_entirely_masked = None + + if mask is not None and causal: + causal_mask = self.create_causal_mask(q_len, k_len, device = device) + mask = mask & ~causal_mask + + # protect against an entire row being masked out + + row_is_entirely_masked = ~mask.any(dim = -1) + mask[..., 0] = mask[..., 0] | row_is_entirely_masked + + causal = False + + with torch.backends.cuda.sdp_kernel(**self.sdp_kwargs): + out = F.scaled_dot_product_attention( + q, k, v, + attn_mask = mask, + is_causal = causal + ) + + # for a row that is entirely masked out, should zero out the output of that row token + + if row_is_entirely_masked is not None: + out = out.masked_fill(row_is_entirely_masked[..., None], 0.) + + return out + + def forward( + self, + x, + context = None, + mask = None, + context_mask = None, + rotary_pos_emb = None, + causal = None + ): + h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None + + kv_input = context if has_context else x + + if hasattr(self, 'to_q'): + # Use separate linear projections for q and k/v + q = self.to_q(x) + q = rearrange(q, 'b n (h d) -> b h n d', h = h) + + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + + k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v)) + else: + # Use fused linear projection + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) + + # Normalize q and k for cosine sim attention + if self.qk_norm == "l2": + q = F.normalize(q, dim=-1) + k = F.normalize(k, dim=-1) + elif self.qk_norm == "ln": + q = self.q_norm(q) + k = self.k_norm(k) + + if rotary_pos_emb is not None and not has_context: + freqs, _ = rotary_pos_emb + + q_dtype = q.dtype + k_dtype = k.dtype + + q = q.to(torch.float32) + k = k.to(torch.float32) + freqs = freqs.to(torch.float32) + + q = apply_rotary_pos_emb(q, freqs) + k = apply_rotary_pos_emb(k, freqs) + + q = q.to(q_dtype) + k = k.to(k_dtype) + + input_mask = context_mask + + if input_mask is None and not has_context: + input_mask = mask + + # determine masking + masks = [] + final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account + + if input_mask is not None: + input_mask = rearrange(input_mask, 'b j -> b 1 1 j') + masks.append(~input_mask) + + # Other masks will be added here later + + if len(masks) > 0: + final_attn_mask = ~or_reduce(masks) + + n, device = q.shape[-2], q.device + + causal = self.causal if causal is None else causal + + if n == 1 and causal: + causal = False + + if self.natten_kernel_size is not None: + if natten is None: + raise ImportError('natten not installed, please install natten to use neighborhood attention') + + dtype_in = q.dtype + q, k, v = map(lambda t: t.to(torch.float32), (q, k, v)) + + attn = natten.functional.natten1dqk(q, k, kernel_size = self.natten_kernel_size, dilation=1) + + if final_attn_mask is not None: + attn = attn.masked_fill(final_attn_mask, -torch.finfo(attn.dtype).max) + + attn = F.softmax(attn, dim=-1, dtype=torch.float32) + + out = natten.functional.natten1dav(attn, v, kernel_size = self.natten_kernel_size, dilation=1).to(dtype_in) + + # Prioritize Flash Attention 2 + elif self.use_fa_flash: + # pdb.set_trace() + assert final_attn_mask is None, 'masking not yet supported for Flash Attention 2' + # Flash Attention 2 requires FP16 inputs + fa_dtype_in = q.dtype + q, k, v = map(lambda t: rearrange(t, 'b h n d -> b n h d').to(torch.float16), (q, k, v)) + + out = flash_attn_func(q, k, v, causal = causal) + + out = rearrange(out.to(fa_dtype_in), 'b n h d -> b h n d') + + # Fall back to PyTorch implementation + elif self.use_pt_flash: + out = self.flash_attn(q, k, v, causal = causal, mask = final_attn_mask) + + else: + # Fall back to custom implementation + + if h != kv_h: + # Repeat interleave kv_heads to match q_heads + heads_per_kv_head = h // kv_h + k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v)) + + scale = 1. / (q.shape[-1] ** 0.5) + + kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d' + + dots = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * scale + + i, j, dtype = *dots.shape[-2:], dots.dtype + + mask_value = -torch.finfo(dots.dtype).max + + if final_attn_mask is not None: + dots = dots.masked_fill(~final_attn_mask, mask_value) + + if causal: + causal_mask = self.create_causal_mask(i, j, device = device) + dots = dots.masked_fill(causal_mask, mask_value) + + attn = F.softmax(dots, dim=-1, dtype=torch.float32) + attn = attn.type(dtype) + + out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v) + + # merge heads + out = rearrange(out, ' b h n d -> b n (h d)') + + # Communicate between heads + + # with autocast(enabled = False): + # out_dtype = out.dtype + # out = out.to(torch.float32) + # out = self.to_out(out).to(out_dtype) + out = self.to_out(out) + + if mask is not None: + mask = rearrange(mask, 'b n -> b n 1') + out = out.masked_fill(~mask, 0.) + + return out + +class ConformerModule(nn.Module): + def __init__( + self, + dim, + norm_kwargs = {}, + ): + + super().__init__() + + self.dim = dim + + self.in_norm = LayerNorm(dim, **norm_kwargs) + self.pointwise_conv = nn.Conv1d(dim, dim, kernel_size=1, bias=False) + self.glu = GLU(dim, dim, nn.SiLU()) + self.depthwise_conv = nn.Conv1d(dim, dim, kernel_size=17, groups=dim, padding=8, bias=False) + self.mid_norm = LayerNorm(dim, **norm_kwargs) # This is a batch norm in the original but I don't like batch norm + self.swish = nn.SiLU() + self.pointwise_conv_2 = nn.Conv1d(dim, dim, kernel_size=1, bias=False) + + def forward(self, x): + x = self.in_norm(x) + x = rearrange(x, 'b n d -> b d n') + x = self.pointwise_conv(x) + x = rearrange(x, 'b d n -> b n d') + x = self.glu(x) + x = rearrange(x, 'b n d -> b d n') + x = self.depthwise_conv(x) + x = rearrange(x, 'b d n -> b n d') + x = self.mid_norm(x) + x = self.swish(x) + x = rearrange(x, 'b n d -> b d n') + x = self.pointwise_conv_2(x) + x = rearrange(x, 'b d n -> b n d') + + return x + +class TransformerBlock(nn.Module): + def __init__( + self, + dim, + dim_heads = 64, + cross_attend = False, + dim_context = None, + global_cond_dim = None, + causal = False, + zero_init_branch_outputs = True, + conformer = False, + layer_ix = -1, + remove_norms = False, + attn_kwargs = {}, + ff_kwargs = {}, + norm_kwargs = {} + ): + + super().__init__() + self.dim = dim + self.dim_heads = dim_heads + self.cross_attend = cross_attend + self.dim_context = dim_context + self.causal = causal + + self.pre_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + + self.self_attn = Attention( + dim, + dim_heads = dim_heads, + causal = causal, + zero_init_output=zero_init_branch_outputs, + **attn_kwargs + ) + ### 2. 主要是这边需要修改 + if cross_attend: + self.cross_attend_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + self.cross_attn = Attention( + dim, + dim_heads = dim_heads, + dim_context=dim_context, + causal = causal, + zero_init_output=zero_init_branch_outputs, + **attn_kwargs + ) + + self.ff_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, **ff_kwargs) + + self.layer_ix = layer_ix + + self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None + + self.global_cond_dim = global_cond_dim + + if global_cond_dim is not None: + self.to_scale_shift_gate = nn.Sequential( + nn.SiLU(), + nn.Linear(global_cond_dim, dim * 6, bias=False) + ) + + nn.init.zeros_(self.to_scale_shift_gate[1].weight) + #nn.init.zeros_(self.to_scale_shift_gate_self[1].bias) + + def forward( + self, + x, + context = None, + global_cond=None, + mask = None, + context_mask = None, + rotary_pos_emb = None + ): + if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None: + + scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1) + + # self-attention with adaLN + residual = x + x = self.pre_norm(x) + x = x * (1 + scale_self) + shift_self + x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb) + x = x * torch.sigmoid(1 - gate_self) + x = x + residual + + if context is not None: + x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask) + + if self.conformer is not None: + x = x + self.conformer(x) + + # feedforward with adaLN + residual = x + x = self.ff_norm(x) + x = x * (1 + scale_ff) + shift_ff + x = self.ff(x) + x = x * torch.sigmoid(1 - gate_ff) + x = x + residual + + else: + x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb) + + if context is not None: + x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask) + + if self.conformer is not None: + x = x + self.conformer(x) + + x = x + self.ff(self.ff_norm(x)) + + return x + +class ContinuousTransformer(nn.Module): + def __init__( + self, + dim, + depth, + *, + dim_in = None, + dim_out = None, + dim_heads = 64, + cross_attend=False, + cond_token_dim=None, + global_cond_dim=None, + causal=False, + rotary_pos_emb=True, + zero_init_branch_outputs=True, + conformer=False, + use_sinusoidal_emb=False, + use_abs_pos_emb=False, + abs_pos_emb_max_length=10000, + **kwargs + ): + + super().__init__() + + self.dim = dim + self.depth = depth + self.causal = causal + self.layers = nn.ModuleList([]) + + self.project_in = nn.Linear(dim_in, dim, bias=False) if dim_in is not None else nn.Identity() + self.project_out = nn.Linear(dim, dim_out, bias=False) if dim_out is not None else nn.Identity() + + if rotary_pos_emb: + self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32)) + else: + self.rotary_pos_emb = None + + self.use_sinusoidal_emb = use_sinusoidal_emb + if use_sinusoidal_emb: + self.pos_emb = ScaledSinusoidalEmbedding(dim) + + self.use_abs_pos_emb = use_abs_pos_emb + if use_abs_pos_emb: + self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length) + + for i in range(depth): + self.layers.append( + TransformerBlock( + dim, + dim_heads = dim_heads, + cross_attend = cross_attend, + dim_context = cond_token_dim, + global_cond_dim = global_cond_dim, + causal = causal, + zero_init_branch_outputs = zero_init_branch_outputs, + conformer=conformer, + layer_ix=i, + **kwargs + ) + ) + + def forward( + self, + x, + mask = None, + prepend_embeds = None, + prepend_mask = None, + global_cond = None, + return_info = False, + **kwargs + ): + batch, seq, device = *x.shape[:2], x.device + + info = { + "hidden_states": [], + } + + x = self.project_in(x) + if prepend_embeds is not None: + prepend_length, prepend_dim = prepend_embeds.shape[1:] + + assert prepend_dim == x.shape[-1], 'prepend dimension must match sequence dimension' + + x = torch.cat((prepend_embeds, x), dim = -2) + + if prepend_mask is not None or mask is not None: + mask = mask if mask is not None else torch.ones((batch, seq), device = device, dtype = torch.bool) + prepend_mask = prepend_mask if prepend_mask is not None else torch.ones((batch, prepend_length), device = device, dtype = torch.bool) + + mask = torch.cat((prepend_mask, mask), dim = -1) + + # Attention layers + + if self.rotary_pos_emb is not None: + rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1]) + else: + rotary_pos_emb = None + + if self.use_sinusoidal_emb or self.use_abs_pos_emb: + x = x + self.pos_emb(x) + + # Iterate over the transformer layers + for layer in self.layers: + #x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) + # pdb.set_trace() + x = checkpoint(layer, x, mask=mask.bool(),rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) + + if return_info: + info["hidden_states"].append(x) + + x = self.project_out(x) + + if return_info: + return x, info + + return x diff --git a/cosyvoice/flow/stable/transformer_use_mask.py b/cosyvoice/flow/stable/transformer_use_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..d22c5704f71d14b1f4446869a7f191383d5c31d3 --- /dev/null +++ b/cosyvoice/flow/stable/transformer_use_mask.py @@ -0,0 +1,845 @@ +import pdb +from functools import reduce, partial +from packaging import version + +from einops import rearrange, repeat +from einops.layers.torch import Rearrange +import torch +import torch.nn.functional as F +from torch import nn, einsum +from torch.cuda.amp import autocast +from typing import Callable, Literal + +try: + from flash_attn import flash_attn_func, flash_attn_kvpacked_func +except ImportError as e: + print(e) + print('flash_attn not installed, disabling Flash Attention') + flash_attn_kvpacked_func = None + flash_attn_func = None + +try: + import natten +except ImportError: + natten = None + + +def checkpoint(function, *args, **kwargs): + kwargs.setdefault("use_reentrant", False) + return torch.utils.checkpoint.checkpoint(function, *args, **kwargs) + + +# Copied and modified from https://github.com/lucidrains/x-transformers/blob/main/x_transformers/attend.py under MIT License +# License can be found in LICENSES/LICENSE_XTRANSFORMERS.txt + +def create_causal_mask(i, j, device): + return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 1) + + +def or_reduce(masks): + head, *body = masks + for rest in body: + head = head | rest + return head + + +# positional embeddings + +class AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + self.scale = dim ** -0.5 + self.max_seq_len = max_seq_len + self.emb = nn.Embedding(max_seq_len, dim) + + def forward(self, x, pos=None, seq_start_pos=None): + seq_len, device = x.shape[1], x.device + assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}' + + if pos is None: + pos = torch.arange(seq_len, device=device) + + if seq_start_pos is not None: + pos = (pos - seq_start_pos[..., None]).clamp(min=0) + + pos_emb = self.emb(pos) + pos_emb = pos_emb * self.scale + return pos_emb + + +class ScaledSinusoidalEmbedding(nn.Module): + def __init__(self, dim, theta=10000): + super().__init__() + assert (dim % 2) == 0, 'dimension must be divisible by 2' + self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5) + + half_dim = dim // 2 + freq_seq = torch.arange(half_dim).float() / half_dim + inv_freq = theta ** -freq_seq + self.register_buffer('inv_freq', inv_freq, persistent=False) + + def forward(self, x, pos=None, seq_start_pos=None): + seq_len, device = x.shape[1], x.device + + if pos is None: + pos = torch.arange(seq_len, device=device) + + if seq_start_pos is not None: + pos = pos - seq_start_pos[..., None] + + emb = einsum('i, j -> i j', pos, self.inv_freq) + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb * self.scale + + +class RotaryEmbedding(nn.Module): + def __init__( + self, + dim, + use_xpos=False, + scale_base=512, + interpolation_factor=1., + base=10000, + base_rescale_factor=1. + ): + super().__init__() + # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning + # has some connection to NTK literature + # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + base *= base_rescale_factor ** (dim / (dim - 2)) + + inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + assert interpolation_factor >= 1. + self.interpolation_factor = interpolation_factor + + if not use_xpos: + self.register_buffer('scale', None) + return + + scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) + + self.scale_base = scale_base + self.register_buffer('scale', scale) + + def forward_from_seq_len(self, seq_len): + device = self.inv_freq.device + + t = torch.arange(seq_len, device=device) + return self.forward(t) + + @autocast(enabled=False) + def forward(self, t): + device = self.inv_freq.device + + t = t.to(torch.float32) + + t = t / self.interpolation_factor + + freqs = torch.einsum('i , j -> i j', t, self.inv_freq) + freqs = torch.cat((freqs, freqs), dim=-1) + + if self.scale is None: + return freqs, 1. + + power = (torch.arange(seq_len, device=device) - (seq_len // 2)) / self.scale_base + scale = self.scale ** rearrange(power, 'n -> n 1') + scale = torch.cat((scale, scale), dim=-1) + + return freqs, scale + + +def rotate_half(x): + x = rearrange(x, '... (j d) -> ... j d', j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +@autocast(enabled=False) +def apply_rotary_pos_emb(t, freqs, scale=1): + out_dtype = t.dtype + + # cast to float32 if necessary for numerical stability + dtype = reduce(torch.promote_types, (t.dtype, freqs.dtype, torch.float32)) + rot_dim, seq_len = freqs.shape[-1], t.shape[-2] + freqs, t = freqs.to(dtype), t.to(dtype) + freqs = freqs[-seq_len:, :] + + if t.ndim == 4 and freqs.ndim == 3: + freqs = rearrange(freqs, 'b n d -> b 1 n d') + + # partial rotary embeddings, Wang et al. GPT-J + t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:] + t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) + + t, t_unrotated = t.to(out_dtype), t_unrotated.to(out_dtype) + + return torch.cat((t, t_unrotated), dim=-1) + + +# norms +class LayerNorm(nn.Module): + def __init__(self, dim, bias=False, fix_scale=False): + """ + bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less + """ + super().__init__() + + if fix_scale: + self.register_buffer("gamma", torch.ones(dim)) + else: + self.gamma = nn.Parameter(torch.ones(dim)) + + if bias: + self.beta = nn.Parameter(torch.zeros(dim)) + else: + self.register_buffer("beta", torch.zeros(dim)) + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], weight=self.gamma, bias=self.beta) + + +# feedforward + +class GLU(nn.Module): + def __init__( + self, + dim_in, + dim_out, + activation: Callable, + use_conv=False, + conv_kernel_size=3, + ): + super().__init__() + self.act = activation + self.proj = nn.Linear(dim_in, dim_out * 2) if not use_conv else nn.Conv1d(dim_in, dim_out * 2, conv_kernel_size, + padding=(conv_kernel_size // 2)) + self.use_conv = use_conv + + def forward(self, x): + if self.use_conv: + x = rearrange(x, 'b n d -> b d n') + x = self.proj(x) + x = rearrange(x, 'b d n -> b n d') + else: + x = self.proj(x) + + x, gate = x.chunk(2, dim=-1) + return x * self.act(gate) + + +class FeedForward(nn.Module): + def __init__( + self, + dim, + dim_out=None, + mult=4, + no_bias=False, + glu=True, + use_conv=False, + conv_kernel_size=3, + zero_init_output=True, + ): + super().__init__() + inner_dim = int(dim * mult) + + # Default to SwiGLU + + activation = nn.SiLU() + + dim_out = dim if dim_out is None else dim_out + + if glu: + linear_in = GLU(dim, inner_dim, activation) + else: + linear_in = nn.Sequential( + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + nn.Linear(dim, inner_dim, bias=not no_bias) if not use_conv else nn.Conv1d(dim, inner_dim, + conv_kernel_size, padding=( + conv_kernel_size // 2), bias=not no_bias), + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + activation + ) + + linear_out = nn.Linear(inner_dim, dim_out, bias=not no_bias) if not use_conv else nn.Conv1d(inner_dim, dim_out, + conv_kernel_size, + padding=( + conv_kernel_size // 2), + bias=not no_bias) + + # init last linear layer to 0 + if zero_init_output: + nn.init.zeros_(linear_out.weight) + if not no_bias: + nn.init.zeros_(linear_out.bias) + + self.ff = nn.Sequential( + linear_in, + Rearrange('b d n -> b n d') if use_conv else nn.Identity(), + linear_out, + Rearrange('b n d -> b d n') if use_conv else nn.Identity(), + ) + + def forward(self, x): + return self.ff(x) + + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_heads=64, + dim_context=None, + causal=False, + zero_init_output=True, + qk_norm: Literal['l2', 'ln', 'none'] = 'none', + natten_kernel_size=None + ): + super().__init__() + self.dim = dim + self.dim_heads = dim_heads + self.causal = causal + + dim_kv = dim_context if dim_context is not None else dim + + self.num_heads = dim // dim_heads + self.kv_heads = dim_kv // dim_heads + + if dim_context is not None: + self.to_q = nn.Linear(dim, dim, bias=False) + self.to_kv = nn.Linear(dim_kv, dim_kv * 2, bias=False) + else: + self.to_qkv = nn.Linear(dim, dim * 3, bias=False) + + self.to_out = nn.Linear(dim, dim, bias=False) + + if zero_init_output: + nn.init.zeros_(self.to_out.weight) + + self.qk_norm = qk_norm + + if self.qk_norm == "ln": + self.q_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6) + self.k_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6) + + # Using 1d neighborhood attention + self.natten_kernel_size = natten_kernel_size + if natten_kernel_size is not None: + return + + self.use_pt_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0') + + self.use_fa_flash = torch.cuda.is_available() and flash_attn_func is not None + # pdb.set_trace() + self.use_fa_flash = False + + self.sdp_kwargs = dict( + enable_flash=True, + enable_math=True, + enable_mem_efficient=True + ) + + def flash_attn( + self, + q, + k, + v, + mask=None, + causal=None + ): + batch, heads, q_len, _, k_len, device = *q.shape, k.shape[-2], q.device + kv_heads = k.shape[1] + # Recommended for multi-query single-key-value attention by Tri Dao + # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64]) + + if heads != kv_heads: + # Repeat interleave kv_heads to match q_heads + heads_per_kv_head = heads // kv_heads + k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim=1), (k, v)) + + if k.ndim == 3: + k = rearrange(k, 'b ... -> b 1 ...').expand_as(q) + + if v.ndim == 3: + v = rearrange(v, 'b ... -> b 1 ...').expand_as(q) + + causal = self.causal if causal is None else causal + + if q_len == 1 and causal: + causal = False + + if mask is not None: + assert mask.ndim == 4 + mask = mask.expand(batch, heads, q_len, k_len) + + assert causal + # handle kv cache - this should be bypassable in updated flash attention 2 + if k_len > q_len and causal: + causal_mask = create_causal_mask(q_len, k_len, device=device) + if mask is None: + mask = ~causal_mask + else: + mask = mask & ~causal_mask + causal = False + + # manually handle causal mask, if another mask was given + + row_is_entirely_masked = None + + if mask is not None and causal: + causal_mask = create_causal_mask(q_len, k_len, device=device) + mask = mask & ~causal_mask + + # protect against an entire row being masked out + + row_is_entirely_masked = ~mask.any(dim=-1) + mask[..., 0] = mask[..., 0] | row_is_entirely_masked + + causal = False + + with torch.backends.cuda.sdp_kernel(**self.sdp_kwargs): + out = F.scaled_dot_product_attention( + q, k, v, + attn_mask=mask, + is_causal=causal + ) + + # for a row that is entirely masked out, should zero out the output of that row token + + if row_is_entirely_masked is not None: + out = out.masked_fill(row_is_entirely_masked[..., None], 0.) + + return out + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + rotary_pos_emb=None, + causal=None + ): + h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None + + kv_input = context if has_context else x + + if hasattr(self, 'to_q'): + # Use separate linear projections for q and k/v + q = self.to_q(x) + q = rearrange(q, 'b n (h d) -> b h n d', h=h) + + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + + k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=kv_h), (k, v)) + else: + # Use fused linear projection + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + + # Normalize q and k for cosine sim attention + if self.qk_norm == "l2": + q = F.normalize(q, dim=-1) + k = F.normalize(k, dim=-1) + elif self.qk_norm == "ln": + q = self.q_norm(q) + k = self.k_norm(k) + + if rotary_pos_emb is not None and not has_context: + freqs, _ = rotary_pos_emb + + q_dtype = q.dtype + k_dtype = k.dtype + + q = q.to(torch.float32) + k = k.to(torch.float32) + freqs = freqs.to(torch.float32) + + q = apply_rotary_pos_emb(q, freqs) + k = apply_rotary_pos_emb(k, freqs) + + q = q.to(q_dtype) + k = k.to(k_dtype) + + input_mask = context_mask + + if input_mask is None and not has_context: + input_mask = mask + + # determine masking + masks = [] + final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account + + if input_mask is not None: + input_mask = rearrange(input_mask, 'b j -> b 1 1 j') + masks.append(~input_mask) + + # Other masks will be added here later + + if len(masks) > 0: + final_attn_mask = ~or_reduce(masks) + + n, device = q.shape[-2], q.device + + causal = self.causal if causal is None else causal + + if n == 1 and causal: + causal = False + + if self.natten_kernel_size is not None: + if natten is None: + raise ImportError('natten not installed, please install natten to use neighborhood attention') + + dtype_in = q.dtype + q, k, v = map(lambda t: t.to(torch.float32), (q, k, v)) + + attn = natten.functional.natten1dqk(q, k, kernel_size=self.natten_kernel_size, dilation=1) + + if final_attn_mask is not None: + attn = attn.masked_fill(final_attn_mask, -torch.finfo(attn.dtype).max) + + attn = F.softmax(attn, dim=-1, dtype=torch.float32) + + out = natten.functional.natten1dav(attn, v, kernel_size=self.natten_kernel_size, dilation=1).to(dtype_in) + + # Prioritize Flash Attention 2 + elif self.use_fa_flash: + assert final_attn_mask is None, 'masking not yet supported for Flash Attention 2' + # Flash Attention 2 requires FP16 inputs + fa_dtype_in = q.dtype + q, k, v = map(lambda t: rearrange(t, 'b h n d -> b n h d').to(torch.float16), (q, k, v)) + + out = flash_attn_func(q, k, v, causal=causal) + + out = rearrange(out.to(fa_dtype_in), 'b n h d -> b h n d') + + # Fall back to PyTorch implementation + elif self.use_pt_flash: + # causal=False + # final_attn_mask:[64, 1, 1, 348] + out = self.flash_attn(q, k, v, causal=True, mask=final_attn_mask) + + else: + # Fall back to custom implementation + + if h != kv_h: + # Repeat interleave kv_heads to match q_heads + heads_per_kv_head = h // kv_h + k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim=1), (k, v)) + + scale = 1. / (q.shape[-1] ** 0.5) + + kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d' + + dots = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * scale + + i, j, dtype = *dots.shape[-2:], dots.dtype + + mask_value = -torch.finfo(dots.dtype).max + + if final_attn_mask is not None: + dots = dots.masked_fill(~final_attn_mask, mask_value) + + if causal: + causal_mask = create_causal_mask(i, j, device=device) + dots = dots.masked_fill(causal_mask, mask_value) + + attn = F.softmax(dots, dim=-1, dtype=torch.float32) + attn = attn.type(dtype) + + out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v) + + # merge heads + out = rearrange(out, ' b h n d -> b n (h d)') + + # Communicate between heads + + # with autocast(enabled = False): + # out_dtype = out.dtype + # out = out.to(torch.float32) + # out = self.to_out(out).to(out_dtype) + out = self.to_out(out) + + if mask is not None: + mask = rearrange(mask, 'b n -> b n 1') + out = out.masked_fill(~mask, 0.) + + return out + + +class ConformerModule(nn.Module): + def __init__( + self, + dim, + norm_kwargs={}, + ): + super().__init__() + + self.dim = dim + + self.in_norm = LayerNorm(dim, **norm_kwargs) + self.pointwise_conv = nn.Conv1d(dim, dim, kernel_size=1, bias=False) + self.glu = GLU(dim, dim, nn.SiLU()) + self.depthwise_conv = nn.Conv1d(dim, dim, kernel_size=17, groups=dim, padding=8, bias=False) + self.mid_norm = LayerNorm(dim, + **norm_kwargs) # This is a batch norm in the original but I don't like batch norm + self.swish = nn.SiLU() + self.pointwise_conv_2 = nn.Conv1d(dim, dim, kernel_size=1, bias=False) + + def forward(self, x): + x = self.in_norm(x) + x = rearrange(x, 'b n d -> b d n') + x = self.pointwise_conv(x) + x = rearrange(x, 'b d n -> b n d') + x = self.glu(x) + x = rearrange(x, 'b n d -> b d n') + x = self.depthwise_conv(x) + x = rearrange(x, 'b d n -> b n d') + x = self.mid_norm(x) + x = self.swish(x) + x = rearrange(x, 'b n d -> b d n') + x = self.pointwise_conv_2(x) + x = rearrange(x, 'b d n -> b n d') + + return x + + +class TransformerBlock(nn.Module): + def __init__( + self, + dim, + dim_heads=64, + cross_attend=False, + dim_context=None, + global_cond_dim=None, + causal=False, + zero_init_branch_outputs=True, + conformer=False, + layer_ix=-1, + remove_norms=False, + attn_kwargs={}, + ff_kwargs={}, + norm_kwargs={} + ): + + super().__init__() + self.dim = dim + self.dim_heads = dim_heads + self.cross_attend = cross_attend + self.dim_context = dim_context + self.causal = causal + + self.pre_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + + self.self_attn = Attention( + dim, + dim_heads=dim_heads, + causal=causal, + zero_init_output=zero_init_branch_outputs, + **attn_kwargs + ) + ### 2. 主要是这边需要修改 + if cross_attend: + self.cross_attend_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + self.cross_attn = Attention( + dim, + dim_heads=dim_heads, + dim_context=dim_context, + causal=causal, + zero_init_output=zero_init_branch_outputs, + **attn_kwargs + ) + + self.ff_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity() + self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, **ff_kwargs) + + self.layer_ix = layer_ix + + self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None + + self.global_cond_dim = global_cond_dim + + if global_cond_dim is not None: + self.to_scale_shift_gate = nn.Sequential( + nn.SiLU(), + nn.Linear(global_cond_dim, dim * 6, bias=False) + ) + + nn.init.zeros_(self.to_scale_shift_gate[1].weight) + # nn.init.zeros_(self.to_scale_shift_gate_self[1].bias) + + def forward( + self, + x, + context=None, + global_cond=None, + mask=None, + context_mask=None, + rotary_pos_emb=None + ): + if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None: + + scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate( + global_cond).unsqueeze(1).chunk(6, dim=-1) + + # self-attention with adaLN + residual = x + x = self.pre_norm(x) + x = x * (1 + scale_self) + shift_self + x = self.self_attn(x, mask=mask, rotary_pos_emb=rotary_pos_emb) + x = x * torch.sigmoid(1 - gate_self) + x = x + residual + + if context is not None: + x = x + self.cross_attn(self.cross_attend_norm(x), context=context, context_mask=context_mask) + + if self.conformer is not None: + x = x + self.conformer(x) + + # feedforward with adaLN + residual = x + x = self.ff_norm(x) + x = x * (1 + scale_ff) + shift_ff + x = self.ff(x) + x = x * torch.sigmoid(1 - gate_ff) + x = x + residual + + else: + x = x + self.self_attn(self.pre_norm(x), mask=mask, rotary_pos_emb=rotary_pos_emb) + + if context is not None: + x = x + self.cross_attn(self.cross_attend_norm(x), context=context, context_mask=context_mask) + + if self.conformer is not None: + x = x + self.conformer(x) + + x = x + self.ff(self.ff_norm(x)) + + return x + + +class ContinuousTransformer(nn.Module): + def __init__( + self, + dim, + depth, + *, + dim_in=None, + dim_out=None, + dim_heads=64, + cross_attend=False, + cond_token_dim=None, + global_cond_dim=None, + causal=False, + rotary_pos_emb=True, + zero_init_branch_outputs=True, + conformer=False, + use_sinusoidal_emb=False, + use_abs_pos_emb=False, + abs_pos_emb_max_length=10000, + **kwargs + ): + + super().__init__() + + self.dim = dim + self.depth = depth + self.causal = causal + self.layers = nn.ModuleList([]) + + self.project_in = nn.Linear(dim_in, dim, bias=False) if dim_in is not None else nn.Identity() + self.project_out = nn.Linear(dim, dim_out, bias=False) if dim_out is not None else nn.Identity() + + if rotary_pos_emb: + self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32)) + else: + self.rotary_pos_emb = None + + self.use_sinusoidal_emb = use_sinusoidal_emb + if use_sinusoidal_emb: + self.pos_emb = ScaledSinusoidalEmbedding(dim) + + self.use_abs_pos_emb = use_abs_pos_emb + if use_abs_pos_emb: + self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length) + + for i in range(depth): + self.layers.append( + TransformerBlock( + dim, + dim_heads=dim_heads, + cross_attend=cross_attend, + dim_context=cond_token_dim, + global_cond_dim=global_cond_dim, + causal=causal, + zero_init_branch_outputs=zero_init_branch_outputs, + conformer=conformer, + layer_ix=i, + **kwargs + ) + ) + + def forward( + self, + x, + mask=None, + prepend_embeds=None, + prepend_mask=None, + global_cond=None, + return_info=False, + **kwargs + ): + batch, seq, device = *x.shape[:2], x.device + + info = { + "hidden_states": [], + } + + x = self.project_in(x) + if prepend_embeds is not None: + prepend_length, prepend_dim = prepend_embeds.shape[1:] + + assert prepend_dim == x.shape[-1], 'prepend dimension must match sequence dimension' + + x = torch.cat((prepend_embeds, x), dim=-2) + + if prepend_mask is not None or mask is not None: + mask = mask if mask is not None else torch.ones((batch, seq), device=device, dtype=torch.bool) + prepend_mask = prepend_mask if prepend_mask is not None else torch.ones((batch, prepend_length), + device=device, dtype=torch.bool) + + mask = torch.cat((prepend_mask, mask), dim=-1) + + # Attention layers + + if self.rotary_pos_emb is not None: + rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1]) + else: + rotary_pos_emb = None + + if self.use_sinusoidal_emb or self.use_abs_pos_emb: + x = x + self.pos_emb(x) + + # Iterate over the transformer layers + mask = self.refine_mask(mask) + for layer in self.layers: + # x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) + # pdb.set_trace() + x = checkpoint(layer, x, mask=mask.bool(), rotary_pos_emb=rotary_pos_emb, global_cond=global_cond, **kwargs) + + if return_info: + info["hidden_states"].append(x) + + x = self.project_out(x) + + if return_info: + return x, info + + return x + + def refine_mask(self, mask): + return mask + # pdb.set_trace() + # mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) + # return mask diff --git a/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-310.pyc b/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1f9504c5ea64e6af2b587c0bc1380ce219e0b26 Binary files /dev/null and b/cosyvoice/hifigan/__pycache__/f0_predictor.cpython-310.pyc differ diff --git a/cosyvoice/hifigan/__pycache__/generator.cpython-310.pyc b/cosyvoice/hifigan/__pycache__/generator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..378335024aa71b4b1c4028a1c0f6f624e90c329e Binary files /dev/null and b/cosyvoice/hifigan/__pycache__/generator.cpython-310.pyc differ diff --git a/cosyvoice/hifigan/f0_predictor.py b/cosyvoice/hifigan/f0_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..36b85f4ed90c3a412cb179f49ccb471132a86550 --- /dev/null +++ b/cosyvoice/hifigan/f0_predictor.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +from torch.nn.utils import weight_norm + + +class ConvRNNF0Predictor(nn.Module): + def __init__(self, + num_class: int = 1, + in_channels: int = 80, + cond_channels: int = 512 + ): + super().__init__() + + self.num_class = num_class + self.condnet = nn.Sequential( + weight_norm( + nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + ) + self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.condnet(x) + x = x.transpose(1, 2) + return torch.abs(self.classifier(x).squeeze(-1)) diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..a43ac05a7d828c86965d3f56b6798d522689089f --- /dev/null +++ b/cosyvoice/hifigan/generator.py @@ -0,0 +1,398 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""HIFI-GAN""" + +import typing as tp +import numpy as np +from scipy.signal import get_window +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Conv1d +from torch.nn import ConvTranspose1d +from torch.nn.utils import remove_weight_norm +from torch.nn.utils import weight_norm +from torch.distributions.uniform import Uniform + +from cosyvoice.transformer.activation import Snake +from cosyvoice.utils.common import get_padding +from cosyvoice.utils.common import init_weights + + +"""hifigan based generator implementation. + +This code is modified from https://github.com/jik876/hifi-gan + ,https://github.com/kan-bayashi/ParallelWaveGAN and + https://github.com/NVIDIA/BigVGAN + +""" + + +class ResBlock(torch.nn.Module): + """Residual block module in HiFiGAN/BigVGAN.""" + def __init__( + self, + channels: int = 512, + kernel_size: int = 3, + dilations: tp.List[int] = [1, 3, 5], + ): + super(ResBlock, self).__init__() + self.convs1 = nn.ModuleList() + self.convs2 = nn.ModuleList() + + for dilation in dilations: + self.convs1.append( + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + padding=get_padding(kernel_size, dilation) + ) + ) + ) + self.convs2.append( + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1) + ) + ) + ) + self.convs1.apply(init_weights) + self.convs2.apply(init_weights) + self.activations1 = nn.ModuleList([ + Snake(channels, alpha_logscale=False) + for _ in range(len(self.convs1)) + ]) + self.activations2 = nn.ModuleList([ + Snake(channels, alpha_logscale=False) + for _ in range(len(self.convs2)) + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for idx in range(len(self.convs1)): + xt = self.activations1[idx](x) + xt = self.convs1[idx](xt) + xt = self.activations2[idx](xt) + xt = self.convs2[idx](xt) + x = xt + x + return x + + def remove_weight_norm(self): + for idx in range(len(self.convs1)): + remove_weight_norm(self.convs1[idx]) + remove_weight_norm(self.convs2[idx]) + + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).type(torch.float32) + return uv + + @torch.no_grad() + def forward(self, f0): + """ + :param f0: [B, 1, sample_len], Hz + :return: [B, 1, sample_len] + """ + + F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device) + for i in range(self.harmonic_num + 1): + F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate + + theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1) + u_dist = Uniform(low=-np.pi, high=np.pi) + phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device) + phase_vec[:, 0, :] = 0 + + # generate sine waveforms + sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec) + + # generate uv signal + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + with torch.no_grad(): + sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2)) + sine_wavs = sine_wavs.transpose(1, 2) + uv = uv.transpose(1, 2) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv + + +class HiFTGenerator(nn.Module): + """ + HiFTNet Generator: Neural Source Filter + ISTFTNet + https://arxiv.org/abs/2309.09493 + """ + def __init__( + self, + in_channels: int = 80, + base_channels: int = 512, + nb_harmonics: int = 8, + sampling_rate: int = 22050, + nsf_alpha: float = 0.1, + nsf_sigma: float = 0.003, + nsf_voiced_threshold: float = 10, + upsample_rates: tp.List[int] = [8, 8], + upsample_kernel_sizes: tp.List[int] = [16, 16], + istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4}, + resblock_kernel_sizes: tp.List[int] = [3, 7, 11], + resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + source_resblock_kernel_sizes: tp.List[int] = [7, 11], + source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]], + lrelu_slope: float = 0.1, + audio_limit: float = 0.99, + f0_predictor: torch.nn.Module = None, + ): + super(HiFTGenerator, self).__init__() + + self.out_channels = 1 + self.nb_harmonics = nb_harmonics + self.sampling_rate = sampling_rate + self.istft_params = istft_params + self.lrelu_slope = lrelu_slope + self.audio_limit = audio_limit + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.m_source = SourceModuleHnNSF( + sampling_rate=sampling_rate, + upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"], + harmonic_num=nb_harmonics, + sine_amp=nsf_alpha, + add_noise_std=nsf_sigma, + voiced_threshod=nsf_voiced_threshold) + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"]) + + self.conv_pre = weight_norm( + Conv1d(in_channels, base_channels, 7, 1, padding=3) + ) + + # Up + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + base_channels // (2**i), + base_channels // (2**(i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + # Down + self.source_downs = nn.ModuleList() + self.source_resblocks = nn.ModuleList() + downsample_rates = [1] + upsample_rates[::-1][:-1] + downsample_cum_rates = np.cumprod(downsample_rates) + for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)): + if u == 1: + self.source_downs.append( + Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1) + ) + else: + self.source_downs.append( + Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2)) + ) + + self.source_resblocks.append( + ResBlock(base_channels // (2 ** (i + 1)), k, d) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = base_channels // (2**(i + 1)) + for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + self.reflection_pad = nn.ReflectionPad1d((1, 0)) + self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32)) + self.f0_predictor = f0_predictor + + def _f02source(self, f0: torch.Tensor) -> torch.Tensor: + f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t + + har_source, _, _ = self.m_source(f0) + return har_source.transpose(1, 2) + + def _stft(self, x): + spec = torch.stft( + x, + self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device), + return_complex=True) + spec = torch.view_as_real(spec) # [B, F, TT, 2] + return spec[..., 0], spec[..., 1] + + def _istft(self, magnitude, phase): + magnitude = torch.clip(magnitude, max=1e2) + real = magnitude * torch.cos(phase) + img = magnitude * torch.sin(phase) + inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], + self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device)) + return inverse_transform + + def forward(self, x: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor: + f0 = self.f0_predictor(x) + s = self._f02source(f0) + + # use cache_source to avoid glitch + if cache_source.shape[2] != 0: + s[:, :, :cache_source.shape[2]] = cache_source + + s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) + s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) + + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, self.lrelu_slope) + x = self.ups[i](x) + + if i == self.num_upsamples - 1: + x = self.reflection_pad(x) + + # fusion + si = self.source_downs[i](s_stft) + si = self.source_resblocks[i](si) + x = x + si + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + x = F.leaky_relu(x) + x = self.conv_post(x) + magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :]) + phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy + + x = self._istft(magnitude, phase) + x = torch.clamp(x, -self.audio_limit, self.audio_limit) + return x, s + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + self.source_module.remove_weight_norm() + for l in self.source_downs: + remove_weight_norm(l) + for l in self.source_resblocks: + l.remove_weight_norm() + + @torch.inference_mode() + def inference(self, mel: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor: + return self.forward(x=mel, cache_source=cache_source) \ No newline at end of file diff --git a/cosyvoice/llm/__pycache__/llm.cpython-310.pyc b/cosyvoice/llm/__pycache__/llm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01e88efe5a519071a56088723211bff13d66bca3 Binary files /dev/null and b/cosyvoice/llm/__pycache__/llm.cpython-310.pyc differ diff --git a/cosyvoice/llm/llm.py b/cosyvoice/llm/llm.py new file mode 100644 index 0000000000000000000000000000000000000000..3b418c5d1017c6f8412418dd8d1c1b7790947241 --- /dev/null +++ b/cosyvoice/llm/llm.py @@ -0,0 +1,206 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional, Union +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence, unpad_sequence +from cosyvoice.utils.common import IGNORE_ID +from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss +from cosyvoice.utils.common import th_accuracy + + +class TransformerLM(torch.nn.Module): + def __init__( + self, + text_encoder_input_size: int, + llm_input_size: int, + llm_output_size: int, + text_token_size: int, + speech_token_size: int, + text_encoder: torch.nn.Module, + llm: torch.nn.Module, + length_normalized_loss: bool = True, + lsm_weight: float = 0.0, + spk_embed_dim: int = 192, + ): + super().__init__() + self.llm_input_size = llm_input_size + self.speech_token_size = speech_token_size + # 1. build text token inputs related modules + self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size) + self.text_encoder = text_encoder + self.text_encoder_affine_layer = nn.Linear( + self.text_encoder.output_size(), + llm_input_size + ) + + # 2. build speech token language model related modules + self.sos_eos = 0 + self.task_id = 1 + self.llm_embedding = torch.nn.Embedding(2, llm_input_size) + self.llm = llm + self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1) + self.criterion_ce = LabelSmoothingLoss( + size=speech_token_size + 1, + padding_idx=IGNORE_ID, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + + # 3. [Optional] build speech token related modules + self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size) + + def encode( + self, + text: torch.Tensor, + text_lengths: torch.Tensor, + ): + encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + encoder_out = self.text_encoder_affine_layer(encoder_out) + return encoder_out, encoder_out_lens + + def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len): + text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True) + speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True) + lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0) for i in range(len(text_token))] + lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32) + lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID) + return lm_input, lm_input_len + + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + """ + Args: + text: (B, L, D) + text_lengths: (B,) + audio: (B, T, N) or (B, T) + audio_lengths: (B,) + """ + text_token = batch['text_token'].to(device) + text_token_len = batch['text_token_len'].to(device) + speech_token = batch['speech_token'].to(device) + speech_token_len = batch['speech_token_len'].to(device) + embedding = batch['embedding'].to(device) + + # 1. prepare llm_target + lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))] + lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device) + + # 1. encode text_token + text_token = self.text_embedding(text_token) + text_token, text_token_len = self.encode(text_token, text_token_len) + + # 2. embedding projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + embedding = embedding.unsqueeze(1) + + # 3. eos and task_id + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + + # 4. encode speech_token + speech_token = self.speech_embedding(speech_token) + + # 5. unpad and pad + lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len) + + # 6. run lm forward + lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device)) + logits = self.llm_decoder(lm_output) + loss = self.criterion_ce(logits, lm_target) + acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID) + return {'loss': loss, 'acc': acc} + + def sampling_ids( + self, + weighted_scores: torch.Tensor, + sampling: Union[bool, int, float] = True, + beam_size: int = 1, + ignore_eos: bool = True, + ): + while True: + prob, indices = weighted_scores.softmax(dim=-1).topk(sampling) + top_ids = prob.multinomial(beam_size, replacement=True) + top_ids = indices[top_ids] + if (not ignore_eos) or (self.speech_token_size not in top_ids): + break + return top_ids + + @torch.inference_mode() + def inference( + self, + text: torch.Tensor, + text_len: torch.Tensor, + prompt_text: torch.Tensor, + prompt_text_len: torch.Tensor, + prompt_speech_token: torch.Tensor, + prompt_speech_token_len: torch.Tensor, + embedding: torch.Tensor, + beam_size: int = 1, + sampling: int = 25, + max_token_text_ratio: float = 20, + min_token_text_ratio: float = 2, + ) -> torch.Tensor: + device = text.device + text = torch.concat([prompt_text, text], dim=1) + text_len += prompt_text_len + text = self.text_embedding(text) + + # 1. encode text + text, text_len = self.encode(text, text_len) + + # 2. encode embedding + if embedding.shape[0] != 0: + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + embedding = embedding.unsqueeze(dim=1) + else: + embedding = torch.zeros(1, 0, self.llm_input_size).to(device) + + # 3. concat llm_input + sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1) + task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1) + if prompt_speech_token_len != 0: + prompt_speech_token_emb = self.speech_embedding(prompt_speech_token) + else: + prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size).to(device) + lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1) + + # 4. cal min/max_length + min_len = int((text_len - prompt_text_len) * min_token_text_ratio) + max_len = int((text_len - prompt_text_len) * max_token_text_ratio) + + # 5. step by step decode + out_tokens = [] + offset = 0 + att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device) + for i in range(max_len): + y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache, + att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool)) + logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1) + top_ids = self.sampling_ids(logp.squeeze(dim=0), sampling, beam_size, ignore_eos=True if i < min_len else False).item() + if top_ids == self.speech_token_size: + break + out_tokens.append(top_ids) + offset += lm_input.size(1) + lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1) + + return torch.tensor([out_tokens], dtype=torch.int64, device=device) diff --git a/cosyvoice/transformer/__init__.py b/cosyvoice/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosyvoice/transformer/__pycache__/__init__.cpython-310.pyc b/cosyvoice/transformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb30f6341ee066a16dc90f0618612bcc2984a780 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/activation.cpython-310.pyc b/cosyvoice/transformer/__pycache__/activation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21c06e630b307f6af1748914ac6f7c52f8806efc Binary files /dev/null and b/cosyvoice/transformer/__pycache__/activation.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/attention.cpython-310.pyc b/cosyvoice/transformer/__pycache__/attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..435b009511761f7bbd38065ce8dfe1c9deb941d0 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/attention.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/convolution.cpython-310.pyc b/cosyvoice/transformer/__pycache__/convolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..548657a463fd80a6195e1ae80e4063395627e1f9 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/convolution.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/embedding.cpython-310.pyc b/cosyvoice/transformer/__pycache__/embedding.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15356e843e7fa2d361d614e8e8e2b3c69acfd354 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/embedding.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/encoder.cpython-310.pyc b/cosyvoice/transformer/__pycache__/encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4adb4bace9451311d2c17507b45ea99957ce226 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/encoder.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/encoder_layer.cpython-310.pyc b/cosyvoice/transformer/__pycache__/encoder_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47352f1defdd5159f44149c6607705263716538b Binary files /dev/null and b/cosyvoice/transformer/__pycache__/encoder_layer.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc b/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c73ec64ce22ae9aed7e7317117c768df5852eb6e Binary files /dev/null and b/cosyvoice/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc b/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..807483af3f6f1c9c6d343dc732c9535b9159c8ce Binary files /dev/null and b/cosyvoice/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc differ diff --git a/cosyvoice/transformer/__pycache__/subsampling.cpython-310.pyc b/cosyvoice/transformer/__pycache__/subsampling.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2746d21e39d96214cf557f55c8ec1d7cf65cff51 Binary files /dev/null and b/cosyvoice/transformer/__pycache__/subsampling.cpython-310.pyc differ diff --git a/cosyvoice/transformer/activation.py b/cosyvoice/transformer/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..8cea54816385d3b6585ccc2417bc71630d578177 --- /dev/null +++ b/cosyvoice/transformer/activation.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) +# 2020 Northwestern Polytechnical University (Pengcheng Guo) +# 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Swish() activation function for Conformer.""" + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Swish(torch.nn.Module): + """Construct an Swish object.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Return Swish activation function.""" + return x * torch.sigmoid(x) + + +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x diff --git a/cosyvoice/transformer/attention.py b/cosyvoice/transformer/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..b9aaa62d1ec0954e9a168b42bc66702e41591aed --- /dev/null +++ b/cosyvoice/transformer/attention.py @@ -0,0 +1,612 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Multi-Head Attention layer definition.""" + +import math +from typing import Tuple + +import torch +from torch import nn + + +class MultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, + n_head: int, + n_feat: int, + dropout_rate: float, + key_bias: bool = True): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv( + self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor, size + (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor, size + (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor, size + (#batch, n_head, time2, d_k). + + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention( + self, + value: torch.Tensor, + scores: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) + ) -> torch.Tensor: + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value, size + (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score, size + (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask, size (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + # NOTE(xcsong): When will `if mask.size(2) > 0` be True? + # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the + # 1st chunk to ease the onnx export.] + # 2. pytorch training + if mask.size(2) > 0: # time2 > 0 + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + # For last chunk, time2 might be larger than scores.size(-1) + mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) + scores = scores.masked_fill(mask, -float('inf')) + attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0) # (batch, head, time1, time2) + # NOTE(xcsong): When will `if mask.size(2) > 0` be False? + # 1. onnx(16/-1, -1/-1, 16/0) + # 2. jit (16/-1, -1/-1, 16/0, 16/4) + else: + attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = (x.transpose(1, 2).contiguous().view(n_batch, -1, + self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + 1.When applying cross attention between decoder and encoder, + the batch padding mask for input is in (#batch, 1, T) shape. + 2.When applying self attention of encoder, + the mask is in (#batch, T, T) shape. + 3.When applying self attention of decoder, + the mask is in (#batch, L, L) shape. + 4.If the different position in decoder see different block + of the encoder, such as Mocha, the passed in mask could be + in (#batch, L, T) shape. But there is no such case in current + CosyVoice. + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + """ + q, k, v = self.forward_qkv(query, key, value) + + # NOTE(xcsong): + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. + # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = torch.cat((k, v), dim=-1) + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask), new_cache + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, + n_head: int, + n_feat: int, + dropout_rate: float, + key_bias: bool = True): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate, key_bias) + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + + Returns: + torch.Tensor: Output tensor. + + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + return x + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + # NOTE(xcsong): + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. + # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = torch.cat((k, v), dim=-1) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used + if matrix_ac.shape != matrix_bd.shape: + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask), new_cache + + + + +# class BlockRelPositionMultiHeadedAttention(MultiHeadedAttention): +# """Multi-Head Attention layer with relative position encoding. +# Paper: https://arxiv.org/abs/1901.02860 +# Args: +# n_head (int): The number of heads. +# n_feat (int): The number of features. +# dropout_rate (float): Dropout rate. +# """ + +# def __init__(self, +# n_head: int, +# n_feat: int, +# dropout_rate: float, +# key_bias: bool = True, +# block_size=25): +# """Construct an RelPositionMultiHeadedAttention object.""" +# super().__init__(n_head, n_feat, dropout_rate, key_bias) +# # linear transformation for positional encoding +# self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) +# # these two learnable bias are used in matrix c and matrix d +# # as described in https://arxiv.org/abs/1901.02860 Section 3.3 +# self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) +# self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) +# torch.nn.init.xavier_uniform_(self.pos_bias_u) +# torch.nn.init.xavier_uniform_(self.pos_bias_v) +# self.block_size=block_size + +# def rel_shift(self, x): +# """Compute relative positional encoding. + +# Args: +# x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). +# time1 means the length of query vector. + +# Returns: +# torch.Tensor: Output tensor. + +# """ +# zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) +# x_padded = torch.cat([zero_pad, x], dim=-1) + +# x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) +# x = x_padded[:, :, 1:].view_as(x)[ +# :, :, :, : x.size(-1) // 2 + 1 +# ] # only keep the positions from 0 to time2 +# return x + +# def forward( +# self, +# query: torch.Tensor, +# key: torch.Tensor, +# value: torch.Tensor, +# mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), +# pos_emb: torch.Tensor = torch.empty(0), +# cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) +# ) -> Tuple[torch.Tensor, torch.Tensor]: +# """Compute 'Scaled Dot Product Attention' with rel. positional encoding. +# Args: +# query (torch.Tensor): Query tensor (#batch, time1, size). +# key (torch.Tensor): Key tensor (#batch, time2, size). +# value (torch.Tensor): Value tensor (#batch, time2, size). +# mask (torch.Tensor): Mask tensor (#batch, 1, time2) or +# (#batch, time1, time2), (0, 0, 0) means fake mask. +# pos_emb (torch.Tensor): Positional embedding tensor +# (#batch, time2, size). +# cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), +# where `cache_t == chunk_size * num_decoding_left_chunks` +# and `head * d_k == size` +# Returns: +# torch.Tensor: Output tensor (#batch, time1, d_model). +# torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) +# where `cache_t == chunk_size * num_decoding_left_chunks` +# and `head * d_k == size` +# """ +# q, k, v = self.forward_qkv(query, key, value) +# q = q.transpose(1, 2) # (batch, time1, head, d_k) + +# # NOTE(xcsong): +# # when export onnx model, for 1st chunk, we feed +# # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) +# # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). +# # In all modes, `if cache.size(0) > 0` will alwayse be `True` +# # and we will always do splitting and +# # concatnation(this will simplify onnx export). Note that +# # it's OK to concat & split zero-shaped tensors(see code below). +# # when export jit model, for 1st chunk, we always feed +# # cache(0, 0, 0, 0) since jit supports dynamic if-branch. +# # >>> a = torch.ones((1, 2, 0, 4)) +# # >>> b = torch.ones((1, 2, 3, 4)) +# # >>> c = torch.cat((a, b), dim=2) +# # >>> torch.equal(b, c) # True +# # >>> d = torch.split(a, 2, dim=-1) +# # >>> torch.equal(d[0], d[1]) # True +# if cache.size(0) > 0: +# key_cache, value_cache = torch.split(cache, +# cache.size(-1) // 2, +# dim=-1) +# k = torch.cat([key_cache, k], dim=2) +# v = torch.cat([value_cache, v], dim=2) +# # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's +# # non-trivial to calculate `next_cache_start` here. +# new_cache = torch.cat((k, v), dim=-1) + +# n_batch_pos = pos_emb.size(0) +# p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) +# p = p.transpose(1, 2) # (batch, head, time1, d_k) + +# # (batch, head, time1, d_k) +# q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) +# # (batch, head, time1, d_k) +# q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + +# # compute attention score +# # first compute matrix a and matrix c +# # as described in https://arxiv.org/abs/1901.02860 Section 3.3 +# # (batch, head, time1, time2) + +# # Compute matrix ac and bd +# matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) # (batch, head, time1, time2) +# matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) # (batch, head, time1, time2) + +# batch_size, num_heads, seq_len, _ = matrix_ac.shape + +# # Create block causal mask +# block_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=self.block_size).to(matrix_ac.device).bool() +# # mask = mask.masked_fill(mask == 1, float('-inf')) # mask upper triangular matrix beyond block + +# # Apply relative shift if necessary +# if matrix_ac.shape != matrix_bd.shape: +# matrix_bd = self.rel_shift(matrix_bd) + +# # Combine ac and bd and apply the block causal mask +# scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) +# scores = scores.masked_fill(block_mask.unsqueeze(0).unsqueeze(0), float('-inf')) # apply the block mask + +# # Forward attention +# return self.forward_attention(v, scores, mask), new_cache + + + +from cosyvoice.utils import block_mask_util +class BlockRelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, + n_head: int, + n_feat: int, + dropout_rate: float, + key_bias: bool = True, block_size=25): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate, key_bias) + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + self.block_size = block_size + + def rel_shift(self, x: torch.Tensor) -> torch.Tensor: + """Compute relative positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + + Returns: + torch.Tensor: Output tensor. + + """ + zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), + device=x.device, + dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(x.size()[0], + x.size()[1], + x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + return x + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + # 0代表被mask的位置 + bs, time_len, _ = query.shape + # mask = torch.tril(torch.ones(time_len, time_len).to(mask), diagonal=0).int() + # block_size = self.block_size + # mask[:, 0:block_size] = 1 + block_mask = block_mask_util.create_grid_mask(time_len,self.block_size,fill_triangle=True).to(query).int() + block_mask = block_mask[None].repeat(bs, 1, 1) + mask=mask*block_mask + + # NOTE(xcsong): + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. + # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = torch.cat((k, v), dim=-1) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used + if matrix_ac.shape != matrix_bd.shape: + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask), new_cache diff --git a/cosyvoice/transformer/convolution.py b/cosyvoice/transformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..4d5d96149154776000991a681a666fbe55e562fe --- /dev/null +++ b/cosyvoice/transformer/convolution.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" + +from typing import Tuple + +import torch +from torch import nn + + +class ConvolutionModule(nn.Module): + """ConvolutionModule in Conformer model.""" + + def __init__(self, + channels: int, + kernel_size: int = 15, + activation: nn.Module = nn.ReLU(), + norm: str = "batch_norm", + causal: bool = False, + bias: bool = True): + """Construct an ConvolutionModule object. + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernel size of conv layers. + causal (int): Whether use causal convolution or not + """ + super().__init__() + + self.pointwise_conv1 = nn.Conv1d( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + # self.lorder is used to distinguish if it's a causal convolution, + # if self.lorder > 0: it's a causal convolution, the input will be + # padded with self.lorder frames on the left in forward. + # else: it's a symmetrical convolution + if causal: + padding = 0 + self.lorder = kernel_size - 1 + else: + # kernel_size should be an odd number for none causal convolution + assert (kernel_size - 1) % 2 == 0 + padding = (kernel_size - 1) // 2 + self.lorder = 0 + self.depthwise_conv = nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + padding=padding, + groups=channels, + bias=bias, + ) + + assert norm in ['batch_norm', 'layer_norm'] + if norm == "batch_norm": + self.use_layer_norm = False + self.norm = nn.BatchNorm1d(channels) + else: + self.use_layer_norm = True + self.norm = nn.LayerNorm(channels) + + self.pointwise_conv2 = nn.Conv1d( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.activation = activation + + def forward( + self, + x: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + cache: torch.Tensor = torch.zeros((0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute convolution module. + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), + (0, 0, 0) means fake mask. + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, cache_t), + (0, 0, 0) meas fake cache. + Returns: + torch.Tensor: Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) # (#batch, channels, time) + + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + if self.lorder > 0: + if cache.size(2) == 0: # cache_t == 0 + x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) + else: + assert cache.size(0) == x.size(0) # equal batch + assert cache.size(1) == x.size(1) # equal channel + x = torch.cat((cache, x), dim=2) + assert (x.size(2) > self.lorder) + new_cache = x[:, :, -self.lorder:] + else: + # It's better we just return None if no cache is required, + # However, for JIT export, here we just fake one tensor instead of + # None. + new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.activation(self.norm(x)) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.pointwise_conv2(x) + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + return x.transpose(1, 2), new_cache diff --git a/cosyvoice/transformer/decoder.py b/cosyvoice/transformer/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..961c875eab519f7a9e8a6e56720dc878b7852372 --- /dev/null +++ b/cosyvoice/transformer/decoder.py @@ -0,0 +1,396 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Decoder definition.""" +from typing import Tuple, List, Optional + +import torch +import torch.utils.checkpoint as ckpt +import logging + +from cosyvoice.transformer.decoder_layer import DecoderLayer +from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward +from cosyvoice.utils.class_utils import ( + COSYVOICE_EMB_CLASSES, + COSYVOICE_ATTENTION_CLASSES, + COSYVOICE_ACTIVATION_CLASSES, +) +from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask) + + +class TransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + src_attention: if false, encoder-decoder cross attention is not + applied, such as CIF model + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. + tie_word_embedding: Tie or clone module weights depending of whether we are + using TorchScript or not + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + src_attention: bool = True, + key_bias: bool = True, + activation_type: str = "relu", + gradient_checkpointing: bool = False, + tie_word_embedding: bool = False, + ): + super().__init__() + attention_dim = encoder_output_size + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + + self.embed = torch.nn.Sequential( + torch.nn.Identity() if input_layer == "no_pos" else + torch.nn.Embedding(vocab_size, attention_dim), + COSYVOICE_EMB_CLASSES[input_layer](attention_dim, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) + self.use_output_layer = use_output_layer + if use_output_layer: + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + else: + self.output_layer = torch.nn.Identity() + self.num_blocks = num_blocks + self.decoders = torch.nn.ModuleList([ + DecoderLayer( + attention_dim, + COSYVOICE_ATTENTION_CLASSES["selfattn"]( + attention_heads, attention_dim, + self_attention_dropout_rate, key_bias), + COSYVOICE_ATTENTION_CLASSES["selfattn"]( + attention_heads, attention_dim, src_attention_dropout_rate, + key_bias) if src_attention else None, + PositionwiseFeedForward(attention_dim, linear_units, + dropout_rate, activation), + dropout_rate, + normalize_before, + ) for _ in range(self.num_blocks) + ]) + + self.gradient_checkpointing = gradient_checkpointing + self.tie_word_embedding = tie_word_embedding + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor = torch.empty(0), + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: not used in transformer decoder, in order to unify api + with bidirectional decoder + reverse_weight: not used in transformer decoder, in order to unify + api with bidirectional decode + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + torch.tensor(0.0), in order to unify api with bidirectional decoder + olens: (batch, ) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. + https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + tgt = ys_in_pad + maxlen = tgt.size(1) + # tgt_mask: (B, 1, L) + tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) + tgt_mask = tgt_mask.to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) + tgt_mask = tgt_mask & m + x, _ = self.embed(tgt) + if self.gradient_checkpointing and self.training: + x = self.forward_layers_checkpointed(x, tgt_mask, memory, + memory_mask) + else: + x = self.forward_layers(x, tgt_mask, memory, memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.use_output_layer: + x = self.output_layer(x) + olens = tgt_mask.sum(1) + return x, torch.tensor(0.0), olens + + def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor) -> torch.Tensor: + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) + return x + + @torch.jit.ignore(drop=True) + def forward_layers_checkpointed(self, x: torch.Tensor, + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor) -> torch.Tensor: + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = ckpt.checkpoint( + layer.__call__, x, tgt_mask, memory, memory_mask) + return x + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + x, _ = self.embed(tgt) + new_cache = [] + for i, decoder in enumerate(self.decoders): + if cache is None: + c = None + else: + c = cache[i] + x, tgt_mask, memory, memory_mask = decoder(x, + tgt_mask, + memory, + memory_mask, + cache=c) + new_cache.append(x) + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.use_output_layer: + y = torch.log_softmax(self.output_layer(y), dim=-1) + return y, new_cache + + def tie_or_clone_weights(self, jit_mode: bool = True): + """Tie or clone module weights (between word_emb and output_layer) + depending of whether we are using TorchScript or not""" + if not self.use_output_layer: + return + if jit_mode: + logging.info("clone emb.weight to output.weight") + self.output_layer.weight = torch.nn.Parameter( + self.embed[0].weight.clone()) + else: + logging.info("tie emb.weight with output.weight") + self.output_layer.weight = self.embed[0].weight + + if getattr(self.output_layer, "bias", None) is not None: + self.output_layer.bias.data = torch.nn.functional.pad( + self.output_layer.bias.data, + ( + 0, + self.output_layer.weight.shape[0] - + self.output_layer.bias.shape[0], + ), + "constant", + 0, + ) + + +class BiTransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + r_num_blocks: the number of right to left decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + key_bias: whether use bias in attention.linear_k, False for whisper models. + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + r_num_blocks: int = 0, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + key_bias: bool = True, + gradient_checkpointing: bool = False, + tie_word_embedding: bool = False, + ): + + super().__init__() + self.tie_word_embedding = tie_word_embedding + self.left_decoder = TransformerDecoder( + vocab_size, + encoder_output_size, + attention_heads, + linear_units, + num_blocks, + dropout_rate, + positional_dropout_rate, + self_attention_dropout_rate, + src_attention_dropout_rate, + input_layer, + use_output_layer, + normalize_before, + key_bias=key_bias, + gradient_checkpointing=gradient_checkpointing, + tie_word_embedding=tie_word_embedding) + + self.right_decoder = TransformerDecoder( + vocab_size, + encoder_output_size, + attention_heads, + linear_units, + r_num_blocks, + dropout_rate, + positional_dropout_rate, + self_attention_dropout_rate, + src_attention_dropout_rate, + input_layer, + use_output_layer, + normalize_before, + key_bias=key_bias, + gradient_checkpointing=gradient_checkpointing, + tie_word_embedding=tie_word_embedding) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), + used for right to left decoder + reverse_weight: used for right to left decoder + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + r_x: x: decoded token score (right to left decoder) + before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) + if reverse_weight > 0.0: + r_x, _, olens = self.right_decoder(memory, memory_mask, + r_ys_in_pad, ys_in_lens) + return l_x, r_x, olens + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + return self.left_decoder.forward_one_step(memory, memory_mask, tgt, + tgt_mask, cache) + + def tie_or_clone_weights(self, jit_mode: bool = True): + """Tie or clone module weights (between word_emb and output_layer) + depending of whether we are using TorchScript or not""" + self.left_decoder.tie_or_clone_weights(jit_mode) + self.right_decoder.tie_or_clone_weights(jit_mode) diff --git a/cosyvoice/transformer/decoder_layer.py b/cosyvoice/transformer/decoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..91c7c5d7fb2a8e79cea7705646e5381016f73466 --- /dev/null +++ b/cosyvoice/transformer/decoder_layer.py @@ -0,0 +1,132 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decoder self-attention layer definition.""" +from typing import Optional, Tuple + +import torch +from torch import nn + + +class DecoderLayer(nn.Module): + """Single decoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (torch.nn.Module): Inter-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + If `None` is passed, Inter-attention is not used, such as + CIF, GPT, and other decoder only model. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. + """ + + def __init__( + self, + size: int, + self_attn: nn.Module, + src_attn: Optional[nn.Module], + feed_forward: nn.Module, + dropout_rate: float, + normalize_before: bool = True, + ): + """Construct an DecoderLayer object.""" + super().__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size, eps=1e-5) + self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.norm3 = nn.LayerNorm(size, eps=1e-5) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + + def forward( + self, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor, + cache: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute decoded features. + + Args: + tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (torch.Tensor): Mask for input tensor + (#batch, maxlen_out). + memory (torch.Tensor): Encoded memory + (#batch, maxlen_in, size). + memory_mask (torch.Tensor): Encoded memory mask + (#batch, maxlen_in). + cache (torch.Tensor): cached tensors. + (#batch, maxlen_out - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, maxlen_out, size). + torch.Tensor: Mask for output tensor (#batch, maxlen_out). + torch.Tensor: Encoded memory (#batch, maxlen_in, size). + torch.Tensor: Encoded memory mask (#batch, maxlen_in). + + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == ( + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + tgt_q_mask = tgt_mask[:, -1:, :] + + x = residual + self.dropout( + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) + if not self.normalize_before: + x = self.norm1(x) + + if self.src_attn is not None: + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout( + self.src_attn(x, memory, memory, memory_mask)[0]) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask diff --git a/cosyvoice/transformer/embedding.py b/cosyvoice/transformer/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..46130a503f72f103e09d3392077ed352368ce54f --- /dev/null +++ b/cosyvoice/transformer/embedding.py @@ -0,0 +1,293 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Positonal Encoding Module.""" + +import math +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +import numpy as np + + +class PositionalEncoding(torch.nn.Module): + """Positional encoding. + + :param int d_model: embedding dim + :param float dropout_rate: dropout rate + :param int max_len: maximum input length + + PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) + PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) + """ + + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int = 5000, + reverse: bool = False): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.max_len = max_len + + self.pe = torch.zeros(self.max_len, self.d_model) + position = torch.arange(0, self.max_len, + dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * + -(math.log(10000.0) / self.d_model)) + self.pe[:, 0::2] = torch.sin(position * div_term) + self.pe[:, 1::2] = torch.cos(position * div_term) + self.pe = self.pe.unsqueeze(0) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """Add positional encoding. + + Args: + x (torch.Tensor): Input. Its shape is (batch, time, ...) + offset (int, torch.tensor): position offset + + Returns: + torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + torch.Tensor: for compatibility to RelPositionalEncoding + """ + + self.pe = self.pe.to(x.device) + pos_emb = self.position_encoding(offset, x.size(1), False) + x = x * self.xscale + pos_emb + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, + offset: Union[int, torch.Tensor], + size: int, + apply_dropout: bool = True) -> torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + + Args: + offset (int or torch.tensor): start offset + size (int): required size of position encoding + + Returns: + torch.Tensor: Corresponding encoding + """ + # How to subscript a Union type: + # https://github.com/pytorch/pytorch/issues/69434 + if isinstance(offset, int): + assert offset + size <= self.max_len + pos_emb = self.pe[:, offset:offset + size] + elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar + assert offset + size <= self.max_len + pos_emb = self.pe[:, offset:offset + size] + else: # for batched streaming decoding on GPU + assert torch.max(offset) + size <= self.max_len + index = offset.unsqueeze(1) + \ + torch.arange(0, size).to(offset.device) # B X T + flag = index > 0 + # remove negative offset + index = index * flag + pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model + + if apply_dropout: + pos_emb = self.dropout(pos_emb) + return pos_emb + + +class RelPositionalEncoding(PositionalEncoding): + """Relative positional encoding module. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + """Initialize class.""" + super().__init__(d_model, dropout_rate, max_len, reverse=True) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """Compute positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Positional embedding tensor (1, time, `*`). + """ + self.pe = self.pe.to(x.device) + x = x * self.xscale + pos_emb = self.position_encoding(offset, x.size(1), False) + return self.dropout(x), self.dropout(pos_emb) + + +class WhisperPositionalEncoding(PositionalEncoding): + """ Sinusoids position encoding used in openai-whisper.encoder + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500): + super().__init__(d_model, dropout_rate, max_len) + self.xscale = 1.0 + log_timescale_increment = np.log(10000) / (d_model // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * + torch.arange(d_model // 2)) + scaled_time = torch.arange(max_len)[:, np.newaxis] * \ + inv_timescales[np.newaxis, :] + pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) + delattr(self, "pe") + self.register_buffer("pe", pe.unsqueeze(0)) + + +class LearnablePositionalEncoding(PositionalEncoding): + """ Learnable position encoding used in openai-whisper.decoder + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448): + super().__init__(d_model, dropout_rate, max_len) + # NOTE(xcsong): overwrite self.pe & self.xscale + self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model)) + self.xscale = 1.0 + + +class NoPositionalEncoding(torch.nn.Module): + """ No position encoding + """ + + def __init__(self, d_model: int, dropout_rate: float): + super().__init__() + self.d_model = d_model + self.dropout = torch.nn.Dropout(p=dropout_rate) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """ Just return zero vector for interface compatibility + """ + pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) + return self.dropout(x), pos_emb + + def position_encoding(self, offset: Union[int, torch.Tensor], + size: int) -> torch.Tensor: + return torch.zeros(1, size, self.d_model) + + +class EspnetRelPositionalEncoding(torch.nn.Module): + """Relative positional encoding module (new implementation). + + Details can be found in https://github.com/espnet/espnet/pull/2816. + + See : Appendix B in https://arxiv.org/abs/1901.02860 + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + + """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """Construct an PositionalEncoding object.""" + super(EspnetRelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + + Args: + offset (int or torch.tensor): start offset + size (int): required size of position encoding + + Returns: + torch.Tensor: Corresponding encoding + """ + pos_emb = self.pe[ + :, + self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size, + ] + return pos_emb diff --git a/cosyvoice/transformer/encoder.py b/cosyvoice/transformer/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f5e98c683bd2e76b51fda7acdca355348d072d58 --- /dev/null +++ b/cosyvoice/transformer/encoder.py @@ -0,0 +1,567 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +from typing import Tuple + +import torch +import torch.utils.checkpoint as ckpt + +from cosyvoice.transformer.convolution import ConvolutionModule +from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer +from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer +from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward +from cosyvoice.utils.class_utils import ( + COSYVOICE_EMB_CLASSES, + COSYVOICE_SUBSAMPLE_CLASSES, + COSYVOICE_ATTENTION_CLASSES, + COSYVOICE_ACTIVATION_CLASSES, +) +from cosyvoice.utils.mask import make_pad_mask +from cosyvoice.utils.mask import add_optional_chunk_mask + + +class BaseEncoder(torch.nn.Module): + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + gradient_checkpointing: bool = False, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. + """ + super().__init__() + self._output_size = output_size + + self.global_cmvn = global_cmvn + self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + self.gradient_checkpointing = gradient_checkpointing + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. + https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + if self.gradient_checkpointing and self.training: + xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb, + mask_pad) + else: + xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs + + @torch.jit.ignore(drop=True) + def forward_layers_checkpointed(self, xs: torch.Tensor, + chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs, + chunk_masks, pos_emb, + mask_pad) + return xs + + def forward_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), + cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), + att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ Forward just one chunk + + Args: + xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * num_decoding_left_chunks`. + cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (elayers, b=1, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + + Returns: + torch.Tensor: output of current input xs, + with shape (b=1, chunk_size, hidden-dim). + torch.Tensor: new attention cache required for next chunk, with + dynamic shape (elayers, head, ?, d_k * 2) + depending on required_cache_size. + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. + + """ + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) + elayers, cache_t1 = att_cache.size(0), att_cache.size(2) + chunk_size = xs.size(1) + attention_key_size = cache_t1 + chunk_size + pos_emb = self.embed.position_encoding(offset=offset - cache_t1, + size=attention_key_size) + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = attention_key_size + else: + next_cache_start = max(attention_key_size - required_cache_size, 0) + r_att_cache = [] + r_cnn_cache = [] + for i, layer in enumerate(self.encoders): + # NOTE(xcsong): Before layer.forward + # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), + # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, + cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache) + # NOTE(xcsong): After layer.forward + # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), + # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2) + r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) + if self.normalize_before: + xs = self.after_norm(xs) + + # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), + # ? may be larger than cache_t1, it depends on required_cache_size + r_att_cache = torch.cat(r_att_cache, dim=0) + # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) + r_cnn_cache = torch.cat(r_cnn_cache, dim=0) + + return (xs, r_att_cache, r_cnn_cache) + + def forward_chunk_by_chunk( + self, + xs: torch.Tensor, + decoding_chunk_size: int, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ Forward input chunk by chunk with chunk_size like a streaming + fashion + + Here we should pay special attention to computation cache in the + streaming style forward chunk by chunk. Three things should be taken + into account for computation in the current network: + 1. transformer/conformer encoder layers output cache + 2. convolution in conformer + 3. convolution in subsampling + + However, we don't implement subsampling cache for: + 1. We can control subsampling module to output the right result by + overlapping input instead of cache left context, even though it + wastes some computation, but subsampling only takes a very + small fraction of computation in the whole model. + 2. Typically, there are several covolution layers with subsampling + in subsampling module, it is tricky and complicated to do cache + with different convolution layers with different subsampling + rate. + 3. Currently, nn.Sequential is used to stack all the convolution + layers in subsampling, we need to rewrite it to make it work + with cache, which is not prefered. + Args: + xs (torch.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size + """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, att_cache, + cnn_cache) = self.forward_chunk(chunk_xs, offset, + required_cache_size, att_cache, + cnn_cache) + outputs.append(y) + offset += y.size(1) + ys = torch.cat(outputs, 1) + masks = torch.ones((1, 1, ys.size(1)), + device=ys.device, + dtype=torch.bool) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + key_bias: bool = True, + selfattention_layer_type: str = "selfattn", + activation_type: str = "relu", + gradient_checkpointing: bool = False, + ): + """ Construct TransformerEncoder + + See Encoder for the meaning of each parameter. + """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing) + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + self.encoders = torch.nn.ModuleList([ + TransformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](attention_heads, + output_size, + attention_dropout_rate, + key_bias), + PositionwiseFeedForward(output_size, linear_units, + dropout_rate, activation), + dropout_rate, normalize_before) for _ in range(num_blocks) + ]) + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + key_bias: bool = True, + gradient_checkpointing: bool = False, + ): + """Construct ConformerEncoder + + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. + key_bias: whether use bias in attention.linear_k, False for whisper models. + """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing) + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + + # self-attention module definition + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + key_bias, + ) + # feed-forward module definition + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(num_blocks) + ]) + + + + +class BlockConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + key_bias: bool = True, + gradient_checkpointing: bool = False, + block_size=25, + ): + """Construct ConformerEncoder + + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. + key_bias: whether use bias in attention.linear_k, False for whisper models. + """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing) + activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]() + + # self-attention module definition + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + key_bias, + block_size, + ) + # feed-forward module definition + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(num_blocks) + ]) + self.block_size=block_size diff --git a/cosyvoice/transformer/encoder_layer.py b/cosyvoice/transformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..dfd758bc1cc7780aa4f6a322a264c879b74a6cfe --- /dev/null +++ b/cosyvoice/transformer/encoder_layer.py @@ -0,0 +1,236 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" + +from typing import Optional, Tuple + +import torch +from torch import nn + + +class TransformerEncoderLayer(nn.Module): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. + """ + + def __init__( + self, + size: int, + self_attn: torch.nn.Module, + feed_forward: torch.nn.Module, + dropout_rate: float, + normalize_before: bool = True, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size, eps=1e-5) + self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, time, size) + mask (torch.Tensor): Mask tensor for the input (#batch, time,time), + (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): just for interface compatibility + to ConformerEncoderLayer + mask_pad (torch.Tensor): does not used in transformer layer, + just for unified api with conformer. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, cache_t2), not used here, it's for interface + compatibility to ConformerEncoderLayer. + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time). + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). + + """ + residual = x + if self.normalize_before: + x = self.norm1(x) + x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb=pos_emb, cache=att_cache) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + return x, mask, new_att_cache, fake_cnn_cache + + +class ConformerEncoderLayer(nn.Module): + """Encoder layer module. + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + feed_forward_macaron (torch.nn.Module): Additional feed-forward module + instance. + `PositionwiseFeedForward` instance can be used as the argument. + conv_module (torch.nn.Module): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: use layer_norm after each sub-block. + """ + + def __init__( + self, + size: int, + self_attn: torch.nn.Module, + feed_forward: Optional[nn.Module] = None, + feed_forward_macaron: Optional[nn.Module] = None, + conv_module: Optional[nn.Module] = None, + dropout_rate: float = 0.1, + normalize_before: bool = True, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module + self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = nn.LayerNorm(size, eps=1e-5) # for the CNN module + self.norm_final = nn.LayerNorm( + size, eps=1e-5) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, time, size) + mask (torch.Tensor): Mask tensor for the input (#batch, time,time), + (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): positional encoding, must not be None + for ConformerEncoderLayer. + mask_pad (torch.Tensor): batch padding mask used for conv module. + (#batch, 1,time), (0, 0, 0) means fake mask. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, cache_t2) + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time). + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). + """ + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, + att_cache) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + # Fake new cnn cache here, and then change it in conv_module + new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) + x = residual + self.dropout(x) + + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + return x, mask, new_att_cache, new_cnn_cache diff --git a/cosyvoice/transformer/label_smoothing_loss.py b/cosyvoice/transformer/label_smoothing_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..feacabf09609ee6eb047c89ce18d372256c72c71 --- /dev/null +++ b/cosyvoice/transformer/label_smoothing_loss.py @@ -0,0 +1,96 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Label smoothing module.""" + +import torch +from torch import nn + + +class LabelSmoothingLoss(nn.Module): + """Label-smoothing loss. + + In a standard CE loss, the label's data distribution is: + [0,1,2] -> + [ + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + + In the smoothing version CE Loss,some probabilities + are taken from the true label prob (1.0) and are divided + among other labels. + + e.g. + smoothing=0.1 + [0,1,2] -> + [ + [0.9, 0.05, 0.05], + [0.05, 0.9, 0.05], + [0.05, 0.05, 0.9], + ] + + Args: + size (int): the number of class + padding_idx (int): padding class id which will be ignored for loss + smoothing (float): smoothing rate (0.0 means the conventional CE) + normalize_length (bool): + normalize loss by sequence length if True + normalize loss by batch size if False + """ + + def __init__(self, + size: int, + padding_idx: int, + smoothing: float, + normalize_length: bool = False): + """Construct an LabelSmoothingLoss object.""" + super(LabelSmoothingLoss, self).__init__() + self.criterion = nn.KLDivLoss(reduction="none") + self.padding_idx = padding_idx + self.confidence = 1.0 - smoothing + self.smoothing = smoothing + self.size = size + self.normalize_length = normalize_length + + def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + """Compute loss between x and target. + + The model outputs and data labels tensors are flatten to + (batch*seqlen, class) shape and a mask is applied to the + padding part which should not be calculated for loss. + + Args: + x (torch.Tensor): prediction (batch, seqlen, class) + target (torch.Tensor): + target signal masked with self.padding_id (batch, seqlen) + Returns: + loss (torch.Tensor) : The KL loss, scalar float value + """ + assert x.size(2) == self.size + batch_size = x.size(0) + x = x.view(-1, self.size) + target = target.view(-1) + # use zeros_like instead of torch.no_grad() for true_dist, + # since no_grad() can not be exported by JIT + true_dist = torch.zeros_like(x) + true_dist.fill_(self.smoothing / (self.size - 1)) + ignore = target == self.padding_idx # (B,) + total = len(target) - ignore.sum().item() + target = target.masked_fill(ignore, 0) # avoid -1 index + true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) + denom = total if self.normalize_length else batch_size + return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/cosyvoice/transformer/positionwise_feed_forward.py b/cosyvoice/transformer/positionwise_feed_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a2cf6e7315e3a5ed2794423daff0a59cc5b208 --- /dev/null +++ b/cosyvoice/transformer/positionwise_feed_forward.py @@ -0,0 +1,115 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Positionwise feed forward layer definition.""" + +import torch + + +class PositionwiseFeedForward(torch.nn.Module): + """Positionwise feed forward layer. + + FeedForward are appied on each position of the sequence. + The output dim is same with the input dim. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + activation (torch.nn.Module): Activation function + """ + + def __init__( + self, + idim: int, + hidden_units: int, + dropout_rate: float, + activation: torch.nn.Module = torch.nn.ReLU(), + ): + """Construct a PositionwiseFeedForward object.""" + super(PositionwiseFeedForward, self).__init__() + self.w_1 = torch.nn.Linear(idim, hidden_units) + self.activation = activation + self.dropout = torch.nn.Dropout(dropout_rate) + self.w_2 = torch.nn.Linear(hidden_units, idim) + + def forward(self, xs: torch.Tensor) -> torch.Tensor: + """Forward function. + + Args: + xs: input tensor (B, L, D) + Returns: + output tensor, (B, L, D) + """ + return self.w_2(self.dropout(self.activation(self.w_1(xs)))) + + +class MoEFFNLayer(torch.nn.Module): + """ + Mixture of expert with Positionwise feed forward layer + See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf + The output dim is same with the input dim. + + Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 + https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 + Args: + n_expert: number of expert. + n_expert_per_token: The actual number of experts used for each frame + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + activation (torch.nn.Module): Activation function + """ + + def __init__( + self, + n_expert: int, + n_expert_per_token: int, + idim: int, + hidden_units: int, + dropout_rate: float, + activation: torch.nn.Module = torch.nn.ReLU(), + ): + super(MoEFFNLayer, self).__init__() + self.gate = torch.nn.Linear(idim, n_expert, bias=False) + self.experts = torch.nn.ModuleList( + PositionwiseFeedForward(idim, hidden_units, dropout_rate, + activation) for _ in range(n_expert)) + self.n_expert_per_token = n_expert_per_token + + def forward(self, xs: torch.Tensor) -> torch.Tensor: + """Foward function. + Args: + xs: input tensor (B, L, D) + Returns: + output tensor, (B, L, D) + + """ + B, L, D = xs.size( + ) # batch size, sequence length, embedding dimension (idim) + xs = xs.view(-1, D) # (B*L, D) + router = self.gate(xs) # (B*L, n_expert) + logits, indices = torch.topk( + router, self.n_expert_per_token + ) # probs:(B*L, n_expert), indices: (B*L, n_expert) + weights = torch.nn.functional.softmax( + logits, dim=1, + dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) + output = torch.zeros_like(xs) # (B*L, D) + for i, expert in enumerate(self.experts): + mask = indices == i + batch_idx, ith_expert = torch.where(mask) + output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( + xs[batch_idx]) + return output.view(B, L, D) diff --git a/cosyvoice/transformer/subsampling.py b/cosyvoice/transformer/subsampling.py new file mode 100644 index 0000000000000000000000000000000000000000..e17c2e324e3afb24e1b619effe29cef07c9c5b3a --- /dev/null +++ b/cosyvoice/transformer/subsampling.py @@ -0,0 +1,383 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Subsampling layer definition.""" + +from typing import Tuple, Union + +import torch + + +class BaseSubsampling(torch.nn.Module): + + def __init__(self): + super().__init__() + self.right_context = 0 + self.subsampling_rate = 1 + + def position_encoding(self, offset: Union[int, torch.Tensor], + size: int) -> torch.Tensor: + return self.pos_enc.position_encoding(offset, size) + + +class EmbedinigNoSubsampling(BaseSubsampling): + """Embedding input without subsampling + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + super().__init__() + self.embed = torch.nn.Embedding(idim, odim) + self.pos_enc = pos_enc_class + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + + """ + x = self.embed(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask + + +class LinearNoSubsampling(BaseSubsampling): + """Linear transform the input without subsampling + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an linear object.""" + super().__init__() + self.out = torch.nn.Sequential( + torch.nn.Linear(idim, odim), + torch.nn.LayerNorm(odim, eps=1e-5), + torch.nn.Dropout(dropout_rate), + ) + self.pos_enc = pos_enc_class + self.right_context = 0 + self.subsampling_rate = 1 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + + """ + x = self.out(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask + + +class Conv1dSubsampling2(BaseSubsampling): + """Convolutional 1D subsampling (to 1/2 length). + It is designed for Whisper, ref: + https://github.com/openai/whisper/blob/main/whisper/model.py + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv1dSubsampling2 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1), + torch.nn.GELU(), + torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1), + torch.nn.GELU(), + ) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + self.subsampling_rate = 2 + # 4 = (3 - 1) * 1 + (3 - 1) * 1 + self.right_context = 4 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 2. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 2. + torch.Tensor: positional encoding + + """ + time = x.size(1) + x = x.transpose(1, 2) # (b, f, t) + x = self.conv(x) + x = x.transpose(1, 2) # (b, t, f) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, (time + 1) % 2::2] + + +class Conv2dSubsampling4(BaseSubsampling): + """Convolutional 2D subsampling (to 1/4 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling4 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + ) + self.out = torch.nn.Sequential( + torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + self.subsampling_rate = 4 + # 6 = (3 - 1) * 1 + (3 - 1) * 2 + self.right_context = 6 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 4. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 4. + torch.Tensor: positional encoding + + """ + x = x.unsqueeze(1) # (b, c=1, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] + + +class Conv2dSubsampling6(BaseSubsampling): + """Convolutional 2D subsampling (to 1/6 length). + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (torch.nn.Module): Custom position encoding layer. + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling6 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 5, 3), + torch.nn.ReLU(), + ) + self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), + odim) + self.pos_enc = pos_enc_class + # 10 = (3 - 1) * 1 + (5 - 1) * 2 + self.subsampling_rate = 6 + self.right_context = 10 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 6. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 6. + torch.Tensor: positional encoding + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] + + +class Conv2dSubsampling8(BaseSubsampling): + """Convolutional 2D subsampling (to 1/8 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling8 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + ) + self.linear = torch.nn.Linear( + odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) + self.pos_enc = pos_enc_class + self.subsampling_rate = 8 + # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 + self.right_context = 14 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 8. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 8. + torch.Tensor: positional encoding + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] + + +class LegacyLinearNoSubsampling(BaseSubsampling): + """Linear transform the input without subsampling + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an linear object.""" + super().__init__() + self.out = torch.nn.Sequential( + torch.nn.Linear(idim, odim), + torch.nn.LayerNorm(odim, eps=1e-5), + torch.nn.Dropout(dropout_rate), + torch.nn.ReLU(), + ) + self.pos_enc = pos_enc_class + self.right_context = 0 + self.subsampling_rate = 1 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + + """ + x = self.out(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask diff --git a/cosyvoice/utils/__init__.py b/cosyvoice/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc b/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc52d04255f858206d8e7c848cfd356ea8abd2db Binary files /dev/null and b/cosyvoice/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc b/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c86792d294e74cd009e955e53a58ec5d54eb60c5 Binary files /dev/null and b/cosyvoice/utils/__pycache__/block_mask_util.cpython-310.pyc differ diff --git a/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc b/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..928955fce7671cb9caf9cdfe00ce259a569d4973 Binary files /dev/null and b/cosyvoice/utils/__pycache__/class_utils.cpython-310.pyc differ diff --git a/cosyvoice/utils/__pycache__/common.cpython-310.pyc b/cosyvoice/utils/__pycache__/common.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e02c8b48d489ea45bc41e09e12a99e6886d3b4e4 Binary files /dev/null and b/cosyvoice/utils/__pycache__/common.cpython-310.pyc differ diff --git a/cosyvoice/utils/__pycache__/mask.cpython-310.pyc b/cosyvoice/utils/__pycache__/mask.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bc5034809799063a08150aa3ff7ae6a6184b486 Binary files /dev/null and b/cosyvoice/utils/__pycache__/mask.cpython-310.pyc differ diff --git a/cosyvoice/utils/block_mask_util.py b/cosyvoice/utils/block_mask_util.py new file mode 100644 index 0000000000000000000000000000000000000000..58e22b051e2ed91fbe51f3287ae09ac31fee65da --- /dev/null +++ b/cosyvoice/utils/block_mask_util.py @@ -0,0 +1,34 @@ +import torch + + +def create_grid_mask(seq_length, trunck_length, fill_triangle): + assert seq_length > 0 + + # 先不考虑seen_length创建一个grid mask: + if fill_triangle: + mask = 1 - torch.triu(torch.ones(seq_length, seq_length), diagonal=1) + # 下三角与主对角线都为1 + else: + mask = torch.zeros(seq_length, seq_length) + + for i in range(seq_length): + trunck_idx = i // trunck_length + trunck_start = trunck_idx * trunck_length + trunck_end = trunck_length + trunck_start + mask[i][trunck_start:trunck_end] = 1 + + return mask + + +if __name__ == "__main__": + mask = create_grid_mask(seq_length=8, trunck_length=3, fill_triangle=True).int() + print(mask) +# tensor([[1, 1, 1, 0, 0, 0, 0, 0], +# [1, 1, 1, 0, 0, 0, 0, 0], +# [1, 1, 1, 0, 0, 0, 0, 0], +# [1, 1, 1, 1, 1, 1, 0, 0], +# [1, 1, 1, 1, 1, 1, 0, 0], +# [1, 1, 1, 1, 1, 1, 0, 0], +# [1, 1, 1, 1, 1, 1, 1, 1], +# [1, 1, 1, 1, 1, 1, 1, 1]] + diff --git a/cosyvoice/utils/class_utils.py b/cosyvoice/utils/class_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f2250c792cbf5f2cdf2705500fcdd22c5615a133 --- /dev/null +++ b/cosyvoice/utils/class_utils.py @@ -0,0 +1,72 @@ +# Copyright [2023-11-28] +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from cosyvoice.transformer.activation import Swish +from cosyvoice.transformer.subsampling import ( + LinearNoSubsampling, + EmbedinigNoSubsampling, + Conv1dSubsampling2, + Conv2dSubsampling4, + Conv2dSubsampling6, + Conv2dSubsampling8, +) +from cosyvoice.transformer.embedding import (PositionalEncoding, + RelPositionalEncoding, + WhisperPositionalEncoding, + LearnablePositionalEncoding, + NoPositionalEncoding) +from cosyvoice.transformer.attention import (MultiHeadedAttention, + RelPositionMultiHeadedAttention, + BlockRelPositionMultiHeadedAttention) +from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding +from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling + + +COSYVOICE_ACTIVATION_CLASSES = { + "hardtanh": torch.nn.Hardtanh, + "tanh": torch.nn.Tanh, + "relu": torch.nn.ReLU, + "selu": torch.nn.SELU, + "swish": getattr(torch.nn, "SiLU", Swish), + "gelu": torch.nn.GELU, +} + +COSYVOICE_SUBSAMPLE_CLASSES = { + "linear": LinearNoSubsampling, + "linear_legacy": LegacyLinearNoSubsampling, + "embed": EmbedinigNoSubsampling, + "conv1d2": Conv1dSubsampling2, + "conv2d": Conv2dSubsampling4, + "conv2d6": Conv2dSubsampling6, + "conv2d8": Conv2dSubsampling8, + 'paraformer_dummy': torch.nn.Identity +} + +COSYVOICE_EMB_CLASSES = { + "embed": PositionalEncoding, + "abs_pos": PositionalEncoding, + "rel_pos": RelPositionalEncoding, + "rel_pos_espnet": EspnetRelPositionalEncoding, + "no_pos": NoPositionalEncoding, + "abs_pos_whisper": WhisperPositionalEncoding, + "embed_learnable_pe": LearnablePositionalEncoding, +} + +COSYVOICE_ATTENTION_CLASSES = { + "selfattn": MultiHeadedAttention, + "rel_selfattn": RelPositionMultiHeadedAttention, + "block_rel_selfattn": BlockRelPositionMultiHeadedAttention, +} diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py new file mode 100644 index 0000000000000000000000000000000000000000..6ec5e178359031e42c64090eede8aabfdf067afa --- /dev/null +++ b/cosyvoice/utils/common.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Unility functions for Transformer.""" + +from typing import List + +import torch + +IGNORE_ID = -1 + + +def pad_list(xs: List[torch.Tensor], pad_value: int): + """Perform padding for the list of tensors. + + Args: + xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). + + Examples: + >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) + + """ + max_len = max([len(item) for item in xs]) + batchs = len(xs) + ndim = xs[0].ndim + if ndim == 1: + pad_res = torch.zeros(batchs, + max_len, + dtype=xs[0].dtype, + device=xs[0].device) + elif ndim == 2: + pad_res = torch.zeros(batchs, + max_len, + xs[0].shape[1], + dtype=xs[0].dtype, + device=xs[0].device) + elif ndim == 3: + pad_res = torch.zeros(batchs, + max_len, + xs[0].shape[1], + xs[0].shape[2], + dtype=xs[0].dtype, + device=xs[0].device) + else: + raise ValueError(f"Unsupported ndim: {ndim}") + pad_res.fill_(pad_value) + for i in range(batchs): + pad_res[i, :len(xs[i])] = xs[i] + return pad_res + + +def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, + ignore_label: int) -> torch.Tensor: + """Calculate accuracy. + + Args: + pad_outputs (Tensor): Prediction tensors (B * Lmax, D). + pad_targets (LongTensor): Target label tensors (B, Lmax). + ignore_label (int): Ignore label id. + + Returns: + torch.Tensor: Accuracy value (0.0 - 1.0). + + """ + pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), + pad_outputs.size(1)).argmax(2) + mask = pad_targets != ignore_label + numerator = torch.sum( + pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + denominator = torch.sum(mask) + return (numerator / denominator).detach() + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) diff --git a/cosyvoice/utils/executor.py b/cosyvoice/utils/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..5d9159411eab228fb36b46d5c728f3989ce5e920 --- /dev/null +++ b/cosyvoice/utils/executor.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from contextlib import nullcontext +import os + +import torch +import torch.distributed as dist +import tqdm + +from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join + + +class Executor: + + def __init__(self): + self.step = 0 + self.epoch = 0 + self.rank = int(os.environ.get('RANK', 0)) + self.device = torch.device('cuda:{}'.format(self.rank)) + + def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join): + ''' Train one epoch + ''' + + lr = optimizer.param_groups[0]['lr'] + logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank)) + logging.info('using accumulate grad, new batch size is {} times' + ' larger than before'.format(info_dict['accum_grad'])) + # A context manager to be used in conjunction with an instance of + # torch.nn.parallel.DistributedDataParallel to be able to train + # with uneven inputs across participating processes. + model.train() + model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext + with model_context(): + for batch_idx, batch_dict in tqdm.tqdm(enumerate(train_data_loader)): + # print("======== forword ========") + info_dict["tag"] = "TRAIN" + info_dict["step"] = self.step + info_dict["epoch"] = self.epoch + info_dict["batch_idx"] = batch_idx + if cosyvoice_join(group_join, info_dict): + break + # import pdb + # pdb.set_trace() + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0: + context = model.no_sync + # Used for single gpu training and DDP gradient synchronization + # processes. + else: + context = nullcontext + + new_batch_dict={ + # "utts":batch_dict["utts"], + "speech_token":batch_dict["speech_token"], + "speech_token_len":batch_dict["speech_token_len"], + "speech_feat":batch_dict["speech_feat"], + "speech_feat_len":batch_dict["speech_feat_len"], + "embedding":batch_dict["embedding"], + # "embedding":torch.zeros((batch_dict["speech_feat"].size(0),192),device=batch_dict["speech_feat"].device) + } + + with context(): + info_dict = batch_forward(model, new_batch_dict, info_dict) + info_dict = batch_backward(model, info_dict) + + info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict) + log_per_step(writer, info_dict) + # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save + if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and (batch_idx + 1) % info_dict["accum_grad"] == 0: + dist.barrier() + # try: + # dist.barrier() + # except RuntimeError as e: + # logging.info('except RuntimeError as e: {}'.format(e)) + self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False) + model.train() + if (batch_idx + 1) % info_dict["accum_grad"] == 0: + self.step += 1 + dist.barrier() + # try: + # dist.barrier() + # except RuntimeError as e: + # logging.info('except RuntimeError as e: {}'.format(e)) + self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True) + + @torch.inference_mode() + def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True): + ''' Cross validation on + ''' + logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank)) + model.eval() + total_num_utts, total_loss_dict = 0, {} # avoid division by 0 + for batch_idx, batch_dict in enumerate(cv_data_loader): + info_dict["tag"] = "CV" + info_dict["step"] = self.step + info_dict["epoch"] = self.epoch + info_dict["batch_idx"] = batch_idx + + # num_utts = len(batch_dict["utts"]) + num_utts=batch_dict["speech_token"].size(0) + total_num_utts += num_utts + + info_dict = batch_forward(model, batch_dict, info_dict) + + for k, v in info_dict['loss_dict'].items(): + if k not in total_loss_dict: + total_loss_dict[k] = [] + total_loss_dict[k].append(v.item() * num_utts) + log_per_step(None, info_dict) + for k, v in total_loss_dict.items(): + total_loss_dict[k] = sum(v) / total_num_utts + info_dict['loss_dict'] = total_loss_dict + log_per_save(writer, info_dict) + model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1) + save_model(model, model_name, info_dict) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d4179e109da4073ca9be75767c3f59d2ee68a5cf --- /dev/null +++ b/cosyvoice/utils/file_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import torchaudio + + +def read_lists(list_file): + lists = [] + with open(list_file, 'r', encoding='utf8') as fin: + for line in fin: + lists.append(line.strip()) + return lists + +def read_json_lists(list_file): + lists = read_lists(list_file) + results = {} + for fn in lists: + with open(fn, 'r', encoding='utf8') as fin: + results.update(json.load(fin)) + return results + +def load_wav(wav, target_sr): + speech, sample_rate = torchaudio.load(wav) + speech = speech.mean(dim=0, keepdim=True) + if sample_rate != target_sr: + assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) + speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) + return speech + +def speed_change(waveform, sample_rate, speed_factor: str): + effects = [ + ["tempo", speed_factor], # speed_factor + ["rate", f"{sample_rate}"] + ] + augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor( + waveform, + sample_rate, + effects + ) + return augmented_waveform, new_sample_rate diff --git a/cosyvoice/utils/frontend_utils.py b/cosyvoice/utils/frontend_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..59489a7a6fdb442b1134baac3e5eef0211130954 --- /dev/null +++ b/cosyvoice/utils/frontend_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') + +# whether contain chinese character +def contains_chinese(text): + return bool(chinese_char_pattern.search(text)) + + +# replace special symbol +def replace_corner_mark(text): + text = text.replace('²', '平方') + text = text.replace('³', '立方') + return text + + +# remove meaningless symbol +def remove_bracket(text): + text = text.replace('(', '').replace(')', '') + text = text.replace('【', '').replace('】', '') + text = text.replace('`', '').replace('`', '') + text = text.replace("——", " ") + return text + + +# spell Arabic numerals +def spell_out_number(text: str, inflect_parser): + new_text = [] + st = None + for i, c in enumerate(text): + if not c.isdigit(): + if st is not None: + num_str = inflect_parser.number_to_words(text[st: i]) + new_text.append(num_str) + st = None + new_text.append(c) + else: + if st is None: + st = i + if st is not None and st < len(text): + num_str = inflect_parser.number_to_words(text[st:]) + new_text.append(num_str) + return ''.join(new_text) + + +# split paragrah logic: +# 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len +# 2. cal sentence len according to lang +# 3. split sentence according to puncatation +def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): + def calc_utt_length(_text: str): + if lang == "zh": + return len(_text) + else: + return len(tokenize(_text)) + + def should_merge(_text: str): + if lang == "zh": + return len(_text) < merge_len + else: + return len(tokenize(_text)) < merge_len + + if lang == "zh": + pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] + else: + pounc = ['.', '?', '!', ';', ':'] + if comma_split: + pounc.extend([',', ',']) + st = 0 + utts = [] + for i, c in enumerate(text): + if c in pounc: + if len(text[st: i]) > 0: + utts.append(text[st: i] + c) + if i + 1 < len(text) and text[i + 1] in ['"', '”']: + tmp = utts.pop(-1) + utts.append(tmp + text[i + 1]) + st = i + 2 + else: + st = i + 1 + if len(utts) == 0: + if lang == "zh": + utts.append(text + '。') + else: + utts.append(text + '.') + final_utts = [] + cur_utt = "" + for utt in utts: + if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: + final_utts.append(cur_utt) + cur_utt = "" + cur_utt = cur_utt + utt + if len(cur_utt) > 0: + if should_merge(cur_utt) and len(final_utts) != 0: + final_utts[-1] = final_utts[-1] + cur_utt + else: + final_utts.append(cur_utt) + + return final_utts + + +# remove blank between chinese character +def replace_blank(text: str): + out_str = [] + for i, c in enumerate(text): + if c == " ": + if ((text[i + 1].isascii() and text[i + 1] != " ") and + (text[i - 1].isascii() and text[i - 1] != " ")): + out_str.append(c) + else: + out_str.append(c) + return "".join(out_str) diff --git a/cosyvoice/utils/mask.py b/cosyvoice/utils/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..2b460bbd5adb4bd61d643ace71400a14fe314236 --- /dev/null +++ b/cosyvoice/utils/mask.py @@ -0,0 +1,227 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +''' +def subsequent_mask( + size: int, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size). + + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. + + Args: + size (int): size of mask + str device (str): "cpu" or "cuda" or torch.Tensor.device + dtype (torch.device): result dtype + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + ret = torch.ones(size, size, device=device, dtype=torch.bool) + return torch.tril(ret) +''' + + +def subsequent_mask( + size: int, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size). + + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. + + Args: + size (int): size of mask + str device (str): "cpu" or "cuda" or torch.Tensor.device + dtype (torch.device): result dtype + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + arange = torch.arange(size, device=device) + mask = arange.expand(size, size) + arange = arange.unsqueeze(-1) + mask = mask <= arange + return mask + + +def subsequent_chunk_mask( + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size) with chunk size, + this is for streaming encoder + + Args: + size (int): size of mask + chunk_size (int): size of chunk + num_left_chunks (int): number of left chunks + <0: use full chunk + >=0: use num_left_chunks + device (torch.device): "cpu" or "cuda" or torch.Tensor.device + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_chunk_mask(4, 2) + [[1, 1, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 1], + [1, 1, 1, 1]] + """ + ret = torch.zeros(size, size, device=device, dtype=torch.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) + ending = min((i // chunk_size + 1) * chunk_size, size) + ret[i, start:ending] = True + return ret + + +def add_optional_chunk_mask(xs: torch.Tensor, + masks: torch.Tensor, + use_dynamic_chunk: bool, + use_dynamic_left_chunk: bool, + decoding_chunk_size: int, + static_chunk_size: int, + num_decoding_left_chunks: int, + enable_full_context: bool = True): + """ Apply optional mask for encoder. + + Args: + xs (torch.Tensor): padded input, (B, L, D), L for max length + mask (torch.Tensor): mask for xs, (B, 1, L) + use_dynamic_chunk (bool): whether to use dynamic chunk or not + use_dynamic_left_chunk (bool): whether to use dynamic left chunk for + training. + decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + enable_full_context (bool): + True: chunk size is either [1, 25] or full context(max_len) + False: chunk size ~ U[1, 25] + + Returns: + torch.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.size(1) + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. + chunk_size = torch.randint(1, max_len, (1, )).item() + num_left_chunks = -1 + if chunk_size > max_len // 2 and enable_full_context: + chunk_size = max_len + else: + chunk_size = chunk_size % 25 + 1 + if use_dynamic_left_chunk: + max_left_chunks = (max_len - 1) // chunk_size + num_left_chunks = torch.randint(0, max_left_chunks, + (1, )).item() + chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + elif static_chunk_size > 0: + num_left_chunks = num_decoding_left_chunks + chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + else: + chunk_masks = masks + return chunk_masks + + +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """Make mask tensor containing indices of padded part. + + See description of make_non_pad_mask. + + Args: + lengths (torch.Tensor): Batch of lengths (B,). + Returns: + torch.Tensor: Mask tensor containing indices of padded part. + + Examples: + >>> lengths = [5, 3, 2] + >>> make_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ + batch_size = lengths.size(0) + max_len = max_len if max_len > 0 else lengths.max().item() + seq_range = torch.arange(0, + max_len, + dtype=torch.int64, + device=lengths.device) + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + seq_length_expand = lengths.unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + return mask diff --git a/cosyvoice/utils/scheduler.py b/cosyvoice/utils/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf4803f81bd7a3cee4af7bd8b6af2d3b46304d7 --- /dev/null +++ b/cosyvoice/utils/scheduler.py @@ -0,0 +1,739 @@ +# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +# 2022 Ximalaya Inc (Yuguang Yang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +# NeMo(https://github.com/NVIDIA/NeMo) + +from typing import Union + +import math +import warnings +import torch +from torch.optim.lr_scheduler import _LRScheduler + + +class WarmupLR(_LRScheduler): + """The WarmupLR scheduler + + This scheduler is almost same as NoamLR Scheduler except for following + difference: + + NoamLR: + lr = optimizer.lr * model_size ** -0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + WarmupLR: + lr = optimizer.lr * warmup_step ** 0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + + Note that the maximum lr equals to optimizer.lr in this scheduler. + + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + warmup_steps: Union[int, float] = 25000, + last_epoch: int = -1, + ): + self.warmup_steps = warmup_steps + + # __init__() must be invoked before setting field + # because step() is also invoked in __init__() + super().__init__(optimizer, last_epoch) + + def __repr__(self): + return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" + + def get_lr(self): + step_num = self.last_epoch + 1 + if self.warmup_steps == 0: + return [lr * step_num**-0.5 for lr in self.base_lrs] + else: + return [ + lr * self.warmup_steps**0.5 * + min(step_num**-0.5, step_num * self.warmup_steps**-1.5) + for lr in self.base_lrs + ] + + def set_step(self, step: int): + self.last_epoch = step + + +class WarmupPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__(self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + assert not (warmup_steps is not None and warmup_ratio is not None),\ + "Either use particular number of step or ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + if step <= self.warmup_steps and self.warmup_steps > 0: + return self._get_warmup_lr(step) + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_warmup_lr(self, step): + lr_val = (step + 1) / (self.warmup_steps + 1) + return [initial_lr * lr_val for initial_lr in self.base_lrs] + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +class SquareRootConstantPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__(self, + optimizer, + *, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + assert not (constant_steps is not None + and constant_ratio is not None), \ + "Either use particular number of step or ratio" + assert constant_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + if constant_steps is not None: + self.constant_steps = constant_steps + elif constant_ratio is not None: + self.constant_steps = int(constant_ratio * max_steps) + else: + self.constant_steps = 0 + + self.constant_lr = 1 / (constant_steps**0.5) + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + if step <= self.constant_steps: + return [self.constant_lr for _ in self.base_lrs] + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +class WarmupHoldPolicy(WarmupPolicy): + """Variant of WarmupPolicy which maintains high + learning rate for a defined number of steps. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + hold_steps: Number of training steps to + hold the learning rate after warm up + hold_ratio: Ratio of hold steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__( + self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + hold_steps=None, + hold_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, + ): + assert not (hold_steps is not None and hold_ratio is not None), \ + "Either use particular number of step or ratio" + assert hold_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + self.min_lr = min_lr + self._last_warmup_lr = 0.0 + + # Necessary to duplicate as class attributes are hidden in inner class + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + if hold_steps is not None: + self.hold_steps = hold_steps + self.warmup_steps + elif hold_ratio is not None: + self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps + else: + self.hold_steps = 0 + + super().__init__( + optimizer, + warmup_steps=warmup_steps, + warmup_ratio=warmup_ratio, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + ) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed by the scheduler," + " " + "please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + # Warmup phase + if step <= self.warmup_steps and self.warmup_steps > 0: + return self._get_warmup_lr(step) + + # Hold phase + if (step >= self.warmup_steps) and (step < self.hold_steps): + return self.base_lrs + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + +class WarmupAnnealHoldPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + min_lr: Minimum lr to hold the learning rate after decay at. + constant_steps: Number of steps to keep lr constant at. + constant_ratio: Ratio of steps to keep lr constant. + """ + + def __init__( + self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, + ): + assert not (warmup_steps is not None + and warmup_ratio is not None), \ + "Either use particular number of step or ratio" + assert not (constant_steps is not None + and constant_ratio is not None), \ + "Either use constant_steps or constant_ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + if constant_steps is not None: + self.constant_steps = constant_steps + elif constant_ratio is not None: + self.constant_steps = int(constant_ratio * max_steps) + else: + self.constant_steps = 0 + + self.decay_steps = max_steps - (self.constant_steps + + self.warmup_steps) + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + # Warmup steps + if self.warmup_steps > 0 and step <= self.warmup_steps: + return self._get_warmup_lr(step) + + # Constant steps after warmup and decay + if self.constant_steps > 0 and ( + self.warmup_steps + self.decay_steps) < step <= self.max_steps: + return self._get_constant_lr(step) + + # Min lr after max steps of updates + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_warmup_lr(self, step): + lr_val = (step + 1) / (self.warmup_steps + 1) + return [initial_lr * lr_val for initial_lr in self.base_lrs] + + def _get_constant_lr(self, step): + return [self.min_lr for _ in self.base_lrs] + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +def _squareroot_annealing(initial_lr, step, max_steps, min_lr): + mult = ((max_steps - step) / max_steps)**0.5 + out_lr = initial_lr * mult + out_lr = max(out_lr, min_lr) + return out_lr + + +def _square_annealing(initial_lr, step, max_steps, min_lr): + mult = ((max_steps - step) / max_steps)**2 + out_lr = initial_lr * mult + out_lr = max(out_lr, min_lr) + return out_lr + + +def _cosine_annealing(initial_lr, step, max_steps, min_lr): + mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) + out_lr = (initial_lr - min_lr) * mult + min_lr + return out_lr + + +def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, + decay_steps, min_lr): + assert max_lr > min_lr + # Use linear warmup for the initial part. + if warmup_steps > 0 and step <= warmup_steps: + return max_lr * float(step) / float(warmup_steps) + + # For any steps larger than `decay_steps`, use `min_lr`. + if step > warmup_steps + decay_steps: + return min_lr + + # If we are done with the warmup period, use the decay style. + num_steps_ = step - warmup_steps + decay_steps_ = decay_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = max_lr - min_lr + + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + + return min_lr + coeff * delta_lr + + +def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): + if cycle: + multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) + decay_steps *= multiplier + else: + step = min(step, decay_steps) + p = step / decay_steps + lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) + lr += min_lr + return lr + + +def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps, + decay_rate, min_lr): + # hold_steps = total number of steps + # to hold the LR, not the warmup + hold steps. + T_warmup_decay = max(1, warmup_steps**decay_rate) + T_hold_decay = max(1, (step - hold_steps)**decay_rate) + lr = (initial_lr * T_warmup_decay) / T_hold_decay + lr = max(lr, min_lr) + return lr + + +class SquareAnnealing(WarmupPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=1e-5, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + new_lrs = [ + _square_annealing( + initial_lr=initial_lr, + step=step - self.warmup_steps, + max_steps=self.max_steps - self.warmup_steps, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + return new_lrs + + +class SquareRootAnnealing(WarmupPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=0, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + new_lrs = [ + _squareroot_annealing(initial_lr=initial_lr, + step=step, + max_steps=self.max_steps, + min_lr=self.min_lr) + for initial_lr in self.base_lrs + ] + return new_lrs + + +class CosineAnnealing(WarmupAnnealHoldPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=0, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + for initial_lr in self.base_lrs: + if initial_lr < self.min_lr: + raise ValueError( + f"{self} received an initial learning rate " + f"that was lower than the minimum learning rate.") + + if self.constant_steps is None or self.constant_steps == 0: + new_lrs = [ + _cosine_annealing( + initial_lr=initial_lr, + step=step - self.warmup_steps, + max_steps=self.max_steps - self.warmup_steps, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + else: + new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) + return new_lrs + + def _get_warmup_lr(self, step): + if self.constant_steps is None or self.constant_steps == 0: + return super()._get_warmup_lr(step) + else: + # Use linear warmup for the initial part. + return self._get_linear_warmup_with_cosine_annealing_lr(step) + + def _get_constant_lr(self, step): + # Only called when `constant_steps` > 0. + return self._get_linear_warmup_with_cosine_annealing_lr(step) + + def _get_linear_warmup_with_cosine_annealing_lr(self, step): + # Cosine Schedule for Megatron LM, + # slightly different warmup schedule + constant LR at the end. + new_lrs = [ + _linear_warmup_with_cosine_annealing( + max_lr=self.base_lrs[0], + warmup_steps=self.warmup_steps, + step=step, + decay_steps=self.decay_steps, + min_lr=self.min_lr, + ) for _ in self.base_lrs + ] + return new_lrs + + +class NoamAnnealing(_LRScheduler): + + def __init__(self, + optimizer, + *, + d_model, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + self._normalize = d_model**(-0.5) + assert not (warmup_steps is not None + and warmup_ratio is not None), \ + "Either use particular number of step or ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = max(1, self.last_epoch) + + for initial_lr in self.base_lrs: + if initial_lr < self.min_lr: + raise ValueError( + f"{self} received an initial learning rate " + f"that was lower than the minimum learning rate.") + + new_lrs = [ + self._noam_annealing(initial_lr=initial_lr, step=step) + for initial_lr in self.base_lrs + ] + return new_lrs + + def _noam_annealing(self, initial_lr, step): + if self.warmup_steps > 0: + mult = self._normalize * min(step**(-0.5), + step * (self.warmup_steps**(-1.5))) + else: + mult = self._normalize * step**(-0.5) + + out_lr = initial_lr * mult + if step > self.warmup_steps: + out_lr = max(out_lr, self.min_lr) + return out_lr + + +class NoamHoldAnnealing(WarmupHoldPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + decay_rate=0.5, + min_lr=0.0, + last_epoch=-1, + **kwargs): + """ + From Nemo: + Implementation of the Noam Hold Annealing policy + from the SqueezeFormer paper. + + Unlike NoamAnnealing, the peak learning rate + can be explicitly set for this scheduler. + The schedule first performs linear warmup, + then holds the peak LR, then decays with some schedule for + the remainder of the steps. + Therefore the min-lr is still dependent + on the hyper parameters selected. + + It's schedule is determined by three factors- + + Warmup Steps: Initial stage, where linear warmup + occurs uptil the peak LR is reached. Unlike NoamAnnealing, + the peak LR is explicitly stated here instead of a scaling factor. + + Hold Steps: Intermediate stage, where the peak LR + is maintained for some number of steps. In this region, + the high peak LR allows the model to converge faster + if training is stable. However the high LR + may also cause instability during training. + Should usually be a significant fraction of training + steps (around 30-40% of the entire training steps). + + Decay Steps: Final stage, where the LR rapidly decays + with some scaling rate (set by decay rate). + To attain Noam decay, use 0.5, + for Squeezeformer recommended decay, use 1.0. + The fast decay after prolonged high LR during + hold phase allows for rapid convergence. + + References: + - [Squeezeformer: + An Efficient Transformer for Automatic Speech Recognition] + (https://arxiv.org/abs/2206.00888) + + Args: + optimizer: Pytorch compatible Optimizer object. + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + hold_steps: Number of training steps to + hold the learning rate after warm up + hold_ratio: Ratio of hold steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + decay_rate: Float value describing the polynomial decay + after the hold period. Default value + of 0.5 corresponds to Noam decay. + min_lr: Minimum learning rate. + """ + self.decay_rate = decay_rate + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + if self.warmup_steps is None or self.warmup_steps == 0: + raise ValueError( + "Noam scheduler cannot be used without warmup steps") + + if self.hold_steps > 0: + hold_steps = self.hold_steps - self.warmup_steps + else: + hold_steps = 0 + + new_lrs = [ + _noam_hold_annealing( + initial_lr, + step=step, + warmup_steps=self.warmup_steps, + hold_steps=hold_steps, + decay_rate=self.decay_rate, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + return new_lrs + + def set_step(self, step: int): + self.last_epoch = step + + +class ConstantLR(_LRScheduler): + """The ConstantLR scheduler + + This scheduler keeps a constant lr + + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + ): + # __init__() must be invoked before setting field + # because step() is also invoked in __init__() + super().__init__(optimizer) + + def get_lr(self): + return self.base_lrs + + def set_step(self, step: int): + self.last_epoch = step diff --git a/cosyvoice/utils/train_utils.py b/cosyvoice/utils/train_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..020005d001e1c386c84398981dfcc039b41fa89b --- /dev/null +++ b/cosyvoice/utils/train_utils.py @@ -0,0 +1,289 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# 2023 Horizon Inc. (authors: Xingchen Song) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from contextlib import nullcontext +import logging +import os +import torch +import json +import re +import datetime +import yaml + +# import deepspeed +import torch.optim as optim +import torch.distributed as dist + +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader +from torch.nn.utils import clip_grad_norm_ + +# from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live + +from cosyvoice.dataset.dataset import Dataset +from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR + + +def init_distributed(args): + world_size = int(os.environ.get('WORLD_SIZE', 1)) + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + rank = int(os.environ.get('RANK', 0)) + logging.info('training on multiple gpus, this gpu {}'.format(local_rank) + + ', rank {}, world_size {}'.format(rank, world_size)) + if args.train_engine == 'torch_ddp': + torch.cuda.set_device(local_rank) + dist.init_process_group(args.dist_backend) + else: + deepspeed.init_distributed(dist_backend=args.dist_backend) + return world_size, local_rank, rank + + +def init_dataset_and_dataloader(args, configs): + train_dataset = Dataset(args.train_data, data_pipeline=configs['data_pipeline'], mode='train', shuffle=True, partition=True) + cv_dataset = Dataset(args.cv_data, data_pipeline=configs['data_pipeline'], mode='train', shuffle=False, partition=False) + + # do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts + train_data_loader = DataLoader(train_dataset, + batch_size=None, + pin_memory=args.pin_memory, + num_workers=args.num_workers, + prefetch_factor=args.prefetch) + cv_data_loader = DataLoader(cv_dataset, + batch_size=None, + pin_memory=args.pin_memory, + num_workers=args.num_workers, + prefetch_factor=args.prefetch) + return train_dataset, cv_dataset, train_data_loader, cv_data_loader + + + +def check_modify_and_save_config(args, configs): + if args.train_engine == "torch_ddp": + configs['train_conf']["dtype"] = 'fp32' + else: + with open(args.deepspeed_config, 'r') as fin: + ds_configs = json.load(fin) + if "fp16" in ds_configs and ds_configs["fp16"]["enabled"]: + configs['train_conf']["dtype"] = "fp16" + elif "bf16" in ds_configs and ds_configs["bf16"]["enabled"]: + configs['train_conf']["dtype"] = "bf16" + else: + configs['train_conf']["dtype"] = "fp32" + assert ds_configs["train_micro_batch_size_per_gpu"] == 1 + # if use deepspeed, override ddp config + configs['train_conf']['save_per_step'] = int(configs['train_conf']['save_per_step'] * configs['train_conf']['accum_grad'] / ds_configs["gradient_accumulation_steps"]) + configs['train_conf']['accum_grad'] = ds_configs["gradient_accumulation_steps"] + configs['train_conf']['grad_clip'] = ds_configs["gradient_clipping"] + configs['train_conf']['log_interval'] = ds_configs["steps_per_print"] + return configs + + +def wrap_cuda_model(args, model): + local_world_size = int(os.environ.get('LOCAL_WORLD_SIZE', 1)) + world_size = int(os.environ.get('WORLD_SIZE', 1)) + if args.train_engine == "torch_ddp": # native pytorch ddp + assert (torch.cuda.is_available()) + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) + else: + if int(os.environ.get('RANK', 0)) == 0: + logging.info("Estimating model states memory needs (zero2)...") + estimate_zero2_model_states_mem_needs_all_live( + model, + num_gpus_per_node=local_world_size, + num_nodes=world_size // local_world_size) + return model + + +def init_optimizer_and_scheduler(args, configs, model): + if configs['train_conf']['optim'] == 'adam': + optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf']) + elif configs['train_conf']['optim'] == 'adamw': + optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf']) + else: + raise ValueError("unknown optimizer: " + configs['train_conf']) + + if configs['train_conf']['scheduler'] == 'warmuplr': + scheduler_type = WarmupLR + scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf']) + elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing': + scheduler_type = NoamHoldAnnealing + scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf']) + elif configs['train_conf']['scheduler'] == 'constantlr': + scheduler_type = ConstantLR + scheduler = ConstantLR(optimizer) + else: + raise ValueError("unknown scheduler: " + configs['train_conf']) + + # use deepspeed optimizer for speedup + if args.train_engine == "deepspeed": + def scheduler(opt): + return scheduler_type(opt, **configs['train_conf']['scheduler_conf']) + model, optimizer, _, scheduler = deepspeed.initialize( + args=args, + model=model, + optimizer=None, + lr_scheduler=scheduler, + model_parameters=model.parameters()) + + return model, optimizer, scheduler + + +def init_summarywriter(args): + writer = None + if int(os.environ.get('RANK', 0)) == 0: + os.makedirs(args.model_dir, exist_ok=True) + writer = SummaryWriter(args.tensorboard_dir) + return writer + + +def save_model(model, model_name, info_dict): + rank = int(os.environ.get('RANK', 0)) + model_dir = info_dict["model_dir"] + save_model_path = os.path.join(model_dir, '{}.pt'.format(model_name)) + + if info_dict["train_engine"] == "torch_ddp": + if rank == 0: + torch.save(model.module.state_dict(), save_model_path) + else: + with torch.no_grad(): + model.save_checkpoint(save_dir=model_dir, + tag=model_name, + client_state=info_dict) + if rank == 0: + info_path = re.sub('.pt$', '.yaml', save_model_path) + info_dict['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') + with open(info_path, 'w') as fout: + data = yaml.dump(info_dict) + fout.write(data) + logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format(rank, save_model_path)) + + +def cosyvoice_join(group_join, info_dict): + world_size = int(os.environ.get('WORLD_SIZE', 1)) + local_rank = int(os.environ.get('LOCAL_RANK', 0)) + rank = int(os.environ.get('RANK', 0)) + + if info_dict["batch_idx"] != 0: + # we try to join all rank in both ddp and deepspeed mode, in case different rank has different lr + try: + dist.monitored_barrier(group=group_join, + timeout=group_join.options._timeout) + return False + except RuntimeError as e: + logging.info("Detected uneven workload distribution: {}\n".format(e) + + "Break current worker to manually join all workers, " + + "world_size {}, current rank {}, current local_rank {}\n". + format(world_size, rank, local_rank)) + return True + else: + return False + + +def batch_forward(model, batch, info_dict): + device = int(os.environ.get('LOCAL_RANK', 0)) + + dtype = info_dict["dtype"] + if dtype == "fp16": + dtype = torch.float16 + elif dtype == "bf16": + dtype = torch.bfloat16 + else: # fp32 + dtype = torch.float32 + + if info_dict['train_engine'] == 'torch_ddp': + autocast = nullcontext() + else: + autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False) + + with autocast: + info_dict['loss_dict'] = model(batch, device) + return info_dict + + +def batch_backward(model, info_dict): + if info_dict["train_engine"] == "deepspeed": + scaled_loss = model.backward(info_dict['loss_dict']['loss']) + else: + scaled_loss = info_dict['loss_dict']['loss'] / info_dict['accum_grad'] + scaled_loss.backward() + + info_dict['loss_dict']['loss'] = scaled_loss + return info_dict + + +def update_parameter_and_lr(model, optimizer, scheduler, info_dict): + grad_norm = 0.0 + if info_dict['train_engine'] == "deepspeed": + info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary() + model.step() + grad_norm = model.get_global_grad_norm() + elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0: + grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip']) + if torch.isfinite(grad_norm): + optimizer.step() + optimizer.zero_grad() + scheduler.step() + info_dict["lr"] = optimizer.param_groups[0]['lr'] + info_dict["grad_norm"] = grad_norm + return info_dict + + +def log_per_step(writer, info_dict): + tag = info_dict["tag"] + epoch = info_dict.get('epoch', 0) + step = info_dict["step"] + batch_idx = info_dict["batch_idx"] + loss_dict = info_dict['loss_dict'] + rank = int(os.environ.get('RANK', 0)) + + # only rank 0 write to tensorboard to avoid multi-process write + if writer is not None: + if (info_dict['train_engine'] == 'deepspeed' and info_dict['is_gradient_accumulation_boundary'] is True) or \ + (info_dict['train_engine'] == 'torch_ddp' and (info_dict['batch_idx'] + 1) % info_dict['accum_grad'] == 0): + for k in ['epoch', 'lr', 'grad_norm']: + writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1) + for k, v in loss_dict.items(): + writer.add_scalar('{}/{}'.format(tag, k), v, step + 1) + + # TRAIN & CV, Shell log (stdout) + if (info_dict['batch_idx'] + 1) % info_dict['log_interval'] == 0: + log_str = '{} Batch {}/{} '.format(tag, epoch, batch_idx + 1) + for name, value in loss_dict.items(): + log_str += '{} {:.6f} '.format(name, value) + if tag == "TRAIN": + log_str += 'lr {:.8f} grad_norm {:.6f}'.format( + info_dict["lr"], info_dict['grad_norm']) + log_str += ' rank {}'.format(rank) + logging.debug(log_str) + + +def log_per_save(writer, info_dict): + tag = info_dict["tag"] + epoch = info_dict["epoch"] + step = info_dict["step"] + loss_dict = info_dict["loss_dict"] + lr = info_dict['lr'] + rank = int(os.environ.get('RANK', 0)) + logging.info( + 'Epoch {} Step {} CV info lr {} {} rank {}'.format( + epoch, step + 1, lr, rank, ' '.join(['{}_{}'.format(k, v) for k, v in loss_dict.items()]))) + + if writer is not None: + for k in ['epoch', 'lr']: + writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1) + for k, v in loss_dict.items(): + writer.add_scalar('{}/{}'.format(tag, k), v, step + 1) diff --git a/flow_inference.py b/flow_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..ebee05a936c6b6ce76f5ec9a6d6e26ecb847bcd9 --- /dev/null +++ b/flow_inference.py @@ -0,0 +1,142 @@ +import torch +import torchaudio +import numpy as np +import re +from hyperpyyaml import load_hyperpyyaml +import uuid +from collections import defaultdict + + +def fade_in_out(fade_in_mel, fade_out_mel, window): + device = fade_in_mel.device + fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu() + mel_overlap_len = int(window.shape[0] / 2) + fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \ + fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:] + return fade_in_mel.to(device) + + +class AudioDecoder: + def __init__(self, config_path, flow_ckpt_path, hift_ckpt_path, device="cuda"): + self.device = device + + with open(config_path, 'r') as f: + self.scratch_configs = load_hyperpyyaml(f) + + # Load models + self.flow = self.scratch_configs['flow'] + self.flow.load_state_dict(torch.load(flow_ckpt_path, map_location=self.device)) + self.hift = self.scratch_configs['hift'] + self.hift.load_state_dict(torch.load(hift_ckpt_path, map_location=self.device)) + + # Move models to the appropriate device + self.flow.to(self.device) + self.hift.to(self.device) + self.mel_overlap_dict = defaultdict(lambda: None) + self.hift_cache_dict = defaultdict(lambda: None) + self.token_min_hop_len = 2 * self.flow.input_frame_rate + self.token_max_hop_len = 4 * self.flow.input_frame_rate + self.token_overlap_len = 5 + self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256) + self.mel_window = np.hamming(2 * self.mel_overlap_len) + # hift cache + self.mel_cache_len = 1 + self.source_cache_len = int(self.mel_cache_len * 256) + # speech fade in out + self.speech_window = np.hamming(2 * self.source_cache_len) + + def token2wav(self, token, uuid, prompt_token=torch.zeros(1, 0, dtype=torch.int32), + prompt_feat=torch.zeros(1, 0, 80), embedding=torch.zeros(1, 192), finalize=False): + tts_mel = self.flow.inference(token=token.to(self.device), + token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), + prompt_token=prompt_token.to(self.device), + prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to( + self.device), + prompt_feat=prompt_feat.to(self.device), + prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to( + self.device), + embedding=embedding.to(self.device)) + + # mel overlap fade in out + if self.mel_overlap_dict[uuid] is not None: + tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window) + # append hift cache + if self.hift_cache_dict[uuid] is not None: + hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source'] + tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2) + + else: + hift_cache_source = torch.zeros(1, 1, 0) + # _tts_mel=tts_mel.contiguous() + # keep overlap mel and hift cache + if finalize is False: + self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:] + tts_mel = tts_mel[:, :, :-self.mel_overlap_len] + tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source) + + self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:], + 'source': tts_source[:, :, -self.source_cache_len:], + 'speech': tts_speech[:, -self.source_cache_len:]} + # if self.hift_cache_dict[uuid] is not None: + # tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + tts_speech = tts_speech[:, :-self.source_cache_len] + + else: + tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source) + del self.hift_cache_dict[uuid] + del self.mel_overlap_dict[uuid] + # if uuid in self.hift_cache_dict.keys() and self.hift_cache_dict[uuid] is not None: + # tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window) + return tts_speech, tts_mel + + def offline_inference(self, token): + this_uuid = str(uuid.uuid1()) + tts_speech, tts_mel = self.token2wav(token, uuid=this_uuid, finalize=True) + return tts_speech.cpu() + + def stream_inference(self, token): + token.to(self.device) + this_uuid = str(uuid.uuid1()) + + # Prepare other necessary input tensors + llm_embedding = torch.zeros(1, 192).to(self.device) + prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device) + flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device) + + tts_speechs = [] + tts_mels = [] + + block_size = self.flow.encoder.block_size + prev_mel = None + + for idx in range(0, token.size(1), block_size): + # if idx>block_size: break + tts_token = token[:, idx:idx + block_size] + + print(tts_token.size()) + + if prev_mel is not None: + prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2) + flow_prompt_speech_token = token[:, :idx] + + if idx + block_size >= token.size(-1): + is_finalize = True + else: + is_finalize = False + + tts_speech, tts_mel = self.token2wav(tts_token, uuid=this_uuid, + prompt_token=flow_prompt_speech_token.to(self.device), + prompt_feat=prompt_speech_feat.to(self.device), finalize=is_finalize) + + prev_mel = tts_mel + prev_speech = tts_speech + print(tts_mel.size()) + + tts_speechs.append(tts_speech) + tts_mels.append(tts_mel) + + # Convert Mel spectrogram to audio using HiFi-GAN + tts_speech = torch.cat(tts_speechs, dim=-1).cpu() + + return tts_speech.cpu() + diff --git a/glm-4-voice-decoder/.gitattributes b/glm-4-voice-decoder/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/glm-4-voice-decoder/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/glm-4-voice-decoder/LICENSE b/glm-4-voice-decoder/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a6bb941bd9d7c6f6a9ca745499971af703de33dd --- /dev/null +++ b/glm-4-voice-decoder/LICENSE @@ -0,0 +1,70 @@ +The glm-4-voice License + +1. 定义 + +“许可方”是指分发其软件的 glm-4-voice 模型团队。 +“软件”是指根据本许可提供的 glm-4-voice 模型参数。 + +2. 许可授予 + +根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。 +本许可允许您免费使用本仓库中的所有开源模型进行学术研究,对于希望将模型用于商业目的的用户,需在[这里](https://open.bigmodel.cn/mla/form)完成登记。经过登记的用户可以免费使用本模型进行商业活动,但必须遵守本许可的所有条款和条件。 +上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。 +如果您分发或提供 THUDM / 智谱AI 关于 glm-4 开源模型的材料(或其任何衍生作品),或使用其中任何材料(包括 glm-4 系列的所有开源模型)的产品或服务,您应: + +(A) 随任何此类 THUDM / 智谱AI 材料提供本协议的副本; +(B) 在相关网站、用户界面、博客文章、关于页面或产品文档上突出显示 “Built with glm-4”。 +如果您使用 THUDM / 智谱AI的 glm-4 开源模型的材料来创建、训练、微调或以其他方式改进已分发或可用的 AI 模型,您还应在任何此类 AI 模型名称的开头添加 “glm-4”。 + +3. 限制 + +您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。 +您不得利用本软件从事任何危害国家安全和国家统一,危害社会公共利益及公序良俗,侵犯他人商业秘密、知识产权、名誉权、肖像权、财产权等权益的行为。 +您在使用中应遵循使用地所适用的法律法规政策、道德规范等要求。 + +4. 免责声明 + +本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 +在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关 +软件。 + +5. 责任限制 + +除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 +或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。 + +6. 争议解决 + +本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。 +请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 license@zhipuai.cn 与我们联系。 +1. Definitions +“Licensor” means the glm-4-voice Model Team that distributes its Software. +“Software” means the glm-4-voice model parameters made available under this license. +2. License +Under the terms and conditions of this license, the Licensor hereby grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license. +This license allows you to use all open source models in this repository for free for academic research. For users who wish to use the models for commercial purposes, please do so [here](https://open.bigmodel.cn/mla/form) +Complete registration. Registered users are free to use this model for commercial activities, but must comply with all terms and conditions of this license. +The copyright notice and this license notice shall be included in all copies or substantial portions of the Software. +If you distribute or provide THUDM / Zhipu AI materials on the glm-4 open source model (or any derivative works thereof), or products or services that use any materials therein (including all open source models of the glm-4 series), you should: +(A) Provide a copy of this Agreement with any such THUDM/Zhipu AI Materials; +(B) Prominently display "Built with glm-4" on the relevant website, user interface, blog post, related page or product documentation. +If you use materials from THUDM/Zhipu AI's glm-4 model to create, train, operate, or otherwise improve assigned or available AI models, you should also add "glm-4" to the beginning of any such AI model name. +3. Restrictions +You are not allowed to use, copy, modify, merge, publish, distribute, copy or create all or part of the derivative works of this software for any military or illegal purposes. +You are not allowed to use this software to engage in any behavior that endangers national security and unity, endangers social public interests and public order, infringes on the rights and interests of others such as trade secrets, intellectual property rights, reputation rights, portrait rights, and property rights. +You should comply with the applicable laws, regulations, policies, ethical standards, and other requirements in the place of use during use. +4. Disclaimer +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +5. Limitation of Liability +EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, +NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, +INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED +OF THE POSSIBILITY OF SUCH DAMAGES. +6. Dispute Resolution +This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute +arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. +Note that the license is subject to update to a more comprehensive version. For any questions related to the license and +copyright, please contact us at license@zhipuai.cn. \ No newline at end of file diff --git a/glm-4-voice-decoder/README.md b/glm-4-voice-decoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5ff4108008892255b0414a095df371597e4e907c --- /dev/null +++ b/glm-4-voice-decoder/README.md @@ -0,0 +1,13 @@ +# GLM-4-Voice-Decoder + +GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够根据用户的指令改变语音的情感、语调、语速、方言等属性。 + +GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions. + +本仓库是 GLM-4-Voice 的 speech decoder 部分。GLM-4-Voice-Decoder 是基于 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 重新训练的支持流式推理的语音解码器,将离散化的语音 token 转化为连续的语音输出。最少只需要 10 个音频 token 即可开始生成,降低对话延迟。 + +The repo provides the speech decoder of GLM-4-Voice. GLM-4-Voice-Decoder is a speech decoder supporting streaming inference, retrained based on [CosyVoice](https://github.com/FunAudioLLM/CosyVoice), converting discrete speech tokens into continuous speech output. Generation can start with as few as 10 audio tokens, reducing conversation latency. + +更多信息请参考我们的仓库 [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice). + +For more information please refer to our repo [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice). \ No newline at end of file diff --git a/glm-4-voice-decoder/config.yaml b/glm-4-voice-decoder/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cd2469f443087972d12104c5d395dc03698e9a4 --- /dev/null +++ b/glm-4-voice-decoder/config.yaml @@ -0,0 +1,130 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 22050 +text_encoder_input_size: 512 +llm_input_size: 1024 +llm_output_size: 1024 +spk_embed_dim: 192 + +# model params +# for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this. +llm: !new:cosyvoice.llm.llm.TransformerLM + text_encoder_input_size: !ref + llm_input_size: !ref + llm_output_size: !ref + text_token_size: 51866 + speech_token_size: 4096 + length_normalized_loss: True + lsm_weight: 0 + spk_embed_dim: !ref + text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder + input_size: !ref + output_size: 1024 + attention_heads: 8 + linear_units: 2048 + num_blocks: 3 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + use_cnn_module: False + macaron_style: False + use_dynamic_chunk: False + use_dynamic_left_chunk: False + static_chunk_size: 1 + llm: !new:cosyvoice.transformer.encoder.TransformerEncoder + input_size: !ref + output_size: !ref + attention_heads: 8 + linear_units: 2048 + num_blocks: 7 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0 + input_layer: 'linear_legacy' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + static_chunk_size: 1 + +flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref + output_type: 'mel' + vocab_size: 16384 + input_frame_rate: 12.5 + only_mask_loss: True + encoder: !new:cosyvoice.transformer.encoder.BlockConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'block_rel_selfattn' + block_size: 10 + input_size: 512 + use_cnn_module: False + macaron_style: False + length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator + channels: 80 + sampling_ratios: [1, 1, 1, 1] + decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: !new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + in_channels: 320 + out_channels: 80 + channels: [256, 256] + dropout: 0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 8] + upsample_kernel_sizes: [16, 16] + istft_params: + n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 diff --git a/glm-4-voice-decoder/flow.pt b/glm-4-voice-decoder/flow.pt new file mode 100644 index 0000000000000000000000000000000000000000..69184d728b0f7400441c306dca763c039c190724 --- /dev/null +++ b/glm-4-voice-decoder/flow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450f52ba9cbea63757914e72b42ac5af21509a2767de0b18b4694546fd1d30ed +size 445171500 diff --git a/glm-4-voice-decoder/hift.pt b/glm-4-voice-decoder/hift.pt new file mode 100644 index 0000000000000000000000000000000000000000..422ac4fe5753f0a5fa0e0f6c7281a53893ca0d98 --- /dev/null +++ b/glm-4-voice-decoder/hift.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012 +size 81896716 diff --git a/model_server.py b/model_server.py new file mode 100644 index 0000000000000000000000000000000000000000..0f3f233c9a9c58de75dc1288ca62c03c8b301eaa --- /dev/null +++ b/model_server.py @@ -0,0 +1,116 @@ +""" +A model worker executes the model. +""" +import argparse +import json +import uuid + +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse +from transformers import AutoModel, AutoTokenizer +import torch +import uvicorn + +from transformers.generation.streamers import BaseStreamer +from threading import Thread +from queue import Queue + + +class TokenStreamer(BaseStreamer): + def __init__(self, skip_prompt: bool = False, timeout=None): + self.skip_prompt = skip_prompt + + # variables used in the streaming process + self.token_queue = Queue() + self.stop_signal = None + self.next_tokens_are_prompt = True + self.timeout = timeout + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("TextStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if self.skip_prompt and self.next_tokens_are_prompt: + self.next_tokens_are_prompt = False + return + + for token in value.tolist(): + self.token_queue.put(token) + + def end(self): + self.token_queue.put(self.stop_signal) + + def __iter__(self): + return self + + def __next__(self): + value = self.token_queue.get(timeout=self.timeout) + if value == self.stop_signal: + raise StopIteration() + else: + return value + + +class ModelWorker: + def __init__(self, model_path, device='cuda'): + self.device = device + self.glm_model = AutoModel.from_pretrained(model_path, trust_remote_code=True, + device=device).to(device).eval() + self.glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + @torch.inference_mode() + def generate_stream(self, params): + tokenizer, model = self.glm_tokenizer, self.glm_model + + prompt = params["prompt"] + + temperature = float(params.get("temperature", 1.0)) + top_p = float(params.get("top_p", 1.0)) + max_new_tokens = int(params.get("max_new_tokens", 256)) + + inputs = tokenizer([prompt], return_tensors="pt") + inputs = inputs.to(self.device) + streamer = TokenStreamer(skip_prompt=True) + thread = Thread(target=model.generate, + kwargs=dict(**inputs, max_new_tokens=int(max_new_tokens), + temperature=float(temperature), top_p=float(top_p), + streamer=streamer)) + thread.start() + for token_id in streamer: + yield (json.dumps({"token_id": token_id, "error_code": 0}) + "\n").encode() + + def generate_stream_gate(self, params): + try: + for x in self.generate_stream(params): + yield x + except Exception as e: + print("Caught Unknown Error", e) + ret = { + "text": "Server Error", + "error_code": 1, + } + yield (json.dumps(ret)+ "\n").encode() + + +app = FastAPI() + + +@app.post("/generate_stream") +async def generate_stream(request: Request): + params = await request.json() + + generator = worker.generate_stream_gate(params) + return StreamingResponse(generator) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=10000) + parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b") + args = parser.parse_args() + + worker = ModelWorker(args.model_path) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..12e69ee84794c421b1941139f18456798d953fb0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,36 @@ +conformer==0.3.2 +deepspeed==0.14.2; sys_platform == 'linux' +diffusers==0.27.2 +fastapi==0.115.3 +fastapi-cli==0.0.4 +gdown==5.1.0 +gradio==5.3.0 +grpcio==1.57.0 +grpcio-tools==1.57.0 +huggingface_hub==0.25.2 +hydra-core==1.3.2 +HyperPyYAML==1.2.2 +inflect==7.3.1 +librosa==0.10.2 +lightning==2.2.4 +matplotlib==3.7.5 +modelscope==1.15.0 +networkx==3.1 +numpy==1.24.4 +omegaconf==2.3.0 +onnxruntime-gpu==1.16.0; sys_platform == 'linux' +onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'windows' +openai-whisper==20231117 +protobuf==4.25 +pydantic==2.7.0 +rich==13.7.1 +Requests==2.32.3 +safetensors==0.4.5 +soundfile==0.12.1 +tensorboard==2.14.0 +transformers==4.44.1 +uvicorn==0.32.0 +wget==3.2 +WeTextProcessing==1.0.3 +torch==2.3.0 +torchaudio==2.3.0 diff --git a/resources/architecture.jpeg b/resources/architecture.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b0499b816cb0005260636b3f4343ecbaea5388a9 Binary files /dev/null and b/resources/architecture.jpeg differ diff --git a/resources/web_demo.png b/resources/web_demo.png new file mode 100644 index 0000000000000000000000000000000000000000..af78198d6a7d5356344868b4c36ad9d6037554d4 Binary files /dev/null and b/resources/web_demo.png differ diff --git a/speech_tokenizer/__init__.py b/speech_tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speech_tokenizer/__pycache__/__init__.cpython-310.pyc b/speech_tokenizer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6861ffc2ebdb09310fff758f41faf682b71feeef Binary files /dev/null and b/speech_tokenizer/__pycache__/__init__.cpython-310.pyc differ diff --git a/speech_tokenizer/__pycache__/configuration_whisper.cpython-310.pyc b/speech_tokenizer/__pycache__/configuration_whisper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d572ef36abcee51edd88124fb4e2405360a0271 Binary files /dev/null and b/speech_tokenizer/__pycache__/configuration_whisper.cpython-310.pyc differ diff --git a/speech_tokenizer/__pycache__/generation_whisper.cpython-310.pyc b/speech_tokenizer/__pycache__/generation_whisper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aabd495ee6bbb5d393b80a7a539a1bda7327ed9b Binary files /dev/null and b/speech_tokenizer/__pycache__/generation_whisper.cpython-310.pyc differ diff --git a/speech_tokenizer/__pycache__/modeling_whisper.cpython-310.pyc b/speech_tokenizer/__pycache__/modeling_whisper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9650a08e31f38d3753bb6016a3d9a482a625dbc Binary files /dev/null and b/speech_tokenizer/__pycache__/modeling_whisper.cpython-310.pyc differ diff --git a/speech_tokenizer/__pycache__/utils.cpython-310.pyc b/speech_tokenizer/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d832b5d41537bfdbcdbeb2478daa5efa70bf56e Binary files /dev/null and b/speech_tokenizer/__pycache__/utils.cpython-310.pyc differ diff --git a/speech_tokenizer/configuration_whisper.py b/speech_tokenizer/configuration_whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..5ee76eeae2921fa2c8665a926f957e238807c32e --- /dev/null +++ b/speech_tokenizer/configuration_whisper.py @@ -0,0 +1,37 @@ +from transformers import WhisperConfig + + +class WhisperVQConfig(WhisperConfig): + def __init__(self, + pooling_kernel_size=None, + pooling_type="max", + pooling_position=0, + quantize_vocab_size=None, + quantize_position=16, + quantize_commit_coefficient=0.25, + quantize_loss_scale=1.0, + quantize_ema_decay=None, + quantize_restart_interval=None, + quantize_encoder_only=False, + quantize_causal_encoder=False, + quantize_causal_block_size=None, + skip_language_detection=False, + encoder_causal_attention=False, + encoder_causal_convolution=False, + **kwargs): + self.pooling_kernel_size = pooling_kernel_size + self.pooling_type = pooling_type + self.pooling_position = pooling_position + self.quantize_vocab_size = quantize_vocab_size + self.quantize_position = quantize_position + self.quantize_commit_coefficient = quantize_commit_coefficient + self.quantize_loss_scale = quantize_loss_scale + self.quantize_ema_decay = quantize_ema_decay + self.quantize_restart_interval = quantize_restart_interval + self.quantize_encoder_only = quantize_encoder_only + self.quantize_causal_encoder = quantize_causal_encoder + self.quantize_causal_block_size = quantize_causal_block_size + self.skip_language_detection = skip_language_detection + self.encoder_causal_attention = encoder_causal_attention + self.encoder_causal_convolution = encoder_causal_convolution + super().__init__(**kwargs) diff --git a/speech_tokenizer/generation_whisper.py b/speech_tokenizer/generation_whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..7141ba72f7d7e7928318378f06d675607692f267 --- /dev/null +++ b/speech_tokenizer/generation_whisper.py @@ -0,0 +1,1828 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import math +import warnings +import zlib +from typing import Callable, Iterator, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.cache_utils import EncoderDecoderCache + +from transformers.generation.configuration_utils import GenerationConfig +from transformers.generation.logits_process import ( + LogitsProcessorList, + SuppressTokensAtBeginLogitsProcessor, + SuppressTokensLogitsProcessor, + WhisperNoSpeechDetection, + WhisperTimeStampLogitsProcessor, +) +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.modeling_outputs import BaseModelOutput +from transformers.utils import logging +from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE + + +logger = logging.get_logger(__name__) + + +def _median_filter(inputs: torch.Tensor, filter_width: int) -> torch.Tensor: + """ + Applies a median filter of width `filter_width` along the last dimension of the input. + + The `inputs` tensor is assumed to be 3- or 4-dimensional. + """ + if filter_width <= 0 or filter_width % 2 != 1: + raise ValueError("`filter_width` should be an odd number") + + pad_width = filter_width // 2 + if inputs.shape[-1] <= pad_width: + return inputs + + # Pad the left and right edges. + inputs = nn.functional.pad(inputs, (pad_width, pad_width, 0, 0), mode="reflect") + + # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450) + result = inputs.unfold(-1, filter_width, 1).sort()[0][..., pad_width] + return result + + +def _dynamic_time_warping(matrix: np.ndarray): + """ + Measures similarity between two temporal sequences: the input audio and the output tokens. Used to generate + token-level timestamps. + """ + output_length, input_length = matrix.shape + cost = np.ones((output_length + 1, input_length + 1), dtype=np.float32) * np.inf + trace = -np.ones((output_length + 1, input_length + 1), dtype=np.float32) + + cost[0, 0] = 0 + for j in range(1, input_length + 1): + for i in range(1, output_length + 1): + c0 = cost[i - 1, j - 1] + c1 = cost[i - 1, j] + c2 = cost[i, j - 1] + + if c0 < c1 and c0 < c2: + c, t = c0, 0 + elif c1 < c0 and c1 < c2: + c, t = c1, 1 + else: + c, t = c2, 2 + + cost[i, j] = matrix[i - 1, j - 1] + c + trace[i, j] = t + + # backtrace + i = trace.shape[0] - 1 + j = trace.shape[1] - 1 + trace[0, :] = 2 + trace[:, 0] = 1 + + text_indices = [] + time_indices = [] + while i > 0 or j > 0: + text_indices.append(i - 1) + time_indices.append(j - 1) + if trace[i, j] == 0: + i -= 1 + j -= 1 + elif trace[i, j] == 1: + i -= 1 + elif trace[i, j] == 2: + j -= 1 + else: + raise RuntimeError( + f"Internal error in dynamic time warping. Unexpected trace[{i}, {j}]. Please file a bug report." + ) + + text_indices = np.array(text_indices)[::-1] + time_indices = np.array(time_indices)[::-1] + return text_indices, time_indices + + +def _get_attr_from_logit_processors(logits_processor, logit_processor_class, attribute_name): + if logits_processor is not None: + logit_processor = next((cls for cls in logits_processor if isinstance(cls, logit_processor_class)), None) + if logit_processor: + return getattr(logit_processor, attribute_name, None) + return None + + +def _pad_to_max_length( + current_segments, + pad_token_id, + device, + padding_side="right", + padding="longest", + bos_token_tensor=None, + cut_off_length=None, +): + max_total_length = 0 + sequences = [] + + if padding_side not in ["right", "left"]: + raise ValueError(f"`padding_side` must be either 'right' or 'left', not {padding_side}") + + if padding not in ["longest", "max_length"]: + raise ValueError(f"`padding` must be either 'longest' or 'max_length', not {padding}") + elif padding == "max_length" and cut_off_length is None: + raise ValueError("`cut_off_length` must be specified when `padding='max_length'`") + + for current_segment_list in current_segments: + if current_segment_list is not None and len([d["tokens"] for d in current_segment_list]) > 0: + sequence = torch.cat([d["tokens"] for d in current_segment_list], dim=-1) + + if cut_off_length is not None: + sequence = sequence[-cut_off_length:] + + if bos_token_tensor is not None: + sequence = torch.cat([bos_token_tensor, sequence]) + + sequences.append(sequence) + max_total_length = max(max_total_length, len(sequences[-1])) + elif bos_token_tensor is not None: + sequences.append(bos_token_tensor) + else: + sequences.append(torch.tensor([], device=device)) + + max_total_length = cut_off_length + 1 if padding == "max_length" else max_total_length + for i in range(len(current_segments)): + pad_length = max_total_length - len(sequences[i]) + pad = (0, pad_length) if padding_side == "right" else (pad_length, 0) + sequences[i] = F.pad(sequences[i], pad=pad, value=pad_token_id) + + sequences = torch.stack(sequences, dim=0) + return sequences + + +class WhisperGenerationMixin: + def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None): + """ + Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to + map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder + cross-attentions will be cropped before applying DTW. + + Returns: + tensor containing the timestamps in seconds for each predicted token + """ + # Create a list with `decoder_layers` elements, each a tensor of shape + # (batch size, attention_heads, output length, input length). + cross_attentions = [] + for i in range(self.config.decoder_layers): + cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2)) + + # Select specific cross-attention layers and heads. This is a tensor + # of shape (batch size, num selected, output length, input length). + weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads]) + weights = weights.permute([1, 0, 2, 3]) + + weight_length = None + + if "beam_indices" in generate_outputs: + # If beam search has been used, the output sequences may have been generated for more timesteps than their sequence_lengths + # since the beam search strategy chooses the most probable sequences at the end of the search. + # In that case, the cross_attentions weights are too long and we have to make sure that they have the right output_length + weight_length = (generate_outputs.beam_indices != -1).sum(-1).max() + weights = weights[:, :, :weight_length] + + # If beam index is still -1, it means that the associated token id is EOS + # We need to replace the index with 0 since index_select gives an error if any of the indexes is -1. + beam_indices = generate_outputs.beam_indices[:, :weight_length] + beam_indices = beam_indices.masked_fill(beam_indices == -1, 0) + + # Select the cross attention from the right beam for each output sequences + weights = torch.stack( + [ + torch.index_select(weights[:, :, i, :], dim=0, index=beam_indices[:, i]) + for i in range(beam_indices.shape[1]) + ], + dim=2, + ) + + # make sure timestamps are as long as weights + input_length = weight_length or cross_attentions[0].shape[2] + timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)[:, : input_length + 1] + batch_size = timestamps.shape[0] + + if num_frames is not None: + # two cases: + # 1. num_frames is the same for each sample -> compute the DTW matrix for each sample in parallel + # 2. num_frames is different, compute the DTW matrix for each sample sequentially + + # we're using np.unique because num_frames can be int/list/tuple + if isinstance(num_frames, int): + weights = weights[..., : num_frames // 2] + + elif isinstance(num_frames, (list, tuple, np.ndarray)) and len(np.unique(num_frames)) == 1: + weights = weights[..., : num_frames[0] // 2] + + elif isinstance(num_frames, (torch.Tensor)) and len(torch.unique(num_frames)) == 1: + weights = weights[..., : num_frames[0] // 2] + + else: + # num_frames is of shape (batch_size,) whereas batch_size is truely batch_size*num_return_sequences + repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames) + num_frames = np.repeat(num_frames, repeat_time) + + if num_frames is None or isinstance(num_frames, int): + # Normalize and smoothen the weights. + std = torch.std(weights, dim=-2, keepdim=True, unbiased=False) + mean = torch.mean(weights, dim=-2, keepdim=True) + weights = (weights - mean) / std + weights = _median_filter(weights, self.config.median_filter_width) + + # Average the different cross-attention heads. + weights = weights.mean(dim=1) + + # Perform dynamic time warping on each element of the batch. + for batch_idx in range(batch_size): + if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray, torch.Tensor)): + matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2] + + # Normalize and smoothen the weights. + std = torch.std(matrix, dim=-2, keepdim=True, unbiased=False) + mean = torch.mean(matrix, dim=-2, keepdim=True) + matrix = (matrix - mean) / std + matrix = _median_filter(matrix, self.config.median_filter_width) + + # Average the different cross-attention heads. + matrix = matrix.mean(dim=0) + else: + matrix = weights[batch_idx] + + text_indices, time_indices = _dynamic_time_warping(-matrix.cpu().double().numpy()) + jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool) + jump_times = time_indices[jumps] * time_precision + timestamps[batch_idx, 1:] = torch.tensor(jump_times) + + return timestamps + + def generate( + self, + input_features: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: bool = False, + return_timestamps: Optional[bool] = None, + task: Optional[str] = None, + language: Optional[Union[str, List[str]]] = None, + is_multilingual: Optional[bool] = None, + prompt_ids: Optional[torch.Tensor] = None, + prompt_condition_type: Optional[str] = None, # first-segment, all-segments + condition_on_prev_tokens: Optional[bool] = None, + temperature: Optional[Union[float, Tuple[float, ...]]] = None, + compression_ratio_threshold: Optional[float] = None, + logprob_threshold: Optional[float] = None, + no_speech_threshold: Optional[float] = None, + num_segment_frames: Optional[int] = None, + attention_mask: Optional[torch.Tensor] = None, + time_precision: float = 0.02, + return_token_timestamps: Optional[bool] = None, + return_segments: bool = False, + return_dict_in_generate: Optional[bool] = None, + **kwargs, + ): + """ + Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Parameters: + input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): + Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://arxiv.org/abs/2010.00904). + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + return_timestamps (`bool`, *optional*): + Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`. + task (`str`, *optional*): + Task to use for generation, either "translate" or "transcribe". The `model.config.forced_decoder_ids` + will be updated accordingly. + language (`str` or list of `str`, *optional*): + Language token to use for generation, can be either in the form of `<|en|>`, `en` or `english`. For + batched generation, a list of language tokens can be passed. You can find all the possible language + tokens in the `model.generation_config.lang_to_id` dictionary. + is_multilingual (`bool`, *optional*): + Whether or not the model is multilingual. + prompt_ids (`torch.Tensor`, *optional*): + Rank-1 tensor of token IDs created by passing text to [`~WhisperProcessor.get_prompt_ids`] that is + provided as a prompt to each chunk. This can be used to provide or "prompt-engineer" a context for + transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those words + correctly. It cannot be used in conjunction with `decoder_start_token_id` as it overwrites this value. + prompt_condition_type (`str`, *optional*): + Only relevant for long-form transcription. Condition type of `prompt_ids`. 'first-segment' means only the first segment is conditioned on `prompt_ids`. 'all-segments' means each segment is conditioned on `prompt_ids`. Make sure to enable `condition_on_prev_tokens` for 'all-segments'. + Defaults to 'first-segment'. For short-term transcription only 'first-segment' is possible. + condition_on_prev_tokens (`bool`, *optional*): + Only relevant for long-form transcription. Whether to condition each segment on the previous segment. + As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve + performance. + temperature (`float` or list of `float`, *optional*): + The temperature to be used for generation. Passing a single `float` value and `do_sample=True` activates + generation using sampling. For long-form transcription, temperature fallback can be activated by passing + a list of float values such as (0.0, 0.2, 0.4, 0.6, 0.8, 1.0). As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve + performance. + compression_ratio_threshold (`float`, *optional*): + Only relevant for long-form transcription. If defined, the zlib compression rate of each segment will be computed. If the compression rate of + a segment is higher than `compression_ratio_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is + repeated using a higher temperature. The intuition behind this feature is that segments with very high compression rates + suffer from a lot of repetition. The unwanted repetition can be reduced by injecting more randomness by increasing the temperature. If `compression_ratio_threshold` is defined + make sure that `temperature` is a list of values. A common value for `compression_ratio_threshold` is 1.35. + As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve + performance. + logprob_threshold (`float`, *optional*): + Only relevant for long-form transcription. If defined, the average log-probability of each segment will be computed. If the log-probability of + a given segment is lower than `logprob_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is + repeated using a higher temperature. The intuition behind this feature is that segments of low log-probability + can be improved by injecting more randomness by increasing the temperature. If `logprob_threshold` is defined + make sure that `temperature` is a list of values. A common value for `logprob_threshold` is -1.0. + As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve + performance. + no_speech_threshold (`float`, *optional*): + Only relevant for long-form transcription. If defined, the "no-speech" token combined with the `logprob_threshold` + is used to determine whether a segment contains only silence. In this case, the transcription for this segment + is skipped. + As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve + performance. + num_segment_frames (`int`, *optional*): + The number of frames a single segment is made of. If not defined, `num_segment_frames` defaults to the model's stride + times the maximum input length. + attention_mask (`torch.Tensor`, *optional*): + `attention_mask` needs to be passed when doing long-form transcription using a batch size > 1. + time_precision (`int`, *optional*, defaults to 0.02): + The duration of output token in seconds. *E.g.* 0.02 means that a generated token on average accounts + for 20 ms. + return_token_timestamps (`bool`, *optional*): + Whether to return token-level timestamps with the text. This can be used with or without the + `return_timestamps` option. To get word-level timestamps, use the tokenizer to group the tokens into + words. + return_segments (`bool`, *optional*, defaults to `False`): + Whether to additionally return a list of all segments. Note that this option can only be enabled + when doing long-form transcription. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of just returning the generated tokens. + Note that when doing long-form transcription, `return_dict_in_generate` can only be enabled when + `return_segments` is set True. In this case the generation outputs of each segment is added to each + segment. + kwargs (`Dict[str, Any]`, *optional*): + Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder + specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. + + Return: + [`~utils.ModelOutput`] or `torch.LongTensor` or `Dict[str, Any]`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor` or a dict of segments when `return_segments=True`. + + If the passed input is > 30 seconds / > 3000 mel input features and `return_segments=True` then a dictionary of generated sequence ids, called `sequences` and a list of each generated segment is returned. + + else if the passed input is <= 30 seconds / >= 3000 mel input features, the possible [`~utils.ModelOutput`] types are: + + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] + + else only the generated output sequence ids are returned. + + Example: + + - *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate. + + ```python + >>> import torch + >>> from transformers import AutoProcessor, WhisperForConditionalGeneration + >>> from datasets import load_dataset, Audio + + >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") + >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + >>> model.cuda() # doctest: +IGNORE_RESULT + + >>> # load audios > 30 seconds + >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"] + >>> # resample to 16kHz + >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) + >>> # take first 8 audios and retrieve array + >>> audio = ds[:8]["audio"] + >>> audio = [x["array"] for x in audio] + + >>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio + >>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000) + >>> inputs = inputs.to("cuda", torch.float32) + + >>> # transcribe audio to ids + >>> generated_ids = model.generate(**inputs) + + >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True) + >>> transcription[0] + " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile." + ``` + + - *Shortform transcription*: If passed mel input features are < 30 seconds, the whole audio will be transcribed with a single call to generate. + + ```python + >>> import torch + >>> from transformers import AutoProcessor, WhisperForConditionalGeneration + >>> from datasets import load_dataset + + >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") + >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") + >>> input_features = inputs.input_features + + >>> generated_ids = model.generate(inputs=input_features) + + >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> transcription + ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' + ``` + + """ + # 0. deprecate old inputs + if "inputs" in kwargs: + input_features = kwargs.pop("inputs") + warnings.warn( + "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", + FutureWarning, + ) + + # 1. prepare generation config + generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) + + # 2. set global generate variables + input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] + num_segment_frames = input_stride * self.config.max_source_positions + batch_size, total_input_frames = self._retrieve_total_input_frames( + input_features=input_features, input_stride=input_stride, kwargs=kwargs + ) + is_shortform = total_input_frames <= num_segment_frames + + # 3. Make sure generation config is correctly set + # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not + return_dict_in_generate = self._set_return_outputs( + return_dict_in_generate=return_dict_in_generate, + return_token_timestamps=return_token_timestamps, + logprob_threshold=logprob_threshold, + generation_config=generation_config, + ) + timestamp_begin = self._set_return_timestamps( + return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config + ) + self._set_language_and_task( + language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config + ) + self._set_num_frames( + return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs + ) + self._set_thresholds_and_condition( + generation_config=generation_config, + logprob_threshold=logprob_threshold, + compression_ratio_threshold=compression_ratio_threshold, + no_speech_threshold=no_speech_threshold, + condition_on_prev_tokens=condition_on_prev_tokens, + ) + self._set_prompt_condition_type( + generation_config=generation_config, + prompt_condition_type=prompt_condition_type, + ) + + kwargs["attention_mask"] = attention_mask + # pass self.config for backward compatibility + init_tokens = self._retrieve_init_tokens( + input_features, + batch_size=batch_size, + generation_config=generation_config, + config=self.config, + num_segment_frames=num_segment_frames, + kwargs=kwargs, + ) + # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation + # where the input ids are handled explicitly by the generate method + self._check_decoder_input_ids(kwargs=kwargs) + + # 3. Retrieve logits processors + device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device + begin_index = init_tokens.shape[1] + logits_processor = self._retrieve_logit_processors( + generation_config=generation_config, + logits_processor=logits_processor, + begin_index=begin_index, # begin index is index of first generated decoder token + num_beams=kwargs.get("num_beams", 1), + device=device, + ) + + # 4 Set and retrieve global generation variables + self._set_condition_on_prev_tokens( + condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config + ) + + temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature + temperature = temperatures[0] + + max_frames, seek = self._retrieve_max_frames_and_seek( + batch_size=batch_size, + attention_mask=attention_mask, + total_input_frames=total_input_frames, + is_shortform=is_shortform, + ) + + # 5 Prepare running variables, list for generation + num_return_sequences = generation_config.num_return_sequences + ( + batch_idx_map, + cur_bsz, + input_features, + seek, + max_frames, + init_tokens, + do_condition_on_prev_tokens, + ) = self._expand_variables_for_generation( + input_features=input_features, + seek=seek, + max_frames=max_frames, + init_tokens=init_tokens, + batch_size=batch_size, + condition_on_prev_tokens=condition_on_prev_tokens, + generation_config=generation_config, + ) + + current_segments = self._prepare_segments( + prompt_ids=prompt_ids, + batch_size=cur_bsz, + generation_config=generation_config, + ) + + # 6 Transcribe audio until we reach the end of all input audios + while (seek < max_frames).any(): + # 6.1 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop + # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order + # to know which original audio is being decoded + # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk + input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch( + input_features=input_features, + seek=seek, + max_frames=max_frames, + cur_bsz=cur_bsz, + batch_idx_map=batch_idx_map, + ) + time_offset = seek * time_precision / input_stride + seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames) + + # 6.2 cut out next 30s segment from input features + segment_input = self._get_input_segment( + input_features=input_features, + seek=seek, + seek_num_frames=seek_num_frames, + num_segment_frames=num_segment_frames, + cur_bsz=cur_bsz, + batch_idx_map=batch_idx_map, + ) + + # 6.3 prepare decoder input ids + suppress_tokens = _get_attr_from_logit_processors( + logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens" + ) + + decoder_input_ids, kwargs = self._prepare_decoder_input_ids( + cur_bsz=cur_bsz, + init_tokens=init_tokens, + current_segments=current_segments, + batch_idx_map=batch_idx_map, + do_condition_on_prev_tokens=do_condition_on_prev_tokens, + prompt_ids=prompt_ids, + generation_config=generation_config, + config=self.config, + device=init_tokens.device, + suppress_tokens=suppress_tokens, + kwargs=kwargs, + ) + + # 6.4 set max new tokens or max length + self._set_max_new_tokens_and_length( + config=self.config, + decoder_input_ids=decoder_input_ids, + generation_config=generation_config, + ) + + # 6.5 Set current `begin_index` for all logit processors + if logits_processor is not None: + for proc in logits_processor: + if hasattr(proc, "set_begin_index"): + proc.set_begin_index(decoder_input_ids.shape[-1]) + + # 6.6 Run generate with fallback + ( + seek_sequences, + seek_outputs, + should_skip, + do_condition_on_prev_tokens, + model_output_type, + ) = self.generate_with_fallback( + segment_input=segment_input, + decoder_input_ids=decoder_input_ids, + cur_bsz=cur_bsz, + batch_idx_map=batch_idx_map, + seek=seek, + num_segment_frames=num_segment_frames, + max_frames=max_frames, + temperatures=temperatures, + generation_config=generation_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + synced_gpus=synced_gpus, + return_token_timestamps=return_token_timestamps, + do_condition_on_prev_tokens=do_condition_on_prev_tokens, + is_shortform=is_shortform, + batch_size=batch_size, + kwargs=kwargs, + ) + + # 6.7 In every generated sequence, split by timestamp tokens and extract segments + for i, seek_sequence in enumerate(seek_sequences): + prev_i = batch_idx_map[i] + + if should_skip[i]: + seek[prev_i] += seek_num_frames[prev_i] + continue + + segments, segment_offset = self._retrieve_segment( + seek_sequence=seek_sequence, + seek_outputs=seek_outputs, + time_offset=time_offset, + timestamp_begin=timestamp_begin, + seek_num_frames=seek_num_frames, + time_precision=time_precision, + input_stride=input_stride, + prev_idx=prev_i, + idx=i, + return_token_timestamps=return_token_timestamps, + ) + + current_segments[prev_i] += segments + + if is_shortform: + seek[prev_i] += max_frames[i] + else: + seek[prev_i] += segment_offset + + # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted + # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output + final_segments = ( + [x[1:] for x in current_segments] + if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment") + else current_segments + ) + + sequences = _pad_to_max_length( + final_segments, generation_config.pad_token_id, device=self.device, padding_side="right" + ) + + # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. + if return_segments: + return {"sequences": sequences, "segments": final_segments} + + if is_shortform: + # add eos token: + if generation_config.max_new_tokens is None and generation_config.max_length is None: + eos_tokens = torch.full((sequences.shape[0], 1), generation_config.eos_token_id) + sequences = torch.cat([sequences, eos_tokens], dim=-1) + + if return_token_timestamps: + outputs = {} + outputs["sequences"] = sequences + outputs["token_timestamps"] = torch.stack([d["token_timestamps"] for d in seek_outputs], dim=0) + else: + outputs = sequences + + if return_dict_in_generate and generation_config.return_dict_in_generate: + dict_outputs = self._stack_split_outputs(seek_outputs, model_output_type, sequences.device, kwargs) + + if num_return_sequences > 1: + if hasattr(dict_outputs, "encoder_attentions") and dict_outputs.encoder_attentions is not None: + dict_outputs.encoder_attentions = tuple( + dict_outputs.encoder_attentions[i][::num_return_sequences] + for i in range(len(dict_outputs.encoder_attentions)) + ) + if ( + hasattr(dict_outputs, "encoder_hidden_states") + and dict_outputs.encoder_hidden_states is not None + ): + dict_outputs.encoder_hidden_states = tuple( + dict_outputs.encoder_hidden_states[i][::num_return_sequences] + for i in range(len(dict_outputs.encoder_hidden_states)) + ) + if return_token_timestamps: + dict_outputs["token_timestamps"] = outputs["token_timestamps"] + return dict_outputs + + return outputs + + return sequences + + def generate_with_fallback( + self, + segment_input, + decoder_input_ids, + cur_bsz, + batch_idx_map, + seek, + num_segment_frames, + max_frames, + temperatures, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + return_token_timestamps, + do_condition_on_prev_tokens, + is_shortform, + batch_size, + kwargs, + ): + kwargs = copy.copy(kwargs) + + # 6.6 Batch generate current chunk + seek_sequence_list = [None for _ in range(cur_bsz)] + seek_outputs_list = [None for _ in range(cur_bsz)] + needs_fallback = [False for _ in range(cur_bsz)] + should_skip = [False for _ in range(cur_bsz)] + fallback_index_map = list(range(cur_bsz)) + if generation_config.no_speech_threshold is not None: + self._setup_no_speech_detection(logits_processor, segment_input, decoder_input_ids, kwargs) + + for fallback_idx, temperature in enumerate(temperatures): + generation_config.do_sample = temperature is not None and temperature > 0.0 + generation_config.temperature = temperature if generation_config.do_sample else 1.0 + if generation_config.do_sample: + generation_config.num_beams = 1 + + generate_kwargs = copy.copy(kwargs) + for key in ["do_sample", "temperature", "num_beams"]: + if key in generate_kwargs: + del generate_kwargs[key] + + cur_bsz = decoder_input_ids.shape[0] + if generation_config.cache_implementation == "static" and cur_bsz < batch_size: + segment_input = F.pad(segment_input, (0, 0, 0, 0, 0, batch_size - cur_bsz), value=0) + decoder_input_ids = F.pad( + decoder_input_ids, (0, 0, 0, batch_size - cur_bsz), value=generation_config.pad_token_id + ) + if generate_kwargs.get("decoder_attention_mask") is not None: + generate_kwargs["decoder_attention_mask"] = F.pad( + generate_kwargs["decoder_attention_mask"], (0, 0, 0, batch_size - cur_bsz), value=True + ) + if generate_kwargs.get("encoder_outputs") is not None: + generate_kwargs["encoder_outputs"] = F.pad( + generate_kwargs["encoder_outputs"], (0, 0, 0, 0, 0, batch_size - cur_bsz), value=0 + ) + + seek_outputs = super().generate( + segment_input, + generation_config=generation_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + synced_gpus=synced_gpus, + decoder_input_ids=decoder_input_ids, + **generate_kwargs, + ) + + model_output_type = type(seek_outputs) + + # post-process sequence tokens and outputs to be in list form + seek_sequences, seek_outputs = self._postprocess_outputs( + seek_outputs=seek_outputs, + decoder_input_ids=decoder_input_ids, + return_token_timestamps=return_token_timestamps, + generation_config=generation_config, + is_shortform=is_shortform, + ) + + if cur_bsz < batch_size: + seek_sequences = seek_sequences[:cur_bsz] + seek_outputs = seek_outputs[:cur_bsz] + + # 6.7 Extract cut sequences from every sequence and check if fallback should be applied + # Loop over each decoded audio individually as each decoding can be of a different length + new_fallback_index_map = [] + new_segment_input = [] + new_decoder_input_ids = [] + new_decoder_attention_mask = [] + + for i, seek_sequence in enumerate(seek_sequences): + # make sure we cut a predicted EOS token if we are not finished with the generation yet + prev_i = batch_idx_map[fallback_index_map[i]] + is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i] + + # remove eos token id + if is_not_final and seek_sequence[-1] == generation_config.eos_token_id: + seek_sequence = seek_sequence[:-1] + if return_token_timestamps and not is_shortform: + seek_outputs[i]["token_timestamps"] = seek_outputs[i]["token_timestamps"][:-1] + + # remove all padding tokens + if seek_sequence[-1] == generation_config.pad_token_id: + num_paddings = (seek_sequence == generation_config.pad_token_id).sum() + seek_sequence = seek_sequence[:-num_paddings] + if return_token_timestamps and not is_shortform: + seek_outputs[i]["token_timestamps"] = seek_outputs[i]["token_timestamps"][:-num_paddings] + + # check which sequences in batch need fallback & which should be skipped + needs_fallback[i], should_skip[i] = self._need_fallback( + seek_sequence, + seek_outputs, + i, + logits_processor, + generation_config, + self.config.vocab_size, + temperature, + ) + + seek_sequence_list[fallback_index_map[i]] = seek_sequence + seek_outputs_list[fallback_index_map[i]] = seek_outputs[i] + is_low_temperature = temperature is None or temperature < 0.5 + do_condition_on_prev_tokens[fallback_index_map[i]] = ( + generation_config.condition_on_prev_tokens and is_low_temperature + ) + + if needs_fallback[i]: + new_fallback_index_map.append(fallback_index_map[i]) + new_segment_input.append(segment_input[i]) + new_decoder_input_ids.append(decoder_input_ids[i]) + if "decoder_attention_mask" in kwargs: + new_decoder_attention_mask.append(kwargs["decoder_attention_mask"][i]) + + fallback_index_map = new_fallback_index_map + + # if no sequence needs to be run with temperature fallback, we're finished + if len(fallback_index_map) == 0 or fallback_idx == len(temperatures) - 1: + seek_sequences = seek_sequence_list + seek_outputs = seek_outputs_list + break + + # if we're still in the loop, make sure that decoder_input_ids and segment inputs are tensors + decoder_input_ids = torch.stack(new_decoder_input_ids) + segment_input = torch.stack(new_segment_input) + if "decoder_attention_mask" in kwargs: + kwargs["decoder_attention_mask"] = torch.stack(new_decoder_attention_mask) + + return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens, model_output_type + + @staticmethod + def _prepare_segments(prompt_ids, batch_size, generation_config): + if prompt_ids is not None and generation_config.prompt_condition_type == "first-segment": + prev_sot_token_id = getattr(generation_config, "prev_sot_token_id", None) + prompt_ids = prompt_ids[1:] if prompt_ids[0] == prev_sot_token_id else prompt_ids + current_segments = [[{"tokens": prompt_ids}] for _ in range(batch_size)] + else: + current_segments = [[] for _ in range(batch_size)] + + return current_segments + + def _postprocess_outputs( + self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config, is_shortform + ): + # remove all previously passed decoder input ids + start_idx = decoder_input_ids.shape[-1] if not is_shortform else torch.tensor(0) + + if isinstance(seek_outputs, torch.Tensor): + seek_outputs = seek_outputs[:, start_idx:] + return seek_outputs, seek_outputs + + if return_token_timestamps and hasattr(generation_config, "alignment_heads"): + num_frames = getattr(generation_config, "num_frames", None) + seek_outputs["token_timestamps"] = self._extract_token_timestamps( + seek_outputs, generation_config.alignment_heads, num_frames=num_frames + ) + seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, start_idx:] + + seek_outputs["sequences"] = seek_outputs["sequences"][:, start_idx:] + + def split_by_batch_index(values, key, batch_idx, is_shortform): + if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]: + return [v[batch_idx].cpu() for v in values] + if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]: + return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values) + elif key == "past_key_values": + if not is_shortform: + # we don't save `past_key_values` as this is too costly for longform + return None + elif isinstance(values, EncoderDecoderCache): + all_past_key_values = [] + for layer_idx in range(self.config.decoder_layers): + layer_past_key_values = [] + for cache_cls in [values.self_attention_cache, values.cross_attention_cache]: + for v in [cache_cls.key_cache, cache_cls.value_cache]: + layer_past_key_values.append(v[layer_idx][batch_idx][None].cpu()) + all_past_key_values.append(tuple(layer_past_key_values)) + return tuple(all_past_key_values) + else: + all_past_key_values = [] + for v in range(len(values)): + layer_past_key_values = [] + for w in values[v]: + layer_past_key_values.append(w[batch_idx][None].cpu()) + all_past_key_values.append(tuple(layer_past_key_values)) + return tuple(all_past_key_values) + + return values[batch_idx].cpu() + + sequence_tokens = seek_outputs["sequences"] + seek_outputs = [ + {k: split_by_batch_index(v, k, i, is_shortform) for k, v in seek_outputs.items()} + for i in range(sequence_tokens.shape[0]) + ] + + return sequence_tokens, seek_outputs + + def _stack_split_outputs(self, seek_outputs, model_output_type, device, kwargs): + # Stack back seek_outputs tensors after splitting them with the split_by_batch_index method + outputs = {} + for key in seek_outputs[0].keys(): + if key == "sequences": + outputs[key] = torch.stack([v[key] for v in seek_outputs], dim=0).to(device) + if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]: + outputs[key] = tuple( + torch.stack([v[key][i] for v in seek_outputs]).to(device) for i in range(len(seek_outputs[0][key])) + ) + if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]: + outputs[key] = tuple( + tuple( + torch.stack([v[key][i][j] for v in seek_outputs]).squeeze(1).to(device) + for j in range(len(seek_outputs[0][key][0])) + ) + for i in range(len(seek_outputs[0][key])) + ) + if key == "past_key_values": + past_key_value_type = kwargs.get("past_key_values") + if seek_outputs[0][key] is not None: + outputs[key] = tuple( + tuple( + torch.stack([v[key][i][j] for v in seek_outputs]).squeeze(1).to(device) + for j in range(len(seek_outputs[0][key][0])) + ) + for i in range(len(seek_outputs[0][key])) + ) + if past_key_value_type is not None and isinstance(past_key_value_type, EncoderDecoderCache): + outputs[key] = past_key_value_type.from_legacy_cache(outputs[key]) + else: + outputs[key] = None + + return model_output_type(**outputs) + + def _need_fallback( + self, + seek_sequence, + seek_outputs, + index, + logits_processor, + generation_config, + vocab_size, + temperature, + ): + needs_fallback = False + should_skip = False + if generation_config.compression_ratio_threshold is not None: + compression_ratio = self._retrieve_compression_ratio(seek_sequence, vocab_size) + + if compression_ratio > generation_config.compression_ratio_threshold: + needs_fallback = True + + if generation_config.logprob_threshold is not None: + if hasattr(seek_outputs[0], "sequences_scores"): + logprobs = [s["sequences_scores"] for s in seek_outputs][index] + else: + scores = seek_outputs[index]["scores"] + logprobs = self._retrieve_avg_logprobs( + scores, seek_sequence, generation_config.eos_token_id, temperature + ) + + if logprobs < generation_config.logprob_threshold: + needs_fallback = True + + if generation_config.no_speech_threshold is not None: + no_speech_prob = _get_attr_from_logit_processors( + logits_processor, WhisperNoSpeechDetection, "no_speech_prob" + ) + + if ( + logprobs < generation_config.logprob_threshold + and no_speech_prob[index] > generation_config.no_speech_threshold + ): + needs_fallback = False + should_skip = True + + return needs_fallback, should_skip + + def _expand_variables_for_generation( + self, input_features, seek, max_frames, init_tokens, batch_size, condition_on_prev_tokens, generation_config + ): + if generation_config.num_return_sequences is not None and generation_config.num_return_sequences > 1: + batch_idx_map = list(range(batch_size * generation_config.num_return_sequences)) + cur_bsz = len(batch_idx_map) + do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(len(batch_idx_map))] + input_features = input_features.repeat_interleave(generation_config.num_return_sequences, dim=0) + seek = seek.repeat_interleave(generation_config.num_return_sequences, dim=0) + max_frames = max_frames.repeat_interleave(generation_config.num_return_sequences, dim=0) + init_tokens = init_tokens.repeat_interleave(generation_config.num_return_sequences, dim=0) + generation_config.num_return_sequences = 1 + else: + cur_bsz = batch_size + batch_idx_map = list(range(cur_bsz)) + do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(cur_bsz)] + + return ( + batch_idx_map, + cur_bsz, + input_features, + seek, + max_frames, + init_tokens, + do_condition_on_prev_tokens, + ) + + @staticmethod + def _setup_no_speech_detection(logits_processor, segment_input, decoder_input_ids, kwargs): + set_inputs = _get_attr_from_logit_processors(logits_processor, WhisperNoSpeechDetection, "set_inputs") + extra_kwargs = {k: v for k, v in kwargs.items() if torch.is_tensor(v)} + set_inputs({"inputs": segment_input, "decoder_input_ids": decoder_input_ids, **extra_kwargs}) + + @staticmethod + def _retrieve_total_input_frames(input_features, input_stride, kwargs): + if input_features is not None: + return input_features.shape[0], input_features.shape[-1] + + if "encoder_outputs" in kwargs: + encoder_outputs_shape = ( + kwargs["encoder_outputs"][0].shape + if isinstance(kwargs["encoder_outputs"], BaseModelOutput) + else kwargs["encoder_outputs"].shape + ) + return encoder_outputs_shape[0], encoder_outputs_shape[1] * input_stride + + raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.") + + @staticmethod + def _maybe_warn_unused_inputs( + condition_on_prev_tokens, + temperature, + compression_ratio_threshold, + logprob_threshold, + no_speech_threshold, + total_input_frames, + ): + warning_prefix = ( + f"Audio input consists of only {total_input_frames}. " + "Short-form transcription is activated." + "{}, but will be ignored." + ) + if condition_on_prev_tokens is not None: + logger.warning(warning_prefix.format(f"condition_on_prev_tokens is set to {condition_on_prev_tokens}")) + + if compression_ratio_threshold is not None: + logger.warning( + warning_prefix.format(f"compression_ratio_threshold is set to {compression_ratio_threshold}") + ) + + if logprob_threshold is not None: + logger.warning(warning_prefix.format(f"logprob_threshold is set to {logprob_threshold}")) + + if no_speech_threshold is not None: + logger.warning(warning_prefix.format(f"no_speech_threshold is set to {no_speech_threshold}")) + + # when passing temperature as a list it cannot just be ignored => throw error in this case + if isinstance(temperature, (list, tuple)): + raise ValueError( + f"Audio input consists of only {total_input_frames}. Short-form transcription is activated." + f"temperature cannot be set to {temperature} which can only be used for temperature fallback for long-form generation. Make sure to set `temperature` to a float value or `None` for short-form generation." + ) + + @staticmethod + def _set_return_outputs(return_dict_in_generate, return_token_timestamps, logprob_threshold, generation_config): + if return_dict_in_generate is None: + return_dict_in_generate = generation_config.return_dict_in_generate + else: + generation_config.return_dict_in_generate = return_dict_in_generate + + generation_config.return_token_timestamps = return_token_timestamps + if return_token_timestamps: + generation_config.return_dict_in_generate = True + generation_config.output_attentions = True + generation_config.output_scores = True + + if logprob_threshold is not None: + generation_config.return_dict_in_generate = True + generation_config.output_scores = True + + return return_dict_in_generate + + def _set_return_timestamps(self, return_timestamps, is_shortform, generation_config): + if return_timestamps is None and hasattr(generation_config, "return_timestamps"): + return_timestamps = generation_config.return_timestamps + + if not is_shortform: + if return_timestamps is False: + raise ValueError( + "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " + "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features." + ) + + logger.info("Setting `return_timestamps=True` for long-form generation.") + return_timestamps = True + + if return_timestamps and not hasattr(generation_config, "no_timestamps_token_id"): + raise ValueError( + "You are trying to return timestamps, but the generation config is not properly set. " + "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " + "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" + ) + + generation_config.return_timestamps = return_timestamps + + if hasattr(generation_config, "no_timestamps_token_id"): + timestamp_begin = generation_config.no_timestamps_token_id + 1 + else: + # BC for models missing the `no_timestamps_token_id` in the generation config when generating short-form with no timestamps + # We set the timestamp begin token larger than the vocab size, such that the timestamp condition is never met in the decoding loop + timestamp_begin = self.config.vocab_size + 1 + + return timestamp_begin + + @staticmethod + def _set_language_and_task(language, task, is_multilingual, generation_config): + if is_multilingual is not None: + if not hasattr(generation_config, "is_multilingual"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `is_multilingual` argument " + "to `generate`. Please update the generation config as per the instructions " + "https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + generation_config.is_multilingual = is_multilingual + + if hasattr(generation_config, "is_multilingual") and not generation_config.is_multilingual: + if task is not None or language is not None: + raise ValueError( + "Cannot specify `task` or `language` for an English-only model. If the model is intended to be " + "multilingual, pass `is_multilingual=True` to generate, or update the generation config." + ) + + if language is not None: + if not hasattr(generation_config, "lang_to_id"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `language` argument " + "to `generate`. Either set the language using the `forced_decoder_ids` in the model config, " + "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + generation_config.language = language + + if task is not None: + if not hasattr(generation_config, "task_to_id"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `task` argument " + "to `generate`. Either set the task using the `forced_decoder_ids` in the model config, " + "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + generation_config.task = task + + def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs): + def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): + """short function to replace num with a itr in lst""" + found = any(i in lst for i in itr) + if found: + lst = [num if i in itr else i for i in lst] + else: + lst.append(num) + return lst + + def language_to_id(language: str) -> int: + language = language.lower() + if language in generation_config.lang_to_id.keys(): + language_token = language + elif language in TO_LANGUAGE_CODE.keys(): + language_token = f"<|{TO_LANGUAGE_CODE[language]}|>" + elif language in TO_LANGUAGE_CODE.values(): + language_token = f"<|{language}|>" + else: + is_language_code = len(language) == 2 + raise ValueError( + f"Unsupported language: {language}. Language should be one of:" + f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." + ) + if language_token not in generation_config.lang_to_id: + raise ValueError( + f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." + "(You should just add it to the generation config)" + ) + + return generation_config.lang_to_id[language_token] + + task = getattr(generation_config, "task", None) + language = getattr(generation_config, "language", None) + + forced_decoder_ids = generation_config.forced_decoder_ids + if forced_decoder_ids is not None: + if language is None and task is None and forced_decoder_ids[0][1] is None: + logger.warning_once( + "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English." + "This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`." + ) + elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: + forced_decoder_ids = config.forced_decoder_ids + + if forced_decoder_ids is not None and task is not None: + logger.warning_once( + f"You have passed task={task}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of task={task}." + ) + forced_decoder_ids = None + elif forced_decoder_ids is not None and language is not None: + logger.warning_once( + f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}." + ) + forced_decoder_ids = None + + init_tokens = [generation_config.decoder_start_token_id] + if forced_decoder_ids is not None and forced_decoder_ids[0][0] == 1: + i = 1 + while len(forced_decoder_ids) > 0 and forced_decoder_ids[0][0] == i: + init_tokens += [forced_decoder_ids[0][1]] + forced_decoder_ids = forced_decoder_ids[1:] + i += 1 + + if len(forced_decoder_ids) > 0: + raise ValueError( + f"You are using token ids in `forced_decoder_ids` that do not seem to correctly follow the prompt pattern of Whisper. Make sure that {forced_decoder_ids} has an entry for all indices >= 1 and < {forced_decoder_ids[0][0]}.", + ) + + # from v4.39 the forced decoder ids are always None in favour of decoder input ids + generation_config.forced_decoder_ids = None + + is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None) + + # Make sure language is a list of strings of the correct length + if isinstance(language, (list, tuple)): + if any(l is None for l in language): + raise TypeError( + "Expected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`." + ) + if len(language) != batch_size: + raise ValueError( + "When passing a list of languages, the length of the list must match the batch size. " + f"Expected length of {batch_size}, but got {len(language)} languages." + ) + languages = language + elif language is None: + # Language will be detected for each item in batch + languages = [None] * batch_size + else: + languages = [language] # Use a length-1 list now, broadcast later + + # Separate init_tokens for each language + init_tokens = [copy.copy(init_tokens) for _ in languages] + + # Update init_tokens with languages + lang_ids = None + if language is not None: + lang_ids = [language_to_id(l) for l in languages] + elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined: + # language is not defined or intentially set to `None` to trigger language detection + lang_ids = self.detect_language( + input_features=input_features, + encoder_outputs=kwargs.get("encoder_outputs", None), + attention_mask=kwargs.get("attention_mask", None), + generation_config=generation_config, + num_segment_frames=num_segment_frames, + ).tolist() + if lang_ids is not None: + # append or replace lang_ids to init_tokens + for i in range(len(init_tokens)): + if len(init_tokens[i]) > 1: + init_tokens[i][1] = lang_ids[i] + else: + init_tokens[i].append(lang_ids[i]) + del languages + + # Update init_tokens with task + for i in range(len(init_tokens)): + if task is not None: + if task in TASK_IDS: + init_tokens[i].append(generation_config.task_to_id[generation_config.task]) + task_id = generation_config.task_to_id[generation_config.task] + + # if task is defined it'll overwrite task ids that might have already been defined via the generation_config + replace_or_add(init_tokens[i], task_id, generation_config.task_to_id.values()) + else: + raise ValueError(f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`") + elif language is not None and hasattr(generation_config, "task_to_id"): + # if language is defined, but no task id is in `init_tokens`, default to transcribe + if not any(ti in init_tokens[i] for ti in generation_config.task_to_id.values()): + init_tokens[i].append(generation_config.task_to_id["transcribe"]) + + if ( + not generation_config.return_timestamps + and hasattr(generation_config, "no_timestamps_token_id") + and init_tokens[i][-1] != generation_config.no_timestamps_token_id + ): + init_tokens[i].append(generation_config.no_timestamps_token_id) + elif ( + generation_config.return_timestamps and init_tokens[i][-1] == generation_config.no_timestamps_token_id + ): + logger.info( + "<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `'True'`." + ) + init_tokens[i] = init_tokens[i][:-1] + + # let's make sure we don't pass `None` tokens as prompt tokens + init_tokens[i] = [t for t in init_tokens[i] if t is not None] + + return torch.as_tensor(init_tokens, dtype=torch.long, device=self.device).expand(batch_size, -1) + + def detect_language( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None, + generation_config: Optional[GenerationConfig] = None, + num_segment_frames: int = 3000, + ) -> torch.Tensor: + """ + Detects language from log-mel input features or encoder_outputs + + Parameters: + input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): + Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + num_segment_frames (`int`, *optional*, defaults to 3000): + The number of log-mel frames the model expects + + Return: + A `torch.LongTensor` representing the detected language ids. + """ + if input_features is None and encoder_outputs is None: + raise ValueError("You have to specify either `input_features` or `encoder_outputs`") + elif input_features is not None and encoder_outputs is not None: + raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!") + elif input_features is not None: + inputs = {"input_features": input_features[:, :, :num_segment_frames]} + batch_size = input_features.shape[0] + elif encoder_outputs is not None: + inputs = {"encoder_outputs": encoder_outputs} + batch_size = ( + encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0] + ) + if attention_mask is not None: + inputs["attention_mask"] = attention_mask + + generation_config = generation_config or self.generation_config + decoder_input_ids = ( + torch.ones((batch_size, 1), device=self.device, dtype=torch.long) + * generation_config.decoder_start_token_id + ) + + with torch.no_grad(): + logits = self(**inputs, decoder_input_ids=decoder_input_ids).logits[:, -1] + + non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool) + non_lang_mask[list(generation_config.lang_to_id.values())] = False + + logits[:, non_lang_mask] = -np.inf + + lang_ids = logits.argmax(-1) + + return lang_ids + + @staticmethod + def _check_decoder_input_ids(kwargs): + decoder_input_ids = kwargs.get("decoder_input_ids", None) + assistant_model = kwargs.get("assistant_model", None) + if decoder_input_ids is not None and assistant_model is not None: + raise ValueError( + "Passing `decoder_input_ids` is deprecated. Consider passing `prompt_ids` instead.", + ) + + @staticmethod + def _set_num_frames(return_token_timestamps, generation_config, kwargs): + if return_token_timestamps: + if getattr(generation_config, "task", None) == "translate": + logger.warning("Token-level timestamps may not be reliable for task 'translate'.") + if not hasattr(generation_config, "alignment_heads"): + raise ValueError( + "Model generation config has no `alignment_heads`, token-level timestamps not available. " + "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." + ) + generation_config.num_frames = kwargs.pop("num_frames", None) + + @staticmethod + def _set_thresholds_and_condition( + generation_config, + logprob_threshold, + compression_ratio_threshold, + no_speech_threshold, + condition_on_prev_tokens, + ): + generation_config.logprob_threshold = ( + logprob_threshold + if logprob_threshold is not None + else getattr(generation_config, "logprob_threshold", None) + ) + generation_config.compression_ratio_threshold = ( + compression_ratio_threshold + if compression_ratio_threshold is not None + else getattr(generation_config, "compression_ratio_threshold", None) + ) + generation_config.no_speech_threshold = ( + no_speech_threshold + if no_speech_threshold is not None + else getattr(generation_config, "no_speech_threshold", None) + ) + generation_config.condition_on_prev_tokens = ( + condition_on_prev_tokens + if condition_on_prev_tokens is not None + else getattr(generation_config, "condition_on_prev_tokens", None) + ) + + @staticmethod + def _set_prompt_condition_type(generation_config, prompt_condition_type): + allowed_cond_types = ["first-segment", "all-segments"] + + # default to "first-segment" + prompt_condition_type = prompt_condition_type or allowed_cond_types[0] + + if prompt_condition_type not in allowed_cond_types: + raise ValueError( + f"`prompt_condition_type={prompt_condition_type} does not exist. Make sure to set `prompt_condition_type` to one of {', '.join(allowed_cond_types)}" + ) + + if generation_config.condition_on_prev_tokens is not True and prompt_condition_type == "all-segments": + raise ValueError( + "Make sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`." + ) + + generation_config.prompt_condition_type = prompt_condition_type + + @staticmethod + def _set_condition_on_prev_tokens(condition_on_prev_tokens, generation_config): + condition_on_prev_tokens = ( + condition_on_prev_tokens + if condition_on_prev_tokens is not None + else getattr(generation_config, "condition_on_prev_tokens", False) + ) + generation_config.condition_on_prev_tokens = condition_on_prev_tokens + + @staticmethod + def _retrieve_max_frames_and_seek(batch_size, attention_mask, total_input_frames, is_shortform): + if batch_size > 1 and not is_shortform and attention_mask is None: + raise ValueError( + "When doing batched long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " + ) + elif batch_size > 1 and not is_shortform: + max_frames = attention_mask.sum(-1).cpu().to(torch.long) + seek = torch.zeros((batch_size,), dtype=torch.long) + else: + max_frames = torch.ones((batch_size,), dtype=torch.long) * total_input_frames + seek = torch.zeros((batch_size,), dtype=torch.long) + + return max_frames, seek + + def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, num_beams, device): + if generation_config.return_timestamps is True: + timestamp_processor = WhisperTimeStampLogitsProcessor(generation_config, begin_index=begin_index) + logits_processor = ( + [timestamp_processor] if logits_processor is None else [timestamp_processor] + logits_processor + ) + + if generation_config.suppress_tokens is not None: + suppress_tokens_processor = SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device) + logits_processor = ( + [suppress_tokens_processor] + if logits_processor is None + else [suppress_tokens_processor] + logits_processor + ) + generation_config.suppress_tokens = None + + if generation_config.begin_suppress_tokens is not None: + begin_suppress_processor = SuppressTokensAtBeginLogitsProcessor( + generation_config.begin_suppress_tokens, begin_index=begin_index, device=device + ) + logits_processor = ( + [begin_suppress_processor] + if logits_processor is None + else [begin_suppress_processor] + logits_processor + ) + generation_config.begin_suppress_tokens = None + + if generation_config.no_speech_threshold is not None: + no_speech_detector = WhisperNoSpeechDetection( + no_speech_token=generation_config.no_timestamps_token_id - 1, + begin_index=begin_index, + scores_is_logprobs=num_beams > 1, + ) + logits_processor = ( + [no_speech_detector] if logits_processor is None else [no_speech_detector] + logits_processor + ) + no_speech_detector.set_model(self) + + return logits_processor + + @staticmethod + def _maybe_reduce_batch(input_features, seek, max_frames, cur_bsz, batch_idx_map): + prev_bsz = cur_bsz + new_batch_idx_map = [] + for i in range(prev_bsz): + prev_i = batch_idx_map[i] + if seek[prev_i] >= max_frames[prev_i]: + cut_index = i + (cur_bsz - prev_bsz) + cur_bsz -= 1 + input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0) + else: + # cut out index that goes away + new_batch_idx_map.append(prev_i) + + return input_features, cur_bsz, new_batch_idx_map + + @staticmethod + def _get_input_segment(input_features, seek, seek_num_frames, num_segment_frames, cur_bsz, batch_idx_map): + if input_features is None: + return None + + segment_input = [] + for i in range(cur_bsz): + prev_i = batch_idx_map[i] + segment_input_slice = input_features[i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i]] + + if segment_input_slice.shape[-1] < num_segment_frames: + # pad to 3000 if necessary + segment_input_slice = F.pad( + segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1]) + ) + + segment_input.append(segment_input_slice) + + segment_input = torch.cat(segment_input, dim=0) + + return segment_input + + @staticmethod + def _prepare_decoder_input_ids( + cur_bsz, + init_tokens, + current_segments, + batch_idx_map, + do_condition_on_prev_tokens, + prompt_ids, + generation_config, + config, + device, + suppress_tokens, + kwargs, + ): + if "decoder_input_ids" in kwargs: + decoder_input_ids = kwargs.pop("decoder_input_ids") + + return decoder_input_ids, kwargs + + cut_off_length = config.max_target_positions // 2 - 1 + + decoder_input_ids = init_tokens[batch_idx_map] + + prev_start_of_text = getattr(generation_config, "prev_sot_token_id", None) + if prev_start_of_text is None: + prev_start_of_text = suppress_tokens[-2] if suppress_tokens is not None else None + + if any(do_condition_on_prev_tokens) and len(current_segments[0]) > 0: + # according to https://github.com/openai/whisper/blob/e58f28804528831904c3b6f2c0e473f346223433/whisper/decoding.py#L609 + active_segments = [current_segments[i] if do_condition_on_prev_tokens[i] else None for i in batch_idx_map] + + if prompt_ids is not None and generation_config.prompt_condition_type == "all-segments": + prev_ids = prompt_ids + else: + one_tensor = torch.ones((cur_bsz, 1), device=device, dtype=torch.long) + prev_ids = prev_start_of_text * one_tensor[0] if prev_start_of_text is not None else None + + padding = "max_length" if generation_config.cache_implementation == "static" else "longest" + + prev_tokens = _pad_to_max_length( + active_segments, + generation_config.pad_token_id, + device=device, + padding_side="left", + padding=padding, + bos_token_tensor=prev_ids, + cut_off_length=cut_off_length, + ) + decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1) + + kwargs["decoder_attention_mask"] = decoder_input_ids != generation_config.pad_token_id + elif prompt_ids is not None: + prev_tokens = prompt_ids[None].repeat(decoder_input_ids.shape[0], 1) + decoder_input_ids = torch.cat([prev_tokens, decoder_input_ids], dim=-1) + # make sure `"decoder_attention_mask"` is not passed to forward + kwargs.pop("decoder_attention_mask", None) + else: + # make sure `"decoder_attention_mask"` is not passed to forward + kwargs.pop("decoder_attention_mask", None) + + return decoder_input_ids, kwargs + + def _set_max_new_tokens_and_length(self, config, decoder_input_ids, generation_config): + max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0 + if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions: + raise ValueError( + f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` " + f"is {max_new_tokens}. Thus, the combined length of " + f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " + f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " + "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " + f"so that their combined length is less than {self.config.max_target_positions}." + ) + + num_initial_tokens = min(config.max_target_positions // 2 - 1, decoder_input_ids.shape[-1] - 1) + + # Make sure we don't get larger than `max_length` + if generation_config.max_length is not None and generation_config.max_new_tokens is None: + max_length = min(generation_config.max_length + num_initial_tokens, config.max_target_positions) + logger.info( + f"Increase max_length from {generation_config.max_length} to {max_length} since input is conditioned on previous segment." + ) + elif ( + generation_config.max_new_tokens is not None + and generation_config.max_new_tokens + decoder_input_ids.shape[-1] > config.max_target_positions + ): + max_new_tokens = config.max_target_positions - decoder_input_ids.shape[-1] + generation_config.max_new_tokens = max_new_tokens + + @staticmethod + def _retrieve_compression_ratio(tokens, vocab_size): + """Compute byte length of zlib compressed token bytes vs. byte length of raw token bytes""" + length = int(math.log2(vocab_size) / 8) + 1 + token_bytes = b"".join([t.to_bytes(length, "little") for t in tokens.tolist()]) + compression_ratio = len(token_bytes) / len(zlib.compress(token_bytes)) + + return compression_ratio + + @staticmethod + def _retrieve_avg_logprobs(scores, tokens, eos_token_id, temperature): + rescale_temperature = temperature if temperature > 0.0 else 1 + scores = torch.stack(scores).to(tokens.device) + + if scores.shape[0] > tokens.shape[0]: + scores = scores[: tokens.shape[0]] + else: + tokens = tokens[-scores.shape[0] :] + + logprobs = F.log_softmax((scores * rescale_temperature).float(), dim=-1).to(scores.dtype) + + # retrieve logprob of selected tokens and sum + sum_logprobs = sum((logprobs[i][tokens[i]] * (tokens[i] != eos_token_id)) for i in range(logprobs.shape[0])) + length = (tokens != eos_token_id).sum(-1) if eos_token_id is not None else tokens.shape[0] + + avg_logprobs = sum_logprobs / (length + 1) + return avg_logprobs + + @staticmethod + def _retrieve_segment( + seek_sequence, + seek_outputs, + time_offset, + timestamp_begin, + seek_num_frames, + time_precision, + input_stride, + prev_idx, + idx, + return_token_timestamps, + ): + # find the predicted "end of segment" predictions of Whisper + # "end of segment" predictions occur whenever Whisper predicts a timestamp token + timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin) + single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True] + timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + timestamp_segment_indices.add_(1) + token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else [] + + # If whisper predicted a "end of segment" via a timestep token, let's go ever each + # "end of segment" prediction and slice the decoding into segments accordingly + if len(timestamp_segment_indices) > 0: + # if the output contains two consecutive timestamp tokens + slices = timestamp_segment_indices.tolist() + segments = [] + if single_timestamp_ending: + slices.append(len(seek_sequence)) + + last_slice = 0 + # Add each segment to list of all segments + for current_slice in slices: + sliced_tokens = seek_sequence[last_slice:current_slice] + start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin + end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin + segments.append( + { + "start": time_offset[prev_idx] + start_timestamp_pos * time_precision, + "end": time_offset[prev_idx] + end_timestamp_pos * time_precision, + "tokens": sliced_tokens, + "result": seek_outputs[idx], + } + ) + if return_token_timestamps: + segments[-1]["token_timestamps"] = ( + token_timestamps[last_slice:current_slice] + time_offset[prev_idx] + ) + last_slice = current_slice + + if single_timestamp_ending: + # single timestamp at the end means no speech after the last timestamp. + segment_offset = seek_num_frames[prev_idx] + else: + # otherwise, ignore the unfinished segment and seek to the last timestamp + # here we throw away all predictions after the last predicted "end of segment" + # since we are cutting right in the middle of an audio + last_timestamp_pos = seek_sequence[last_slice - 1].item() - timestamp_begin + segment_offset = last_timestamp_pos * input_stride + else: + # If whisper does not predict any "end of segment" token, then + # the whole decoding is considered a segment and we add it to the list of segments + timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()] + last_timestamp_pos = seek_num_frames[prev_idx] + if timestamps.numel() > 0 and timestamps[-1].item() != timestamp_begin: + # no consecutive timestamps but it has a timestamp; use the last one. + last_timestamp_pos = timestamps[-1].item() - timestamp_begin + segments = [ + { + "start": time_offset[prev_idx], + "end": time_offset[prev_idx] + last_timestamp_pos * time_precision, + "tokens": seek_sequence, + "result": seek_outputs[idx], + } + ] + if return_token_timestamps: + segments[-1]["token_timestamps"] = token_timestamps + time_offset[prev_idx] + segment_offset = seek_num_frames[prev_idx] + + return segments, segment_offset diff --git a/speech_tokenizer/modeling_whisper.py b/speech_tokenizer/modeling_whisper.py new file mode 100644 index 0000000000000000000000000000000000000000..35128981a161453aeece6d2a1fefe3cbf3bb7bcf --- /dev/null +++ b/speech_tokenizer/modeling_whisper.py @@ -0,0 +1,2546 @@ +# coding=utf-8 +# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Whisper model.""" + +import math +import os.path +import random +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from dataclasses import dataclass +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + SequenceClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_whisper import WhisperVQConfig +from .generation_whisper import WhisperGenerationMixin + +if is_flash_attn_2_available(): + from transformers.modeling_flash_attention_utils import _flash_attention_forward + +logger = logging.get_logger(__name__) + +_HIDDEN_STATES_START_POSITION = 1 + +_CONFIG_FOR_DOC = "WhisperConfig" +_CHECKPOINT_FOR_DOC = "openai/whisper-tiny" + + +@dataclass +class QuantizedBaseModelOutput(BaseModelOutput): + quantized_token_ids: Optional[torch.LongTensor] = None + + +def vector_quantize(inputs, codebook): + embedding_size = codebook.size(1) + inputs_flatten = inputs.reshape(-1, embedding_size) + codebook_sqr = torch.sum(codebook ** 2, dim=1) + inputs_sqr = torch.sum(inputs_flatten ** 2, dim=1, keepdim=True) + # Compute the distances to the codebook + distances = torch.addmm(codebook_sqr + inputs_sqr, + inputs_flatten, codebook.t(), alpha=-2.0, beta=1.0) + + _, indices_flatten = torch.min(distances, dim=1) + codes_flatten = torch.index_select(codebook, dim=0, + index=indices_flatten) + codes = codes_flatten.view_as(inputs) + return codes, indices_flatten, distances + + +def mse_loss_with_mask(input, target, mask): + loss = torch.nn.functional.mse_loss(input, target, reduction='none') + loss = loss.mean(dim=-1) + loss = loss * mask + return loss.sum() / mask.sum() + + +class CausalConv1d(nn.Conv1d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + **kwargs + ): + super(CausalConv1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=0, + dilation=dilation, + groups=groups, + bias=bias, + **kwargs + ) + + self.left_padding = dilation * (kernel_size - 1) + + def forward(self, inp): + x = torch.nn.functional.pad(inp.unsqueeze(2), (self.left_padding, 0, 0, 0)).squeeze(2) + + return super(CausalConv1d, self).forward(x) + + +# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +def sinusoids(length: int, channels: int, max_timescale: float = 10000) -> torch.Tensor: + """Returns sinusoids for positional embedding""" + if channels % 2 != 0: + raise ValueError( + f"Number of channels has to be divisible by 2 for sinusoidal positional embeddings, got {channels} channels." + ) + log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) + scaled_time = torch.arange(length).view(-1, 1) * inv_timescales.view(1, -1) + return torch.cat([scaled_time.sin(), scaled_time.cos()], dim=1) + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +class WhisperPositionalEmbedding(nn.Embedding): + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + + def forward(self, input_ids, past_key_values_length=0, position_ids=None): + if position_ids is None: + return self.weight[past_key_values_length: past_key_values_length + input_ids.shape[1]] + else: + return self.weight[position_ids] + + +class WhisperAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + layer_idx: Optional[int] = None, + config: Optional[WhisperVQConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + if layer_idx is None and is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + self.layer_idx = layer_idx + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + # Copied from transformers.models.bart.modeling_bart.BartAttention._shape with BART->whisper + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_probs, value_states) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +class WhisperFlashAttention2(WhisperAttention): + """ + Whisper flash attention module. This module inherits from `WhisperAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "The `static` cache implementation is not compatible with `attn_implementation='flash_attention_2'`. " + "Use `attn_implementation='sdpa'` in the meantime, and open an issue at https://github.com/huggingface/transformers" + ) + # WhisperFlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("WhisperFlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = torch.reshape(self.q_proj(hidden_states), (bsz, tgt_len, self.num_heads, self.head_dim)) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim] + # We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view. + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + causal_mask = attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + causal_mask, + tgt_len, + dropout=self.dropout, + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class WhisperSdpaAttention(WhisperAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "WhisperModel is using WhisperSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz) + + if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + past_key_value.is_updated[self.layer_idx] = True + past_key_value = past_key_value.cross_attention_cache + else: + past_key_value = past_key_value.self_attention_cache + + # use key_value_states if cross attention + current_states = key_value_states if key_value_states is not None else hidden_states + if is_cross_attention and past_key_value and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_value.key_cache[self.layer_idx] + value_states = past_key_value.value_cache[self.layer_idx] + else: + key_states = self._shape(self.k_proj(current_states), -1, bsz) + value_states = self._shape(self.v_proj(current_states), -1, bsz) + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + + causal_mask = attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +WHISPER_ATTENTION_CLASSES = { + "eager": WhisperAttention, + # "flash_attention_2": WhisperFlashAttention2, + "sdpa": WhisperSdpaAttention, +} + + +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Whisper, MBART->WHISPER +class WhisperVQEncoderLayer(nn.Module): + def __init__(self, config: WhisperVQConfig, is_causal=False): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + is_causal=is_causal + ) + self.is_causal = is_causal + if self.is_causal: + assert isinstance(self.self_attn, WhisperSdpaAttention), "Causal attention is only supported for SDPA" + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask if not self.is_causal else None, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class WhisperDecoderLayer(nn.Module): + def __init__(self, config: WhisperVQConfig, layer_idx: int = None): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + layer_idx=layer_idx, + config=config, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation]( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + layer_idx=layer_idx, + config=config, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 1 of present_key_value tuple + present_key_value = (present_key_value, cross_attn_present_key_value) + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class WhisperPreTrainedModel(PreTrainedModel): + config_class = WhisperVQConfig + base_model_prefix = "model" + main_input_name = "input_features" + supports_gradient_checkpointing = True + _no_split_modules = ["WhisperEncoderLayer", "WhisperDecoderLayer"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, WhisperVQEncoder): + with torch.no_grad(): + embed_positions = module.embed_positions.weight + embed_positions.copy_(sinusoids(*embed_positions.shape)) + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + +WHISPER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`WhisperConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +WHISPER_INPUTS_DOCSTRING = r""" + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): + Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing *SpecAugment* data augmentation on padding token indices. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Whisper uses the `decoder_start_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_whisper._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the BART + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`EncoderDecoderCache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are + four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and + in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or + when `config.use_cache=True` + + Two formats are allowed: + - An [`~cache_utils.EncoderDecoderCache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache + in the correct position and to infer the complete sequence length. +""" + +WHISPER_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): + Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class WhisperVQEncoder(WhisperPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`WhisperEncoderLayer`]. + + Args: + config: WhisperConfig + """ + + def __init__(self, config: WhisperVQConfig): + super().__init__(config) + self.config = config + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.num_mel_bins = config.num_mel_bins + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_source_positions + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + if config.encoder_causal_convolution: + conv_class = CausalConv1d + else: + conv_class = nn.Conv1d + self.conv1 = conv_class(self.num_mel_bins, embed_dim, kernel_size=3, padding=1) + self.conv2 = conv_class(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1) + + self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim) + self.embed_positions.requires_grad_(False) + if config.quantize_encoder_only: + self.layers = nn.ModuleList([WhisperVQEncoderLayer(config, + is_causal=config.encoder_causal_attention or config.quantize_causal_encoder) + for _ in range(config.quantize_position)]) + else: + self.layers = nn.ModuleList([WhisperVQEncoderLayer(config, is_causal=config.encoder_causal_attention or ( + config.quantize_causal_encoder and layer_id < config.quantize_position)) for layer_id in + range(config.encoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Parameters related to pooling layer + self.pooling_layer = None + # Parameters related to quantization layer + self.codebook = None + self.embed_positions2 = None + self.quantize_loss = None + self.num_active_codes = None + self.quantize_ema_count = 0 + # Save hiddens + self.save_hidden_dir = None + self.save_hidden_position = None + # Initialize weights and apply final processing + self.init_pooling_layer(config) + self.init_quantize_layer(config) + self.post_init() + + def init_pooling_layer(self, config: WhisperVQConfig): + if config.pooling_kernel_size is not None: + if config.pooling_type == "max": + self.pooling_layer = nn.MaxPool1d(kernel_size=config.pooling_kernel_size) + elif config.pooling_type == "avg": + self.pooling_layer = nn.AvgPool1d(kernel_size=config.pooling_kernel_size) + else: + raise NotImplementedError(f"Pooling type {config.pooling_type} not implemented") + + def init_quantize_layer(self, config: WhisperVQConfig, quantize_load_codebook=None): + if config.quantize_vocab_size is not None: + if config.pooling_position is not None: + assert config.quantize_position >= config.pooling_position + self.codebook = nn.Embedding(config.quantize_vocab_size, self.config.d_model) + if quantize_load_codebook is not None: + init_codes = np.load(quantize_load_codebook) + self.codebook.weight.data.copy_(torch.from_numpy(init_codes)) + max_source_positions = self.max_source_positions + if config.pooling_kernel_size is not None: + max_source_positions = math.ceil(max_source_positions / self.config.pooling_kernel_size) + self.embed_positions2 = nn.Embedding(max_source_positions, self.config.d_model) + self.embed_positions2.weight.data.copy_(self.embed_positions.weight.data[:max_source_positions]) + if config.quantize_ema_decay is not None: + self.codebook.weight.requires_grad = False + self.register_buffer("ema_count", torch.ones(config.quantize_vocab_size, dtype=torch.float)) + self.register_buffer("ema_weight", self.codebook.weight.data.clone().float()) + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def get_input_embeddings(self) -> nn.Module: + return self.conv1 + + def set_input_embeddings(self, value: nn.Module): + self.conv1 = value + + def get_block_causal_attention_mask(self, attention_mask, block_size=50): + dtype = self.dtype + batch_size, seq_length = attention_mask.shape + causal_mask = torch.torch.tril( + torch.ones(1, seq_length, seq_length, dtype=torch.bool, device=attention_mask.device)) + block_square_mask = [] + for start in range(0, seq_length, block_size): + end = min(start + block_size, seq_length) + length = end - start + block_square_mask.append(causal_mask.new_ones((length, length))) + block_square_mask = torch.block_diag(*block_square_mask) + block_causal_mask = causal_mask | block_square_mask + block_causal_mask = block_causal_mask & attention_mask[:, None, :] + block_causal_mask = block_causal_mask.to(dtype=dtype) # fp16 compatibility + block_causal_mask = (1.0 - block_causal_mask) * torch.finfo(dtype).min + block_causal_mask = block_causal_mask.unsqueeze(1) + return block_causal_mask + + def forward( + self, + input_features, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + quantized_token_ids=None + ): + r""" + Args: + input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): + Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be + obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a + `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding + and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + attention_mask (`torch.Tensor`)`, *optional*): + Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, + but it is not used. By default the silence in the input log mel spectrogram are ignored. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + + # expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0] + # if input_features.shape[-1] != expected_seq_length: + # raise ValueError( + # f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}." + # ) + + batch_size, feature_size, seq_length = input_features.shape + seq_length = seq_length // (self.conv1.stride[0] * self.conv2.stride[0]) + + attention_mask = attention_mask[:, :: self.conv1.stride[0] * self.conv2.stride[0]] + if self.config.quantize_causal_block_size is not None: + extended_attention_mask = self.get_block_causal_attention_mask(attention_mask, + block_size=self.config.quantize_causal_block_size) + else: + extended_attention_mask = self.get_extended_attention_mask(attention_mask, (batch_size, seq_length)) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight + + hidden_states = inputs_embeds + embed_pos[:seq_length] + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + assert attention_mask.shape[-1] == hidden_states.shape[1] + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + extended_attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + extended_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if idx + 1 == self.config.pooling_position and self.config.pooling_kernel_size is not None: + hidden_states = hidden_states.permute(0, 2, 1) + if hidden_states.shape[-1] % self.config.pooling_kernel_size != 0: + hidden_states = torch.nn.functional.pad(hidden_states, ( + 0, self.config.pooling_kernel_size - hidden_states.shape[-1] % self.config.pooling_kernel_size)) + hidden_states = self.pooling_layer(hidden_states).permute(0, 2, 1) + attention_mask = attention_mask[:, ::self.config.pooling_kernel_size] + if self.config.quantize_causal_block_size is not None: + extended_attention_mask = self.get_block_causal_attention_mask(attention_mask, block_size=self.config.quantize_causal_block_size // self.config.pooling_kernel_size) + else: + extended_attention_mask = self.get_extended_attention_mask(attention_mask, ( + batch_size, seq_length // self.config.pooling_kernel_size)) + + if idx + 1 == self.config.quantize_position and self.config.quantize_vocab_size is not None: + if quantized_token_ids is not None: + hidden_states = self.codebook(quantized_token_ids) + else: + hidden_quantized, indices_flat, distances = vector_quantize(hidden_states, self.codebook.weight) + quantized_token_ids = indices_flat.reshape(batch_size, hidden_quantized.shape[1]) + if self.training: + encodings = torch.nn.functional.one_hot(indices_flat, self.config.quantize_vocab_size).float() + encodings = encodings * attention_mask.reshape(-1, 1) + n = torch.sum(encodings, dim=0) + torch.distributed.all_reduce(n, op=torch.distributed.ReduceOp.SUM) + self.num_active_codes = n.nonzero().shape[0] + if self.config.quantize_ema_decay: + hidden_flat = hidden_states.detach().float().reshape(-1, hidden_states.shape[-1]) + with torch.autocast(device_type='cuda', dtype=torch.float32): + dw = torch.matmul(encodings.t(), hidden_flat) + torch.distributed.all_reduce(dw, op=torch.distributed.ReduceOp.SUM) + self.ema_count = self.ema_count * self.config.quantize_ema_decay + ( + 1 - self.config.quantize_ema_decay) * n + total_count = torch.sum(self.ema_count) + self.ema_count = (self.ema_count + 1e-5) / ( + total_count + self.config.quantize_vocab_size * 1e-5) * total_count + self.ema_weight = self.ema_weight * self.config.quantize_ema_decay + ( + 1 - self.config.quantize_ema_decay) * dw + self.codebook.weight.data = self.ema_weight / self.ema_count.unsqueeze(1) + self.quantize_loss = self.config.quantize_loss_scale * self.config.quantize_commit_coefficient * mse_loss_with_mask( + hidden_states, hidden_quantized.detach(), attention_mask) + self.quantize_ema_count += 1 + if self.config.quantize_restart_interval is not None and self.quantize_ema_count % self.config.quantize_restart_interval == 0: + rank, world_size = torch.distributed.get_rank(), torch.distributed.get_world_size() + segment_vocab_size = self.config.quantize_vocab_size // world_size + start_idx = segment_vocab_size * rank + ema_count_segment = self.ema_count[start_idx: start_idx + segment_vocab_size] + threshold = 1 * ( + self.config.quantize_ema_decay ** self.config.quantize_restart_interval) + update_indices = (ema_count_segment < threshold).nonzero()[:, 0] + start_idx + num_update = update_indices.shape[0] + mask_flat = attention_mask.reshape(-1) > 0 + hidden_selected = hidden_flat[mask_flat] + hidden_update = hidden_selected[random.sample(range(len(hidden_selected)), num_update)] + num_update = torch.as_tensor([num_update], dtype=torch.long, + device=hidden_states.device) + num_update_list = [torch.as_tensor([0], dtype=torch.long, device=hidden_states.device) + for _ + in range(world_size)] + torch.distributed.all_gather(num_update_list, num_update) + update_indices_list = [ + torch.zeros(num.item(), dtype=torch.long, device=hidden_states.device) for num in + num_update_list] + torch.distributed.all_gather(update_indices_list, update_indices) + update_indices = torch.cat(update_indices_list) + hidden_update_list = [ + torch.zeros(num.item(), hidden_flat.shape[-1], dtype=hidden_update.dtype, + device=hidden_states.device) for num in num_update_list] + torch.distributed.all_gather(hidden_update_list, hidden_update) + hidden_update = torch.cat(hidden_update_list) + self.codebook.weight.data[update_indices] = hidden_update + self.ema_count[update_indices] = 1 + self.ema_weight[update_indices] = hidden_update + if torch.distributed.get_rank() == 0: + print(f"restart {len(update_indices)} tokens") + else: + loss = self.config.quantize_loss_scale * ( + self.config.quantize_commit_coefficient * mse_loss_with_mask(hidden_states, + hidden_quantized.detach(), + attention_mask) + mse_loss_with_mask( + hidden_quantized, hidden_states.detach(), attention_mask)) + self.quantize_loss = loss + hidden_states = hidden_states + (hidden_quantized - hidden_states).detach() + else: + hidden_states = hidden_quantized + hidden_states = hidden_states + self.embed_positions2.weight[:hidden_states.shape[1]] + + if idx + 1 == self.save_hidden_position: + import numpy as np + import uuid + to_save = [] + for batch_idx, hidden_state in enumerate(hidden_states): + for seq_idx, hidden in enumerate(hidden_state): + if attention_mask[batch_idx, seq_idx]: + to_save.append(hidden.detach().cpu().numpy()) + np.save(os.path.join(self.save_hidden_dir, f"{str(uuid.uuid4())}.npy"), to_save) + if not self.config.quantize_encoder_only: + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return QuantizedBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions, + quantized_token_ids=quantized_token_ids, + ) + + +class WhisperVQDecoder(WhisperPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`WhisperDecoderLayer`] + + Args: + config: WhisperConfig + """ + + main_input_name = "input_ids" + + def __init__(self, config: WhisperVQConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_target_positions + self.max_source_positions = config.max_source_positions + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model) + + self.layers = nn.ModuleList( + [WhisperDecoderLayer(config, layer_idx) for layer_idx in range(config.decoder_layers)] + ) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" + + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + position_ids=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + cache_position=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder.] + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention + on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`EncoderDecoderCache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are + four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and + in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or + when `config.use_cache=True` + + Two formats are allowed: + - An [`~cache_utils.EncoderDecoderCache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of + shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing + `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more + control over how to convert `input_ids` indices into associated vectors than the model's internal + embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + assert encoder_attention_mask.shape[-1] == encoder_hidden_states.shape[1] + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + return_legacy_cache = False + return_self_attention_cache = False + if use_cache or past_key_values is not None: + if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache): + return_self_attention_cache = True + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + elif not isinstance(past_key_values, EncoderDecoderCache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_key_values_length = 0 + if cache_position is not None: + past_key_values_length = cache_position[0] + elif past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + input_shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # embed positions + if input_ids is not None: + positions = self.embed_positions( + input_ids, past_key_values_length=past_key_values_length, position_ids=position_ids + ) + else: + positions = self.embed_positions( + inputs_embeds, past_key_values_length=past_key_values_length, position_ids=position_ids + ) + + hidden_states = inputs_embeds + positions.to(inputs_embeds.device) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + past_key_values.self_attention_cache if past_key_values is not None else None, + output_attentions, + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..." + ) + use_cache = False + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + assert attn_mask.size()[0] == (len(self.layers)), ( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + encoder_hidden_states, + encoder_extended_attention_mask, # encoder attention mask + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, # past_key_value + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_values if use_cache else None, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = past_key_values if use_cache else None + if return_self_attention_cache: + next_cache = past_key_values.self_attention_cache + if return_legacy_cache: + next_cache = past_key_values.to_legacy_cache() + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + min_dtype=min_dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +@add_start_docstrings( + "The bare Whisper Model outputting raw hidden-states without any specific head on top.", + WHISPER_START_DOCSTRING, +) +class WhisperVQModel(WhisperPreTrainedModel): + def __init__(self, config: WhisperVQConfig): + super().__init__(config) + + self.encoder = WhisperVQEncoder(config) + self.decoder = WhisperVQDecoder(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.decoder.embed_tokens = value + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_encoder(self): + """ + Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will + not be updated during training. + """ + self.encoder._freeze_parameters() + + def _mask_input_features( + self, + input_features: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return input_features + + # generate indices & apply SpecAugment along time axis + batch_size, hidden_size, sequence_length = input_features.size() + + if self.config.mask_time_prob > 0 and self.training: + # generate indices & apply SpecAugment along time axis + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=input_features.device, dtype=torch.bool) + mask_time_indices = mask_time_indices[:, None].expand(-1, hidden_size, -1) + input_features[mask_time_indices] = 0 + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=input_features.device, dtype=torch.bool) + input_features[mask_feature_indices] = 0 + + return input_features + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + quantized_token_ids: Optional[torch.LongTensor] = None + ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, WhisperModel + >>> from datasets import load_dataset + + >>> model = WhisperVQModel.from_pretrained("openai/whisper-base") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + >>> input_features = inputs.input_features + >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id + >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state + >>> list(last_hidden_state.shape) + [1, 2, 512] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + input_features = self._mask_input_features(input_features, attention_mask=attention_mask) + + encoder_outputs = self.encoder( + input_features, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + quantized_token_ids=quantized_token_ids + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + attention_mask = attention_mask[:, ::self.encoder.conv1.stride[0] * self.encoder.conv2.stride[0]] + if self.encoder.config.pooling_kernel_size is not None: + attention_mask = attention_mask[:, ::self.encoder.config.pooling_kernel_size] + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_attention_mask=attention_mask, + encoder_hidden_states=encoder_outputs[0], + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The Whisper Model with a language modeling head. Can be used for automatic speech recognition.", + WHISPER_START_DOCSTRING, +) +class WhisperVQForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedModel): + base_model_prefix = "model" + _tied_weights_keys = ["proj_out.weight"] + + def __init__(self, config: WhisperVQConfig): + super().__init__(config) + self.model = WhisperVQModel(config) + self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.quantize_loss = None + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def get_output_embeddings(self): + return self.proj_out + + def set_output_embeddings(self, new_embeddings): + self.proj_out = new_embeddings + + def get_input_embeddings(self) -> nn.Module: + return self.model.get_input_embeddings() + + def freeze_encoder(self): + """ + Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will + not be updated during training. + """ + self.model.encoder._freeze_parameters() + + @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + quantized_token_ids: Optional[torch.LongTensor] = None + ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` + or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is + only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, WhisperForConditionalGeneration + >>> from datasets import load_dataset + + >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") + >>> model = WhisperVQForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") + >>> input_features = inputs.input_features + + >>> generated_ids = model.generate(inputs=input_features) + + >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> transcription + ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_features, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + decoder_inputs_embeds=decoder_inputs_embeds, + decoder_position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + quantized_token_ids=quantized_token_ids + ) + lm_logits = self.proj_out(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) + loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1)) + if self.training and self.model.encoder.quantize_loss is not None: + loss = loss + self.model.encoder.quantize_loss + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + use_cache=None, + encoder_outputs=None, + attention_mask=None, + decoder_attention_mask=None, + cache_position=None, + quantized_token_ids=None, + **kwargs, + ): + decoder_position_ids = None + if decoder_attention_mask is not None: + decoder_position_ids = (decoder_attention_mask.cumsum(-1) - 1).clamp(min=0) + + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if decoder_input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = decoder_input_ids.shape[1] - 1 + + decoder_input_ids = decoder_input_ids[:, remove_prefix_length:] + + if decoder_position_ids is not None: + decoder_position_ids = decoder_position_ids[:, remove_prefix_length:] + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + decoder_position_ids = decoder_position_ids.clone(memory_format=torch.contiguous_format) + + if cache_position is None: + cache_position = torch.arange( + past_length, past_length + decoder_input_ids.shape[1], device=decoder_input_ids.device + ) + elif use_cache: + cache_position = cache_position[-decoder_input_ids.shape[1]:] + + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + decoder_input_ids = decoder_input_ids.contiguous() + + if ( + isinstance(past_key_values, EncoderDecoderCache) + and ( + isinstance(past_key_values.self_attention_cache, StaticCache) + or isinstance(past_key_values.cross_attention_cache, StaticCache) + ) + and decoder_attention_mask is not None + and decoder_attention_mask.ndim == 2 + ): + batch_size, sequence_length = decoder_input_ids.shape + device = decoder_input_ids.device + + dtype = self.proj_out.weight.dtype + min_dtype = torch.finfo(dtype).min + + decoder_attention_mask = _prepare_4d_causal_attention_mask_with_cache_position( + decoder_attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.self_attention_cache.get_max_length(), + dtype=dtype, + device=device, + min_dtype=min_dtype, + cache_position=cache_position, + batch_size=batch_size, + ) + + return { + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "use_cache": use_cache, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "cache_position": cache_position, + "quantized_token_ids": quantized_token_ids + } + + def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs): + if self.config.skip_language_detection: + return torch.as_tensor([[generation_config.decoder_start_token_id] for _ in range(batch_size)], + dtype=torch.long, device=self.device).expand(batch_size, -1) + else: + return super()._retrieve_init_tokens(input_features, batch_size, generation_config, config, + num_segment_frames, kwargs) + + +class WhisperDecoderWrapper(WhisperPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + config.is_encoder_decoder = False + self.decoder = WhisperVQDecoder(config) + + def get_input_embeddings(self): + return self.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.decoder.embed_tokens = value + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +@add_start_docstrings( + """ + Whisper decoder with a language modeling head on top (linear layer with weights tied to the input embeddings). + """, + WHISPER_START_DOCSTRING, +) +class WhisperForCausalLM(WhisperPreTrainedModel): + _tied_weights_keys = ["proj_out.weight"] + main_input_name = "input_ids" + + def __init__(self, config): + super().__init__(config) + config.is_encoder_decoder = False + self.model = WhisperDecoderWrapper(config) + + self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.proj_out + + def set_output_embeddings(self, new_embeddings): + self.proj_out = new_embeddings + + def get_input_embeddings(self) -> nn.Module: + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + encoder_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains + pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If + `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache + in the correct position and to infer the complete sequence length. + + Returns: + + Example: + + ```python + >>> from transformers import WhisperForCausalLM, WhisperForConditionalGeneration, WhisperProcessor + >>> import torch + >>> from datasets import load_dataset + + >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") + >>> model = WhisperVQForConditionalGeneration.from_pretrained("openai/whisper-large-v2") + + >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> sample = ds[0]["audio"] + >>> input_features = processor( + ... sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" + ... ).input_features + + >>> predicted_ids = model.generate(input_features, assistant_model=assistant_model) + + >>> # decode token ids to text + >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + >>> transcription + ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # If the user passed a tuple or `BaseModelOutput` for encoder_outputs, we extract only the hidden states + if isinstance(encoder_outputs, (BaseModelOutput, tuple, list)): + encoder_outputs = encoder_outputs[0] + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_outputs, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + logits = self.proj_out(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + use_cache=None, + encoder_outputs=None, + attention_mask=None, + cache_position=None, + **kwargs, + ): + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, (Cache, EncoderDecoderCache)): + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + else: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_ids.shape[1], device=input_ids.device) + elif use_cache: + cache_position = cache_position[-input_ids.shape[1]:] + + return { + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "input_ids": input_ids, + "use_cache": use_cache, + "attention_mask": attention_mask, + "cache_position": cache_position, + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + Whisper Encoder Model with a sequence classification head on top (a linear layer over the pooled output) for tasks + like SUPERB Keyword Spotting. + """, + WHISPER_ENCODER_INPUTS_DOCSTRING, +) +class WhisperForAudioClassification(WhisperPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.encoder = WhisperVQEncoder(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_encoder(self): + """ + Calling this function will disable the gradient computation for the Whisper encoder so that its parameters will + not be updated during training. Only the projection layers and classification head will be updated. + """ + self.encoder._freeze_parameters() + + def get_input_embeddings(self) -> nn.Module: + return self.encoder.get_input_embeddings() + + def set_input_embeddings(self, value: nn.Module): + self.encoder.set_input_embeddings(value) + + @add_start_docstrings_to_model_forward(WHISPER_ENCODER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_features: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, WhisperForAudioClassification + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + >>> model = WhisperForAudioClassification.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id") + + >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True) + >>> sample = next(iter(ds)) + + >>> inputs = feature_extractor( + ... sample["audio"]["array"], sampling_rate=sample["audio"]["sampling_rate"], return_tensors="pt" + ... ) + >>> input_features = inputs.input_features + + >>> with torch.no_grad(): + ... logits = model(input_features).logits + + >>> predicted_class_ids = torch.argmax(logits).item() + >>> predicted_label = model.config.id2label[predicted_class_ids] + >>> predicted_label + 'Afrikaans' + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + if self.config.use_weighted_layer_sum: + output_hidden_states = True + elif output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = encoder_outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = encoder_outputs[0] + + hidden_states = self.projector(hidden_states) + pooled_output = hidden_states.mean(dim=1) + + logits = self.classifier(pooled_output) + + loss = None + + if labels is not None: + loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(logits.device) + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + encoder_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git a/speech_tokenizer/utils.py b/speech_tokenizer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c155274625173827e59fcd3fc9568fe18f3eb148 --- /dev/null +++ b/speech_tokenizer/utils.py @@ -0,0 +1,84 @@ +import os +import io +import glob +import math +import tarfile +import torch +import torchaudio +import safetensors +from .configuration_whisper import WhisperVQConfig +from .modeling_whisper import WhisperVQEncoder, WhisperVQForConditionalGeneration +from transformers import WhisperFeatureExtractor, WhisperTokenizerFast + + +def load_quantize_encoder(model_path): + config = WhisperVQConfig.from_pretrained(model_path) + config.quantize_encoder_only = True + model = WhisperVQEncoder(config) + state_dict = {} + for path in glob.glob(os.path.join(model_path, "model*.safetensors")): + with safetensors.safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + if key.startswith("model.encoder."): + new_key = key[len("model.encoder."):] + if new_key.startswith("layer_norm"): + continue + if new_key.startswith("layers"): + layer_id = int(new_key.split(".")[1]) + if layer_id >= config.quantize_position: + continue + state_dict[new_key] = f.get_tensor(key) + model.load_state_dict(state_dict) + model.eval() + model.cuda() + return model + + +_resample_buffer: dict[int, torchaudio.transforms.Resample] = {} + + +def extract_speech_token(model: WhisperVQEncoder, feature_extractor: WhisperFeatureExtractor, utts): + with torch.no_grad(): + audios, indices = [], [] + for idx, utt in enumerate(utts): + if isinstance(utt, tuple): + audio, sample_rate = utt + else: + audio, sample_rate = torchaudio.load(utt) + audio = audio.cuda() + if sample_rate != 16000: + if sample_rate not in _resample_buffer: + _resample_buffer[sample_rate] = torchaudio.transforms.Resample( + orig_freq=sample_rate, + new_freq=16000 + ).to('cuda') + audio = _resample_buffer[sample_rate](audio) + # if audio.shape[0] > 1: + # audio = audio[:1] + audio = audio[0] + audio = audio.cpu().numpy() + time_step = 0 + while time_step * 16000 < audio.shape[0]: + audio_segment = audio[time_step * 16000: (time_step + 30) * 16000] + audios.append(audio_segment) + indices.append(idx) + time_step += 30 + pooling_kernel_size = model.config.pooling_kernel_size or 1 + stride = model.conv1.stride[0] * model.conv2.stride[0] * pooling_kernel_size * feature_extractor.hop_length + all_speech_tokens = [[] for _ in range(len(utts))] + batch_size = 128 + for start in range(0, len(audios), batch_size): + features = feature_extractor(audios[start: start + batch_size], sampling_rate=16000, + return_attention_mask=True, return_tensors="pt", device='cuda', + padding="longest", pad_to_multiple_of=stride) + features = features.to(device="cuda") + outputs = model(**features) + speech_tokens = outputs.quantized_token_ids + attention_mask = features.attention_mask[:, ::model.conv1.stride[0] * model.conv2.stride[0]] + attention_mask = attention_mask[:, ::model.config.pooling_kernel_size] + assert attention_mask.shape == speech_tokens.shape + for i in range(len(speech_tokens)): + idx = indices[start + i] + speech_token = speech_tokens[i][attention_mask[i].bool()].tolist() + all_speech_tokens[idx].extend(speech_token) + return all_speech_tokens diff --git a/third_party/Matcha-TTS/.env.example b/third_party/Matcha-TTS/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..a790e320464ebc778ca07f5bcd826a9c8412ed0e --- /dev/null +++ b/third_party/Matcha-TTS/.env.example @@ -0,0 +1,6 @@ +# example of file for storing private and user specific environment variables, like keys or system paths +# rename it to ".env" (excluded from version control by default) +# .env is loaded by train.py automatically +# hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} + +MY_VAR="/home/user/my/system/path" diff --git a/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md b/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..410bcd87a45297ab8f0d369574a032858b6b1811 --- /dev/null +++ b/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,22 @@ +## What does this PR do? + + + +Fixes #\ + +## Before submitting + +- [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? +- [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? +- [ ] Did you list all the **breaking changes** introduced by this pull request? +- [ ] Did you **test your PR locally** with `pytest` command? +- [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? + +## Did you have fun? + +Make sure you had fun coding 🙃 diff --git a/third_party/Matcha-TTS/.github/codecov.yml b/third_party/Matcha-TTS/.github/codecov.yml new file mode 100644 index 0000000000000000000000000000000000000000..c66853c4bd9991f730da5dda7dc9881986779558 --- /dev/null +++ b/third_party/Matcha-TTS/.github/codecov.yml @@ -0,0 +1,15 @@ +coverage: + status: + # measures overall project coverage + project: + default: + threshold: 100% # how much decrease in coverage is needed to not consider success + + # measures PR or single commit coverage + patch: + default: + threshold: 100% # how much decrease in coverage is needed to not consider success + + + # project: off + # patch: off diff --git a/third_party/Matcha-TTS/.github/dependabot.yml b/third_party/Matcha-TTS/.github/dependabot.yml new file mode 100644 index 0000000000000000000000000000000000000000..b19ccab12a3c573025ce6ba6d9068b062b29cc1b --- /dev/null +++ b/third_party/Matcha-TTS/.github/dependabot.yml @@ -0,0 +1,17 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + target-branch: "dev" + schedule: + interval: "daily" + ignore: + - dependency-name: "pytorch-lightning" + update-types: ["version-update:semver-patch"] + - dependency-name: "torchmetrics" + update-types: ["version-update:semver-patch"] diff --git a/third_party/Matcha-TTS/.github/release-drafter.yml b/third_party/Matcha-TTS/.github/release-drafter.yml new file mode 100644 index 0000000000000000000000000000000000000000..59af159f671abe75311eb626c8ec92ca6ea09d3c --- /dev/null +++ b/third_party/Matcha-TTS/.github/release-drafter.yml @@ -0,0 +1,44 @@ +name-template: "v$RESOLVED_VERSION" +tag-template: "v$RESOLVED_VERSION" + +categories: + - title: "🚀 Features" + labels: + - "feature" + - "enhancement" + - title: "🐛 Bug Fixes" + labels: + - "fix" + - "bugfix" + - "bug" + - title: "🧹 Maintenance" + labels: + - "maintenance" + - "dependencies" + - "refactoring" + - "cosmetic" + - "chore" + - title: "📝️ Documentation" + labels: + - "documentation" + - "docs" + +change-template: "- $TITLE @$AUTHOR (#$NUMBER)" +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions + +version-resolver: + major: + labels: + - "major" + minor: + labels: + - "minor" + patch: + labels: + - "patch" + default: patch + +template: | + ## Changes + + $CHANGES diff --git a/third_party/Matcha-TTS/.gitignore b/third_party/Matcha-TTS/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..cbec8b43a0414bbbf4cc9feae49b9dc091a60c92 --- /dev/null +++ b/third_party/Matcha-TTS/.gitignore @@ -0,0 +1,163 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +### VisualStudioCode +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace +**/.vscode + +# JetBrains +.idea/ + +# Data & Models +*.h5 +*.tar +*.tar.gz + +# Lightning-Hydra-Template +configs/local/default.yaml +/data/ +/logs/ +.env + +# Aim logging +.aim + +# Cython complied files +matcha/utils/monotonic_align/core.c + +# Ignoring hifigan checkpoint +generator_v1 +g_02500000 +gradio_cached_examples/ +synth_output/ diff --git a/third_party/Matcha-TTS/.pre-commit-config.yaml b/third_party/Matcha-TTS/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e695f115eba12d84fe6f465c5d834dfa35c3d2ec --- /dev/null +++ b/third_party/Matcha-TTS/.pre-commit-config.yaml @@ -0,0 +1,59 @@ +default_language_version: + python: python3.10 + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + # list of supported hooks: https://pre-commit.com/hooks.html + - id: trailing-whitespace + - id: end-of-file-fixer + # - id: check-docstring-first + - id: check-yaml + - id: debug-statements + - id: detect-private-key + - id: check-toml + - id: check-case-conflict + - id: check-added-large-files + + # python code formatting + - repo: https://github.com/psf/black + rev: 23.12.1 + hooks: + - id: black + args: [--line-length, "120"] + + # python import sorting + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] + + # python upgrading syntax to newer version + - repo: https://github.com/asottile/pyupgrade + rev: v3.15.0 + hooks: + - id: pyupgrade + args: [--py38-plus] + + # python check (PEP8), programming errors and code complexity + - repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + args: + [ + "--max-line-length", "120", + "--extend-ignore", + "E203,E402,E501,F401,F841,RST2,RST301", + "--exclude", + "logs/*,data/*,matcha/hifigan/*", + ] + additional_dependencies: [flake8-rst-docstrings==0.3.0] + + # pylint + - repo: https://github.com/pycqa/pylint + rev: v3.0.3 + hooks: + - id: pylint diff --git a/third_party/Matcha-TTS/.project-root b/third_party/Matcha-TTS/.project-root new file mode 100644 index 0000000000000000000000000000000000000000..63eab774b9e36aa1a46cbd31b59cbd373bc5477f --- /dev/null +++ b/third_party/Matcha-TTS/.project-root @@ -0,0 +1,2 @@ +# this file is required for inferring the project root directory +# do not delete diff --git a/third_party/Matcha-TTS/.pylintrc b/third_party/Matcha-TTS/.pylintrc new file mode 100644 index 0000000000000000000000000000000000000000..962864189eab99a66b315b80f5a9976e7a423d4a --- /dev/null +++ b/third_party/Matcha-TTS/.pylintrc @@ -0,0 +1,525 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-whitelist= + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=CVS + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Specify a configuration file. +#rcfile= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=missing-docstring, + too-many-public-methods, + too-many-lines, + bare-except, + ## for avoiding weird p3.6 CI linter error + ## TODO: see later if we can remove this + assigning-non-slot, + unsupported-assignment-operation, + ## end + line-too-long, + fixme, + wrong-import-order, + ungrouped-imports, + wrong-import-position, + import-error, + invalid-name, + too-many-instance-attributes, + arguments-differ, + arguments-renamed, + no-name-in-module, + no-member, + unsubscriptable-object, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + useless-object-inheritance, + too-few-public-methods, + too-many-branches, + too-many-arguments, + too-many-locals, + too-many-statements, + duplicate-code, + not-callable, + import-outside-toplevel, + logging-fstring-interpolation, + logging-not-lazy, + unused-argument, + no-else-return, + chained-comparison, + redefined-outer-name + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[LOGGING] + +# Format style used to check logging format string. `old` means using % +# formatting, while `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package.. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members=numpy.*,torch.* + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +argument-rgx=[a-z_][a-z0-9_]{0,30}$ + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + x, + ex, + Run, + _ + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +variable-rgx=[a-z_][a-z0-9_]{0,30}$ + + +[STRING] + +# This flag controls whether the implicit-str-concat-in-sequence should +# generate a warning on implicit string concatenation in sequences defined over +# several lines. +check-str-concat-over-line-jumps=no + + +[IMPORTS] + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement. +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=15 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". +overgeneral-exceptions=builtins.BaseException, + builtins.Exception diff --git a/third_party/Matcha-TTS/LICENSE b/third_party/Matcha-TTS/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..858018e750da7be7b271bb7307e68d159ed67ef6 --- /dev/null +++ b/third_party/Matcha-TTS/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Shivam Mehta + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/Matcha-TTS/MANIFEST.in b/third_party/Matcha-TTS/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..c013140cdfb9de19c4d4e73c73a44e33f33fa871 --- /dev/null +++ b/third_party/Matcha-TTS/MANIFEST.in @@ -0,0 +1,14 @@ +include README.md +include LICENSE.txt +include requirements.*.txt +include *.cff +include requirements.txt +include matcha/VERSION +recursive-include matcha *.json +recursive-include matcha *.html +recursive-include matcha *.png +recursive-include matcha *.md +recursive-include matcha *.py +recursive-include matcha *.pyx +recursive-exclude tests * +prune tests* diff --git a/third_party/Matcha-TTS/Makefile b/third_party/Matcha-TTS/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4b523dd17b13a19617c9cc9d9dad7f7d8d4c24a0 --- /dev/null +++ b/third_party/Matcha-TTS/Makefile @@ -0,0 +1,42 @@ + +help: ## Show help + @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' + +clean: ## Clean autogenerated files + rm -rf dist + find . -type f -name "*.DS_Store" -ls -delete + find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf + find . | grep -E ".pytest_cache" | xargs rm -rf + find . | grep -E ".ipynb_checkpoints" | xargs rm -rf + rm -f .coverage + +clean-logs: ## Clean logs + rm -rf logs/** + +create-package: ## Create wheel and tar gz + rm -rf dist/ + python setup.py bdist_wheel --plat-name=manylinux1_x86_64 + python setup.py sdist + python -m twine upload dist/* --verbose --skip-existing + +format: ## Run pre-commit hooks + pre-commit run -a + +sync: ## Merge changes from main branch to your current branch + git pull + git pull origin main + +test: ## Run not slow tests + pytest -k "not slow" + +test-full: ## Run all tests + pytest + +train-ljspeech: ## Train the model + python matcha/train.py experiment=ljspeech + +train-ljspeech-min: ## Train the model with minimum memory + python matcha/train.py experiment=ljspeech_min_memory + +start_app: ## Start the app + python matcha/app.py diff --git a/third_party/Matcha-TTS/README.md b/third_party/Matcha-TTS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ebc6b7c0a76d30c33bf95583d629825c02183e31 --- /dev/null +++ b/third_party/Matcha-TTS/README.md @@ -0,0 +1,278 @@ +
+ +# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching + +### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/) + +[![python](https://img.shields.io/badge/-Python_3.10-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3100/) +[![pytorch](https://img.shields.io/badge/PyTorch_2.0+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally/) +[![lightning](https://img.shields.io/badge/-Lightning_2.0+-792ee5?logo=pytorchlightning&logoColor=white)](https://pytorchlightning.ai/) +[![hydra](https://img.shields.io/badge/Config-Hydra_1.3-89b8cd)](https://hydra.cc/) +[![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)](https://black.readthedocs.io/en/stable/) +[![isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) + +

+ +

+ +
+ +> This is the official code implementation of 🍵 Matcha-TTS [ICASSP 2024]. + +We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses [conditional flow matching](https://arxiv.org/abs/2210.02747) (similar to [rectified flows](https://arxiv.org/abs/2209.03003)) to speed up ODE-based speech synthesis. Our method: + +- Is probabilistic +- Has compact memory footprint +- Sounds highly natural +- Is very fast to synthesise from + +Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our ICASSP 2024 paper](https://arxiv.org/abs/2309.03199) for more details. + +[Pre-trained models](https://drive.google.com/drive/folders/17C_gYgEHOxI5ZypcfE_k1piKCtyR0isJ?usp=sharing) will be automatically downloaded with the CLI or gradio interface. + +You can also [try 🍵 Matcha-TTS in your browser on HuggingFace 🤗 spaces](https://huggingface.co/spaces/shivammehta25/Matcha-TTS). + +## Teaser video + +[![Watch the video](https://img.youtube.com/vi/xmvJkz3bqw0/hqdefault.jpg)](https://youtu.be/xmvJkz3bqw0) + +## Installation + +1. Create an environment (suggested but optional) + +``` +conda create -n matcha-tts python=3.10 -y +conda activate matcha-tts +``` + +2. Install Matcha TTS using pip or from source + +```bash +pip install matcha-tts +``` + +from source + +```bash +pip install git+https://github.com/shivammehta25/Matcha-TTS.git +cd Matcha-TTS +pip install -e . +``` + +3. Run CLI / gradio app / jupyter notebook + +```bash +# This will download the required models +matcha-tts --text "" +``` + +or + +```bash +matcha-tts-app +``` + +or open `synthesis.ipynb` on jupyter notebook + +### CLI Arguments + +- To synthesise from given text, run: + +```bash +matcha-tts --text "" +``` + +- To synthesise from a file, run: + +```bash +matcha-tts --file +``` + +- To batch synthesise from a file, run: + +```bash +matcha-tts --file --batched +``` + +Additional arguments + +- Speaking rate + +```bash +matcha-tts --text "" --speaking_rate 1.0 +``` + +- Sampling temperature + +```bash +matcha-tts --text "" --temperature 0.667 +``` + +- Euler ODE solver steps + +```bash +matcha-tts --text "" --steps 10 +``` + +## Train with your own dataset + +Let's assume we are training with LJ Speech + +1. Download the dataset from [here](https://keithito.com/LJ-Speech-Dataset/), extract it to `data/LJSpeech-1.1`, and prepare the file lists to point to the extracted data like for [item 5 in the setup of the NVIDIA Tacotron 2 repo](https://github.com/NVIDIA/tacotron2#setup). + +2. Clone and enter the Matcha-TTS repository + +```bash +git clone https://github.com/shivammehta25/Matcha-TTS.git +cd Matcha-TTS +``` + +3. Install the package from source + +```bash +pip install -e . +``` + +4. Go to `configs/data/ljspeech.yaml` and change + +```yaml +train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt +valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt +``` + +5. Generate normalisation statistics with the yaml file of dataset configuration + +```bash +matcha-data-stats -i ljspeech.yaml +# Output: +#{'mel_mean': -5.53662231756592, 'mel_std': 2.1161014277038574} +``` + +Update these values in `configs/data/ljspeech.yaml` under `data_statistics` key. + +```bash +data_statistics: # Computed for ljspeech dataset + mel_mean: -5.536622 + mel_std: 2.116101 +``` + +to the paths of your train and validation filelists. + +6. Run the training script + +```bash +make train-ljspeech +``` + +or + +```bash +python matcha/train.py experiment=ljspeech +``` + +- for a minimum memory run + +```bash +python matcha/train.py experiment=ljspeech_min_memory +``` + +- for multi-gpu training, run + +```bash +python matcha/train.py experiment=ljspeech trainer.devices=[0,1] +``` + +7. Synthesise from the custom trained model + +```bash +matcha-tts --text "" --checkpoint_path +``` + +## ONNX support + +> Special thanks to [@mush42](https://github.com/mush42) for implementing ONNX export and inference support. + +It is possible to export Matcha checkpoints to [ONNX](https://onnx.ai/), and run inference on the exported ONNX graph. + +### ONNX export + +To export a checkpoint to ONNX, first install ONNX with + +```bash +pip install onnx +``` + +then run the following: + +```bash +python3 -m matcha.onnx.export matcha.ckpt model.onnx --n-timesteps 5 +``` + +Optionally, the ONNX exporter accepts **vocoder-name** and **vocoder-checkpoint** arguments. This enables you to embed the vocoder in the exported graph and generate waveforms in a single run (similar to end-to-end TTS systems). + +**Note** that `n_timesteps` is treated as a hyper-parameter rather than a model input. This means you should specify it during export (not during inference). If not specified, `n_timesteps` is set to **5**. + +**Important**: for now, torch>=2.1.0 is needed for export since the `scaled_product_attention` operator is not exportable in older versions. Until the final version is released, those who want to export their models must install torch>=2.1.0 manually as a pre-release. + +### ONNX Inference + +To run inference on the exported model, first install `onnxruntime` using + +```bash +pip install onnxruntime +pip install onnxruntime-gpu # for GPU inference +``` + +then use the following: + +```bash +python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs +``` + +You can also control synthesis parameters: + +```bash +python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --temperature 0.4 --speaking_rate 0.9 --spk 0 +``` + +To run inference on **GPU**, make sure to install **onnxruntime-gpu** package, and then pass `--gpu` to the inference command: + +```bash +python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --gpu +``` + +If you exported only Matcha to ONNX, this will write mel-spectrogram as graphs and `numpy` arrays to the output directory. +If you embedded the vocoder in the exported graph, this will write `.wav` audio files to the output directory. + +If you exported only Matcha to ONNX, and you want to run a full TTS pipeline, you can pass a path to a vocoder model in `ONNX` format: + +```bash +python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --vocoder hifigan.small.onnx +``` + +This will write `.wav` audio files to the output directory. + +## Citation information + +If you use our code or otherwise find this work useful, please cite our paper: + +```text +@inproceedings{mehta2024matcha, + title={Matcha-{TTS}: A fast {TTS} architecture with conditional flow matching}, + author={Mehta, Shivam and Tu, Ruibo and Beskow, Jonas and Sz{\'e}kely, {\'E}va and Henter, Gustav Eje}, + booktitle={Proc. ICASSP}, + year={2024} +} +``` + +## Acknowledgements + +Since this code uses [Lightning-Hydra-Template](https://github.com/ashleve/lightning-hydra-template), you have all the powers that come with it. + +Other source code we would like to acknowledge: + +- [Coqui-TTS](https://github.com/coqui-ai/TTS/tree/dev): For helping me figure out how to make cython binaries pip installable and encouragement +- [Hugging Face Diffusers](https://huggingface.co/): For their awesome diffusers library and its components +- [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS): For the monotonic alignment search source code +- [torchdyn](https://github.com/DiffEqML/torchdyn): Useful for trying other ODE solvers during research and development +- [labml.ai](https://nn.labml.ai/transformers/rope/index.html): For the RoPE implementation diff --git a/third_party/Matcha-TTS/configs/__init__.py b/third_party/Matcha-TTS/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..56bf7f4aa4906bc0f997132708cc0826c198e4aa --- /dev/null +++ b/third_party/Matcha-TTS/configs/__init__.py @@ -0,0 +1 @@ +# this file is needed here to include configs when building project as a package diff --git a/third_party/Matcha-TTS/configs/callbacks/default.yaml b/third_party/Matcha-TTS/configs/callbacks/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebaa3ed31a7f626bc62f90184dc4b25b631e52a9 --- /dev/null +++ b/third_party/Matcha-TTS/configs/callbacks/default.yaml @@ -0,0 +1,5 @@ +defaults: + - model_checkpoint.yaml + - model_summary.yaml + - rich_progress_bar.yaml + - _self_ diff --git a/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml b/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d085c711a8521b6b98ad6401b686bb601ceacd6 --- /dev/null +++ b/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml @@ -0,0 +1,17 @@ +# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html + +model_checkpoint: + _target_: lightning.pytorch.callbacks.ModelCheckpoint + dirpath: ${paths.output_dir}/checkpoints # directory to save the model file + filename: checkpoint_{epoch:03d} # checkpoint filename + monitor: epoch # name of the logged metric which determines when model is improving + verbose: False # verbosity mode + save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt + save_top_k: 10 # save k best models (determined by above metric) + mode: "max" # "max" means higher metric value is better, can be also "min" + auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name + save_weights_only: False # if True, then only the model’s weights will be saved + every_n_train_steps: null # number of training steps between checkpoints + train_time_interval: null # checkpoints are monitored at the specified time interval + every_n_epochs: 100 # number of epochs between checkpoints + save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation diff --git a/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml b/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e5368d0e94298cce6d5421365b4583bd763ba92 --- /dev/null +++ b/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml @@ -0,0 +1,5 @@ +# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html + +model_summary: + _target_: lightning.pytorch.callbacks.RichModelSummary + max_depth: 3 # the maximum depth of layer nesting that the summary will include diff --git a/third_party/Matcha-TTS/configs/callbacks/none.yaml b/third_party/Matcha-TTS/configs/callbacks/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml b/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de6f1ccb11205a4db93645fb6f297e50205de172 --- /dev/null +++ b/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml @@ -0,0 +1,4 @@ +# https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html + +rich_progress_bar: + _target_: lightning.pytorch.callbacks.RichProgressBar diff --git a/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml b/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1269f9b3b421d27a204bb0697e2f27a0fa0864a3 --- /dev/null +++ b/third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml @@ -0,0 +1,14 @@ +defaults: + - ljspeech + - _self_ + +# Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/ +_target_: matcha.data.text_mel_datamodule.TextMelDataModule +name: hi-fi_en-US_female +train_filelist_path: data/filelists/hi-fi-captain-en-us-female_train.txt +valid_filelist_path: data/filelists/hi-fi-captain-en-us-female_val.txt +batch_size: 32 +cleaners: [english_cleaners_piper] +data_statistics: # Computed for this dataset + mel_mean: -6.38385 + mel_std: 2.541796 diff --git a/third_party/Matcha-TTS/configs/data/ljspeech.yaml b/third_party/Matcha-TTS/configs/data/ljspeech.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f251420c3caadd9b27a0b01ad28ee37b5eec1440 --- /dev/null +++ b/third_party/Matcha-TTS/configs/data/ljspeech.yaml @@ -0,0 +1,21 @@ +_target_: matcha.data.text_mel_datamodule.TextMelDataModule +name: ljspeech +train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt +valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt +batch_size: 32 +num_workers: 20 +pin_memory: True +cleaners: [english_cleaners2] +add_blank: True +n_spks: 1 +n_fft: 1024 +n_feats: 80 +sample_rate: 22050 +hop_length: 256 +win_length: 1024 +f_min: 0 +f_max: 8000 +data_statistics: # Computed for ljspeech dataset + mel_mean: -5.536622 + mel_std: 2.116101 +seed: ${seed} diff --git a/third_party/Matcha-TTS/configs/data/vctk.yaml b/third_party/Matcha-TTS/configs/data/vctk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba11cc63371ad6308d6711513268de7efe50eed9 --- /dev/null +++ b/third_party/Matcha-TTS/configs/data/vctk.yaml @@ -0,0 +1,14 @@ +defaults: + - ljspeech + - _self_ + +_target_: matcha.data.text_mel_datamodule.TextMelDataModule +name: vctk +train_filelist_path: data/filelists/vctk_audio_sid_text_train_filelist.txt +valid_filelist_path: data/filelists/vctk_audio_sid_text_val_filelist.txt +batch_size: 32 +add_blank: True +n_spks: 109 +data_statistics: # Computed for vctk dataset + mel_mean: -6.630575 + mel_std: 2.482914 diff --git a/third_party/Matcha-TTS/configs/debug/default.yaml b/third_party/Matcha-TTS/configs/debug/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3932c82585fbe44047c1569a5cfe9ee9895c71a --- /dev/null +++ b/third_party/Matcha-TTS/configs/debug/default.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +# default debugging setup, runs 1 full epoch +# other debugging configs can inherit from this one + +# overwrite task name so debugging logs are stored in separate folder +task_name: "debug" + +# disable callbacks and loggers during debugging +# callbacks: null +# logger: null + +extras: + ignore_warnings: False + enforce_tags: False + +# sets level of all command line loggers to 'DEBUG' +# https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ +hydra: + job_logging: + root: + level: DEBUG + + # use this to also set hydra loggers to 'DEBUG' + # verbose: True + +trainer: + max_epochs: 1 + accelerator: cpu # debuggers don't like gpus + devices: 1 # debuggers don't like multiprocessing + detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor + +data: + num_workers: 0 # debuggers don't like multiprocessing + pin_memory: False # disable gpu memory pin diff --git a/third_party/Matcha-TTS/configs/debug/fdr.yaml b/third_party/Matcha-TTS/configs/debug/fdr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f2d34fa37c31017e749d5a4fc5ae6763e688b46 --- /dev/null +++ b/third_party/Matcha-TTS/configs/debug/fdr.yaml @@ -0,0 +1,9 @@ +# @package _global_ + +# runs 1 train, 1 validation and 1 test step + +defaults: + - default + +trainer: + fast_dev_run: true diff --git a/third_party/Matcha-TTS/configs/debug/limit.yaml b/third_party/Matcha-TTS/configs/debug/limit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..514d77fbd1475b03fff0372e3da3c2fa7ea7d190 --- /dev/null +++ b/third_party/Matcha-TTS/configs/debug/limit.yaml @@ -0,0 +1,12 @@ +# @package _global_ + +# uses only 1% of the training data and 5% of validation/test data + +defaults: + - default + +trainer: + max_epochs: 3 + limit_train_batches: 0.01 + limit_val_batches: 0.05 + limit_test_batches: 0.05 diff --git a/third_party/Matcha-TTS/configs/debug/overfit.yaml b/third_party/Matcha-TTS/configs/debug/overfit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9906586a67a12aa81ff69138f589a366dbe2222f --- /dev/null +++ b/third_party/Matcha-TTS/configs/debug/overfit.yaml @@ -0,0 +1,13 @@ +# @package _global_ + +# overfits to 3 batches + +defaults: + - default + +trainer: + max_epochs: 20 + overfit_batches: 3 + +# model ckpt and early stopping need to be disabled during overfitting +callbacks: null diff --git a/third_party/Matcha-TTS/configs/debug/profiler.yaml b/third_party/Matcha-TTS/configs/debug/profiler.yaml new file mode 100644 index 0000000000000000000000000000000000000000..266295f15e0166e1d1b58b88caa7673f4b6493b5 --- /dev/null +++ b/third_party/Matcha-TTS/configs/debug/profiler.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +# runs with execution time profiling + +defaults: + - default + +trainer: + max_epochs: 1 + # profiler: "simple" + profiler: "advanced" + # profiler: "pytorch" + accelerator: gpu + + limit_train_batches: 0.02 diff --git a/third_party/Matcha-TTS/configs/eval.yaml b/third_party/Matcha-TTS/configs/eval.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be312992b2a486b04d83a54dbd8f670d94979709 --- /dev/null +++ b/third_party/Matcha-TTS/configs/eval.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - _self_ + - data: mnist # choose datamodule with `test_dataloader()` for evaluation + - model: mnist + - logger: null + - trainer: default + - paths: default + - extras: default + - hydra: default + +task_name: "eval" + +tags: ["dev"] + +# passing checkpoint path is necessary for evaluation +ckpt_path: ??? diff --git a/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml b/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e6c57a0d0a399f7463f4ff2d96e1928c435779b --- /dev/null +++ b/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml @@ -0,0 +1,14 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: hi-fi_en-US_female.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] + +run_name: hi-fi_en-US_female_piper_phonemizer diff --git a/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml b/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5723f42cf3552226c42bd91202cc18818b685f0 --- /dev/null +++ b/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml @@ -0,0 +1,14 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: ljspeech.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["ljspeech"] + +run_name: ljspeech diff --git a/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml b/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef554dc633c392b1592d90d9d7734f2329264fdd --- /dev/null +++ b/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: ljspeech.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["ljspeech"] + +run_name: ljspeech_min + + +model: + out_size: 172 diff --git a/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml b/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml new file mode 100644 index 0000000000000000000000000000000000000000..553842f4e2168db0fee4e44db11b5d086295b044 --- /dev/null +++ b/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml @@ -0,0 +1,14 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=multispeaker + +defaults: + - override /data: vctk.yaml + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["multispeaker"] + +run_name: multispeaker diff --git a/third_party/Matcha-TTS/configs/extras/default.yaml b/third_party/Matcha-TTS/configs/extras/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9c6b622283a647fbc513166fc14f016cc3ed8a0 --- /dev/null +++ b/third_party/Matcha-TTS/configs/extras/default.yaml @@ -0,0 +1,8 @@ +# disable python warnings if they annoy you +ignore_warnings: False + +# ask user for tags if none are provided in the config +enforce_tags: True + +# pretty print config tree at the start of the run using Rich library +print_config: True diff --git a/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml b/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1391183ebcdec3d8f5eb61374e0719d13c7545da --- /dev/null +++ b/third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml @@ -0,0 +1,52 @@ +# @package _global_ + +# example hyperparameter optimization of some experiment with Optuna: +# python train.py -m hparams_search=mnist_optuna experiment=example + +defaults: + - override /hydra/sweeper: optuna + +# choose metric which will be optimized by Optuna +# make sure this is the correct name of some metric logged in lightning module! +optimized_metric: "val/acc_best" + +# here we define Optuna hyperparameter search +# it optimizes for value returned from function with @hydra.main decorator +# docs: https://hydra.cc/docs/next/plugins/optuna_sweeper +hydra: + mode: "MULTIRUN" # set hydra to multirun by default if this config is attached + + sweeper: + _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper + + # storage URL to persist optimization results + # for example, you can use SQLite if you set 'sqlite:///example.db' + storage: null + + # name of the study to persist optimization results + study_name: null + + # number of parallel workers + n_jobs: 1 + + # 'minimize' or 'maximize' the objective + direction: maximize + + # total number of runs that will be executed + n_trials: 20 + + # choose Optuna hyperparameter sampler + # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others + # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html + sampler: + _target_: optuna.samplers.TPESampler + seed: 1234 + n_startup_trials: 10 # number of random sampling runs before optimization starts + + # define hyperparameter search space + params: + model.optimizer.lr: interval(0.0001, 0.1) + data.batch_size: choice(32, 64, 128, 256) + model.net.lin1_size: choice(64, 128, 256) + model.net.lin2_size: choice(64, 128, 256) + model.net.lin3_size: choice(32, 64, 128, 256) diff --git a/third_party/Matcha-TTS/configs/hydra/default.yaml b/third_party/Matcha-TTS/configs/hydra/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1533136b22802a4f81e5387b74e407289edce94d --- /dev/null +++ b/third_party/Matcha-TTS/configs/hydra/default.yaml @@ -0,0 +1,19 @@ +# https://hydra.cc/docs/configure_hydra/intro/ + +# enable color logging +defaults: + - override hydra_logging: colorlog + - override job_logging: colorlog + +# output directory, generated dynamically on each run +run: + dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} +sweep: + dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} + subdir: ${hydra.job.num} + +job_logging: + handlers: + file: + # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log diff --git a/third_party/Matcha-TTS/configs/local/.gitkeep b/third_party/Matcha-TTS/configs/local/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/configs/logger/aim.yaml b/third_party/Matcha-TTS/configs/logger/aim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f9f6adad7feb2780c2efd5ddb0ed053621e05f8 --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/aim.yaml @@ -0,0 +1,28 @@ +# https://aimstack.io/ + +# example usage in lightning module: +# https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py + +# open the Aim UI with the following command (run in the folder containing the `.aim` folder): +# `aim up` + +aim: + _target_: aim.pytorch_lightning.AimLogger + repo: ${paths.root_dir} # .aim folder will be created here + # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# + + # aim allows to group runs under experiment name + experiment: null # any string, set to "default" if not specified + + train_metric_prefix: "train/" + val_metric_prefix: "val/" + test_metric_prefix: "test/" + + # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) + system_tracking_interval: 10 # set to null to disable system metrics tracking + + # enable/disable logging of system params such as installed packages, git info, env vars, etc. + log_system_params: true + + # enable/disable tracking console logs (default value is true) + capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 diff --git a/third_party/Matcha-TTS/configs/logger/comet.yaml b/third_party/Matcha-TTS/configs/logger/comet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0789274e2137ee6c97ca37a5d56c2b8abaf0aaa --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/comet.yaml @@ -0,0 +1,12 @@ +# https://www.comet.ml + +comet: + _target_: lightning.pytorch.loggers.comet.CometLogger + api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable + save_dir: "${paths.output_dir}" + project_name: "lightning-hydra-template" + rest_api_key: null + # experiment_name: "" + experiment_key: null # set to resume experiment + offline: False + prefix: "" diff --git a/third_party/Matcha-TTS/configs/logger/csv.yaml b/third_party/Matcha-TTS/configs/logger/csv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa028e9c146430c319101ffdfce466514338591c --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/csv.yaml @@ -0,0 +1,7 @@ +# csv logger built in lightning + +csv: + _target_: lightning.pytorch.loggers.csv_logs.CSVLogger + save_dir: "${paths.output_dir}" + name: "csv/" + prefix: "" diff --git a/third_party/Matcha-TTS/configs/logger/many_loggers.yaml b/third_party/Matcha-TTS/configs/logger/many_loggers.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd586800bdccb4e8f4b0236a181b7ddd756ba9ab --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/many_loggers.yaml @@ -0,0 +1,9 @@ +# train with many loggers at once + +defaults: + # - comet + - csv + # - mlflow + # - neptune + - tensorboard + - wandb diff --git a/third_party/Matcha-TTS/configs/logger/mlflow.yaml b/third_party/Matcha-TTS/configs/logger/mlflow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8fb7e685fa27fc8141387a421b90a0b9b492d9e --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/mlflow.yaml @@ -0,0 +1,12 @@ +# https://mlflow.org + +mlflow: + _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger + # experiment_name: "" + # run_name: "" + tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI + tags: null + # save_dir: "./mlruns" + prefix: "" + artifact_location: null + # run_id: "" diff --git a/third_party/Matcha-TTS/configs/logger/neptune.yaml b/third_party/Matcha-TTS/configs/logger/neptune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8233c140018ecce6ab62971beed269991d31c89b --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/neptune.yaml @@ -0,0 +1,9 @@ +# https://neptune.ai + +neptune: + _target_: lightning.pytorch.loggers.neptune.NeptuneLogger + api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable + project: username/lightning-hydra-template + # name: "" + log_model_checkpoints: True + prefix: "" diff --git a/third_party/Matcha-TTS/configs/logger/tensorboard.yaml b/third_party/Matcha-TTS/configs/logger/tensorboard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bd31f6d8ba68d1f5c36a804885d5b9f9c1a9302 --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/tensorboard.yaml @@ -0,0 +1,10 @@ +# https://www.tensorflow.org/tensorboard/ + +tensorboard: + _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger + save_dir: "${paths.output_dir}/tensorboard/" + name: null + log_graph: False + default_hp_metric: True + prefix: "" + # version: "" diff --git a/third_party/Matcha-TTS/configs/logger/wandb.yaml b/third_party/Matcha-TTS/configs/logger/wandb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ece165889b3d0d9dc750a8f3c7454188cfdf12b7 --- /dev/null +++ b/third_party/Matcha-TTS/configs/logger/wandb.yaml @@ -0,0 +1,16 @@ +# https://wandb.ai + +wandb: + _target_: lightning.pytorch.loggers.wandb.WandbLogger + # name: "" # name of the run (normally generated by wandb) + save_dir: "${paths.output_dir}" + offline: False + id: null # pass correct id to resume experiment! + anonymous: null # enable anonymous logging + project: "lightning-hydra-template" + log_model: False # upload lightning ckpts + prefix: "" # a string to put at the beginning of metric keys + # entity: "" # set to name of your wandb team + group: "" + tags: [] + job_type: "" diff --git a/third_party/Matcha-TTS/configs/model/cfm/default.yaml b/third_party/Matcha-TTS/configs/model/cfm/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d1d9609e2d05c7b0a12a26115520340ac18e584 --- /dev/null +++ b/third_party/Matcha-TTS/configs/model/cfm/default.yaml @@ -0,0 +1,3 @@ +name: CFM +solver: euler +sigma_min: 1e-4 diff --git a/third_party/Matcha-TTS/configs/model/decoder/default.yaml b/third_party/Matcha-TTS/configs/model/decoder/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aaa00e63402ade5c76247a2f1d6b294ec3c61e63 --- /dev/null +++ b/third_party/Matcha-TTS/configs/model/decoder/default.yaml @@ -0,0 +1,7 @@ +channels: [256, 256] +dropout: 0.05 +attention_head_dim: 64 +n_blocks: 1 +num_mid_blocks: 2 +num_heads: 2 +act_fn: snakebeta diff --git a/third_party/Matcha-TTS/configs/model/encoder/default.yaml b/third_party/Matcha-TTS/configs/model/encoder/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4d5e5adee8f707bd384b682a3ad9a116c40c6ed --- /dev/null +++ b/third_party/Matcha-TTS/configs/model/encoder/default.yaml @@ -0,0 +1,18 @@ +encoder_type: RoPE Encoder +encoder_params: + n_feats: ${model.n_feats} + n_channels: 192 + filter_channels: 768 + filter_channels_dp: 256 + n_heads: 2 + n_layers: 6 + kernel_size: 3 + p_dropout: 0.1 + spk_emb_dim: 64 + n_spks: 1 + prenet: true + +duration_predictor_params: + filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} + kernel_size: 3 + p_dropout: ${model.encoder.encoder_params.p_dropout} diff --git a/third_party/Matcha-TTS/configs/model/matcha.yaml b/third_party/Matcha-TTS/configs/model/matcha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36f6eafbdcaa324f7494a4b97a7590da7824f357 --- /dev/null +++ b/third_party/Matcha-TTS/configs/model/matcha.yaml @@ -0,0 +1,15 @@ +defaults: + - _self_ + - encoder: default.yaml + - decoder: default.yaml + - cfm: default.yaml + - optimizer: adam.yaml + +_target_: matcha.models.matcha_tts.MatchaTTS +n_vocab: 178 +n_spks: ${data.n_spks} +spk_emb_dim: 64 +n_feats: 80 +data_statistics: ${data.data_statistics} +out_size: null # Must be divisible by 4 +prior_loss: true diff --git a/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml b/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42795577474eaee5b0b96845a95e1a11c9152385 --- /dev/null +++ b/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml @@ -0,0 +1,4 @@ +_target_: torch.optim.Adam +_partial_: true +lr: 1e-4 +weight_decay: 0.0 diff --git a/third_party/Matcha-TTS/configs/paths/default.yaml b/third_party/Matcha-TTS/configs/paths/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec81db2d34712909a79be3e42e65efe08c35ecee --- /dev/null +++ b/third_party/Matcha-TTS/configs/paths/default.yaml @@ -0,0 +1,18 @@ +# path to root directory +# this requires PROJECT_ROOT environment variable to exist +# you can replace it with "." if you want the root to be the current working directory +root_dir: ${oc.env:PROJECT_ROOT} + +# path to data directory +data_dir: ${paths.root_dir}/data/ + +# path to logging directory +log_dir: ${paths.root_dir}/logs/ + +# path to output directory, created dynamically by hydra +# path generation pattern is specified in `configs/hydra/default.yaml` +# use it to store all files generated during the run, like ckpts and metrics +output_dir: ${hydra:runtime.output_dir} + +# path to working directory +work_dir: ${hydra:runtime.cwd} diff --git a/third_party/Matcha-TTS/configs/train.yaml b/third_party/Matcha-TTS/configs/train.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6f5c2e7b9781758c8d25f941f004ca383c3f494 --- /dev/null +++ b/third_party/Matcha-TTS/configs/train.yaml @@ -0,0 +1,51 @@ +# @package _global_ + +# specify here default configuration +# order of defaults determines the order in which configs override each other +defaults: + - _self_ + - data: ljspeech + - model: matcha + - callbacks: default + - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`) + - trainer: default + - paths: default + - extras: default + - hydra: default + + # experiment configs allow for version control of specific hyperparameters + # e.g. best hyperparameters for given model and datamodule + - experiment: null + + # config for hyperparameter optimization + - hparams_search: null + + # optional local config for machine/user specific settings + # it's optional since it doesn't need to exist and is excluded from version control + - optional local: default + + # debugging config (enable through command line, e.g. `python train.py debug=default) + - debug: null + +# task name, determines output directory path +task_name: "train" + +run_name: ??? + +# tags to help you identify your experiments +# you can overwrite this in experiment configs +# overwrite from command line with `python train.py tags="[first_tag, second_tag]"` +tags: ["dev"] + +# set False to skip model training +train: True + +# evaluate on test set, using best model weights achieved during training +# lightning chooses best weights based on the metric specified in checkpoint callback +test: True + +# simply provide checkpoint path to resume training +ckpt_path: null + +# seed for random number generators in pytorch, numpy and python.random +seed: 1234 diff --git a/third_party/Matcha-TTS/configs/trainer/cpu.yaml b/third_party/Matcha-TTS/configs/trainer/cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7d6767e60c956567555980654f15e7bb673a41f --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/cpu.yaml @@ -0,0 +1,5 @@ +defaults: + - default + +accelerator: cpu +devices: 1 diff --git a/third_party/Matcha-TTS/configs/trainer/ddp.yaml b/third_party/Matcha-TTS/configs/trainer/ddp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94b43e20ca7bf1f2ea92627fd46906e4f0a273a1 --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/ddp.yaml @@ -0,0 +1,9 @@ +defaults: + - default + +strategy: ddp + +accelerator: gpu +devices: [0,1] +num_nodes: 1 +sync_batchnorm: True diff --git a/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml b/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8404419e5c295654967d0dfb73a7366e75be2f1f --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml @@ -0,0 +1,7 @@ +defaults: + - default + +# simulate DDP on CPU, useful for debugging +accelerator: cpu +devices: 2 +strategy: ddp_spawn diff --git a/third_party/Matcha-TTS/configs/trainer/default.yaml b/third_party/Matcha-TTS/configs/trainer/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee3d370d8ca6b08d7ee7a86d34184c2104f0e1ef --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/default.yaml @@ -0,0 +1,20 @@ +_target_: lightning.pytorch.trainer.Trainer + +default_root_dir: ${paths.output_dir} + +max_epochs: -1 + +accelerator: gpu +devices: [0] + +# mixed precision for extra speed-up +precision: 16-mixed + +# perform a validation loop every N training epochs +check_val_every_n_epoch: 1 + +# set True to to ensure deterministic results +# makes training slower but gives more reproducibility than just setting seeds +deterministic: False + +gradient_clip_val: 5.0 diff --git a/third_party/Matcha-TTS/configs/trainer/gpu.yaml b/third_party/Matcha-TTS/configs/trainer/gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2389510a90f5f0161cff6ccfcb4a96097ddf9a1 --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/gpu.yaml @@ -0,0 +1,5 @@ +defaults: + - default + +accelerator: gpu +devices: 1 diff --git a/third_party/Matcha-TTS/configs/trainer/mps.yaml b/third_party/Matcha-TTS/configs/trainer/mps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ecf6d5cc3a34ca127c5510f4a18e989561e38e4 --- /dev/null +++ b/third_party/Matcha-TTS/configs/trainer/mps.yaml @@ -0,0 +1,5 @@ +defaults: + - default + +accelerator: mps +devices: 1 diff --git a/third_party/Matcha-TTS/matcha/VERSION b/third_party/Matcha-TTS/matcha/VERSION new file mode 100644 index 0000000000000000000000000000000000000000..442b1138f7851df1c22deb15fd5d6ff5b742e550 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/VERSION @@ -0,0 +1 @@ +0.0.5.1 diff --git a/third_party/Matcha-TTS/matcha/__init__.py b/third_party/Matcha-TTS/matcha/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/__pycache__/__init__.cpython-310.pyc b/third_party/Matcha-TTS/matcha/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51bb46ef7dc003c0d25f2b0d9755c0241ed3d2cb Binary files /dev/null and b/third_party/Matcha-TTS/matcha/__pycache__/__init__.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/app.py b/third_party/Matcha-TTS/matcha/app.py new file mode 100644 index 0000000000000000000000000000000000000000..d68fbaa2d10d1faab606d89906af5e8b6baa5aa4 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/app.py @@ -0,0 +1,357 @@ +import tempfile +from argparse import Namespace +from pathlib import Path + +import gradio as gr +import soundfile as sf +import torch + +from matcha.cli import ( + MATCHA_URLS, + VOCODER_URLS, + assert_model_downloaded, + get_device, + load_matcha, + load_vocoder, + process_text, + to_waveform, +) +from matcha.utils.utils import get_user_data_dir, plot_tensor + +LOCATION = Path(get_user_data_dir()) + +args = Namespace( + cpu=False, + model="matcha_vctk", + vocoder="hifigan_univ_v1", + spk=0, +) + +CURRENTLY_LOADED_MODEL = args.model + + +def MATCHA_TTS_LOC(x): + return LOCATION / f"{x}.ckpt" + + +def VOCODER_LOC(x): + return LOCATION / f"{x}" + + +LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png" +RADIO_OPTIONS = { + "Multi Speaker (VCTK)": { + "model": "matcha_vctk", + "vocoder": "hifigan_univ_v1", + }, + "Single Speaker (LJ Speech)": { + "model": "matcha_ljspeech", + "vocoder": "hifigan_T2_v1", + }, +} + +# Ensure all the required models are downloaded +assert_model_downloaded(MATCHA_TTS_LOC("matcha_ljspeech"), MATCHA_URLS["matcha_ljspeech"]) +assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"]) +assert_model_downloaded(MATCHA_TTS_LOC("matcha_vctk"), MATCHA_URLS["matcha_vctk"]) +assert_model_downloaded(VOCODER_LOC("hifigan_univ_v1"), VOCODER_URLS["hifigan_univ_v1"]) + +device = get_device(args) + +# Load default model +model = load_matcha(args.model, MATCHA_TTS_LOC(args.model), device) +vocoder, denoiser = load_vocoder(args.vocoder, VOCODER_LOC(args.vocoder), device) + + +def load_model(model_name, vocoder_name): + model = load_matcha(model_name, MATCHA_TTS_LOC(model_name), device) + vocoder, denoiser = load_vocoder(vocoder_name, VOCODER_LOC(vocoder_name), device) + return model, vocoder, denoiser + + +def load_model_ui(model_type, textbox): + model_name, vocoder_name = RADIO_OPTIONS[model_type]["model"], RADIO_OPTIONS[model_type]["vocoder"] + + global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL # pylint: disable=global-statement + if CURRENTLY_LOADED_MODEL != model_name: + model, vocoder, denoiser = load_model(model_name, vocoder_name) + CURRENTLY_LOADED_MODEL = model_name + + if model_name == "matcha_ljspeech": + spk_slider = gr.update(visible=False, value=-1) + single_speaker_examples = gr.update(visible=True) + multi_speaker_examples = gr.update(visible=False) + length_scale = gr.update(value=0.95) + else: + spk_slider = gr.update(visible=True, value=0) + single_speaker_examples = gr.update(visible=False) + multi_speaker_examples = gr.update(visible=True) + length_scale = gr.update(value=0.85) + + return ( + textbox, + gr.update(interactive=True), + spk_slider, + single_speaker_examples, + multi_speaker_examples, + length_scale, + ) + + +@torch.inference_mode() +def process_text_gradio(text): + output = process_text(1, text, device) + return output["x_phones"][1::2], output["x"], output["x_lengths"] + + +@torch.inference_mode() +def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk): + spk = torch.tensor([spk], device=device, dtype=torch.long) if spk >= 0 else None + output = model.synthesise( + text, + text_length, + n_timesteps=n_timesteps, + temperature=temperature, + spks=spk, + length_scale=length_scale, + ) + output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: + sf.write(fp.name, output["waveform"], 22050, "PCM_24") + + return fp.name, plot_tensor(output["mel"].squeeze().cpu().numpy()) + + +def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scale, spk): + global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement + if CURRENTLY_LOADED_MODEL != "matcha_vctk": + global model, vocoder, denoiser # pylint: disable=global-statement + model, vocoder, denoiser = load_model("matcha_vctk", "hifigan_univ_v1") + CURRENTLY_LOADED_MODEL = "matcha_vctk" + + phones, text, text_lengths = process_text_gradio(text) + audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk) + return phones, audio, mel_spectrogram + + +def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1): + global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement + if CURRENTLY_LOADED_MODEL != "matcha_ljspeech": + global model, vocoder, denoiser # pylint: disable=global-statement + model, vocoder, denoiser = load_model("matcha_ljspeech", "hifigan_T2_v1") + CURRENTLY_LOADED_MODEL = "matcha_ljspeech" + + phones, text, text_lengths = process_text_gradio(text) + audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk) + return phones, audio, mel_spectrogram + + +def main(): + description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching + ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/) + We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method: + + + * Is probabilistic + * Has compact memory footprint + * Sounds highly natural + * Is very fast to synthesise from + + + Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199). + Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models. + + Cached examples are available at the bottom of the page. + """ + + with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo: + processed_text = gr.State(value=None) + processed_text_len = gr.State(value=None) + + with gr.Box(): + with gr.Row(): + gr.Markdown(description, scale=3) + with gr.Column(): + gr.Image(LOGO_URL, label="Matcha-TTS logo", height=50, width=50, scale=1, show_label=False) + html = '
' + gr.HTML(html) + + with gr.Box(): + radio_options = list(RADIO_OPTIONS.keys()) + model_type = gr.Radio( + radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False + ) + + with gr.Row(): + gr.Markdown("# Text Input") + with gr.Row(): + text = gr.Textbox(value="", lines=2, label="Text to synthesise", scale=3) + spk_slider = gr.Slider( + minimum=0, maximum=107, step=1, value=args.spk, label="Speaker ID", interactive=True, scale=1 + ) + + with gr.Row(): + gr.Markdown("### Hyper parameters") + with gr.Row(): + n_timesteps = gr.Slider( + label="Number of ODE steps", + minimum=1, + maximum=100, + step=1, + value=10, + interactive=True, + ) + length_scale = gr.Slider( + label="Length scale (Speaking rate)", + minimum=0.5, + maximum=1.5, + step=0.05, + value=1.0, + interactive=True, + ) + mel_temp = gr.Slider( + label="Sampling temperature", + minimum=0.00, + maximum=2.001, + step=0.16675, + value=0.667, + interactive=True, + ) + + synth_btn = gr.Button("Synthesise") + + with gr.Box(): + with gr.Row(): + gr.Markdown("### Phonetised text") + phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text") + + with gr.Box(): + with gr.Row(): + mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram") + + # with gr.Row(): + audio = gr.Audio(interactive=False, label="Audio") + + with gr.Row(visible=False) as example_row_lj_speech: + examples = gr.Examples( # pylint: disable=unused-variable + examples=[ + [ + "We propose Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up O D E-based speech synthesis.", + 50, + 0.677, + 0.95, + ], + [ + "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.", + 2, + 0.677, + 0.95, + ], + [ + "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.", + 4, + 0.677, + 0.95, + ], + [ + "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.", + 10, + 0.677, + 0.95, + ], + [ + "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.", + 50, + 0.677, + 0.95, + ], + [ + "The narrative of these events is based largely on the recollections of the participants.", + 10, + 0.677, + 0.95, + ], + [ + "The jury did not believe him, and the verdict was for the defendants.", + 10, + 0.677, + 0.95, + ], + ], + fn=ljspeech_example_cacher, + inputs=[text, n_timesteps, mel_temp, length_scale], + outputs=[phonetised_text, audio, mel_spectrogram], + cache_examples=True, + ) + + with gr.Row() as example_row_multispeaker: + multi_speaker_examples = gr.Examples( # pylint: disable=unused-variable + examples=[ + [ + "Hello everyone! I am speaker 0 and I am here to tell you that Matcha-TTS is amazing!", + 10, + 0.677, + 0.85, + 0, + ], + [ + "Hello everyone! I am speaker 16 and I am here to tell you that Matcha-TTS is amazing!", + 10, + 0.677, + 0.85, + 16, + ], + [ + "Hello everyone! I am speaker 44 and I am here to tell you that Matcha-TTS is amazing!", + 50, + 0.677, + 0.85, + 44, + ], + [ + "Hello everyone! I am speaker 45 and I am here to tell you that Matcha-TTS is amazing!", + 50, + 0.677, + 0.85, + 45, + ], + [ + "Hello everyone! I am speaker 58 and I am here to tell you that Matcha-TTS is amazing!", + 4, + 0.677, + 0.85, + 58, + ], + ], + fn=multispeaker_example_cacher, + inputs=[text, n_timesteps, mel_temp, length_scale, spk_slider], + outputs=[phonetised_text, audio, mel_spectrogram], + cache_examples=True, + label="Multi Speaker Examples", + ) + + model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then( + load_model_ui, + inputs=[model_type, text], + outputs=[text, synth_btn, spk_slider, example_row_lj_speech, example_row_multispeaker, length_scale], + ) + + synth_btn.click( + fn=process_text_gradio, + inputs=[ + text, + ], + outputs=[phonetised_text, processed_text, processed_text_len], + api_name="matcha_tts", + queue=True, + ).then( + fn=synthesise_mel, + inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale, spk_slider], + outputs=[audio, mel_spectrogram], + ) + + demo.queue().launch(share=True) + + +if __name__ == "__main__": + main() diff --git a/third_party/Matcha-TTS/matcha/cli.py b/third_party/Matcha-TTS/matcha/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..579d7d636450a41f1c06a4393d64ddbae38c5011 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/cli.py @@ -0,0 +1,418 @@ +import argparse +import datetime as dt +import os +import warnings +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import soundfile as sf +import torch + +from matcha.hifigan.config import v1 +from matcha.hifigan.denoiser import Denoiser +from matcha.hifigan.env import AttrDict +from matcha.hifigan.models import Generator as HiFiGAN +from matcha.models.matcha_tts import MatchaTTS +from matcha.text import sequence_to_text, text_to_sequence +from matcha.utils.utils import assert_model_downloaded, get_user_data_dir, intersperse + +MATCHA_URLS = { + "matcha_ljspeech": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_ljspeech.ckpt", + "matcha_vctk": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_vctk.ckpt", +} + +VOCODER_URLS = { + "hifigan_T2_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/generator_v1", # Old url: https://drive.google.com/file/d/14NENd4equCBLyyCSke114Mv6YR_j_uFs/view?usp=drive_link + "hifigan_univ_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/g_02500000", # Old url: https://drive.google.com/file/d/1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW/view?usp=drive_link +} + +MULTISPEAKER_MODEL = { + "matcha_vctk": {"vocoder": "hifigan_univ_v1", "speaking_rate": 0.85, "spk": 0, "spk_range": (0, 107)} +} + +SINGLESPEAKER_MODEL = {"matcha_ljspeech": {"vocoder": "hifigan_T2_v1", "speaking_rate": 0.95, "spk": None}} + + +def plot_spectrogram_to_numpy(spectrogram, filename): + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.title("Synthesised Mel-Spectrogram") + fig.canvas.draw() + plt.savefig(filename) + + +def process_text(i: int, text: str, device: torch.device): + print(f"[{i}] - Input text: {text}") + x = torch.tensor( + intersperse(text_to_sequence(text, ["english_cleaners2"]), 0), + dtype=torch.long, + device=device, + )[None] + x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device) + x_phones = sequence_to_text(x.squeeze(0).tolist()) + print(f"[{i}] - Phonetised text: {x_phones[1::2]}") + + return {"x_orig": text, "x": x, "x_lengths": x_lengths, "x_phones": x_phones} + + +def get_texts(args): + if args.text: + texts = [args.text] + else: + with open(args.file, encoding="utf-8") as f: + texts = f.readlines() + return texts + + +def assert_required_models_available(args): + save_dir = get_user_data_dir() + if not hasattr(args, "checkpoint_path") and args.checkpoint_path is None: + model_path = args.checkpoint_path + else: + model_path = save_dir / f"{args.model}.ckpt" + assert_model_downloaded(model_path, MATCHA_URLS[args.model]) + + vocoder_path = save_dir / f"{args.vocoder}" + assert_model_downloaded(vocoder_path, VOCODER_URLS[args.vocoder]) + return {"matcha": model_path, "vocoder": vocoder_path} + + +def load_hifigan(checkpoint_path, device): + h = AttrDict(v1) + hifigan = HiFiGAN(h).to(device) + hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"]) + _ = hifigan.eval() + hifigan.remove_weight_norm() + return hifigan + + +def load_vocoder(vocoder_name, checkpoint_path, device): + print(f"[!] Loading {vocoder_name}!") + vocoder = None + if vocoder_name in ("hifigan_T2_v1", "hifigan_univ_v1"): + vocoder = load_hifigan(checkpoint_path, device) + else: + raise NotImplementedError( + f"Vocoder {vocoder_name} not implemented! define a load_<> method for it" + ) + + denoiser = Denoiser(vocoder, mode="zeros") + print(f"[+] {vocoder_name} loaded!") + return vocoder, denoiser + + +def load_matcha(model_name, checkpoint_path, device): + print(f"[!] Loading {model_name}!") + model = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device) + _ = model.eval() + + print(f"[+] {model_name} loaded!") + return model + + +def to_waveform(mel, vocoder, denoiser=None): + audio = vocoder(mel).clamp(-1, 1) + if denoiser is not None: + audio = denoiser(audio.squeeze(), strength=0.00025).cpu().squeeze() + + return audio.cpu().squeeze() + + +def save_to_folder(filename: str, output: dict, folder: str): + folder = Path(folder) + folder.mkdir(exist_ok=True, parents=True) + plot_spectrogram_to_numpy(np.array(output["mel"].squeeze().float().cpu()), f"{filename}.png") + np.save(folder / f"{filename}", output["mel"].cpu().numpy()) + sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24") + return folder.resolve() / f"{filename}.wav" + + +def validate_args(args): + assert ( + args.text or args.file + ), "Either text or file must be provided Matcha-T(ea)TTS need sometext to whisk the waveforms." + assert args.temperature >= 0, "Sampling temperature cannot be negative" + assert args.steps > 0, "Number of ODE steps must be greater than 0" + + if args.checkpoint_path is None: + # When using pretrained models + if args.model in SINGLESPEAKER_MODEL: + args = validate_args_for_single_speaker_model(args) + + if args.model in MULTISPEAKER_MODEL: + args = validate_args_for_multispeaker_model(args) + else: + # When using a custom model + if args.vocoder != "hifigan_univ_v1": + warn_ = "[-] Using custom model checkpoint! I would suggest passing --vocoder hifigan_univ_v1, unless the custom model is trained on LJ Speech." + warnings.warn(warn_, UserWarning) + if args.speaking_rate is None: + args.speaking_rate = 1.0 + + if args.batched: + assert args.batch_size > 0, "Batch size must be greater than 0" + assert args.speaking_rate > 0, "Speaking rate must be greater than 0" + + return args + + +def validate_args_for_multispeaker_model(args): + if args.vocoder is not None: + if args.vocoder != MULTISPEAKER_MODEL[args.model]["vocoder"]: + warn_ = f"[-] Using {args.model} model! I would suggest passing --vocoder {MULTISPEAKER_MODEL[args.model]['vocoder']}" + warnings.warn(warn_, UserWarning) + else: + args.vocoder = MULTISPEAKER_MODEL[args.model]["vocoder"] + + if args.speaking_rate is None: + args.speaking_rate = MULTISPEAKER_MODEL[args.model]["speaking_rate"] + + spk_range = MULTISPEAKER_MODEL[args.model]["spk_range"] + if args.spk is not None: + assert ( + args.spk >= spk_range[0] and args.spk <= spk_range[-1] + ), f"Speaker ID must be between {spk_range} for this model." + else: + available_spk_id = MULTISPEAKER_MODEL[args.model]["spk"] + warn_ = f"[!] Speaker ID not provided! Using speaker ID {available_spk_id}" + warnings.warn(warn_, UserWarning) + args.spk = available_spk_id + + return args + + +def validate_args_for_single_speaker_model(args): + if args.vocoder is not None: + if args.vocoder != SINGLESPEAKER_MODEL[args.model]["vocoder"]: + warn_ = f"[-] Using {args.model} model! I would suggest passing --vocoder {SINGLESPEAKER_MODEL[args.model]['vocoder']}" + warnings.warn(warn_, UserWarning) + else: + args.vocoder = SINGLESPEAKER_MODEL[args.model]["vocoder"] + + if args.speaking_rate is None: + args.speaking_rate = SINGLESPEAKER_MODEL[args.model]["speaking_rate"] + + if args.spk != SINGLESPEAKER_MODEL[args.model]["spk"]: + warn_ = f"[-] Ignoring speaker id {args.spk} for {args.model}" + warnings.warn(warn_, UserWarning) + args.spk = SINGLESPEAKER_MODEL[args.model]["spk"] + + return args + + +@torch.inference_mode() +def cli(): + parser = argparse.ArgumentParser( + description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching" + ) + parser.add_argument( + "--model", + type=str, + default="matcha_ljspeech", + help="Model to use", + choices=MATCHA_URLS.keys(), + ) + + parser.add_argument( + "--checkpoint_path", + type=str, + default=None, + help="Path to the custom model checkpoint", + ) + + parser.add_argument( + "--vocoder", + type=str, + default=None, + help="Vocoder to use (default: will use the one suggested with the pretrained model))", + choices=VOCODER_URLS.keys(), + ) + parser.add_argument("--text", type=str, default=None, help="Text to synthesize") + parser.add_argument("--file", type=str, default=None, help="Text file to synthesize") + parser.add_argument("--spk", type=int, default=None, help="Speaker ID") + parser.add_argument( + "--temperature", + type=float, + default=0.667, + help="Variance of the x0 noise (default: 0.667)", + ) + parser.add_argument( + "--speaking_rate", + type=float, + default=None, + help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)", + ) + parser.add_argument("--steps", type=int, default=10, help="Number of ODE steps (default: 10)") + parser.add_argument("--cpu", action="store_true", help="Use CPU for inference (default: use GPU if available)") + parser.add_argument( + "--denoiser_strength", + type=float, + default=0.00025, + help="Strength of the vocoder bias denoiser (default: 0.00025)", + ) + parser.add_argument( + "--output_folder", + type=str, + default=os.getcwd(), + help="Output folder to save results (default: current dir)", + ) + parser.add_argument("--batched", action="store_true", help="Batched inference (default: False)") + parser.add_argument( + "--batch_size", type=int, default=32, help="Batch size only useful when --batched (default: 32)" + ) + + args = parser.parse_args() + + args = validate_args(args) + device = get_device(args) + print_config(args) + paths = assert_required_models_available(args) + + if args.checkpoint_path is not None: + print(f"[🍵] Loading custom model from {args.checkpoint_path}") + paths["matcha"] = args.checkpoint_path + args.model = "custom_model" + + model = load_matcha(args.model, paths["matcha"], device) + vocoder, denoiser = load_vocoder(args.vocoder, paths["vocoder"], device) + + texts = get_texts(args) + + spk = torch.tensor([args.spk], device=device, dtype=torch.long) if args.spk is not None else None + if len(texts) == 1 or not args.batched: + unbatched_synthesis(args, device, model, vocoder, denoiser, texts, spk) + else: + batched_synthesis(args, device, model, vocoder, denoiser, texts, spk) + + +class BatchedSynthesisDataset(torch.utils.data.Dataset): + def __init__(self, processed_texts): + self.processed_texts = processed_texts + + def __len__(self): + return len(self.processed_texts) + + def __getitem__(self, idx): + return self.processed_texts[idx] + + +def batched_collate_fn(batch): + x = [] + x_lengths = [] + + for b in batch: + x.append(b["x"].squeeze(0)) + x_lengths.append(b["x_lengths"]) + + x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True) + x_lengths = torch.concat(x_lengths, dim=0) + return {"x": x, "x_lengths": x_lengths} + + +def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk): + total_rtf = [] + total_rtf_w = [] + processed_text = [process_text(i, text, "cpu") for i, text in enumerate(texts)] + dataloader = torch.utils.data.DataLoader( + BatchedSynthesisDataset(processed_text), + batch_size=args.batch_size, + collate_fn=batched_collate_fn, + num_workers=8, + ) + for i, batch in enumerate(dataloader): + i = i + 1 + start_t = dt.datetime.now() + output = model.synthesise( + batch["x"].to(device), + batch["x_lengths"].to(device), + n_timesteps=args.steps, + temperature=args.temperature, + spks=spk, + length_scale=args.speaking_rate, + ) + + output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) + t = (dt.datetime.now() - start_t).total_seconds() + rtf_w = t * 22050 / (output["waveform"].shape[-1]) + print(f"[🍵-Batch: {i}] Matcha-TTS RTF: {output['rtf']:.4f}") + print(f"[🍵-Batch: {i}] Matcha-TTS + VOCODER RTF: {rtf_w:.4f}") + total_rtf.append(output["rtf"]) + total_rtf_w.append(rtf_w) + for j in range(output["mel"].shape[0]): + base_name = f"utterance_{j:03d}_speaker_{args.spk:03d}" if args.spk is not None else f"utterance_{j:03d}" + length = output["mel_lengths"][j] + new_dict = {"mel": output["mel"][j][:, :length], "waveform": output["waveform"][j][: length * 256]} + location = save_to_folder(base_name, new_dict, args.output_folder) + print(f"[🍵-{j}] Waveform saved: {location}") + + print("".join(["="] * 100)) + print(f"[🍵] Average Matcha-TTS RTF: {np.mean(total_rtf):.4f} ± {np.std(total_rtf)}") + print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(total_rtf_w):.4f} ± {np.std(total_rtf_w)}") + print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!") + + +def unbatched_synthesis(args, device, model, vocoder, denoiser, texts, spk): + total_rtf = [] + total_rtf_w = [] + for i, text in enumerate(texts): + i = i + 1 + base_name = f"utterance_{i:03d}_speaker_{args.spk:03d}" if args.spk is not None else f"utterance_{i:03d}" + + print("".join(["="] * 100)) + text = text.strip() + text_processed = process_text(i, text, device) + + print(f"[🍵] Whisking Matcha-T(ea)TS for: {i}") + start_t = dt.datetime.now() + output = model.synthesise( + text_processed["x"], + text_processed["x_lengths"], + n_timesteps=args.steps, + temperature=args.temperature, + spks=spk, + length_scale=args.speaking_rate, + ) + output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) + # RTF with HiFiGAN + t = (dt.datetime.now() - start_t).total_seconds() + rtf_w = t * 22050 / (output["waveform"].shape[-1]) + print(f"[🍵-{i}] Matcha-TTS RTF: {output['rtf']:.4f}") + print(f"[🍵-{i}] Matcha-TTS + VOCODER RTF: {rtf_w:.4f}") + total_rtf.append(output["rtf"]) + total_rtf_w.append(rtf_w) + + location = save_to_folder(base_name, output, args.output_folder) + print(f"[+] Waveform saved: {location}") + + print("".join(["="] * 100)) + print(f"[🍵] Average Matcha-TTS RTF: {np.mean(total_rtf):.4f} ± {np.std(total_rtf)}") + print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(total_rtf_w):.4f} ± {np.std(total_rtf_w)}") + print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!") + + +def print_config(args): + print("[!] Configurations: ") + print(f"\t- Model: {args.model}") + print(f"\t- Vocoder: {args.vocoder}") + print(f"\t- Temperature: {args.temperature}") + print(f"\t- Speaking rate: {args.speaking_rate}") + print(f"\t- Number of ODE steps: {args.steps}") + print(f"\t- Speaker: {args.spk}") + + +def get_device(args): + if torch.cuda.is_available() and not args.cpu: + print("[+] GPU Available! Using GPU") + device = torch.device("cuda") + else: + print("[-] GPU not available or forced CPU run! Using CPU") + device = torch.device("cpu") + return device + + +if __name__ == "__main__": + cli() diff --git a/third_party/Matcha-TTS/matcha/data/__init__.py b/third_party/Matcha-TTS/matcha/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/data/components/__init__.py b/third_party/Matcha-TTS/matcha/data/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py b/third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..704f93629f1874b88efd07609409653ffbb8338a --- /dev/null +++ b/third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py @@ -0,0 +1,231 @@ +import random +from typing import Any, Dict, Optional + +import torch +import torchaudio as ta +from lightning import LightningDataModule +from torch.utils.data.dataloader import DataLoader + +from matcha.text import text_to_sequence +from matcha.utils.audio import mel_spectrogram +from matcha.utils.model import fix_len_compatibility, normalize +from matcha.utils.utils import intersperse + + +def parse_filelist(filelist_path, split_char="|"): + with open(filelist_path, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split_char) for line in f] + return filepaths_and_text + + +class TextMelDataModule(LightningDataModule): + def __init__( # pylint: disable=unused-argument + self, + name, + train_filelist_path, + valid_filelist_path, + batch_size, + num_workers, + pin_memory, + cleaners, + add_blank, + n_spks, + n_fft, + n_feats, + sample_rate, + hop_length, + win_length, + f_min, + f_max, + data_statistics, + seed, + ): + super().__init__() + + # this line allows to access init params with 'self.hparams' attribute + # also ensures init params will be stored in ckpt + self.save_hyperparameters(logger=False) + + def setup(self, stage: Optional[str] = None): # pylint: disable=unused-argument + """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`. + + This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be + careful not to execute things like random split twice! + """ + # load and split datasets only if not loaded already + + self.trainset = TextMelDataset( # pylint: disable=attribute-defined-outside-init + self.hparams.train_filelist_path, + self.hparams.n_spks, + self.hparams.cleaners, + self.hparams.add_blank, + self.hparams.n_fft, + self.hparams.n_feats, + self.hparams.sample_rate, + self.hparams.hop_length, + self.hparams.win_length, + self.hparams.f_min, + self.hparams.f_max, + self.hparams.data_statistics, + self.hparams.seed, + ) + self.validset = TextMelDataset( # pylint: disable=attribute-defined-outside-init + self.hparams.valid_filelist_path, + self.hparams.n_spks, + self.hparams.cleaners, + self.hparams.add_blank, + self.hparams.n_fft, + self.hparams.n_feats, + self.hparams.sample_rate, + self.hparams.hop_length, + self.hparams.win_length, + self.hparams.f_min, + self.hparams.f_max, + self.hparams.data_statistics, + self.hparams.seed, + ) + + def train_dataloader(self): + return DataLoader( + dataset=self.trainset, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=True, + collate_fn=TextMelBatchCollate(self.hparams.n_spks), + ) + + def val_dataloader(self): + return DataLoader( + dataset=self.validset, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=False, + collate_fn=TextMelBatchCollate(self.hparams.n_spks), + ) + + def teardown(self, stage: Optional[str] = None): + """Clean up after fit or test.""" + pass # pylint: disable=unnecessary-pass + + def state_dict(self): # pylint: disable=no-self-use + """Extra things to save to checkpoint.""" + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]): + """Things to do when loading checkpoint.""" + pass # pylint: disable=unnecessary-pass + + +class TextMelDataset(torch.utils.data.Dataset): + def __init__( + self, + filelist_path, + n_spks, + cleaners, + add_blank=True, + n_fft=1024, + n_mels=80, + sample_rate=22050, + hop_length=256, + win_length=1024, + f_min=0.0, + f_max=8000, + data_parameters=None, + seed=None, + ): + self.filepaths_and_text = parse_filelist(filelist_path) + self.n_spks = n_spks + self.cleaners = cleaners + self.add_blank = add_blank + self.n_fft = n_fft + self.n_mels = n_mels + self.sample_rate = sample_rate + self.hop_length = hop_length + self.win_length = win_length + self.f_min = f_min + self.f_max = f_max + if data_parameters is not None: + self.data_parameters = data_parameters + else: + self.data_parameters = {"mel_mean": 0, "mel_std": 1} + random.seed(seed) + random.shuffle(self.filepaths_and_text) + + def get_datapoint(self, filepath_and_text): + if self.n_spks > 1: + filepath, spk, text = ( + filepath_and_text[0], + int(filepath_and_text[1]), + filepath_and_text[2], + ) + else: + filepath, text = filepath_and_text[0], filepath_and_text[1] + spk = None + + text = self.get_text(text, add_blank=self.add_blank) + mel = self.get_mel(filepath) + + return {"x": text, "y": mel, "spk": spk} + + def get_mel(self, filepath): + audio, sr = ta.load(filepath) + assert sr == self.sample_rate + mel = mel_spectrogram( + audio, + self.n_fft, + self.n_mels, + self.sample_rate, + self.hop_length, + self.win_length, + self.f_min, + self.f_max, + center=False, + ).squeeze() + mel = normalize(mel, self.data_parameters["mel_mean"], self.data_parameters["mel_std"]) + return mel + + def get_text(self, text, add_blank=True): + text_norm = text_to_sequence(text, self.cleaners) + if self.add_blank: + text_norm = intersperse(text_norm, 0) + text_norm = torch.IntTensor(text_norm) + return text_norm + + def __getitem__(self, index): + datapoint = self.get_datapoint(self.filepaths_and_text[index]) + return datapoint + + def __len__(self): + return len(self.filepaths_and_text) + + +class TextMelBatchCollate: + def __init__(self, n_spks): + self.n_spks = n_spks + + def __call__(self, batch): + B = len(batch) + y_max_length = max([item["y"].shape[-1] for item in batch]) + y_max_length = fix_len_compatibility(y_max_length) + x_max_length = max([item["x"].shape[-1] for item in batch]) + n_feats = batch[0]["y"].shape[-2] + + y = torch.zeros((B, n_feats, y_max_length), dtype=torch.float32) + x = torch.zeros((B, x_max_length), dtype=torch.long) + y_lengths, x_lengths = [], [] + spks = [] + for i, item in enumerate(batch): + y_, x_ = item["y"], item["x"] + y_lengths.append(y_.shape[-1]) + x_lengths.append(x_.shape[-1]) + y[i, :, : y_.shape[-1]] = y_ + x[i, : x_.shape[-1]] = x_ + spks.append(item["spk"]) + + y_lengths = torch.tensor(y_lengths, dtype=torch.long) + x_lengths = torch.tensor(x_lengths, dtype=torch.long) + spks = torch.tensor(spks, dtype=torch.long) if self.n_spks > 1 else None + + return {"x": x, "x_lengths": x_lengths, "y": y, "y_lengths": y_lengths, "spks": spks} diff --git a/third_party/Matcha-TTS/matcha/hifigan/LICENSE b/third_party/Matcha-TTS/matcha/hifigan/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..91751daed806f63ac594cf077a3065f719a41662 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Jungil Kong + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/Matcha-TTS/matcha/hifigan/README.md b/third_party/Matcha-TTS/matcha/hifigan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5db25850451a794b1db1b15b08e82c1d802edbb3 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/README.md @@ -0,0 +1,101 @@ +# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis + +### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae + +In our [paper](https://arxiv.org/abs/2010.05646), +we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
+We provide our implementation and pretrained models as open source in this repository. + +**Abstract :** +Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. +Although such methods improve the sampling efficiency and memory usage, +their sample quality has not yet reached that of autoregressive and flow-based generative models. +In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. +As speech audio consists of sinusoidal signals with various periods, +we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. +A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method +demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than +real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen +speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times +faster than real-time on CPU with comparable quality to an autoregressive counterpart. + +Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. + +## Pre-requisites + +1. Python >= 3.6 +2. Clone this repository. +3. Install python requirements. Please refer [requirements.txt](requirements.txt) +4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). + And move all wav files to `LJSpeech-1.1/wavs` + +## Training + +``` +python train.py --config config_v1.json +``` + +To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
+Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.
+You can change the path by adding `--checkpoint_path` option. + +Validation loss during training with V1 generator.
+![validation loss](./validation_loss.png) + +## Pretrained Model + +You can also use pretrained models we provide.
+[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
+Details of each folder are as in follows: + +| Folder Name | Generator | Dataset | Fine-Tuned | +| ------------ | --------- | --------- | ------------------------------------------------------ | +| LJ_V1 | V1 | LJSpeech | No | +| LJ_V2 | V2 | LJSpeech | No | +| LJ_V3 | V3 | LJSpeech | No | +| LJ_FT_T2_V1 | V1 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | +| LJ_FT_T2_V2 | V2 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | +| LJ_FT_T2_V3 | V3 | LJSpeech | Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2)) | +| VCTK_V1 | V1 | VCTK | No | +| VCTK_V2 | V2 | VCTK | No | +| VCTK_V3 | V3 | VCTK | No | +| UNIVERSAL_V1 | V1 | Universal | No | + +We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. + +## Fine-Tuning + +1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
+ The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.
+ Example: + ` Audio File : LJ001-0001.wav +Mel-Spectrogram File : LJ001-0001.npy` +2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.
+3. Run the following command. + ``` + python train.py --fine_tuning True --config config_v1.json + ``` + For other command line options, please refer to the training section. + +## Inference from wav file + +1. Make `test_files` directory and copy wav files into the directory. +2. Run the following command. + ` python inference.py --checkpoint_file [generator checkpoint file path]` + Generated wav files are saved in `generated_files` by default.
+ You can change the path by adding `--output_dir` option. + +## Inference for end-to-end speech synthesis + +1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.
+ You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), + [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. +2. Run the following command. + ` python inference_e2e.py --checkpoint_file [generator checkpoint file path]` + Generated wav files are saved in `generated_files_from_mel` by default.
+ You can change the path by adding `--output_dir` option. + +## Acknowledgements + +We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) +and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. diff --git a/third_party/Matcha-TTS/matcha/hifigan/__init__.py b/third_party/Matcha-TTS/matcha/hifigan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/hifigan/config.py b/third_party/Matcha-TTS/matcha/hifigan/config.py new file mode 100644 index 0000000000000000000000000000000000000000..b3abea9e151a08864353d32066bd4935e24b82e7 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/config.py @@ -0,0 +1,28 @@ +v1 = { + "resblock": "1", + "num_gpus": 0, + "batch_size": 16, + "learning_rate": 0.0004, + "adam_b1": 0.8, + "adam_b2": 0.99, + "lr_decay": 0.999, + "seed": 1234, + "upsample_rates": [8, 8, 2, 2], + "upsample_kernel_sizes": [16, 16, 4, 4], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "resblock_initial_channel": 256, + "segment_size": 8192, + "num_mels": 80, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 256, + "win_size": 1024, + "sampling_rate": 22050, + "fmin": 0, + "fmax": 8000, + "fmax_loss": None, + "num_workers": 4, + "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1}, +} diff --git a/third_party/Matcha-TTS/matcha/hifigan/denoiser.py b/third_party/Matcha-TTS/matcha/hifigan/denoiser.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd33312a09b1940374a0e29a97fe3a1a1dac7d2 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/denoiser.py @@ -0,0 +1,64 @@ +# Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py + +"""Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio.""" +import torch + + +class Denoiser(torch.nn.Module): + """Removes model bias from audio produced with waveglow""" + + def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"): + super().__init__() + self.filter_length = filter_length + self.hop_length = int(filter_length / n_overlap) + self.win_length = win_length + + dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device + self.device = device + if mode == "zeros": + mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device) + elif mode == "normal": + mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device) + else: + raise Exception(f"Mode {mode} if not supported") + + def stft_fn(audio, n_fft, hop_length, win_length, window): + spec = torch.stft( + audio, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + return_complex=True, + ) + spec = torch.view_as_real(spec) + return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0]) + + self.stft = lambda x: stft_fn( + audio=x, + n_fft=self.filter_length, + hop_length=self.hop_length, + win_length=self.win_length, + window=torch.hann_window(self.win_length, device=device), + ) + self.istft = lambda x, y: torch.istft( + torch.complex(x * torch.cos(y), x * torch.sin(y)), + n_fft=self.filter_length, + hop_length=self.hop_length, + win_length=self.win_length, + window=torch.hann_window(self.win_length, device=device), + ) + + with torch.no_grad(): + bias_audio = vocoder(mel_input).float().squeeze(0) + bias_spec, _ = self.stft(bias_audio) + + self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None]) + + @torch.inference_mode() + def forward(self, audio, strength=0.0005): + audio_spec, audio_angles = self.stft(audio) + audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.istft(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/third_party/Matcha-TTS/matcha/hifigan/env.py b/third_party/Matcha-TTS/matcha/hifigan/env.py new file mode 100644 index 0000000000000000000000000000000000000000..9ea4f948a3f002921bf9bc24f52cbc1c0b1fc2ec --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/env.py @@ -0,0 +1,17 @@ +""" from https://github.com/jik876/hifi-gan """ + +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/third_party/Matcha-TTS/matcha/hifigan/meldataset.py b/third_party/Matcha-TTS/matcha/hifigan/meldataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8b43ea7965e04a52d5427a485ee911b743057c4a --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/meldataset.py @@ -0,0 +1,217 @@ +""" from https://github.com/jik876/hifi-gan """ + +import math +import os +import random + +import numpy as np +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +from librosa.util import normalize +from scipy.io.wavfile import read + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window # pylint: disable=global-statement + if fmax not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[str(y.device)], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + + spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec + + +def get_dataset_filelist(a): + with open(a.input_training_file, encoding="utf-8") as fi: + training_files = [ + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 + ] + + with open(a.input_validation_file, encoding="utf-8") as fi: + validation_files = [ + os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0 + ] + return training_files, validation_files + + +class MelDataset(torch.utils.data.Dataset): + def __init__( + self, + training_files, + segment_size, + n_fft, + num_mels, + hop_size, + win_size, + sampling_rate, + fmin, + fmax, + split=True, + shuffle=True, + n_cache_reuse=1, + device=None, + fmax_loss=None, + fine_tuning=False, + base_mels_path=None, + ): + self.audio_files = training_files + random.seed(1234) + if shuffle: + random.shuffle(self.audio_files) + self.segment_size = segment_size + self.sampling_rate = sampling_rate + self.split = split + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.fmax_loss = fmax_loss + self.cached_wav = None + self.n_cache_reuse = n_cache_reuse + self._cache_ref_count = 0 + self.device = device + self.fine_tuning = fine_tuning + self.base_mels_path = base_mels_path + + def __getitem__(self, index): + filename = self.audio_files[index] + if self._cache_ref_count == 0: + audio, sampling_rate = load_wav(filename) + audio = audio / MAX_WAV_VALUE + if not self.fine_tuning: + audio = normalize(audio) * 0.95 + self.cached_wav = audio + if sampling_rate != self.sampling_rate: + raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR") + self._cache_ref_count = self.n_cache_reuse + else: + audio = self.cached_wav + self._cache_ref_count -= 1 + + audio = torch.FloatTensor(audio) + audio = audio.unsqueeze(0) + + if not self.fine_tuning: + if self.split: + if audio.size(1) >= self.segment_size: + max_audio_start = audio.size(1) - self.segment_size + audio_start = random.randint(0, max_audio_start) + audio = audio[:, audio_start : audio_start + self.segment_size] + else: + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") + + mel = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax, + center=False, + ) + else: + mel = np.load(os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")) + mel = torch.from_numpy(mel) + + if len(mel.shape) < 3: + mel = mel.unsqueeze(0) + + if self.split: + frames_per_seg = math.ceil(self.segment_size / self.hop_size) + + if audio.size(1) >= self.segment_size: + mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) + mel = mel[:, :, mel_start : mel_start + frames_per_seg] + audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size] + else: + mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant") + audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant") + + mel_loss = mel_spectrogram( + audio, + self.n_fft, + self.num_mels, + self.sampling_rate, + self.hop_size, + self.win_size, + self.fmin, + self.fmax_loss, + center=False, + ) + + return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) + + def __len__(self): + return len(self.audio_files) diff --git a/third_party/Matcha-TTS/matcha/hifigan/models.py b/third_party/Matcha-TTS/matcha/hifigan/models.py new file mode 100644 index 0000000000000000000000000000000000000000..d209d9a4e99ec29e4167a5a2eaa62d72b3eff694 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/models.py @@ -0,0 +1,368 @@ +""" from https://github.com/jik876/hifi-gan """ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from .xutils import get_padding, init_weights + +LRELU_SLOPE = 0.1 + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super().__init__() + self.h = h + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + super().__init__() + self.h = h + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Generator(torch.nn.Module): + def __init__(self, h): + super().__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) + resblock = ResBlock1 if h.resblock == "1" else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for _, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super().__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self): + super().__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for _, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super().__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super().__init__() + self.discriminators = nn.ModuleList( + [ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ] + ) + self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/third_party/Matcha-TTS/matcha/hifigan/xutils.py b/third_party/Matcha-TTS/matcha/hifigan/xutils.py new file mode 100644 index 0000000000000000000000000000000000000000..eefadcb7a1d0bf9015e636b88fee3e22c9771bc5 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/hifigan/xutils.py @@ -0,0 +1,60 @@ +""" from https://github.com/jik876/hifi-gan """ + +import glob +import os + +import matplotlib +import torch +from torch.nn.utils import weight_norm + +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print(f"Loading '{filepath}'") + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print(f"Saving checkpoint to {filepath}") + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + "????????") + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] diff --git a/third_party/Matcha-TTS/matcha/models/__init__.py b/third_party/Matcha-TTS/matcha/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/models/__pycache__/__init__.cpython-310.pyc b/third_party/Matcha-TTS/matcha/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76fde176a32eefaf55bb6e3e6384da414bd15166 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/models/baselightningmodule.py b/third_party/Matcha-TTS/matcha/models/baselightningmodule.py new file mode 100644 index 0000000000000000000000000000000000000000..3724888090e36b5f55445d33a87fcdae687b35a5 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/baselightningmodule.py @@ -0,0 +1,209 @@ +""" +This is a base lightning module that can be used to train a model. +The benefit of this abstraction is that all the logic outside of model definition can be reused for different models. +""" +import inspect +from abc import ABC +from typing import Any, Dict + +import torch +from lightning import LightningModule +from lightning.pytorch.utilities import grad_norm + +from matcha import utils +from matcha.utils.utils import plot_tensor + +log = utils.get_pylogger(__name__) + + +class BaseLightningClass(LightningModule, ABC): + def update_data_statistics(self, data_statistics): + if data_statistics is None: + data_statistics = { + "mel_mean": 0.0, + "mel_std": 1.0, + } + + self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"])) + self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"])) + + def configure_optimizers(self) -> Any: + optimizer = self.hparams.optimizer(params=self.parameters()) + if self.hparams.scheduler not in (None, {}): + scheduler_args = {} + # Manage last epoch for exponential schedulers + if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters: + if hasattr(self, "ckpt_loaded_epoch"): + current_epoch = self.ckpt_loaded_epoch - 1 + else: + current_epoch = -1 + + scheduler_args.update({"optimizer": optimizer}) + scheduler = self.hparams.scheduler.scheduler(**scheduler_args) + scheduler.last_epoch = current_epoch + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": self.hparams.scheduler.lightning_args.interval, + "frequency": self.hparams.scheduler.lightning_args.frequency, + "name": "learning_rate", + }, + } + + return {"optimizer": optimizer} + + def get_losses(self, batch): + x, x_lengths = batch["x"], batch["x_lengths"] + y, y_lengths = batch["y"], batch["y_lengths"] + spks = batch["spks"] + + dur_loss, prior_loss, diff_loss = self( + x=x, + x_lengths=x_lengths, + y=y, + y_lengths=y_lengths, + spks=spks, + out_size=self.out_size, + ) + return { + "dur_loss": dur_loss, + "prior_loss": prior_loss, + "diff_loss": diff_loss, + } + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + self.ckpt_loaded_epoch = checkpoint["epoch"] # pylint: disable=attribute-defined-outside-init + + def training_step(self, batch: Any, batch_idx: int): + loss_dict = self.get_losses(batch) + self.log( + "step", + float(self.global_step), + on_step=True, + prog_bar=True, + logger=True, + sync_dist=True, + ) + + self.log( + "sub_loss/train_dur_loss", + loss_dict["dur_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + self.log( + "sub_loss/train_prior_loss", + loss_dict["prior_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + self.log( + "sub_loss/train_diff_loss", + loss_dict["diff_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + + total_loss = sum(loss_dict.values()) + self.log( + "loss/train", + total_loss, + on_step=True, + on_epoch=True, + logger=True, + prog_bar=True, + sync_dist=True, + ) + + return {"loss": total_loss, "log": loss_dict} + + def validation_step(self, batch: Any, batch_idx: int): + loss_dict = self.get_losses(batch) + self.log( + "sub_loss/val_dur_loss", + loss_dict["dur_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + self.log( + "sub_loss/val_prior_loss", + loss_dict["prior_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + self.log( + "sub_loss/val_diff_loss", + loss_dict["diff_loss"], + on_step=True, + on_epoch=True, + logger=True, + sync_dist=True, + ) + + total_loss = sum(loss_dict.values()) + self.log( + "loss/val", + total_loss, + on_step=True, + on_epoch=True, + logger=True, + prog_bar=True, + sync_dist=True, + ) + + return total_loss + + def on_validation_end(self) -> None: + if self.trainer.is_global_zero: + one_batch = next(iter(self.trainer.val_dataloaders)) + if self.current_epoch == 0: + log.debug("Plotting original samples") + for i in range(2): + y = one_batch["y"][i].unsqueeze(0).to(self.device) + self.logger.experiment.add_image( + f"original/{i}", + plot_tensor(y.squeeze().cpu()), + self.current_epoch, + dataformats="HWC", + ) + + log.debug("Synthesising...") + for i in range(2): + x = one_batch["x"][i].unsqueeze(0).to(self.device) + x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device) + spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None + output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks) + y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"] + attn = output["attn"] + self.logger.experiment.add_image( + f"generated_enc/{i}", + plot_tensor(y_enc.squeeze().cpu()), + self.current_epoch, + dataformats="HWC", + ) + self.logger.experiment.add_image( + f"generated_dec/{i}", + plot_tensor(y_dec.squeeze().cpu()), + self.current_epoch, + dataformats="HWC", + ) + self.logger.experiment.add_image( + f"alignment/{i}", + plot_tensor(attn.squeeze().cpu()), + self.current_epoch, + dataformats="HWC", + ) + + def on_before_optimizer_step(self, optimizer): + self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()}) diff --git a/third_party/Matcha-TTS/matcha/models/components/__init__.py b/third_party/Matcha-TTS/matcha/models/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/models/components/__pycache__/__init__.cpython-310.pyc b/third_party/Matcha-TTS/matcha/models/components/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99ec1f7f48fd517790822a038355e93190d453cd Binary files /dev/null and b/third_party/Matcha-TTS/matcha/models/components/__pycache__/__init__.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/models/components/__pycache__/decoder.cpython-310.pyc b/third_party/Matcha-TTS/matcha/models/components/__pycache__/decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8368c527f466a7fd4a102df8ff8045c8effd0216 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/models/components/__pycache__/decoder.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/models/components/__pycache__/flow_matching.cpython-310.pyc b/third_party/Matcha-TTS/matcha/models/components/__pycache__/flow_matching.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c952ef1f8958ec4b30356d3c32f8ce15fccd302d Binary files /dev/null and b/third_party/Matcha-TTS/matcha/models/components/__pycache__/flow_matching.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/models/components/__pycache__/transformer.cpython-310.pyc b/third_party/Matcha-TTS/matcha/models/components/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1772334c1a3730f72443b643ab210813feccce1 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/models/components/__pycache__/transformer.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/models/components/decoder.py b/third_party/Matcha-TTS/matcha/models/components/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..1137cd7008e9d07b4f306926a82e44c2b2cddbdf --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/components/decoder.py @@ -0,0 +1,443 @@ +import math +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from conformer import ConformerBlock +from diffusers.models.activations import get_activation +from einops import pack, rearrange, repeat + +from matcha.models.components.transformer import BasicTransformerBlock + + +class SinusoidalPosEmb(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even" + + def forward(self, x, scale=1000): + if x.ndim < 1: + x = x.unsqueeze(0) + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) + emb = scale * x.unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + + +class Block1D(torch.nn.Module): + def __init__(self, dim, dim_out, groups=8): + super().__init__() + self.block = torch.nn.Sequential( + torch.nn.Conv1d(dim, dim_out, 3, padding=1), + torch.nn.GroupNorm(groups, dim_out), + nn.Mish(), + ) + + def forward(self, x, mask): + output = self.block(x * mask) + return output * mask + + +class ResnetBlock1D(torch.nn.Module): + def __init__(self, dim, dim_out, time_emb_dim, groups=8): + super().__init__() + self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)) + + self.block1 = Block1D(dim, dim_out, groups=groups) + self.block2 = Block1D(dim_out, dim_out, groups=groups) + + self.res_conv = torch.nn.Conv1d(dim, dim_out, 1) + + def forward(self, x, mask, time_emb): + h = self.block1(x, mask) + h += self.mlp(time_emb).unsqueeze(-1) + h = self.block2(h, mask) + output = h + self.res_conv(x * mask) + return output + + +class Downsample1D(nn.Module): + def __init__(self, dim): + super().__init__() + self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class TimestepEmbedding(nn.Module): + def __init__( + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + ): + super().__init__() + + self.linear_1 = nn.Linear(in_channels, time_embed_dim) + + if cond_proj_dim is not None: + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + else: + self.cond_proj = None + + self.act = get_activation(act_fn) + + if out_dim is not None: + time_embed_dim_out = out_dim + else: + time_embed_dim_out = time_embed_dim + self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) + + if post_act_fn is None: + self.post_act = None + else: + self.post_act = get_activation(post_act_fn) + + def forward(self, sample, condition=None): + if condition is not None: + sample = sample + self.cond_proj(condition) + sample = self.linear_1(sample) + + if self.act is not None: + sample = self.act(sample) + + sample = self.linear_2(sample) + + if self.post_act is not None: + sample = self.post_act(sample) + return sample + + +class Upsample1D(nn.Module): + """A 1D upsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + use_conv_transpose (`bool`, default `False`): + option to use a convolution transpose. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + """ + + def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + + self.conv = None + if use_conv_transpose: + self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1) + elif use_conv: + self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) + + def forward(self, inputs): + assert inputs.shape[1] == self.channels + if self.use_conv_transpose: + return self.conv(inputs) + + outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") + + if self.use_conv: + outputs = self.conv(outputs) + + return outputs + + +class ConformerWrapper(ConformerBlock): + def __init__( # pylint: disable=useless-super-delegation + self, + *, + dim, + dim_head=64, + heads=8, + ff_mult=4, + conv_expansion_factor=2, + conv_kernel_size=31, + attn_dropout=0, + ff_dropout=0, + conv_dropout=0, + conv_causal=False, + ): + super().__init__( + dim=dim, + dim_head=dim_head, + heads=heads, + ff_mult=ff_mult, + conv_expansion_factor=conv_expansion_factor, + conv_kernel_size=conv_kernel_size, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + conv_dropout=conv_dropout, + conv_causal=conv_causal, + ) + + def forward( + self, + hidden_states, + attention_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + ): + return super().forward(x=hidden_states, mask=attention_mask.bool()) + + +class Decoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + channels=(256, 256), + dropout=0.05, + attention_head_dim=64, + n_blocks=1, + num_mid_blocks=2, + num_heads=4, + act_fn="snake", + down_block_type="transformer", + mid_block_type="transformer", + up_block_type="transformer", + ): + super().__init__() + channels = tuple(channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.time_embeddings = SinusoidalPosEmb(in_channels) + time_embed_dim = channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=in_channels, + time_embed_dim=time_embed_dim, + act_fn="silu", + ) + + self.down_blocks = nn.ModuleList([]) + self.mid_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + output_channel = in_channels + for i in range(len(channels)): # pylint: disable=consider-using-enumerate + input_channel = output_channel + output_channel = channels[i] + is_last = i == len(channels) - 1 + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + transformer_blocks = nn.ModuleList( + [ + self.get_block( + down_block_type, + output_channel, + attention_head_dim, + num_heads, + dropout, + act_fn, + ) + for _ in range(n_blocks) + ] + ) + downsample = ( + Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + + self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) + + for i in range(num_mid_blocks): + input_channel = channels[-1] + out_channels = channels[-1] + + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + + transformer_blocks = nn.ModuleList( + [ + self.get_block( + mid_block_type, + output_channel, + attention_head_dim, + num_heads, + dropout, + act_fn, + ) + for _ in range(n_blocks) + ] + ) + + self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) + + channels = channels[::-1] + (channels[0],) + for i in range(len(channels) - 1): + input_channel = channels[i] + output_channel = channels[i + 1] + is_last = i == len(channels) - 2 + + resnet = ResnetBlock1D( + dim=2 * input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) + transformer_blocks = nn.ModuleList( + [ + self.get_block( + up_block_type, + output_channel, + attention_head_dim, + num_heads, + dropout, + act_fn, + ) + for _ in range(n_blocks) + ] + ) + upsample = ( + Upsample1D(output_channel, use_conv_transpose=True) + if not is_last + else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + + self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) + + self.final_block = Block1D(channels[-1], channels[-1]) + self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) + + self.initialize_weights() + # nn.init.normal_(self.final_proj.weight) + + @staticmethod + def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn): + if block_type == "conformer": + block = ConformerWrapper( + dim=dim, + dim_head=attention_head_dim, + heads=num_heads, + ff_mult=1, + conv_expansion_factor=2, + ff_dropout=dropout, + attn_dropout=dropout, + conv_dropout=dropout, + conv_kernel_size=31, + ) + elif block_type == "transformer": + block = BasicTransformerBlock( + dim=dim, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + else: + raise ValueError(f"Unknown block type {block_type}") + + return block + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x, mask, mu, t, spks=None, cond=None): + """Forward pass of the UNet1DConditional model. + + Args: + x (torch.Tensor): shape (batch_size, in_channels, time) + mask (_type_): shape (batch_size, 1, time) + t (_type_): shape (batch_size) + spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. + cond (_type_, optional): placeholder for future use. Defaults to None. + + Raises: + ValueError: _description_ + ValueError: _description_ + + Returns: + _type_: _description_ + """ + + t = self.time_embeddings(t) + t = self.time_mlp(t) + + x = pack([x, mu], "b * t")[0] + + if spks is not None: + spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) + x = pack([x, spks], "b * t")[0] + + hiddens = [] + masks = [mask] + for resnet, transformer_blocks, downsample in self.down_blocks: + mask_down = masks[-1] + x = resnet(x, mask_down, t) + x = rearrange(x, "b c t -> b t c") + mask_down = rearrange(mask_down, "b 1 t -> b t") + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=mask_down, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t") + mask_down = rearrange(mask_down, "b t -> b 1 t") + hiddens.append(x) # Save hidden states for skip connections + x = downsample(x * mask_down) + masks.append(mask_down[:, :, ::2]) + + masks = masks[:-1] + mask_mid = masks[-1] + + for resnet, transformer_blocks in self.mid_blocks: + x = resnet(x, mask_mid, t) + x = rearrange(x, "b c t -> b t c") + mask_mid = rearrange(mask_mid, "b 1 t -> b t") + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=mask_mid, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t") + mask_mid = rearrange(mask_mid, "b t -> b 1 t") + + for resnet, transformer_blocks, upsample in self.up_blocks: + mask_up = masks.pop() + x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t) + x = rearrange(x, "b c t -> b t c") + mask_up = rearrange(mask_up, "b 1 t -> b t") + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=mask_up, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t") + mask_up = rearrange(mask_up, "b t -> b 1 t") + x = upsample(x * mask_up) + + x = self.final_block(x, mask_up) + output = self.final_proj(x * mask_up) + + return output * mask diff --git a/third_party/Matcha-TTS/matcha/models/components/flow_matching.py b/third_party/Matcha-TTS/matcha/models/components/flow_matching.py new file mode 100644 index 0000000000000000000000000000000000000000..5cad7431ef66a8d11da32a77c1af7f6e31d6b774 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/components/flow_matching.py @@ -0,0 +1,132 @@ +from abc import ABC + +import torch +import torch.nn.functional as F + +from matcha.models.components.decoder import Decoder +from matcha.utils.pylogger import get_pylogger + +log = get_pylogger(__name__) + + +class BASECFM(torch.nn.Module, ABC): + def __init__( + self, + n_feats, + cfm_params, + n_spks=1, + spk_emb_dim=128, + ): + super().__init__() + self.n_feats = n_feats + self.n_spks = n_spks + self.spk_emb_dim = spk_emb_dim + self.solver = cfm_params.solver + if hasattr(cfm_params, "sigma_min"): + self.sigma_min = cfm_params.sigma_min + else: + self.sigma_min = 1e-4 + + self.estimator = None + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + z = torch.randn_like(mu) * temperature + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) + + def solve_euler(self, x, t_span, mu, mask, spks, cond): + """ + Fixed euler solver for ODEs. + Args: + x (torch.Tensor): random noise + t_span (torch.Tensor): n_timesteps interpolated + shape: (n_timesteps + 1,) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + """ + t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] + + # I am storing this because I can later plot it by putting a debugger here and saving it to a file + # Or in future might add like a return_all_steps flag + sol = [] + + for step in range(1, len(t_span)): + dphi_dt = self.estimator(x, mask, mu, t, spks, cond) + + x = x + dt * dphi_dt + t = t + dt + sol.append(x) + if step < len(t_span) - 1: + dt = t_span[step + 1] - t + + return sol[-1] + + def compute_loss(self, x1, mask, mu, spks=None, cond=None): + """Computes diffusion loss + + Args: + x1 (torch.Tensor): Target + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): target mask + shape: (batch_size, 1, mel_timesteps) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + spks (torch.Tensor, optional): speaker embedding. Defaults to None. + shape: (batch_size, spk_emb_dim) + + Returns: + loss: conditional flow matching loss + y: conditional flow + shape: (batch_size, n_feats, mel_timesteps) + """ + b, _, t = mu.shape + + # random timestep + t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) + # sample noise p(x_0) + z = torch.randn_like(x1) + + y = (1 - (1 - self.sigma_min) * t) * z + t * x1 + u = x1 - (1 - self.sigma_min) * z + + loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / ( + torch.sum(mask) * u.shape[1] + ) + return loss, y + + +class CFM(BASECFM): + def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64): + super().__init__( + n_feats=in_channels, + cfm_params=cfm_params, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + + in_channels = in_channels + (spk_emb_dim if n_spks > 1 else 0) + # Just change the architecture of the estimator here + self.estimator = Decoder(in_channels=in_channels, out_channels=out_channel, **decoder_params) diff --git a/third_party/Matcha-TTS/matcha/models/components/text_encoder.py b/third_party/Matcha-TTS/matcha/models/components/text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a388d05d6351fa2c9d9632fed0942d51fbec067b --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/components/text_encoder.py @@ -0,0 +1,410 @@ +""" from https://github.com/jaywalnut310/glow-tts """ + +import math + +import torch +import torch.nn as nn +from einops import rearrange + +import matcha.utils as utils +from matcha.utils.model import sequence_mask + +log = utils.get_pylogger(__name__) + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-4): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + n_dims = len(x.shape) + mean = torch.mean(x, 1, keepdim=True) + variance = torch.mean((x - mean) ** 2, 1, keepdim=True) + + x = (x - mean) * torch.rsqrt(variance + self.eps) + + shape = [1, -1] + [1] * (n_dims - 2) + x = x * self.gamma.view(*shape) + self.beta.view(*shape) + return x + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.conv_layers = torch.nn.ModuleList() + self.norm_layers = torch.nn.ModuleList() + self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + torch.nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DurationPredictor(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.p_dropout = p_dropout + + self.drop = torch.nn.Dropout(p_dropout) + self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_1 = LayerNorm(filter_channels) + self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_2 = LayerNorm(filter_channels) + self.proj = torch.nn.Conv1d(filter_channels, 1, 1) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class RotaryPositionalEmbeddings(nn.Module): + """ + ## RoPE module + + Rotary encoding transforms pairs of features by rotating in the 2D plane. + That is, it organizes the $d$ features as $\frac{d}{2}$ pairs. + Each pair can be considered a coordinate in a 2D plane, and the encoding will rotate it + by an angle depending on the position of the token. + """ + + def __init__(self, d: int, base: int = 10_000): + r""" + * `d` is the number of features $d$ + * `base` is the constant used for calculating $\Theta$ + """ + super().__init__() + + self.base = base + self.d = int(d) + self.cos_cached = None + self.sin_cached = None + + def _build_cache(self, x: torch.Tensor): + r""" + Cache $\cos$ and $\sin$ values + """ + # Return if cache is already built + if self.cos_cached is not None and x.shape[0] <= self.cos_cached.shape[0]: + return + + # Get sequence length + seq_len = x.shape[0] + + # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (self.base ** (torch.arange(0, self.d, 2).float() / self.d)).to(x.device) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, device=x.device).float().to(x.device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.einsum("n,d->nd", seq_idx, theta) + + # Concatenate so that for row $m$ we have + # $[m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}, m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}]$ + idx_theta2 = torch.cat([idx_theta, idx_theta], dim=1) + + # Cache them + self.cos_cached = idx_theta2.cos()[:, None, None, :] + self.sin_cached = idx_theta2.sin()[:, None, None, :] + + def _neg_half(self, x: torch.Tensor): + # $\frac{d}{2}$ + d_2 = self.d // 2 + + # Calculate $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$ + return torch.cat([-x[:, :, :, d_2:], x[:, :, :, :d_2]], dim=-1) + + def forward(self, x: torch.Tensor): + """ + * `x` is the Tensor at the head of a key or a query with shape `[seq_len, batch_size, n_heads, d]` + """ + # Cache $\cos$ and $\sin$ values + x = rearrange(x, "b h t d -> t b h d") + + self._build_cache(x) + + # Split the features, we can choose to apply rotary embeddings only to a partial set of features. + x_rope, x_pass = x[..., : self.d], x[..., self.d :] + + # Calculate + # $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$ + neg_half_x = self._neg_half(x_rope) + + x_rope = (x_rope * self.cos_cached[: x.shape[0]]) + (neg_half_x * self.sin_cached[: x.shape[0]]) + + return rearrange(torch.cat((x_rope, x_pass), dim=-1), "t b h d -> b h t d") + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + heads_share=True, + p_dropout=0.0, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.heads_share = heads_share + self.proximal_bias = proximal_bias + self.p_dropout = p_dropout + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = torch.nn.Conv1d(channels, channels, 1) + self.conv_k = torch.nn.Conv1d(channels, channels, 1) + self.conv_v = torch.nn.Conv1d(channels, channels, 1) + + # from https://nn.labml.ai/transformers/rope/index.html + self.query_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5) + self.key_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5) + + self.conv_o = torch.nn.Conv1d(channels, out_channels, 1) + self.drop = torch.nn.Dropout(p_dropout) + + torch.nn.init.xavier_uniform_(self.conv_q.weight) + torch.nn.init.xavier_uniform_(self.conv_k.weight) + if proximal_init: + self.conv_k.weight.data.copy_(self.conv_q.weight.data) + self.conv_k.bias.data.copy_(self.conv_q.bias.data) + torch.nn.init.xavier_uniform_(self.conv_v.weight) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = rearrange(query, "b (h c) t-> b h t c", h=self.n_heads) + key = rearrange(key, "b (h c) t-> b h t c", h=self.n_heads) + value = rearrange(value, "b (h c) t-> b h t c", h=self.n_heads) + + query = self.query_rotary_pe(query) + key = self.key_rotary_pe(key) + + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels) + + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + p_attn = torch.nn.functional.softmax(scores, dim=-1) + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) + return output, p_attn + + @staticmethod + def _attention_bias_proximal(length): + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size, padding=kernel_size // 2) + self.drop = torch.nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + return x * x_mask + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + **kwargs, + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.drop = torch.nn.Dropout(p_dropout) + self.attn_layers = torch.nn.ModuleList() + self.norm_layers_1 = torch.nn.ModuleList() + self.ffn_layers = torch.nn.ModuleList() + self.norm_layers_2 = torch.nn.ModuleList() + for _ in range(self.n_layers): + self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + for i in range(self.n_layers): + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class TextEncoder(nn.Module): + def __init__( + self, + encoder_type, + encoder_params, + duration_predictor_params, + n_vocab, + n_spks=1, + spk_emb_dim=128, + ): + super().__init__() + self.encoder_type = encoder_type + self.n_vocab = n_vocab + self.n_feats = encoder_params.n_feats + self.n_channels = encoder_params.n_channels + self.spk_emb_dim = spk_emb_dim + self.n_spks = n_spks + + self.emb = torch.nn.Embedding(n_vocab, self.n_channels) + torch.nn.init.normal_(self.emb.weight, 0.0, self.n_channels**-0.5) + + if encoder_params.prenet: + self.prenet = ConvReluNorm( + self.n_channels, + self.n_channels, + self.n_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + else: + self.prenet = lambda x, x_mask: x + + self.encoder = Encoder( + encoder_params.n_channels + (spk_emb_dim if n_spks > 1 else 0), + encoder_params.filter_channels, + encoder_params.n_heads, + encoder_params.n_layers, + encoder_params.kernel_size, + encoder_params.p_dropout, + ) + + self.proj_m = torch.nn.Conv1d(self.n_channels + (spk_emb_dim if n_spks > 1 else 0), self.n_feats, 1) + self.proj_w = DurationPredictor( + self.n_channels + (spk_emb_dim if n_spks > 1 else 0), + duration_predictor_params.filter_channels_dp, + duration_predictor_params.kernel_size, + duration_predictor_params.p_dropout, + ) + + def forward(self, x, x_lengths, spks=None): + """Run forward pass to the transformer based encoder and duration predictor + + Args: + x (torch.Tensor): text input + shape: (batch_size, max_text_length) + x_lengths (torch.Tensor): text input lengths + shape: (batch_size,) + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size,) + + Returns: + mu (torch.Tensor): average output of the encoder + shape: (batch_size, n_feats, max_text_length) + logw (torch.Tensor): log duration predicted by the duration predictor + shape: (batch_size, 1, max_text_length) + x_mask (torch.Tensor): mask for the text input + shape: (batch_size, 1, max_text_length) + """ + x = self.emb(x) * math.sqrt(self.n_channels) + x = torch.transpose(x, 1, -1) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + x = self.prenet(x, x_mask) + if self.n_spks > 1: + x = torch.cat([x, spks.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1) + x = self.encoder(x, x_mask) + mu = self.proj_m(x) * x_mask + + x_dp = torch.detach(x) + logw = self.proj_w(x_dp, x_mask) + + return mu, logw, x_mask diff --git a/third_party/Matcha-TTS/matcha/models/components/transformer.py b/third_party/Matcha-TTS/matcha/models/components/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1afa3aff5383912209e508676c6885e13ef4ee --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/components/transformer.py @@ -0,0 +1,316 @@ +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from diffusers.models.attention import ( + GEGLU, + GELU, + AdaLayerNorm, + AdaLayerNormZero, + ApproximateGELU, +) +from diffusers.models.attention_processor import Attention +from diffusers.models.lora import LoRACompatibleLinear +from diffusers.utils.torch_utils import maybe_allow_in_graph + + +class SnakeBeta(nn.Module): + """ + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + """ + + def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True): + """ + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + """ + super().__init__() + self.in_features = out_features if isinstance(out_features, list) else [out_features] + self.proj = LoRACompatibleLinear(in_features, out_features) + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha) + self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha) + self.beta = nn.Parameter(torch.ones(self.in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + """ + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta ∶= x + 1/b * sin^2 (xa) + """ + x = self.proj(x) + if self.alpha_logscale: + alpha = torch.exp(self.alpha) + beta = torch.exp(self.beta) + else: + alpha = self.alpha + beta = self.beta + + x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2) + + return x + + +class FeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + + if activation_fn == "gelu": + act_fn = GELU(dim, inner_dim) + if activation_fn == "gelu-approximate": + act_fn = GELU(dim, inner_dim, approximate="tanh") + elif activation_fn == "geglu": + act_fn = GEGLU(dim, inner_dim) + elif activation_fn == "geglu-approximate": + act_fn = ApproximateGELU(dim, inner_dim) + elif activation_fn == "snakebeta": + act_fn = SnakeBeta(dim, inner_dim) + + self.net = nn.ModuleList([]) + # project in + self.net.append(act_fn) + # project dropout + self.net.append(nn.Dropout(dropout)) + # project out + self.net.append(LoRACompatibleLinear(inner_dim, dim_out)) + # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout + if final_dropout: + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states): + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + + +@maybe_allow_in_graph +class BasicTransformerBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + final_dropout: bool = False, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif self.use_ada_layer_norm_zero: + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + ) + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + # scale_qk=False, # uncomment this to not to use flash attention + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + ): + # Notice that normalization is always applied before the real computation in the following blocks. + # 1. Self-Attention + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + else: + norm_hidden_states = self.norm1(hidden_states) + + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = attn_output + hidden_states + + # 2. Cross-Attention + if self.attn2 is not None: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 3. Feed-forward + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size + ff_output = torch.cat( + [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)], + dim=self._chunk_dim, + ) + else: + ff_output = self.ff(norm_hidden_states) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = ff_output + hidden_states + + return hidden_states diff --git a/third_party/Matcha-TTS/matcha/models/matcha_tts.py b/third_party/Matcha-TTS/matcha/models/matcha_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..64b2c07fe8de4760aee1aed80d206112d30df55f --- /dev/null +++ b/third_party/Matcha-TTS/matcha/models/matcha_tts.py @@ -0,0 +1,239 @@ +import datetime as dt +import math +import random + +import torch + +import matcha.utils.monotonic_align as monotonic_align +from matcha import utils +from matcha.models.baselightningmodule import BaseLightningClass +from matcha.models.components.flow_matching import CFM +from matcha.models.components.text_encoder import TextEncoder +from matcha.utils.model import ( + denormalize, + duration_loss, + fix_len_compatibility, + generate_path, + sequence_mask, +) + +log = utils.get_pylogger(__name__) + + +class MatchaTTS(BaseLightningClass): # 🍵 + def __init__( + self, + n_vocab, + n_spks, + spk_emb_dim, + n_feats, + encoder, + decoder, + cfm, + data_statistics, + out_size, + optimizer=None, + scheduler=None, + prior_loss=True, + ): + super().__init__() + + self.save_hyperparameters(logger=False) + + self.n_vocab = n_vocab + self.n_spks = n_spks + self.spk_emb_dim = spk_emb_dim + self.n_feats = n_feats + self.out_size = out_size + self.prior_loss = prior_loss + + if n_spks > 1: + self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim) + + self.encoder = TextEncoder( + encoder.encoder_type, + encoder.encoder_params, + encoder.duration_predictor_params, + n_vocab, + n_spks, + spk_emb_dim, + ) + + self.decoder = CFM( + in_channels=2 * encoder.encoder_params.n_feats, + out_channel=encoder.encoder_params.n_feats, + cfm_params=cfm, + decoder_params=decoder, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + + self.update_data_statistics(data_statistics) + + @torch.inference_mode() + def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, spks=None, length_scale=1.0): + """ + Generates mel-spectrogram from text. Returns: + 1. encoder outputs + 2. decoder outputs + 3. generated alignment + + Args: + x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids. + shape: (batch_size, max_text_length) + x_lengths (torch.Tensor): lengths of texts in batch. + shape: (batch_size,) + n_timesteps (int): number of steps to use for reverse diffusion in decoder. + temperature (float, optional): controls variance of terminal distribution. + spks (bool, optional): speaker ids. + shape: (batch_size,) + length_scale (float, optional): controls speech pace. + Increase value to slow down generated speech and vice versa. + + Returns: + dict: { + "encoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length), + # Average mel spectrogram generated by the encoder + "decoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length), + # Refined mel spectrogram improved by the CFM + "attn": torch.Tensor, shape: (batch_size, max_text_length, max_mel_length), + # Alignment map between text and mel spectrogram + "mel": torch.Tensor, shape: (batch_size, n_feats, max_mel_length), + # Denormalized mel spectrogram + "mel_lengths": torch.Tensor, shape: (batch_size,), + # Lengths of mel spectrograms + "rtf": float, + # Real-time factor + """ + # For RTF computation + t = dt.datetime.now() + + if self.n_spks > 1: + # Get speaker embedding + spks = self.spk_emb(spks.long()) + + # Get encoder_outputs `mu_x` and log-scaled token durations `logw` + mu_x, logw, x_mask = self.encoder(x, x_lengths, spks) + + w = torch.exp(logw) * x_mask + w_ceil = torch.ceil(w) * length_scale + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_max_length = y_lengths.max() + y_max_length_ = fix_len_compatibility(y_max_length) + + # Using obtained durations `w` construct alignment map `attn` + y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype) + attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) + attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + + # Align encoded text and get mu_y + mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2)) + mu_y = mu_y.transpose(1, 2) + encoder_outputs = mu_y[:, :, :y_max_length] + + # Generate sample tracing the probability flow + decoder_outputs = self.decoder(mu_y, y_mask, n_timesteps, temperature, spks) + decoder_outputs = decoder_outputs[:, :, :y_max_length] + + t = (dt.datetime.now() - t).total_seconds() + rtf = t * 22050 / (decoder_outputs.shape[-1] * 256) + + return { + "encoder_outputs": encoder_outputs, + "decoder_outputs": decoder_outputs, + "attn": attn[:, :, :y_max_length], + "mel": denormalize(decoder_outputs, self.mel_mean, self.mel_std), + "mel_lengths": y_lengths, + "rtf": rtf, + } + + def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None, cond=None): + """ + Computes 3 losses: + 1. duration loss: loss between predicted token durations and those extracted by Monotinic Alignment Search (MAS). + 2. prior loss: loss between mel-spectrogram and encoder outputs. + 3. flow matching loss: loss between mel-spectrogram and decoder outputs. + + Args: + x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids. + shape: (batch_size, max_text_length) + x_lengths (torch.Tensor): lengths of texts in batch. + shape: (batch_size,) + y (torch.Tensor): batch of corresponding mel-spectrograms. + shape: (batch_size, n_feats, max_mel_length) + y_lengths (torch.Tensor): lengths of mel-spectrograms in batch. + shape: (batch_size,) + out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained. + Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size. + spks (torch.Tensor, optional): speaker ids. + shape: (batch_size,) + """ + if self.n_spks > 1: + # Get speaker embedding + spks = self.spk_emb(spks) + + # Get encoder_outputs `mu_x` and log-scaled token durations `logw` + mu_x, logw, x_mask = self.encoder(x, x_lengths, spks) + y_max_length = y.shape[-1] + + y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask) + attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) + + # Use MAS to find most likely alignment `attn` between text and mel-spectrogram + with torch.no_grad(): + const = -0.5 * math.log(2 * math.pi) * self.n_feats + factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device) + y_square = torch.matmul(factor.transpose(1, 2), y**2) + y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y) + mu_square = torch.sum(factor * (mu_x**2), 1).unsqueeze(-1) + log_prior = y_square - y_mu_double + mu_square + const + + attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1)) + attn = attn.detach() + + # Compute loss between predicted log-scaled durations and those obtained from MAS + # refered to as prior loss in the paper + logw_ = torch.log(1e-8 + torch.sum(attn.unsqueeze(1), -1)) * x_mask + dur_loss = duration_loss(logw, logw_, x_lengths) + + # Cut a small segment of mel-spectrogram in order to increase batch size + # - "Hack" taken from Grad-TTS, in case of Grad-TTS, we cannot train batch size 32 on a 24GB GPU without it + # - Do not need this hack for Matcha-TTS, but it works with it as well + if not isinstance(out_size, type(None)): + max_offset = (y_lengths - out_size).clamp(0) + offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy())) + out_offset = torch.LongTensor( + [torch.tensor(random.choice(range(start, end)) if end > start else 0) for start, end in offset_ranges] + ).to(y_lengths) + attn_cut = torch.zeros(attn.shape[0], attn.shape[1], out_size, dtype=attn.dtype, device=attn.device) + y_cut = torch.zeros(y.shape[0], self.n_feats, out_size, dtype=y.dtype, device=y.device) + + y_cut_lengths = [] + for i, (y_, out_offset_) in enumerate(zip(y, out_offset)): + y_cut_length = out_size + (y_lengths[i] - out_size).clamp(None, 0) + y_cut_lengths.append(y_cut_length) + cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length + y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper] + attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper] + + y_cut_lengths = torch.LongTensor(y_cut_lengths) + y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask) + + attn = attn_cut + y = y_cut + y_mask = y_cut_mask + + # Align encoded text with mel-spectrogram and get mu_y segment + mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2)) + mu_y = mu_y.transpose(1, 2) + + # Compute loss of the decoder + diff_loss, _ = self.decoder.compute_loss(x1=y, mask=y_mask, mu=mu_y, spks=spks, cond=cond) + + if self.prior_loss: + prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask) + prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats) + else: + prior_loss = 0 + + return dur_loss, prior_loss, diff_loss diff --git a/third_party/Matcha-TTS/matcha/onnx/__init__.py b/third_party/Matcha-TTS/matcha/onnx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/matcha/onnx/export.py b/third_party/Matcha-TTS/matcha/onnx/export.py new file mode 100644 index 0000000000000000000000000000000000000000..9b795086158e1ad8a4bb5cd92306f3fa765f71ea --- /dev/null +++ b/third_party/Matcha-TTS/matcha/onnx/export.py @@ -0,0 +1,181 @@ +import argparse +import random +from pathlib import Path + +import numpy as np +import torch +from lightning import LightningModule + +from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder + +DEFAULT_OPSET = 15 + +SEED = 1234 +random.seed(SEED) +np.random.seed(SEED) +torch.manual_seed(SEED) +torch.cuda.manual_seed(SEED) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + + +class MatchaWithVocoder(LightningModule): + def __init__(self, matcha, vocoder): + super().__init__() + self.matcha = matcha + self.vocoder = vocoder + + def forward(self, x, x_lengths, scales, spks=None): + mel, mel_lengths = self.matcha(x, x_lengths, scales, spks) + wavs = self.vocoder(mel).clamp(-1, 1) + lengths = mel_lengths * 256 + return wavs.squeeze(1), lengths + + +def get_exportable_module(matcha, vocoder, n_timesteps): + """ + Return an appropriate `LighteningModule` and output-node names + based on whether the vocoder is embedded in the final graph + """ + + def onnx_forward_func(x, x_lengths, scales, spks=None): + """ + Custom forward function for accepting + scaler parameters as tensors + """ + # Extract scaler parameters from tensors + temperature = scales[0] + length_scale = scales[1] + output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale) + return output["mel"], output["mel_lengths"] + + # Monkey-patch Matcha's forward function + matcha.forward = onnx_forward_func + + if vocoder is None: + model, output_names = matcha, ["mel", "mel_lengths"] + else: + model = MatchaWithVocoder(matcha, vocoder) + output_names = ["wav", "wav_lengths"] + return model, output_names + + +def get_inputs(is_multi_speaker): + """ + Create dummy inputs for tracing + """ + dummy_input_length = 50 + x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long) + x_lengths = torch.LongTensor([dummy_input_length]) + + # Scales + temperature = 0.667 + length_scale = 1.0 + scales = torch.Tensor([temperature, length_scale]) + + model_inputs = [x, x_lengths, scales] + input_names = [ + "x", + "x_lengths", + "scales", + ] + + if is_multi_speaker: + spks = torch.LongTensor([1]) + model_inputs.append(spks) + input_names.append("spks") + + return tuple(model_inputs), input_names + + +def main(): + parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX") + + parser.add_argument( + "checkpoint_path", + type=str, + help="Path to the model checkpoint", + ) + parser.add_argument("output", type=str, help="Path to output `.onnx` file") + parser.add_argument( + "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)" + ) + parser.add_argument( + "--vocoder-name", + type=str, + choices=list(VOCODER_URLS.keys()), + default=None, + help="Name of the vocoder to embed in the ONNX graph", + ) + parser.add_argument( + "--vocoder-checkpoint-path", + type=str, + default=None, + help="Vocoder checkpoint to embed in the ONNX graph for an `e2e` like experience", + ) + parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15") + + args = parser.parse_args() + + print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}") + print(f"Setting n_timesteps to {args.n_timesteps}") + + checkpoint_path = Path(args.checkpoint_path) + matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu") + + if args.vocoder_name or args.vocoder_checkpoint_path: + assert ( + args.vocoder_name and args.vocoder_checkpoint_path + ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph." + vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu") + else: + vocoder = None + + is_multi_speaker = matcha.n_spks > 1 + + dummy_input, input_names = get_inputs(is_multi_speaker) + model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps) + + # Set dynamic shape for inputs/outputs + dynamic_axes = { + "x": {0: "batch_size", 1: "time"}, + "x_lengths": {0: "batch_size"}, + } + + if vocoder is None: + dynamic_axes.update( + { + "mel": {0: "batch_size", 2: "time"}, + "mel_lengths": {0: "batch_size"}, + } + ) + else: + print("Embedding the vocoder in the ONNX graph") + dynamic_axes.update( + { + "wav": {0: "batch_size", 1: "time"}, + "wav_lengths": {0: "batch_size"}, + } + ) + + if is_multi_speaker: + dynamic_axes["spks"] = {0: "batch_size"} + + # Create the output directory (if not exists) + Path(args.output).parent.mkdir(parents=True, exist_ok=True) + + model.to_onnx( + args.output, + dummy_input, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + opset_version=args.opset, + export_params=True, + do_constant_folding=True, + ) + print(f"[🍵] ONNX model exported to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/third_party/Matcha-TTS/matcha/onnx/infer.py b/third_party/Matcha-TTS/matcha/onnx/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..89ca92559c6df3776a07a038d7838242a3d19189 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/onnx/infer.py @@ -0,0 +1,168 @@ +import argparse +import os +import warnings +from pathlib import Path +from time import perf_counter + +import numpy as np +import onnxruntime as ort +import soundfile as sf +import torch + +from matcha.cli import plot_spectrogram_to_numpy, process_text + + +def validate_args(args): + assert ( + args.text or args.file + ), "Either text or file must be provided Matcha-T(ea)TTS need sometext to whisk the waveforms." + assert args.temperature >= 0, "Sampling temperature cannot be negative" + assert args.speaking_rate >= 0, "Speaking rate must be greater than 0" + return args + + +def write_wavs(model, inputs, output_dir, external_vocoder=None): + if external_vocoder is None: + print("The provided model has the vocoder embedded in the graph.\nGenerating waveform directly") + t0 = perf_counter() + wavs, wav_lengths = model.run(None, inputs) + infer_secs = perf_counter() - t0 + mel_infer_secs = vocoder_infer_secs = None + else: + print("[🍵] Generating mel using Matcha") + mel_t0 = perf_counter() + mels, mel_lengths = model.run(None, inputs) + mel_infer_secs = perf_counter() - mel_t0 + print("Generating waveform from mel using external vocoder") + vocoder_inputs = {external_vocoder.get_inputs()[0].name: mels} + vocoder_t0 = perf_counter() + wavs = external_vocoder.run(None, vocoder_inputs)[0] + vocoder_infer_secs = perf_counter() - vocoder_t0 + wavs = wavs.squeeze(1) + wav_lengths = mel_lengths * 256 + infer_secs = mel_infer_secs + vocoder_infer_secs + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for i, (wav, wav_length) in enumerate(zip(wavs, wav_lengths)): + output_filename = output_dir.joinpath(f"output_{i + 1}.wav") + audio = wav[:wav_length] + print(f"Writing audio to {output_filename}") + sf.write(output_filename, audio, 22050, "PCM_24") + + wav_secs = wav_lengths.sum() / 22050 + print(f"Inference seconds: {infer_secs}") + print(f"Generated wav seconds: {wav_secs}") + rtf = infer_secs / wav_secs + if mel_infer_secs is not None: + mel_rtf = mel_infer_secs / wav_secs + print(f"Matcha RTF: {mel_rtf}") + if vocoder_infer_secs is not None: + vocoder_rtf = vocoder_infer_secs / wav_secs + print(f"Vocoder RTF: {vocoder_rtf}") + print(f"Overall RTF: {rtf}") + + +def write_mels(model, inputs, output_dir): + t0 = perf_counter() + mels, mel_lengths = model.run(None, inputs) + infer_secs = perf_counter() - t0 + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for i, mel in enumerate(mels): + output_stem = output_dir.joinpath(f"output_{i + 1}") + plot_spectrogram_to_numpy(mel.squeeze(), output_stem.with_suffix(".png")) + np.save(output_stem.with_suffix(".numpy"), mel) + + wav_secs = (mel_lengths * 256).sum() / 22050 + print(f"Inference seconds: {infer_secs}") + print(f"Generated wav seconds: {wav_secs}") + rtf = infer_secs / wav_secs + print(f"RTF: {rtf}") + + +def main(): + parser = argparse.ArgumentParser( + description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching" + ) + parser.add_argument( + "model", + type=str, + help="ONNX model to use", + ) + parser.add_argument("--vocoder", type=str, default=None, help="Vocoder to use (defaults to None)") + parser.add_argument("--text", type=str, default=None, help="Text to synthesize") + parser.add_argument("--file", type=str, default=None, help="Text file to synthesize") + parser.add_argument("--spk", type=int, default=None, help="Speaker ID") + parser.add_argument( + "--temperature", + type=float, + default=0.667, + help="Variance of the x0 noise (default: 0.667)", + ) + parser.add_argument( + "--speaking-rate", + type=float, + default=1.0, + help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)", + ) + parser.add_argument("--gpu", action="store_true", help="Use CPU for inference (default: use GPU if available)") + parser.add_argument( + "--output-dir", + type=str, + default=os.getcwd(), + help="Output folder to save results (default: current dir)", + ) + + args = parser.parse_args() + args = validate_args(args) + + if args.gpu: + providers = ["GPUExecutionProvider"] + else: + providers = ["CPUExecutionProvider"] + model = ort.InferenceSession(args.model, providers=providers) + + model_inputs = model.get_inputs() + model_outputs = list(model.get_outputs()) + + if args.text: + text_lines = args.text.splitlines() + else: + with open(args.file, encoding="utf-8") as file: + text_lines = file.read().splitlines() + + processed_lines = [process_text(0, line, "cpu") for line in text_lines] + x = [line["x"].squeeze() for line in processed_lines] + # Pad + x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True) + x = x.detach().cpu().numpy() + x_lengths = np.array([line["x_lengths"].item() for line in processed_lines], dtype=np.int64) + inputs = { + "x": x, + "x_lengths": x_lengths, + "scales": np.array([args.temperature, args.speaking_rate], dtype=np.float32), + } + is_multi_speaker = len(model_inputs) == 4 + if is_multi_speaker: + if args.spk is None: + args.spk = 0 + warn = "[!] Speaker ID not provided! Using speaker ID 0" + warnings.warn(warn, UserWarning) + inputs["spks"] = np.repeat(args.spk, x.shape[0]).astype(np.int64) + + has_vocoder_embedded = model_outputs[0].name == "wav" + if has_vocoder_embedded: + write_wavs(model, inputs, args.output_dir) + elif args.vocoder: + external_vocoder = ort.InferenceSession(args.vocoder, providers=providers) + write_wavs(model, inputs, args.output_dir, external_vocoder=external_vocoder) + else: + warn = "[!] A vocoder is not embedded in the graph nor an external vocoder is provided. The mel output will be written as numpy arrays to `*.npy` files in the output directory" + warnings.warn(warn, UserWarning) + write_mels(model, inputs, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/third_party/Matcha-TTS/matcha/text/__init__.py b/third_party/Matcha-TTS/matcha/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71a4b57891d3c06ad9f25493c1b40bc2f5962d17 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/text/__init__.py @@ -0,0 +1,53 @@ +""" from https://github.com/keithito/tacotron """ +from matcha.text import cleaners +from matcha.text.symbols import symbols + +# Mappings from symbol to numeric ID and vice versa: +_symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} # pylint: disable=unnecessary-comprehension + + +def text_to_sequence(text, cleaner_names): + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + Returns: + List of integers corresponding to the symbols in the text + """ + sequence = [] + + clean_text = _clean_text(text, cleaner_names) + for symbol in clean_text: + symbol_id = _symbol_to_id[symbol] + sequence += [symbol_id] + return sequence + + +def cleaned_text_to_sequence(cleaned_text): + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + """ + sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] + return sequence + + +def sequence_to_text(sequence): + """Converts a sequence of IDs back to a string""" + result = "" + for symbol_id in sequence: + s = _id_to_symbol[symbol_id] + result += s + return result + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception("Unknown cleaner: %s" % name) + text = cleaner(text) + return text diff --git a/third_party/Matcha-TTS/matcha/text/cleaners.py b/third_party/Matcha-TTS/matcha/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..5e8d96b681eb9f57356a1b86a7008e74b65ff44b --- /dev/null +++ b/third_party/Matcha-TTS/matcha/text/cleaners.py @@ -0,0 +1,116 @@ +""" from https://github.com/keithito/tacotron + +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). +""" + +import logging +import re + +import phonemizer +import piper_phonemize +from unidecode import unidecode + +# To avoid excessive logging we set the log level of the phonemizer package to Critical +critical_logger = logging.getLogger("phonemizer") +critical_logger.setLevel(logging.CRITICAL) + +# Intializing the phonemizer globally significantly reduces the speed +# now the phonemizer is not initialising at every call +# Might be less flexible, but it is much-much faster +global_phonemizer = phonemizer.backend.EspeakBackend( + language="en-us", + preserve_punctuation=True, + with_stress=True, + language_switch="remove-flags", + logger=critical_logger, +) + + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r"\s+") + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("mrs", "misess"), + ("mr", "mister"), + ("dr", "doctor"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("maj", "major"), + ("gen", "general"), + ("drs", "doctors"), + ("rev", "reverend"), + ("lt", "lieutenant"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("esq", "esquire"), + ("ltd", "limited"), + ("col", "colonel"), + ("ft", "fort"), + ] +] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, " ", text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + """Basic pipeline that lowercases and collapses whitespace without transliteration.""" + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + """Pipeline for non-English text that transliterates to ASCII.""" + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners2(text): + """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] + phonemes = collapse_whitespace(phonemes) + return phonemes + + +def english_cleaners_piper(text): + """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_abbreviations(text) + phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) + phonemes = collapse_whitespace(phonemes) + return phonemes diff --git a/third_party/Matcha-TTS/matcha/text/numbers.py b/third_party/Matcha-TTS/matcha/text/numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..f99a8686dcb73532091122613e74bd643a8a327f --- /dev/null +++ b/third_party/Matcha-TTS/matcha/text/numbers.py @@ -0,0 +1,71 @@ +""" from https://github.com/keithito/tacotron """ + +import re + +import inflect + +_inflect = inflect.engine() +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") +_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") +_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") +_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") +_number_re = re.compile(r"[0-9]+") + + +def _remove_commas(m): + return m.group(1).replace(",", "") + + +def _expand_decimal_point(m): + return m.group(1).replace(".", " point ") + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split(".") + if len(parts) > 2: + return match + " dollars" + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return f"{dollars} {dollar_unit}, {cents} {cent_unit}" + elif dollars: + dollar_unit = "dollar" if dollars == 1 else "dollars" + return f"{dollars} {dollar_unit}" + elif cents: + cent_unit = "cent" if cents == 1 else "cents" + return f"{cents} {cent_unit}" + else: + return "zero dollars" + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return "two thousand" + elif num > 2000 and num < 2010: + return "two thousand " + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + " hundred" + else: + return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") + else: + return _inflect.number_to_words(num, andword="") + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r"\1 pounds", text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/third_party/Matcha-TTS/matcha/text/symbols.py b/third_party/Matcha-TTS/matcha/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..7018df549a1e50c3be20416069b6913c641024bd --- /dev/null +++ b/third_party/Matcha-TTS/matcha/text/symbols.py @@ -0,0 +1,17 @@ +""" from https://github.com/keithito/tacotron + +Defines the set of symbols used in text input to the model. +""" +_pad = "_" +_punctuation = ';:,.!?¡¿—…"«»“” ' +_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +_letters_ipa = ( + "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +) + + +# Export all symbols: +symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + +# Special symbol ids +SPACE_ID = symbols.index(" ") diff --git a/third_party/Matcha-TTS/matcha/train.py b/third_party/Matcha-TTS/matcha/train.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d64c6c44af2622be5e6bf368616feb6619ed7e --- /dev/null +++ b/third_party/Matcha-TTS/matcha/train.py @@ -0,0 +1,122 @@ +from typing import Any, Dict, List, Optional, Tuple + +import hydra +import lightning as L +import rootutils +from lightning import Callback, LightningDataModule, LightningModule, Trainer +from lightning.pytorch.loggers import Logger +from omegaconf import DictConfig + +from matcha import utils + +rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) +# ------------------------------------------------------------------------------------ # +# the setup_root above is equivalent to: +# - adding project root dir to PYTHONPATH +# (so you don't need to force user to install project as a package) +# (necessary before importing any local modules e.g. `from src import utils`) +# - setting up PROJECT_ROOT environment variable +# (which is used as a base for paths in "configs/paths/default.yaml") +# (this way all filepaths are the same no matter where you run the code) +# - loading environment variables from ".env" in root dir +# +# you can remove it if you: +# 1. either install project as a package or move entry files to project root dir +# 2. set `root_dir` to "." in "configs/paths/default.yaml" +# +# more info: https://github.com/ashleve/rootutils +# ------------------------------------------------------------------------------------ # + + +log = utils.get_pylogger(__name__) + + +@utils.task_wrapper +def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Trains the model. Can additionally evaluate on a testset, using best weights obtained during + training. + + This method is wrapped in optional @task_wrapper decorator, that controls the behavior during + failure. Useful for multiruns, saving info about the crash, etc. + + :param cfg: A DictConfig configuration composed by Hydra. + :return: A tuple with metrics and dict with all instantiated objects. + """ + # set seed for random number generators in pytorch, numpy and python.random + if cfg.get("seed"): + L.seed_everything(cfg.seed, workers=True) + + log.info(f"Instantiating datamodule <{cfg.data._target_}>") # pylint: disable=protected-access + datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data) + + log.info(f"Instantiating model <{cfg.model._target_}>") # pylint: disable=protected-access + model: LightningModule = hydra.utils.instantiate(cfg.model) + + log.info("Instantiating callbacks...") + callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks")) + + log.info("Instantiating loggers...") + logger: List[Logger] = utils.instantiate_loggers(cfg.get("logger")) + + log.info(f"Instantiating trainer <{cfg.trainer._target_}>") # pylint: disable=protected-access + trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger) + + object_dict = { + "cfg": cfg, + "datamodule": datamodule, + "model": model, + "callbacks": callbacks, + "logger": logger, + "trainer": trainer, + } + + if logger: + log.info("Logging hyperparameters!") + utils.log_hyperparameters(object_dict) + + if cfg.get("train"): + log.info("Starting training!") + trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) + + train_metrics = trainer.callback_metrics + + if cfg.get("test"): + log.info("Starting testing!") + ckpt_path = trainer.checkpoint_callback.best_model_path + if ckpt_path == "": + log.warning("Best ckpt not found! Using current weights for testing...") + ckpt_path = None + trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) + log.info(f"Best ckpt path: {ckpt_path}") + + test_metrics = trainer.callback_metrics + + # merge train and test metrics + metric_dict = {**train_metrics, **test_metrics} + + return metric_dict, object_dict + + +@hydra.main(version_base="1.3", config_path="../configs", config_name="train.yaml") +def main(cfg: DictConfig) -> Optional[float]: + """Main entry point for training. + + :param cfg: DictConfig configuration composed by Hydra. + :return: Optional[float] with optimized metric value. + """ + # apply extra utilities + # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) + utils.extras(cfg) + + # train the model + metric_dict, _ = train(cfg) + + # safely retrieve metric value for hydra-based hyperparameter optimization + metric_value = utils.get_metric_value(metric_dict=metric_dict, metric_name=cfg.get("optimized_metric")) + + # return optimized metric + return metric_value + + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter diff --git a/third_party/Matcha-TTS/matcha/utils/__init__.py b/third_party/Matcha-TTS/matcha/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..074db6461184e8cbb86d977cb41d9ebd918e958a --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/__init__.py @@ -0,0 +1,5 @@ +from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers +from matcha.utils.logging_utils import log_hyperparameters +from matcha.utils.pylogger import get_pylogger +from matcha.utils.rich_utils import enforce_tags, print_config_tree +from matcha.utils.utils import extras, get_metric_value, task_wrapper diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/__init__.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb70706c0921a7ca56229119037e1e7a838b0249 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/instantiators.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/instantiators.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee3ab0d866d3e9d198e4622203d8b2f96eca0b3 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/instantiators.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/logging_utils.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/logging_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5618aa49c569d9cdb6842bfba09bff305b0b94f7 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/logging_utils.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/pylogger.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/pylogger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66c99ca2722893960aabbbf895efac2a5676aebe Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/pylogger.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/rich_utils.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/rich_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0219ee1bd6b941d6462dc3eef0d423213b250b7 Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/rich_utils.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/__pycache__/utils.cpython-310.pyc b/third_party/Matcha-TTS/matcha/utils/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e5342e4518e2e399e6f21bdba56a2c051bbf38b Binary files /dev/null and b/third_party/Matcha-TTS/matcha/utils/__pycache__/utils.cpython-310.pyc differ diff --git a/third_party/Matcha-TTS/matcha/utils/audio.py b/third_party/Matcha-TTS/matcha/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..0bcd74df47fb006f68deb5a5f4a4c2fb0aa84f57 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/audio.py @@ -0,0 +1,82 @@ +import numpy as np +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +from scipy.io.wavfile import read + +MAX_WAV_VALUE = 32768.0 + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window # pylint: disable=global-statement + if f"{str(fmax)}_{str(y.device)}" not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) + hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[str(y.device)], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + + spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py b/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..96a5382296426803f1010385d184af7bfc901290 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py @@ -0,0 +1,111 @@ +r""" +The file creates a pickle file where the values needed for loading of dataset is stored and the model can load it +when needed. + +Parameters from hparam.py will be used +""" +import argparse +import json +import os +import sys +from pathlib import Path + +import rootutils +import torch +from hydra import compose, initialize +from omegaconf import open_dict +from tqdm.auto import tqdm + +from matcha.data.text_mel_datamodule import TextMelDataModule +from matcha.utils.logging_utils import pylogger + +log = pylogger.get_pylogger(__name__) + + +def compute_data_statistics(data_loader: torch.utils.data.DataLoader, out_channels: int): + """Generate data mean and standard deviation helpful in data normalisation + + Args: + data_loader (torch.utils.data.Dataloader): _description_ + out_channels (int): mel spectrogram channels + """ + total_mel_sum = 0 + total_mel_sq_sum = 0 + total_mel_len = 0 + + for batch in tqdm(data_loader, leave=False): + mels = batch["y"] + mel_lengths = batch["y_lengths"] + + total_mel_len += torch.sum(mel_lengths) + total_mel_sum += torch.sum(mels) + total_mel_sq_sum += torch.sum(torch.pow(mels, 2)) + + data_mean = total_mel_sum / (total_mel_len * out_channels) + data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2)) + + return {"mel_mean": data_mean.item(), "mel_std": data_std.item()} + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "-i", + "--input-config", + type=str, + default="vctk.yaml", + help="The name of the yaml config file under configs/data", + ) + + parser.add_argument( + "-b", + "--batch-size", + type=int, + default="256", + help="Can have increased batch size for faster computation", + ) + + parser.add_argument( + "-f", + "--force", + action="store_true", + default=False, + required=False, + help="force overwrite the file", + ) + args = parser.parse_args() + output_file = Path(args.input_config).with_suffix(".json") + + if os.path.exists(output_file) and not args.force: + print("File already exists. Use -f to force overwrite") + sys.exit(1) + + with initialize(version_base="1.3", config_path="../../configs/data"): + cfg = compose(config_name=args.input_config, return_hydra_config=True, overrides=[]) + + root_path = rootutils.find_root(search_from=__file__, indicator=".project-root") + + with open_dict(cfg): + del cfg["hydra"] + del cfg["_target_"] + cfg["data_statistics"] = None + cfg["seed"] = 1234 + cfg["batch_size"] = args.batch_size + cfg["train_filelist_path"] = str(os.path.join(root_path, cfg["train_filelist_path"])) + cfg["valid_filelist_path"] = str(os.path.join(root_path, cfg["valid_filelist_path"])) + + text_mel_datamodule = TextMelDataModule(**cfg) + text_mel_datamodule.setup() + data_loader = text_mel_datamodule.train_dataloader() + log.info("Dataloader loaded! Now computing stats...") + params = compute_data_statistics(data_loader, cfg["n_feats"]) + print(params) + json.dump( + params, + open(output_file, "w"), + ) + + +if __name__ == "__main__": + main() diff --git a/third_party/Matcha-TTS/matcha/utils/instantiators.py b/third_party/Matcha-TTS/matcha/utils/instantiators.py new file mode 100644 index 0000000000000000000000000000000000000000..5547b4ed61ed8c21e63c528f58526a949879a94f --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/instantiators.py @@ -0,0 +1,56 @@ +from typing import List + +import hydra +from lightning import Callback +from lightning.pytorch.loggers import Logger +from omegaconf import DictConfig + +from matcha.utils import pylogger + +log = pylogger.get_pylogger(__name__) + + +def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: + """Instantiates callbacks from config. + + :param callbacks_cfg: A DictConfig object containing callback configurations. + :return: A list of instantiated callbacks. + """ + callbacks: List[Callback] = [] + + if not callbacks_cfg: + log.warning("No callback configs found! Skipping..") + return callbacks + + if not isinstance(callbacks_cfg, DictConfig): + raise TypeError("Callbacks config must be a DictConfig!") + + for _, cb_conf in callbacks_cfg.items(): + if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf: + log.info(f"Instantiating callback <{cb_conf._target_}>") # pylint: disable=protected-access + callbacks.append(hydra.utils.instantiate(cb_conf)) + + return callbacks + + +def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]: + """Instantiates loggers from config. + + :param logger_cfg: A DictConfig object containing logger configurations. + :return: A list of instantiated loggers. + """ + logger: List[Logger] = [] + + if not logger_cfg: + log.warning("No logger configs found! Skipping...") + return logger + + if not isinstance(logger_cfg, DictConfig): + raise TypeError("Logger config must be a DictConfig!") + + for _, lg_conf in logger_cfg.items(): + if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf: + log.info(f"Instantiating logger <{lg_conf._target_}>") # pylint: disable=protected-access + logger.append(hydra.utils.instantiate(lg_conf)) + + return logger diff --git a/third_party/Matcha-TTS/matcha/utils/logging_utils.py b/third_party/Matcha-TTS/matcha/utils/logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1a12d1ddafa25ca3ae8e497bcd7de2191f13659b --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/logging_utils.py @@ -0,0 +1,53 @@ +from typing import Any, Dict + +from lightning.pytorch.utilities import rank_zero_only +from omegaconf import OmegaConf + +from matcha.utils import pylogger + +log = pylogger.get_pylogger(__name__) + + +@rank_zero_only +def log_hyperparameters(object_dict: Dict[str, Any]) -> None: + """Controls which config parts are saved by Lightning loggers. + + Additionally saves: + - Number of model parameters + + :param object_dict: A dictionary containing the following objects: + - `"cfg"`: A DictConfig object containing the main config. + - `"model"`: The Lightning model. + - `"trainer"`: The Lightning trainer. + """ + hparams = {} + + cfg = OmegaConf.to_container(object_dict["cfg"]) + model = object_dict["model"] + trainer = object_dict["trainer"] + + if not trainer.logger: + log.warning("Logger not found! Skipping hyperparameter logging...") + return + + hparams["model"] = cfg["model"] + + # save number of model parameters + hparams["model/params/total"] = sum(p.numel() for p in model.parameters()) + hparams["model/params/trainable"] = sum(p.numel() for p in model.parameters() if p.requires_grad) + hparams["model/params/non_trainable"] = sum(p.numel() for p in model.parameters() if not p.requires_grad) + + hparams["data"] = cfg["data"] + hparams["trainer"] = cfg["trainer"] + + hparams["callbacks"] = cfg.get("callbacks") + hparams["extras"] = cfg.get("extras") + + hparams["task_name"] = cfg.get("task_name") + hparams["tags"] = cfg.get("tags") + hparams["ckpt_path"] = cfg.get("ckpt_path") + hparams["seed"] = cfg.get("seed") + + # send hparams to all loggers + for logger in trainer.loggers: + logger.log_hyperparams(hparams) diff --git a/third_party/Matcha-TTS/matcha/utils/model.py b/third_party/Matcha-TTS/matcha/utils/model.py new file mode 100644 index 0000000000000000000000000000000000000000..869cc6092f5952930534c47544fae88308e96abf --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/model.py @@ -0,0 +1,90 @@ +""" from https://github.com/jaywalnut310/glow-tts """ + +import numpy as np +import torch + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def fix_len_compatibility(length, num_downsamplings_in_unet=2): + factor = torch.scalar_tensor(2).pow(num_downsamplings_in_unet) + length = (length / factor).ceil() * factor + if not torch.onnx.is_in_onnx_export(): + return length.int().item() + else: + return length + + +def convert_pad_shape(pad_shape): + inverted_shape = pad_shape[::-1] + pad_shape = [item for sublist in inverted_shape for item in sublist] + return pad_shape + + +def generate_path(duration, mask): + device = duration.device + + b, t_x, t_y = mask.shape + cum_duration = torch.cumsum(duration, 1) + path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path * mask + return path + + +def duration_loss(logw, logw_, lengths): + loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths) + return loss + + +def normalize(data, mu, std): + if not isinstance(mu, (float, int)): + if isinstance(mu, list): + mu = torch.tensor(mu, dtype=data.dtype, device=data.device) + elif isinstance(mu, torch.Tensor): + mu = mu.to(data.device) + elif isinstance(mu, np.ndarray): + mu = torch.from_numpy(mu).to(data.device) + mu = mu.unsqueeze(-1) + + if not isinstance(std, (float, int)): + if isinstance(std, list): + std = torch.tensor(std, dtype=data.dtype, device=data.device) + elif isinstance(std, torch.Tensor): + std = std.to(data.device) + elif isinstance(std, np.ndarray): + std = torch.from_numpy(std).to(data.device) + std = std.unsqueeze(-1) + + return (data - mu) / std + + +def denormalize(data, mu, std): + if not isinstance(mu, float): + if isinstance(mu, list): + mu = torch.tensor(mu, dtype=data.dtype, device=data.device) + elif isinstance(mu, torch.Tensor): + mu = mu.to(data.device) + elif isinstance(mu, np.ndarray): + mu = torch.from_numpy(mu).to(data.device) + mu = mu.unsqueeze(-1) + + if not isinstance(std, float): + if isinstance(std, list): + std = torch.tensor(std, dtype=data.dtype, device=data.device) + elif isinstance(std, torch.Tensor): + std = std.to(data.device) + elif isinstance(std, np.ndarray): + std = torch.from_numpy(std).to(data.device) + std = std.unsqueeze(-1) + + return data * std + mu diff --git a/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py b/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eee6e0d47c2e3612ef02bc17442e6886998e5a94 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py @@ -0,0 +1,22 @@ +import numpy as np +import torch + +from matcha.utils.monotonic_align.core import maximum_path_c + + +def maximum_path(value, mask): + """Cython optimised version. + value: [b, t_x, t_y] + mask: [b, t_x, t_y] + """ + value = value * mask + device = value.device + dtype = value.dtype + value = value.data.cpu().numpy().astype(np.float32) + path = np.zeros_like(value).astype(np.int32) + mask = mask.data.cpu().numpy() + + t_x_max = mask.sum(1)[:, 0].astype(np.int32) + t_y_max = mask.sum(2)[:, 0].astype(np.int32) + maximum_path_c(path, value, t_x_max, t_y_max) + return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx b/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx new file mode 100644 index 0000000000000000000000000000000000000000..091fcc3a50a51f3d3fee47a70825260757e6d885 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx @@ -0,0 +1,47 @@ +import numpy as np + +cimport cython +cimport numpy as np + +from cython.parallel import prange + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: + cdef int x + cdef int y + cdef float v_prev + cdef float v_cur + cdef float tmp + cdef int index = t_x - 1 + + for y in range(t_y): + for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): + if x == y: + v_cur = max_neg_val + else: + v_cur = value[x, y-1] + if x == 0: + if y == 0: + v_prev = 0. + else: + v_prev = max_neg_val + else: + v_prev = value[x-1, y-1] + value[x, y] = max(v_cur, v_prev) + value[x, y] + + for y in range(t_y - 1, -1, -1): + path[index, y] = 1 + if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): + index = index - 1 + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: + cdef int b = values.shape[0] + + cdef int i + for i in prange(b, nogil=True): + maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) diff --git a/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py b/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..f22bc6a35a5a04c9e6d7b82040973722c9b770c9 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py @@ -0,0 +1,7 @@ +# from distutils.core import setup +# from Cython.Build import cythonize +# import numpy + +# setup(name='monotonic_align', +# ext_modules=cythonize("core.pyx"), +# include_dirs=[numpy.get_include()]) diff --git a/third_party/Matcha-TTS/matcha/utils/pylogger.py b/third_party/Matcha-TTS/matcha/utils/pylogger.py new file mode 100644 index 0000000000000000000000000000000000000000..61600678029362e110f655edb91d5f3bc5b1cd1c --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/pylogger.py @@ -0,0 +1,21 @@ +import logging + +from lightning.pytorch.utilities import rank_zero_only + + +def get_pylogger(name: str = __name__) -> logging.Logger: + """Initializes a multi-GPU-friendly python command line logger. + + :param name: The name of the logger, defaults to ``__name__``. + + :return: A logger object. + """ + logger = logging.getLogger(name) + + # this ensures all logging levels get marked with the rank zero decorator + # otherwise logs would get multiplied for each GPU process in multi-GPU setup + logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") + for level in logging_levels: + setattr(logger, level, rank_zero_only(getattr(logger, level))) + + return logger diff --git a/third_party/Matcha-TTS/matcha/utils/rich_utils.py b/third_party/Matcha-TTS/matcha/utils/rich_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f602f6e9351d948946eb419eb4e420190ea634bc --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/rich_utils.py @@ -0,0 +1,101 @@ +from pathlib import Path +from typing import Sequence + +import rich +import rich.syntax +import rich.tree +from hydra.core.hydra_config import HydraConfig +from lightning.pytorch.utilities import rank_zero_only +from omegaconf import DictConfig, OmegaConf, open_dict +from rich.prompt import Prompt + +from matcha.utils import pylogger + +log = pylogger.get_pylogger(__name__) + + +@rank_zero_only +def print_config_tree( + cfg: DictConfig, + print_order: Sequence[str] = ( + "data", + "model", + "callbacks", + "logger", + "trainer", + "paths", + "extras", + ), + resolve: bool = False, + save_to_file: bool = False, +) -> None: + """Prints the contents of a DictConfig as a tree structure using the Rich library. + + :param cfg: A DictConfig composed by Hydra. + :param print_order: Determines in what order config components are printed. Default is ``("data", "model", + "callbacks", "logger", "trainer", "paths", "extras")``. + :param resolve: Whether to resolve reference fields of DictConfig. Default is ``False``. + :param save_to_file: Whether to export config to the hydra output folder. Default is ``False``. + """ + style = "dim" + tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) + + queue = [] + + # add fields from `print_order` to queue + for field in print_order: + _ = ( + queue.append(field) + if field in cfg + else log.warning(f"Field '{field}' not found in config. Skipping '{field}' config printing...") + ) + + # add all the other fields to queue (not specified in `print_order`) + for field in cfg: + if field not in queue: + queue.append(field) + + # generate config tree from queue + for field in queue: + branch = tree.add(field, style=style, guide_style=style) + + config_group = cfg[field] + if isinstance(config_group, DictConfig): + branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) + else: + branch_content = str(config_group) + + branch.add(rich.syntax.Syntax(branch_content, "yaml")) + + # print config tree + rich.print(tree) + + # save config tree to file + if save_to_file: + with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: + rich.print(tree, file=file) + + +@rank_zero_only +def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: + """Prompts user to input tags from command line if no tags are provided in config. + + :param cfg: A DictConfig composed by Hydra. + :param save_to_file: Whether to export tags to the hydra output folder. Default is ``False``. + """ + if not cfg.get("tags"): + if "id" in HydraConfig().cfg.hydra.job: + raise ValueError("Specify tags before launching a multirun!") + + log.warning("No tags provided in config. Prompting user to input tags...") + tags = Prompt.ask("Enter a list of comma separated tags", default="dev") + tags = [t.strip() for t in tags.split(",") if t != ""] + + with open_dict(cfg): + cfg.tags = tags + + log.info(f"Tags: {cfg.tags}") + + if save_to_file: + with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: + rich.print(cfg.tags, file=file) diff --git a/third_party/Matcha-TTS/matcha/utils/utils.py b/third_party/Matcha-TTS/matcha/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..af65e09070b4a4786ad139ec6e3d57d5ef578204 --- /dev/null +++ b/third_party/Matcha-TTS/matcha/utils/utils.py @@ -0,0 +1,219 @@ +import os +import sys +import warnings +from importlib.util import find_spec +from pathlib import Path +from typing import Any, Callable, Dict, Tuple + +import gdown +import matplotlib.pyplot as plt +import numpy as np +import torch +import wget +from omegaconf import DictConfig + +from matcha.utils import pylogger, rich_utils + +log = pylogger.get_pylogger(__name__) + + +def extras(cfg: DictConfig) -> None: + """Applies optional utilities before the task is started. + + Utilities: + - Ignoring python warnings + - Setting tags from command line + - Rich config printing + + :param cfg: A DictConfig object containing the config tree. + """ + # return if no `extras` config + if not cfg.get("extras"): + log.warning("Extras config not found! ") + return + + # disable python warnings + if cfg.extras.get("ignore_warnings"): + log.info("Disabling python warnings! ") + warnings.filterwarnings("ignore") + + # prompt user to input tags from command line if none are provided in the config + if cfg.extras.get("enforce_tags"): + log.info("Enforcing tags! ") + rich_utils.enforce_tags(cfg, save_to_file=True) + + # pretty print config tree using Rich library + if cfg.extras.get("print_config"): + log.info("Printing config tree with Rich! ") + rich_utils.print_config_tree(cfg, resolve=True, save_to_file=True) + + +def task_wrapper(task_func: Callable) -> Callable: + """Optional decorator that controls the failure behavior when executing the task function. + + This wrapper can be used to: + - make sure loggers are closed even if the task function raises an exception (prevents multirun failure) + - save the exception to a `.log` file + - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later) + - etc. (adjust depending on your needs) + + Example: + ``` + @utils.task_wrapper + def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: + ... + return metric_dict, object_dict + ``` + + :param task_func: The task function to be wrapped. + + :return: The wrapped task function. + """ + + def wrap(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: + # execute the task + try: + metric_dict, object_dict = task_func(cfg=cfg) + + # things to do if exception occurs + except Exception as ex: + # save exception to `.log` file + log.exception("") + + # some hyperparameter combinations might be invalid or cause out-of-memory errors + # so when using hparam search plugins like Optuna, you might want to disable + # raising the below exception to avoid multirun failure + raise ex + + # things to always do after either success or exception + finally: + # display output dir path in terminal + log.info(f"Output dir: {cfg.paths.output_dir}") + + # always close wandb run (even if exception occurs so multirun won't fail) + if find_spec("wandb"): # check if wandb is installed + import wandb + + if wandb.run: + log.info("Closing wandb!") + wandb.finish() + + return metric_dict, object_dict + + return wrap + + +def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> float: + """Safely retrieves value of the metric logged in LightningModule. + + :param metric_dict: A dict containing metric values. + :param metric_name: The name of the metric to retrieve. + :return: The value of the metric. + """ + if not metric_name: + log.info("Metric name is None! Skipping metric value retrieval...") + return None + + if metric_name not in metric_dict: + raise ValueError( + f"Metric value not found! \n" + "Make sure metric name logged in LightningModule is correct!\n" + "Make sure `optimized_metric` name in `hparams_search` config is correct!" + ) + + metric_value = metric_dict[metric_name].item() + log.info(f"Retrieved metric value! <{metric_name}={metric_value}>") + + return metric_value + + +def intersperse(lst, item): + # Adds blank symbol + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def save_figure_to_numpy(fig): + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + return data + + +def plot_tensor(tensor): + plt.style.use("default") + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + return data + + +def save_plot(tensor, savepath): + plt.style.use("default") + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def to_numpy(tensor): + if isinstance(tensor, np.ndarray): + return tensor + elif isinstance(tensor, torch.Tensor): + return tensor.detach().cpu().numpy() + elif isinstance(tensor, list): + return np.array(tensor) + else: + raise TypeError("Unsupported type for conversion to numpy array") + + +def get_user_data_dir(appname="matcha_tts"): + """ + Args: + appname (str): Name of application + + Returns: + Path: path to user data directory + """ + + MATCHA_HOME = os.environ.get("MATCHA_HOME") + if MATCHA_HOME is not None: + ans = Path(MATCHA_HOME).expanduser().resolve(strict=False) + elif sys.platform == "win32": + import winreg # pylint: disable=import-outside-toplevel + + key = winreg.OpenKey( + winreg.HKEY_CURRENT_USER, + r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders", + ) + dir_, _ = winreg.QueryValueEx(key, "Local AppData") + ans = Path(dir_).resolve(strict=False) + elif sys.platform == "darwin": + ans = Path("~/Library/Application Support/").expanduser() + else: + ans = Path.home().joinpath(".local/share") + + final_path = ans.joinpath(appname) + final_path.mkdir(parents=True, exist_ok=True) + return final_path + + +def assert_model_downloaded(checkpoint_path, url, use_wget=True): + if Path(checkpoint_path).exists(): + log.debug(f"[+] Model already present at {checkpoint_path}!") + print(f"[+] Model already present at {checkpoint_path}!") + return + log.info(f"[-] Model not found at {checkpoint_path}! Will download it") + print(f"[-] Model not found at {checkpoint_path}! Will download it") + checkpoint_path = str(checkpoint_path) + if not use_wget: + gdown.download(url=url, output=checkpoint_path, quiet=False, fuzzy=True) + else: + wget.download(url=url, out=checkpoint_path) diff --git a/third_party/Matcha-TTS/notebooks/.gitkeep b/third_party/Matcha-TTS/notebooks/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/third_party/Matcha-TTS/pyproject.toml b/third_party/Matcha-TTS/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..74aa39300a61b8b3607dc634d68aa47013141ec5 --- /dev/null +++ b/third_party/Matcha-TTS/pyproject.toml @@ -0,0 +1,51 @@ +[build-system] +requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"] + +[tool.black] +line-length = 120 +target-version = ['py310'] +exclude = ''' + +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + )/ + | foo.py # also separately exclude a file named foo.py in + # the root of the project +) +''' + +[tool.pytest.ini_options] +addopts = [ + "--color=yes", + "--durations=0", + "--strict-markers", + "--doctest-modules", +] +filterwarnings = [ + "ignore::DeprecationWarning", + "ignore::UserWarning", +] +log_cli = "True" +markers = [ + "slow: slow tests", +] +minversion = "6.0" +testpaths = "tests/" + +[tool.coverage.report] +exclude_lines = [ + "pragma: nocover", + "raise NotImplementedError", + "raise NotImplementedError()", + "if __name__ == .__main__.:", +] diff --git a/third_party/Matcha-TTS/requirements.txt b/third_party/Matcha-TTS/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e14a532cb14f99190404472915213940bfad4b9 --- /dev/null +++ b/third_party/Matcha-TTS/requirements.txt @@ -0,0 +1,45 @@ +# --------- pytorch --------- # +torch>=2.0.0 +torchvision>=0.15.0 +lightning>=2.0.0 +torchmetrics>=0.11.4 + +# --------- hydra --------- # +hydra-core==1.3.2 +hydra-colorlog==1.2.0 +hydra-optuna-sweeper==1.2.0 + +# --------- loggers --------- # +# wandb +# neptune-client +# mlflow +# comet-ml +# aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 + +# --------- others --------- # +rootutils # standardizing the project root setup +pre-commit # hooks for applying linters on commit +rich # beautiful text formatting in terminal +pytest # tests +# sh # for running bash commands in some tests (linux/macos only) +phonemizer # phonemization of text +tensorboard +librosa +Cython +numpy +einops +inflect +Unidecode +scipy +torchaudio +matplotlib +pandas +conformer==0.3.2 +diffusers==0.25.0 +notebook +ipywidgets +gradio==3.43.2 +gdown +wget +seaborn +piper_phonemize diff --git a/third_party/Matcha-TTS/scripts/schedule.sh b/third_party/Matcha-TTS/scripts/schedule.sh new file mode 100644 index 0000000000000000000000000000000000000000..44b3da1116ef4d54e9acffee7d639d549e136d45 --- /dev/null +++ b/third_party/Matcha-TTS/scripts/schedule.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Schedule execution of many runs +# Run from root folder with: bash scripts/schedule.sh + +python src/train.py trainer.max_epochs=5 logger=csv + +python src/train.py trainer.max_epochs=10 logger=csv diff --git a/third_party/Matcha-TTS/setup.py b/third_party/Matcha-TTS/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..80d4aac04c6cd36859c5d753468ef2e105770098 --- /dev/null +++ b/third_party/Matcha-TTS/setup.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import os + +import numpy +from Cython.Build import cythonize +from setuptools import Extension, find_packages, setup + +exts = [ + Extension( + name="matcha.utils.monotonic_align.core", + sources=["matcha/utils/monotonic_align/core.pyx"], + ) +] + +with open("README.md", encoding="utf-8") as readme_file: + README = readme_file.read() + +cwd = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(cwd, "matcha", "VERSION")) as fin: + version = fin.read().strip() + +setup( + name="matcha-tts", + version=version, + description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching", + long_description=README, + long_description_content_type="text/markdown", + author="Shivam Mehta", + author_email="shivam.mehta25@gmail.com", + url="https://shivammehta25.github.io/Matcha-TTS", + install_requires=[str(r) for r in open(os.path.join(os.path.dirname(__file__), "requirements.txt"))], + include_dirs=[numpy.get_include()], + include_package_data=True, + packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]), + # use this to customize global commands available in the terminal after installing the package + entry_points={ + "console_scripts": [ + "matcha-data-stats=matcha.utils.generate_data_statistics:main", + "matcha-tts=matcha.cli:cli", + "matcha-tts-app=matcha.app:main", + ] + }, + ext_modules=cythonize(exts, language_level=3), + python_requires=">=3.9.0", +) diff --git a/third_party/Matcha-TTS/synthesis.ipynb b/third_party/Matcha-TTS/synthesis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..dfbde30b5ad98f1368be3aa181145a4eac97da93 --- /dev/null +++ b/third_party/Matcha-TTS/synthesis.ipynb @@ -0,0 +1,419 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37f4e3b-f764-4502-a6a2-6417bd9bfab9", + "metadata": {}, + "source": [ + "# Matcha-TTS: A fast TTS architecture with conditional flow matching\n", + "---\n", + "[Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)\n", + "\n", + "We introduce Matcha-TTS, a new encoder-decoder architecture for speedy TTS acoustic modelling, trained using optimal-transport conditional flow matching (OT-CFM). This yields an ODE-based decoder capable of high output quality in fewer synthesis steps than models trained using score matching. Careful design choices additionally ensure each synthesis step is fast to run. The method is probabilistic, non-autoregressive, and learns to speak from scratch without external alignments. Compared to strong pre-trained baseline models, the Matcha-TTS system has the smallest memory footprint, rivals the speed of the fastest models on long utterances, and attains the highest mean opinion score in a listening test.\n", + "\n", + "Demo Page: https://shivammehta25.github.io/Matcha-TTS \\\n", + "Code: https://github.com/shivammehta25/Matcha-TTS\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "148f4bc0-c28e-4670-9a5e-4c7928ab8992", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CUDA_VISIBLE_DEVICES=0\n" + ] + } + ], + "source": [ + "%env CUDA_VISIBLE_DEVICES=0" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8d5876c0-b47e-4c80-9e9c-62550f81b64e", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime as dt\n", + "from pathlib import Path\n", + "\n", + "import IPython.display as ipd\n", + "import numpy as np\n", + "import soundfile as sf\n", + "import torch\n", + "from tqdm.auto import tqdm\n", + "\n", + "# Hifigan imports\n", + "from matcha.hifigan.config import v1\n", + "from matcha.hifigan.denoiser import Denoiser\n", + "from matcha.hifigan.env import AttrDict\n", + "from matcha.hifigan.models import Generator as HiFiGAN\n", + "# Matcha imports\n", + "from matcha.models.matcha_tts import MatchaTTS\n", + "from matcha.text import sequence_to_text, text_to_sequence\n", + "from matcha.utils.model import denormalize\n", + "from matcha.utils.utils import get_user_data_dir, intersperse" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b1a30306-588c-4f22-8d9b-e2676880b0e5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "# This allows for real time code changes being reflected in the notebook, no need to restart the kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a312856b-01a9-4d75-a4c8-4666dffa0692", + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "88f3b3c3-d014-443b-84eb-e143cdec3e21", + "metadata": {}, + "source": [ + "## Filepaths" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7640a4c1-44ce-447c-a8ff-45012fb7bddd", + "metadata": {}, + "outputs": [], + "source": [ + "MATCHA_CHECKPOINT = get_user_data_dir()/\"matcha_ljspeech.ckpt\"\n", + "HIFIGAN_CHECKPOINT = get_user_data_dir() / \"hifigan_T2_v1\"\n", + "OUTPUT_FOLDER = \"synth_output\"" + ] + }, + { + "cell_type": "markdown", + "id": "6477a3a9-71f2-4d2f-bb86-bdf3e31c2461", + "metadata": {}, + "source": [ + "## Load Matcha-TTS" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "26a16230-04ba-4825-a844-2fb5ab945e24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model loaded! Parameter count: 18,204,193\n" + ] + } + ], + "source": [ + "def load_model(checkpoint_path):\n", + " model = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)\n", + " model.eval()\n", + " return model\n", + "count_params = lambda x: f\"{sum(p.numel() for p in x.parameters()):,}\"\n", + "\n", + "\n", + "model = load_model(MATCHA_CHECKPOINT)\n", + "print(f\"Model loaded! Parameter count: {count_params(model)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3077b84b-e3b6-42e1-a84b-2f7084b13f92", + "metadata": {}, + "source": [ + "## Load HiFi-GAN (Vocoder)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f6b68184-968d-4868-9029-f0c40e9e68af", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Removing weight norm...\n" + ] + } + ], + "source": [ + "def load_vocoder(checkpoint_path):\n", + " h = AttrDict(v1)\n", + " hifigan = HiFiGAN(h).to(device)\n", + " hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])\n", + " _ = hifigan.eval()\n", + " hifigan.remove_weight_norm()\n", + " return hifigan\n", + "\n", + "vocoder = load_vocoder(HIFIGAN_CHECKPOINT)\n", + "denoiser = Denoiser(vocoder, mode='zeros')" + ] + }, + { + "cell_type": "markdown", + "id": "4cbc2ba0-09ff-40e2-9e60-6b77b534f9fb", + "metadata": {}, + "source": [ + "### Helper functions to synthesise" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "880a1879-24fd-4757-849c-850339120796", + "metadata": {}, + "outputs": [], + "source": [ + "@torch.inference_mode()\n", + "def process_text(text: str):\n", + " x = torch.tensor(intersperse(text_to_sequence(text, ['english_cleaners2']), 0),dtype=torch.long, device=device)[None]\n", + " x_lengths = torch.tensor([x.shape[-1]],dtype=torch.long, device=device)\n", + " x_phones = sequence_to_text(x.squeeze(0).tolist())\n", + " return {\n", + " 'x_orig': text,\n", + " 'x': x,\n", + " 'x_lengths': x_lengths,\n", + " 'x_phones': x_phones\n", + " }\n", + "\n", + "\n", + "@torch.inference_mode()\n", + "def synthesise(text, spks=None):\n", + " text_processed = process_text(text)\n", + " start_t = dt.datetime.now()\n", + " output = model.synthesise(\n", + " text_processed['x'], \n", + " text_processed['x_lengths'],\n", + " n_timesteps=n_timesteps,\n", + " temperature=temperature,\n", + " spks=spks,\n", + " length_scale=length_scale\n", + " )\n", + " # merge everything to one dict \n", + " output.update({'start_t': start_t, **text_processed})\n", + " return output\n", + "\n", + "@torch.inference_mode()\n", + "def to_waveform(mel, vocoder):\n", + " audio = vocoder(mel).clamp(-1, 1)\n", + " audio = denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()\n", + " return audio.cpu().squeeze()\n", + " \n", + "def save_to_folder(filename: str, output: dict, folder: str):\n", + " folder = Path(folder)\n", + " folder.mkdir(exist_ok=True, parents=True)\n", + " np.save(folder / f'{filename}', output['mel'].cpu().numpy())\n", + " sf.write(folder / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')" + ] + }, + { + "cell_type": "markdown", + "id": "78f857e3-2ef7-4c86-b776-596c4d3cf875", + "metadata": {}, + "source": [ + "## Setup text to synthesise" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2e0a9acd-0845-4192-ba09-b9683e28a3ac", + "metadata": {}, + "outputs": [], + "source": [ + "texts = [\n", + " \"The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "a9da9e2d-99b9-4c6f-8a08-c828e2cba121", + "metadata": {}, + "source": [ + "### Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f0d216e5-4895-4da8-9d24-9e61021d2556", + "metadata": {}, + "outputs": [], + "source": [ + "## Number of ODE Solver steps\n", + "n_timesteps = 10\n", + "\n", + "## Changes to the speaking rate\n", + "length_scale=1.0\n", + "\n", + "## Sampling temperature\n", + "temperature = 0.667" + ] + }, + { + "cell_type": "markdown", + "id": "b93aac89-c7f8-4975-8510-4e763c9689f4", + "metadata": {}, + "source": [ + "## Synthesis" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "5a227963-aa12-43b9-a706-1168b6fc0ba5", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8342d12401c54017b0e19b8d293a06bf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + " \n", + " Your browser does not support the audio element.\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of ODE steps: 10\n", + "Mean RTF:\t\t\t\t0.017228 ± 0.000000\n", + "Mean RTF Waveform (incl. vocoder):\t0.021445 ± 0.000000\n" + ] + } + ], + "source": [ + "outputs, rtfs = [], []\n", + "rtfs_w = []\n", + "for i, text in enumerate(tqdm(texts)):\n", + " output = synthesise(text) #, torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))\n", + " output['waveform'] = to_waveform(output['mel'], vocoder)\n", + "\n", + " # Compute Real Time Factor (RTF) with HiFi-GAN\n", + " t = (dt.datetime.now() - output['start_t']).total_seconds()\n", + " rtf_w = t * 22050 / (output['waveform'].shape[-1])\n", + "\n", + " ## Pretty print\n", + " print(f\"{'*' * 53}\")\n", + " print(f\"Input text - {i}\")\n", + " print(f\"{'-' * 53}\")\n", + " print(output['x_orig'])\n", + " print(f\"{'*' * 53}\")\n", + " print(f\"Phonetised text - {i}\")\n", + " print(f\"{'-' * 53}\")\n", + " print(output['x_phones'])\n", + " print(f\"{'*' * 53}\")\n", + " print(f\"RTF:\\t\\t{output['rtf']:.6f}\")\n", + " print(f\"RTF Waveform:\\t{rtf_w:.6f}\")\n", + " rtfs.append(output['rtf'])\n", + " rtfs_w.append(rtf_w)\n", + "\n", + " ## Display the synthesised waveform\n", + " ipd.display(ipd.Audio(output['waveform'], rate=22050))\n", + "\n", + " ## Save the generated waveform\n", + " save_to_folder(i, output, OUTPUT_FOLDER)\n", + "\n", + "print(f\"Number of ODE steps: {n_timesteps}\")\n", + "print(f\"Mean RTF:\\t\\t\\t\\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}\")\n", + "print(f\"Mean RTF Waveform (incl. vocoder):\\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3e85c3f-1623-4647-b40c-fa96907656fc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/web_client.py b/web_client.py new file mode 100644 index 0000000000000000000000000000000000000000..3108426a4d57ee0a222a6b3c7af4e2d57dd1e32c --- /dev/null +++ b/web_client.py @@ -0,0 +1,260 @@ +import json +import os.path +import tempfile +import sys +import re +import uuid +import requests +from argparse import ArgumentParser + +import torchaudio +from transformers import WhisperFeatureExtractor, AutoTokenizer, AutoModel +from speech_tokenizer.modeling_whisper import WhisperVQEncoder +import os + +os.environ["no_proxy"]="localhost,127.0.0.1,::1" + +sys.path.insert(0, "./cosyvoice") +sys.path.insert(0, "./third_party/Matcha-TTS") + +from speech_tokenizer.utils import extract_speech_token + +import gradio as gr +import torch + +audio_token_pattern = re.compile(r"<\|audio_(\d+)\|>") + +from flow_inference import AudioDecoder + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default="8888") + parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder") + parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b") + parser.add_argument("--tokenizer-path", type=str, default="THUDM/glm-4-voice-tokenizer") + args = parser.parse_args() + + flow_config = os.path.join(args.flow_path, "config.yaml") + flow_checkpoint = os.path.join(args.flow_path, 'flow.pt') + hift_checkpoint = os.path.join(args.flow_path, 'hift.pt') + glm_tokenizer = None + device = "cuda" + audio_decoder: AudioDecoder = None + whisper_model, feature_extractor = None, None + + + def initialize_fn(): + global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer + if audio_decoder is not None: + return + + # GLM + glm_tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + + # Flow & Hift + audio_decoder = AudioDecoder(config_path=flow_config, flow_ckpt_path=flow_checkpoint, + hift_ckpt_path=hift_checkpoint, + device=device) + + # Speech tokenizer + whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device) + feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path) + + + def clear_fn(): + return [], [], '', '', '', None, None + + +def inference_fn( + temperature: float, + top_p: float, + max_new_token: int, + input_mode, + audio_path: str | None, + input_text: str | None, + history: list[dict], + previous_input_tokens: str, + previous_completion_tokens: str, + ): + + if input_mode == "audio": + assert audio_path is not None + history.append({"role": "user", "content": {"path": audio_path}}) + audio_tokens = extract_speech_token( + whisper_model, feature_extractor, [audio_path] + )[0] + if len(audio_tokens) == 0: + raise gr.Error("No audio tokens extracted") + audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens]) + audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>" + user_input = audio_tokens + system_prompt = "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens. " + + else: + assert input_text is not None + history.append({"role": "user", "content": input_text}) + user_input = input_text + system_prompt = "User will provide you with a text instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens." + + + # Gather history + inputs = previous_input_tokens + previous_completion_tokens + inputs = inputs.strip() + if "<|system|>" not in inputs: + inputs += f"<|system|>\n{system_prompt}" + inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n" + + with torch.no_grad(): + response = requests.post( + "http://localhost:10000/generate_stream", + data=json.dumps({ + "prompt": inputs, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_new_token, + }), + stream=True + ) + text_tokens, audio_tokens = [], [] + audio_offset = glm_tokenizer.convert_tokens_to_ids('<|audio_0|>') + end_token_id = glm_tokenizer.convert_tokens_to_ids('<|user|>') + complete_tokens = [] + prompt_speech_feat = torch.zeros(1, 0, 80).to(device) + flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device) + this_uuid = str(uuid.uuid4()) + tts_speechs = [] + tts_mels = [] + prev_mel = None + is_finalize = False + block_size = 10 + for chunk in response.iter_lines(): + token_id = json.loads(chunk)["token_id"] + if token_id == end_token_id: + is_finalize = True + if len(audio_tokens) >= block_size or (is_finalize and audio_tokens): + block_size = 20 + tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0) + + if prev_mel is not None: + prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2) + + tts_speech, tts_mel = audio_decoder.token2wav(tts_token, uuid=this_uuid, + prompt_token=flow_prompt_speech_token.to(device), + prompt_feat=prompt_speech_feat.to(device), + finalize=is_finalize) + prev_mel = tts_mel + + tts_speechs.append(tts_speech.squeeze()) + tts_mels.append(tts_mel) + yield history, inputs, '', '', (22050, tts_speech.squeeze().cpu().numpy()), None + flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1) + audio_tokens = [] + if not is_finalize: + complete_tokens.append(token_id) + if token_id >= audio_offset: + audio_tokens.append(token_id - audio_offset) + else: + text_tokens.append(token_id) + tts_speech = torch.cat(tts_speechs, dim=-1).cpu() + complete_text = glm_tokenizer.decode(complete_tokens, spaces_between_special_tokens=False) + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav") + history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}}) + history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)}) + yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy()) + + +def update_input_interface(input_mode): + if input_mode == "audio": + return [gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=True)] + + +# Create the Gradio interface +with gr.Blocks(title="Qiuguo voice exmaple", fill_height=True) as demo: + with gr.Row(): + temperature = gr.Number( + label="Temperature", + value=0.2 + ) + + top_p = gr.Number( + label="Top p", + value=0.8 + ) + + max_new_token = gr.Number( + label="Max new tokens", + value=2000, + ) + + chatbot = gr.Chatbot( + elem_id="chatbot", + bubble_full_width=False, + type="messages", + scale=1, + ) + + with gr.Row(): + with gr.Column(): + input_mode = gr.Radio(["audio", "text"], label="Input Mode", value="audio") + audio = gr.Audio(sources=['upload', 'microphone'],type='filepath',label="点击麦克风按钮开始录音",visible=True) + text_input = gr.Textbox(label="Input text", placeholder="Enter your text here...", lines=2, visible=False) + + with gr.Column(): + submit_btn = gr.Button("Submit") + reset_btn = gr.Button("Clear") + output_audio = gr.Audio(label="Play", streaming=True, + autoplay=True, show_download_button=False) + complete_audio = gr.Audio(label="Last Output Audio (If Any)", show_download_button=True) + + + + gr.Markdown("""## Debug Info""") + with gr.Row(): + input_tokens = gr.Textbox( + label=f"Input Tokens", + interactive=False, + ) + + completion_tokens = gr.Textbox( + label=f"Completion Tokens", + interactive=False, + ) + + detailed_error = gr.Textbox( + label=f"Detailed Error", + interactive=False, + ) + + history_state = gr.State([]) + + respond = submit_btn.click( + inference_fn, + inputs=[ + temperature, + top_p, + max_new_token, + input_mode, + audio, + text_input, + history_state, + input_tokens, + completion_tokens, + ], + outputs=[history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio] + ) + + respond.then(lambda s: s, [history_state], chatbot) + + reset_btn.click(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]) + input_mode.input(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]).then(update_input_interface, inputs=[input_mode], outputs=[audio, text_input]) + +initialize_fn() +# Launch the interface +demo.launch( + server_port=args.port, + server_name=args.host +) \ No newline at end of file diff --git a/web_demo.py b/web_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d66214feaf86f020bca8d4f75d715cd5eaeea9 --- /dev/null +++ b/web_demo.py @@ -0,0 +1,261 @@ +import json +import os.path +import tempfile +import sys +import re +import uuid +import requests +from argparse import ArgumentParser + +import torchaudio +from transformers import WhisperFeatureExtractor, AutoTokenizer, AutoModel +from speech_tokenizer.modeling_whisper import WhisperVQEncoder +#import os + +#os.environ["no_proxy"]="localhost,127.0.0.1,::1" + +sys.path.insert(0, "./cosyvoice") +sys.path.insert(0, "./third_party/Matcha-TTS") + +from speech_tokenizer.utils import extract_speech_token + +import gradio as gr +import torch + +audio_token_pattern = re.compile(r"<\|audio_(\d+)\|>") + +from flow_inference import AudioDecoder + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default="8888") + parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder") + parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b") + parser.add_argument("--tokenizer-path", type=str, default="THUDM/glm-4-voice-tokenizer") + args = parser.parse_args() + + flow_config = os.path.join(args.flow_path, "config.yaml") + flow_checkpoint = os.path.join(args.flow_path, 'flow.pt') + hift_checkpoint = os.path.join(args.flow_path, 'hift.pt') + glm_tokenizer = None + device = "cuda" + audio_decoder: AudioDecoder = None + whisper_model, feature_extractor = None, None + + + def initialize_fn(): + global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer + if audio_decoder is not None: + return + + # GLM + glm_tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + + # Flow & Hift + audio_decoder = AudioDecoder(config_path=flow_config, flow_ckpt_path=flow_checkpoint, + hift_ckpt_path=hift_checkpoint, + device=device) + + # Speech tokenizer + whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device) + feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path) + + + def clear_fn(): + return [], [], '', '', '', None, None + + +def inference_fn( + temperature: float, + top_p: float, + max_new_token: int, + input_mode, + audio_path: str | None, + input_text: str | None, + history: list[dict], + previous_input_tokens: str, + previous_completion_tokens: str, + ): + + if input_mode == "audio": + assert audio_path is not None + history.append({"role": "user", "content": {"path": audio_path}}) + audio_tokens = extract_speech_token( + whisper_model, feature_extractor, [audio_path] + )[0] + if len(audio_tokens) == 0: + raise gr.Error("No audio tokens extracted") + audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens]) + audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>" + user_input = audio_tokens + system_prompt = "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens. " + + else: + assert input_text is not None + history.append({"role": "user", "content": input_text}) + user_input = input_text + system_prompt = "User will provide you with a text instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens." + + + # Gather history + inputs = previous_input_tokens + previous_completion_tokens + inputs = inputs.strip() + if "<|system|>" not in inputs: + inputs += f"<|system|>\n{system_prompt}" + inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n" + + with torch.no_grad(): + response = requests.post( + "http://localhost:10000/generate_stream", + data=json.dumps({ + "prompt": inputs, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_new_token, + }), + stream=True + ) + text_tokens, audio_tokens = [], [] + audio_offset = glm_tokenizer.convert_tokens_to_ids('<|audio_0|>') + end_token_id = glm_tokenizer.convert_tokens_to_ids('<|user|>') + complete_tokens = [] + prompt_speech_feat = torch.zeros(1, 0, 80).to(device) + flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device) + this_uuid = str(uuid.uuid4()) + tts_speechs = [] + tts_mels = [] + prev_mel = None + is_finalize = False + block_size = 10 + for chunk in response.iter_lines(): + token_id = json.loads(chunk)["token_id"] + if token_id == end_token_id: + is_finalize = True + if len(audio_tokens) >= block_size or (is_finalize and audio_tokens): + block_size = 20 + tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0) + + if prev_mel is not None: + prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2) + + tts_speech, tts_mel = audio_decoder.token2wav(tts_token, uuid=this_uuid, + prompt_token=flow_prompt_speech_token.to(device), + prompt_feat=prompt_speech_feat.to(device), + finalize=is_finalize) + prev_mel = tts_mel + + tts_speechs.append(tts_speech.squeeze()) + tts_mels.append(tts_mel) + yield history, inputs, '', '', (22050, tts_speech.squeeze().cpu().numpy()), None + flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1) + audio_tokens = [] + if not is_finalize: + complete_tokens.append(token_id) + if token_id >= audio_offset: + audio_tokens.append(token_id - audio_offset) + else: + text_tokens.append(token_id) + tts_speech = torch.cat(tts_speechs, dim=-1).cpu() + complete_text = glm_tokenizer.decode(complete_tokens, spaces_between_special_tokens=False) + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav") + history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}}) + history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)}) + yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy()) + + +def update_input_interface(input_mode): + if input_mode == "audio": + return [gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=True)] + + +# Create the Gradio interface +with gr.Blocks(title="GLM-4-Voice Demo", fill_height=True) as demo: + with gr.Row(): + temperature = gr.Number( + label="Temperature", + value=0.2 + ) + + top_p = gr.Number( + label="Top p", + value=0.8 + ) + + max_new_token = gr.Number( + label="Max new tokens", + value=2000, + ) + + chatbot = gr.Chatbot( + elem_id="chatbot", + bubble_full_width=False, + type="messages", + scale=1, + ) + + with gr.Row(): + with gr.Column(): + input_mode = gr.Radio(["audio", "text"], label="Input Mode", value="audio") + audio = gr.Audio(label="Input audio", type='filepath', show_download_button=True, visible=True) + text_input = gr.Textbox(label="Input text", placeholder="Enter your text here...", lines=2, visible=False) + + with gr.Column(): + submit_btn = gr.Button("Submit") + reset_btn = gr.Button("Clear") + output_audio = gr.Audio(label="Play", streaming=True, + autoplay=True, show_download_button=False) + complete_audio = gr.Audio(label="Last Output Audio (If Any)", show_download_button=True) + + + + gr.Markdown("""## Debug Info""") + with gr.Row(): + input_tokens = gr.Textbox( + label=f"Input Tokens", + interactive=False, + ) + + completion_tokens = gr.Textbox( + label=f"Completion Tokens", + interactive=False, + ) + + detailed_error = gr.Textbox( + label=f"Detailed Error", + interactive=False, + ) + + history_state = gr.State([]) + + respond = submit_btn.click( + inference_fn, + inputs=[ + temperature, + top_p, + max_new_token, + input_mode, + audio, + text_input, + history_state, + input_tokens, + completion_tokens, + ], + outputs=[history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio] + ) + + respond.then(lambda s: s, [history_state], chatbot) + + reset_btn.click(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]) + input_mode.input(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]).then(update_input_interface, inputs=[input_mode], outputs=[audio, text_input]) + +initialize_fn() +# Launch the interface +demo.launch( + server_port=args.port, + server_name=args.host, + share=True +)