MusicGen

Running on A10G

App Files Files Community

adefossez committed on Nov 8, 2023

Commit

a16e65e

1 Parent(s): ed87f04

updated demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.github/actions/audiocraft_build/action.yml +2 -0
.github/workflows/audiocraft_docs.yml +3 -3
.github/workflows/audiocraft_tests.yml +6 -1
.gitignore +8 -1
CHANGELOG.md +31 -1
CONTRIBUTING.md +2 -2
LICENSE_weights +399 -157
MANIFEST.in +7 -0
Makefile +23 -4
README.md +43 -83
assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 +0 -0
assets/sirens_and_a_humming_engine_approach_and_pass.mp3 +0 -0
audiocraft/__init__.py +17 -1
audiocraft/adversarial/__init__.py +22 -0
audiocraft/adversarial/discriminators/__init__.py +10 -0
audiocraft/adversarial/discriminators/base.py +34 -0
audiocraft/adversarial/discriminators/mpd.py +106 -0
audiocraft/adversarial/discriminators/msd.py +126 -0
audiocraft/adversarial/discriminators/msstftd.py +134 -0
audiocraft/adversarial/losses.py +228 -0
audiocraft/data/__init__.py +3 -1
audiocraft/data/audio.py +37 -21
audiocraft/data/audio_dataset.py +93 -31
audiocraft/data/audio_utils.py +12 -10
audiocraft/data/info_audio_dataset.py +110 -0
audiocraft/data/music_dataset.py +270 -0
audiocraft/data/sound_dataset.py +330 -0
audiocraft/data/zip.py +8 -6
audiocraft/environment.py +176 -0
audiocraft/grids/__init__.py +6 -0
audiocraft/grids/_base_explorers.py +80 -0
audiocraft/grids/audiogen/__init__.py +6 -0
audiocraft/grids/audiogen/audiogen_base_16khz.py +23 -0
audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py +68 -0
audiocraft/grids/compression/__init__.py +6 -0
audiocraft/grids/compression/_explorers.py +55 -0
audiocraft/grids/compression/debug.py +31 -0
audiocraft/grids/compression/encodec_audiogen_16khz.py +29 -0
audiocraft/grids/compression/encodec_base_24khz.py +28 -0
audiocraft/grids/compression/encodec_musicgen_32khz.py +34 -0
audiocraft/grids/diffusion/4_bands_base_32khz.py +27 -0
audiocraft/grids/diffusion/__init__.py +6 -0
audiocraft/grids/diffusion/_explorers.py +66 -0
audiocraft/grids/musicgen/__init__.py +6 -0
audiocraft/grids/musicgen/_explorers.py +93 -0
audiocraft/grids/musicgen/musicgen_base_32khz.py +43 -0
audiocraft/grids/musicgen/musicgen_base_cached_32khz.py +67 -0
audiocraft/grids/musicgen/musicgen_clapemb_32khz.py +32 -0
audiocraft/grids/musicgen/musicgen_melody_32khz.py +65 -0
audiocraft/grids/musicgen/musicgen_pretrained_32khz_eval.py +99 -0

.github/actions/audiocraft_build/action.yml CHANGED Viewed

@@ -21,6 +21,8 @@ runs:
       python3 -m venv env
       .  env/bin/activate
       python -m pip install --upgrade pip
       pip install -e '.[dev]'
   - name: System Dependencies
     shell: bash

       python3 -m venv env
       .  env/bin/activate
       python -m pip install --upgrade pip
+      pip install torch torchvision torchaudio
+      pip install xformers
       pip install -e '.[dev]'
   - name: System Dependencies
     shell: bash

.github/workflows/audiocraft_docs.yml CHANGED Viewed

@@ -23,9 +23,9 @@ jobs:
       - name: Make docs
         run: |
           . env/bin/activate
-          make docs
-          git add -f docs
-          git commit -m docs
       - name: Push branch
         run: |

       - name: Make docs
         run: |
           . env/bin/activate
+          make api_docs
+          git add -f api_docs
+          git commit -m api_docs
       - name: Push branch
         run: |

.github/workflows/audiocraft_tests.yml CHANGED Viewed

@@ -12,6 +12,11 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - uses: ./.github/actions/audiocraft_build
-      - run: |
           . env/bin/activate
           make tests

     steps:
       - uses: actions/checkout@v2
       - uses: ./.github/actions/audiocraft_build
+      - name: Run unit tests
+        run: |
           . env/bin/activate
           make tests
+      - name: Run integration tests
+        run: |
+          . env/bin/activate
+          make tests_integ

.gitignore CHANGED Viewed

@@ -35,7 +35,7 @@ wheels/
 .coverage
 # docs
-/docs
 # dotenv
 .env
@@ -46,6 +46,13 @@ wheels/
 venv/
 ENV/
 # personal notebooks & scripts
 */local_scripts
 */notes

 .coverage
 # docs
+/api_docs
 # dotenv
 .env
 venv/
 ENV/
+# egs with manifest files
+egs/*
+!egs/example
+# local datasets
+dataset/*
+!dataset/example
 # personal notebooks & scripts
 */local_scripts
 */notes

CHANGELOG.md CHANGED Viewed

@@ -4,7 +4,37 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
-## [0.0.2a] - TBD
 Improved demo, fixed top p (thanks @jnordberg).

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [1.2.0a] - TBD
+Adding stereo models.
+## [1.1.0] - 2023-11-06
+Not using torchaudio anymore when writing audio files, relying instead directly on the command-line ffmpeg. Also not using it anymore for reading audio files, for similar reasons.
+Fixed DAC support with non default number of codebooks.
+Fixed bug when `two_step_cfg` was overridden when calling `generate()`.
+Fixed samples being always prompted with audio, rather than having both prompted and unprompted.
+**Backward incompatible change:** A `torch.no_grad` around the computation of the conditioning made its way in the public release.
+	The released models were trained without it, which affects the linear layers applied to the output of the T5 or melody conditioners.
+	We removed it, so you might need to retrain models.
+**Backward incompatible change:** Fixing wrong sample rate in CLAP (WARNING if you trained a model with CLAP before).
+**Backward incompatible change:** Renamed VALLEPattern to CoarseFirstPattern, as it was wrongly named. Probably no one
+	retrained a model with this pattern, so hopefully this won't impact you!
+## [1.0.0] - 2023-09-07
+Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
+Added pretrained model for AudioGen and MultiBandDiffusion.
+## [0.0.2] - 2023-08-01
 Improved demo, fixed top p (thanks @jnordberg).

CONTRIBUTING.md CHANGED Viewed

@@ -1,11 +1,11 @@
-# Contributing to Audiocraft
 We want to make contributing to this project as easy and transparent as
 possible.
 ## Pull Requests
-Audiocraft is the implementation of a research paper.
 Therefore, we do not plan on accepting many pull requests for new features.
 We certainly welcome them for bug fixes.

+# Contributing to AudioCraft
 We want to make contributing to this project as easy and transparent as
 possible.
 ## Pull Requests
+AudioCraft is the implementation of a research paper.
 Therefore, we do not plan on accepting many pull requests for new features.
 We certainly welcome them for bug fixes.

LICENSE_weights CHANGED Viewed

@@ -1,157 +1,399 @@
-# Attribution-NonCommercial-NoDerivatives 4.0 International
-> *Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.*
->
-> ### Using Creative Commons Public Licenses
->
-> Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
->
-> * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
->
-> * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
-## Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License
-By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
-### Section 1 – Definitions.
-a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
-b. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
-e. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
-f. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
-h. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
-i. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
-h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
-i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
-j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
-k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
-l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
-### Section 2 – Scope.
-a. ___License grant.___
-   1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
-        A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
-        B. produce and reproduce, but not Share, Adapted Material for NonCommercial purposes only.
-   2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
-   3. __Term.__ The term of this Public License is specified in Section 6(a).
-   4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
-   5. __Downstream recipients.__
-        A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
-        B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
-   6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
-b. ___Other rights.___
-   1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
-   2. Patent and trademark rights are not licensed under this Public License.
-   3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
-### Section 3 – License Conditions.
-Your exercise of the Licensed Rights is expressly made subject to the following conditions.
-a. ___Attribution.___
-   1. If You Share the Licensed Material, You must:
-      A. retain the following if it is supplied by the Licensor with the Licensed Material:
-         i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
-         ii. a copyright notice;
-         iii. a notice that refers to this Public License;
-         iv. a notice that refers to the disclaimer of warranties;
-         v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
-      B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
-      C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
-        For the avoidance of doubt, You do not have permission under this Public License to Share Adapted Material.
-   2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
-   3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
-### Section 4 – Sui Generis Database Rights.
-Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
-a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only and provided You do not Share Adapted Material;
-b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
-c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
-For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
-### Section 5 – Disclaimer of Warranties and Limitation of Liability.
-a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
-b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
-c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
-### Section 6 – Term and Termination.
-a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
-b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
-   1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
-   2. upon express reinstatement by the Licensor.
-   For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
-c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
-d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
-### Section 7 – Other Terms and Conditions.
-a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
-b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
-### Section 8 – Interpretation.
-a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
-b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
-c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
-d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
-> Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
->
-> Creative Commons may be contacted at [creativecommons.org](http://creativecommons.org).

+Attribution-NonCommercial 4.0 International
+=======================================================================
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+Using Creative Commons Public Licenses
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+     Considerations for licensors: Our public licenses are
+     intended for use by those authorized to give the public
+     permission to use material in ways otherwise restricted by
+     copyright and certain other rights. Our licenses are
+     irrevocable. Licensors should read and understand the terms
+     and conditions of the license they choose before applying it.
+     Licensors should also secure all rights necessary before
+     applying our licenses so that the public can reuse the
+     material as expected. Licensors should clearly mark any
+     material not subject to the license. This includes other CC-
+     licensed material, or material used under an exception or
+     limitation to copyright. More considerations for licensors:
+	wiki.creativecommons.org/Considerations_for_licensors
+     Considerations for the public: By using one of our public
+     licenses, a licensor grants the public permission to use the
+     licensed material under specified terms and conditions. If
+     the licensor's permission is not necessary for any reason--for
+     example, because of any applicable exception or limitation to
+     copyright--then that use is not regulated by the license. Our
+     licenses grant only permissions under copyright and certain
+     other rights that a licensor has authority to grant. Use of
+     the licensed material may still be restricted for other
+     reasons, including because others have copyright or other
+     rights in the material. A licensor may make special requests,
+     such as asking that all changes be marked or described.
+     Although not required by our licenses, you are encouraged to
+     respect those requests where reasonable. More_considerations
+     for the public:
+	wiki.creativecommons.org/Considerations_for_licensees
+=======================================================================
+Creative Commons Attribution-NonCommercial 4.0 International Public
+License
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You
+such rights in consideration of benefits the Licensor receives from
+making the Licensed Material available under these terms and
+conditions.
+Section 1 -- Definitions.
+  a. Adapted Material means material subject to Copyright and Similar
+     Rights that is derived from or based upon the Licensed Material
+     and in which the Licensed Material is translated, altered,
+     arranged, transformed, or otherwise modified in a manner requiring
+     permission under the Copyright and Similar Rights held by the
+     Licensor. For purposes of this Public License, where the Licensed
+     Material is a musical work, performance, or sound recording,
+     Adapted Material is always produced where the Licensed Material is
+     synched in timed relation with a moving image.
+  b. Adapter's License means the license You apply to Your Copyright
+     and Similar Rights in Your contributions to Adapted Material in
+     accordance with the terms and conditions of this Public License.
+  c. Copyright and Similar Rights means copyright and/or similar rights
+     closely related to copyright including, without limitation,
+     performance, broadcast, sound recording, and Sui Generis Database
+     Rights, without regard to how the rights are labeled or
+     categorized. For purposes of this Public License, the rights
+     specified in Section 2(b)(1)-(2) are not Copyright and Similar
+     Rights.
+  d. Effective Technological Measures means those measures that, in the
+     absence of proper authority, may not be circumvented under laws
+     fulfilling obligations under Article 11 of the WIPO Copyright
+     Treaty adopted on December 20, 1996, and/or similar international
+     agreements.
+  e. Exceptions and Limitations means fair use, fair dealing, and/or
+     any other exception or limitation to Copyright and Similar Rights
+     that applies to Your use of the Licensed Material.
+  f. Licensed Material means the artistic or literary work, database,
+     or other material to which the Licensor applied this Public
+     License.
+  g. Licensed Rights means the rights granted to You subject to the
+     terms and conditions of this Public License, which are limited to
+     all Copyright and Similar Rights that apply to Your use of the
+     Licensed Material and that the Licensor has authority to license.
+  h. Licensor means the individual(s) or entity(ies) granting rights
+     under this Public License.
+  i. NonCommercial means not primarily intended for or directed towards
+     commercial advantage or monetary compensation. For purposes of
+     this Public License, the exchange of the Licensed Material for
+     other material subject to Copyright and Similar Rights by digital
+     file-sharing or similar means is NonCommercial provided there is
+     no payment of monetary compensation in connection with the
+     exchange.
+  j. Share means to provide material to the public by any means or
+     process that requires permission under the Licensed Rights, such
+     as reproduction, public display, public performance, distribution,
+     dissemination, communication, or importation, and to make material
+     available to the public including in ways that members of the
+     public may access the material from a place and at a time
+     individually chosen by them.
+  k. Sui Generis Database Rights means rights other than copyright
+     resulting from Directive 96/9/EC of the European Parliament and of
+     the Council of 11 March 1996 on the legal protection of databases,
+     as amended and/or succeeded, as well as other essentially
+     equivalent rights anywhere in the world.
+  l. You means the individual or entity exercising the Licensed Rights
+     under this Public License. Your has a corresponding meaning.
+Section 2 -- Scope.
+  a. License grant.
+       1. Subject to the terms and conditions of this Public License,
+          the Licensor hereby grants You a worldwide, royalty-free,
+          non-sublicensable, non-exclusive, irrevocable license to
+          exercise the Licensed Rights in the Licensed Material to:
+            a. reproduce and Share the Licensed Material, in whole or
+               in part, for NonCommercial purposes only; and
+            b. produce, reproduce, and Share Adapted Material for
+               NonCommercial purposes only.
+       2. Exceptions and Limitations. For the avoidance of doubt, where
+          Exceptions and Limitations apply to Your use, this Public
+          License does not apply, and You do not need to comply with
+          its terms and conditions.
+       3. Term. The term of this Public License is specified in Section
+          6(a).
+       4. Media and formats; technical modifications allowed. The
+          Licensor authorizes You to exercise the Licensed Rights in
+          all media and formats whether now known or hereafter created,
+          and to make technical modifications necessary to do so. The
+          Licensor waives and/or agrees not to assert any right or
+          authority to forbid You from making technical modifications
+          necessary to exercise the Licensed Rights, including
+          technical modifications necessary to circumvent Effective
+          Technological Measures. For purposes of this Public License,
+          simply making modifications authorized by this Section 2(a)
+          (4) never produces Adapted Material.
+       5. Downstream recipients.
+            a. Offer from the Licensor -- Licensed Material. Every
+               recipient of the Licensed Material automatically
+               receives an offer from the Licensor to exercise the
+               Licensed Rights under the terms and conditions of this
+               Public License.
+            b. No downstream restrictions. You may not offer or impose
+               any additional or different terms or conditions on, or
+               apply any Effective Technological Measures to, the
+               Licensed Material if doing so restricts exercise of the
+               Licensed Rights by any recipient of the Licensed
+               Material.
+       6. No endorsement. Nothing in this Public License constitutes or
+          may be construed as permission to assert or imply that You
+          are, or that Your use of the Licensed Material is, connected
+          with, or sponsored, endorsed, or granted official status by,
+          the Licensor or others designated to receive attribution as
+          provided in Section 3(a)(1)(A)(i).
+  b. Other rights.
+       1. Moral rights, such as the right of integrity, are not
+          licensed under this Public License, nor are publicity,
+          privacy, and/or other similar personality rights; however, to
+          the extent possible, the Licensor waives and/or agrees not to
+          assert any such rights held by the Licensor to the limited
+          extent necessary to allow You to exercise the Licensed
+          Rights, but not otherwise.
+       2. Patent and trademark rights are not licensed under this
+          Public License.
+       3. To the extent possible, the Licensor waives any right to
+          collect royalties from You for the exercise of the Licensed
+          Rights, whether directly or through a collecting society
+          under any voluntary or waivable statutory or compulsory
+          licensing scheme. In all other cases the Licensor expressly
+          reserves any right to collect such royalties, including when
+          the Licensed Material is used other than for NonCommercial
+          purposes.
+Section 3 -- License Conditions.
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+  a. Attribution.
+       1. If You Share the Licensed Material (including in modified
+          form), You must:
+            a. retain the following if it is supplied by the Licensor
+               with the Licensed Material:
+                 i. identification of the creator(s) of the Licensed
+                    Material and any others designated to receive
+                    attribution, in any reasonable manner requested by
+                    the Licensor (including by pseudonym if
+                    designated);
+                ii. a copyright notice;
+               iii. a notice that refers to this Public License;
+                iv. a notice that refers to the disclaimer of
+                    warranties;
+                 v. a URI or hyperlink to the Licensed Material to the
+                    extent reasonably practicable;
+            b. indicate if You modified the Licensed Material and
+               retain an indication of any previous modifications; and
+            c. indicate the Licensed Material is licensed under this
+               Public License, and include the text of, or the URI or
+               hyperlink to, this Public License.
+       2. You may satisfy the conditions in Section 3(a)(1) in any
+          reasonable manner based on the medium, means, and context in
+          which You Share the Licensed Material. For example, it may be
+          reasonable to satisfy the conditions by providing a URI or
+          hyperlink to a resource that includes the required
+          information.
+       3. If requested by the Licensor, You must remove any of the
+          information required by Section 3(a)(1)(A) to the extent
+          reasonably practicable.
+       4. If You Share Adapted Material You produce, the Adapter's
+          License You apply must not prevent recipients of the Adapted
+          Material from complying with this Public License.
+Section 4 -- Sui Generis Database Rights.
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+     to extract, reuse, reproduce, and Share all or a substantial
+     portion of the contents of the database for NonCommercial purposes
+     only;
+  b. if You include all or a substantial portion of the database
+     contents in a database in which You have Sui Generis Database
+     Rights, then the database in which You have Sui Generis Database
+     Rights (but not its individual contents) is Adapted Material; and
+  c. You must comply with the conditions in Section 3(a) if You Share
+     all or a substantial portion of the contents of the database.
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+  c. The disclaimer of warranties and limitation of liability provided
+     above shall be interpreted in a manner that, to the extent
+     possible, most closely approximates an absolute disclaimer and
+     waiver of all liability.
+Section 6 -- Term and Termination.
+  a. This Public License applies for the term of the Copyright and
+     Similar Rights licensed here. However, if You fail to comply with
+     this Public License, then Your rights under this Public License
+     terminate automatically.
+  b. Where Your right to use the Licensed Material has terminated under
+     Section 6(a), it reinstates:
+       1. automatically as of the date the violation is cured, provided
+          it is cured within 30 days of Your discovery of the
+          violation; or
+       2. upon express reinstatement by the Licensor.
+     For the avoidance of doubt, this Section 6(b) does not affect any
+     right the Licensor may have to seek remedies for Your violations
+     of this Public License.
+  c. For the avoidance of doubt, the Licensor may also offer the
+     Licensed Material under separate terms or conditions or stop
+     distributing the Licensed Material at any time; however, doing so
+     will not terminate this Public License.
+  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+     License.
+Section 7 -- Other Terms and Conditions.
+  a. The Licensor shall not be bound by any additional or different
+     terms or conditions communicated by You unless expressly agreed.
+  b. Any arrangements, understandings, or agreements regarding the
+     Licensed Material not stated herein are separate from and
+     independent of the terms and conditions of this Public License.
+Section 8 -- Interpretation.
+  a. For the avoidance of doubt, this Public License does not, and
+     shall not be interpreted to, reduce, limit, restrict, or impose
+     conditions on any use of the Licensed Material that could lawfully
+     be made without permission under this Public License.
+  b. To the extent possible, if any provision of this Public License is
+     deemed unenforceable, it shall be automatically reformed to the
+     minimum extent necessary to make it enforceable. If the provision
+     cannot be reformed, it shall be severed from this Public License
+     without affecting the enforceability of the remaining terms and
+     conditions.
+  c. No term or condition of this Public License will be waived and no
+     failure to comply consented to unless expressly agreed to by the
+     Licensor.
+  d. Nothing in this Public License constitutes or may be interpreted
+     as a limitation upon, or waiver of, any privileges and immunities
+     that apply to the Licensor or You, including from the legal
+     processes of any jurisdiction or authority.
+=======================================================================
+Creative Commons is not a party to its public
+licenses. Notwithstanding, Creative Commons may elect to apply one of
+its public licenses to material it publishes and in those instances
+will be considered the “Licensor.” The text of the Creative Commons
+public licenses is dedicated to the public domain under the CC0 Public
+Domain Dedication. Except for the limited purpose of indicating that
+material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the
+public licenses.
+Creative Commons may be contacted at creativecommons.org.

MANIFEST.in CHANGED Viewed

@@ -6,3 +6,10 @@ include *.ini
 include requirements.txt
 include audiocraft/py.typed
 include assets/*.mp3

 include requirements.txt
 include audiocraft/py.typed
 include assets/*.mp3
+include datasets/*.mp3
+recursive-include config *.yaml
+recursive-include demos *.py
+recursive-include demos *.ipynb
+recursive-include scripts *.py
+recursive-include model_cards *.md
+recursive-include docs *.md

Makefile CHANGED Viewed

@@ -1,3 +1,15 @@
 default: linter tests
 install:
@@ -10,12 +22,19 @@ linter:
 tests:
 	coverage run -m pytest tests
-	coverage report --include 'audiocraft/*'
-docs:
-	pdoc3 --html -o docs -f audiocraft
 dist:
 	python setup.py sdist
-.PHONY: linter tests docs dist

+INTEG=AUDIOCRAFT_DORA_DIR="/tmp/magma_$(USER)" python3 -m dora -v run --clear device=cpu dataset.num_workers=0 optim.epochs=1 \
+	dataset.train.num_samples=10 dataset.valid.num_samples=10 \
+	dataset.evaluate.num_samples=10 dataset.generate.num_samples=2 sample_rate=16000 \
+	logging.level=DEBUG
+INTEG_COMPRESSION = $(INTEG) solver=compression/debug rvq.n_q=2 rvq.bins=48 checkpoint.save_last=true   # SIG is 5091833e
+INTEG_MUSICGEN = $(INTEG) solver=musicgen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
+	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false  # Using compression model from 5091833e
+INTEG_AUDIOGEN = $(INTEG) solver=audiogen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
+	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false  # Using compression model from 5091833e
+INTEG_MBD = $(INTEG) solver=diffusion/debug dset=audio/example  \
+	checkpoint.save_last=false  # Using compression model from 616d7b3c
 default: linter tests
 install:
 tests:
 	coverage run -m pytest tests
+	coverage report
+tests_integ:
+	$(INTEG_COMPRESSION)
+	$(INTEG_MBD)
+	$(INTEG_MUSICGEN)
+	$(INTEG_AUDIOGEN)
+api_docs:
+	pdoc3 --html -o api_docs -f audiocraft
 dist:
 	python setup.py sdist
# All of these targets are command names rather than files to build, so they
# must run even if a file or directory with the same name exists.
.PHONY: default install linter tests tests_integ api_docs dist

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ tags:
   - "music generation"
   - "language models"
   - "LLMs"
-app_file: "app.py"
 emoji: 🎵
 colorFrom: gray
 colorTo: blue
@@ -14,33 +14,17 @@ sdk_version: 3.34.0
 pinned: true
 license: "cc-by-nc-4.0"
 ---
-# Audiocraft
 ![docs badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_docs/badge.svg)
 ![linter badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_linter/badge.svg)
 ![tests badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_tests/badge.svg)
-Audiocraft is a PyTorch library for deep learning research on audio generation. At the moment, it contains the code for MusicGen, a state-of-the-art controllable text-to-music model.
-## MusicGen
-Audiocraft provides the code and models for MusicGen, [a simple and controllable model for music generation][arxiv]. MusicGen is a single stage auto-regressive
-Transformer model trained over a 32kHz <a href="https://github.com/facebookresearch/encodec">EnCodec tokenizer</a> with 4 codebooks sampled at 50 Hz. Unlike existing methods like [MusicLM](https://arxiv.org/abs/2301.11325), MusicGen doesn't require a self-supervised semantic representation, and it generates
-all 4 codebooks in one pass. By introducing a small delay between the codebooks, we show we can predict
-them in parallel, thus having only 50 auto-regressive steps per second of audio.
-Check out our [sample page][musicgen_samples] or test the available demo!
-<a target="_blank" href="https://colab.research.google.com/drive/1-Xe9NCdIs2sCUbiSmwHXozK6AAhMm7_i?usp=sharing">
-  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
-</a>
-<a target="_blank" href="https://huggingface.co/spaces/facebook/MusicGen">
-  <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg" alt="Open in HugginFace"/>
-</a>
-<br>
-We use 20K hours of licensed music to train MusicGen. Specifically, we rely on an internal dataset of 10K high-quality music tracks, and on the ShutterStock and Pond5 music data.
 ## Installation
-Audiocraft requires Python 3.9, PyTorch 2.0.0, and a GPU with at least 16 GB of memory (for the medium-sized model). To install Audiocraft, you can run the following:
 ```shell
 # Best to make sure you have torch installed first, in particular before installing xformers.
@@ -49,92 +33,68 @@ pip install 'torch>=2.0'
 # Then proceed to one of the following
 pip install -U audiocraft  # stable release
 pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft  # bleeding edge
-pip install -e .  # or if you cloned the repo locally
 ```
-## Usage
-We offer a number of way to interact with MusicGen:
-1. A demo is also available on the [`facebook/MusicGen`  HuggingFace Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
-2. You can run the Gradio demo in Colab: [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing).
-3. You can use the gradio demo locally by running `python app.py`.
-4. You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally (if you have a GPU).
-5. Finally, checkout [@camenduru Colab page](https://github.com/camenduru/MusicGen-colab) which is regularly
-  updated with contributions from @camenduru and the community.
-## API
-We provide a simple API and 4 pre-trained models. The pre trained models are:
-- `small`: 300M model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-small)
-- `medium`: 1.5B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-medium)
-- `melody`: 1.5B model, text to music and text+melody to music - [🤗 Hub](https://huggingface.co/facebook/musicgen-melody)
-- `large`: 3.3B model, text to music only - [🤗 Hub](https://huggingface.co/facebook/musicgen-large)
-We observe the best trade-off between quality and compute with the `medium` or `melody` model.
-In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
-GPUs will be able to generate short sequences, or longer sequences with the `small` model.
-**Note**: Please make sure to have [ffmpeg](https://ffmpeg.org/download.html) installed when using newer version of `torchaudio`.
-You can install it with:
-```
-apt-get install ffmpeg
 ```
-See after a quick example for using the API.
-```python
-import torchaudio
-from audiocraft.models import MusicGen
-from audiocraft.data.audio import audio_write
-model = MusicGen.get_pretrained('melody')
-model.set_generation_params(duration=8)  # generate 8 seconds.
-wav = model.generate_unconditional(4)    # generates 4 unconditional audio samples
-descriptions = ['happy rock', 'energetic EDM', 'sad jazz']
-wav = model.generate(descriptions)  # generates 3 samples.
-melody, sr = torchaudio.load('./assets/bach.mp3')
-# generates using the melody from the given audio and the provided descriptions.
-wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), sr)
-for idx, one_wav in enumerate(wav):
-    # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
-    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
-```
-## Model Card
-See [the model card page](./MODEL_CARD.md).
-## FAQ
-#### Will the training code be released?
-Yes. We will soon release the training code for MusicGen and EnCodec.
-#### I need help on Windows
-@FurkanGozukara made a complete tutorial for [Audiocraft/MusicGen on Windows](https://youtu.be/v-YpvPkhdO4)
-#### I need help for running the demo on Colab
-Check [@camenduru tutorial on Youtube](https://www.youtube.com/watch?v=EGfxuTy9Eeo).
 ## Citation
 ```
 @article{copet2023simple,
-      title={Simple and Controllable Music Generation},
-      author={Jade Copet and Felix Kreuk and Itai Gat and Tal Remez and David Kant and Gabriel Synnaeve and Yossi Adi and Alexandre Défossez},
-      year={2023},
-      journal={arXiv preprint arXiv:2306.05284},
 }
 ```
-## License
-* The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
-* The weights in this repository are released under the CC-BY-NC 4.0 license as found in the [LICENSE_weights file](LICENSE_weights).
-[arxiv]: https://arxiv.org/abs/2306.05284
-[musicgen_samples]: https://ai.honu.io/papers/musicgen/

   - "music generation"
   - "language models"
   - "LLMs"
+app_file: "demos/musicgen_app.py"
 emoji: 🎵
 colorFrom: gray
 colorTo: blue
 pinned: true
 license: "cc-by-nc-4.0"
 ---
+# AudioCraft
 ![docs badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_docs/badge.svg)
 ![linter badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_linter/badge.svg)
 ![tests badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_tests/badge.svg)
+AudioCraft is a PyTorch library for deep learning research on audio generation. AudioCraft contains inference and training code
+for two state-of-the-art AI generative models producing high-quality audio: AudioGen and MusicGen.
 ## Installation
+AudioCraft requires Python 3.9 and PyTorch 2.0.0. To install AudioCraft, you can run the following:
 ```shell
 # Best to make sure you have torch installed first, in particular before installing xformers.
 # Then proceed to one of the following
 pip install -U audiocraft  # stable release
 pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft  # bleeding edge
+pip install -e .  # or if you cloned the repo locally (mandatory if you want to train).
 ```
+We also recommend having `ffmpeg` installed, either through your system or Anaconda:
+```bash
+sudo apt-get install ffmpeg
+# Or if you are using Anaconda or Miniconda
+conda install "ffmpeg<5" -c conda-forge
 ```
+## Models
+At the moment, AudioCraft contains the training code and inference code for:
+* [MusicGen](./docs/MUSICGEN.md): A state-of-the-art controllable text-to-music model.
+* [AudioGen](./docs/AUDIOGEN.md): A state-of-the-art text-to-sound model.
+* [EnCodec](./docs/ENCODEC.md): A state-of-the-art high fidelity neural audio codec.
+* [Multi Band Diffusion](./docs/MBD.md): An EnCodec compatible decoder using diffusion.
+## Training code
+AudioCraft contains PyTorch components for deep learning research in audio and training pipelines for the developed models.
+For a general introduction to AudioCraft's design principles and instructions for developing your own training pipeline, refer to
+the [AudioCraft training documentation](./docs/TRAINING.md).
+To reproduce existing work and use the developed training pipelines, refer to the instructions for each specific model,
+which provide pointers to configuration, example grids, and model/task-specific information and FAQ.
+## API documentation
+We provide some [API documentation](https://facebookresearch.github.io/audiocraft/api_docs/audiocraft/index.html) for AudioCraft.
+## FAQ
+#### Is the training code available?
+Yes! We provide the training code for [EnCodec](./docs/ENCODEC.md), [MusicGen](./docs/MUSICGEN.md) and [Multi Band Diffusion](./docs/MBD.md).
+#### Where are the models stored?
+Hugging Face stores the models in a specific location, which can be overridden by setting the `AUDIOCRAFT_CACHE_DIR` environment variable for the AudioCraft models.
+In order to change the cache location of the other Hugging Face models, please check out the [Hugging Face Transformers documentation for the cache setup](https://huggingface.co/docs/transformers/installation#cache-setup).
+Finally, if you use a model that relies on Demucs (e.g. `musicgen-melody`) and want to change the download location for Demucs, refer to the [Torch Hub documentation](https://pytorch.org/docs/stable/hub.html#where-are-my-downloaded-models-saved).
+## License
+* The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
+* The model weights in this repository are released under the CC-BY-NC 4.0 license as found in the [LICENSE_weights file](LICENSE_weights).
 ## Citation
+For the general framework of AudioCraft, please cite the following.
 ```
 @article{copet2023simple,
+    title={Simple and Controllable Music Generation},
+    author={Jade Copet and Felix Kreuk and Itai Gat and Tal Remez and David Kant and Gabriel Synnaeve and Yossi Adi and Alexandre Défossez},
+    year={2023},
+    journal={arXiv preprint arXiv:2306.05284},
 }
 ```
+When referring to a specific model, please cite as mentioned in the model-specific README, e.g.
+[./docs/MUSICGEN.md](./docs/MUSICGEN.md), [./docs/AUDIOGEN.md](./docs/AUDIOGEN.md), etc.

assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 ADDED Viewed

Binary file (15.2 kB). View file

assets/sirens_and_a_humming_engine_approach_and_pass.mp3 ADDED Viewed

Binary file (15.2 kB). View file

audiocraft/__init__.py CHANGED Viewed

@@ -3,8 +3,24 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 # flake8: noqa
 from . import data, modules, models
-__version__ = '0.0.2a2'

 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""
+AudioCraft is a general framework for training audio generative models.
+At the moment we provide the training code for:
+- [MusicGen](https://arxiv.org/abs/2306.05284), a state-of-the-art
+    text-to-music and melody+text autoregressive generative model.
+    For the solver, see `audiocraft.solvers.musicgen.MusicGenSolver`, and for the model,
+    `audiocraft.models.musicgen.MusicGen`.
+- [AudioGen](https://arxiv.org/abs/2209.15352), a state-of-the-art
+    text-to-general-audio generative model.
+- [EnCodec](https://arxiv.org/abs/2210.13438), efficient and high fidelity
+    neural audio codec which provides an excellent tokenizer for autoregressive language models.
+    See `audiocraft.solvers.compression.CompressionSolver`, and `audiocraft.models.encodec.EncodecModel`.
+- [MultiBandDiffusion](https://arxiv.org/abs/2308.02560), alternative diffusion-based decoder compatible with EnCodec that
+    improves the perceived quality and reduces the artifacts coming from adversarial decoders.
+"""
 # flake8: noqa
 from . import data, modules, models
+__version__ = '1.1.0'

audiocraft/adversarial/__init__.py ADDED Viewed

	@@ -0,0 +1,22 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Adversarial losses and discriminator architectures."""
+# flake8: noqa
+from .discriminators import (
+    MultiPeriodDiscriminator,
+    MultiScaleDiscriminator,
+    MultiScaleSTFTDiscriminator
+)
+from .losses import (
+    AdversarialLoss,
+    AdvLossType,
+    get_adv_criterion,
+    get_fake_criterion,
+    get_real_criterion,
+    FeatLossType,
+    FeatureMatchingLoss
+)

audiocraft/adversarial/discriminators/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# flake8: noqa
+from .mpd import MultiPeriodDiscriminator
+from .msd import MultiScaleDiscriminator
+from .msstftd import MultiScaleSTFTDiscriminator

audiocraft/adversarial/discriminators/base.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from abc import ABC, abstractmethod
+import typing as tp
+import torch
+import torch.nn as nn
# Type aliases shared by the discriminator implementations.
FeatureMapType = tp.List[torch.Tensor]  # list of feature-map tensors
LogitsType = torch.Tensor  # logits tensor
# Pair made of a list of logits and a list of feature-map lists.
MultiDiscriminatorOutputType = tp.Tuple[tp.List[LogitsType], tp.List[FeatureMapType]]


class MultiDiscriminator(ABC, nn.Module):
    """Abstract base class for discriminators composed of sub-discriminators
    acting at different scales.

    Subclasses must implement `forward` and the `num_discriminators` property.
    """

    def __init__(self):
        super().__init__()

    @abstractmethod
    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        """Run the discriminator on `x`; to be implemented by subclasses."""

    @property
    @abstractmethod
    def num_discriminators(self) -> int:
        """Number of discriminators."""

audiocraft/adversarial/discriminators/mpd.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules import NormConv2d
+from .base import MultiDiscriminator, MultiDiscriminatorOutputType
def get_padding(kernel_size: int, dilation: int = 1) -> int:
    """Return half of the extent covered by a dilated kernel.

    The result is `dilation * (kernel_size - 1) // 2`, computed entirely with
    integer arithmetic so no precision is lost to a float round-trip. For
    `kernel_size >= 1` and `dilation >= 0` this matches truncating the true
    division, which is what the previous float-based implementation did.

    Args:
        kernel_size (int): Size of the convolution kernel.
        dilation (int): Dilation of the convolution kernel.
    Returns:
        int: Padding value for the given kernel size and dilation.
    """
    return (kernel_size - 1) * dilation // 2
class PeriodDiscriminator(nn.Module):
    """Period sub-discriminator.

    The input waveform is padded to a multiple of `period`, folded into a 2D map
    of shape [B, C, T // period, period] and passed through a stack of 2D
    convolutions whose kernels only span the first (folded time) axis.

    Args:
        period (int): Period between samples of audio.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_layers (int): Number of convolutional layers.
        kernel_sizes (Sequence[int]): Kernel sizes for convolutions: the first entry is used
            by the `n_layers` stacked convolutions, the second by the final convolution.
        stride (int): Stride for convolutions (the last stacked convolution always uses 1).
        filters (int): Initial number of filters in convolutions.
        filters_scale (int): Multiplier of number of filters as we increase depth.
        max_filters (int): Maximum number of filters.
        norm (str): Normalization method.
        activation (str): Activation function.
        activation_params (dict, optional): Parameters to provide to the activation function.
            When not provided, `{'negative_slope': 0.2}` is used.
    """

    def __init__(self, period: int, in_channels: int = 1, out_channels: int = 1,
                 n_layers: int = 5, kernel_sizes: tp.Sequence[int] = (5, 3), stride: int = 3,
                 filters: int = 8, filters_scale: int = 4, max_filters: int = 1024,
                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
                 activation_params: tp.Optional[dict] = None):
        super().__init__()
        # Build the default per instance instead of sharing one mutable dict
        # across every call through the signature.
        if activation_params is None:
            activation_params = {'negative_slope': 0.2}
        self.period = period
        self.n_layers = n_layers
        self.activation = getattr(torch.nn, activation)(**activation_params)
        self.convs = nn.ModuleList()
        in_chs = in_channels
        for i in range(self.n_layers):
            # Channel count grows geometrically with depth, capped at `max_filters`.
            out_chs = min(filters * (filters_scale ** (i + 1)), max_filters)
            # The last stacked convolution does not downsample.
            eff_stride = 1 if i == self.n_layers - 1 else stride
            self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=(kernel_sizes[0], 1), stride=(eff_stride, 1),
                                         padding=((kernel_sizes[0] - 1) // 2, 0), norm=norm))
            in_chs = out_chs
        self.conv_post = NormConv2d(in_chs, out_channels, kernel_size=(kernel_sizes[1], 1), stride=1,
                                    padding=((kernel_sizes[1] - 1) // 2, 0), norm=norm)

    def forward(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.List[torch.Tensor]]:
        """Compute the logits and intermediate feature maps for a waveform.

        Args:
            x (torch.Tensor): Input of shape [B, C, T].
        Returns:
            tuple: The output of the final convolution, and the list of feature maps made of
                the activated output of each stacked convolution followed by that final output.
        """
        fmap = []
        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            # NOTE(review): reflect padding presumably needs n_pad < t, so inputs
            # shorter than the period may fail here - confirm against callers.
            x = F.pad(x, (0, n_pad), 'reflect')
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = conv(x)
            x = self.activation(x)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return x, fmap
class MultiPeriodDiscriminator(MultiDiscriminator):
    """Multi-Period (MPD) Discriminator.

    Holds one `PeriodDiscriminator` per entry of `periods` and applies each of
    them to the same input.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        periods (Sequence[int]): Periods between samples of audio for the sub-discriminators.
        **kwargs: Additional args for `PeriodDiscriminator`
    """

    def __init__(self, in_channels: int = 1, out_channels: int = 1,
                 periods: tp.Sequence[int] = (2, 3, 5, 7, 11), **kwargs):
        super().__init__()
        # The default is an immutable tuple so it cannot be altered through the signature.
        self.discriminators = nn.ModuleList([
            PeriodDiscriminator(p, in_channels, out_channels, **kwargs) for p in periods
        ])

    @property
    def num_discriminators(self) -> int:
        """Number of period sub-discriminators."""
        return len(self.discriminators)

    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        """Apply every sub-discriminator to `x`.

        Args:
            x (torch.Tensor): Input passed unchanged to each sub-discriminator.
        Returns:
            tuple: List of logits and list of feature-map lists, one entry per
                sub-discriminator, in the order given by `periods`.
        """
        logits = []
        fmaps = []
        for disc in self.discriminators:
            logit, fmap = disc(x)
            logits.append(logit)
            fmaps.append(fmap)
        return logits, fmaps

audiocraft/adversarial/discriminators/msd.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import numpy as np
+import torch
+import torch.nn as nn
+from ...modules import NormConv1d
+from .base import MultiDiscriminator, MultiDiscriminatorOutputType
+class ScaleDiscriminator(nn.Module):
+    """Waveform sub-discriminator.
+    A stack of 1D convolutions: an initial padded convolution, one strided (grouped)
+    convolution per entry of `downsample_scales`, one more convolution, then `conv_post`.
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_sizes (Sequence[int]): Kernel sizes for first and last convolutions.
+        filters (int): Number of initial filters for convolutions.
+        max_filters (int): Maximum number of filters.
+        downsample_scales (Sequence[int]): Scale for downsampling implemented as strided convolutions.
+        inner_kernel_sizes (Sequence[int] or None): Kernel sizes for inner convolutions.
+        groups (Sequence[int] or None): Groups for inner convolutions.
+        strides (Sequence[int] or None): Strides for inner convolutions.
+        paddings (Sequence[int] or None): Paddings for inner convolutions.
+        norm (str): Normalization method.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        pad (str): Padding for initial convolution.
+        pad_params (dict): Parameters to provide to the padding module.
+    """
+    def __init__(self, in_channels=1, out_channels=1, kernel_sizes: tp.Sequence[int] = [5, 3],
+                 filters: int = 16, max_filters: int = 1024, downsample_scales: tp.Sequence[int] = [4, 4, 4, 4],
+                 inner_kernel_sizes: tp.Optional[tp.Sequence[int]] = None, groups: tp.Optional[tp.Sequence[int]] = None,
+                 strides: tp.Optional[tp.Sequence[int]] = None, paddings: tp.Optional[tp.Sequence[int]] = None,
+                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
+                 activation_params: dict = {'negative_slope': 0.2}, pad: str = 'ReflectionPad1d',
+                 pad_params: dict = {}):
+        super().__init__()
+        # Exactly two odd kernel sizes are required; the `(k - 1) // 2` paddings below rely on oddness.
+        assert len(kernel_sizes) == 2
+        assert kernel_sizes[0] % 2 == 1
+        assert kernel_sizes[1] % 2 == 1
+        # Per-layer overrides, when given, must provide one value per downsampling layer.
+        assert (inner_kernel_sizes is None or len(inner_kernel_sizes) == len(downsample_scales))
+        assert (groups is None or len(groups) == len(downsample_scales))
+        assert (strides is None or len(strides) == len(downsample_scales))
+        assert (paddings is None or len(paddings) == len(downsample_scales))
+        # Activation and padding modules are looked up by name in `torch.nn`.
+        self.activation = getattr(torch.nn, activation)(**activation_params)
+        self.convs = nn.ModuleList()
+        # Initial convolution: its kernel size is the product of both `kernel_sizes`,
+        # preceded by an explicit padding module of half that size.
+        self.convs.append(
+            nn.Sequential(
+                getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params),
+                NormConv1d(in_channels, filters, kernel_size=np.prod(kernel_sizes), stride=1, norm=norm)
+            )
+        )
+        in_chs = filters
+        for i, downsample_scale in enumerate(downsample_scales):
+            # Channels grow by the downsampling scale, capped at `max_filters`.
+            out_chs = min(in_chs * downsample_scale, max_filters)
+            # Defaults used whenever the matching per-layer override is not provided.
+            default_kernel_size = downsample_scale * 10 + 1
+            default_stride = downsample_scale
+            default_padding = (default_kernel_size - 1) // 2
+            default_groups = in_chs // 4
+            self.convs.append(
+                NormConv1d(in_chs, out_chs,
+                           kernel_size=inner_kernel_sizes[i] if inner_kernel_sizes else default_kernel_size,
+                           stride=strides[i] if strides else default_stride,
+                           groups=groups[i] if groups else default_groups,
+                           padding=paddings[i] if paddings else default_padding,
+                           norm=norm))
+            in_chs = out_chs
+        # Final feature convolution (stride 1) followed by the projection to `out_channels`.
+        out_chs = min(in_chs * 2, max_filters)
+        self.convs.append(NormConv1d(in_chs, out_chs, kernel_size=kernel_sizes[0], stride=1,
+                                     padding=(kernel_sizes[0] - 1) // 2, norm=norm))
+        self.conv_post = NormConv1d(out_chs, out_channels, kernel_size=kernel_sizes[1], stride=1,
+                                    padding=(kernel_sizes[1] - 1) // 2, norm=norm)
+    def forward(self, x: torch.Tensor):
+        """Apply every convolution followed by the activation, then `conv_post`.
+        Returns:
+            tuple: Output of `conv_post` and the list of feature maps (each activated
+                convolution output, followed by the `conv_post` output).
+        """
+        fmap = []
+        for layer in self.convs:
+            x = layer(x)
+            x = self.activation(x)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        # x = torch.flatten(x, 1, -1)
+        return x, fmap
+class MultiScaleDiscriminator(MultiDiscriminator):
+    """Multi-Scale (MSD) Discriminator.
+    The first sub-discriminator sees the input as-is; each following one sees the
+    signal average-pooled one more time than its predecessor.
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        downsample_factor (int): Downsampling factor between the different scales.
+        scale_norms (Sequence[str]): Normalization for each sub-discriminator.
+        **kwargs: Additional args for ScaleDiscriminator.
+    """
+    def __init__(self, in_channels: int = 1, out_channels: int = 1, downsample_factor: int = 2,
+                 scale_norms: tp.Sequence[str] = ['weight_norm', 'weight_norm', 'weight_norm'], **kwargs):
+        super().__init__()
+        # One sub-discriminator per entry of `scale_norms`, each with its own normalization.
+        self.discriminators = nn.ModuleList([
+            ScaleDiscriminator(in_channels, out_channels, norm=norm, **kwargs) for norm in scale_norms
+        ])
+        # Average pooling applied between consecutive scales.
+        self.downsample = nn.AvgPool1d(downsample_factor * 2, downsample_factor, padding=downsample_factor)
+    @property
+    def num_discriminators(self):
+        """Number of scale sub-discriminators."""
+        return len(self.discriminators)
+    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
+        """Run each sub-discriminator on a progressively downsampled input.
+        Returns:
+            tuple: List of logits and list of feature-map lists, one entry per sub-discriminator.
+        """
+        logits = []
+        fmaps = []
+        for i, disc in enumerate(self.discriminators):
+            if i != 0:
+                # Keep the pooled signal. The pooling result used to be discarded, which made
+                # every sub-discriminator operate on the same full-resolution input.
+                x = self.downsample(x)
+            logit, fmap = disc(x)
+            logits.append(logit)
+            fmaps.append(fmap)
+        return logits, fmaps

audiocraft/adversarial/discriminators/msstftd.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import torchaudio
+import torch
+from torch import nn
+from einops import rearrange
+from ...modules import NormConv2d
+from .base import MultiDiscriminator, MultiDiscriminatorOutputType
+def get_2d_padding(kernel_size: tp.Tuple[int, int], dilation: tp.Tuple[int, int] = (1, 1)):
+    """Return the pair of paddings `((k - 1) * d) // 2` for each axis of a 2D kernel `k` with dilation `d`."""
+    k0, d0 = kernel_size[0], dilation[0]
+    k1, d1 = kernel_size[1], dilation[1]
+    pad0 = ((k0 - 1) * d0) // 2
+    pad1 = ((k1 - 1) * d1) // 2
+    return (pad0, pad1)
+class DiscriminatorSTFT(nn.Module):
+    """STFT sub-discriminator.
+    Computes a complex spectrogram of the input, stacks its real and imaginary parts
+    along the channel axis and runs a stack of 2D convolutions over it.
+    Args:
+        filters (int): Number of filters in convolutions.
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        n_fft (int): Size of FFT for each scale.
+        hop_length (int): Length of hop between STFT windows for each scale.
+        win_length (int): Window size for each scale.
+        max_filters (int): Maximum number of filters.
+        filters_scale (int): Growth factor for the filters.
+        kernel_size (tuple of int): Inner Conv2d kernel sizes.
+        dilations (list of int): Inner Conv2d dilation on the time dimension.
+        stride (tuple of int): Inner Conv2d strides.
+        normalized (bool): Whether to normalize by magnitude after stft.
+        norm (str): Normalization method.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+    """
+    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1,
+                 n_fft: int = 1024, hop_length: int = 256, win_length: int = 1024, max_filters: int = 1024,
+                 filters_scale: int = 1, kernel_size: tp.Tuple[int, int] = (3, 9), dilations: tp.List = [1, 2, 4],
+                 stride: tp.Tuple[int, int] = (1, 2), normalized: bool = True, norm: str = 'weight_norm',
+                 activation: str = 'LeakyReLU', activation_params: dict = {'negative_slope': 0.2}):
+        super().__init__()
+        assert len(kernel_size) == 2
+        assert len(stride) == 2
+        self.filters = filters
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.normalized = normalized
+        # Activation module is looked up by name in `torch.nn`.
+        self.activation = getattr(torch.nn, activation)(**activation_params)
+        # `power=None` requests the complex spectrogram rather than a magnitude/power one.
+        self.spec_transform = torchaudio.transforms.Spectrogram(
+            n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window_fn=torch.hann_window,
+            normalized=self.normalized, center=False, pad_mode=None, power=None)
+        # Real and imaginary parts are concatenated on the channel axis in `forward`.
+        spec_channels = 2 * self.in_channels
+        self.convs = nn.ModuleList()
+        # NOTE(review): this first convolution does not receive `norm`, unlike all the others,
+        # so it uses NormConv2d's default normalization - confirm this is intended.
+        self.convs.append(
+            NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size))
+        )
+        # NOTE(review): the convolution above outputs `self.filters` channels, so this only
+        # matches when `filters_scale == 1` (the default) or `max_filters` caps both - confirm.
+        in_chs = min(filters_scale * self.filters, max_filters)
+        for i, dilation in enumerate(dilations):
+            out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters)
+            # Dilation is applied on the first kernel axis only, with matching padding.
+            self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,
+                                         dilation=(dilation, 1), padding=get_2d_padding(kernel_size, (dilation, 1)),
+                                         norm=norm))
+            in_chs = out_chs
+        out_chs = min((filters_scale ** (len(dilations) + 1)) * self.filters, max_filters)
+        # The last two convolutions use a square kernel built from `kernel_size[0]`.
+        self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=(kernel_size[0], kernel_size[0]),
+                                     padding=get_2d_padding((kernel_size[0], kernel_size[0])),
+                                     norm=norm))
+        self.conv_post = NormConv2d(out_chs, self.out_channels,
+                                    kernel_size=(kernel_size[0], kernel_size[0]),
+                                    padding=get_2d_padding((kernel_size[0], kernel_size[0])),
+                                    norm=norm)
+    def forward(self, x: torch.Tensor):
+        """Compute the spectrogram of `x` and run the convolution stack on it.
+        Returns:
+            tuple: Output of `conv_post` and the list of activated convolution outputs.
+                Unlike the other sub-discriminators in this package, the `conv_post`
+                output is not appended to the feature maps.
+        """
+        fmap = []
+        z = self.spec_transform(x)  # complex spectrogram; presumably [B, C, Freq, Frames] - TODO confirm
+        # Stack real and imaginary parts on the channel axis, giving 2 * C channels.
+        z = torch.cat([z.real, z.imag], dim=1)
+        # Swap the last two axes before the convolutions.
+        z = rearrange(z, 'b c w t -> b c t w')
+        for i, layer in enumerate(self.convs):
+            z = layer(z)
+            z = self.activation(z)
+            fmap.append(z)
+        z = self.conv_post(z)
+        return z, fmap
+class MultiScaleSTFTDiscriminator(MultiDiscriminator):
+    """Multi-Scale STFT (MS-STFT) discriminator.
+    Args:
+        filters (int): Number of filters in convolutions.
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        sep_channels (bool): Separate channels to distinct samples for stereo support.
+        n_ffts (Sequence[int]): Size of FFT for each scale.
+        hop_lengths (Sequence[int]): Length of hop between STFT windows for each scale.
+        win_lengths (Sequence[int]): Window size for each scale.
+        **kwargs: Additional args for `DiscriminatorSTFT`.
+    """
+    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1, sep_channels: bool = False,
+                 n_ffts: tp.List[int] = [1024, 2048, 512], hop_lengths: tp.List[int] = [256, 512, 128],
+                 win_lengths: tp.List[int] = [1024, 2048, 512], **kwargs):
+        super().__init__()
+        assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
+        self.sep_channels = sep_channels
+        # One STFT sub-discriminator per (n_fft, win_length, hop_length) triple, in order.
+        sub_discriminators = []
+        for scale, n_fft in enumerate(n_ffts):
+            sub_discriminators.append(
+                DiscriminatorSTFT(filters, in_channels=in_channels, out_channels=out_channels,
+                                  n_fft=n_fft, win_length=win_lengths[scale], hop_length=hop_lengths[scale],
+                                  **kwargs))
+        self.discriminators = nn.ModuleList(sub_discriminators)
+    @property
+    def num_discriminators(self):
+        """Number of STFT sub-discriminators."""
+        return len(self.discriminators)
+    def _separate_channels(self, x: torch.Tensor) -> torch.Tensor:
+        """Reshape a 3D input so that every channel becomes its own single-channel sample."""
+        _, _, n_samples = x.shape
+        return x.view(-1, 1, n_samples)
+    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
+        """Run every sub-discriminator on the same input `x`.
+        NOTE(review): `sep_channels` and `_separate_channels` are not used here - confirm intent.
+        Returns:
+            tuple: List of logits and list of feature-map lists, one entry per sub-discriminator.
+        """
+        outputs = [disc(x) for disc in self.discriminators]
+        logits = [logit for logit, _ in outputs]
+        fmaps = [fmap for _, fmap in outputs]
+        return logits, fmaps

audiocraft/adversarial/losses.py ADDED Viewed

	@@ -0,0 +1,228 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Utility module to handle adversarial losses without requiring to mess up the main training loop.
+"""
+import typing as tp
+import flashy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Names of the loss types accepted by the `get_*_criterion` helpers of this module.
+ADVERSARIAL_LOSSES = ['mse', 'hinge', 'hinge2']
+# Adversarial loss: a module or callable mapping one tensor (logits) to a loss tensor.
+AdvLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor], torch.Tensor]]
+# Feature matching loss: a module or callable taking two arguments and returning a loss tensor.
+FeatLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor, torch.Tensor], torch.Tensor]]
+class AdversarialLoss(nn.Module):
+    """Adversary training wrapper.
+    Args:
+        adversary (nn.Module): The adversary module will be used to estimate the logits given the fake and real samples.
+            We assume here the adversary output is ``Tuple[List[torch.Tensor], List[List[torch.Tensor]]]``
+            where the first item is a list of logits and the second item is a list of feature maps.
+        optimizer (torch.optim.Optimizer): Optimizer used for training the given module.
+        loss (AdvLossType): Loss function for generator training.
+        loss_real (AdvLossType): Loss function for adversarial training on logits from real samples.
+        loss_fake (AdvLossType): Loss function for adversarial training on logits from fake samples.
+        loss_feat (FeatLossType): Feature matching loss function for generator training.
+        normalize (bool): Whether to normalize by number of sub-discriminators.
+    Example of usage:
+        adv_loss = AdversarialLoss(adversaries, optimizer, loss, loss_real, loss_fake)
+        for real in loader:
+            noise = torch.randn(...)
+            fake = model(noise)
+            adv_loss.train_adv(fake, real)
+            loss, _ = adv_loss(fake, real)
+            loss.backward()
+    """
+    def __init__(self,
+                 adversary: nn.Module,
+                 optimizer: torch.optim.Optimizer,
+                 loss: AdvLossType,
+                 loss_real: AdvLossType,
+                 loss_fake: AdvLossType,
+                 loss_feat: tp.Optional[FeatLossType] = None,
+                 normalize: bool = True):
+        super().__init__()
+        self.adversary: nn.Module = adversary
+        # Presumably makes the adversary weights identical across distributed workers - see flashy docs.
+        flashy.distrib.broadcast_model(self.adversary)
+        self.optimizer = optimizer
+        self.loss = loss
+        self.loss_real = loss_real
+        self.loss_fake = loss_fake
+        self.loss_feat = loss_feat
+        self.normalize = normalize
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        """Save the module state and store the optimizer state under `prefix + 'optimizer'`."""
+        # Add the optimizer state dict inside our own.
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'optimizer'] = self.optimizer.state_dict()
+        return destination
+    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
+        """Restore the optimizer state stored under `prefix + 'optimizer'`, then the module state."""
+        # Load optimizer state.
+        # The entry is popped from `state_dict` (a missing key raises KeyError) before
+        # the remaining entries are handed to the parent implementation.
+        self.optimizer.load_state_dict(state_dict.pop(prefix + 'optimizer'))
+        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
+    def get_adversary_pred(self, x):
+        """Run adversary model, validating expected output format."""
+        logits, fmaps = self.adversary(x)
+        assert isinstance(logits, list) and all([isinstance(t, torch.Tensor) for t in logits]), \
+            f'Expecting a list of tensors as logits but {type(logits)} found.'
+        assert isinstance(fmaps, list), f'Expecting a list of features maps but {type(fmaps)} found.'
+        for fmap in fmaps:
+            assert isinstance(fmap, list) and all([isinstance(f, torch.Tensor) for f in fmap]), \
+                f'Expecting a list of tensors as feature maps but {type(fmap)} found.'
+        return logits, fmaps
+    def train_adv(self, fake: torch.Tensor, real: torch.Tensor) -> torch.Tensor:
+        """Train the adversary with the given fake and real example.
+        We assume the adversary output is the following format: Tuple[List[torch.Tensor], List[List[torch.Tensor]]].
+        The first item being the logits and second item being a list of feature maps for each sub-discriminator.
+        This will automatically synchronize gradients (with `flashy.distrib.eager_sync_model`)
+        and call the optimizer.
+        Returns:
+            torch.Tensor: The adversary loss that was back-propagated (not detached).
+        """
+        loss = torch.tensor(0., device=fake.device)
+        # Inputs are detached so that this step only produces gradients for the adversary.
+        all_logits_fake_is_fake, _ = self.get_adversary_pred(fake.detach())
+        all_logits_real_is_fake, _ = self.get_adversary_pred(real.detach())
+        n_sub_adversaries = len(all_logits_fake_is_fake)
+        # Sum the fake and real losses over all sub-adversaries.
+        for logit_fake_is_fake, logit_real_is_fake in zip(all_logits_fake_is_fake, all_logits_real_is_fake):
+            loss += self.loss_fake(logit_fake_is_fake) + self.loss_real(logit_real_is_fake)
+        if self.normalize:
+            loss /= n_sub_adversaries
+        self.optimizer.zero_grad()
+        with flashy.distrib.eager_sync_model(self.adversary):
+            loss.backward()
+        self.optimizer.step()
+        return loss
+    def forward(self, fake: torch.Tensor, real: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+        """Return the loss for the generator, i.e. trying to fool the adversary,
+        and feature matching loss if provided.
+        Returns:
+            tuple: Adversarial loss and feature matching loss (zero when `loss_feat` is not set).
+        """
+        adv = torch.tensor(0., device=fake.device)
+        feat = torch.tensor(0., device=fake.device)
+        # Presumably keeps the adversary parameters out of the backward pass - see flashy docs.
+        with flashy.utils.readonly(self.adversary):
+            all_logits_fake_is_fake, all_fmap_fake = self.get_adversary_pred(fake)
+            # Only the feature maps of the real prediction are used below; its logits are ignored.
+            all_logits_real_is_fake, all_fmap_real = self.get_adversary_pred(real)
+            n_sub_adversaries = len(all_logits_fake_is_fake)
+            for logit_fake_is_fake in all_logits_fake_is_fake:
+                adv += self.loss(logit_fake_is_fake)
+            if self.loss_feat:
+                for fmap_fake, fmap_real in zip(all_fmap_fake, all_fmap_real):
+                    feat += self.loss_feat(fmap_fake, fmap_real)
+        if self.normalize:
+            adv /= n_sub_adversaries
+            feat /= n_sub_adversaries
+        return adv, feat
+def get_adv_criterion(loss_type: str) -> tp.Callable:
+    """Return the generator loss function matching `loss_type` (one of `ADVERSARIAL_LOSSES`)."""
+    assert loss_type in ADVERSARIAL_LOSSES
+    candidates = (('mse', mse_loss), ('hinge', hinge_loss), ('hinge2', hinge2_loss))
+    for name, criterion in candidates:
+        if loss_type == name:
+            return criterion
+    raise ValueError('Unsupported loss')
+def get_fake_criterion(loss_type: str) -> tp.Callable:
+    """Return the adversary loss on fake samples matching `loss_type` (one of `ADVERSARIAL_LOSSES`)."""
+    assert loss_type in ADVERSARIAL_LOSSES
+    candidates = (('mse', mse_fake_loss), ('hinge', hinge_fake_loss), ('hinge2', hinge_fake_loss))
+    for name, criterion in candidates:
+        if loss_type == name:
+            return criterion
+    raise ValueError('Unsupported loss')
+def get_real_criterion(loss_type: str) -> tp.Callable:
+    """Return the adversary loss on real samples matching `loss_type` (one of `ADVERSARIAL_LOSSES`)."""
+    assert loss_type in ADVERSARIAL_LOSSES
+    candidates = (('mse', mse_real_loss), ('hinge', hinge_real_loss), ('hinge2', hinge_real_loss))
+    for name, criterion in candidates:
+        if loss_type == name:
+            return criterion
+    raise ValueError('Unsupported loss')
+def mse_real_loss(x: torch.Tensor) -> torch.Tensor:
+    """Mean squared error between `x` and a target of ones."""
+    target = torch.tensor(1., device=x.device).expand_as(x)
+    return F.mse_loss(x, target)
+def mse_fake_loss(x: torch.Tensor) -> torch.Tensor:
+    """Mean squared error between `x` and a target of zeros."""
+    target = torch.tensor(0., device=x.device).expand_as(x)
+    return F.mse_loss(x, target)
+def hinge_real_loss(x: torch.Tensor) -> torch.Tensor:
+    """Hinge loss on real logits: negated mean of `min(x - 1, 0)`."""
+    zero = torch.tensor(0., device=x.device).expand_as(x)
+    clipped = torch.min(x - 1, zero)
+    return -torch.mean(clipped)
+def hinge_fake_loss(x: torch.Tensor) -> torch.Tensor:
+    """Hinge loss on fake logits: negated mean of `min(-x - 1, 0)`."""
+    zero = torch.tensor(0., device=x.device).expand_as(x)
+    clipped = torch.min(-x - 1, zero)
+    return -torch.mean(clipped)
+def mse_loss(x: torch.Tensor) -> torch.Tensor:
+    """Generator MSE loss against a target of ones; a one-element zero tensor for empty input."""
+    if x.numel() != 0:
+        target = torch.tensor(1., device=x.device).expand_as(x)
+        return F.mse_loss(x, target)
+    return torch.tensor([0.0], device=x.device)
+def hinge_loss(x: torch.Tensor) -> torch.Tensor:
+    """Generator hinge loss: negated mean of `x`; a one-element zero tensor for empty input."""
+    if x.numel() != 0:
+        return -x.mean()
+    return torch.tensor([0.0], device=x.device)
+def hinge2_loss(x: torch.Tensor) -> torch.Tensor:
+    """Generator hinge loss with margin: negated mean of `min(x - 1, 0)`.
+    Args:
+        x (torch.Tensor): Logits produced by the adversary.
+    Returns:
+        torch.Tensor: The loss, or a one-element zero tensor when `x` is empty.
+    """
+    if x.numel() == 0:
+        # Stay on the device of `x`, like `mse_loss` and `hinge_loss`, so the result can be
+        # accumulated with losses computed on that device.
+        return torch.tensor([0.0], device=x.device)
+    return -torch.mean(torch.min(x - 1, torch.tensor(0., device=x.device).expand_as(x)))
+class FeatureMatchingLoss(nn.Module):
+    """Feature matching loss for adversarial training.
+    Args:
+        loss (nn.Module): Loss to use for feature matching (default=torch.nn.L1Loss).
+        normalize (bool): Whether to normalize the loss by the number of feature maps.
+    """
+    def __init__(self, loss: nn.Module = torch.nn.L1Loss(), normalize: bool = True):
+        super().__init__()
+        self.loss = loss
+        self.normalize = normalize
+    def forward(self, fmap_fake: tp.List[torch.Tensor], fmap_real: tp.List[torch.Tensor]) -> torch.Tensor:
+        """Sum `self.loss` over pairs of fake/real feature maps.
+        Args:
+            fmap_fake (list of torch.Tensor): Feature maps obtained from the fake sample.
+            fmap_real (list of torch.Tensor): Feature maps obtained from the real sample,
+                same length and same shapes as `fmap_fake`.
+        Returns:
+            torch.Tensor: Summed loss, divided by the number of feature maps when `normalize` is set.
+        """
+        assert len(fmap_fake) == len(fmap_real) and len(fmap_fake) > 0
+        feat_loss = torch.tensor(0., device=fmap_fake[0].device)
+        for feat_fake, feat_real in zip(fmap_fake, fmap_real):
+            assert feat_fake.shape == feat_real.shape
+            feat_loss += self.loss(feat_fake, feat_real)
+        if self.normalize:
+            # Both lists have the same length (asserted above), so this is the number of pairs.
+            feat_loss /= len(fmap_fake)
+        return feat_loss

audiocraft/data/__init__.py CHANGED Viewed

@@ -3,6 +3,8 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 # flake8: noqa
-from . import audio, audio_dataset

 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Audio loading and writing support. Datasets for raw audio
+or also including some metadata."""
 # flake8: noqa
+from . import audio, audio_dataset, info_audio_dataset, music_dataset, sound_dataset

audiocraft/data/audio.py CHANGED Viewed

@@ -18,11 +18,11 @@ import numpy as np
 import soundfile
 import torch
 from torch.nn import functional as F
-import torchaudio as ta
 import av
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio
 _av_initialized = False
@@ -78,7 +78,7 @@ def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: floa
         seek_time (float): Time at which to start reading in the file.
         duration (float): Duration to read from the file. If set to -1, the whole file is read.
     Returns:
-        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate
     """
     _init_av()
     with av.open(str(filepath)) as af:
@@ -123,7 +123,7 @@ def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
         duration (float): Duration to read from the file. If set to -1, the whole file is read.
         pad (bool): Pad output audio if not reaching expected duration.
     Returns:
-        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate.
     """
     fp = Path(filepath)
     if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
@@ -136,12 +136,6 @@ def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
         wav = torch.from_numpy(wav).t().contiguous()
         if len(wav.shape) == 1:
             wav = torch.unsqueeze(wav, 0)
-    elif (
-        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
-        and duration <= 0 and seek_time == 0
-    ):
-        # Torchaudio is faster if we load an entire file at once.
-        wav, sr = ta.load(fp)
     else:
         wav, sr = _av_read(filepath, seek_time, duration)
     if pad and duration > 0:
@@ -150,10 +144,22 @@ def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
     return wav, sr
 def audio_write(stem_name: tp.Union[str, Path],
                 wav: torch.Tensor, sample_rate: int,
-                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
-                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                 loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
@@ -162,8 +168,11 @@ def audio_write(stem_name: tp.Union[str, Path],
     Args:
         stem_name (str or Path): Filename without extension which will be added automatically.
-        format (str): Either "wav" or "mp3".
         mp3_rate (int): kbps when using mp3s.
         normalize (bool): if `True` (default), normalizes according to the prescribed
             strategy (see after). If `False`, the strategy is only used in case clipping
             would happen.
@@ -175,7 +184,7 @@ def audio_write(stem_name: tp.Union[str, Path],
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
         loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
-         when strategy is 'loudness'log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         make_parent_dir (bool): Make parent directory if it doesn't exist.
     Returns:
@@ -188,16 +197,23 @@ def audio_write(stem_name: tp.Union[str, Path],
         raise ValueError("Input wav should be at most 2 dimension.")
     assert wav.isfinite().all()
     wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
-                          sample_rate=sample_rate, stem_name=str(stem_name))
-    kwargs: dict = {}
     if format == 'mp3':
         suffix = '.mp3'
-        kwargs.update({"compression": mp3_rate})
     elif format == 'wav':
-        wav = i16_pcm(wav)
         suffix = '.wav'
-        kwargs.update({"encoding": "PCM_S", "bits_per_sample": 16})
     else:
         raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
     if not add_suffix:
@@ -206,7 +222,7 @@ def audio_write(stem_name: tp.Union[str, Path],
     if make_parent_dir:
         path.parent.mkdir(exist_ok=True, parents=True)
     try:
-        ta.save(path, wav, sample_rate, **kwargs)
     except Exception:
         if path.exists():
             # we do not want to leave half written files around.

 import soundfile
 import torch
 from torch.nn import functional as F
 import av
+import subprocess as sp
+from .audio_utils import f32_pcm, normalize_audio
 _av_initialized = False
         seek_time (float): Time at which to start reading in the file.
         duration (float): Duration to read from the file. If set to -1, the whole file is read.
     Returns:
+        tuple of torch.Tensor, int: Tuple containing audio data and sample rate
     """
     _init_av()
     with av.open(str(filepath)) as af:
         duration (float): Duration to read from the file. If set to -1, the whole file is read.
         pad (bool): Pad output audio if not reaching expected duration.
     Returns:
+        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
     """
     fp = Path(filepath)
     if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
         wav = torch.from_numpy(wav).t().contiguous()
         if len(wav.shape) == 1:
             wav = torch.unsqueeze(wav, 0)
     else:
         wav, sr = _av_read(filepath, seek_time, duration)
     if pad and duration > 0:
     return wav, sr
+def _piping_to_ffmpeg(out_path: tp.Union[str, Path], wav: torch.Tensor, sample_rate: int, flags: tp.List[str]):
+    """Write `wav` to `out_path` by piping raw float32 samples into an `ffmpeg` subprocess.
+    Args:
+        out_path (str or Path): Destination file handed to ffmpeg.
+        wav (torch.Tensor): 2D audio tensor; its first dimension is passed to ffmpeg as the channel count.
+        sample_rate (int): Sample rate passed to ffmpeg for the raw input.
+        flags (list of str): Extra ffmpeg output flags inserted before the output path.
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
+    """
+    # ffmpeg is always installed and torchaudio is a bit unstable lately, so let's bypass it entirely.
+    assert wav.dim() == 2, wav.shape
+    n_channels = wav.shape[0]
+    # Describe the raw stream read from stdin ('-'): float32 little-endian samples.
+    input_spec = ['-f', 'f32le', '-ar', str(sample_rate), '-ac', str(n_channels), '-i', '-']
+    command = ['ffmpeg', '-loglevel', 'error', '-y'] + input_spec + flags + [str(out_path)]
+    # Transpose so that the values of all channels for one time step are contiguous.
+    raw_samples = f32_pcm(wav).t().detach().cpu().numpy().tobytes()
+    sp.run(command, input=raw_samples, check=True)
 def audio_write(stem_name: tp.Union[str, Path],
                 wav: torch.Tensor, sample_rate: int,
+                format: str = 'wav', mp3_rate: int = 320, ogg_rate: tp.Optional[int] = None,
+                normalize: bool = True, strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                 loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
     Args:
         stem_name (str or Path): Filename without extension which will be added automatically.
+        wav (torch.Tensor): Audio data to save.
+        sample_rate (int): Sample rate of audio data.
+        format (str): Either "wav", "mp3", "ogg", or "flac".
         mp3_rate (int): kbps when using mp3s.
+        ogg_rate (int): kbps when using ogg/vorbis. If not provided, let ffmpeg decide for itself.
         normalize (bool): if `True` (default), normalizes according to the prescribed
             strategy (see after). If `False`, the strategy is only used in case clipping
             would happen.
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
         loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         make_parent_dir (bool): Make parent directory if it doesn't exist.
     Returns:
         raise ValueError("Input wav should be at most 2 dimension.")
     assert wav.isfinite().all()
     wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
+                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
+                          log_clipping=log_clipping, sample_rate=sample_rate,
+                          stem_name=str(stem_name))
     if format == 'mp3':
         suffix = '.mp3'
+        flags = ['-f', 'mp3', '-c:a', 'libmp3lame', '-b:a', f'{mp3_rate}k']
     elif format == 'wav':
         suffix = '.wav'
+        flags = ['-f', 'wav', '-c:a', 'pcm_s16le']
+    elif format == 'ogg':
+        suffix = '.ogg'
+        flags = ['-f', 'ogg', '-c:a', 'libvorbis']
+        if ogg_rate is not None:
+            flags += ['-b:a', f'{ogg_rate}k']
+    elif format == 'flac':
+        suffix = '.flac'
+        flags = ['-f', 'flac']
     else:
         raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
     if not add_suffix:
     if make_parent_dir:
         path.parent.mkdir(exist_ok=True, parents=True)
     try:
+        _piping_to_ffmpeg(path, wav, sample_rate, flags)
     except Exception:
         if path.exists():
             # we do not want to leave half written files around.

audiocraft/data/audio_dataset.py CHANGED Viewed

@@ -3,12 +3,16 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import argparse
 import copy
 from concurrent.futures import ThreadPoolExecutor, Future
 from dataclasses import dataclass, fields
 from contextlib import ExitStack
 import gzip
 import json
 import logging
@@ -81,9 +85,12 @@ class AudioMeta(BaseInfo):
 class SegmentInfo(BaseInfo):
     meta: AudioMeta
     seek_time: float
-    n_frames: int  # actual number of frames without padding
     total_frames: int  # total number of frames, padding included
-    sample_rate: int  # actual sample rate
 DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
@@ -114,8 +121,8 @@ def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
     Args:
         m (AudioMeta): Audio meta to resolve.
-        fast (bool): If True, uses a really fast check for determining if a file is already absolute or not.
-            Only valid on Linux/Mac.
     Returns:
         AudioMeta: Audio meta with resolved path.
     """
@@ -151,7 +158,7 @@ def find_audio_files(path: tp.Union[Path, str],
         progress (bool): Whether to log progress on audio files collection.
         workers (int): number of parallel workers, if 0, use only the current thread.
     Returns:
-        List[AudioMeta]: List of audio file path and its metadata.
     """
     audio_files = []
     futures: tp.List[Future] = []
@@ -203,7 +210,7 @@ def load_audio_meta(path: tp.Union[str, Path],
         resolve (bool): Whether to resolve the path from AudioMeta (default=True).
         fast (bool): activates some tricks to make things faster.
     Returns:
-        List[AudioMeta]: List of audio file path and its total duration.
     """
     open_fn = gzip.open if str(path).lower().endswith('.gz') else open
     with open_fn(path, 'rb') as fp:  # type: ignore
@@ -250,9 +257,14 @@ class AudioDataset:
     allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
     original audio meta.
     Args:
-        meta (tp.List[AudioMeta]): List of audio files metadata.
-        segment_duration (float): Optional segment duration of audio to load.
             If not specified, the dataset will load the full audio segment from the file.
         shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
         sample_rate (int): Target sample rate of the loaded audio samples.
@@ -266,10 +278,19 @@ class AudioDataset:
             is shorter than the desired segment.
         max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
         return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
-        min_audio_duration (tp.Optional[float], optional): Minimum audio file duration, in seconds, if provided
             audio shorter than this will be filtered out.
-        max_audio_duration (tp.Optional[float], optional): Maximal audio file duration in seconds, if provided
             audio longer than this will be filtered out.
     """
     def __init__(self,
                  meta: tp.List[AudioMeta],
@@ -285,16 +306,14 @@ class AudioDataset:
                  max_read_retry: int = 10,
                  return_info: bool = False,
                  min_audio_duration: tp.Optional[float] = None,
-                 max_audio_duration: tp.Optional[float] = None
                  ):
-        assert len(meta) > 0, 'No audio meta provided to AudioDataset. Please check loading of audio meta.'
         assert segment_duration is None or segment_duration > 0
         assert segment_duration is None or min_segment_ratio >= 0
-        logging.debug(f'sample_on_duration: {sample_on_duration}')
-        logging.debug(f'sample_on_weight: {sample_on_weight}')
-        logging.debug(f'pad: {pad}')
-        logging.debug(f'min_segment_ratio: {min_segment_ratio}')
         self.segment_duration = segment_duration
         self.min_segment_ratio = min_segment_ratio
         self.max_audio_duration = max_audio_duration
@@ -317,13 +336,25 @@ class AudioDataset:
         self.sampling_probabilities = self._get_sampling_probabilities()
         self.max_read_retry = max_read_retry
         self.return_info = return_info
     def __len__(self):
         return self.num_samples
     def _get_sampling_probabilities(self, normalized: bool = True):
-        """Return the sampling probabilities for each file inside `self.meta`.
-        """
         scores: tp.List[float] = []
         for file_meta in self.meta:
             score = 1.
@@ -337,12 +368,32 @@ class AudioDataset:
             probabilities /= probabilities.sum()
         return probabilities
-    def sample_file(self, rng: torch.Generator) -> AudioMeta:
-        """Sample a given file from `self.meta`. Can be overriden in subclasses.
         This is only called if `segment_duration` is not None.
         You must use the provided random number generator `rng` for reproducibility.
         """
         if not self.sample_on_weight and not self.sample_on_duration:
             file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
         else:
@@ -350,6 +401,15 @@ class AudioDataset:
         return self.meta[file_index]
     def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
         if self.segment_duration is None:
             file_meta = self.meta[index]
@@ -357,18 +417,22 @@ class AudioDataset:
             out = convert_audio(out, sr, self.sample_rate, self.channels)
             n_frames = out.shape[-1]
             segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
-                                       sample_rate=self.sample_rate)
         else:
             rng = torch.Generator()
             if self.shuffle:
-                # We use index, plus extra randomness
-                rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
             else:
                 # We only use index
                 rng.manual_seed(index)
             for retry in range(self.max_read_retry):
-                file_meta = self.sample_file(rng)
                 # We add some variance in the file position even if audio file is smaller than segment
                 # without ending up with empty segments
                 max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
@@ -381,7 +445,7 @@ class AudioDataset:
                     if self.pad:
                         out = F.pad(out, (0, target_frames - n_frames))
                     segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
-                                               sample_rate=self.sample_rate)
                 except Exception as exc:
                     logger.warning("Error opening file %s: %r", file_meta.path, exc)
                     if retry == self.max_read_retry - 1:
@@ -423,7 +487,7 @@ class AudioDataset:
             if to_pad:
                 # Each wav could be of a different duration as they are not segmented.
                 for i in range(len(samples)):
-                    # Determines the total legth of the signal with padding, so we update here as we pad.
                     segment_infos[i].total_frames = max_len
                     wavs[i] = _pad_wav(wavs[i])
@@ -436,9 +500,7 @@ class AudioDataset:
             return torch.stack(samples)
     def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
-        """Filters out audio files with short durations.
-        Removes from meta files that have durations that will not allow to samples examples from them.
-        """
         orig_len = len(meta)
         # Filter data that is too short.

 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""AudioDataset support. In order to handle a larger number of files
+without having to scan again the folders, we precompute some metadata
+(filename, sample rate, duration), and use that to efficiently sample audio segments.
+"""
 import argparse
 import copy
 from concurrent.futures import ThreadPoolExecutor, Future
 from dataclasses import dataclass, fields
 from contextlib import ExitStack
+from functools import lru_cache
 import gzip
 import json
 import logging
 class SegmentInfo(BaseInfo):
     meta: AudioMeta
     seek_time: float
+    # The following values are given once the audio is processed, e.g.
+    # at the target sample rate and target number of channels.
+    n_frames: int      # actual number of frames without padding
     total_frames: int  # total number of frames, padding included
+    sample_rate: int   # actual sample rate
+    channels: int      # number of audio channels.
 DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
     Args:
         m (AudioMeta): Audio meta to resolve.
+        fast (bool): If True, uses a really fast check for determining if a file
+            is already absolute or not. Only valid on Linux/Mac.
     Returns:
         AudioMeta: Audio meta with resolved path.
     """
         progress (bool): Whether to log progress on audio files collection.
         workers (int): number of parallel workers, if 0, use only the current thread.
     Returns:
+        list of AudioMeta: List of audio file path and its metadata.
     """
     audio_files = []
     futures: tp.List[Future] = []
         resolve (bool): Whether to resolve the path from AudioMeta (default=True).
         fast (bool): activates some tricks to make things faster.
     Returns:
+        list of AudioMeta: List of audio file path and its total duration.
     """
     open_fn = gzip.open if str(path).lower().endswith('.gz') else open
     with open_fn(path, 'rb') as fp:  # type: ignore
     allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
     original audio meta.
+    Note that you can call `start_epoch(epoch)` in order to get
+    a deterministic "randomization" for `shuffle=True`.
+    For a given epoch and dataset index, this will always return the same extract.
+    You can get back some diversity by setting the `shuffle_seed` param.
     Args:
+        meta (list of AudioMeta): List of audio files metadata.
+        segment_duration (float, optional): Optional segment duration of audio to load.
             If not specified, the dataset will load the full audio segment from the file.
         shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
         sample_rate (int): Target sample rate of the loaded audio samples.
             is shorter than the desired segment.
         max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
         return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
+        min_audio_duration (float, optional): Minimum audio file duration, in seconds, if provided
             audio shorter than this will be filtered out.
+        max_audio_duration (float, optional): Maximal audio file duration in seconds, if provided
             audio longer than this will be filtered out.
+        shuffle_seed (int): can be used to further randomize
+        load_wav (bool): if False, skip loading the wav but returns a tensor of 0
+            with the expected segment_duration (which must be provided if load_wav is False).
+        permutation_on_files (bool): only if `sample_on_weight` and `sample_on_duration`
+            are False. Will ensure a permutation on files when going through the dataset.
+            In that case the epoch number must be provided in order for the model
+            to continue the permutation across epochs. In that case, it is assumed
+            that `num_samples = total_batch_size * num_updates_per_epoch`, with
+            `total_batch_size` the overall batch size accounting for all gpus.
     """
     def __init__(self,
                  meta: tp.List[AudioMeta],
                  max_read_retry: int = 10,
                  return_info: bool = False,
                  min_audio_duration: tp.Optional[float] = None,
+                 max_audio_duration: tp.Optional[float] = None,
+                 shuffle_seed: int = 0,
+                 load_wav: bool = True,
+                 permutation_on_files: bool = False,
                  ):
+        assert len(meta) > 0, "No audio meta provided to AudioDataset. Please check loading of audio meta."
         assert segment_duration is None or segment_duration > 0
         assert segment_duration is None or min_segment_ratio >= 0
         self.segment_duration = segment_duration
         self.min_segment_ratio = min_segment_ratio
         self.max_audio_duration = max_audio_duration
         self.sampling_probabilities = self._get_sampling_probabilities()
         self.max_read_retry = max_read_retry
         self.return_info = return_info
+        self.shuffle_seed = shuffle_seed
+        self.current_epoch: tp.Optional[int] = None
+        self.load_wav = load_wav
+        if not load_wav:
+            assert segment_duration is not None
+        self.permutation_on_files = permutation_on_files
+        if permutation_on_files:
+            assert not self.sample_on_duration
+            assert not self.sample_on_weight
+            assert self.shuffle
+    def start_epoch(self, epoch: int):
+        self.current_epoch = epoch
     def __len__(self):
         return self.num_samples
     def _get_sampling_probabilities(self, normalized: bool = True):
+        """Return the sampling probabilities for each file inside `self.meta`."""
         scores: tp.List[float] = []
         for file_meta in self.meta:
             score = 1.
             probabilities /= probabilities.sum()
         return probabilities
+    @staticmethod
+    @lru_cache(16)
+    def _get_file_permutation(num_files: int, permutation_index: int, base_seed: int):
+        # Used to keep the most recent files permutation in memory implicitly.
+        # will work unless someone is using a lot of Datasets in parallel.
+        rng = torch.Generator()
+        rng.manual_seed(base_seed + permutation_index)
+        return torch.randperm(num_files, generator=rng)
+    def sample_file(self, index: int, rng: torch.Generator) -> AudioMeta:
+        """Sample a given file from `self.meta`. Can be overridden in subclasses.
         This is only called if `segment_duration` is not None.
         You must use the provided random number generator `rng` for reproducibility.
+        You can further make use of the index accessed.
         """
+        if self.permutation_on_files:
+            assert self.current_epoch is not None
+            total_index = self.current_epoch * len(self) + index
+            permutation_index = total_index // len(self.meta)
+            relative_index = total_index % len(self.meta)
+            permutation = AudioDataset._get_file_permutation(
+                len(self.meta), permutation_index, self.shuffle_seed)
+            file_index = permutation[relative_index]
+            return self.meta[file_index]
         if not self.sample_on_weight and not self.sample_on_duration:
             file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
         else:
         return self.meta[file_index]
+    def _audio_read(self, path: str, seek_time: float = 0, duration: float = -1):
+        # Override this method in subclass if needed.
+        if self.load_wav:
+            return audio_read(path, seek_time, duration, pad=False)
+        else:
+            assert self.segment_duration is not None
+            n_frames = int(self.sample_rate * self.segment_duration)
+            return torch.zeros(self.channels, n_frames), self.sample_rate
     def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
         if self.segment_duration is None:
             file_meta = self.meta[index]
             out = convert_audio(out, sr, self.sample_rate, self.channels)
             n_frames = out.shape[-1]
             segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
+                                       sample_rate=self.sample_rate, channels=out.shape[0])
         else:
             rng = torch.Generator()
             if self.shuffle:
+                # We use index, plus extra randomness, either totally random if we don't know the epoch.
+                # otherwise we make use of the epoch number and optional shuffle_seed.
+                if self.current_epoch is None:
+                    rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
+                else:
+                    rng.manual_seed(index + self.num_samples * (self.current_epoch + self.shuffle_seed))
             else:
                 # We only use index
                 rng.manual_seed(index)
             for retry in range(self.max_read_retry):
+                file_meta = self.sample_file(index, rng)
                 # We add some variance in the file position even if audio file is smaller than segment
                 # without ending up with empty segments
                 max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
                     if self.pad:
                         out = F.pad(out, (0, target_frames - n_frames))
                     segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
+                                               sample_rate=self.sample_rate, channels=out.shape[0])
                 except Exception as exc:
                     logger.warning("Error opening file %s: %r", file_meta.path, exc)
                     if retry == self.max_read_retry - 1:
             if to_pad:
                 # Each wav could be of a different duration as they are not segmented.
                 for i in range(len(samples)):
+                    # Determines the total length of the signal with padding, so we update here as we pad.
                     segment_infos[i].total_frames = max_len
                     wavs[i] = _pad_wav(wavs[i])
             return torch.stack(samples)
     def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
+        """Filters out audio files with audio durations that will not allow to sample examples from them."""
         orig_len = len(meta)
         # Filter data that is too short.

audiocraft/data/audio_utils.py CHANGED Viewed

@@ -3,7 +3,8 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import sys
 import typing as tp
@@ -47,8 +48,7 @@ def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor
 def convert_audio(wav: torch.Tensor, from_rate: float,
                   to_rate: float, to_channels: int) -> torch.Tensor:
-    """Convert audio to new sample rate and number of audio channels.
-    """
     wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
     wav = convert_audio_channels(wav, to_channels)
     return wav
@@ -66,7 +66,7 @@ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db
         loudness_compressor (bool): Uses tanh for soft clipping.
         energy_floor (float): anything below that RMS level will not be rescaled.
     Returns:
-        output (torch.Tensor): Loudness normalized output data.
     """
     energy = wav.pow(2).mean().sqrt().item()
     if energy < energy_floor:
@@ -117,7 +117,7 @@ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
         log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         sample_rate (int): Sample rate for the audio data (required for loudness).
-        stem_name (Optional[str]): Stem name for clipping logging.
     Returns:
         torch.Tensor: Normalized audio.
     """
@@ -150,17 +150,19 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
     """
     if wav.dtype.is_floating_point:
         return wav
-    else:
-        assert wav.dtype == torch.int16
         return wav.float() / 2**15
 def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
     """Convert audio to int 16 bits PCM format.
-    ..Warning:: There exist many formula for doing this convertion. None are perfect
-    due to the asymetry of the int16 range. One either have possible clipping, DC offset,
-    or inconsistancies with f32_pcm. If the given wav doesn't have enough headroom,
     it is possible that `i16_pcm(f32_pcm)) != Identity`.
     """
     if wav.dtype.is_floating_point:

 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Various utilities for audio conversion (pcm format, sample rate and channels),
+and volume normalization."""
 import sys
 import typing as tp
 def convert_audio(wav: torch.Tensor, from_rate: float,
                   to_rate: float, to_channels: int) -> torch.Tensor:
+    """Convert audio to new sample rate and number of audio channels."""
     wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
     wav = convert_audio_channels(wav, to_channels)
     return wav
         loudness_compressor (bool): Uses tanh for soft clipping.
         energy_floor (float): anything below that RMS level will not be rescaled.
     Returns:
+        torch.Tensor: Loudness normalized output data.
     """
     energy = wav.pow(2).mean().sqrt().item()
     if energy < energy_floor:
         log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         sample_rate (int): Sample rate for the audio data (required for loudness).
+        stem_name (str, optional): Stem name for clipping logging.
     Returns:
         torch.Tensor: Normalized audio.
     """
     """
     if wav.dtype.is_floating_point:
         return wav
+    elif wav.dtype == torch.int16:
         return wav.float() / 2**15
+    elif wav.dtype == torch.int32:
+        return wav.float() / 2**31
+    raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
 def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
     """Convert audio to int 16 bits PCM format.
+    ..Warning:: There exist many formulas for doing this conversion. None are perfect
+    due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
+    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
     it is possible that `i16_pcm(f32_pcm)) != Identity`.
     """
     if wav.dtype.is_floating_point:

audiocraft/data/info_audio_dataset.py ADDED Viewed

	@@ -0,0 +1,110 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Base classes for the datasets that also provide non-audio metadata,
+e.g. description, text transcription etc.
+"""
+from dataclasses import dataclass
+import logging
+import math
+import re
+import typing as tp
+import torch
+from .audio_dataset import AudioDataset, AudioMeta
+from ..environment import AudioCraftEnvironment
+from ..modules.conditioners import SegmentWithAttributes, ConditioningAttributes
+logger = logging.getLogger(__name__)
+def _clusterify_meta(meta: AudioMeta) -> AudioMeta:
+    """Rewrite the paths held by `meta` through the environment dataset mappers.
+    The given object is modified in place and returned.
+    """
+    remap = AudioCraftEnvironment.apply_dataset_mappers
+    meta.path = remap(meta.path)
+    info_path = meta.info_path
+    if info_path is not None:
+        # The zip path of the info file goes through the same mapping as the audio path.
+        info_path.zip_path = remap(info_path.zip_path)
+    return meta
+def clusterify_all_meta(meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
+    """Apply `_clusterify_meta` to every entry of `meta`.
+    A new list is returned, holding the same (patched in place) objects in the same order.
+    """
+    return list(map(_clusterify_meta, meta))
+@dataclass
+class AudioInfo(SegmentWithAttributes):
+    """Dummy SegmentInfo with empty attributes.
+    The InfoAudioDataset is expected to return metadata that inherits
+    from the SegmentWithAttributes class and can return conditioning attributes.
+    This basically guarantees that all datasets will be compatible with the current
+    solvers that contain conditioners requiring this.
+    """
+    audio_tokens: tp.Optional[torch.Tensor] = None  # populated when using cached batch for training a LM.
+    def to_condition_attributes(self) -> ConditioningAttributes:
+        """Return a `ConditioningAttributes` built with no arguments, i.e. carrying nothing from this object."""
+        return ConditioningAttributes()
+class InfoAudioDataset(AudioDataset):
+    """AudioDataset variant whose items carry their metadata as SegmentWithAttributes next to the waveform.
+    The metadata is only attached when `return_info` is set, otherwise the bare waveform is returned.
+    See `audiocraft.data.audio_dataset.AudioDataset` for initialization arguments.
+    """
+    def __init__(self, meta: tp.List[AudioMeta], **kwargs):
+        # Paths are remapped through `clusterify_all_meta` before the parent class sees them.
+        remapped_meta = clusterify_all_meta(meta)
+        super().__init__(remapped_meta, **kwargs)
+    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentWithAttributes]]:
+        item = super().__getitem__(index)
+        if self.return_info:
+            # The parent yields a (wav, segment info) pair; the info is re-wrapped as an AudioInfo.
+            wav, segment_info = item
+            return wav, AudioInfo(**segment_info.to_dict())
+        assert isinstance(item, torch.Tensor)
+        return item
+def get_keyword_or_keyword_list(
+        value: tp.Union[str, tp.List[str], None]) -> tp.Union[str, tp.List[str], None]:
+    """Preprocess a single keyword or possibly a list of keywords.
+    Args:
+        value (str or list of str, optional): Raw keyword, or list of raw keywords.
+    Returns:
+        str or list of str, optional: Result of `get_keyword_list` when `value` is a list,
+            otherwise the result of `get_keyword`.
+    """
+    if isinstance(value, list):
+        return get_keyword_list(value)
+    return get_keyword(value)
+def get_string(value: tp.Optional[str]) -> tp.Optional[str]:
+    """Preprocess a free-form string by stripping the surrounding whitespace.
+    Returns None when `value` is not a string, is empty, or is the literal 'None'.
+    """
+    if not isinstance(value, str) or value in ('', 'None'):
+        return None
+    return value.strip()
+def get_keyword(value: tp.Optional[str]) -> tp.Optional[str]:
+    """Preprocess a single keyword: strip the surrounding whitespace and lowercase it.
+    Returns None when `value` is not a string, is empty, or is the literal 'None'.
+    """
+    if not isinstance(value, str) or value in ('', 'None'):
+        return None
+    stripped = value.strip()
+    return stripped.lower()
+def get_keyword_list(values: tp.Union[str, tp.List[str]]) -> tp.Optional[tp.List[str]]:
+    """Preprocess a list of keywords.
+    A string is split on commas and whitespace, a float NaN counts as an empty list, and any
+    other non-list value is wrapped as a single stringified entry. Every entry then goes through
+    `get_keyword`, and entries that come back as None are discarded.
+    Returns None rather than an empty list when no keyword is left.
+    """
+    if isinstance(values, str):
+        values = list(map(str.strip, re.split(r'[,\s]', values)))
+    elif isinstance(values, float) and math.isnan(values):
+        values = []
+    if not isinstance(values, list):
+        logger.debug(f"Unexpected keyword list {values}")
+        values = [str(values)]
+    cleaned = (get_keyword(v) for v in values)
+    kw_list = [kw for kw in cleaned if kw is not None]
+    return kw_list if kw_list else None

audiocraft/data/music_dataset.py ADDED Viewed

	@@ -0,0 +1,270 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dataset of music tracks with rich metadata.
+"""
+from dataclasses import dataclass, field, fields, replace
+import gzip
+import json
+import logging
+from pathlib import Path
+import random
+import typing as tp
+import torch
+from .info_audio_dataset import (
+    InfoAudioDataset,
+    AudioInfo,
+    get_keyword_list,
+    get_keyword,
+    get_string
+)
+from ..modules.conditioners import (
+    ConditioningAttributes,
+    JointEmbedCondition,
+    WavCondition,
+)
+from ..utils.utils import warn_once
+logger = logging.getLogger(__name__)
+@dataclass
+class MusicInfo(AudioInfo):
+    """Segment info augmented with music metadata."""
+    # music-specific metadata
+    title: tp.Optional[str] = None
+    artist: tp.Optional[str] = None  # anonymized artist id, used to ensure no overlap between splits
+    key: tp.Optional[str] = None
+    bpm: tp.Optional[float] = None
+    genre: tp.Optional[str] = None
+    moods: tp.Optional[list] = None
+    keywords: tp.Optional[list] = None
+    description: tp.Optional[str] = None
+    name: tp.Optional[str] = None
+    instrument: tp.Optional[str] = None
+    # original wav accompanying the metadata
+    self_wav: tp.Optional[WavCondition] = None
+    # dict mapping attributes names to tuple of wav, text and metadata
+    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
+    @property
+    def has_music_meta(self) -> bool:
+        """Whether the `name` field is set."""
+        return self.name is not None
+    def to_condition_attributes(self) -> ConditioningAttributes:
+        """Spread every dataclass field over the wav, joint_embed and text conditions.
+        `self_wav` goes to the wav conditions, the entries of `joint_embed` are copied one by one,
+        and every remaining field becomes a text condition.
+        """
+        attributes = ConditioningAttributes()
+        for _field in fields(self):
+            name = _field.name
+            value = getattr(self, name)
+            if name == 'self_wav':
+                attributes.wav[name] = value
+                continue
+            if name == 'joint_embed':
+                for embed_attribute, embed_cond in value.items():
+                    attributes.joint_embed[embed_attribute] = embed_cond
+                continue
+            # Anything else is a text condition; lists are flattened to a space separated string.
+            attributes.text[name] = ' '.join(value) if isinstance(value, list) else value
+        return attributes
+    @staticmethod
+    def attribute_getter(attribute):
+        """Return the preprocessing function for the given attribute name, or None if there is none."""
+        if attribute == 'bpm':
+            return get_bpm
+        if attribute == 'key':
+            return get_musical_key
+        if attribute in ('moods', 'keywords'):
+            return get_keyword_list
+        if attribute in ('genre', 'name', 'instrument'):
+            return get_keyword
+        if attribute in ('title', 'artist', 'description'):
+            return get_string
+        return None
+    @classmethod
+    def from_dict(cls, dictionary: dict, fields_required: bool = False):
+        """Build an instance from a plain dictionary, preprocessing each value found.
+        Args:
+            dictionary (dict): Maps field names to raw values.
+            fields_required (bool): If True, a field missing from `dictionary` raises a KeyError,
+                except for 'keywords' and the attributes populated after init.
+        """
+        init_kwargs: tp.Dict[str, tp.Any] = {}
+        # allow a subset of attributes to not be loaded from the dictionary
+        # these attributes may be populated later
+        post_init_attributes = ('self_wav', 'joint_embed')
+        optional_fields = ('keywords',)
+        for _field in fields(cls):
+            if _field.name in post_init_attributes:
+                continue
+            if _field.name not in dictionary:
+                if fields_required and _field.name not in optional_fields:
+                    raise KeyError(f"Unexpected missing key: {_field.name}")
+                continue
+            preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
+            raw_value = dictionary[_field.name]
+            init_kwargs[_field.name] = preprocess_func(raw_value) if preprocess_func else raw_value
+        return cls(**init_kwargs)
+def augment_music_info_description(music_info: MusicInfo, merge_text_p: float = 0.,
+                                   drop_desc_p: float = 0., drop_other_p: float = 0.) -> MusicInfo:
+    """Augment MusicInfo description with additional metadata fields and potential dropout.
+    Additional textual attributes are added given probability `merge_text_p` and
+    the original textual description is dropped from the augmented description given probability `drop_desc_p`.
+    Each additional attribute is itself dropped, independently, given probability `drop_other_p`.
+    Args:
+        music_info (MusicInfo): The music metadata to augment.
+        merge_text_p (float): Probability of merging additional metadata to the description.
+            If provided value is 0, then no merging is performed.
+        drop_desc_p (float): Probability of dropping the original description on text merge.
+            if provided value is 0, then no drop out is performed.
+        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
+            If provided value is 0, every valid field is kept.
+    Returns:
+        MusicInfo: A copy of the MusicInfo with augmented textual description, the input is left untouched.
+    """
+    def is_valid_field(field_name: str, field_value: tp.Any) -> bool:
+        valid_field_name = field_name in ['key', 'bpm', 'genre', 'moods', 'instrument', 'keywords']
+        valid_field_value = field_value is not None and isinstance(field_value, (int, float, str, list))
+        # `drop_other_p` is a drop probability: the field is kept unless the draw falls below it
+        # (same convention as the `drop_desc_p` test further down).
+        keep_field = random.uniform(0, 1) >= drop_other_p
+        return valid_field_name and valid_field_value and keep_field
+    def process_value(v: tp.Any) -> str:
+        if isinstance(v, (int, float, str)):
+            return str(v)
+        if isinstance(v, list):
+            return ", ".join(v)
+        else:
+            raise ValueError(f"Unknown type for text value! ({type(v), v})")
+    description = music_info.description
+    metadata_text = ""
+    if random.uniform(0, 1) < merge_text_p:
+        meta_pairs = [f'{_field.name}: {process_value(getattr(music_info, _field.name))}'
+                      for _field in fields(music_info) if is_valid_field(_field.name, getattr(music_info, _field.name))]
+        # Shuffle so that the metadata entries do not always appear in field declaration order.
+        random.shuffle(meta_pairs)
+        metadata_text = ". ".join(meta_pairs)
+        description = description if not random.uniform(0, 1) < drop_desc_p else None
+        logger.debug(f"Applying text augmentation on MMI info. description: {description}, metadata: {metadata_text}")
+    if description is None:
+        description = metadata_text if len(metadata_text) > 1 else None
+    else:
+        # NOTE(review): when `metadata_text` is empty this still yields the description followed by
+        # a single '.', once stripped below. Kept as is to not alter existing text conditioning.
+        description = ". ".join([description.rstrip('.'), metadata_text])
+    description = description.strip() if description else None
+    # `replace` without changes gives a copy, so the caller's object is not mutated.
+    music_info = replace(music_info)
+    music_info.description = description
+    return music_info
+class Paraphraser:
+    def __init__(self, paraphrase_source: tp.Union[str, Path], paraphrase_p: float = 0.):
+        self.paraphrase_p = paraphrase_p
+        open_fn = gzip.open if str(paraphrase_source).lower().endswith('.gz') else open
+        with open_fn(paraphrase_source, 'rb') as f:  # type: ignore
+            self.paraphrase_source = json.loads(f.read())
+        logger.info(f"loaded paraphrasing source from: {paraphrase_source}")
+    def sample_paraphrase(self, audio_path: str, description: str):
+        if random.random() >= self.paraphrase_p:
+            return description
+        info_path = Path(audio_path).with_suffix('.json')
+        if info_path not in self.paraphrase_source:
+            warn_once(logger, f"{info_path} not in paraphrase source!")
+            return description
+        new_desc = random.choice(self.paraphrase_source[info_path])
+        logger.debug(f"{description} -> {new_desc}")
+        return new_desc
+class MusicDataset(InfoAudioDataset):
+    """Music dataset is an AudioDataset with music-related metadata.
+    Args:
+        info_fields_required (bool): Whether to enforce having required fields.
+        merge_text_p (float): Probability of merging additional metadata to the description.
+        drop_desc_p (float): Probability of dropping the original description on text merge.
+        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
+        joint_embed_attributes (list[str]): A list of attributes for which joint embedding metadata is returned.
+        paraphrase_source (str, optional): Path to the .json or .json.gz file containing the
+            paraphrases for the description. The json should be a dict with keys are the
+            original info path (e.g. track_path.json) and each value is a list of possible
+            paraphrased.
+        paraphrase_p (float): probability of taking a paraphrase.
+    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
+    """
+    def __init__(self, *args, info_fields_required: bool = True,
+                 merge_text_p: float = 0., drop_desc_p: float = 0., drop_other_p: float = 0.,
+                 joint_embed_attributes: tp.List[str] = [],
+                 paraphrase_source: tp.Optional[str] = None, paraphrase_p: float = 0,
+                 **kwargs):
+        kwargs['return_info'] = True  # We require the info for each song of the dataset.
+        super().__init__(*args, **kwargs)
+        self.info_fields_required = info_fields_required
+        self.merge_text_p = merge_text_p
+        self.drop_desc_p = drop_desc_p
+        self.drop_other_p = drop_other_p
+        self.joint_embed_attributes = joint_embed_attributes
+        self.paraphraser = None
+        if paraphrase_source is not None:
+            self.paraphraser = Paraphraser(paraphrase_source, paraphrase_p)
+    def __getitem__(self, index):
+        wav, info = super().__getitem__(index)
+        info_data = info.to_dict()
+        music_info_path = Path(info.meta.path).with_suffix('.json')
+        if Path(music_info_path).exists():
+            with open(music_info_path, 'r') as json_file:
+                music_data = json.load(json_file)
+                music_data.update(info_data)
+                music_info = MusicInfo.from_dict(music_data, fields_required=self.info_fields_required)
+            if self.paraphraser is not None:
+                music_info.description = self.paraphraser.sample(music_info.meta.path, music_info.description)
+            if self.merge_text_p:
+                music_info = augment_music_info_description(
+                    music_info, self.merge_text_p, self.drop_desc_p, self.drop_other_p)
+        else:
+            music_info = MusicInfo.from_dict(info_data, fields_required=False)
+        music_info.self_wav = WavCondition(
+            wav=wav[None], length=torch.tensor([info.n_frames]),
+            sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
+        for att in self.joint_embed_attributes:
+            att_value = getattr(music_info, att)
+            joint_embed_cond = JointEmbedCondition(
+                wav[None], [att_value], torch.tensor([info.n_frames]),
+                sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
+            music_info.joint_embed[att] = joint_embed_cond
+        return wav, music_info
+def get_musical_key(value: tp.Optional[str]) -> tp.Optional[str]:
+    """Preprocess key keywords, discarding them if there are multiple key defined."""
+    if value is None or (not isinstance(value, str)) or len(value) == 0 or value == 'None':
+        return None
+    elif ',' in value:
+        # For now, we discard when multiple keys are defined separated with comas
+        return None
+    else:
+        return value.strip().lower()
+def get_bpm(value: tp.Optional[str]) -> tp.Optional[float]:
+    """Preprocess to a float."""
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except ValueError:
+        return None

audiocraft/data/sound_dataset.py ADDED Viewed

	@@ -0,0 +1,330 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dataset of audio with a simple description.
+"""
+from dataclasses import dataclass, fields, replace
+import json
+from pathlib import Path
+import random
+import typing as tp
+import numpy as np
+import torch
+from .info_audio_dataset import (
+    InfoAudioDataset,
+    get_keyword_or_keyword_list
+)
+from ..modules.conditioners import (
+    ConditioningAttributes,
+    SegmentWithAttributes,
+    WavCondition,
+)
+# Machine epsilon of float32, added to denominators below to avoid dividing by zero.
+EPS = torch.finfo(torch.float32).eps
+# Lower and upper bounds (in dB) used by `snr_mixer` when drawing the RMS level of the final mix.
+TARGET_LEVEL_LOWER = -35
+TARGET_LEVEL_UPPER = -15
+@dataclass
+class SoundInfo(SegmentWithAttributes):
+    """Segment info augmented with Sound metadata.
+    """
+    description: tp.Optional[str] = None
+    self_wav: tp.Optional[torch.Tensor] = None
+    @property
+    def has_sound_meta(self) -> bool:
+        return self.description is not None
+    def to_condition_attributes(self) -> ConditioningAttributes:
+        out = ConditioningAttributes()
+        for _field in fields(self):
+            key, value = _field.name, getattr(self, _field.name)
+            if key == 'self_wav':
+                out.wav[key] = value
+            else:
+                out.text[key] = value
+        return out
+    @staticmethod
+    def attribute_getter(attribute):
+        if attribute == 'description':
+            preprocess_func = get_keyword_or_keyword_list
+        else:
+            preprocess_func = None
+        return preprocess_func
+    @classmethod
+    def from_dict(cls, dictionary: dict, fields_required: bool = False):
+        _dictionary: tp.Dict[str, tp.Any] = {}
+        # allow a subset of attributes to not be loaded from the dictionary
+        # these attributes may be populated later
+        post_init_attributes = ['self_wav']
+        for _field in fields(cls):
+            if _field.name in post_init_attributes:
+                continue
+            elif _field.name not in dictionary:
+                if fields_required:
+                    raise KeyError(f"Unexpected missing key: {_field.name}")
+            else:
+                preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
+                value = dictionary[_field.name]
+                if preprocess_func:
+                    value = preprocess_func(value)
+                _dictionary[_field.name] = value
+        return cls(**_dictionary)
+class SoundDataset(InfoAudioDataset):
+    """Sound audio dataset: Audio dataset with environmental sound-specific metadata.
+    Args:
+        info_fields_required (bool): Whether all the mandatory metadata fields should be in the loaded metadata.
+        external_metadata_source (tp.Optional[str]): Folder containing JSON metadata for the corresponding dataset.
+            The metadata files contained in this folder are expected to match the stem of the audio file with
+            a json extension.
+        aug_p (float): Probability of performing audio mixing augmentation on the batch.
+        mix_p (float): Proportion of batch items that are mixed together when applying audio mixing augmentation.
+        mix_snr_low (int): Lowerbound for SNR value sampled for mixing augmentation.
+        mix_snr_high (int): Upperbound for SNR value sampled for mixing augmentation.
+        mix_min_overlap (float): Minimum overlap between audio files when performing mixing augmentation.
+        kwargs: Additional arguments for AudioDataset.
+    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
+    """
+    def __init__(
+        self,
+        *args,
+        info_fields_required: bool = True,
+        external_metadata_source: tp.Optional[str] = None,
+        aug_p: float = 0.,
+        mix_p: float = 0.,
+        mix_snr_low: int = -5,
+        mix_snr_high: int = 5,
+        mix_min_overlap: float = 0.5,
+        **kwargs
+    ):
+        kwargs['return_info'] = True  # We require the info for each song of the dataset.
+        super().__init__(*args, **kwargs)
+        self.info_fields_required = info_fields_required
+        self.external_metadata_source = external_metadata_source
+        self.aug_p = aug_p
+        self.mix_p = mix_p
+        if self.aug_p > 0:
+            assert self.mix_p > 0, "Expecting some mixing proportion mix_p if aug_p > 0"
+            assert self.channels == 1, "SoundDataset with audio mixing considers only monophonic audio"
+        self.mix_snr_low = mix_snr_low
+        self.mix_snr_high = mix_snr_high
+        self.mix_min_overlap = mix_min_overlap
+    def _get_info_path(self, path: tp.Union[str, Path]) -> Path:
+        """Get path of JSON with metadata (description, etc.).
+        If there exists a JSON with the same name as 'path.name', then it will be used.
+        Else, such JSON will be searched for in an external json source folder if it exists.
+        """
+        info_path = Path(path).with_suffix('.json')
+        if Path(info_path).exists():
+            return info_path
+        elif self.external_metadata_source and (Path(self.external_metadata_source) / info_path.name).exists():
+            return Path(self.external_metadata_source) / info_path.name
+        else:
+            raise Exception(f"Unable to find a metadata JSON for path: {path}")
+    def __getitem__(self, index):
+        wav, info = super().__getitem__(index)
+        info_data = info.to_dict()
+        info_path = self._get_info_path(info.meta.path)
+        if Path(info_path).exists():
+            with open(info_path, 'r') as json_file:
+                sound_data = json.load(json_file)
+                sound_data.update(info_data)
+                sound_info = SoundInfo.from_dict(sound_data, fields_required=self.info_fields_required)
+                # if there are multiple descriptions, sample one randomly
+                if isinstance(sound_info.description, list):
+                    sound_info.description = random.choice(sound_info.description)
+        else:
+            sound_info = SoundInfo.from_dict(info_data, fields_required=False)
+        sound_info.self_wav = WavCondition(
+            wav=wav[None], length=torch.tensor([info.n_frames]),
+            sample_rate=[sound_info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
+        return wav, sound_info
+    def collater(self, samples):
+        # when training, audio mixing is performed in the collate function
+        wav, sound_info = super().collater(samples)  # SoundDataset always returns infos
+        if self.aug_p > 0:
+            wav, sound_info = mix_samples(wav, sound_info, self.aug_p, self.mix_p,
+                                          snr_low=self.mix_snr_low, snr_high=self.mix_snr_high,
+                                          min_overlap=self.mix_min_overlap)
+        return wav, sound_info
+def rms_f(x: torch.Tensor) -> torch.Tensor:
+    return (x ** 2).mean(1).pow(0.5)
+def normalize(audio: torch.Tensor, target_level: int = -25) -> torch.Tensor:
+    """Normalize the signal to the target level."""
+    rms = rms_f(audio)
+    scalar = 10 ** (target_level / 20) / (rms + EPS)
+    audio = audio * scalar.unsqueeze(1)
+    return audio
+def is_clipped(audio: torch.Tensor, clipping_threshold: float = 0.99) -> torch.Tensor:
+    return (abs(audio) > clipping_threshold).any(1)
+def mix_pair(src: torch.Tensor, dst: torch.Tensor, min_overlap: float) -> torch.Tensor:
+    start = random.randint(0, int(src.shape[1] * (1 - min_overlap)))
+    remainder = src.shape[1] - start
+    if dst.shape[1] > remainder:
+        src[:, start:] = src[:, start:] + dst[:, :remainder]
+    else:
+        src[:, start:start+dst.shape[1]] = src[:, start:start+dst.shape[1]] + dst
+    return src
+def snr_mixer(clean: torch.Tensor, noise: torch.Tensor, snr: int, min_overlap: float,
+              target_level: int = -25, clipping_threshold: float = 0.99) -> torch.Tensor:
+    """Function to mix clean speech and noise at various SNR levels.
+    Args:
+        clean (torch.Tensor): Clean audio source to mix, of shape [B, T].
+        noise (torch.Tensor): Noise audio source to mix, of shape [B, T].
+        snr (int): SNR level when mixing.
+        min_overlap (float): Minimum overlap between the two mixed sources.
+        target_level (int): Gain level in dB.
+        clipping_threshold (float): Threshold for clipping the audio.
+    Returns:
+        torch.Tensor: The mixed audio, of shape [B, T].
+    """
+    if clean.shape[1] > noise.shape[1]:
+        noise = torch.nn.functional.pad(noise, (0, clean.shape[1] - noise.shape[1]))
+    else:
+        noise = noise[:, :clean.shape[1]]
+    # normalizing to -25 dB FS
+    clean = clean / (clean.max(1)[0].abs().unsqueeze(1) + EPS)
+    clean = normalize(clean, target_level)
+    rmsclean = rms_f(clean)
+    noise = noise / (noise.max(1)[0].abs().unsqueeze(1) + EPS)
+    noise = normalize(noise, target_level)
+    rmsnoise = rms_f(noise)
+    # set the noise level for a given SNR
+    noisescalar = (rmsclean / (10 ** (snr / 20)) / (rmsnoise + EPS)).unsqueeze(1)
+    noisenewlevel = noise * noisescalar
+    # mix noise and clean speech
+    noisyspeech = mix_pair(clean, noisenewlevel, min_overlap)
+    # randomly select RMS value between -15 dBFS and -35 dBFS and normalize noisyspeech with that value
+    # there is a chance of clipping that might happen with very less probability, which is not a major issue.
+    noisy_rms_level = np.random.randint(TARGET_LEVEL_LOWER, TARGET_LEVEL_UPPER)
+    rmsnoisy = rms_f(noisyspeech)
+    scalarnoisy = (10 ** (noisy_rms_level / 20) / (rmsnoisy + EPS)).unsqueeze(1)
+    noisyspeech = noisyspeech * scalarnoisy
+    clean = clean * scalarnoisy
+    noisenewlevel = noisenewlevel * scalarnoisy
+    # final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
+    clipped = is_clipped(noisyspeech)
+    if clipped.any():
+        noisyspeech_maxamplevel = noisyspeech[clipped].max(1)[0].abs().unsqueeze(1) / (clipping_threshold - EPS)
+        noisyspeech[clipped] = noisyspeech[clipped] / noisyspeech_maxamplevel
+    return noisyspeech
+def snr_mix(src: torch.Tensor, dst: torch.Tensor, snr_low: int, snr_high: int, min_overlap: float):
+    if snr_low == snr_high:
+        snr = snr_low
+    else:
+        snr = np.random.randint(snr_low, snr_high)
+    mix = snr_mixer(src, dst, snr, min_overlap)
+    return mix
+def mix_text(src_text: str, dst_text: str):
+    """Mix text from different sources by concatenating them."""
+    if src_text == dst_text:
+        return src_text
+    return src_text + " " + dst_text
+def mix_samples(wavs: torch.Tensor, infos: tp.List[SoundInfo], aug_p: float, mix_p: float,
+                snr_low: int, snr_high: int, min_overlap: float):
+    """Mix samples within a batch, summing the waveforms and concatenating the text infos.
+    Args:
+        wavs (torch.Tensor): Audio tensors of shape [B, C, T].
+        infos (list[SoundInfo]): List of SoundInfo items corresponding to the audio.
+        aug_p (float): Augmentation probability.
+        mix_p (float): Proportion of items in the batch to mix (and merge) together.
+        snr_low (int): Lowerbound for sampling SNR.
+        snr_high (int): Upperbound for sampling SNR.
+        min_overlap (float): Minimum overlap between mixed samples.
+    Returns:
+        tuple[torch.Tensor, list[SoundInfo]]: A tuple containing the mixed wavs
+            and mixed SoundInfo for the given batch.
+    """
+    # no mixing to perform within the batch
+    if mix_p == 0:
+        return wavs, infos
+    if random.uniform(0, 1) < aug_p:
+        # perform all augmentations on waveforms as [B, T]
+        # randomly picking pairs of audio to mix
+        assert wavs.size(1) == 1, f"Mix samples requires monophonic audio but C={wavs.size(1)}"
+        wavs = wavs.mean(dim=1, keepdim=False)
+        B, T = wavs.shape
+        k = int(mix_p * B)
+        mixed_sources_idx = torch.randperm(B)[:k]
+        mixed_targets_idx = torch.randperm(B)[:k]
+        aug_wavs = snr_mix(
+            wavs[mixed_sources_idx],
+            wavs[mixed_targets_idx],
+            snr_low,
+            snr_high,
+            min_overlap,
+        )
+        # mixing textual descriptions in metadata
+        descriptions = [info.description for info in infos]
+        aug_infos = []
+        for i, j in zip(mixed_sources_idx, mixed_targets_idx):
+            text = mix_text(descriptions[i], descriptions[j])
+            m = replace(infos[i])
+            m.description = text
+            aug_infos.append(m)
+        # back to [B, C, T]
+        aug_wavs = aug_wavs.unsqueeze(1)
+        assert aug_wavs.shape[0] > 0, "Samples mixing returned empty batch."
+        assert aug_wavs.dim() == 3, f"Returned wav should be [B, C, T] but dim = {aug_wavs.dim()}"
+        assert aug_wavs.shape[0] == len(aug_infos), "Mismatch between number of wavs and infos in the batch"
+        return aug_wavs, aug_infos  # [B, C, T]
+    else:
+        # randomly pick samples in the batch to match
+        # the batch size when performing audio mixing
+        B, C, T = wavs.shape
+        k = int(mix_p * B)
+        wav_idx = torch.randperm(B)[:k]
+        wavs = wavs[wav_idx]
+        infos = [infos[i] for i in wav_idx]
+        assert wavs.shape[0] == len(infos), "Mismatch between number of wavs and infos in the batch"
+        return wavs, infos  # [B, C, T]

audiocraft/data/zip.py CHANGED Viewed

@@ -3,6 +3,8 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import typing
 import zipfile
@@ -18,13 +20,13 @@ MODE = Literal['r', 'w', 'x', 'a']
 @dataclass(order=True)
 class PathInZip:
-    """Class for holding a path of file within a zip file.
     Args:
-        path: The convention is <path_to_zip>:<relative_path_inside_zip>
             Let's assume there is a zip file /some/location/foo.zip
             and inside of it is a json file located at /data/file1.json,
-            Then we expect path = "/some/location/foo.zip:/data/file1.json"
     """
     INFO_PATH_SEP = ':'
@@ -55,7 +57,7 @@ def set_zip_cache_size(max_size: int):
     """Sets the maximal LRU caching for zip file opening.
     Args:
-        max_size: the maximal LRU cache.
     """
     global _cached_open_zip
     _cached_open_zip = lru_cache(max_size)(_open_zip)
@@ -65,8 +67,8 @@ def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
     """Opens a file stored inside a zip and returns a file-like object.
     Args:
-        path_in_zip: A PathInZip object representing the file to return a file-like object of.
-        mode: The mode in which to open the file with.
     Returns:
         A file-like object for PathInZip.
     """

 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+"""Utility for reading some info from inside a zip file.
+"""
 import typing
 import zipfile
 @dataclass(order=True)
 class PathInZip:
+    """Hold a path of file within a zip file.
     Args:
+        path (str): The convention is <path_to_zip>:<relative_path_inside_zip>.
             Let's assume there is a zip file /some/location/foo.zip
             and inside of it is a json file located at /data/file1.json,
+            Then we expect path = "/some/location/foo.zip:/data/file1.json".
     """
     INFO_PATH_SEP = ':'
     """Sets the maximal LRU caching for zip file opening.
     Args:
+        max_size (int): the maximal LRU cache.
     """
     global _cached_open_zip
     _cached_open_zip = lru_cache(max_size)(_open_zip)
     """Opens a file stored inside a zip and returns a file-like object.
     Args:
+        path_in_zip (PathInZip): A PathInZip object representing the file to return a file-like object of.
+        mode (str): The mode in which to open the file with.
     Returns:
         A file-like object for PathInZip.
     """

audiocraft/environment.py ADDED Viewed

	@@ -0,0 +1,176 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Provides cluster and tools configuration across clusters (slurm, dora, utilities).
+"""
+import logging
+import os
+from pathlib import Path
+import re
+import typing as tp
+import omegaconf
+from .utils.cluster import _guess_cluster_type
+logger = logging.getLogger(__name__)
+class AudioCraftEnvironment:
+    """Environment configuration for teams and clusters.
+    AudioCraftEnvironment picks compute cluster settings (slurm, dora) from the current running environment
+    or declared variable and the loaded team configuration. Additionally, the AudioCraftEnvironment
+    provides pointers to a reference folder resolved automatically across clusters that is shared across team members,
+    allowing to share sigs or other files to run jobs. Finally, it provides dataset mappers to automatically
+    map dataset file paths to new locations across clusters, allowing to use the same manifest of files across cluters.
+    The cluster type is identified automatically and base configuration file is read from config/teams.yaml.
+    Use the following environment variables to specify the cluster, team or configuration:
+        AUDIOCRAFT_CLUSTER (optional): Cluster type to enforce. Useful if the cluster type
+            cannot be inferred automatically.
+        AUDIOCRAFT_CONFIG (optional): Path to yaml config holding the teams configuration.
+            If not set, configuration is read from config/teams.yaml.
+        AUDIOCRAFT_TEAM (optional): Name of the team. Recommended to set to your own team.
+            Cluster configuration are shared across teams to match compute allocation,
+            specify your cluster configuration in the configuration file under a key mapping
+            your team name.
+    """
+    _instance = None
+    DEFAULT_TEAM = "default"
+    def __init__(self) -> None:
+        """Loads configuration."""
+        self.team: str = os.getenv("AUDIOCRAFT_TEAM", self.DEFAULT_TEAM)
+        cluster_type = _guess_cluster_type()
+        cluster = os.getenv(
+            "AUDIOCRAFT_CLUSTER", cluster_type.value
+        )
+        logger.info("Detecting cluster type %s", cluster_type)
+        self.cluster: str = cluster
+        config_path = os.getenv(
+            "AUDIOCRAFT_CONFIG",
+            Path(__file__)
+            .parent.parent.joinpath("config/teams", self.team)
+            .with_suffix(".yaml"),
+        )
+        self.config = omegaconf.OmegaConf.load(config_path)
+        self._dataset_mappers = []
+        cluster_config = self._get_cluster_config()
+        if "dataset_mappers" in cluster_config:
+            for pattern, repl in cluster_config["dataset_mappers"].items():
+                regex = re.compile(pattern)
+                self._dataset_mappers.append((regex, repl))
+    def _get_cluster_config(self) -> omegaconf.DictConfig:
+        assert isinstance(self.config, omegaconf.DictConfig)
+        return self.config[self.cluster]
+    @classmethod
+    def instance(cls):
+        if cls._instance is None:
+            cls._instance = cls()
+        return cls._instance
+    @classmethod
+    def reset(cls):
+        """Clears the environment and forces a reload on next invocation."""
+        cls._instance = None
+    @classmethod
+    def get_team(cls) -> str:
+        """Gets the selected team as dictated by the AUDIOCRAFT_TEAM env var.
+        If not defined, defaults to "labs".
+        """
+        return cls.instance().team
+    @classmethod
+    def get_cluster(cls) -> str:
+        """Gets the detected cluster.
+        This value can be overridden by the AUDIOCRAFT_CLUSTER env var.
+        """
+        return cls.instance().cluster
+    @classmethod
+    def get_dora_dir(cls) -> Path:
+        """Gets the path to the dora directory for the current team and cluster.
+        Value is overridden by the AUDIOCRAFT_DORA_DIR env var.
+        """
+        cluster_config = cls.instance()._get_cluster_config()
+        dora_dir = os.getenv("AUDIOCRAFT_DORA_DIR", cluster_config["dora_dir"])
+        logger.warning(f"Dora directory: {dora_dir}")
+        return Path(dora_dir)
+    @classmethod
+    def get_reference_dir(cls) -> Path:
+        """Gets the path to the reference directory for the current team and cluster.
+        Value is overridden by the AUDIOCRAFT_REFERENCE_DIR env var.
+        """
+        cluster_config = cls.instance()._get_cluster_config()
+        return Path(os.getenv("AUDIOCRAFT_REFERENCE_DIR", cluster_config["reference_dir"]))
+    @classmethod
+    def get_slurm_exclude(cls) -> tp.Optional[str]:
+        """Get the list of nodes to exclude for that cluster."""
+        cluster_config = cls.instance()._get_cluster_config()
+        return cluster_config.get("slurm_exclude")
+    @classmethod
+    def get_slurm_partitions(cls, partition_types: tp.Optional[tp.List[str]] = None) -> str:
+        """Gets the requested partitions for the current team and cluster as a comma-separated string.
+        Args:
+            partition_types (list[str], optional): partition types to retrieve. Values must be
+                from ['global', 'team']. If not provided, the global partition is returned.
+        """
+        if not partition_types:
+            partition_types = ["global"]
+        cluster_config = cls.instance()._get_cluster_config()
+        partitions = [
+            cluster_config["partitions"][partition_type]
+            for partition_type in partition_types
+        ]
+        return ",".join(partitions)
+    @classmethod
+    def resolve_reference_path(cls, path: tp.Union[str, Path]) -> Path:
+        """Converts reference placeholder in path with configured reference dir to resolve paths.
+        Args:
+            path (str or Path): Path to resolve.
+        Returns:
+            Path: Resolved path.
+        """
+        path = str(path)
+        if path.startswith("//reference"):
+            reference_dir = cls.get_reference_dir()
+            logger.warn(f"Reference directory: {reference_dir}")
+            assert (
+                reference_dir.exists() and reference_dir.is_dir()
+            ), f"Reference directory does not exist: {reference_dir}."
+            path = re.sub("^//reference", str(reference_dir), path)
+        return Path(path)
+    @classmethod
+    def apply_dataset_mappers(cls, path: str) -> str:
+        """Applies dataset mapping regex rules as defined in the configuration.
+        If no rules are defined, the path is returned as-is.
+        """
+        instance = cls.instance()
+        for pattern, repl in instance._dataset_mappers:
+            path = pattern.sub(repl, path)
+        return path

audiocraft/grids/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dora Grids."""

audiocraft/grids/_base_explorers.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from abc import ABC, abstractmethod
+import time
+import typing as tp
+from dora import Explorer
+import treetable as tt
+def get_sheep_ping(sheep) -> tp.Optional[str]:
+    """Return the amount of time since the Sheep made some update
+    to its log. Returns a str using the relevant time unit."""
+    ping = None
+    if sheep.log is not None and sheep.log.exists():
+        delta = time.time() - sheep.log.stat().st_mtime
+        if delta > 3600 * 24:
+            ping = f'{delta / (3600 * 24):.1f}d'
+        elif delta > 3600:
+            ping = f'{delta / (3600):.1f}h'
+        elif delta > 60:
+            ping = f'{delta / 60:.1f}m'
+        else:
+            ping = f'{delta:.1f}s'
+    return ping
+class BaseExplorer(ABC, Explorer):
+    """Base explorer for AudioCraft grids.
+    All task specific solvers are expected to implement the `get_grid_metrics`
+    method to specify logic about metrics to display for a given task.
+    If additional stages are used, the child explorer must define how to handle
+    these new stages in the `process_history` and `process_sheep` methods.
+    """
+    def stages(self):
+        return ["train", "valid", "evaluate"]
+    def get_grid_meta(self):
+        """Returns the list of Meta information to display for each XP/job.
+        """
+        return [
+            tt.leaf("index", align=">"),
+            tt.leaf("name", wrap=140),
+            tt.leaf("state"),
+            tt.leaf("sig", align=">"),
+            tt.leaf("sid", align="<"),
+        ]
+    @abstractmethod
+    def get_grid_metrics(self):
+        """Return the metrics that should be displayed in the tracking table.
+        """
+        ...
+    def process_sheep(self, sheep, history):
+        train = {
+            "epoch": len(history),
+        }
+        parts = {"train": train}
+        for metrics in history:
+            for key, sub in metrics.items():
+                part = parts.get(key, {})
+                if 'duration' in sub:
+                    # Convert to minutes for readability.
+                    sub['duration'] = sub['duration'] / 60.
+                part.update(sub)
+                parts[key] = part
+        ping = get_sheep_ping(sheep)
+        if ping is not None:
+            for name in self.stages():
+                if name not in parts:
+                    parts[name] = {}
+                # Add the ping to each part for convenience.
+                parts[name]['ping'] = ping
+        return parts

audiocraft/grids/audiogen/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""AudioGen grids."""

audiocraft/grids/audiogen/audiogen_base_16khz.py ADDED Viewed

	@@ -0,0 +1,23 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from ..musicgen._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+@LMExplorer
+def explorer(launcher):
+    """Grid scheduling one XP with the `audiogen/audiogen_base_16khz` solver
+    at the `medium` model scale, on 64 GPUs."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=64, partition=partitions)
+    launcher.bind_(solver='audiogen/audiogen_base_16khz')
+    # replace this by the desired environmental sound dataset
+    launcher.bind_(dset='internal/sounds_16khz')
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    medium = {'model/lm/model_scale': 'medium'}
+    # The FSDP overrides are bound on the launcher before the XP is scheduled.
+    launcher.bind_(fsdp)
+    # Schedule the single XP of this grid with the medium scale override.
+    launcher(medium)

audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Evaluation with objective metrics for the pretrained AudioGen models.
+This grid takes signature from the training grid and runs evaluation-only stage.
+When running the grid for the first time, please use:
+REGEN=1 dora grid audiogen.audiogen_pretrained_16khz_eval
+and re-use the REGEN=1 option when the grid is changed to force regenerating it.
+Note that you need the proper metrics external libraries setup to use all
+the objective metrics activated in this grid. Refer to the README for more information.
+"""
+import os
+from ..musicgen._explorers import GenerationEvalExplorer
+from ...environment import AudioCraftEnvironment
+from ... import train
+# NOTE(review): this name shadows the builtin `eval` inside this module; it is kept
+# as is since renaming it would change the module interface.
+def eval(launcher, batch_size: int = 32):
+    """Schedule one objective evaluation XP on top of the given launcher.
+    Args:
+        launcher: Launcher to derive the evaluation XP from (the caller in this
+            file passes one already bound to a pretrained model).
+        batch_size (int): Value given to the `+dataset.evaluate.batch_size` override.
+    """
+    opts = {
+        'dset': 'audio/audiocaps_16khz',
+        'solver/audiogen/evaluation': 'objective_eval',
+        'execute_only': 'evaluate',
+        '+dataset.evaluate.batch_size': batch_size,
+        '+metrics.fad.tf.batch_size': 32,
+    }
+    # binary for FAD computation: replace this path with your own path
+    metrics_opts = {
+        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
+    }
+    # Generation (sampling with top-k 250) and two step CFG overrides used for the XP.
+    opt1 = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
+    opt2 = {'transformer_lm.two_step_cfg': True}
+    # `sub` is derived with `bind` so that the launcher given by the caller
+    # is not the object receiving the extra bindings below.
+    sub = launcher.bind(opts)
+    sub.bind_(metrics_opts)
+    # base objective metrics
+    sub(opt1, opt2)
+@GenerationEvalExplorer
+def explorer(launcher):
+    """Evaluation grid for the pretrained AudioGen medium model.
+    Without `REGEN` in the environment, the XPs already linked in the grid folder
+    are scheduled again from their signature. With `REGEN` set, the grid is
+    rebuilt from the pretrained checkpoint.
+    """
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=4, partition=partitions)
+    if 'REGEN' not in os.environ:
+        # Keep what follows the second dot of the module name as the grid folder name
+        # (presumably `audiogen.<grid>` when imported as `audiocraft.grids.audiogen.<grid>`).
+        folder = train.main.dora.dir / 'grids' / __name__.split('.', 2)[-1]
+        with launcher.job_array():
+            for sig in folder.iterdir():
+                # Only symlink entries are treated as XP signatures.
+                if not sig.is_symlink():
+                    continue
+                xp = train.main.get_xp_from_sig(sig.name)
+                launcher(xp.argv)
+        return
+    audiogen_base = launcher.bind(solver="audiogen/audiogen_base_16khz")
+    audiogen_base.bind_({'autocast': False, 'fsdp.use': True})
+    audiogen_base_medium = audiogen_base.bind({'continue_from': '//pretrained/facebook/audiogen-medium'})
+    audiogen_base_medium.bind_({'model/lm/model_scale': 'medium'})
+    eval(audiogen_base_medium, batch_size=128)

audiocraft/grids/compression/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""EnCodec grids."""

audiocraft/grids/compression/_explorers.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import treetable as tt
+from .._base_explorers import BaseExplorer
+class CompressionExplorer(BaseExplorer):
+    eval_metrics = ["sisnr", "visqol"]
+    def stages(self):
+        return ["train", "valid", "evaluate"]
+    def get_grid_meta(self):
+        """Returns the list of Meta information to display for each XP/job.
+        """
+        return [
+            tt.leaf("index", align=">"),
+            tt.leaf("name", wrap=140),
+            tt.leaf("state"),
+            tt.leaf("sig", align=">"),
+        ]
+    def get_grid_metrics(self):
+        """Return the metrics that should be displayed in the tracking table.
+        """
+        return [
+            tt.group(
+                "train",
+                [
+                    tt.leaf("epoch"),
+                    tt.leaf("bandwidth", ".2f"),
+                    tt.leaf("adv", ".4f"),
+                    tt.leaf("d_loss", ".4f"),
+                ],
+                align=">",
+            ),
+            tt.group(
+                "valid",
+                [
+                    tt.leaf("bandwidth", ".2f"),
+                    tt.leaf("adv", ".4f"),
+                    tt.leaf("msspec", ".4f"),
+                    tt.leaf("sisnr", ".2f"),
+                ],
+                align=">",
+            ),
+            tt.group(
+                "evaluate", [tt.leaf(name, ".3f") for name in self.eval_metrics], align=">"
+            ),
+        ]

audiocraft/grids/compression/debug.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Grid search file, simply list all the exp you want in `explorer`.
+Any new exp added there will be scheduled.
+You can cancel an experiment by commenting out its line.
+This grid is a minimal example for debugging compression task
+and how to override parameters directly in a grid.
+Learn more about dora grids: https://github.com/facebookresearch/dora
+"""
+from ._explorers import CompressionExplorer
+from ...environment import AudioCraftEnvironment
+@CompressionExplorer
+def explorer(launcher):
+    """Minimal debug grid for the compression task: one XP with the
+    `compression/debug` solver config and one with overridden RVQ parameters."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=2, partition=partitions)
+    launcher.bind_(solver='compression/debug')
+    with launcher.job_array():
+        # base debug task using config from solver=compression/debug
+        launcher()
+        # we can override parameters in the grid to launch additional xps
+        launcher({'rvq.bins': 2048, 'rvq.n_q': 4})

audiocraft/grids/compression/encodec_audiogen_16khz.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Grid search file, simply list all the exp you want in `explorer`.
+Any new exp added there will be scheduled.
+You can cancel an experiment by commenting out its line.
+This grid shows how to train the new AudioGen EnCodec model at 16 kHz.
+"""
+from ._explorers import CompressionExplorer
+from ...environment import AudioCraftEnvironment
+@CompressionExplorer
+def explorer(launcher):
+    """Grid scheduling one XP with the `compression/encodec_audiogen_16khz` solver on 8 GPUs."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=8, partition=partitions)
+    # use configuration for AudioGen's EnCodec model trained on monophonic audio sampled at 16 kHz
+    # AudioGen's EnCodec is trained with a total stride of 320 leading to a frame rate of 50 hz
+    launcher.bind_(solver='compression/encodec_audiogen_16khz')
+    # replace this by the desired sound dataset
+    launcher.bind_(dset='internal/sounds_16khz')
+    # launch xp
+    launcher()

audiocraft/grids/compression/encodec_base_24khz.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Grid search file, simply list all the exp you want in `explorer`.
+Any new exp added there will be scheduled.
+You can cancel an experiment by commenting out its line.
+This grid shows how to train a base causal EnCodec model at 24 kHz.
+"""
+from ._explorers import CompressionExplorer
+from ...environment import AudioCraftEnvironment
+@CompressionExplorer
+def explorer(launcher):
+    """Grid scheduling one XP with the `compression/encodec_base_24khz` solver on 8 GPUs."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=8, partition=partitions)
+    # base causal EnCodec trained on monophonic audio sampled at 24 kHz
+    launcher.bind_(solver='compression/encodec_base_24khz')
+    # replace this by the desired dataset
+    launcher.bind_(dset='audio/example')
+    # launch xp
+    launcher()

audiocraft/grids/compression/encodec_musicgen_32khz.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Grid search file, simply list all the exp you want in `explorer`.
+Any new exp added there will be scheduled.
+You can cancel an experiment by commenting out its line.
+This grid shows how to train a MusicGen EnCodec model at 32 kHz.
+"""
+from ._explorers import CompressionExplorer
+from ...environment import AudioCraftEnvironment
+@CompressionExplorer
+def explorer(launcher):
+    """Grid with the `compression/encodec_musicgen_32khz` solver on 8 GPUs:
+    one XP with the solver config and one with the ViSQOL metric enabled."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=8, partition=partitions)
+    # use configuration for MusicGen's EnCodec model trained on monophonic audio sampled at 32 kHz
+    # MusicGen's EnCodec is trained with a total stride of 640 leading to a frame rate of 50 hz
+    launcher.bind_(solver='compression/encodec_musicgen_32khz')
+    # replace this by the desired music dataset
+    launcher.bind_(dset='internal/music_400k_32khz')
+    # launch xp
+    launcher()
+    # Second XP, labelled 'visqol', enabling the ViSQOL evaluation metric.
+    # NOTE(review): the binary path below is user specific; replace it with your own.
+    launcher({
+        'metrics.visqol.bin': '/data/home/jadecopet/local/usr/opt/visqol',
+        'label': 'visqol',
+        'evaluate.metrics.visqol': True
+    })

audiocraft/grids/diffusion/4_bands_base_32khz.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Training of the 4 diffusion models described in
+"From Discrete Tokens to High-Fidelity Audio Using Multi-Band Diffusion"
+(paper link).
+"""
+from ._explorers import DiffusionExplorer
+@DiffusionExplorer
+def explorer(launcher):
+    """Grid scheduling four XPs with the `diffusion/default` solver, one per
+    value of `filter.idx_band` (0 to 3)."""
+    # NOTE(review): the partition is hard coded here, unlike the other grids which
+    # use `AudioCraftEnvironment.get_slurm_partitions` -- confirm this is intended.
+    launcher.slurm_(gpus=4, partition='learnfair')
+    launcher.bind_({'solver': 'diffusion/default',
+                    'dset': 'internal/music_10k_32khz'})
+    with launcher.job_array():
+        # `processor.use` is disabled for bands 0 and 1 and enabled for bands 2 and 3;
+        # band 3 uses a `processor.power_std` of 0.75 instead of 0.4.
+        launcher({'filter.use': True, 'filter.idx_band': 0, "processor.use": False, 'processor.power_std': 0.4})
+        launcher({'filter.use': True, 'filter.idx_band': 1, "processor.use": False, 'processor.power_std': 0.4})
+        launcher({'filter.use': True, 'filter.idx_band': 2, "processor.use": True, 'processor.power_std': 0.4})
+        launcher({'filter.use': True, 'filter.idx_band': 3, "processor.use": True, 'processor.power_std': 0.75})

audiocraft/grids/diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Diffusion grids."""

audiocraft/grids/diffusion/_explorers.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import treetable as tt
+from .._base_explorers import BaseExplorer
+class DiffusionExplorer(BaseExplorer):
+    eval_metrics = ["sisnr", "visqol"]
+    def stages(self):
+        return ["train", "valid", "valid_ema", "evaluate", "evaluate_ema"]
+    def get_grid_meta(self):
+        """Returns the list of Meta information to display for each XP/job.
+        """
+        return [
+            tt.leaf("index", align=">"),
+            tt.leaf("name", wrap=140),
+            tt.leaf("state"),
+            tt.leaf("sig", align=">"),
+        ]
+    def get_grid_metrics(self):
+        """Return the metrics that should be displayed in the tracking table.
+        """
+        return [
+            tt.group(
+                "train",
+                [
+                    tt.leaf("epoch"),
+                    tt.leaf("loss", ".3%"),
+                ],
+                align=">",
+            ),
+            tt.group(
+                "valid",
+                [
+                    tt.leaf("loss", ".3%"),
+                    # tt.leaf("loss_0", ".3%"),
+                ],
+                align=">",
+            ),
+            tt.group(
+                "valid_ema",
+                [
+                    tt.leaf("loss", ".3%"),
+                    # tt.leaf("loss_0", ".3%"),
+                ],
+                align=">",
+            ),
+            tt.group(
+                "evaluate", [tt.leaf("rvm", ".4f"), tt.leaf("rvm_0", ".4f"),
+                             tt.leaf("rvm_1", ".4f"), tt.leaf("rvm_2", ".4f"),
+                             tt.leaf("rvm_3", ".4f"), ], align=">"
+            ),
+            tt.group(
+                "evaluate_ema", [tt.leaf("rvm", ".4f"), tt.leaf("rvm_0", ".4f"),
+                                 tt.leaf("rvm_1", ".4f"), tt.leaf("rvm_2", ".4f"),
+                                 tt.leaf("rvm_3", ".4f")], align=">"
+            ),
+        ]

audiocraft/grids/musicgen/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""MusicGen grids."""

audiocraft/grids/musicgen/_explorers.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import treetable as tt
+from .._base_explorers import BaseExplorer
+class LMExplorer(BaseExplorer):
+    eval_metrics: tp.List[str] = []
+    def stages(self) -> tp.List[str]:
+        return ['train', 'valid']
+    def get_grid_metrics(self):
+        """Return the metrics that should be displayed in the tracking table."""
+        return [
+            tt.group(
+                'train',
+                [
+                    tt.leaf('epoch'),
+                    tt.leaf('duration', '.1f'),  # duration in minutes
+                    tt.leaf('ping'),
+                    tt.leaf('ce', '.4f'),  # cross entropy
+                    tt.leaf("ppl", '.3f'),  # perplexity
+                ],
+                align='>',
+            ),
+            tt.group(
+                'valid',
+                [
+                    tt.leaf('ce', '.4f'),
+                    tt.leaf('ppl', '.3f'),
+                    tt.leaf('best_ppl', '.3f'),
+                ],
+                align='>',
+            ),
+        ]
+    def process_sheep(self, sheep, history):
+        parts = super().process_sheep(sheep, history)
+        track_by = {'ppl': 'lower'}  # values should be in ['lower', 'higher']
+        best_metrics = {k: (1 if v == 'lower' else -1) * float('inf') for k, v in track_by.items()}
+        def comparator(mode, a, b):
+            return a < b if mode == 'lower' else a > b
+        for metrics in history:
+            for key, sub in metrics.items():
+                for metric in track_by:
+                    # for the validation set, keep track of best metrics (ppl in this example)
+                    # this is so we can conveniently compare metrics between runs in the grid
+                    if key == 'valid' and metric in sub and comparator(
+                        track_by[metric], sub[metric], best_metrics[metric]
+                    ):
+                        best_metrics[metric] = sub[metric]
+        if 'valid' in parts:
+            parts['valid'].update({f'best_{k}': v for k, v in best_metrics.items()})
+        return parts
+class GenerationEvalExplorer(BaseExplorer):
+    eval_metrics: tp.List[str] = []
+    def stages(self) -> tp.List[str]:
+        return ['evaluate']
+    def get_grid_metrics(self):
+        """Return the metrics that should be displayed in the tracking table."""
+        return [
+            tt.group(
+                'evaluate',
+                [
+                    tt.leaf('epoch', '.3f'),
+                    tt.leaf('duration', '.1f'),
+                    tt.leaf('ping'),
+                    tt.leaf('ce', '.4f'),
+                    tt.leaf('ppl', '.3f'),
+                    tt.leaf('fad', '.3f'),
+                    tt.leaf('kld', '.3f'),
+                    tt.leaf('text_consistency', '.3f'),
+                    tt.leaf('chroma_cosine', '.3f'),
+                ],
+                align='>',
+            ),
+        ]

audiocraft/grids/musicgen/musicgen_base_32khz.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from ._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+@LMExplorer
+def explorer(launcher):
+    """Grid with the `musicgen/musicgen_base_32khz` solver: one XP with the solver
+    config on 32 GPUs, one medium scale XP on 64 GPUs and one large scale XP on 96 GPUs."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=32, partition=partitions)
+    launcher.bind_(solver='musicgen/musicgen_base_32khz')
+    # replace this by the desired music dataset
+    launcher.bind_(dset='internal/music_400k_32khz')
+    # Groups of overrides combined below when scheduling the XPs.
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    medium = {'model/lm/model_scale': 'medium'}
+    large = {'model/lm/model_scale': 'large'}
+    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
+    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
+    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}
+    launcher.bind_(fsdp)
+    # Each section below updates the GPU count and label on the launcher itself,
+    # then schedules its XP from a launcher derived with `bind()`.
+    launcher.slurm_(gpus=32).bind_(label='32gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub()
+    launcher.slurm_(gpus=64).bind_(label='64gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(medium, adam)
+    launcher.slurm_(gpus=96).bind_(label='96gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})

audiocraft/grids/musicgen/musicgen_base_cached_32khz.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from ._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+@LMExplorer
+def explorer(launcher):
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=32, partition=partitions)
+    launcher.bind_(solver='musicgen/musicgen_base_32khz')
+    # replace this by the desired music dataset
+    launcher.bind_(dset='internal/music_400k_32khz')
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    medium = {'model/lm/model_scale': 'medium'}
+    large = {'model/lm/model_scale': 'large'}
+    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
+    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
+    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}
+    # BEGINNING OF CACHE WRITING JOBS.
+    cache_write = {
+        'cache.path': '/fsx-codegen/defossez/cache/interleave_stereo_nv_32k',
+        'cache.write': True,
+        'generate.every': 500,
+        'evaluate.every': 500,
+        'logging.log_updates': 50,
+    }
+    cache_sub = launcher.bind({'model/lm/model_scale': 'xsmall', 'conditioner': 'none'})
+    cache_sub.bind_({'deadlock.use': True})
+    cache_sub.slurm_(gpus=8)
+    with launcher.job_array():
+        num_shards = 10  # total number of jobs running in parallel.
+        for shard in range(0, num_shards):
+            launcher(cache_write, {'cache.write_num_shards': num_shards, 'cache.write_shard': shard})
+    # REMOVE THE FOLLOWING RETURN STATEMENT ONCE THE ABOVE JOBS ARE DONE,
+    # OR SUFFICIENTLY AHEAD.
+    return
+    cache = {
+        'cache.path': '/fsx-codegen/defossez/cache/interleave_stereo_nv_32k',
+    }
+    launcher.bind_(fsdp, cache)
+    launcher.slurm_(gpus=32).bind_(label='32gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub()
+    launcher.slurm_(gpus=64).bind_(label='64gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(medium, adam)
+    launcher.slurm_(gpus=96).bind_(label='96gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})

audiocraft/grids/musicgen/musicgen_clapemb_32khz.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from ._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+@LMExplorer
+def explorer(launcher):
+    """Grid with the `musicgen/musicgen_base_32khz` solver and the `clapemb2music`
+    conditioner: four XPs on 32 GPUs, covering the combinations of the CLAP
+    cache path and `text_p` overrides."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=32, partition=partitions)
+    launcher.bind_(solver='musicgen/musicgen_base_32khz')
+    # replace this by the desired music dataset
+    launcher.bind_(dset='internal/music_400k_32khz')
+    launcher.bind_(conditioner='clapemb2music')
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    # NOTE(review): the cache path below is user specific; replace it with your own.
+    cache_path = {'conditioners.description.clap.cache_path':
+                  '/fsx-audio-craft-llm/jadecopet/experiments/audiocraft/caches/clap_embed_music'}
+    text_wav_training_opt = {'conditioners.description.clap.text_p': 0.5}
+    launcher.bind_(fsdp)
+    launcher.slurm_(gpus=32).bind_(label='32gpus')
+    with launcher.job_array():
+        # Neither override, each override alone, then both together.
+        launcher()
+        launcher(text_wav_training_opt)
+        launcher(cache_path)
+        launcher(cache_path, text_wav_training_opt)

audiocraft/grids/musicgen/musicgen_melody_32khz.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from ._explorers import LMExplorer
+from ...environment import AudioCraftEnvironment
+@LMExplorer
+def explorer(launcher):
+    """Grid with the `musicgen/musicgen_melody_32khz` solver: chroma cache
+    generation jobs on 1 GPU each, followed by training XPs on 32, 64 and 96 GPUs."""
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=32, partition=partitions)
+    launcher.bind_(solver='musicgen/musicgen_melody_32khz')
+    # replace this by the desired music dataset
+    launcher.bind_(dset='internal/music_400k_32khz')
+    # Groups of overrides combined below when scheduling the XPs.
+    fsdp = {'autocast': False, 'fsdp.use': True}
+    medium = {'model/lm/model_scale': 'medium'}
+    large = {'model/lm/model_scale': 'large'}
+    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
+    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
+    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}
+    # NOTE(review): the cache path below is user specific; replace it with your own.
+    cache_path = {'conditioners.self_wav.chroma_stem.cache_path':
+                  '/fsx-audio-craft-llm/jadecopet/experiments/audiocraft/caches/chroma_stem'}
+    # CACHE GENERATION JOBS
+    n_cache_gen_jobs = 4
+    # `slurm` (no trailing underscore) presumably returns a derived launcher and
+    # leaves `launcher` untouched -- TODO confirm against the Dora documentation.
+    gen_sub = launcher.slurm(gpus=1)
+    gen_sub.bind_(
+        cache_path, {
+            # the cache is always computed over the whole file, so duration doesn't matter here.
+            'dataset.segment_duration': 2.,
+            'dataset.batch_size': 8,
+            'dataset.train.permutation_on_files': True,  # try to not repeat files.
+            'optim.epochs': 10,
+            'model/lm/model_scale': 'xsmall',
+        })
+    with gen_sub.job_array():
+        # One job per shuffle seed.
+        for gen_job in range(n_cache_gen_jobs):
+            gen_sub({'dataset.train.shuffle_seed': gen_job})
+    # ACTUAL TRAINING JOBS.
+    launcher.bind_(fsdp)
+    launcher.slurm_(gpus=32).bind_(label='32gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        # Same XP scheduled without and with the chroma cache path.
+        sub()
+        sub(cache_path)
+    launcher.slurm_(gpus=64).bind_(label='64gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(medium, adam)
+    launcher.slurm_(gpus=96).bind_(label='96gpus')
+    with launcher.job_array():
+        sub = launcher.bind()
+        sub(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})

audiocraft/grids/musicgen/musicgen_pretrained_32khz_eval.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Evaluation with objective metrics for the pretrained MusicGen models.
+This grid takes signature from the training grid and runs evaluation-only stage.
+When running the grid for the first time, please use:
+REGEN=1 dora grid musicgen.musicgen_pretrained_32khz_eval
+and re-use the REGEN=1 option when the grid is changed to force regenerating it.
+Note that you need the proper metrics external libraries setup to use all
+the objective metrics activated in this grid. Refer to the README for more information.
+"""
+import os
+from ._explorers import GenerationEvalExplorer
+from ...environment import AudioCraftEnvironment
+from ... import train
+# NOTE(review): this name shadows the builtin `eval` inside this module; it is kept
+# as is since renaming it would change the module interface.
+def eval(launcher, batch_size: int = 32, eval_melody: bool = False):
+    """Schedule the objective evaluation XP(s) on top of the given launcher.
+    Args:
+        launcher: Launcher to derive the evaluation XPs from (the caller in this
+            file passes one already bound to a pretrained model).
+        batch_size (int): Value given to the `+dataset.evaluate.batch_size` override.
+        eval_melody (bool): When True, also schedule an XP with the chroma
+            specific overrides.
+    """
+    opts = {
+        'dset': 'audio/musiccaps_32khz',
+        'solver/musicgen/evaluation': 'objective_eval',
+        'execute_only': 'evaluate',
+        '+dataset.evaluate.batch_size': batch_size,
+        '+metrics.fad.tf.batch_size': 16,
+    }
+    # chroma-specific evaluation
+    # Enables only the chroma cosine metric (FAD, KLD and text consistency are
+    # disabled) and overrides the dataset given in `opts`.
+    chroma_opts = {
+        'dset': 'internal/music_400k_32khz',
+        'dataset.evaluate.segment_duration': 30,
+        'dataset.evaluate.num_samples': 1000,
+        'evaluate.metrics.chroma_cosine': True,
+        'evaluate.metrics.fad': False,
+        'evaluate.metrics.kld': False,
+        'evaluate.metrics.text_consistency': False,
+    }
+    # binary for FAD computation: replace this path with your own path
+    metrics_opts = {
+        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
+    }
+    # Generation (sampling with top-k 250) and two step CFG overrides used for the XPs.
+    opt1 = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
+    opt2 = {'transformer_lm.two_step_cfg': True}
+    # `sub` is derived with `bind` so that the launcher given by the caller
+    # is not the object receiving the extra bindings below.
+    sub = launcher.bind(opts)
+    sub.bind_(metrics_opts)
+    # base objective metrics
+    sub(opt1, opt2)
+    if eval_melody:
+        # chroma-specific metrics
+        sub(opt1, opt2, chroma_opts)
+@GenerationEvalExplorer
+def explorer(launcher):
+    """Evaluation grid for the pretrained MusicGen models (small, medium, large, melody).
+    Without `REGEN` in the environment, the XPs already linked in the grid folder
+    are scheduled again from their signature. With `REGEN` set, the grid is
+    rebuilt from the pretrained checkpoints.
+    """
+    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
+    launcher.slurm_(gpus=4, partition=partitions)
+    if 'REGEN' not in os.environ:
+        # Keep what follows the second dot of the module name as the grid folder name
+        # (presumably `musicgen.<grid>` when imported as `audiocraft.grids.musicgen.<grid>`).
+        folder = train.main.dora.dir / 'grids' / __name__.split('.', 2)[-1]
+        with launcher.job_array():
+            for sig in folder.iterdir():
+                # Only symlink entries are treated as XP signatures.
+                if not sig.is_symlink():
+                    continue
+                xp = train.main.get_xp_from_sig(sig.name)
+                launcher(xp.argv)
+        return
+    with launcher.job_array():
+        musicgen_base = launcher.bind(solver="musicgen/musicgen_base_32khz")
+        musicgen_base.bind_({'autocast': False, 'fsdp.use': True})
+        # base musicgen models
+        # The small model gets no `model_scale` override, unlike medium and large.
+        musicgen_base_small = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-small'})
+        eval(musicgen_base_small, batch_size=128)
+        musicgen_base_medium = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-medium'})
+        musicgen_base_medium.bind_({'model/lm/model_scale': 'medium'})
+        eval(musicgen_base_medium, batch_size=128)
+        musicgen_base_large = musicgen_base.bind({'continue_from': '//pretrained/facebook/musicgen-large'})
+        musicgen_base_large.bind_({'model/lm/model_scale': 'large'})
+        eval(musicgen_base_large, batch_size=128)
+        # melody musicgen model
+        musicgen_melody = launcher.bind(solver="musicgen/musicgen_melody_32khz")
+        musicgen_melody.bind_({'autocast': False, 'fsdp.use': True})
+        musicgen_melody_medium = musicgen_melody.bind({'continue_from': '//pretrained/facebook/musicgen-melody'})
+        musicgen_melody_medium.bind_({'model/lm/model_scale': 'medium'})
+        # The melody model is also evaluated with the chroma specific metrics.
+        eval(musicgen_melody_medium, batch_size=128, eval_melody=True)