Spaces: Runtime error

Commit b57c851 by anthonyrusso (1 parent: f1e9197)
upload dependencies

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
- 2.0 +0 -0
- 2.0' +0 -0
- CHANGELOG.md +28 -0
- CODE_OF_CONDUCT.md +80 -0
- CONTRIBUTING.md +35 -0
- LICENSE +21 -0
- LICENSE_weights +399 -0
- MANIFEST.in +9 -0
- Makefile +40 -0
- README.md +86 -13
- config/conditioner/chroma2music.yaml +46 -0
- config/conditioner/clapemb2music.yaml +44 -0
- config/conditioner/none.yaml +19 -0
- config/conditioner/text2music.yaml +30 -0
- config/conditioner/text2sound.yaml +24 -0
- config/config.yaml +75 -0
- config/dset/audio/audiocaps_16khz.yaml +11 -0
- config/dset/audio/default.yaml +10 -0
- config/dset/audio/example.yaml +10 -0
- config/dset/audio/musiccaps_32khz.yaml +12 -0
- config/dset/default.yaml +10 -0
- config/dset/internal/music_10k_32khz.yaml +11 -0
- config/dset/internal/music_400k_32khz.yaml +10 -0
- config/dset/internal/sounds_16khz.yaml +12 -0
- config/model/encodec/default.yaml +54 -0
- config/model/encodec/encodec_base_causal.yaml +11 -0
- config/model/encodec/encodec_large_nq4_s320.yaml +13 -0
- config/model/encodec/encodec_large_nq4_s640.yaml +13 -0
- config/model/lm/audiogen_lm.yaml +36 -0
- config/model/lm/default.yaml +47 -0
- config/model/lm/model_scale/base.yaml +3 -0
- config/model/lm/model_scale/large.yaml +7 -0
- config/model/lm/model_scale/medium.yaml +7 -0
- config/model/lm/model_scale/small.yaml +8 -0
- config/model/lm/model_scale/xsmall.yaml +8 -0
- config/model/lm/musicgen_lm.yaml +36 -0
- config/model/none.yaml +4 -0
- config/model/score/basic.yaml +17 -0
- config/solver/audiogen/audiogen_base_16khz.yaml +70 -0
- config/solver/audiogen/debug.yaml +52 -0
- config/solver/audiogen/default.yaml +40 -0
- config/solver/audiogen/evaluation/none.yaml +5 -0
- config/solver/audiogen/evaluation/objective_eval.yaml +29 -0
- config/solver/compression/debug.yaml +55 -0
- config/solver/compression/default.yaml +160 -0
- config/solver/compression/encodec_audiogen_16khz.yaml +10 -0
- config/solver/compression/encodec_base_24khz.yaml +10 -0
- config/solver/compression/encodec_musicgen_32khz.yaml +10 -0
- config/solver/default.yaml +108 -0
- config/solver/diffusion/debug.yaml +106 -0
2.0
ADDED
File without changes

2.0'
ADDED
File without changes
CHANGELOG.md
ADDED
@@ -0,0 +1,28 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [1.0.0] - 2023-08-02

Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
Added pretrained model for AudioGen and MultiBandDiffusion.

## [0.0.2] - 2023-08-01

Improved demo, fixed top p (thanks @jnordberg).

Compressor tanh on output to avoid clipping with some style (especially piano).
Now repeating the conditioning periodically if it is too short.

More options when launching Gradio app locally (thanks @ashleykleynhans).

Testing out PyTorch 2.0 memory efficient attention.

Added extended generation (infinite length) by slowly moving the windows.
Note that other implementations exist: https://github.com/camenduru/MusicGen-colab.

## [0.0.1] - 2023-06-09

Initial release, with model evaluation only.
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,80 @@
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.

This Code of Conduct also applies outside the project spaces when there is a reasonable belief that an individual's behavior may have a negative impact on the project or its community.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at <opensource-conduct@fb.com>. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq
CONTRIBUTING.md
ADDED
@@ -0,0 +1,35 @@
# Contributing to AudioCraft

We want to make contributing to this project as easy and transparent as possible.

## Pull Requests

AudioCraft is the implementation of a research paper. Therefore, we do not plan on accepting many pull requests for new features. We certainly welcome them for bug fixes.

1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Meta's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues
We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.

Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue.

## License
By contributing to AudioCraft, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree.
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Meta Platforms, Inc. and affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
LICENSE_weights
ADDED
@@ -0,0 +1,399 @@
Attribution-NonCommercial 4.0 International

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an "as-is" basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.

Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. More considerations for licensors: wiki.creativecommons.org/Considerations_for_licensors

Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor's permission is not necessary for any reason--for example, because of any applicable exception or limitation to copyright--then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public: wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution-NonCommercial 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.

Section 1 -- Definitions.

  a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.

  b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.

  c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.

  d. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.

  e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.

  f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.

  g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.

  h. Licensor means the individual(s) or entity(ies) granting rights under this Public License.

  i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.

  j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.

  k. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.

  l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.

Section 2 -- Scope.

  a. License grant.

     1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:

        a. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and

        b. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.

     2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.

     3. Term. The term of this Public License is specified in Section 6(a).

     4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.

     5. Downstream recipients.

        a. Offer from the Licensor -- Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.

        b. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.

     6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).

  b. Other rights.

     1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.

     2. Patent and trademark rights are not licensed under this Public License.

     3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.

Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the following conditions.

  a. Attribution.

     1. If You Share the Licensed Material (including in modified form), You must:

        a. retain the following if it is supplied by the Licensor with the Licensed Material:

           i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);

           ii. a copyright notice;

           iii. a notice that refers to this Public License;

           iv. a notice that refers to the disclaimer of warranties;

           v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;

        b. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and

        c. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.

     2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.

     3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.

     4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.

Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:

  a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;

  b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and

  c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.

Section 5 -- Disclaimer of Warranties and Limitation of Liability.

  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

  c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.

Section 6 -- Term and Termination.

  a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.

  b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:

     1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or

     2. upon express reinstatement by the Licensor.

     For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.

  c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.

  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.

Section 7 -- Other Terms and Conditions.

  a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.

  b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.

Section 8 -- Interpretation.

  a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.

  b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.

  c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.

  d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.

Creative Commons may be contacted at creativecommons.org.
MANIFEST.in
ADDED
@@ -0,0 +1,9 @@
include Makefile
include LICENSE
include LICENSE_weights
include *.md
include *.ini
include requirements.txt
include audiocraft/py.typed
include assets/*.mp3
recursive-include conf *.yaml
Makefile
ADDED
@@ -0,0 +1,40 @@
INTEG=AUDIOCRAFT_DORA_DIR="/tmp/magma_$(USER)" python3 -m dora -v run --clear device=cpu dataset.num_workers=0 optim.epochs=1 \
	dataset.train.num_samples=10 dataset.valid.num_samples=10 \
	dataset.evaluate.num_samples=10 dataset.generate.num_samples=2 sample_rate=16000 \
	logging.level=DEBUG
INTEG_COMPRESSION = $(INTEG) solver=compression/debug rvq.n_q=2 rvq.bins=48 checkpoint.save_last=true # SIG is 5091833e
INTEG_MUSICGEN = $(INTEG) solver=musicgen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
INTEG_AUDIOGEN = $(INTEG) solver=audiogen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
	transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
INTEG_MBD = $(INTEG) solver=diffusion/debug dset=audio/example \
	checkpoint.save_last=false # Using compression model from 616d7b3c

default: linter tests

install:
	pip install -U pip
	pip install -U -e '.[dev]'

linter:
	flake8 audiocraft && mypy audiocraft
	flake8 tests && mypy tests

tests:
	coverage run -m pytest tests
	coverage report

tests_integ:
	$(INTEG_COMPRESSION)
	$(INTEG_MBD)
	$(INTEG_MUSICGEN)
	$(INTEG_AUDIOGEN)


api_docs:
	pdoc3 --html -o api_docs -f audiocraft

dist:
	python setup.py sdist

.PHONY: linter tests api_docs dist
README.md
CHANGED
@@ -1,13 +1,86 @@
# AudioCraft

![docs badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_docs/badge.svg)
![linter badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_linter/badge.svg)
![tests badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_tests/badge.svg)

AudioCraft is a PyTorch library for deep learning research on audio generation. AudioCraft contains inference and training code for two state-of-the-art AI generative models producing high-quality audio: AudioGen and MusicGen.

## Installation

AudioCraft requires Python 3.9 and PyTorch 2.0.0. To install AudioCraft, you can run the following:

```shell
# Best to make sure you have torch installed first, in particular before installing xformers.
# Don't run this if you already have PyTorch installed.
pip install 'torch>=2.0'
# Then proceed to one of the following
pip install -U audiocraft  # stable release
pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft  # bleeding edge
pip install -e .  # or if you cloned the repo locally (mandatory if you want to train).
```

We also recommend having `ffmpeg` installed, either through your system or Anaconda:

```bash
sudo apt-get install ffmpeg
# Or if you are using Anaconda or Miniconda
conda install 'ffmpeg<5' -c conda-forge
```

## Models

At the moment, AudioCraft contains the training code and inference code for:

* [MusicGen](./docs/MUSICGEN.md): A state-of-the-art controllable text-to-music model.
* [AudioGen](./docs/AUDIOGEN.md): A state-of-the-art text-to-sound model.
* [EnCodec](./docs/ENCODEC.md): A state-of-the-art high fidelity neural audio codec.
* [Multi Band Diffusion](./docs/MBD.md): An EnCodec compatible decoder using diffusion.
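The models listed above can be driven directly from Python once `audiocraft` is installed. The snippet below is a minimal sketch of text-to-music generation with MusicGen; the checkpoint name, duration and output file names are illustrative choices rather than requirements (see [./docs/MUSICGEN.md](./docs/MUSICGEN.md) for the documented options).

```python
# Minimal MusicGen inference sketch. Assumes `audiocraft` is installed;
# the checkpoint and generation parameters below are only examples.
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

model = MusicGen.get_pretrained('facebook/musicgen-small')  # downloads weights on first use
model.set_generation_params(duration=8)                     # 8 seconds of audio per prompt

wav = model.generate(['happy rock', 'energetic EDM'])       # tensor of shape [batch, channels, samples]

for idx, one_wav in enumerate(wav):
    # Write each sample as a loudness-normalized WAV file.
    audio_write(f'sample_{idx}', one_wav.cpu(), model.sample_rate, strategy='loudness')
```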
## Training code

AudioCraft contains PyTorch components for deep learning research in audio and training pipelines for the developed models. For a general introduction of AudioCraft design principles and instructions to develop your own training pipeline, refer to the [AudioCraft training documentation](./docs/TRAINING.md).

For reproducing existing work and using the developed training pipelines, refer to the instructions for each specific model, which provide pointers to configuration, example grids and model/task-specific information and FAQ.

## API documentation

We provide some [API documentation](https://facebookresearch.github.io/audiocraft/api_docs/audiocraft/index.html) for AudioCraft.

## FAQ

#### Is the training code available?

Yes! We provide the training code for [EnCodec](./docs/ENCODEC.md), [MusicGen](./docs/MUSICGEN.md) and [Multi Band Diffusion](./docs/MBD.md).

#### Where are the models stored?

Hugging Face stores the models in a specific cache location, which can be overridden by setting the `AUDIOCRAFT_CACHE_DIR` environment variable.

## License

* The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
* The model weights in this repository are released under the CC-BY-NC 4.0 license as found in the [LICENSE_weights file](LICENSE_weights).

## Citation

For the general framework of AudioCraft, please cite the following.

```bibtex
@article{copet2023simple,
    title={Simple and Controllable Music Generation},
    author={Jade Copet and Felix Kreuk and Itai Gat and Tal Remez and David Kant and Gabriel Synnaeve and Yossi Adi and Alexandre Défossez},
    year={2023},
    journal={arXiv preprint arXiv:2306.05284},
}
```

When referring to a specific model, please cite as mentioned in the model specific README, e.g.
[./docs/MUSICGEN.md](./docs/MUSICGEN.md), [./docs/AUDIOGEN.md](./docs/AUDIOGEN.md), etc.
config/conditioner/chroma2music.yaml
ADDED
@@ -0,0 +1,46 @@
# @package __global__

classifier_free_guidance:
  training_dropout: 0.2
  inference_coef: 3.0

attribute_dropout:
  args:
    active_on_eval: false
  text: {}
  wav:
    self_wav: 0.5

fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: [self_wav, description]
  cross: []
  input_interpolate: []

conditioners:
  self_wav:
    model: chroma_stem
    chroma_stem:
      sample_rate: ${sample_rate}
      n_chroma: 12
      radix2_exp: 14
      argmax: true
      match_len_on_eval: false
      eval_wavs: null
      n_eval_wavs: 100
      cache_path: null
  description:
    model: t5
    t5:
      name: t5-base
      finetune: false
      word_dropout: 0.2
      normalize_text: false

dataset:
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5
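For orientation, the `classifier_free_guidance` block at the top of this conditioner config follows the standard classifier-free guidance recipe: conditioning is dropped with probability `training_dropout` during training, and at inference the conditional and unconditional predictions are mixed using `inference_coef`. Below is a minimal sketch of that mixing step; the function and tensor names are illustrative and not AudioCraft's internal API.

```python
import torch

def cfg_mix(cond_logits: torch.Tensor, uncond_logits: torch.Tensor, coef: float = 3.0) -> torch.Tensor:
    """Classifier-free guidance: push predictions away from the unconditional estimate.

    coef == 1.0 reduces to the conditional logits; larger values (e.g. the 3.0 above)
    follow the text/chroma conditioning more strongly, at some cost in diversity.
    """
    return uncond_logits + coef * (cond_logits - uncond_logits)
```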
config/conditioner/clapemb2music.yaml
ADDED
@@ -0,0 +1,44 @@
# @package __global__

classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0

attribute_dropout:
  text: {}
  wav: {}

fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []

conditioners:
  description:
    model: clap
    clap:
      checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
      sample_rate: 44100
      max_audio_length: 10
      audio_stride: 1
      dim: 512
      attribute: description
      normalize: true
      quantize: true  # use RVQ quantization
      n_q: 12
      bins: 1024
      kmeans_iters: 50
      text_p: 0.  # probability of using text embed at train time
      cache_path: null

dataset:
  joint_embed_attributes: [description]
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5
config/conditioner/none.yaml
ADDED
@@ -0,0 +1,19 @@
# @package __global__

# No conditioning

classifier_free_guidance:
  training_dropout: 0
  inference_coef: 1

attribute_dropout:
  text: {}
  wav: {}

fuser:
  sum: []
  prepend: []
  cross: []
  input_interpolate: []

conditioners: null
config/conditioner/text2music.yaml
ADDED
@@ -0,0 +1,30 @@
# @package __global__

classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0

attribute_dropout: {}

fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []

conditioners:
  description:
    model: t5
    t5:
      name: t5-base
      finetune: false
      word_dropout: 0.3
      normalize_text: false

dataset:
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5
config/conditioner/text2sound.yaml
ADDED
@@ -0,0 +1,24 @@
# @package __global__

classifier_free_guidance:
  training_dropout: 0.1
  inference_coef: 3.0

attribute_dropout: {}

fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []

conditioners:
  description:
    model: t5
    t5:
      name: t5-large
      finetune: false
      word_dropout: 0.
      normalize_text: false
config/config.yaml
ADDED
@@ -0,0 +1,75 @@
# WARNING: This is the base configuration file shared across ALL solvers in AudioCraft
# Please don't update this file directly. Instead use distinct configuration files
# to override the below configuration.
defaults:
  - _self_
  - dset: default
  - solver: default

device: cuda
dtype: float32
autocast: false
autocast_dtype: bfloat16
seed: 2036
show: false  # just show the model and its size and exit
continue_from:  # continue from a given sig or path
execute_only:  # can be set to generate/evaluate/valid to run that stage
execute_inplace: false  # don't enforce continue_from to be set
                        # to enable inplace execution of the stage. This assume
                        # that you know what you are doing and execute stage
                        # preserving the original xp sig.
benchmark_no_load: false  # if set to true, will repeat the same batch instead of loading them

efficient_attention_backend: torch  # can be torch or xformers.
num_threads: 1  # called with torch.set_num_thread.
mp_start_method: forkserver  # multiprocessing method (spawn, fork or fork_server).


label:  # use this if you want twice the same exp, with a name.

# logging parameters
logging:
  level: INFO
  log_updates: 10
  log_tensorboard: false
  log_wandb: false
tensorboard:
  with_media_logging: false
  name:  # optional name for the experiment
  sub_dir:  # optional sub directory to store tensorboard data
wandb:
  with_media_logging: true
  project:  # project name
  name:  # optional name for the experiment
  group:  # optional group

# SLURM launcher configuration.
slurm:
  gpus: 4  # convenience parameter, number of GPUs to use.
  mem_per_gpu: 40  # in GB, total mem is automatically scaled with `gpus`.
  time: 3600
  constraint:
  partition:
  comment:
  setup: []
  exclude: ''

# dora parameters
dora:
  # Output folder for all artifacts of an experiment.
  dir: /checkpoint/${oc.env:USER}/experiments/audiocraft/outputs
  # The following entries will be ignored by dora when computing the unique XP signature.
  # Note that slurm.* and dora.* are automatically ignored.
  exclude: [
    'device', 'wandb.*', 'tensorboard.*', 'logging.*',
    'dataset.num_workers', 'eval.num_workers', 'special.*',
    'metrics.visqol.bin', 'metrics.fad.bin',
    'execute_only', 'execute_best', 'generate.every',
    'optim.eager_sync', 'profiler.*', 'deadlock.*',
    'efficient_attention_backend', 'num_threads', 'mp_start_method',
  ]
  use_rendezvous: false
  # for grids, always run from a clean repo, allowing reliable runs and storing
  # the exact commit. Your repo must be absolutely pristine clean.
  # Local `dora run` are not impacted for easier debugging.
  git_save: true
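This base config is composed by Hydra: the `defaults` list pulls in a `dset` and a `solver` group, and individual runs override keys from the command line (exactly what the Makefile's integration targets do with `solver=compression/debug`, `dset=audio/example`, and so on). The sketch below illustrates the layering order with plain OmegaConf, assuming the repository layout shown in this commit; the override values are only examples, and in practice `dora run` performs this composition for you.

```python
from omegaconf import OmegaConf

# Layer the base config, one dataset group file, and command-line style overrides.
base = OmegaConf.load("config/config.yaml")
dset = OmegaConf.load("config/dset/audio/example.yaml")
overrides = OmegaConf.from_dotlist(["device=cpu", "logging.level=DEBUG"])

cfg = OmegaConf.merge(base, dset, overrides)
print(cfg.device, cfg.logging.level)      # cpu DEBUG
print(cfg.datasource.max_sample_rate)     # 44100, taken from the example dataset file
```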
config/dset/audio/audiocaps_16khz.yaml
ADDED
@@ -0,0 +1,11 @@
# @package __global__

# AudioCaps dataset
datasource:
  max_sample_rate: 16000
  max_channels: 1

  train: null  # only evaluation set
  valid: null  # only evaluation set
  evaluate: egs/audiocaps/audiocaps_16khz
  generate: egs/audiocaps/audiocaps_16khz  # identical to evaluate
config/dset/audio/default.yaml
ADDED
@@ -0,0 +1,10 @@
# @package __global__

datasource:
  max_sample_rate: ???
  max_channels: ???

  train: ???
  valid: ???
  evaluate: ???
  generate: null
config/dset/audio/example.yaml
ADDED
@@ -0,0 +1,10 @@
# @package __global__

datasource:
  max_sample_rate: 44100
  max_channels: 2

  train: egs/example
  valid: egs/example
  evaluate: egs/example
  generate: egs/example
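The `egs/...` entries above point at dataset manifests rather than raw audio folders. The sketch below shows one hedged way to build such a manifest, assuming the `data.jsonl` format described in AudioCraft's dataset documentation (one JSON object per file with at least `path`, `duration` and `sample_rate`); the helper name and output path here are assumptions for illustration, and the library also ships its own manifest tooling.

```python
import json
from pathlib import Path

import torchaudio  # only used here to read duration / sample-rate metadata

def build_manifest(audio_dir: str, out_path: str = "egs/example/data.jsonl") -> None:
    """Write one JSON line per audio file, with the minimal fields a manifest needs."""
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w") as fout:
        for wav in sorted(Path(audio_dir).rglob("*.wav")):
            info = torchaudio.info(str(wav))
            meta = {
                "path": str(wav),
                "duration": info.num_frames / info.sample_rate,
                "sample_rate": info.sample_rate,
            }
            fout.write(json.dumps(meta) + "\n")

# build_manifest("dataset/example")  # then point the dset config entries at egs/example
```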
config/dset/audio/musiccaps_32khz.yaml
ADDED
@@ -0,0 +1,12 @@
# @package __global__

# total samples obtained from MusicCaps = 5469
# (out of 5521 due to AudioSet corrupted samples)
datasource:
  max_sample_rate: 32000
  max_channels: 2

  train: null  # only evaluation set
  valid: null  # only evaluation set
  evaluate: egs/musiccaps/musiccaps_32khz
  generate: egs/musiccaps/musiccaps_32khz  # identical to evaluate
config/dset/default.yaml
ADDED
@@ -0,0 +1,10 @@
# @package __global__

# WARNING: This is a base configuration file shared across ALL solvers in AudioCraft
# Please don't update this file directly. Instead use distinct configuration files
# to override the below configuration.
datasource:
  train: ???
  valid: ???
  evaluate: ???
  generate: ???
config/dset/internal/music_10k_32khz.yaml
ADDED
@@ -0,0 +1,11 @@
# @package __global__

# high quality music dataset with no artist overlap between splits
datasource:
  max_sample_rate: 32000
  max_channels: 1

  train: egs/music/music_10k_32khz/train
  valid: egs/music/music_10k_32khz/valid
  evaluate: egs/music/music_10k_32khz/test
  generate: egs/music/music_10k_32khz/test  # identical to evaluate
config/dset/internal/music_400k_32khz.yaml
ADDED
@@ -0,0 +1,10 @@
# @package __global__

datasource:
  max_sample_rate: 32000
  max_channels: 1

  train: egs/music/music_400k_32khz/train
  valid: egs/music/music_400k_32khz/valid
  evaluate: egs/music/music_400k_32khz/test
  generate: egs/music/music_400k_32khz/test  # identical to evaluate
config/dset/internal/sounds_16khz.yaml
ADDED
@@ -0,0 +1,12 @@
# @package __global__

# environmental sounds dataset compiling all datasets
# with applied filters on tags
datasource:
  max_sample_rate: 16000
  max_channels: 1

  train: egs/sound/sounds_16khz/train
  valid: egs/sound/sounds_16khz/valid
  evaluate: egs/sound/sounds_16khz/test
  generate: egs/sound/sounds_16khz/test  # identical to evaluate
config/model/encodec/default.yaml
ADDED
@@ -0,0 +1,54 @@
# @package __global__

compression_model: encodec

encodec:
  autoencoder: seanet
  quantizer: rvq
  sample_rate: ${sample_rate}
  channels: ${channels}
  causal: false
  renormalize: false

seanet:
  dimension: 128
  channels: ${channels}
  causal: ${encodec.causal}
  n_filters: 32
  n_residual_layers: 1
  ratios: [8, 5, 4, 2]
  activation: ELU
  activation_params: {"alpha": 1.}
  norm: weight_norm
  norm_params: {}
  kernel_size: 7
  residual_kernel_size: 3
  last_kernel_size: 7
  dilation_base: 2
  pad_mode: constant
  true_skip: true
  compress: 2
  lstm: 2
  disable_norm_outer_blocks: 0
  # Specific encoder or decoder params.
  # You can also override any param for the encoder or decoder only
  # by using Hydra `+param=` syntax, i.e.`
  # `+seanet.decoder.n_filters=64`.
  decoder:
    trim_right_ratio: 1.0
    final_activation: null
    final_activation_params: null
  encoder: {}

rvq:
  n_q: 8
  q_dropout: false
  bins: 1024
  decay: 0.99
  kmeans_init: true
  kmeans_iters: 50
  threshold_ema_dead_code: 2
  orthogonal_reg_weight: 0.0
  orthogonal_reg_active_codes_only: false

no_quant: {}
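A quick sanity check on the numbers above: the SEANet `ratios` multiply into the encoder hop size, which fixes the EnCodec frame rate that the language models operate on, and which the `s320` / `s640` variants below are named after. A small sketch of that arithmetic (the 32 kHz sample rate is just an example value):

```python
from math import prod

def frame_rate(sample_rate: int, ratios: list) -> float:
    """The hop size is the product of the SEANet downsampling ratios."""
    hop = prod(ratios)                  # e.g. [8, 5, 4, 2] -> 320 samples per latent frame
    return sample_rate / hop

print(frame_rate(32000, [8, 5, 4, 2]))  # 100.0 Hz -> the *_s320 configs
print(frame_rate(32000, [8, 5, 4, 4]))  # 50.0 Hz  -> the *_s640 configs
```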
config/model/encodec/encodec_base_causal.yaml
ADDED
@@ -0,0 +1,11 @@
# @package __global__

defaults:
  - encodec/default

encodec:
  causal: true

rvq:
  n_q: 32
  q_dropout: true
config/model/encodec/encodec_large_nq4_s320.yaml
ADDED
@@ -0,0 +1,13 @@
# @package __global__

defaults:
  - encodec/default

seanet:
  # default ratios are [8, 5, 4, 2]
  n_filters: 64

rvq:
  bins: 2048
  n_q: 4
  q_dropout: false
config/model/encodec/encodec_large_nq4_s640.yaml
ADDED
@@ -0,0 +1,13 @@
# @package __global__

defaults:
  - encodec/default

seanet:
  ratios: [8, 5, 4, 4]
  n_filters: 64

rvq:
  bins: 2048
  n_q: 4
  q_dropout: false
config/model/lm/audiogen_lm.yaml
ADDED
@@ -0,0 +1,36 @@
+# @package __global__
+
+defaults:
+  - lm/default
+  - override /conditioner: text2sound
+  - override /model/lm/model_scale: small  # prefer this group to set model scale instead of transformer_lm keys directly
+
+lm_model: transformer_lm
+
+codebooks_pattern:
+  modeling: delay
+  delay:
+    delays: [0, 1, 2, 3]
+    flatten_first: 0
+    empty_initial: 0
+  unroll:
+    flattening: [0, 1, 2, 3]
+    delays: [0, 0, 0, 0]
+  music_lm:
+    group_by: 2
+  valle:
+    delays: [0, 0, 0]
+
+transformer_lm:
+  n_q: 4
+  card: 2048
+  memory_efficient: true
+  bias_proj: false
+  bias_ff: false
+  bias_attn: false
+  norm_first: true
+  layer_scale: null
+  weight_init: gaussian
+  depthwise_init: current
+  zero_bias_init: true
+  attention_as_float32: false
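The `delay` codebook pattern with `delays: [0, 1, 2, 3]` staggers the four RVQ streams so that codebook k at a given frame is only predicted once the lower codebooks for that frame are already available. A minimal sketch of the idea (an illustration of the layout only, not AudioCraft's pattern-provider implementation):

```python
# Illustrative only: arrange a [n_q, T] grid of tokens into a delayed layout.
# A special padding token (here -1) fills the positions introduced by the shifts.
def apply_delay_pattern(tokens, delays=(0, 1, 2, 3), pad=-1):
    n_q, T = len(tokens), len(tokens[0])
    out_len = T + max(delays)
    out = [[pad] * out_len for _ in range(n_q)]
    for q, d in enumerate(delays):
        for t in range(T):
            out[q][t + d] = tokens[q][t]
    return out

grid = [[f"c{q}t{t}" for t in range(4)] for q in range(4)]
for row in apply_delay_pattern(grid):
    print(row)
```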
config/model/lm/default.yaml
ADDED
@@ -0,0 +1,47 @@
+# @package __global__
+defaults:
+  - _self_
+  - /model/lm/model_scale: base  # prefer this group to set model scale instead of transformer_lm keys directly
+
+lm_model: transformer_lm
+
+codebooks_pattern:
+  modeling: parallel
+
+transformer_lm:
+  dim: 512
+  num_heads: 8
+  num_layers: 8
+  hidden_scale: 4
+  n_q: 8  # number of streams to model
+  card: 1024
+  dropout: 0.
+  emb_lr: null
+  activation: gelu
+  norm_first: false  # use pre-norm instead of post-norm
+  bias_ff: true  # use bias for the feedforward
+  bias_attn: true  # use bias for the attention
+  bias_proj: true  # use bias for the output projections
+  past_context: null
+  causal: true
+  custom: false  # use custom MHA implementation
+  memory_efficient: false  # use flash attention
+  attention_as_float32: false  # use float32 for the attention part,
+                               # recommended at the moment when memory_efficient is True.
+  layer_scale: null
+  positional_embedding: sin  # positional embedding strategy (sin, rope, or sin_rope).
+  xpos: false  # apply xpos decay (rope only).
+  checkpointing: none  # layer checkpointing method, can be none, torch, xformers_default.
+                       # torch is the slowest but uses the least memory,
+                       # xformers_default is somewhere in between.
+  weight_init: null  # weight initialization (null, gaussian or uniform)
+  depthwise_init: null  # perform depthwise initialization (null, current, global)
+  zero_bias_init: false  # initialize bias to zero if bias in linears and
+                         # if a weight_init method is used.
+  norm: layer_norm  # normalization method to use in transformer.
+  cross_attention: false
+  qk_layer_norm: false
+  qk_layer_norm_cross: false
+  attention_dropout: null
+  kv_repeat: 1
+  two_step_cfg: false  # whether to do true 2 steps CFG, potentially resolving some padding issues or not...
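As the comment on the defaults list says, model size should be changed through the `model/lm/model_scale` group rather than by editing the `transformer_lm` keys here. A minimal sketch of what that composition amounts to, using plain OmegaConf merging as a stand-in for Hydra's group resolution (the values are taken from the scale files below):

```python
from omegaconf import OmegaConf

base = OmegaConf.create({"transformer_lm": {"dim": 512, "num_heads": 8, "num_layers": 8}})
# Equivalent in spirit to `override /model/lm/model_scale: medium`
medium = OmegaConf.create({"transformer_lm": {"dim": 1536, "num_heads": 24, "num_layers": 48}})

cfg = OmegaConf.merge(base, medium)
print(cfg.transformer_lm.dim)  # 1536: the scale group wins over the defaults above
```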
config/model/lm/model_scale/base.yaml
ADDED
@@ -0,0 +1,3 @@
+# @package __global__
+
+# overrides nothing because default is already transformer base (~ 60M params)
config/model/lm/model_scale/large.yaml
ADDED
@@ -0,0 +1,7 @@
+# @package _global_
+
+# gpt2 inspired, even bigger (~3.3B params)
+transformer_lm:
+  dim: 2048
+  num_heads: 32
+  num_layers: 48
config/model/lm/model_scale/medium.yaml
ADDED
@@ -0,0 +1,7 @@
+# @package _global_
+
+# gpt2 like (~1.5B params)
+transformer_lm:
+  dim: 1536
+  num_heads: 24
+  num_layers: 48
config/model/lm/model_scale/small.yaml
ADDED
@@ -0,0 +1,8 @@
+# @package _global_
+
+# 300M Param.
+
+transformer_lm:
+  dim: 1024
+  num_heads: 16
+  num_layers: 24
config/model/lm/model_scale/xsmall.yaml
ADDED
@@ -0,0 +1,8 @@
+# @package _global_
+# just used for debugging or when we just want to populate the cache
+# and do not care about training.
+
+transformer_lm:
+  dim: 64
+  num_heads: 2
+  num_layers: 2
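The `~60M`, `300M`, `~1.5B`, `~3.3B` figures quoted in these scale files can be sanity-checked with the usual back-of-the-envelope transformer estimate of roughly `12 * num_layers * dim^2` non-embedding parameters (an approximation that ignores embeddings, cross-attention, and the per-codebook output heads):

```python
def approx_params(dim: int, num_layers: int) -> float:
    # 4*dim^2 for attention projections + 8*dim^2 for the 4x-wide feedforward
    return 12 * num_layers * dim ** 2

for name, dim, layers in [("base", 512, 8), ("small", 1024, 24),
                          ("medium", 1536, 48), ("large", 2048, 48)]:
    print(f"{name}: ~{approx_params(dim, layers) / 1e6:.0f}M")
# roughly 25M / 300M / 1.4B / 2.4B; the quoted figures also count embeddings,
# cross-attention and the codebook heads, hence the gap for base and large.
```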
config/model/lm/musicgen_lm.yaml
ADDED
@@ -0,0 +1,36 @@
+# @package __global__
+
+defaults:
+  - lm/default
+  - override /conditioner: text2music
+  - override /model/lm/model_scale: small  # prefer this group to set model scale instead of transformer_lm keys directly
+
+lm_model: transformer_lm
+
+codebooks_pattern:
+  modeling: delay
+  delay:
+    delays: [0, 1, 2, 3]
+    flatten_first: 0
+    empty_initial: 0
+  unroll:
+    flattening: [0, 1, 2, 3]
+    delays: [0, 0, 0, 0]
+  music_lm:
+    group_by: 2
+  valle:
+    delays: [0, 0, 0]
+
+transformer_lm:
+  n_q: 4
+  card: 2048
+  memory_efficient: true
+  bias_proj: false
+  bias_ff: false
+  bias_attn: false
+  norm_first: true
+  layer_scale: null
+  weight_init: gaussian
+  depthwise_init: current
+  zero_bias_init: true
+  attention_as_float32: false
config/model/none.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package __global__
+
+# This file exists so that model is recognized as a config group
+# by Hydra, and Dora. A bit weird; we might need a better fix someday.
config/model/score/basic.yaml
ADDED
@@ -0,0 +1,17 @@
+# @package _global_
+
+diffusion_unet:
+  hidden: 48
+  depth: 4
+  res_blocks: 1
+  norm_groups: 4
+  kernel: 8
+  stride: 4
+  growth: 4
+  max_channels: 10_000
+  dropout: 0.
+  emb_all_layers: true
+  bilstm: false
+  codec_dim: null
+  transformer: false
+  cross_attention: false
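With `hidden: 48`, `growth: 4`, `depth: 4` and `max_channels: 10_000`, a reasonable reading is that the channel count grows geometrically with depth and is clipped at the cap. The sketch below encodes that assumption only; the actual rule lives in AudioCraft's diffusion UNet code and may differ:

```python
hidden, growth, depth, max_channels = 48, 4, 4, 10_000

channels = []
ch = hidden
for level in range(depth):
    channels.append(min(int(ch), max_channels))  # cap applied per level (assumed)
    ch *= growth
print(channels)  # [48, 192, 768, 3072] under this assumed rule
```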
config/solver/audiogen/audiogen_base_16khz.yaml
ADDED
@@ -0,0 +1,70 @@
+# @package __global__
+
+# This is the training loop solver
+# for the base AudioGen model (text-to-sound)
+# on monophonic audio sampled at 16 kHz
+# using a similar EnCodec+LM setup to MusicGen
+defaults:
+  - audiogen/default
+  - /model: lm/audiogen_lm
+  - override /dset: audio/default
+  - _self_
+
+autocast: true
+autocast_dtype: float16
+
+# EnCodec large trained on mono-channel music audio sampled at 16khz
+# with a total stride of 320 leading to 50 frames/s.
+# rvq.n_q=4, rvq.bins=2048, no quantization dropout
+# (transformer_lm card and n_q must be compatible)
+compression_model_checkpoint: //reference/bd44a852/checkpoint.th
+
+channels: 1
+sample_rate: 16000
+
+deadlock:
+  use: true  # deadlock detection
+
+dataset:
+  batch_size: 128  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
+  num_workers: 10
+  segment_duration: 10
+  min_segment_ratio: 1.0
+  sample_on_weight: false  # Uniform sampling all the way
+  sample_on_duration: false  # Uniform sampling all the way
+  external_metadata_source: null
+  # sample mixing augmentation at train time
+  train:
+    batch_size: 256  # matching AudioGen paper setup
+    aug_p: 0.5  # perform audio mixing 50% of the time
+    mix_p: 0.5  # proportion of batch items mixed together
+                # important: note that this will reduce the
+                # actual batch size used at train time
+                # which will be equal to mix_p * batch_size
+    mix_snr_low: -5
+    mix_snr_high: 5
+    mix_min_overlap: 0.5
+
+generate:
+  lm:
+    use_sampling: true
+    top_k: 250
+    top_p: 0.0
+
+optim:
+  epochs: 100
+  optimizer: adamw
+  lr: 5e-4
+  ema:
+    use: true
+    updates: 10
+    device: cuda
+
+logging:
+  log_tensorboard: true
+
+schedule:
+  lr_scheduler: inverse_sqrt
+  inverse_sqrt:
+    warmup: 3000
+    warmup_init_lr: 0.0
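Two numbers worth checking against the comments in this file: the mixing augmentation halves the effective batch (256 * mix_p 0.5 = 128, which is why `dataset.batch_size` is 128), and a 10 s segment at 50 frames/s with 4 codebooks is a 2,000-token target per example. Plain arithmetic:

```python
train_batch, mix_p = 256, 0.5
effective_batch = int(train_batch * mix_p)         # 128, matches dataset.batch_size

segment_s, frame_rate, n_q = 10, 50, 4             # 16 kHz / stride 320 -> 50 frames/s
tokens_per_example = segment_s * frame_rate * n_q  # 2000 tokens
print(effective_batch, tokens_per_example)
```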
config/solver/audiogen/debug.yaml
ADDED
@@ -0,0 +1,52 @@
+# @package __global__
+
+# This is a minimal debugging configuration
+# for the AudioGen training solver
+defaults:
+  - audiogen/default
+  - /model: lm/audiogen_lm
+  - override /model/lm/model_scale: xsmall
+  - override /dset: audio/example
+  - _self_
+
+autocast: false
+compression_model_checkpoint: null
+
+codebooks_pattern:
+  modeling: parallel
+
+channels: 1
+sample_rate: 16000
+
+deadlock:
+  use: false  # deadlock detection
+
+dataset:
+  batch_size: 4
+  segment_duration: 5
+  sample_on_weight: false  # Uniform sampling all the way
+  sample_on_duration: false  # Uniform sampling all the way
+
+generate:
+  audio:
+    strategy: peak
+  lm:
+    use_sampling: false
+    top_k: 0
+    top_p: 0.0
+
+checkpoint:
+  save_every: 0
+  keep_last: 0
+
+optim:
+  epochs: 2
+  updates_per_epoch: 10
+  optimizer: adamw
+  lr: 1e-4
+
+logging:
+  log_tensorboard: true
+
+schedule:
+  lr_scheduler: null
config/solver/audiogen/default.yaml
ADDED
@@ -0,0 +1,40 @@
+# @package __global__
+
+defaults:
+  - /solver/musicgen/default
+  - _self_
+  - /solver/audiogen/evaluation: none
+  - override /dset: audio/default
+
+# See config/solver/musicgen/default.yaml for a list of possible values.
+# We only keep the most important here.
+
+autocast: true
+autocast_dtype: float16
+
+solver: audiogen
+sample_rate: ???
+channels: ???
+compression_model_checkpoint: ???
+
+tokens:
+  padding_with_special_token: false
+
+dataset:
+  batch_size: 128
+  segment_duration: 10
+  min_segment_ratio: 1.0  # lower values such as 0.5 result in generations with a lot of silence.
+
+optim:
+  epochs: 100
+  updates_per_epoch: 2000
+  lr: 1e-4
+  optimizer: adamw
+  max_norm: 1.0
+  adam:
+    betas: [0.9, 0.95]
+    weight_decay: 0.1
+    eps: 1e-8
+
+schedule:
+  lr_scheduler: null
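The `optim` block maps directly onto a standard AdamW setup with gradient clipping. A minimal PyTorch equivalent of these hyperparameters (just the optimizer construction, not the AudioCraft solver loop):

```python
import torch

model = torch.nn.Linear(8, 8)  # placeholder for the LM parameters
opt = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.95),
    weight_decay=0.1,
    eps=1e-8,
)
# per-step clipping corresponding to max_norm: 1.0
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```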
config/solver/audiogen/evaluation/none.yaml
ADDED
@@ -0,0 +1,5 @@
+# @package __global__
+
+dataset:
+  evaluate:
+    num_samples: 10000
config/solver/audiogen/evaluation/objective_eval.yaml
ADDED
@@ -0,0 +1,29 @@
+# @package __global__
+
+# Setup to run evaluation only on AudioCaps, for audio generation
+# evaluation with objective metrics
+# execute_only=evaluate
+
+dataset:
+  max_audio_duration: null
+  # ensure the proper values are broadcasted here for evaluate
+  evaluate:
+    min_audio_duration: 1.  # some metrics require a minimum audio length
+    max_audio_duration: null  # all samples from audiocaps should be ~10s
+    num_samples: null
+    segment_duration: null
+  generate:
+    min_audio_duration: 1.
+    max_audio_duration: null
+    num_samples: 500
+
+evaluate:
+  metrics:
+    fad: true
+    kld: true
+    text_consistency: true
+
+metrics:
+  kld:
+    passt:
+      pretrained_length: 10  # similarly to reported results in AudioGen paper
config/solver/compression/debug.yaml
ADDED
@@ -0,0 +1,55 @@
+# @package __global__
+
+defaults:
+  - compression/default
+  - /model: encodec/encodec_base_causal
+  - override /dset: audio/example
+  - _self_
+
+channels: 1
+sample_rate: 16000
+
+# debug config uses just L1
+losses:
+  adv: 0.
+  feat: 0.
+  l1: 1.
+  mel: 0.
+  msspec: 0.
+# no balancer
+balancer:
+  balance_grads: false
+  ema_decay: 1.
+  total_norm: 1.
+  per_batch_item: false
+# no adversaries
+adversarial:
+  adversaries: []
+  adv_loss: hinge
+  feat_loss: l1
+
+# faster model for local dev
+seanet:
+  dimension: 16
+  n_filters: 4
+
+# very small dataset
+dataset:
+  batch_size: 8
+  num_workers: 10
+  num_samples: 100
+  segment_duration: 1
+  evaluate:
+    batch_size: 32
+  generate:
+    batch_size: 1
+    num_samples: 5
+    segment_duration: 10
+
+# limited training
+evaluate:
+  every: 5
+generate:
+  every: 5
+optim:
+  epochs: 50
config/solver/compression/default.yaml
ADDED
@@ -0,0 +1,160 @@
+# @package __global__
+
+defaults:
+  - ../default
+  - override /dset: audio/default
+  - _self_
+
+solver: compression
+sample_rate: ???
+channels: ???
+
+# loss balancing
+losses:
+  adv: 4.
+  feat: 4.
+  l1: 0.1
+  mel: 0.
+  msspec: 2.
+  sisnr: 0.
+balancer:
+  balance_grads: true
+  ema_decay: 0.999
+  per_batch_item: true
+  total_norm: 1.
+
+adversarial:
+  every: 1
+  adversaries: [msstftd]
+  adv_loss: hinge
+  feat_loss: l1
+
+# losses hyperparameters
+l1: {}
+l2: {}
+mrstft:
+  factor_sc: .5
+  factor_mag: .5
+  normalized: false
+mel:
+  sample_rate: ${sample_rate}
+  n_fft: 1024
+  hop_length: 256
+  win_length: 1024
+  n_mels: 64
+  f_min: 64
+  f_max: null
+  normalized: false
+  floor_level: 1e-5
+sisnr:
+  sample_rate: ${sample_rate}
+  segment: 5.
+msspec:
+  sample_rate: ${sample_rate}
+  range_start: 6
+  range_end: 11
+  n_mels: 64
+  f_min: 64
+  f_max: null
+  normalized: true
+  alphas: false
+  floor_level: 1e-5
+
+# metrics
+metrics:
+  visqol:
+    mode: audio
+    bin: null  # path to visqol install
+    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model  # visqol v3
+
+# adversaries hyperparameters
+msstftd:
+  in_channels: 1
+  out_channels: 1
+  filters: 32
+  norm: weight_norm
+  n_ffts: [1024, 2048, 512, 256, 128]
+  hop_lengths: [256, 512, 128, 64, 32]
+  win_lengths: [1024, 2048, 512, 256, 128]
+  activation: LeakyReLU
+  activation_params: {negative_slope: 0.3}
+msd:
+  in_channels: 1
+  out_channels: 1
+  scale_norms: [spectral_norm, weight_norm, weight_norm]
+  kernel_sizes: [5, 3]
+  filters: 16
+  max_filters: 1024
+  downsample_scales: [4, 4, 4, 4]
+  inner_kernel_sizes: null
+  groups: [4, 4, 4, 4]
+  strides: null
+  paddings: null
+  activation: LeakyReLU
+  activation_params: {negative_slope: 0.3}
+mpd:
+  in_channels: 1
+  out_channels: 1
+  periods: [2, 3, 5, 7, 11]
+  n_layers: 5
+  kernel_size: 5
+  stride: 3
+  filters: 8
+  filter_scales: 4
+  max_filters: 1024
+  activation: LeakyReLU
+  activation_params: {negative_slope: 0.3}
+  norm: weight_norm
+
+# data hyperparameters
+dataset:
+  batch_size: 64
+  num_workers: 10
+  segment_duration: 1
+  train:
+    num_samples: 500000
+  valid:
+    num_samples: 10000
+  evaluate:
+    batch_size: 32
+    num_samples: 10000
+  generate:
+    batch_size: 32
+    num_samples: 50
+    segment_duration: 10
+
+# solver hyperparameters
+evaluate:
+  every: 25
+  num_workers: 5
+  metrics:
+    visqol: false
+    sisnr: true
+generate:
+  every: 25
+  num_workers: 5
+  audio:
+    sample_rate: ${sample_rate}
+
+# checkpointing schedule
+checkpoint:
+  save_last: true
+  save_every: 25
+  keep_last: 10
+  keep_every_states: null
+
+# optimization hyperparameters
+optim:
+  epochs: 200
+  updates_per_epoch: 2000
+  lr: 3e-4
+  max_norm: 0.
+  optimizer: adam
+  adam:
+    betas: [0.5, 0.9]
+    weight_decay: 0.
+  ema:
+    use: true  # whether to use EMA or not
+    updates: 1  # update at every step
+    device: ${device}  # device for EMA, can be put on GPU if more frequent updates
+    decay: 0.99  # EMA decay value, if null, no EMA is used
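The `losses` weights (adv 4, feat 4, l1 0.1, msspec 2) are consumed by a gradient balancer (`balance_grads: true`) that rescales each loss's gradient before weighting. A plain weighted sum is the simplest mental model, with the caveat that it is not what the balancer actually does:

```python
# Naive weighted combination for intuition only; the real balancer normalizes each
# loss's gradient to a shared norm (total_norm) before applying these weights.
weights = {"adv": 4.0, "feat": 4.0, "l1": 0.1, "mel": 0.0, "msspec": 2.0, "sisnr": 0.0}

def combine(losses):
    return sum(weights[name] * value for name, value in losses.items() if weights[name] > 0)

print(combine({"adv": 0.9, "feat": 1.2, "l1": 0.05, "mel": 0.0, "msspec": 0.3, "sisnr": 0.0}))
```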
config/solver/compression/encodec_audiogen_16khz.yaml
ADDED
@@ -0,0 +1,10 @@
+# @package __global__
+
+defaults:
+  - compression/default
+  - /model: encodec/encodec_large_nq4_s320
+  - override /dset: audio/default
+  - _self_
+
+channels: 1
+sample_rate: 16000
config/solver/compression/encodec_base_24khz.yaml
ADDED
@@ -0,0 +1,10 @@
+# @package __global__
+
+defaults:
+  - compression/default
+  - /model: encodec/encodec_base_causal
+  - override /dset: audio/default
+  - _self_
+
+channels: 1
+sample_rate: 24000
config/solver/compression/encodec_musicgen_32khz.yaml
ADDED
@@ -0,0 +1,10 @@
+# @package __global__
+
+defaults:
+  - compression/default
+  - /model: encodec/encodec_large_nq4_s640
+  - override /dset: audio/default
+  - _self_
+
+channels: 1
+sample_rate: 32000
config/solver/default.yaml
ADDED
@@ -0,0 +1,108 @@
+# @package __global__
+
+# WARNING: This is a base configuration file shared across ALL solvers in AudioCraft
+# Please don't update this file directly. Instead use distinct configuration files
+# to override the below configuration.
+solver: ???
+
+fsdp:
+  use: false  # should we use FSDP.
+  param_dtype: float16  # equivalent to autocast_dtype for FSDP.
+  reduce_dtype: float32  # gradient averaging dtype, float32 will give max stability.
+  buffer_dtype: float32  # dtype used for buffers, we don't have much buffers, so let's leave it.
+  sharding_strategy: shard_grad_op  # can be shard_grad_op or full_shard.
+                                    # full_shard will use less memory but slower ??
+  per_block: true  # If True, uses nested FSDP.
+
+profiler:
+  enabled: false
+
+deadlock:
+  use: false
+  timeout: 600
+
+dataset:
+  batch_size: ???
+  num_workers: 10
+  segment_duration: null
+  num_samples: null
+  return_info: false
+  shuffle: false
+  sample_on_duration: true
+  sample_on_weight: true
+  min_segment_ratio: 0.5
+  train:
+    num_samples: null
+    shuffle: true
+    shuffle_seed: 0  # if you want to sample the data differently.
+    permutation_on_files: false
+  valid:
+    num_samples: null
+  evaluate:
+    num_samples: null
+  generate:
+    num_samples: null
+    return_info: true
+
+checkpoint:
+  save_last: true
+  save_every: null
+  keep_last: null
+  keep_every_states: null
+
+generate:
+  every: null
+  path: 'samples'
+  audio:
+    format: 'mp3'
+    strategy: 'clip'
+    sample_rate: null
+  lm:
+    use_sampling: false
+    temp: 1.0
+    top_k: 0
+    top_p: 0.0
+evaluate:
+  every: null
+  num_workers: 5
+  truncate_audio: null
+  fixed_generation_duration: null  # in secs
+  metrics:
+    base: true  # run default evaluation (e.g. like train/valid stage)
+
+optim:
+  epochs: ???
+  updates_per_epoch: null
+  lr: ???
+  optimizer: ???
+  adam:
+    betas: [0.9, 0.999]
+    weight_decay: 0.
+  ema:
+    use: false  # whether to use EMA or not
+    updates: ${optim.updates_per_epoch}  # frequency of updates of the EMA
+    device: cpu  # device for EMA, can be put on GPU if more frequent updates
+    decay: 0.99  # EMA decay value, if null, no EMA is used
+
+schedule:
+  lr_scheduler: null
+  step:
+    step_size: null
+    gamma: null
+  exponential:
+    lr_decay: null
+  cosine:
+    warmup: null
+    lr_min_ratio: 0.0
+    cycle_length: 1.0
+  polynomial_decay:
+    warmup: null
+    zero_lr_warmup_steps: 0
+    end_lr: 0.0
+    power: 1
+  inverse_sqrt:
+    warmup: null
+    warmup_init_lr: 0.0
+  linear_warmup:
+    warmup: null
+    warmup_init_lr: 0.0
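Several of the scheduler choices above are simple closed forms. For instance, an inverse square-root schedule with linear warmup (the variant selected by the AudioGen base solver earlier in this diff) can be written as below; treat it as a sketch of the shape, not a copy of AudioCraft's scheduler class:

```python
def inverse_sqrt_lr(step: int, base_lr: float, warmup: int, warmup_init_lr: float = 0.0) -> float:
    # Linear warmup from warmup_init_lr to base_lr, then decay as 1/sqrt(step).
    if step < warmup:
        return warmup_init_lr + (base_lr - warmup_init_lr) * step / max(1, warmup)
    return base_lr * (warmup / step) ** 0.5

# e.g. base_lr=5e-4, warmup=3000 as in audiogen_base_16khz.yaml
for s in (0, 1500, 3000, 12000):
    print(s, inverse_sqrt_lr(s, 5e-4, 3000))
```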
config/solver/diffusion/debug.yaml
ADDED
@@ -0,0 +1,106 @@
+# @package __global__
+
+defaults:
+  - /solver/default
+  - /model: score/basic
+  - override /dset: audio/default
+  - _self_
+
+solver: diffusion
+
+sample_rate: 16000
+channels: 1
+compression_model_checkpoint: //sig/5091833e
+n_q: 2  # number of codebooks to keep
+
+dataset:
+  batch_size: 8
+  num_workers: 10
+  segment_duration: 1
+  train:
+    num_samples: 100
+  valid:
+    num_samples: 100
+  evaluate:
+    batch_size: 8
+    num_samples: 10
+  generate:
+    batch_size: 8
+    num_samples: 10
+    segment_duration: 10
+
+loss:
+  kind: mse
+  norm_power: 0.
+
+valid:
+  every: 1
+
+evaluate:
+  every: 5
+  num_workers: 5
+  metrics:
+    visqol: false
+    sisnr: false
+    rvm: true
+
+generate:
+  every: 5
+  num_workers: 5
+  audio:
+    sample_rate: ${sample_rate}
+
+checkpoint:
+  save_last: true
+  save_every: 25
+  keep_last: 10
+  keep_every_states: null
+
+
+optim:
+  epochs: 50
+  updates_per_epoch: 2000
+  lr: 2e-4
+  max_norm: 0
+  optimizer: adam
+  adam:
+    betas: [0.9, 0.999]
+    weight_decay: 0.
+  ema:
+    use: true  # whether to use EMA or not
+    updates: 1  # update at every step
+    device: ${device}  # device for EMA, can be put on GPU if more frequent updates
+    decay: 0.99  # EMA decay value, if null, no EMA is used
+
+processor:
+  name: multi_band_processor
+  use: false
+  n_bands: 8
+  num_samples: 10_000
+  power_std: 1.
+
+resampling:
+  use: false
+  target_sr: 16000
+
+filter:
+  use: false
+  n_bands: 4
+  idx_band: 0
+  cutoffs: null
+
+schedule:
+  repartition: "power"
+  variable_step_batch: true
+  beta_t0: 1.0e-5
+  beta_t1: 2.9e-2
+  beta_exp: 7.5
+  num_steps: 1000
+  variance: 'beta'
+  clip: 5.
+  rescale: 1.
+  n_bands: null
+  noise_scale: 1.0
+
+metrics:
+  num_stage: 4
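The `schedule` block defines the diffusion noise schedule. With `repartition: "power"`, a reasonable reading is that the betas are spread between `beta_t0` and `beta_t1` following a power law with exponent `beta_exp`; the sketch below encodes that assumption only (the exact parameterization lives in AudioCraft's diffusion schedule code and may differ):

```python
def power_betas(beta_t0=1.0e-5, beta_t1=2.9e-2, beta_exp=7.5, num_steps=1000):
    # Assumed "power" repartition: t ** beta_exp keeps betas near beta_t0 for most
    # of the trajectory and ramps up sharply toward beta_t1 at the end.
    betas = []
    for i in range(num_steps):
        t = (i + 1) / num_steps
        betas.append(beta_t0 + (beta_t1 - beta_t0) * t ** beta_exp)
    return betas

betas = power_betas()
print(betas[0], betas[-1])  # close to beta_t0 at the start, exactly beta_t1 at the end
```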