dbal0503 committed on Jan 31, 2023

Commit

2ce7b1a

•

1 Parent(s): cdb23c5

Upload 693 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Essay_classifier/.gitignore +168 -0
Essay_classifier/.idea/.gitignore +3 -0
Essay_classifier/.idea/Essay_classifier.iml +14 -0
Essay_classifier/.idea/inspectionProfiles/profiles_settings.xml +6 -0
Essay_classifier/.idea/misc.xml +4 -0
Essay_classifier/.idea/modules.xml +8 -0
Essay_classifier/.idea/vcs.xml +6 -0
Essay_classifier/.idea/workspace.xml +42 -0
Essay_classifier/CITATION.cff +14 -0
Essay_classifier/LICENSE +21 -0
Essay_classifier/README.md +95 -0
Essay_classifier/S5.egg-info/PKG-INFO +13 -0
Essay_classifier/S5.egg-info/SOURCES.txt +7 -0
Essay_classifier/S5.egg-info/dependency_links.txt +1 -0
Essay_classifier/S5.egg-info/top_level.txt +1 -0
Essay_classifier/bin/download_aan.sh +4 -0
Essay_classifier/bin/download_all.sh +8 -0
Essay_classifier/bin/download_lra.sh +9 -0
Essay_classifier/bin/download_sc35.sh +4 -0
Essay_classifier/bin/python_scripts/download_sc.py +4 -0
Essay_classifier/bin/run_experiments/run_gpt_classifier.sh +7 -0
Essay_classifier/bin/run_experiments/run_lra_aan.sh +5 -0
Essay_classifier/bin/run_experiments/run_lra_cifar.sh +4 -0
Essay_classifier/bin/run_experiments/run_lra_imdb.sh +7 -0
Essay_classifier/bin/run_experiments/run_lra_listops.sh +4 -0
Essay_classifier/bin/run_experiments/run_lra_pathfinder.sh +5 -0
Essay_classifier/bin/run_experiments/run_lra_pathx.sh +5 -0
Essay_classifier/bin/run_experiments/run_speech35.sh +4 -0
Essay_classifier/docs/figures/pdfs/s3-block-diagram-2.pdf +0 -0
Essay_classifier/docs/figures/pdfs/s4-matrix-blocks.pdf +0 -0
Essay_classifier/docs/figures/pdfs/s4-s3-block-diagram-2.pdf +0 -0
Essay_classifier/docs/figures/pdfs/s5-matrix-blocks.pdf +0 -0
Essay_classifier/docs/figures/pngs/pendulum.png +0 -0
Essay_classifier/docs/figures/pngs/s3-block-diagram-2.png +0 -0
Essay_classifier/docs/figures/pngs/s4-matrix-blocks.png +0 -0
Essay_classifier/docs/figures/pngs/s4-s3-block-diagram-2.png +0 -0
Essay_classifier/docs/figures/pngs/s5-matrix-blocks.png +0 -0
Essay_classifier/docs/s5_blog.md +77 -0
Essay_classifier/essays/dataset_dict.json +1 -0
Essay_classifier/essays/test/data-00000-of-00001.arrow +3 -0
Essay_classifier/essays/test/dataset_info.json +20 -0
Essay_classifier/essays/test/state.json +13 -0
Essay_classifier/essays/train/data-00000-of-00001.arrow +3 -0
Essay_classifier/essays/train/dataset_info.json +20 -0
Essay_classifier/essays/train/state.json +13 -0
Essay_classifier/requirements_cpu.txt +9 -0
Essay_classifier/requirements_gpu.txt +9 -0
Essay_classifier/run_train.py +101 -0
Essay_classifier/s5/__init__.py +0 -0
Essay_classifier/s5/dataloaders/README.md +8 -0

Essay_classifier/.gitignore ADDED Viewed

	@@ -0,0 +1,168 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+cache_dir/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wandb/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+*.pyc
+# S5 specific stuff
+wandb/
+cache_dir/
+raw_datasets/

Essay_classifier/.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

Essay_classifier/.idea/Essay_classifier.iml ADDED Viewed

	@@ -0,0 +1,14 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

Essay_classifier/.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

Essay_classifier/.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (Essay_classifier)" project-jdk-type="Python SDK" />
+</project>

Essay_classifier/.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Essay_classifier.iml" filepath="$PROJECT_DIR$/.idea/Essay_classifier.iml" />
+    </modules>
+  </component>
+</project>

Essay_classifier/.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

Essay_classifier/.idea/workspace.xml ADDED Viewed

	@@ -0,0 +1,42 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="4563d2f1-3686-4dcf-84a5-992172d73207" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="2KEFml6EVBHbZyIrqa6fvMDUtj0" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true"
+  }
+}]]></component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="4563d2f1-3686-4dcf-84a5-992172d73207" name="Changes" comment="" />
+      <created>1673531998138</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1673531998138</updated>
+    </task>
+    <servers />
+  </component>
+</project>

Essay_classifier/CITATION.cff ADDED Viewed

	@@ -0,0 +1,14 @@

+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Smith"
+  given-names: "Jimmy T. H."
+- family-names: "Warrington"
+  given-names: "Andrew"
+- family-names: "Linderman"
+  given-names: "Scott"
+title: "Simplified State Space Layers for Sequence Modeling"
+version: 0.0.1
+doi: 10.48550/arXiv.2208.04933
+url: "https://github.com/lindermanlab/S5"
+date-released: 2022-10-04

Essay_classifier/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 Linderman Lab
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Essay_classifier/README.md ADDED Viewed

	@@ -0,0 +1,95 @@

+# S5: Simplified State Space Layers for Sequence Modeling
+This repository provides the implementation for the
+paper: Simplified State Space Layers for Sequence Modeling.  The preprint is available [here](https://arxiv.org/abs/2208.04933).
+![](./docs/figures/pngs/s5-matrix-blocks.png)
+<p style="text-align: center;">
+Figure 1:  S5 uses a single multi-input, multi-output linear state-space model, coupled with non-linearities, to define a non-linear sequence-to-sequence transformation. Parallel scans are used for efficient offline processing.
+</p>
+The S5 layer builds on the prior S4 work ([paper](https://arxiv.org/abs/2111.00396)). While it has departed considerably, this repository originally started off with much of the JAX implementation of S4 from the
+Annotated S4 blog by Rush and Karamcheti (available [here](https://github.com/srush/annotated-s4)).
+## Requirements & Installation
+To run the code on your own machine, run either `pip install -r requirements_cpu.txt` or `pip install -r requirements_gpu.txt`.  The GPU installation of JAX can be tricky, and so we include requirements that should work for most people, although further instructions are available [here](https://github.com/google/jax#installation).
+Run from within the root directory `pip install -e .` to install the package.
+## Data Download
+Downloading the raw data is done differently for each dataset.  The following datasets require no action:
+- Text (IMDb)
+- Image (Cifar black & white)
+- sMNIST
+- psMNIST
+- Cifar (Color)
+The remaining datasets need to be manually downloaded.  To download _everything_, run `./bin/download_all.sh`.  This will download quite a lot of data and will take some time.
+Below is a summary of the steps for each dataset:
+- ListOps: run `./bin/download_lra.sh` to download the full LRA dataset.
+- Retrieval (AAN): run `./bin/download_aan.sh`
+- Pathfinder: run `./bin/download_lra.sh` to download the full LRA dataset.
+- Path-X: run `./bin/download_lra.sh` to download the full LRA dataset.
+- Speech commands 35: run `./bin/download_sc35.sh` to download the speech commands data.
+*With the exception of SC35.*  When the dataset is used for the first time, a cache is created in `./cache_dir`.  Converting the data (e.g. tokenizing) can be quite slow, and so this cache contains the processed dataset.  The cache can be moved and specified with the `--dir_name` argument (i.e. the default is `--dir_name=./cache_dir`) to avoid applying this preprocessing every time the code is run somewhere new.
+SC35 is slightly different.  SC35 doesn't use `--dir_name`, and instead requires that the following path exists: `./raw_datasets/speech_commands/0.0.2/SpeechCommands` (i.e. the directory `./raw_datasets/speech_commands/0.0.2/SpeechCommands/zero` must exist).  The cache is then stored in `./raw_datasets/speech_commands/0.0.2/SpeechCommands/processed_data`.  This directory can then be copied (preserving the directory path) to move the preprocessed dataset to a new location.
+## Repository Structure
+Directories and files that ship with the GitHub repo:
+```
+s5/                    Source code for models, datasets, etc.
+    dataloading.py          Dataloading functions.
+    layers.py               Defines the S5 layer which wraps the S5 SSM with nonlinearity, norms, dropout, etc.
+    seq_model.py            Defines deep sequence models that consist of stacks of S5 layers.
+    ssm.py                  S5 SSM implementation.
+    ssm_init.py             Helper functions for initializing the S5 SSM.
+    train.py                Training loop code.
+    train_helpers.py        Functions for optimization, training and evaluation steps.
+    dataloaders/            Code mainly derived from S4 processing each dataset.
+    utils/                  Range of utility functions.
+bin/                    Shell scripts for downloading data and running example experiments.
+requirements_cpu.txt    Requirements for running in CPU mode (not advised).
+requirements_gpu.txt    Requirements for running in GPU mode (installation can be highly system-dependent).
+run_train.py            Training loop entrypoint.
+```
+Directories that may be created on-the-fly:
+```
+raw_datasets/       Raw data as downloaded.
+cache_dir/          Precompiled caches of data.  Can be copied to new locations to avoid preprocessing.
+wandb/              Local WandB log files.
+```
+## Experiments
+The configurations to run the LRA and 35-way Speech Commands experiments from the paper are located in  `bin/run_experiments`. For example,
+to run the LRA text (character level IMDB) experiment, run `./bin/run_experiments/run_lra_imdb.sh`.
+To log with W&B, adjust the default `USE_WANDB, wandb_entity, wandb_project` arguments.
+Note: the pendulum
+regression dataloading and experiments will be added soon.
+## Citation
+Please use the following when citing our work:
+```
+@misc{smith2022s5,
+  doi = {10.48550/ARXIV.2208.04933},
+  url = {https://arxiv.org/abs/2208.04933},
+  author = {Smith, Jimmy T. H. and Warrington, Andrew and Linderman, Scott W.},
+  keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {Simplified State Space Layers for Sequence Modeling},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```
+Please reach out if you have any questions.
+-- The S5 authors.

Essay_classifier/S5.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,13 @@

+Metadata-Version: 2.1
+Name: S5
+Version: 0.1
+Summary: Simplified State Space Models for Sequence Modeling.
+Home-page: UNKNOWN
+Author: J.T.H. Smith, A. Warrington, S. Linderman.
+Author-email: jsmith14@stanford.edu
+License: UNKNOWN
+Platform: UNKNOWN
+License-File: LICENSE
+UNKNOWN

Essay_classifier/S5.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+LICENSE
+README.md
+setup.py
+S5.egg-info/PKG-INFO
+S5.egg-info/SOURCES.txt
+S5.egg-info/dependency_links.txt
+S5.egg-info/top_level.txt

Essay_classifier/S5.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

Essay_classifier/S5.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

Essay_classifier/bin/download_aan.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+mkdir raw_datasets
+# Download the raw AAN data from the TutorialBank Corpus.
+wget -v https://github.com/Yale-LILY/TutorialBank/blob/master/resources-v2022-clean.tsv -P ./raw_datasets

Essay_classifier/bin/download_all.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+# Make a directory to dump the raw data into.
+rm -rf ./raw_datasets
+mkdir ./raw_datasets
+./bin/download_lra.sh
+./bin/download_aan.sh
+./bin/download_sc35.sh

Essay_classifier/bin/download_lra.sh ADDED Viewed

	@@ -0,0 +1,9 @@

+mkdir raw_datasets
+# Clone and unpack the LRA object.
+# This can take a long time, so get comfortable.
+rm -rf ./raw_datasets/lra_release.gz ./raw_datasets/lra_release  # Clean out any old datasets.
+wget -v https://storage.googleapis.com/long-range-arena/lra_release.gz -P ./raw_datasets
+# Add a progress bar because this can be slow.
+pv ./raw_datasets/lra_release.gz | tar -zx -C ./raw_datasets/

Essay_classifier/bin/download_sc35.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+mkdir raw_datasets
+# Use tfds to download the speech commands dataset.
+python ./bin/python_scripts/download_sc.py

Essay_classifier/bin/python_scripts/download_sc.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import tensorflow_datasets as tfds
+import os
+cfg = tfds.download.DownloadConfig(extract_dir=os.getcwd() + '/raw_datasets/')
+tfds.load('speech_commands', data_dir='./raw_datasets', download=True, download_and_prepare_kwargs={'download_dir': os.getcwd() + '/raw_datasets/', 'download_config': cfg})

Essay_classifier/bin/run_experiments/run_gpt_classifier.sh ADDED Viewed

	@@ -0,0 +1,7 @@

+python run_train.py --C_init=lecun_normal --activation_fn=half_glu2 \
+                    --batchnorm=True --bidirectional=True --blocks=12 --bsz=8 \
+                    --d_model=64 --dataset=imdb-classification \
+                    --dt_global=True --epochs=35 --jax_seed=8825365 --lr_factor=4 \
+                    --n_layers=6 --opt_config=standard --p_dropout=0.1 --ssm_lr_base=0.001 \
+                    --ssm_size_base=192 --warmup_end=0 --weight_decay=0.07 \
+                    --USE_WANDB True --wandb_project awsome_0 --wandb_entity Vodolay

Essay_classifier/bin/run_experiments/run_lra_aan.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+python run_train.py --C_init=trunc_standard_normal --batchnorm=True --bidirectional=True \
+                    --blocks=16 --bsz=32 --d_model=128 --dataset=aan-classification \
+                    --dt_global=True --epochs=20 --jax_seed=5464368 --lr_factor=2 --n_layers=6 \
+                    --opt_config=standard --p_dropout=0.0 --ssm_lr_base=0.001 --ssm_size_base=256 \
+                    --warmup_end=1 --weight_decay=0.05

Essay_classifier/bin/run_experiments/run_lra_cifar.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+python run_train.py --C_init=lecun_normal --batchnorm=True --bidirectional=True \
+                    --blocks=3 --bsz=50 --clip_eigs=True --d_model=512 --dataset=lra-cifar-classification \
+                    --epochs=250 --jax_seed=16416 --lr_factor=4.5 --n_layers=6 --opt_config=BfastandCdecay \
+                    --p_dropout=0.1 --ssm_lr_base=0.001 --ssm_size_base=384 --warmup_end=1 --weight_decay=0.07

Essay_classifier/bin/run_experiments/run_lra_imdb.sh ADDED Viewed

	@@ -0,0 +1,7 @@

+python run_train.py --C_init=lecun_normal --activation_fn=half_glu2 \
+                    --batchnorm=True --bidirectional=True --blocks=12 --bsz=8 \
+                    --d_model=64 --dataset=imdb-classification \
+                    --dt_global=True --epochs=35 --jax_seed=8825365 --lr_factor=4 \
+                    --n_layers=6 --opt_config=standard --p_dropout=0.1 --ssm_lr_base=0.001 \
+                    --ssm_size_base=192 --warmup_end=0 --weight_decay=0.07 \
+                    --USE_WANDB True --wandb_project awsome_0 --wandb_entity Vodolay

Essay_classifier/bin/run_experiments/run_lra_listops.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+python run_train.py --C_init=lecun_normal --activation_fn=half_glu2 --batchnorm=True \
+                    --bidirectional=True --blocks=8 --bsz=50 --d_model=128 --dataset=listops-classification \
+                    --epochs=40 --jax_seed=6554595 --lr_factor=3 --n_layers=8 --opt_config=BfastandCdecay \
+                    --p_dropout=0 --ssm_lr_base=0.001 --ssm_size_base=16 --warmup_end=1 --weight_decay=0.04

Essay_classifier/bin/run_experiments/run_lra_pathfinder.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+python run_train.py --C_init=trunc_standard_normal --batchnorm=True --bidirectional=True \
+                    --blocks=8 --bn_momentum=0.9 --bsz=64 --d_model=192 \
+                    --dataset=pathfinder-classification  --epochs=200 --jax_seed=8180844 --lr_factor=5 \
+                    --n_layers=6 --opt_config=standard --p_dropout=0.05 --ssm_lr_base=0.0009 \
+                    --ssm_size_base=256 --warmup_end=1 --weight_decay=0.03

Essay_classifier/bin/run_experiments/run_lra_pathx.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+python run_train.py --C_init=complex_normal --batchnorm=True --bidirectional=True \
+                    --blocks=16 --bn_momentum=0.9 --bsz=32 --d_model=128 --dataset=pathx-classification \
+                    --dt_min=0.0001 --epochs=75 --jax_seed=6429262 --lr_factor=3 --n_layers=6 \
+                    --opt_config=BandCdecay --p_dropout=0.0 --ssm_lr_base=0.0006 --ssm_size_base=256 \
+                    --warmup_end=1 --weight_decay=0.06

Essay_classifier/bin/run_experiments/run_speech35.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+python run_train.py --C_init=lecun_normal --batchnorm=True --bidirectional=True \
+                    --blocks=16 --bsz=16 --d_model=96 --dataset=speech35-classification \
+                    --epochs=40 --jax_seed=4062966 --lr_factor=4 --n_layers=6 --opt_config=noBCdecay \
+                    --p_dropout=0.1 --ssm_lr_base=0.002 --ssm_size_base=128 --warmup_end=1 --weight_decay=0.04

Essay_classifier/docs/figures/pdfs/s3-block-diagram-2.pdf ADDED Viewed

Binary file (146 kB). View file

Essay_classifier/docs/figures/pdfs/s4-matrix-blocks.pdf ADDED Viewed

Binary file (209 kB). View file

Essay_classifier/docs/figures/pdfs/s4-s3-block-diagram-2.pdf ADDED Viewed

Binary file (200 kB). View file

Essay_classifier/docs/figures/pdfs/s5-matrix-blocks.pdf ADDED Viewed

Binary file (208 kB). View file

Essay_classifier/docs/figures/pngs/pendulum.png ADDED Viewed

Essay_classifier/docs/figures/pngs/s3-block-diagram-2.png ADDED Viewed

Essay_classifier/docs/figures/pngs/s4-matrix-blocks.png ADDED Viewed

Essay_classifier/docs/figures/pngs/s4-s3-block-diagram-2.png ADDED Viewed

Essay_classifier/docs/figures/pngs/s5-matrix-blocks.png ADDED Viewed

Essay_classifier/docs/s5_blog.md ADDED Viewed

	@@ -0,0 +1,77 @@

+<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
+# S5: Simplified State Space Layers for Sequence Modeling
+_By [Jimmy Smith](https://icme.stanford.edu/people/jimmy-smith), [Andrew Warrington](https://github.com/andrewwarrington) & [Scott Linderman](https://web.stanford.edu/~swl1/)._
+_This post accompanies the preprint Smith et al [2022], available [here](https://arxiv.org/pdf/2208.04933.pdf).  Code for the paper is available [here](https://github.com/lindermanlab/S5)_.
+## TL;DR.
+In our preprint we demonstrate that we can build a state-of-the-art deep sequence-to-sequence model by stacking many dense, multi-input, multi-output (MIMO) state space models (SSMs) as a layer.  This replaces the many single-input, single-output (SISO) SSMs used by the _structured state space sequence_ (S4) model [Gu et al, 2021].  This allows us to make use of an efficient parallel scan to achieve the same computational efficiency as S4, without the need to use frequency domain and convolutional methods.  We show that S5 achieves the same, if not better, performance than S4 on a range of long-range sequence modeling tasks.
+![](./figures/pngs/s5-matrix-blocks.png)
+_Figure 1: Our S5 layer uses a single, dense, multi-input, multi-output state space model as a layer in a deep sequence-to-sequence model._
+## S4 is Epically Good.  So... Why?
+<a name="fig_s4_stack"></a>
+![](./figures/pngs/s4-s3-block-diagram-2.png)
+_Figure 2: A schematic of the computations required by S4.  \\(H\\) SISO SSMs are applied in the frequency domain, passed through a non-linearity, and then mixed to provide the input to the next layer.  Deriving the "Frequency domain convolution kernel generation" (and the required parameterization, indicated in blue) is the primary focus of Gu et al [2021]._
+The performance of S4 is unarguable.  Transformer-based methods were clawing for single percentage point gains on the long range arena benchmark dataset [Tay et al, 2021].  S4 beat many SotA transformer methods by as much as twenty percentage points.  AND, to top it off, could process sequences with complexity linear in the sequence length, and sublinear in parallel time (with a reasonable number of processors).
+However, the original S4 is a very involved method.  It required specific matrix parameterizations, decompositions, mathematical identities, Fourier transforms, and more, as illustrated in [Figure 2](#fig_s4_stack).  As a research group, we spent several weeks trying to understand all the intricacies of the method.  This left us asking:  is there a different way of using the same core concepts, retaining performance and complexity, but, maybe, making it (subjectively, we admit!) simpler?
+Enter S5.
+## From SISO to MIMO.  From Convolution to Parallel Recurrence.
+<a name="fig_s4_block"></a>
+![](./figures/pngs/s4-matrix-blocks.png)
+_Figure 3: Our S5 layer uses a single, dense, multi-input, multi-output state space model as a layer in a deep sequence-to-sequence model._
+---
+todo
+---
+## S4 and Its Variants.
+Since publishing the original S4 model, the original authors have released three further papers studying the S4 model.  The most significant of those papers are S4D [Gu, 2022] and DSS [Gupta, 2022].  These papers explore using diagonal state spaces, similar to what we use.  S4D provided a proof as to why the (diagonalizable) normal matrix, from the normal-plus-low-rank factorization of the HiPPO-LegS matrix, provides such a good initialization for SISO systems.  We show (although it's really not that difficult!) that using this initialization in the MIMO case enjoys similar characteristics.  We note, however, that S4D and DSS provide computationally simpler implementations of S4, but do not perform quite as strongly.  Most importantly, though, S5 isn't the only simplification to S4.
+## Other Resources.
+- Much of our understanding and early code was based on the _excellent_ blog post, _The Annotated S4_, by [Rush and Karamcheti \[2021\]](https://srush.github.io/annotated-s4/).
+- Full code for the original S4 implementation, and many of its forerunners and derivatives, is available [here](https://github.com/HazyResearch/state-spaces).
+- Instructions for obtaining the LRA dataset are [here](https://openreview.net/pdf?id=qVyeW-grC2k).
+## Awesome Other Work.
+There are obviously many other great researchers working on adapting, extending, and understanding S4.  We outline some very recent work here:
+- Mega, by Ma et al [2022], combines linear state space layers with transformer heads for sequence modeling.  The main Mega method has \\(O(L^2)\\) complexity.  A second method, Mega-chunk, is presented that has \\(O(L)\\), but does not achieve the same performance as Mega.  Combining SSMs with transformer heads is a great avenue for future work.
+- Liquid-S4, by Hasani et al [2022], extends S4 by adding a dependence on the input signal into the state matrix.  When expanded, this is equivalent to adding cross-terms between the \\(k^{th}\\) input and all previous inputs.  Evaluating all previous terms is intractable, and so this sequence is often truncated.  Extending the linear SSM, such that it is conditionally linear, is a really exciting opportunity for making the model of linear state space layers more expressive.
+- ADD "what makes conv great" once it is de-anonymized.
+## Bibliography
+- Smith, Jimmy TH, Andrew Warrington, and Scott W. Linderman. "Simplified State Space Layers for Sequence Modeling." arXiv preprint arXiv:2208.04933 (2022).  [Link](https://arxiv.org/pdf/2208.04933.pdf).
+- Gu, Albert, Karan Goel, and Christopher Re. "Efficiently Modeling Long Sequences with Structured State Spaces." International Conference on Learning Representations (2021).  [Link](https://openreview.net/pdf?id=uYLFoz1vlAC).
+- Rush, Sasha, and Sidd Karamcheti. "The Annotated S4." Blog Track at ICLR 2022 (2022).  [Link](https://srush.github.io/annotated-s4/).
+- Yi Tay, et al. "Long Range Arena: A Benchmark for Efficient Transformers." International Conference on Learning Representations (2021).  [Link](https://openreview.net/pdf?id=qVyeW-grC2k).
+-  Ma, Xuezhe, et al. "Mega: Moving Average Equipped Gated Attention." arXiv preprint arXiv:2209.10655 (2022).  [Link](https://arxiv.org/pdf/2209.10655).
+- Hasani, Ramin, et al. "Liquid Structural State-Space Models." arXiv preprint arXiv:2209.12951 (2022).  [Link](https://web10.arxiv.org/pdf/2209.12951.pdf).
+- Gu S4d.

Essay_classifier/essays/dataset_dict.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"splits": ["train", "test"]}

Essay_classifier/essays/test/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf303ddc5053fcc1d3d8ad2e8a885dbf12d31960c1e097a4979980362c4c94ea
+size 470136

Essay_classifier/essays/test/dataset_info.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "__index_level_0__": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

Essay_classifier/essays/test/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "cf3f779c3519cf1d",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

Essay_classifier/essays/train/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a92f80195891f57e75536c3660c1a67e36e044a6079678742eaa8f7f72711411
+size 1943280

Essay_classifier/essays/train/dataset_info.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "text": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "label": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "__index_level_0__": {
+      "dtype": "int64",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

Essay_classifier/essays/train/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c8e09f6301a80e82",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}

Essay_classifier/requirements_cpu.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+flax==0.5.2
+torch==1.11.0
+torchtext==0.12.0
+tensorflow-datasets==4.5.2
+pydub==0.25.1
+datasets==2.4.0
+tqdm==4.62.3
+jaxlib==0.3.5
+jax==0.3.5

Essay_classifier/requirements_gpu.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+flax
+torch
+torchtext
+tensorflow-datasets==4.5.2
+pydub==0.25.1
+datasets
+tqdm
+--find-links https://storage.googleapis.com/jax-releases/jax_releases.html
+jax[cuda]>=version

Essay_classifier/run_train.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import argparse
+from s5.utils.util import str2bool
+from s5.train import train
+from s5.dataloading import Datasets
+if __name__ == "__main__":  # Script entry point: build the CLI below, parse it, and call train() with the parsed namespace.
+	parser = argparse.ArgumentParser()
+	parser.add_argument("--USE_WANDB", type=str2bool, default=False,  # str2bool comes from s5.utils.util; presumably parses "true"/"false"-style text -- confirm there
+						help="log with wandb?")
+	parser.add_argument("--wandb_project", type=str, default=None,
+						help="wandb project name")
+	parser.add_argument("--wandb_entity", type=str, default=None,
+						help="wandb entity name, e.g. username")
+	parser.add_argument("--dir_name", type=str, default='./cache_dir',
+						help="name of directory where data is cached")
+	parser.add_argument("--dataset", type=str, choices=Datasets.keys(),  # only names that are keys of Datasets (imported from s5.dataloading) are accepted
+						default='mnist-classification',
+						help="dataset name")
+	# Model Parameters: the options from --n_layers through --dt_max (depth, feature width, SSM state size and blocks, C initialization, discretization, pooling mode, activation, symmetry/eigenvalue constraints, directionality, timescale range)
+	parser.add_argument("--n_layers", type=int, default=6,
+						help="Number of layers in the network")
+	parser.add_argument("--d_model", type=int, default=128,
+						help="Number of features, i.e. H, "
+							 "dimension of layer inputs/outputs")
+	parser.add_argument("--ssm_size_base", type=int, default=256,
+						help="SSM Latent size, i.e. P")
+	parser.add_argument("--blocks", type=int, default=8,
+						help="How many blocks, J, to initialize with")
+	parser.add_argument("--C_init", type=str, default="trunc_standard_normal",  # NOTE: "\\" inside the help strings in this file is a literal backslash in the --help output, not a line break
+						choices=["trunc_standard_normal", "lecun_normal", "complex_normal"],
+						help="Options for initialization of C: \\"
+							 "trunc_standard_normal: sample from trunc. std. normal then multiply by V \\ " \
+							 "lecun_normal sample from lecun normal, then multiply by V\\ " \
+							 "complex_normal: sample directly from complex standard normal")
+	parser.add_argument("--discretization", type=str, default="zoh", choices=["zoh", "bilinear"])
+	parser.add_argument("--mode", type=str, default="pool", choices=["pool", "last"],
+						help="options: (for classification tasks) \\" \
+							 " pool: mean pooling \\" \
+							 "last: take last element")
+	parser.add_argument("--activation_fn", default="half_glu1", type=str,
+						choices=["full_glu", "half_glu1", "half_glu2", "gelu"])
+	parser.add_argument("--conj_sym", type=str2bool, default=True,
+						help="whether to enforce conjugate symmetry")
+	parser.add_argument("--clip_eigs", type=str2bool, default=False,
+						help="whether to enforce the left-half plane condition")
+	parser.add_argument("--bidirectional", type=str2bool, default=False,
+						help="whether to use bidirectional model")
+	parser.add_argument("--dt_min", type=float, default=0.001,
+						help="min value to sample initial timescale params from")
+	parser.add_argument("--dt_max", type=float, default=0.1,
+						help="max value to sample initial timescale params from")
+	# Optimization Parameters: the options from --prenorm through --jax_seed (normalization, batch size, epochs and early stopping, learning-rate settings and schedule, dropout, weight decay, optimizer configuration, random seed)
+	parser.add_argument("--prenorm", type=str2bool, default=True,
+						help="True: use prenorm, False: use postnorm")
+	parser.add_argument("--batchnorm", type=str2bool, default=True,
+						help="True: use batchnorm, False: use layernorm")
+	parser.add_argument("--bn_momentum", type=float, default=0.95,
+						help="batchnorm momentum")
+	parser.add_argument("--bsz", type=int, default=64,
+						help="batch size")
+	parser.add_argument("--epochs", type=int, default=100,
+						help="max number of epochs")
+	parser.add_argument("--early_stop_patience", type=int, default=1000,
+						help="number of epochs to continue training when val loss plateaus")
+	parser.add_argument("--ssm_lr_base", type=float, default=1e-3,
+						help="initial ssm learning rate")
+	parser.add_argument("--lr_factor", type=float, default=1,
+						help="global learning rate = lr_factor*ssm_lr_base")
+	parser.add_argument("--dt_global", type=str2bool, default=False,
+						help="Treat timescale parameter as global parameter or SSM parameter")
+	parser.add_argument("--lr_min", type=float, default=0,
+						help="minimum learning rate")
+	parser.add_argument("--cosine_anneal", type=str2bool, default=True,
+						help="whether to use cosine annealing schedule")
+	parser.add_argument("--warmup_end", type=int, default=1,
+						help="epoch to end linear warmup")
+	parser.add_argument("--lr_patience", type=int, default=1000000,
+						help="patience before decaying learning rate for lr_decay_on_val_plateau")
+	parser.add_argument("--reduce_factor", type=float, default=1.0,
+						help="factor to decay learning rate for lr_decay_on_val_plateau")
+	parser.add_argument("--p_dropout", type=float, default=0.0,
+						help="probability of dropout")
+	parser.add_argument("--weight_decay", type=float, default=0.05,
+						help="weight decay value")
+	parser.add_argument("--opt_config", type=str, default="standard", choices=['standard',
+																			   'BandCdecay',
+																			   'BfastandCdecay',
+																			   'noBCdecay'],
+						help="Opt configurations: \\ " \
+			   "standard:       no weight decay on B (ssm lr), weight decay on C (global lr) \\" \
+	  	       "BandCdecay:     weight decay on B (ssm lr), weight decay on C (global lr) \\" \
+	  	       "BfastandCdecay: weight decay on B (global lr), weight decay on C (global lr) \\" \
+	  	       "noBCdecay:      no weight decay on B (ssm lr), no weight decay on C (ssm lr) \\")
+	parser.add_argument("--jax_seed", type=int, default=1919,
+						help="seed randomness")
+	train(parser.parse_args())  # parse the command line and hand the resulting argparse.Namespace to train() (imported from s5.train)

Essay_classifier/s5/__init__.py ADDED Viewed

File without changes

Essay_classifier/s5/dataloaders/README.md ADDED Viewed

	@@ -0,0 +1,8 @@

+# Data & Dataloaders
+The scripts in this directory deal with downloading, preparing and caching datasets, as well as building dataloaders from (preferably) a cache
+or downloading the data directly.  The scripts in this directory are **HEAVILY** based on the scripts in the original S4 repository,
+but have been modified to remove them from the PyTorch Lightning ecosystem.
+These files were originally distributed under the Apache 2.0 license, (c) Albert Gu.  The original copyright therefore remains with the original
+authors, but we modify and distribute under the permissions of the license.  As under that license, no warranty is provided, no liability is accepted, and no trademark rights are granted.