Merge branch 'main' of https://github.com/borisdayma/dalle-mini into add-custom-model
- .github/workflows/check_size.yml +17 -0
- .github/workflows/style.yml +20 -0
- .github/workflows/sync_to_hub.yml +20 -0
- .github/workflows/sync_to_hub_debug.yml +17 -0
- .gitignore +4 -0
- CITATION.cff +44 -0
- LICENSE +201 -0
- Makefile +5 -0
- README.md +144 -30
- app/gradio/app_gradio.py +179 -0
- app/gradio/requirements.txt +4 -0
- app/streamlit/app.py +117 -0
- app/streamlit/img/loading.gif +0 -0
- dalle_mini/data.py +261 -0
- dalle_mini/dataset.py +0 -122
- dalle_mini/model.py +64 -0
- dalle_mini/text.py +258 -0
- dalle_mini/vqgan_jax/__init__.py +0 -0
- dalle_mini/vqgan_jax/configuration_vqgan.py +0 -40
- dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +0 -109
- dalle_mini/vqgan_jax/modeling_flax_vqgan.py +0 -609
- data/CC12M_downloader.py +0 -91
- data/CC3M_downloader.py +0 -62
- demo/CustomBARTv4b_model-generate.ipynb +0 -566
- demo/demo_notebook.ipynb +0 -583
- encoding/vqgan-jax-encoding-with-captions.ipynb +0 -363
- encoding/vqgan-jax-encoding-yfcc100m.ipynb +0 -1136
- encoding/vqgan-jax-encoding.ipynb +0 -0
- environment.yaml +0 -10
- img/logo.png +0 -0
- model/data-pipeline.ipynb +0 -385
- pyproject.toml +2 -0
- requirements.txt +0 -9
- seq2seq/do_big_run.sh +0 -16
- seq2seq/do_small_run.sh +0 -16
- seq2seq/requirements.txt +0 -8
- seq2seq/run_seq2seq_flax.py +0 -897
- setup.cfg +27 -0
- setup.py +4 -0
- tools/dataset/encode_dataset.ipynb +371 -0
- tools/inference/inference_pipeline.ipynb +0 -0
- tools/inference/log_inference_samples.ipynb +434 -0
- tools/inference/samples.txt +124 -0
- {seq2seq → tools/train}/sweep.yaml +34 -23
- tools/train/train.py +857 -0
.github/workflows/check_size.yml
ADDED
@@ -0,0 +1,17 @@
name: Check file size

on:
  pull_request:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Check large files
        uses: ActionsDesk/lfs-warning@v2.0
        with:
          filesizelimit: 10485760 # = 10MB, so we can sync to HF spaces
.github/workflows/style.yml
ADDED
@@ -0,0 +1,20 @@
name: Lint

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: psf/black@stable
      - uses: actions/setup-python@v2
        with:
          python-version: 3.9
      - name: Install requirements
        run: pip install ".[dev]"
      - uses: jamescurtin/isort-action@master
.github/workflows/sync_to_hub.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub

on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://boris:$HF_TOKEN@huggingface.co/spaces/flax-community/dalle-mini main
.github/workflows/sync_to_hub_debug.yml
ADDED
@@ -0,0 +1,17 @@
name: Deploy to debug app

on:
  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub-debug:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push --force https://boris:$HF_TOKEN@huggingface.co/spaces/flax-community/dalle-mini-debug +HEAD:main
.gitignore
CHANGED
@@ -1 +1,5 @@
 __pycache__
+.ipynb_checkpoints
+.streamlit
+wandb/
+*.egg-info/
CITATION.cff
ADDED
@@ -0,0 +1,44 @@
# YAML 1.2
---
abstract: "DALL·E mini is a JAX/Flax reimplementation of OpenAI's DALL·E that requires much smaller hardware resources. By simplifying the architecture and model memory requirements, as well as leveraging open-source code and pre-trained models, we were able to create a model that is 27 times smaller than the original DALL·E and train it on a single TPU v3-8 for only 3 days. DALL·E mini achieves impressive results, albeit of a lower quality than the original system. It can be used for exploration and further experimentation on commodity hardware."
authors:
  - family-names: Dayma
    given-names: Boris
  - family-names: Patil
    given-names: Suraj
  - family-names: Cuenca
    given-names: Pedro
  - family-names: Saifullah
    given-names: Khalid
  - family-names: Abraham
    given-names: Tanishq
  - family-names: "Lê Khắc"
    given-names: "Phúc"
  - family-names: Melas
    given-names: Luke
  - family-names: Ghosh
    given-names: Ritobrata
cff-version: "1.1.0"
date-released: 2021-07-29
identifiers:
keywords:
  - dalle
  - "text-to-image generation"
  - transformer
  - "zero-shot"
  - JAX
license: "Apache-2.0"
doi: 10.5281/zenodo.5146400
message: "If you use this project, please cite it using these metadata."
repository-code: "https://github.com/borisdayma/dalle-mini"
title: "DALL·E Mini"
version: "v0.1-alpha"
...
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

[Standard, unmodified Apache License 2.0 terms: Sections 1–9 covering definitions, the copyright and patent grants, redistribution conditions, submission of contributions, trademarks, the warranty disclaimer, limitation of liability, and accepting warranty or additional liability, followed by the appendix on applying the license.]

Copyright 2021 The DALL·E mini Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Makefile
ADDED
@@ -0,0 +1,5 @@
.PHONY: style

style:
	black .
	isort .
README.md
CHANGED
@@ -1,42 +1,156 @@

Removed (old draft notes; a few headings and one code snippet did not survive the page extraction and are marked with …):

…

* [Conceptual 12M](https://github.com/google-research-datasets/conceptual-12m) Dataset (already loaded and preprocessed in TPU VM by Luke).
* [YFCC100M Subset](https://github.com/openai/CLIP/blob/main/data/yfcc100m.md)
* [Conceptual Captions 3M](https://github.com/google-research-datasets/conceptual-captions)

…

* Use the Taming Transformers VQ-GAN (with 16384 tokens)
* Use a seq2seq (language encoder --> image decoder) model with a pretrained non-autoregressive encoder (e.g. BERT) and an autoregressive decoder (like GPT).

…

* Whether to freeze the text encoder?
* Whether to finetune the VQ-GAN?
* Which text encoder to use (e.g. BERT, RoBERTa, etc.)?
* Hyperparameter choices for the decoder (e.g. positional embedding, initialization, etc.)

…

* work on dataset loading - [see suggested datasets](https://discuss.huggingface.co/t/dall-e-mini-version/7324/4)
* Optionally create the OpenAI YFCC100M subset (see [this post](https://discuss.huggingface.co/t/dall-e-mini-version/7324/30?u=boris))
* work on text/image encoding
* concatenate inputs (not sure if we need fixed length for text or use a special token separating text & image)
* adapt training script
* create inference function
* integrate CLIP for better results (only if we have the time)
* work on a demo (streamlit or colab or maybe just HF widget)
* document (set up repo on model hub per instructions, start on README writeup…)
* help with coordinating activities & progress

…

You should create a new python virtual environment and install the project dependencies inside the virtual env. You need to use the `-f` (`--find-links`) option for `pip` to be able to find the appropriate `libtpu` required for the TPU hardware:

```
…
```

Added (new README):

---
title: DALL·E mini
emoji: 🥑
colorFrom: yellow
colorTo: green
sdk: streamlit
app_file: app/streamlit/app.py
pinned: True
---

# DALL·E Mini

[![Join us on Discord](https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white)](https://discord.gg/xBPBXfcFHd)

_Generate images from a text prompt_

<img src="img/logo.png" width="200">

Our logo was generated with DALL·E mini using the prompt "logo of an armchair in the shape of an avocado".

You can create your own pictures with [the demo](https://huggingface.co/spaces/flax-community/dalle-mini).

## How does it work?

Refer to [our report](https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini--Vmlldzo4NjIxODA).

## Development

### Dependencies Installation

For inference only, use `pip install git+https://github.com/borisdayma/dalle-mini.git`.

For development, clone the repo and use `pip install -e ".[dev]"`.

### Training of VQGAN

The VQGAN was trained using [taming-transformers](https://github.com/CompVis/taming-transformers).

We recommend using the latest version available.

### Conversion of VQGAN to JAX

Use [patil-suraj/vqgan-jax](https://github.com/patil-suraj/vqgan-jax).

### Training of Seq2Seq

Use [`tools/train/train.py`](tools/train/train.py).

You can also adjust the [sweep configuration file](https://docs.wandb.ai/guides/sweeps) if you need to perform a hyperparameter search.

### Inference Pipeline

To generate sample predictions and understand the inference pipeline step by step, refer to [`tools/inference/inference_pipeline.ipynb`](tools/inference/inference_pipeline.ipynb).

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/dalle-mini/blob/main/tools/inference/inference_pipeline.ipynb)

## FAQ

### Where to find the latest models?

Trained models are on 🤗 Model Hub:

- [VQGAN-f16-16384](https://huggingface.co/flax-community/vqgan_f16_16384) for encoding/decoding images
- [DALL·E mini](https://huggingface.co/flax-community/dalle-mini) for generating images from a text prompt

### Where does the logo come from?

The "armchair in the shape of an avocado" was used by OpenAI when releasing DALL·E to illustrate the model's capabilities. Having successful predictions on this prompt represents a big milestone for us.

## Authors & Contributors

### Main Authors

- [Boris Dayma](https://github.com/borisdayma)
- [Suraj Patil](https://github.com/patil-suraj)
- [Pedro Cuenca](https://github.com/pcuenca)

### Other members of the dalle-mini team during the FLAX/JAX community week

- [Khalid Saifullah](https://github.com/khalidsaifullaah)
- [Tanishq Abraham](https://github.com/tmabraham)
- [Phúc Lê Khắc](https://github.com/lkhphuc)
- [Luke Melas](https://github.com/lukemelas)
- [Ritobrata Ghosh](https://github.com/ghosh-r)

### Contributing

Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model on cool prompts!

## Acknowledgements

- 🤗 Hugging Face for organizing [the FLAX/JAX community week](https://github.com/huggingface/transformers/tree/master/examples/research_projects/jax-projects)
- Google [TPU Research Cloud (TRC) program](https://sites.research.google/trc/) for providing computing resources
- [Weights & Biases](https://wandb.com/) for providing the infrastructure for experiment tracking and model management

## Citing DALL·E mini

If you find DALL·E mini useful in your research or wish to refer to it, please use the following BibTeX entry.

```
@misc{Dayma_DALL·E_Mini_2021,
  author = {Dayma, Boris and Patil, Suraj and Cuenca, Pedro and Saifullah, Khalid and Abraham, Tanishq and Lê Khắc, Phúc and Melas, Luke and Ghosh, Ritobrata},
  doi = {10.5281/zenodo.5146400},
  month = {7},
  title = {DALL·E Mini},
  url = {https://github.com/borisdayma/dalle-mini},
  year = {2021}
}
```

## References

```
@misc{ramesh2021zeroshot,
  title={Zero-Shot Text-to-Image Generation},
  author={Aditya Ramesh and Mikhail Pavlov and Gabriel Goh and Scott Gray and Chelsea Voss and Alec Radford and Mark Chen and Ilya Sutskever},
  year={2021},
  eprint={2102.12092},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}
```

```
@misc{esser2021taming,
  title={Taming Transformers for High-Resolution Image Synthesis},
  author={Patrick Esser and Robin Rombach and Björn Ommer},
  year={2021},
  eprint={2012.09841},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}
```

```
@misc{lewis2019bart,
  title={BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
  author={Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Ves Stoyanov and Luke Zettlemoyer},
  year={2019},
  eprint={1910.13461},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```

```
@misc{radford2021learning,
  title={Learning Transferable Visual Models From Natural Language Supervision},
  author={Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
  year={2021},
  eprint={2103.00020},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}
```
app/gradio/app_gradio.py
ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/env python
# coding: utf-8

# Uncomment to run on cpu
# import os
# os.environ["JAX_PLATFORM_NAME"] = "cpu"

import random

import gradio as gr
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from PIL import Image, ImageDraw, ImageFont

# ## CLIP Scoring
from transformers import BartTokenizer, CLIPProcessor, FlaxCLIPModel
from vqgan_jax.modeling_flax_vqgan import VQModel

from dalle_mini.model import CustomFlaxBartForConditionalGeneration

DALLE_REPO = "flax-community/dalle-mini"
DALLE_COMMIT_ID = "4d34126d0df8bc4a692ae933e3b902a1fa8b6114"

VQGAN_REPO = "flax-community/vqgan_f16_16384"
VQGAN_COMMIT_ID = "90cc46addd2dd8f5be21586a9a23e1b95aa506a9"

tokenizer = BartTokenizer.from_pretrained(DALLE_REPO, revision=DALLE_COMMIT_ID)
model = CustomFlaxBartForConditionalGeneration.from_pretrained(
    DALLE_REPO, revision=DALLE_COMMIT_ID
)
vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)


def captioned_strip(images, caption=None, rows=1):
    increased_h = 0 if caption is None else 48
    w, h = images[0].size[0], images[0].size[1]
    img = Image.new("RGB", (len(images) * w // rows, h * rows + increased_h))
    for i, img_ in enumerate(images):
        img.paste(img_, (i // rows * w, increased_h + (i % rows) * h))

    if caption is not None:
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf", 40
        )
        draw.text((20, 3), caption, (255, 255, 255), font=font)
    return img


def custom_to_pil(x):
    x = np.clip(x, 0.0, 1.0)
    x = (255 * x).astype(np.uint8)
    x = Image.fromarray(x)
    if not x.mode == "RGB":
        x = x.convert("RGB")
    return x


def generate(input, rng, params):
    return model.generate(
        **input,
        max_length=257,
        num_beams=1,
        do_sample=True,
        prng_key=rng,
        eos_token_id=50000,
        pad_token_id=50000,
        params=params,
    )


def get_images(indices, params):
    return vqgan.decode_code(indices, params=params)


p_generate = jax.pmap(generate, "batch")
p_get_images = jax.pmap(get_images, "batch")

bart_params = replicate(model.params)
vqgan_params = replicate(vqgan.params)

clip = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
print("Initialize FlaxCLIPModel")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
print("Initialize CLIPProcessor")


def hallucinate(prompt, num_images=64):
    prompt = [prompt] * jax.device_count()
    inputs = tokenizer(
        prompt,
        return_tensors="jax",
        padding="max_length",
        truncation=True,
        max_length=128,
    ).data
    inputs = shard(inputs)

    all_images = []
    for i in range(num_images // jax.device_count()):
        key = random.randint(0, 1e7)
        rng = jax.random.PRNGKey(key)
        rngs = jax.random.split(rng, jax.local_device_count())
        indices = p_generate(inputs, rngs, bart_params).sequences
        indices = indices[:, :, 1:]

        images = p_get_images(indices, vqgan_params)
        images = np.squeeze(np.asarray(images), 1)
        for image in images:
            all_images.append(custom_to_pil(image))
    return all_images


def clip_top_k(prompt, images, k=8):
    inputs = processor(text=prompt, images=images, return_tensors="np", padding=True)
    outputs = clip(**inputs)
    logits = outputs.logits_per_text
    scores = np.array(logits[0]).argsort()[-k:][::-1]
    return [images[score] for score in scores]


def compose_predictions(images, caption=None):
    increased_h = 0 if caption is None else 48
    w, h = images[0].size[0], images[0].size[1]
    img = Image.new("RGB", (len(images) * w, h + increased_h))
    for i, img_ in enumerate(images):
        img.paste(img_, (i * w, increased_h))

    if caption is not None:
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf", 40
        )
        draw.text((20, 3), caption, (255, 255, 255), font=font)
    return img


def top_k_predictions(prompt, num_candidates=32, k=8):
    images = hallucinate(prompt, num_images=num_candidates)
    images = clip_top_k(prompt, images, k=k)
    return images


def run_inference(prompt, num_images=32, num_preds=8):
    images = top_k_predictions(prompt, num_candidates=num_images, k=num_preds)
    predictions = captioned_strip(images)
    output_title = f"""
    <b>{prompt}</b>
    """
    return (output_title, predictions)


outputs = [
    gr.outputs.HTML(label=""),  # To be used as title
    gr.outputs.Image(label=""),
]

description = """
DALL·E-mini is an AI model that generates images from any prompt you give! Generate images from text:
"""
gr.Interface(
    run_inference,
    inputs=[gr.inputs.Textbox(label="What do you want to see?")],
    outputs=outputs,
    title="DALL·E mini",
    description=description,
    article="<p style='text-align: center'> Created by Boris Dayma et al. 2021 | <a href='https://github.com/borisdayma/dalle-mini'>GitHub</a> | <a href='https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini--Vmlldzo4NjIxODA'>Report</a></p>",
    layout="vertical",
    theme="huggingface",
    examples=[
        ["an armchair in the shape of an avocado"],
        ["snowy mountains by the sea"],
    ],
    allow_flagging=False,
    live=False,
    # server_port=8999
).launch(share=True)
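For context on the CLIP re-ranking performed by `clip_top_k` above: CLIP scores every candidate image against the prompt (`logits_per_text`), and only the `k` best-scoring candidates are kept, best first. A minimal sketch of just that selection step, with made-up scores standing in for the CLIP logits:

```python
# Sketch of the top-k selection inside clip_top_k (scores are illustrative only).
import numpy as np

logits = np.array([0.2, 1.5, 0.7, 2.1])  # logits_per_text[0]: one score per image
k = 2
best = logits.argsort()[-k:][::-1]  # indices of the k highest scores, best first
print(best)  # [3 1]
```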
app/gradio/requirements.txt
ADDED
@@ -0,0 +1,4 @@
# Requirements for huggingface spaces
gradio>=2.2.3
flax
transformers
app/streamlit/app.py
ADDED
@@ -0,0 +1,117 @@
#!/usr/bin/env python
# coding: utf-8

import base64
from io import BytesIO

import requests
import streamlit as st
from PIL import Image


class ServiceError(Exception):
    def __init__(self, status_code):
        self.status_code = status_code


def get_images_from_backend(prompt, backend_url):
    r = requests.post(backend_url, json={"prompt": prompt})
    if r.status_code == 200:
        images = r.json()["images"]
        images = [Image.open(BytesIO(base64.b64decode(img))) for img in images]
        return images
    else:
        raise ServiceError(r.status_code)


st.sidebar.markdown(
    """
<style>
.aligncenter {
    text-align: center;
}
</style>
<p class="aligncenter">
    <img src="https://raw.githubusercontent.com/borisdayma/dalle-mini/main/img/logo.png"/>
</p>
""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """
___
<p style='text-align: center'>
DALL·E mini is an AI model that generates images from any prompt you give!
</p>

<p style='text-align: center'>
Created by Boris Dayma et al. 2021
<br/>
<a href="https://github.com/borisdayma/dalle-mini" target="_blank">GitHub</a> | <a href="https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini--Vmlldzo4NjIxODA" target="_blank">Project Report</a>
</p>
""",
    unsafe_allow_html=True,
)

st.header("DALL·E mini")
st.subheader("Generate images from text")

prompt = st.text_input("What do you want to see?")

DEBUG = False
if prompt != "":
    container = st.empty()
    container.markdown(
        f"""
        <style> p {{ margin:0 }} div {{ margin:0 }} </style>
        <div data-stale="false" class="element-container css-1e5imcs e1tzin5v1">
            <div class="stAlert">
                <div role="alert" data-baseweb="notification" class="st-ae st-af st-ag st-ah st-ai st-aj st-ak st-g3 st-am st-b8 st-ao st-ap st-aq st-ar st-as st-at st-au st-av st-aw st-ax st-ay st-az st-b9 st-b1 st-b2 st-b3 st-b4 st-b5 st-b6">
                    <div class="st-b7">
                        <div class="css-whx05o e13vu3m50">
                            <div data-testid="stMarkdownContainer" class="css-1ekf893 e16nr0p30">
                                <img src="https://raw.githubusercontent.com/borisdayma/dalle-mini/main/app/streamlit/img/loading.gif" width="30"/>
                                Generating predictions for: <b>{prompt}</b>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <small><i>Predictions may take up to 40s under high load. Please stand by.</i></small>
        """,
        unsafe_allow_html=True,
    )

    try:
        backend_url = st.secrets["BACKEND_SERVER"]
        print(f"Getting selections: {prompt}")
        selected = get_images_from_backend(prompt, backend_url)

        margin = 0.1  # for better position of zoom in arrow
        n_columns = 3
        cols = st.columns([1] + [margin, 1] * (n_columns - 1))
        for i, img in enumerate(selected):
            cols[(i % n_columns) * 2].image(img)
        container.markdown(f"**{prompt}**")

        st.button("Again!", key="again_button")

    except ServiceError as error:
        container.text(f"Service unavailable, status: {error.status_code}")
    except KeyError:
        if DEBUG:
            container.markdown(
                """
**Error: BACKEND_SERVER unset**

Please, create a file called `.streamlit/secrets.toml` inside the app's folder and include a line to configure the server URL:
```
BACKEND_SERVER="<server url>"
```
"""
            )
        else:
            container.markdown(
                "Error -5, please try again or [report it](mailto:pcuenca-dalle@guenever.net)."
            )
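The Streamlit app delegates generation to a separate backend: `get_images_from_backend` above POSTs `{"prompt": ...}` and expects a JSON body of the form `{"images": [...]}`, where each entry is a base64-encoded image. A minimal sketch of the server side of that contract follows; the helper name and response shape are inferred from the client code, not part of this commit:

```python
# Hypothetical server-side counterpart of get_images_from_backend: encode PIL
# images to base64 strings and wrap them in the JSON shape the client expects.
import base64
from io import BytesIO

from PIL import Image


def images_to_response(images):
    encoded = []
    for img in images:
        buf = BytesIO()
        img.save(buf, format="PNG")  # any format PIL.Image.open can read back
        encoded.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
    return {"images": encoded}  # client runs base64.b64decode on each entry
```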
app/streamlit/img/loading.gif
ADDED
dalle_mini/data.py
ADDED
@@ -0,0 +1,261 @@
from dataclasses import dataclass, field
from functools import partial

import jax
import jax.numpy as jnp
import numpy as np
from datasets import Dataset, load_dataset
from flax.training.common_utils import shard

from .text import TextNormalizer


@dataclass
class Dataset:  # note: shadows datasets.Dataset imported above
    dataset_repo_or_path: str
    train_file: str = None
    validation_file: str = None
    dataset_type: str = "dataset"
    streaming: bool = True
    use_auth_token: bool = False
    text_column: str = "caption"
    encoding_column: str = "encoding"
    max_source_length: int = 128
    max_train_samples: int = None
    max_eval_samples: int = None
    preprocessing_num_workers: int = None
    overwrite_cache: bool = False
    do_train: bool = False
    do_eval: bool = True
    seed_dataset: int = None
    train_dataset: Dataset = field(init=False)
    eval_dataset: Dataset = field(init=False)
    rng_dataset: jnp.ndarray = field(init=False)

    def __post_init__(self):
        # define data_files
        if self.train_file is not None or self.validation_file is not None:
            data_files = {
                "train": self.train_file,
                "validation": self.validation_file,
            }
        else:
            data_files = None

        # load dataset
        dataset = load_dataset(
            self.dataset_repo_or_path,
            data_files=data_files,
            streaming=self.streaming,
            use_auth_token=self.use_auth_token,
        )
        if self.do_train:
            if "train" not in dataset:
                raise ValueError("Training requires a training dataset")
            self.train_dataset = dataset["train"]
            if self.max_train_samples is not None:
                self.train_dataset = (
                    self.train_dataset.take(self.max_train_samples)
                    if self.streaming
                    else self.train_dataset.select(range(self.max_train_samples))
                )
        if self.do_eval:
            if "validation" not in dataset:
                raise ValueError("Evaluating requires a validation dataset")
            self.eval_dataset = dataset["validation"]
            if self.max_eval_samples is not None:
                self.eval_dataset = (
                    self.eval_dataset.take(self.max_eval_samples)
                    if self.streaming
                    else self.eval_dataset.select(range(self.max_eval_samples))
                )

    def preprocess(self, tokenizer, decoder_start_token_id, normalize_text):
        if self.streaming:
            # we need to shuffle early in streaming mode
            if hasattr(self, "train_dataset"):
                self.train_dataset = self.train_dataset.shuffle(1000, self.seed_dataset)
        else:
            # prepare rng for later shuffling
            if self.seed_dataset is None:
                self.seed_dataset = np.random.get_state()[1][0]
            self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)

        # normalize text
        if normalize_text:
            text_normalizer = TextNormalizer()
            partial_normalize_function = partial(
                normalize_function,
                text_column=self.text_column,
                text_normalizer=text_normalizer,
            )
            for ds in ["train_dataset", "eval_dataset"]:
                if hasattr(self, ds):
                    setattr(
                        self,
                        ds,
                        (
                            getattr(self, ds).map(partial_normalize_function)
                            if self.streaming
                            else getattr(self, ds).map(
                                partial_normalize_function,
                                num_proc=self.preprocessing_num_workers,
                                load_from_cache_file=not self.overwrite_cache,
                                desc="Normalizing datasets",
                            )
                        ),
                    )

        # preprocess
        partial_preprocess_function = partial(
            preprocess_function,
            tokenizer=tokenizer,
            text_column=self.text_column,
            encoding_column=self.encoding_column,
            max_source_length=self.max_source_length,
            decoder_start_token_id=decoder_start_token_id,
        )
        for ds in ["train_dataset", "eval_dataset"]:
            if hasattr(self, ds):
                setattr(
                    self,
                    ds,
                    (
                        getattr(self, ds).map(
                            partial_preprocess_function,
                            batched=True,
                        )
                        if self.streaming
                        else getattr(self, ds).map(
                            partial_preprocess_function,
                            batched=True,
                            remove_columns=getattr(self, ds).column_names,
                            num_proc=self.preprocessing_num_workers,
                            load_from_cache_file=not self.overwrite_cache,
                            desc="Preprocessing datasets",
                        )
                    ),
                )

    def dataloader(self, split, batch_size, epoch=None):
        def _dataloader_datasets_non_streaming(
            dataset: Dataset,
            batch_size: int,
            rng: jax.random.PRNGKey = None,
        ):
            """
            Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
            Batches are shuffled when an `rng` key is provided.
            """
            steps_per_epoch = len(dataset) // batch_size

            if rng is not None:
                batch_idx = jax.random.permutation(rng, len(dataset))
            else:
                batch_idx = jnp.arange(len(dataset))

            batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
            batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))

            for idx in batch_idx:
                batch = dataset[idx]
                batch = {k: jnp.array(v) for k, v in batch.items()}
                batch = shard(batch)
                yield batch

        def _dataloader_datasets_streaming(dataset: Dataset, batch_size: int):
            keys = ["input_ids", "attention_mask", "labels", "decoder_input_ids"]
            batch = {k: [] for k in keys}
            for item in dataset:
                for k, v in item.items():
                    batch[k].append(v)
                if len(batch[keys[0]]) == batch_size:
                    batch = {k: jnp.array(v) for k, v in batch.items()}
                    batch = shard(batch)
                    yield batch
                    batch = {k: [] for k in keys}

        if split == "train":
            ds = self.train_dataset
        elif split == "eval":
            ds = self.eval_dataset
        else:
            raise ValueError(f'split must be "train" or "eval", got {split}')

        if self.streaming:
            if split == "train":
                ds.set_epoch(epoch)
            return _dataloader_datasets_streaming(ds, batch_size)
        else:
            input_rng = None  # no shuffling for eval
            if split == "train":
                self.rng_dataset, input_rng = jax.random.split(self.rng_dataset)
            return _dataloader_datasets_non_streaming(ds, batch_size, input_rng)

    @property
    def length(self):
        len_train_dataset, len_eval_dataset = None, None
        if self.streaming:
            # we don't know the length, let's just assume max_samples if defined
            if self.max_train_samples is not None:
                len_train_dataset = self.max_train_samples
            if self.max_eval_samples is not None:
                len_eval_dataset = self.max_eval_samples
        else:
            len_train_dataset = (
                len(self.train_dataset) if hasattr(self, "train_dataset") else None
            )
            len_eval_dataset = (
                len(self.eval_dataset) if hasattr(self, "eval_dataset") else None
            )
        return len_train_dataset, len_eval_dataset


def shift_tokens_right(input_ids: np.array, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = np.zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1]
    shifted_input_ids[:, 0] = decoder_start_token_id
    return shifted_input_ids


def normalize_function(example, text_column, text_normalizer):
    example[text_column] = text_normalizer(example[text_column])
    return example


def preprocess_function(
    examples,
    tokenizer,
    text_column,
    encoding_column,
    max_source_length,
    decoder_start_token_id,
):
    inputs = examples[text_column]
    # Setting padding="max_length" as we need fixed length inputs for jitted functions
    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )

    # set up targets
    # Note: labels correspond to our target indices
    # decoder input ids are the same but shifted to the right with bos at the beginning (and without last token)
    labels = examples[encoding_column]
    labels = np.asarray(labels)

    # We need the labels, in addition to the decoder_input_ids, for the compute_loss function
    model_inputs["labels"] = labels

    # In our case, this prepends the bos token and removes the last one
    decoder_input_ids = shift_tokens_right(labels, decoder_start_token_id)
    model_inputs["decoder_input_ids"] = decoder_input_ids

    return model_inputs
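A quick worked example of `shift_tokens_right` above: decoder inputs are the target indices shifted one position right, with the decoder-start (BOS) token prepended and the last token dropped. The token values are illustrative, and the BOS id of 16384 merely mirrors the default `image_vocab_size` in `dalle_mini/model.py`; it is an assumption for illustration, not a value fixed by this file.

```python
import numpy as np

from dalle_mini.data import shift_tokens_right

labels = np.array([[11, 22, 33, 44]])  # target image-token indices
decoder_input_ids = shift_tokens_right(labels, decoder_start_token_id=16384)
# [[16384. 11. 22. 33.]] -- float dtype because np.zeros defaults to float64
print(decoder_input_ids)
```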
dalle_mini/dataset.py
DELETED
@@ -1,122 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
An image-caption dataset dataloader.
|
3 |
-
Luke Melas-Kyriazi, 2021
|
4 |
-
"""
|
5 |
-
import warnings
|
6 |
-
from typing import Optional, Callable
|
7 |
-
from pathlib import Path
|
8 |
-
import numpy as np
|
9 |
-
import torch
|
10 |
-
import pandas as pd
|
11 |
-
from torch.utils.data import Dataset
|
12 |
-
from torchvision.datasets.folder import default_loader
|
13 |
-
from PIL import ImageFile
|
14 |
-
from PIL.Image import DecompressionBombWarning
|
15 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
16 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
17 |
-
warnings.filterwarnings("ignore", category=DecompressionBombWarning)
|
18 |
-
|
19 |
-
|
20 |
-
class CaptionDataset(Dataset):
|
21 |
-
"""
|
22 |
-
A PyTorch Dataset class for (image, texts) tasks. Note that this dataset
|
23 |
-
returns the raw text rather than tokens. This is done on purpose, because
|
24 |
-
it's easy to tokenize a batch of text after loading it from this dataset.
|
25 |
-
"""
|
26 |
-
|
27 |
-
def __init__(self, *, images_root: str, captions_path: str, text_transform: Optional[Callable] = None,
|
28 |
-
image_transform: Optional[Callable] = None, image_transform_type: str = 'torchvision',
|
29 |
-
include_captions: bool = True):
|
30 |
-
"""
|
31 |
-
:param images_root: folder where images are stored
|
32 |
-
:param captions_path: path to csv that maps image filenames to captions
|
33 |
-
:param image_transform: image transform pipeline
|
34 |
-
:param text_transform: image transform pipeline
|
35 |
-
:param image_transform_type: image transform type, either `torchvision` or `albumentations`
|
36 |
-
-        :param include_captions: Returns a dictionary with `image`, `text` if `true`; otherwise returns just the images.
-        """
-
-        # Base path for images
-        self.images_root = Path(images_root)
-
-        # Load captions as DataFrame
-        self.captions = pd.read_csv(captions_path, delimiter='\t', header=0)
-        self.captions['image_file'] = self.captions['image_file'].astype(str)
-
-        # PyTorch transformation pipeline for the image (normalizing, etc.)
-        self.text_transform = text_transform
-        self.image_transform = image_transform
-        self.image_transform_type = image_transform_type.lower()
-        assert self.image_transform_type in ['torchvision', 'albumentations']
-
-        # Total number of datapoints
-        self.size = len(self.captions)
-
-        # Return image+captions or just images
-        self.include_captions = include_captions
-
-    def verify_that_all_images_exist(self):
-        for image_file in self.captions['image_file']:
-            p = self.images_root / image_file
-            if not p.is_file():
-                print(f'file does not exist: {p}')
-
-    def _get_raw_image(self, i):
-        image_file = self.captions.iloc[i]['image_file']
-        image_path = self.images_root / image_file
-        image = default_loader(image_path)
-        return image
-
-    def _get_raw_text(self, i):
-        return self.captions.iloc[i]['caption']
-
-    def __getitem__(self, i):
-        image = self._get_raw_image(i)
-        caption = self._get_raw_text(i)
-        if self.image_transform is not None:
-            if self.image_transform_type == 'torchvision':
-                image = self.image_transform(image)
-            elif self.image_transform_type == 'albumentations':
-                image = self.image_transform(image=np.array(image))['image']
-            else:
-                raise NotImplementedError(f"{self.image_transform_type=}")
-        return {'image': image, 'text': caption} if self.include_captions else image
-
-    def __len__(self):
-        return self.size
-
-
-if __name__ == "__main__":
-    import albumentations as A
-    from albumentations.pytorch import ToTensorV2
-    from transformers import AutoTokenizer
-
-    # Paths
-    images_root = './images'
-    captions_path = './images-list-clean.tsv'
-
-    # Create transforms
-    tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
-    def tokenize(text):
-        return tokenizer(text, max_length=32, truncation=True, return_tensors='pt', padding='max_length')
-    image_transform = A.Compose([
-        A.Resize(256, 256), A.CenterCrop(256, 256),
-        A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ToTensorV2()])
-
-    # Create dataset
-    dataset = CaptionDataset(
-        images_root=images_root,
-        captions_path=captions_path,
-        image_transform=image_transform,
-        text_transform=tokenize,
-        image_transform_type='albumentations')
-
-    # Create dataloader
-    dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
-    batch = next(iter(dataloader))
-    print({k: (v.shape if isinstance(v, torch.Tensor) else v) for k, v in batch.items()})
-
-    # # (Optional) Check that all the images exist
-    # dataset = CaptionDataset(images_root=images_root, captions_path=captions_path)
-    # dataset.verify_that_all_images_exist()
-    # print('Done')
dalle_mini/model.py
ADDED
@@ -0,0 +1,64 @@
+import flax.linen as nn
+import jax
+from transformers import BartConfig
+from transformers.models.bart.modeling_flax_bart import (
+    FlaxBartDecoder,
+    FlaxBartEncoder,
+    FlaxBartForConditionalGeneration,
+    FlaxBartForConditionalGenerationModule,
+    FlaxBartModule,
+)
+
+
+class CustomFlaxBartModule(FlaxBartModule):
+    def setup(self):
+        # we keep shared to easily load pre-trained weights
+        self.shared = nn.Embed(
+            self.config.vocab_size,
+            self.config.d_model,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        # a separate embedding is used for the decoder
+        self.decoder_embed = nn.Embed(
+            self.config.image_vocab_size + 1,
+            self.config.d_model,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.encoder = FlaxBartEncoder(
+            self.config, dtype=self.dtype, embed_tokens=self.shared
+        )
+
+        # the decoder has a different config
+        # TODO: should not be needed once we have custom config/module
+        decoder_config = BartConfig(self.config.to_dict())
+        decoder_config.max_position_embeddings = (
+            self.config.image_length + 1  # image tokens + BOS
+        )
+        decoder_config.vocab_size = self.config.image_vocab_size + 1
+        self.decoder = FlaxBartDecoder(
+            decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed
+        )
+
+
+class CustomFlaxBartForConditionalGenerationModule(
+    FlaxBartForConditionalGenerationModule
+):
+    def setup(self):
+        # set default config
+        self.config.normalize_text = getattr(self.config, "normalize_text", False)
+        self.config.image_length = getattr(self.config, "image_length", 256)
+        self.config.image_vocab_size = getattr(self.config, "image_vocab_size", 16384)
+
+        self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.image_vocab_size + 1,  # encoded image token space + 1 for bos
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.final_logits_bias = self.param(
+            "final_logits_bias", self.bias_init, (1, self.config.image_vocab_size + 1)
+        )
+
+
+class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
+    module_class = CustomFlaxBartForConditionalGenerationModule
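For context, a minimal usage sketch of the class added above (not part of the diff; the checkpoint id is an assumption, to be replaced by a real trained dalle-mini checkpoint):

    from dalle_mini.model import CustomFlaxBartForConditionalGeneration

    # hypothetical checkpoint id
    model = CustomFlaxBartForConditionalGeneration.from_pretrained("flax-community/dalle-mini")
    # the decoder embedding and lm_head cover image_vocab_size + 1 logits
    # (16384 VQGAN codes plus one BOS token by default)
    print(model.config.image_vocab_size, model.config.image_length)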
dalle_mini/text.py
ADDED
@@ -0,0 +1,258 @@
+"""
+Utilities for processing text.
+"""
+
+import html
+import math
+import random
+import re
+from pathlib import Path
+
+import ftfy
+from huggingface_hub import hf_hub_download
+from unidecode import unidecode
+
+# based on wiki word occurrence
+person_token = [("a person", 282265), ("someone", 121194), ("somebody", 12219)]
+temp_token = "xtokx"  # avoid repeating chars
+
+
+class HashtagProcessor:
+    # Adapted from wordninja library
+    # We use our wikipedia word count + a good heuristic to make it work
+    def __init__(self):
+        wiki_word_frequency = hf_hub_download(
+            "dalle-mini/dalle-mini", filename="enwiki-words-frequency.txt"
+        )
+        self._word_cost = (
+            l.split()[0] for l in Path(wiki_word_frequency).read_text().splitlines()
+        )
+        self._word_cost = {
+            str(k): math.log(float(i + 1)) for i, k in enumerate(self._word_cost)
+        }
+        self._max_word = max(len(x) for x in self._word_cost.keys())
+        self._SPLIT_RE = re.compile("[^a-zA-Z0-9']+")
+
+    def __call__(self, s):
+        """Uses dynamic programming to infer the location of spaces in a string without spaces."""
+        l = [self._split(x) for x in self._SPLIT_RE.split(s)]
+        return " ".join([item for sublist in l for item in sublist])
+
+    def _split(self, s):
+        # Find the best match for the i first characters, assuming cost has
+        # been built for the i-1 first characters.
+        # Returns a pair (match_cost, match_length).
+        def best_match(i):
+            candidates = enumerate(reversed(cost[max(0, i - self._max_word) : i]))
+            return min(
+                (c + self._word_cost.get(s[i - k - 1 : i].lower(), 9e999), k + 1)
+                for k, c in candidates
+            )
+
+        # Build the cost array
+        cost = [0]
+        for i in range(1, len(s) + 1):
+            c, k = best_match(i)
+            cost.append(c)
+
+        # Backtrack to recover the minimal-cost string.
+        out = []
+        i = len(s)
+        while i > 0:
+            c, k = best_match(i)
+            assert c == cost[i]
+            newToken = True
+            if not s[i - k : i] == "'":  # ignore a lone apostrophe
+                if len(out) > 0:
+                    # re-attach split 's and split digits
+                    if out[-1] == "'s" or (
+                        s[i - 1].isdigit() and out[-1][0].isdigit()
+                    ):  # digit followed by digit
+                        out[-1] = (
+                            s[i - k : i] + out[-1]
+                        )  # combine current token with previous token
+                        newToken = False
+
+            if newToken:
+                out.append(s[i - k : i])
+
+            i -= k
+
+        return reversed(out)
+
+
+def replace_person_token(t):
+    "Used for CC12M"
+    t = re.sub("<person>([,\s]*(and)*[,\s]*<person>)+", " people ", t)
+    while "<person>" in t:
+        t = t.replace(
+            "<person>", f" {random.choices(*tuple(zip(*person_token)))[0]} ", 1
+        )
+    return t
+
+
+def fix_html(t):
+    # from OpenAI CLIP
+    return html.unescape(html.unescape(t))
+
+
+def replace_punctuation_with_commas(t):
+    return re.sub("[()[\].,|:;?!=+~\-\/{}]", ",", t)
+
+
+def simplify_quotes(t):
+    return re.sub("""['"`]""", ' " ', t)
+
+
+def merge_quotes(t):
+    return re.sub('(\s*"+\s*)+', ' " ', t)
+
+
+def remove_comma_numbers(t):
+    def _f(t):
+        return re.sub("(\d),(\d{3})", r"\1\2", t)
+
+    return _f(_f(t))
+
+
+def pre_process_dot_numbers(t):
+    return re.sub("(\w)\.(\w)", fr"\1{temp_token}dot{temp_token}\2", t)
+
+
+def post_process_dot_numbers(t):
+    return re.sub(f"{temp_token}dot{temp_token}", ".", t)
+
+
+def pre_process_quotes(t):
+    # allows quotes only for 's, 't, 'd, 'm, 'll, 're, 've
+    return re.sub(
+        r"'(?=([stdm]|(ll)|(re)|(ve)|(ll))\b)", fr"{temp_token}quote{temp_token}", t
+    )
+
+
+def post_process_quotes(t):
+    return re.sub(f"{temp_token}quote{temp_token}", "'", t)
+
+
+def pre_process_dates(t):
+    return re.sub("(\d)/(\d)", fr"\1{temp_token}slash{temp_token}\2", t)
+
+
+def post_process_dates(t):
+    return re.sub(f"{temp_token}slash{temp_token}", "/", t)
+
+
+def merge_commas(t):
+    return re.sub("(\s*,+\s*)+", ", ", t)
+
+
+def add_space_after_commas(t):
+    return re.sub(",", ", ", t)
+
+
+def handle_special_chars(t):
+    "Handle special characters"
+    # replace "-" with a space when between words without space
+    t = re.sub("(\w)-(\w)", r"\1 \2", t)
+    # always add space around some characters
+    return re.sub("([%&\/$*])", r" \1 ", t)
+
+
+def expand_hashtags(t, hashtag_processor):
+    "Remove # and try to split words"
+    return re.sub("#(\w+)", lambda m: hashtag_processor(m.group(1)), t)
+
+
+_re_ignore_chars = r"[_#\\]"
+
+
+def ignore_chars(t):
+    "Ignore useless characters"
+    return re.sub(_re_ignore_chars, " ", t)
+
+
+def remove_extra_spaces(t):
+    "Remove extra spaces (including \t and \n)"
+    return re.sub("\s+", " ", t)
+
+
+def remove_repeating_chars(t):
+    "If the same character is present 4+ times (not 3 because of roman 'VIII'), replace with single instance"
+    return re.sub(r"(\D)(\1{3,})", r"\1", t)
+
+
+def remove_urls(t):
+    return re.sub(r"http\S+", "", t)
+
+
+def remove_html_tags(t):
+    return re.sub("<[^<]+?>", "", t)
+
+
+def remove_first_last_commas(t):
+    t = t.strip()
+    t = t[:-1] if t and t[-1] == "," else t
+    t = t[1:] if t and t[0] == "," else t
+    return t.strip()
+
+
+def remove_wiki_ref(t):
+    t = re.sub(r"\A\s*\[\d+\]", "", t)
+    return re.sub(r"\[\d+\]\s*\Z", "", t)
+
+
+class TextNormalizer:
+    "Normalize text"
+
+    def __init__(self):
+        self._hashtag_processor = HashtagProcessor()
+
+    def __call__(self, t):
+        # fix some characters
+        t = ftfy.fix_text(t)
+        # fix html
+        t = fix_html(t)
+        # decode and simplify text: see unidecode library
+        t = unidecode(t)
+        # lower case
+        t = t.lower()
+        # replace <PERSON> (for CC12M)
+        t = replace_person_token(t)
+        # remove wiki reference (for WIT)
+        t = remove_wiki_ref(t)
+        # remove html tags
+        t = remove_html_tags(t)
+        # remove urls
+        t = remove_urls(t)
+        # remove commas in numbers
+        t = remove_comma_numbers(t)
+        # handle dots in numbers and quotes - Part 1
+        t = pre_process_dot_numbers(t)
+        t = pre_process_quotes(t)
+        t = pre_process_dates(t)
+        # handle special characters
+        t = handle_special_chars(t)
+        # handle hashtags
+        t = expand_hashtags(t, self._hashtag_processor)
+        # ignore useless characters
+        t = ignore_chars(t)
+        # simplify quotes
+        t = simplify_quotes(t)
+        # all punctuation becomes commas
+        t = replace_punctuation_with_commas(t)
+        # handle dots in numbers and quotes - Part 2
+        t = post_process_dot_numbers(t)
+        t = post_process_quotes(t)
+        t = post_process_dates(t)
+        # handle repeating characters
+        t = remove_repeating_chars(t)
+        # merge quotes
+        t = merge_quotes(t)
+        # merge commas
+        t = merge_commas(t)
+        # remove multiple spaces
+        t = remove_extra_spaces(t)
+        # remove first and last comma
+        t = remove_first_last_commas(t)
+        # always start with a space
+        return f" {t}"
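A minimal usage sketch of the normalizer defined above (not part of the diff). HashtagProcessor fetches enwiki-words-frequency.txt from the dalle-mini/dalle-mini hub repo on first use, so this requires network access:

    from dalle_mini.text import TextNormalizer

    normalizer = TextNormalizer()
    print(normalizer("Check out #NewYorkCity &amp; the <b>Statue of Liberty</b>!"))
    # expected output, roughly: " check out new york city & the statue of liberty"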
dalle_mini/vqgan_jax/__init__.py
DELETED
File without changes
dalle_mini/vqgan_jax/configuration_vqgan.py
DELETED
@@ -1,40 +0,0 @@
-from typing import Tuple
-
-from transformers import PretrainedConfig
-
-
-class VQGANConfig(PretrainedConfig):
-    def __init__(
-        self,
-        ch: int = 128,
-        out_ch: int = 3,
-        in_channels: int = 3,
-        num_res_blocks: int = 2,
-        resolution: int = 256,
-        z_channels: int = 256,
-        ch_mult: Tuple = (1, 1, 2, 2, 4),
-        attn_resolutions: int = (16,),
-        n_embed: int = 1024,
-        embed_dim: int = 256,
-        dropout: float = 0.0,
-        double_z: bool = False,
-        resamp_with_conv: bool = True,
-        give_pre_end: bool = False,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.ch = ch
-        self.out_ch = out_ch
-        self.in_channels = in_channels
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.z_channels = z_channels
-        self.ch_mult = list(ch_mult)
-        self.attn_resolutions = list(attn_resolutions)
-        self.n_embed = n_embed
-        self.embed_dim = embed_dim
-        self.dropout = dropout
-        self.double_z = double_z
-        self.resamp_with_conv = resamp_with_conv
-        self.give_pre_end = give_pre_end
-        self.num_resolutions = len(ch_mult)
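For reference, a sketch of instantiating this config (not part of the diff). The defaults correspond to a 256x256, 16x-downsampling VQGAN; overriding n_embed to 16384 is an assumption that would match the image_vocab_size used in dalle_mini/model.py:

    config = VQGANConfig(n_embed=16384)
    # 5 ch_mult levels -> 4 downsampling steps -> 16x16 latent grid for 256x256 inputs
    assert config.num_resolutions == 5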
dalle_mini/vqgan_jax/convert_pt_model_to_jax.py
DELETED
@@ -1,109 +0,0 @@
-import re
-
-import jax.numpy as jnp
-from flax.traverse_util import flatten_dict, unflatten_dict
-
-import torch
-
-from modeling_flax_vqgan import VQModel
-from configuration_vqgan import VQGANConfig
-
-
-regex = r"\w+[.]\d+"
-
-
-def rename_key(key):
-    pats = re.findall(regex, key)
-    for pat in pats:
-        key = key.replace(pat, "_".join(pat.split(".")))
-    return key
-
-
-# Adapted from https://github.com/huggingface/transformers/blob/ff5cdc086be1e0c3e2bbad8e3469b34cffb55a85/src/transformers/modeling_flax_pytorch_utils.py#L61
-def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model):
-    # convert pytorch tensor to numpy
-    pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
-
-    random_flax_state_dict = flatten_dict(flax_model.params)
-    flax_state_dict = {}
-
-    remove_base_model_prefix = (flax_model.base_model_prefix not in flax_model.params) and (
-        flax_model.base_model_prefix in set([k.split(".")[0] for k in pt_state_dict.keys()])
-    )
-    add_base_model_prefix = (flax_model.base_model_prefix in flax_model.params) and (
-        flax_model.base_model_prefix not in set([k.split(".")[0] for k in pt_state_dict.keys()])
-    )
-
-    # Need to change some parameters name to match Flax names so that we don't have to fork any layer
-    for pt_key, pt_tensor in pt_state_dict.items():
-        pt_tuple_key = tuple(pt_key.split("."))
-
-        has_base_model_prefix = pt_tuple_key[0] == flax_model.base_model_prefix
-        require_base_model_prefix = (flax_model.base_model_prefix,) + pt_tuple_key in random_flax_state_dict
-
-        if remove_base_model_prefix and has_base_model_prefix:
-            pt_tuple_key = pt_tuple_key[1:]
-        elif add_base_model_prefix and require_base_model_prefix:
-            pt_tuple_key = (flax_model.base_model_prefix,) + pt_tuple_key
-
-        # Correctly rename weight parameters
-        if (
-            "norm" in pt_key
-            and (pt_tuple_key[-1] == "bias")
-            and (pt_tuple_key[:-1] + ("bias",) in random_flax_state_dict)
-        ):
-            pt_tensor = pt_tensor[None, None, None, :]
-        elif (
-            "norm" in pt_key
-            and (pt_tuple_key[-1] == "bias")
-            and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict)
-        ):
-            pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
-            pt_tensor = pt_tensor[None, None, None, :]
-        elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict:
-            pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
-            pt_tensor = pt_tensor[None, None, None, :]
-        if pt_tuple_key[-1] == "weight" and pt_tuple_key[:-1] + ("embedding",) in random_flax_state_dict:
-            pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
-        elif pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4 and pt_tuple_key not in random_flax_state_dict:
-            # conv layer
-            pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
-            pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
-        elif pt_tuple_key[-1] == "weight" and pt_tuple_key not in random_flax_state_dict:
-            # linear layer
-            pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
-            pt_tensor = pt_tensor.T
-        elif pt_tuple_key[-1] == "gamma":
-            pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
-        elif pt_tuple_key[-1] == "beta":
-            pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
-
-        if pt_tuple_key in random_flax_state_dict:
-            if pt_tensor.shape != random_flax_state_dict[pt_tuple_key].shape:
-                raise ValueError(
-                    f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
-                    f"{random_flax_state_dict[pt_tuple_key].shape}, but is {pt_tensor.shape}."
-                )
-
-        # also add unexpected weight so that warning is thrown
-        flax_state_dict[pt_tuple_key] = jnp.asarray(pt_tensor)
-
-    return unflatten_dict(flax_state_dict)
-
-
-def convert_model(config_path, pt_state_dict_path, save_path):
-    config = VQGANConfig.from_pretrained(config_path)
-    model = VQModel(config)
-
-    state_dict = torch.load(pt_state_dict_path, map_location="cpu")["state_dict"]
-    keys = list(state_dict.keys())
-    for key in keys:
-        if key.startswith("loss"):
-            state_dict.pop(key)
-            continue
-        renamed_key = rename_key(key)
-        state_dict[renamed_key] = state_dict.pop(key)
-
-    state = convert_pytorch_state_dict_to_flax(state_dict, model)
-    model.params = unflatten_dict(state)
-    model.save_pretrained(save_path)
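A hypothetical invocation of the conversion entry point above (the paths are placeholders, not part of the diff):

    convert_model(
        config_path="./vqgan_imagenet_f16_16384",  # directory containing config.json
        pt_state_dict_path="./vqgan.ckpt",         # taming-transformers PyTorch checkpoint
        save_path="./vqgan_flax",                  # where the Flax weights are written
    )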
dalle_mini/vqgan_jax/modeling_flax_vqgan.py
DELETED
@@ -1,609 +0,0 @@
-# JAX implementation of VQGAN from taming-transformers https://github.com/CompVis/taming-transformers
-
-from functools import partial
-from typing import Tuple
-import math
-
-import jax
-import jax.numpy as jnp
-import numpy as np
-import flax.linen as nn
-from flax.core.frozen_dict import FrozenDict
-
-from transformers.modeling_flax_utils import FlaxPreTrainedModel
-
-from .configuration_vqgan import VQGANConfig
-
-
-class Upsample(nn.Module):
-    in_channels: int
-    with_conv: bool
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        if self.with_conv:
-            self.conv = nn.Conv(
-                self.in_channels,
-                kernel_size=(3, 3),
-                strides=(1, 1),
-                padding=((1, 1), (1, 1)),
-                dtype=self.dtype,
-            )
-
-    def __call__(self, hidden_states):
-        batch, height, width, channels = hidden_states.shape
-        hidden_states = jax.image.resize(
-            hidden_states,
-            shape=(batch, height * 2, width * 2, channels),
-            method="nearest",
-        )
-        if self.with_conv:
-            hidden_states = self.conv(hidden_states)
-        return hidden_states
-
-
-class Downsample(nn.Module):
-    in_channels: int
-    with_conv: bool
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        if self.with_conv:
-            self.conv = nn.Conv(
-                self.in_channels,
-                kernel_size=(3, 3),
-                strides=(2, 2),
-                padding="VALID",
-                dtype=self.dtype,
-            )
-
-    def __call__(self, hidden_states):
-        if self.with_conv:
-            pad = ((0, 0), (0, 1), (0, 1), (0, 0))  # pad height and width dim
-            hidden_states = jnp.pad(hidden_states, pad_width=pad)
-            hidden_states = self.conv(hidden_states)
-        else:
-            hidden_states = nn.avg_pool(hidden_states, window_shape=(2, 2), strides=(2, 2), padding="VALID")
-        return hidden_states
-
-
-class ResnetBlock(nn.Module):
-    in_channels: int
-    out_channels: int = None
-    use_conv_shortcut: bool = False
-    temb_channels: int = 512
-    dropout_prob: float = 0.0
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.out_channels_ = self.in_channels if self.out_channels is None else self.out_channels
-
-        self.norm1 = nn.GroupNorm(num_groups=32, epsilon=1e-6)
-        self.conv1 = nn.Conv(
-            self.out_channels_,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-        if self.temb_channels:
-            self.temb_proj = nn.Dense(self.out_channels_, dtype=self.dtype)
-
-        self.norm2 = nn.GroupNorm(num_groups=32, epsilon=1e-6)
-        self.dropout = nn.Dropout(self.dropout_prob)
-        self.conv2 = nn.Conv(
-            self.out_channels_,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-        if self.in_channels != self.out_channels_:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = nn.Conv(
-                    self.out_channels_,
-                    kernel_size=(3, 3),
-                    strides=(1, 1),
-                    padding=((1, 1), (1, 1)),
-                    dtype=self.dtype,
-                )
-            else:
-                self.nin_shortcut = nn.Conv(
-                    self.out_channels_,
-                    kernel_size=(1, 1),
-                    strides=(1, 1),
-                    padding="VALID",
-                    dtype=self.dtype,
-                )
-
-    def __call__(self, hidden_states, temb=None, deterministic: bool = True):
-        residual = hidden_states
-        hidden_states = self.norm1(hidden_states)
-        hidden_states = nn.swish(hidden_states)
-        hidden_states = self.conv1(hidden_states)
-
-        if temb is not None:
-            hidden_states = hidden_states + self.temb_proj(nn.swish(temb))[:, :, None, None]  # TODO: check shapes
-
-        hidden_states = self.norm2(hidden_states)
-        hidden_states = nn.swish(hidden_states)
-        hidden_states = self.dropout(hidden_states, deterministic)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.in_channels != self.out_channels_:
-            if self.use_conv_shortcut:
-                residual = self.conv_shortcut(residual)
-            else:
-                residual = self.nin_shortcut(residual)
-
-        return hidden_states + residual
-
-
-class AttnBlock(nn.Module):
-    in_channels: int
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        conv = partial(
-            nn.Conv, self.in_channels, kernel_size=(1, 1), strides=(1, 1), padding="VALID", dtype=self.dtype
-        )
-
-        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-6)
-        self.q, self.k, self.v = conv(), conv(), conv()
-        self.proj_out = conv()
-
-    def __call__(self, hidden_states):
-        residual = hidden_states
-        hidden_states = self.norm(hidden_states)
-
-        query = self.q(hidden_states)
-        key = self.k(hidden_states)
-        value = self.v(hidden_states)
-
-        # compute attentions
-        batch, height, width, channels = query.shape
-        query = query.reshape((batch, height * width, channels))
-        key = key.reshape((batch, height * width, channels))
-        attn_weights = jnp.einsum("...qc,...kc->...qk", query, key)
-        attn_weights = attn_weights * (int(channels) ** -0.5)
-        attn_weights = nn.softmax(attn_weights, axis=2)
-
-        ## attend to values
-        value = value.reshape((batch, height * width, channels))
-        hidden_states = jnp.einsum("...kc,...qk->...qc", value, attn_weights)
-        hidden_states = hidden_states.reshape((batch, height, width, channels))
-
-        hidden_states = self.proj_out(hidden_states)
-        hidden_states = hidden_states + residual
-        return hidden_states
-
-
-class UpsamplingBlock(nn.Module):
-    config: VQGANConfig
-    curr_res: int
-    block_idx: int
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        if self.block_idx == self.config.num_resolutions - 1:
-            block_in = self.config.ch * self.config.ch_mult[-1]
-        else:
-            block_in = self.config.ch * self.config.ch_mult[self.block_idx + 1]
-
-        block_out = self.config.ch * self.config.ch_mult[self.block_idx]
-        self.temb_ch = 0
-
-        res_blocks = []
-        attn_blocks = []
-        for _ in range(self.config.num_res_blocks + 1):
-            res_blocks.append(
-                ResnetBlock(
-                    block_in, block_out, temb_channels=self.temb_ch, dropout_prob=self.config.dropout, dtype=self.dtype
-                )
-            )
-            block_in = block_out
-            if self.curr_res in self.config.attn_resolutions:
-                attn_blocks.append(AttnBlock(block_in, dtype=self.dtype))
-
-        self.block = res_blocks
-        self.attn = attn_blocks
-
-        self.upsample = None
-        if self.block_idx != 0:
-            self.upsample = Upsample(block_in, self.config.resamp_with_conv, dtype=self.dtype)
-
-    def __call__(self, hidden_states, temb=None, deterministic: bool = True):
-        for res_block in self.block:
-            hidden_states = res_block(hidden_states, temb, deterministic=deterministic)
-            for attn_block in self.attn:
-                hidden_states = attn_block(hidden_states)
-
-        if self.upsample is not None:
-            hidden_states = self.upsample(hidden_states)
-
-        return hidden_states
-
-
-class DownsamplingBlock(nn.Module):
-    config: VQGANConfig
-    curr_res: int
-    block_idx: int
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        in_ch_mult = (1,) + tuple(self.config.ch_mult)
-        block_in = self.config.ch * in_ch_mult[self.block_idx]
-        block_out = self.config.ch * self.config.ch_mult[self.block_idx]
-        self.temb_ch = 0
-
-        res_blocks = []
-        attn_blocks = []
-        for _ in range(self.config.num_res_blocks):
-            res_blocks.append(
-                ResnetBlock(
-                    block_in, block_out, temb_channels=self.temb_ch, dropout_prob=self.config.dropout, dtype=self.dtype
-                )
-            )
-            block_in = block_out
-            if self.curr_res in self.config.attn_resolutions:
-                attn_blocks.append(AttnBlock(block_in, dtype=self.dtype))
-
-        self.block = res_blocks
-        self.attn = attn_blocks
-
-        self.downsample = None
-        if self.block_idx != self.config.num_resolutions - 1:
-            self.downsample = Downsample(block_in, self.config.resamp_with_conv, dtype=self.dtype)
-
-    def __call__(self, hidden_states, temb=None, deterministic: bool = True):
-        for res_block in self.block:
-            hidden_states = res_block(hidden_states, temb, deterministic=deterministic)
-            for attn_block in self.attn:
-                hidden_states = attn_block(hidden_states)
-
-        if self.downsample is not None:
-            hidden_states = self.downsample(hidden_states)
-
-        return hidden_states
-
-
-class MidBlock(nn.Module):
-    in_channels: int
-    temb_channels: int
-    dropout: float
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.block_1 = ResnetBlock(
-            self.in_channels,
-            self.in_channels,
-            temb_channels=self.temb_channels,
-            dropout_prob=self.dropout,
-            dtype=self.dtype,
-        )
-        self.attn_1 = AttnBlock(self.in_channels, dtype=self.dtype)
-        self.block_2 = ResnetBlock(
-            self.in_channels,
-            self.in_channels,
-            temb_channels=self.temb_channels,
-            dropout_prob=self.dropout,
-            dtype=self.dtype,
-        )
-
-    def __call__(self, hidden_states, temb=None, deterministic: bool = True):
-        hidden_states = self.block_1(hidden_states, temb, deterministic=deterministic)
-        hidden_states = self.attn_1(hidden_states)
-        hidden_states = self.block_2(hidden_states, temb, deterministic=deterministic)
-        return hidden_states
-
-
-class Encoder(nn.Module):
-    config: VQGANConfig
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.temb_ch = 0
-
-        # downsampling
-        self.conv_in = nn.Conv(
-            self.config.ch,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-        curr_res = self.config.resolution
-        downsample_blocks = []
-        for i_level in range(self.config.num_resolutions):
-            downsample_blocks.append(DownsamplingBlock(self.config, curr_res, block_idx=i_level, dtype=self.dtype))
-
-            if i_level != self.config.num_resolutions - 1:
-                curr_res = curr_res // 2
-        self.down = downsample_blocks
-
-        # middle
-        mid_channels = self.config.ch * self.config.ch_mult[-1]
-        self.mid = MidBlock(mid_channels, self.temb_ch, self.config.dropout, dtype=self.dtype)
-
-        # end
-        self.norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-6)
-        self.conv_out = nn.Conv(
-            2 * self.config.z_channels if self.config.double_z else self.config.z_channels,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-    def __call__(self, pixel_values, deterministic: bool = True):
-        # timestep embedding
-        temb = None
-
-        # downsampling
-        hidden_states = self.conv_in(pixel_values)
-        for block in self.down:
-            hidden_states = block(hidden_states, temb, deterministic=deterministic)
-
-        # middle
-        hidden_states = self.mid(hidden_states, temb, deterministic=deterministic)
-
-        # end
-        hidden_states = self.norm_out(hidden_states)
-        hidden_states = nn.swish(hidden_states)
-        hidden_states = self.conv_out(hidden_states)
-
-        return hidden_states
-
-
-class Decoder(nn.Module):
-    config: VQGANConfig
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.temb_ch = 0
-
-        # compute in_ch_mult, block_in and curr_res at lowest res
-        block_in = self.config.ch * self.config.ch_mult[self.config.num_resolutions - 1]
-        curr_res = self.config.resolution // 2 ** (self.config.num_resolutions - 1)
-        self.z_shape = (1, self.config.z_channels, curr_res, curr_res)
-        print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
-
-        # z to block_in
-        self.conv_in = nn.Conv(
-            block_in,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-        # middle
-        self.mid = MidBlock(block_in, self.temb_ch, self.config.dropout, dtype=self.dtype)
-
-        # upsampling
-        upsample_blocks = []
-        for i_level in reversed(range(self.config.num_resolutions)):
-            upsample_blocks.append(UpsamplingBlock(self.config, curr_res, block_idx=i_level, dtype=self.dtype))
-            if i_level != 0:
-                curr_res = curr_res * 2
-        self.up = list(reversed(upsample_blocks))  # reverse to get consistent order
-
-        # end
-        self.norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-6)
-        self.conv_out = nn.Conv(
-            self.config.out_ch,
-            kernel_size=(3, 3),
-            strides=(1, 1),
-            padding=((1, 1), (1, 1)),
-            dtype=self.dtype,
-        )
-
-    def __call__(self, hidden_states, deterministic: bool = True):
-        # timestep embedding
-        temb = None
-
-        # z to block_in
-        hidden_states = self.conv_in(hidden_states)
-
-        # middle
-        hidden_states = self.mid(hidden_states, temb, deterministic=deterministic)
-
-        # upsampling
-        for block in reversed(self.up):
-            hidden_states = block(hidden_states, temb, deterministic=deterministic)
-
-        # end
-        if self.config.give_pre_end:
-            return hidden_states
-
-        hidden_states = self.norm_out(hidden_states)
-        hidden_states = nn.swish(hidden_states)
-        hidden_states = self.conv_out(hidden_states)
-
-        return hidden_states
-
-
-class VectorQuantizer(nn.Module):
-    """
-    see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
-    ____________________________________________
-    Discretization bottleneck part of the VQ-VAE.
-    Inputs:
-    - n_e : number of embeddings
-    - e_dim : dimension of embedding
-    - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
-    _____________________________________________
-    """
-
-    config: VQGANConfig
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.embedding = nn.Embed(self.config.n_embed, self.config.embed_dim, dtype=self.dtype)  # TODO: init
-
-    def __call__(self, hidden_states):
-        """
-        Inputs the output of the encoder network z and maps it to a discrete
-        one-hot vector that is the index of the closest embedding vector e_j
-        z (continuous) -> z_q (discrete)
-        z.shape = (batch, channel, height, width)
-        quantization pipeline:
-            1. get encoder input (B,C,H,W)
-            2. flatten input to (B*H*W,C)
-        """
-        # flatten
-        hidden_states_flattended = hidden_states.reshape((-1, self.config.embed_dim))
-
-        # dummy op to init the weights, so we can access them below
-        self.embedding(jnp.ones((1, 1), dtype="i4"))
-
-        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-        emb_weights = self.variables["params"]["embedding"]["embedding"]
-        distance = (
-            jnp.sum(hidden_states_flattended ** 2, axis=1, keepdims=True)
-            + jnp.sum(emb_weights ** 2, axis=1)
-            - 2 * jnp.dot(hidden_states_flattended, emb_weights.T)
-        )
-
-        # get quantized latent vectors
-        min_encoding_indices = jnp.argmin(distance, axis=1)
-        z_q = self.embedding(min_encoding_indices).reshape(hidden_states.shape)
-
-        # reshape to (batch, num_tokens)
-        min_encoding_indices = min_encoding_indices.reshape(hidden_states.shape[0], -1)
-
-        # compute the codebook_loss (q_loss) outside the model
-        # here we return the embeddings and indices
-        return z_q, min_encoding_indices
-
-    def get_codebook_entry(self, indices, shape=None):
-        # indices are expected to be of shape (batch, num_tokens)
-        # get quantized latent vectors
-        batch, num_tokens = indices.shape
-        z_q = self.embedding(indices)
-        z_q = z_q.reshape(batch, int(math.sqrt(num_tokens)), int(math.sqrt(num_tokens)), -1)
-        return z_q
-
-
-class VQModule(nn.Module):
-    config: VQGANConfig
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        self.encoder = Encoder(self.config, dtype=self.dtype)
-        self.decoder = Decoder(self.config, dtype=self.dtype)
-        self.quantize = VectorQuantizer(self.config, dtype=self.dtype)
-        self.quant_conv = nn.Conv(
-            self.config.embed_dim,
-            kernel_size=(1, 1),
-            strides=(1, 1),
-            padding="VALID",
-            dtype=self.dtype,
-        )
-        self.post_quant_conv = nn.Conv(
-            self.config.z_channels,
-            kernel_size=(1, 1),
-            strides=(1, 1),
-            padding="VALID",
-            dtype=self.dtype,
-        )
-
-    def encode(self, pixel_values, deterministic: bool = True):
-        hidden_states = self.encoder(pixel_values, deterministic=deterministic)
-        hidden_states = self.quant_conv(hidden_states)
-        quant_states, indices = self.quantize(hidden_states)
-        return quant_states, indices
-
-    def decode(self, hidden_states, deterministic: bool = True):
-        hidden_states = self.post_quant_conv(hidden_states)
-        hidden_states = self.decoder(hidden_states, deterministic=deterministic)
-        return hidden_states
-
-    def decode_code(self, code_b):
-        hidden_states = self.quantize.get_codebook_entry(code_b)
-        hidden_states = self.decode(hidden_states)
-        return hidden_states
-
-    def __call__(self, pixel_values, deterministic: bool = True):
-        quant_states, indices = self.encode(pixel_values, deterministic)
-        hidden_states = self.decode(quant_states, deterministic)
-        return hidden_states, indices
-
-
-class VQGANPreTrainedModel(FlaxPreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface
-    for downloading and loading pretrained models.
-    """
-
-    config_class = VQGANConfig
-    base_model_prefix = "model"
-    module_class: nn.Module = None
-
-    def __init__(
-        self,
-        config: VQGANConfig,
-        input_shape: Tuple = (1, 256, 256, 3),
-        seed: int = 0,
-        dtype: jnp.dtype = jnp.float32,
-        **kwargs,
-    ):
-        module = self.module_class(config=config, dtype=dtype, **kwargs)
-        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)
-
-    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
-        # init input tensors
-        pixel_values = jnp.zeros(input_shape, dtype=jnp.float32)
-        params_rng, dropout_rng = jax.random.split(rng)
-        rngs = {"params": params_rng, "dropout": dropout_rng}
-
-        return self.module.init(rngs, pixel_values)["params"]
-
-    def encode(self, pixel_values, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train: bool = False):
-        # Handle any PRNG if needed
-        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
-
-        return self.module.apply(
-            {"params": params or self.params}, jnp.array(pixel_values), not train, rngs=rngs, method=self.module.encode
-        )
-
-    def decode(self, hidden_states, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train: bool = False):
-        # Handle any PRNG if needed
-        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
-
-        return self.module.apply(
-            {"params": params or self.params},
-            jnp.array(hidden_states),
-            not train,
-            rngs=rngs,
-            method=self.module.decode,
-        )
-
-    def decode_code(self, indices, params: dict = None):
-        return self.module.apply(
-            {"params": params or self.params}, jnp.array(indices, dtype="i4"), method=self.module.decode_code
-        )
-
-    def __call__(
-        self,
-        pixel_values,
-        params: dict = None,
-        dropout_rng: jax.random.PRNGKey = None,
-        train: bool = False,
-    ):
-        # Handle any PRNG if needed
-        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
-
-        return self.module.apply(
-            {"params": params or self.params},
-            jnp.array(pixel_values),
-            not train,
-            rngs=rngs,
-        )
-
-
-class VQModel(VQGANPreTrainedModel):
-    module_class = VQModule
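For reference, a minimal round-trip sketch with the VQModel class removed above (not part of the diff; the checkpoint id is an assumption):

    import jax.numpy as jnp

    model = VQModel.from_pretrained("flax-community/vqgan_f16_16384")  # hypothetical id
    pixel_values = jnp.zeros((1, 256, 256, 3))          # NHWC input
    quant_states, indices = model.encode(pixel_values)  # indices: (1, 256) image tokens
    reconstruction = model.decode_code(indices)         # back to (1, 256, 256, 3)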
data/CC12M_downloader.py
DELETED
@@ -1,91 +0,0 @@
-# Luke Melas-Kyriazi's code. (https://twitter.com/lukemelas)
-
-#%%
-import sys
-import os
-from datetime import datetime
-import pandas as pd
-import contexttimer
-from urllib.request import urlopen
-import requests
-from PIL import Image
-import torch
-from torchvision.transforms import functional as TF
-from multiprocessing import Pool
-from tqdm import tqdm
-import logging
-
-# Setup
-logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
-requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
-
-
-# # For downloading SVG images (I can't get this to work)
-# from io import BytesIO
-# import cairosvg
-
-#%%
-# Load data
-print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')
-with contexttimer.Timer(prefix="Loading from tsv"):
-    df = pd.read_csv('./cc12m.tsv', delimiter='\t', header=None)
-
-url_to_idx_map = {url: index for index, url, caption in df.itertuples()}
-print(f'Loaded {len(url_to_idx_map)} urls')
-
-#%%
-df.head()
-
-#%%
-
-# Note: it seems that there are no SVG images
-df.sample(10000)[1].str.contains('.svg').sum()
-
-#%%
-# Resize function
-def resize(img):
-    max_size_of_short_side = 512
-    if min(img.size) > max_size_of_short_side:
-        img = TF.resize(img, size=max_size_of_short_side, interpolation=Image.LANCZOS)
-    return img
-
-base_dir = os.path.join(os.getcwd(), 'images')
-
-def process(item):
-    url, image_id = item
-    try:
-        base_url = os.path.basename(url)  # extract base url
-        stem, ext = os.path.splitext(base_url)  # split into stem and extension
-        filename = f'{image_id:08d}---{stem}.jpg'  # create filename
-        filepath = os.path.join(base_dir, filename)  # concat to get filepath
-        if not os.path.isfile(filepath):
-            # if filepath.endswith('.svg'):
-            #     raise NotImplementedError()
-            #     image_bytes = BytesIO()  # create a bytestream
-            #     cairosvg.svg2png(url=url, write_to=image_bytes)  # convert svg into image
-            # else:
-            req = requests.get(url, stream=True, timeout=1, verify=False).raw
-            image = Image.open(req).convert('RGB')
-            if min(image.size) > 512:
-                image = TF.resize(image, size=512, interpolation=Image.LANCZOS)
-            # image = resize(image)  # resize PIL image
-            image.save(filepath)  # save PIL image
-    except Exception as e:
-        logging.info(" ".join(repr(e).splitlines()))
-        logging.error(url)
-
-#%%
-# for i, item in enumerate(tqdm(url_to_idx_map.items(), total=len(url_to_idx_map))):
-#     process(item)
-#     if i > 100:
-#         break
-
-# Use multiprocessing for speed
-list_of_items = list(url_to_idx_map.items())
-print(len(list_of_items))
-list_of_items = list_of_items[10_000_000:]
-print(len(list_of_items))
-with Pool(128) as p:
-    r = list(tqdm(p.imap(process, list_of_items), total=len(list_of_items)))
-print('DONE')
-
data/CC3M_downloader.py
DELETED
@@ -1,62 +0,0 @@
-'''
-This script was adapted from Luke Melas-Kyriazi's code. (https://twitter.com/lukemelas)
-A few changes were made for this particular dataset. You're required to have the `.tsv` file downloaded in your directory.
-Find it here: [https://github.com/google-research-datasets/conceptual-captions]
-'''
-
-import sys
-import os
-from datetime import datetime
-import pandas as pd
-import contexttimer
-from urllib.request import urlopen
-import requests
-from PIL import Image
-import torch
-from torchvision.transforms import functional as TF
-from multiprocessing import Pool
-from tqdm import tqdm
-import logging
-import sys
-
-# Setup
-logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
-requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
-
-if len(sys.argv) != 3:
-    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
-    exit(1)
-
-# Load data
-print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')
-with contexttimer.Timer(prefix="Loading from tsv"):
-    df = pd.read_csv(sys.argv[1], delimiter='\t', header=None)
-
-url_to_idx_map = {url: index for index, caption, url in df.itertuples()}
-print(f'Loaded {len(url_to_idx_map)} urls')
-
-base_dir = os.path.join(os.getcwd(), sys.argv[2])
-
-def process(item):
-    url, image_id = item
-    try:
-        base_url = os.path.basename(url)  # extract base url
-        stem, ext = os.path.splitext(base_url)  # split into stem and extension
-        filename = f'{image_id:08d}---{stem}.jpg'  # create filename
-        filepath = os.path.join(base_dir, filename)  # concat to get filepath
-        if not os.path.isfile(filepath):
-            req = requests.get(url, stream=True, timeout=1, verify=False).raw
-            image = Image.open(req).convert('RGB')
-            if min(image.size) > 512:
-                image = TF.resize(image, size=512, interpolation=Image.LANCZOS)
-            image.save(filepath)  # save PIL image
-    except Exception as e:
-        logging.info(" ".join(repr(e).splitlines()))
-        logging.error(url)
-
-list_of_items = list(url_to_idx_map.items())
-print(len(list_of_items))
-
-with Pool(128) as p:
-    r = list(tqdm(p.imap(process, list_of_items), total=len(list_of_items)))
-print('DONE')
demo/CustomBARTv4b_model-generate.ipynb
DELETED
@@ -1,566 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "CustomBARTv4b-model-generate.ipynb",
-      "provenance": [],
-      "collapsed_sections": [],
-      "machine_shape": "hm"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "TPU"
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ewer-Q-0w2xA"
-      },
-      "source": [
-        "# Installation"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "NpsF9ipLLl2s",
-        "outputId": "10bf54aa-b89d-4e42-9777-bc97b00a5f32"
-      },
-      "source": [
-        "!pip install git+https://github.com/huggingface/transformers/\n",
-        "!pip install git+https://github.com/google/flax"
-      ],
-      "execution_count": 1,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "text": [
-            "Collecting git+https://github.com/huggingface/transformers/\n",
-            "  Cloning https://github.com/huggingface/transformers/ to /tmp/pip-req-build-oxejx1op\n",
-            "  Running command git clone -q https://github.com/huggingface/transformers/ /tmp/pip-req-build-oxejx1op\n",
-            "  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
-            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
-            "  Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
-            "Requirement already satisfied (use --upgrade to upgrade): transformers==4.9.0.dev0 from git+https://github.com/huggingface/transformers/ in /usr/local/lib/python3.7/dist-packages\n",
-            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (1.19.5)\n",
-            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (20.9)\n",
-            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (5.4.1)\n",
-            "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (0.0.45)\n",
-            "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (4.6.0)\n",
-            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (4.41.1)\n",
-            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (3.0.12)\n",
-            "Requirement already satisfied: huggingface-hub==0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (0.0.12)\n",
-            "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (0.10.3)\n",
-            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (2019.12.20)\n",
-            "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.9.0.dev0) (2.23.0)\n",
-            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.9.0.dev0) (2.4.7)\n",
-            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.9.0.dev0) (1.15.0)\n",
-            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.9.0.dev0) (1.0.1)\n",
-            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.9.0.dev0) (7.1.2)\n",
-            "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.9.0.dev0) (3.7.4.3)\n",
-            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers==4.9.0.dev0) (3.4.1)\n",
-            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.9.0.dev0) (2021.5.30)\n",
-            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.9.0.dev0) (3.0.4)\n",
-            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.9.0.dev0) (1.24.3)\n",
-            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.9.0.dev0) (2.10)\n",
-            "Building wheels for collected packages: transformers\n",
-            "  Building wheel for transformers (PEP 517) ... \u001b[?25l\u001b[?25hdone\n",
-            "  Created wheel for transformers: filename=transformers-4.9.0.dev0-cp37-none-any.whl size=2582229 sha256=249c593273ccca3027c6427d2c6fd749a89f21d722d628d97eb438a2cf3185a8\n",
-            "  Stored in directory: /tmp/pip-ephem-wheel-cache-l2rqt1b7/wheels/61/69/33/974fccec4d0ab5feee9fe83bd93e680d269a805be9ede5ec60\n",
-            "Successfully built transformers\n",
-            "Collecting git+https://github.com/google/flax\n",
-            "  Cloning https://github.com/google/flax to /tmp/pip-req-build-rt9g1_wx\n",
-            "  Running command git clone -q https://github.com/google/flax /tmp/pip-req-build-rt9g1_wx\n",
-            "Requirement already satisfied (use --upgrade to upgrade): flax==0.3.4 from git+https://github.com/google/flax in /usr/local/lib/python3.7/dist-packages\n",
-            "Requirement already satisfied: numpy>=1.12 in /usr/local/lib/python3.7/dist-packages (from flax==0.3.4) (1.19.5)\n",
-            "Requirement already satisfied: jax>=0.2.13 in /usr/local/lib/python3.7/dist-packages (from flax==0.3.4) (0.2.13)\n",
-            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from flax==0.3.4) (3.2.2)\n",
-            "Requirement already satisfied: msgpack in /usr/local/lib/python3.7/dist-packages (from flax==0.3.4) (1.0.2)\n",
-            "Requirement already satisfied: optax in /usr/local/lib/python3.7/dist-packages (from flax==0.3.4) (0.0.9)\n",
-            "Requirement already satisfied: opt-einsum in /usr/local/lib/python3.7/dist-packages (from jax>=0.2.13->flax==0.3.4) (3.3.0)\n",
-            "Requirement already satisfied: absl-py in /usr/local/lib/python3.7/dist-packages (from jax>=0.2.13->flax==0.3.4) (0.12.0)\n",
-            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->flax==0.3.4) (2.8.1)\n",
-            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->flax==0.3.4) (0.10.0)\n",
-            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->flax==0.3.4) (2.4.7)\n",
-            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->flax==0.3.4) (1.3.1)\n",
-            "Requirement already satisfied: chex>=0.0.4 in /usr/local/lib/python3.7/dist-packages (from optax->flax==0.3.4) (0.0.8)\n",
-            "Requirement already satisfied: jaxlib>=0.1.37 in /usr/local/lib/python3.7/dist-packages (from optax->flax==0.3.4) (0.1.66+cuda110)\n",
-            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from absl-py->jax>=0.2.13->flax==0.3.4) (1.15.0)\n",
-            "Requirement already satisfied: dm-tree>=0.1.5 in /usr/local/lib/python3.7/dist-packages (from chex>=0.0.4->optax->flax==0.3.4) (0.1.6)\n",
-            "Requirement already satisfied: toolz>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from chex>=0.0.4->optax->flax==0.3.4) (0.11.1)\n",
-            "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.7/dist-packages (from jaxlib>=0.1.37->optax->flax==0.3.4) (1.12)\n",
-            "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from jaxlib>=0.1.37->optax->flax==0.3.4) (1.4.1)\n",
|
103 |
-
"Building wheels for collected packages: flax\n",
|
104 |
-
" Building wheel for flax (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
105 |
-
" Created wheel for flax: filename=flax-0.3.4-cp37-none-any.whl size=184692 sha256=503b27995f372afe33631e71572d5edc1fffd4d2e0a4cd206d291ad6b0e4c299\n",
|
106 |
-
" Stored in directory: /tmp/pip-ephem-wheel-cache-g1pzxnv6/wheels/3d/26/f4/0ea6051d7352289d9e4f8178348452b35a9a97bde6035405a5\n",
|
107 |
-
"Successfully built flax\n"
|
108 |
-
],
|
109 |
-
"name": "stdout"
|
110 |
-
}
|
111 |
-
]
|
112 |
-
},
|
[code cell, execution_count 2]
%load_ext autoreload
%autoreload 2

[markdown cell]
# Custom BART Model

[code cell, execution_count 3]
# TODO: set those args in a config file
OUTPUT_VOCAB_SIZE = 16384 + 1  # encoded image token space + 1 for bos
OUTPUT_LENGTH = 256 + 1  # number of encoded tokens + 1 for bos
BOS_TOKEN_ID = 16384
BASE_MODEL = 'facebook/bart-large'
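Note: these constants pin down the decoder token space. Image token ids 0..16383 come from the VQGAN codebook, and BOS is the one extra id just past them; the 256 positions correspond to the 16x16 code grid the VQGAN produces (see "z of shape (1, 256, 16, 16)" later in this diff). A minimal sanity sketch of the arithmetic, not part of the notebook:

    OUTPUT_VOCAB_SIZE = 16384 + 1   # 16384 VQGAN codebook entries + 1 for BOS
    BOS_TOKEN_ID = 16384            # the id just past the last image token (0..16383)
    OUTPUT_LENGTH = 256 + 1         # a 16x16 grid of image tokens, plus BOS
    assert BOS_TOKEN_ID == OUTPUT_VOCAB_SIZE - 1
    assert OUTPUT_LENGTH - 1 == 16 * 16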
[code cell, execution_count 4]
import jax
import flax.linen as nn

from transformers.models.bart.modeling_flax_bart import *
from transformers import BartTokenizer, FlaxBartForConditionalGeneration

class CustomFlaxBartModule(FlaxBartModule):
    def setup(self):
        # we keep shared to easily load pre-trained weights
        self.shared = nn.Embed(
            self.config.vocab_size,
            self.config.d_model,
            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
            dtype=self.dtype,
        )
        # a separate embedding is used for the decoder
        self.decoder_embed = nn.Embed(
            OUTPUT_VOCAB_SIZE,
            self.config.d_model,
            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
            dtype=self.dtype,
        )
        self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)

        # the decoder has a different config
        decoder_config = BartConfig(self.config.to_dict())
        decoder_config.max_position_embeddings = OUTPUT_LENGTH
        decoder_config.vocab_size = OUTPUT_VOCAB_SIZE
        self.decoder = FlaxBartDecoder(decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed)

class CustomFlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationModule):
    def setup(self):
        self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
        self.lm_head = nn.Dense(
            OUTPUT_VOCAB_SIZE,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
        )
        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, OUTPUT_VOCAB_SIZE))

class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
    module_class = CustomFlaxBartForConditionalGenerationModule
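Note: one detail in `CustomFlaxBartModule.setup` looks fragile. `BartConfig(self.config.to_dict())` passes the dict positionally, and if I read transformers' `BartConfig` signature correctly its first positional parameter is `vocab_size`, so the encoder's settings are not actually copied into `decoder_config`; the explicit assignments that follow, plus bart-large matching `BartConfig`'s defaults, mask this. A safer sketch (my suggestion, not the notebook's code):

    # keyword-expand so every field of the encoder config is copied (suggested fix, assumes
    # BartConfig accepts its fields as keyword arguments)
    decoder_config = BartConfig(**self.config.to_dict())
    decoder_config.max_position_embeddings = OUTPUT_LENGTH
    decoder_config.vocab_size = OUTPUT_VOCAB_SIZE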
[code cell, execution_count 5]
# load pre-trained model for encoder weights
base_model = FlaxBartForConditionalGeneration.from_pretrained(BASE_MODEL)

[output: stderr]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)

[code cell, execution_count 6]
# set up our new model config
config = BartConfig.from_pretrained(BASE_MODEL)
config.tie_word_embeddings = False
config.decoder_start_token_id = BOS_TOKEN_ID
config.bos_token_id = BOS_TOKEN_ID  # should not be used
config.pos_token_id = BOS_TOKEN_ID  # should not be used
#config.eos_token_id = None  # prevents generation from stopping until we reach max_length

[code cell, execution_count 7]
# create our model and initialize it randomly
model = CustomFlaxBartForConditionalGeneration(config)

[code cell, execution_count 8]
# use pretrained weights
model.params['model']['encoder'] = base_model.params['model']['encoder']
model.params['model']['shared'] = base_model.params['model']['shared']
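Note: after this transplant only the encoder side carries bart-large weights; the decoder embedding and the lm_head stay randomly initialized. A quick shape check along these lines may help; this is a sketch, the parameter paths are inferred from the module attribute names above and the bart-large shapes (50265 x 1024) are assumptions:

    shared = model.params['model']['shared']['embedding']
    assert shared.shape == (50265, 1024)          # text vocab x d_model, copied from bart-large
    decoder_embed = model.params['model']['decoder_embed']['embedding']
    assert decoder_embed.shape == (16385, 1024)   # image vocab (incl. BOS) x d_model, still random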
[code cell, execution_count 9]
# no need for base_model anymore
del base_model

[code cell, execution_count 10]
# we verify that the shape has not been modified
model.params['final_logits_bias'].shape

[result]
(1, 16385)
[markdown cell]
## Inference

[code cell, execution_count 11]
tokenizer = BartTokenizer.from_pretrained(BASE_MODEL)

[code cell, execution_count 12]
text = "My friends are cool but they eat too many carbs."
inputs = tokenizer(text, max_length=1024, return_tensors='jax')
encoder_outputs = model.encode(**inputs)

[output: stderr]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

[code cell, execution_count 13]
decoder_start_token_id = model.config.decoder_start_token_id
decoder_start_token_id

[result]
16384

[code cell, execution_count 14]
decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
outputs = model.decode(decoder_input_ids, encoder_outputs)
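Note: the cell above uses `jnp` without this notebook ever importing it; it only runs if JAX's numpy is in scope under the usual alias:

    import jax.numpy as jnp  # needed for jnp.ones above; missing from the notebook's imports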
[code cell, execution_count 15]
outputs

[result]
FlaxCausalLMOutputWithCrossAttentions([('logits',
  DeviceArray([[[ 0.5263986 , -2.0947676 , -0.18830685, ...,  0.7599884 ,
                  0.6746795 , -1.0411576 ]]], dtype=float32))])

[code cell, execution_count 16]
outputs.logits.shape

[result]
(1, 1, 16385)

[code cell, execution_count 17]
outputs.logits.argmax(axis=-1)

[result]
DeviceArray([[12459]], dtype=int32)

[code cell, execution_count 18]
model.config.bos_token_id, model.config.eos_token_id, model.config.pad_token_id

[result]
(16384, 2, 1)

[code cell, execution_count 19]
input_ids_test = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='jax')

[code cell, execution_count 20]
greedy_output = model.generate(input_ids_test, max_length=50)

[code cell, execution_count 21]
greedy_output[0]

[result]
DeviceArray([[16384,     0,  3570, 13405, 10186,  2392, 16362,  1869,
              15772, 13546, 15772, 13546,  9348, 14791, 15772, 15772,
              15772, 11272, 15772, 13546, 15772, 15772, 13546, 15772,
              13546, 15772,  6642, 15772, 10776,  6431, 15772, 14567,
              13406, 15772, 14567,  6235, 15772,  4909, 16160,   568,
               4664,  6650,  8952,  9089, 15772,  5952,  7375, 10843,
               8952,     2]], dtype=int32)
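Note: with `max_length=50` the greedy decode stops well short of a full image; a complete 16x16 sample needs all OUTPUT_LENGTH positions, as demo_notebook.ipynb below does:

    greedy_output = model.generate(input_ids_test, max_length=OUTPUT_LENGTH)  # 257 = BOS + 256 image tokens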
demo/demo_notebook.ipynb
DELETED
@@ -1,583 +0,0 @@
[markdown cell]
# Installation

[code cell]
#!pip install git+https://github.com/huggingface/transformers/
#!pip install git+https://github.com/google/flax

[code cell, execution_count 1]
%load_ext autoreload
%autoreload 2

[code cell, execution_count 2]
%cd ../../vqgan-jax

[output: stdout]
/home/tmabraham/vqgan-jax

[markdown cell]
# Custom BART Model

[code cell, execution_count 3]
# TODO: set those args in a config file
OUTPUT_VOCAB_SIZE = 16384 + 1  # encoded image token space + 1 for bos
OUTPUT_LENGTH = 256 + 1  # number of encoded tokens + 1 for bos
BOS_TOKEN_ID = 16384
BASE_MODEL = 'facebook/bart-large'

[code cell, execution_count 4]
import jax
import flax.linen as nn

from transformers.models.bart.modeling_flax_bart import *
from transformers import BartTokenizer, FlaxBartForConditionalGeneration

# ... followed by the same CustomFlaxBartModule, CustomFlaxBartForConditionalGenerationModule
# and CustomFlaxBartForConditionalGeneration definitions as in
# demo/CustomBARTv4b_model-generate.ipynb above

[code cell, execution_count 5]
import wandb
run = wandb.init()
artifact = run.use_artifact('wandb/hf-flax-dalle-mini/model-1ef8yxby:latest', type='bart_model')
artifact_dir = artifact.download()

[output: stderr]
wandb: Currently logged in as: tmabraham (use `wandb login --relogin` to force relogin)
Tracking run with wandb version 0.10.33
Syncing run rare-night-7 to Weights & Biases
Project page: https://wandb.ai/tmabraham/vqgan-jax
Run page: https://wandb.ai/tmabraham/vqgan-jax/runs/qzxavce8
Run data is saved locally in /home/tmabraham/vqgan-jax/wandb/run-20210715_075019-qzxavce8
wandb: Downloading large artifact model-1ef8yxby:latest, 1674.97MB. 2 files... Done. 0:0:0

[code cell, execution_count 6]
# load our model from the downloaded pretrained artifact
model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)

[output: stderr]
/home/tmabraham/dalle-mini/src/transformers/src/transformers/models/bart/configuration_bart.py:180: UserWarning: Please make sure the config includes `forced_bos_token_id=16384` in future versions. The config can simply be saved and uploaded again to be fixed.
  warnings.warn(
INFO:absl:Starting the local TPU driver.
INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://
INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: "cuda". Available platform names are: TPU Interpreter Host

[code cell, execution_count 7]
model.config.forced_bos_token_id = None
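Note: as I understand transformers' generation, a configured `forced_bos_token_id` pins the token emitted at the first decoding step; since decoding here already starts from BOS (16384) via `decoder_start_token_id`, clearing it leaves the model free from step one on. A small sketch of the intent, not notebook code:

    model.config.forced_bos_token_id = None                      # nothing is forced at the first step
    assert model.config.decoder_start_token_id == BOS_TOKEN_ID   # generation still begins from BOS (16384)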
[code cell, execution_count 8]
# we verify that the shape has not been modified
model.params['final_logits_bias'].shape

[result]
(1, 16385)

[markdown cell]
## Inference

[code cell, execution_count 9]
tokenizer = BartTokenizer.from_pretrained(BASE_MODEL)

[code cell, execution_count 10]
input_text = ['I enjoy walking with my cute dog']*8

[code cell, execution_count 11]
input_ids_test = tokenizer(input_text, return_tensors='jax')

[code cell, execution_count 12]
input_ids_test

[result]
{'input_ids': DeviceArray of shape (8, 9), eight identical rows [0, 100, 2254, 3051, 19, 127, 11962, 2335, 2],
 'attention_mask': DeviceArray of shape (8, 9), all ones}

[code cell, execution_count 13]
greedy_output = model.generate(input_ids_test['input_ids'], max_length=257)

[code cell, execution_count 14]
greedy_output[0].shape

[result]
(8, 257)

[code cell, execution_count 15]
greedy_output[0]

[result]
DeviceArray of shape (8, 257): every row starts with 16384 (BOS), then token 10042 repeated, ending in 9570

[code cell, execution_count 16]
greedy_output[0][0]

[result]
DeviceArray([16384, 10042, 10042, ..., 10042, 10042, 9570], dtype=int32)  # 255 copies of 10042 between BOS and the final 9570
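Note: every prompt collapses to the same repeated token (10042) under greedy search. Sampling usually breaks such loops; a sketch, assuming this transformers version's Flax `generate` supports sampling with an explicit PRNG key:

    import jax
    sampled = model.generate(
        input_ids_test['input_ids'],
        max_length=257,
        do_sample=True,                    # sample instead of greedy argmax
        top_k=50,                          # restrict to the 50 most likely tokens per step
        prng_key=jax.random.PRNGKey(0),    # JAX needs an explicit random key
    )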
[markdown cell]
# VQGAN Jax

[code cell, execution_count 17]
import io

import requests
from PIL import Image
import numpy as np

import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode

[code cell, execution_count 18]
from modeling_flax_vqgan import VQModel

[code cell, execution_count 19]
def custom_to_pil(x):
    x = np.clip(x, 0., 1.)
    x = (255*x).astype(np.uint8)
    x = Image.fromarray(x)
    if not x.mode == "RGB":
        x = x.convert("RGB")
    return x
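Note: `custom_to_pil` expects a float array in [0, 1], laid out height x width x channels. A toy usage, with a random array standing in for a decoded image:

    import numpy as np
    dummy = np.random.rand(256, 256, 3)   # illustrative input only
    img = custom_to_pil(dummy)            # PIL.Image, mode RGB, 256x256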
[code cell, execution_count 20]
model = VQModel.from_pretrained("flax-community/vqgan_f16_16384")

[output: stdout]
Working with z of shape (1, 256, 16, 16) = 65536 dimensions.

[code cell, execution_count 21]
def get_images(indices, model):
    indices = indices[:, 1:]
    print(indices.shape)
    img = model.decode_code(indices)
    return img
[code cell, execution_count 22]
custom_to_pil(np.asarray(get_images(jnp.expand_dims(greedy_output[0][0],0), model)[0]))

[output: stdout]
(1, 256)
Working with z of shape (1, 256, 16, 16) = 65536 dimensions.

[result]
[image output: PIL.Image.Image, mode RGB, size 256x256]

[notebook metadata: accelerator TPU, machine_shape hm, colab name "CustomBARTv4b-model-generate.ipynb", kernel Python 3, Python 3.8.8, nbformat 4, nbformat_minor 1]
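Note: taken together, the cells above amount to a text-to-image pipeline: tokenize the caption with the BART tokenizer, generate BOS plus 256 image tokens with the custom BART, drop the BOS, and decode the remaining 256 codebook indices with the VQGAN. A condensed sketch under those same definitions; the notebook reuses the name `model` for both networks, so the `bart` and `vqgan` parameter names here are mine:

    import numpy as np

    def text_to_image(caption, bart, vqgan, tokenizer):
        inputs = tokenizer([caption], return_tensors='jax')
        tokens = bart.generate(inputs['input_ids'], max_length=257)[0]  # (1, 257): BOS + 256 image tokens
        pixels = vqgan.decode_code(tokens[:, 1:])                       # drop BOS, decode the codebook ids
        return custom_to_pil(np.asarray(pixels[0]))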
encoding/vqgan-jax-encoding-with-captions.ipynb
DELETED
@@ -1,363 +0,0 @@
# vqgan-jax-encoding-with-captions

Notebook based on [vqgan-jax-reconstruction](https://colab.research.google.com/drive/1mdXXsMbV6K_LTvCh3IImRsFIWcKU5m1w?usp=sharing) by @surajpatil.

We process a `tsv` file with `image_file` and `caption` fields, and add a `vqgan_indices` column with indices extracted from a VQGAN-JAX model.

In [1]:
import io

import requests
from PIL import Image
import numpy as np
from tqdm import tqdm

import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
from torch.utils.data import Dataset, DataLoader

import jax
from jax import pmap

## VQGAN-JAX model

`dalle_mini` is a local package that contains the VQGAN-JAX model and other utilities.

In [2]:
from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel

We'll use a VQGAN trained with Taming Transformers and converted to a JAX model.

In [3]:
model = VQModel.from_pretrained("flax-community/vqgan_f16_16384")

[output: download progress bars for the config (433 B) and weights (304 MB); INFO:absl messages noting that no local TPU driver or GPU backend was found; then "Working with z of shape (1, 256, 16, 16) = 65536 dimensions."]

## Dataset

We use Luke Melas-Kyriazi's `dataset.py`, which reads image paths and captions from a tsv file that contains both. We only need the images for encoding.

In [4]:
from dalle_mini.dataset import *

In [5]:
cc12m_images = '/data/CC12M/images'
cc12m_list = '/data/CC12M/images-list-clean.tsv'
# cc12m_list = '/data/CC12M/images-10000.tsv'
cc12m_output = '/data/CC12M/images-encoded.tsv'

In [6]:
image_size = 256
def image_transform(image):
    s = min(image.size)
    r = image_size / s
    s = (round(r * image.size[1]), round(r * image.size[0]))
    image = TF.resize(image, s, interpolation=InterpolationMode.LANCZOS)
    image = TF.center_crop(image, output_size = 2 * [image_size])
    image = torch.unsqueeze(T.ToTensor()(image), 0)
    image = image.permute(0, 2, 3, 1).numpy()
    return image
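The transform above resizes the shorter side to 256, center-crops to 256x256, and returns a channels-last `(1, 256, 256, 3)` float array in `[0, 1]`, which is the layout the Flax VQGAN expects. A quick sanity check on a synthetic image (the test image and its size are made up; the notebook's torchvision imports are assumed to be in scope):

```python
from PIL import Image

img = Image.new("RGB", (640, 480), color=(128, 64, 32))  # hypothetical test image
out = image_transform(img)
print(out.shape, out.dtype)  # (1, 256, 256, 3) float32, values in [0, 1]
```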
In [7]:
dataset = CaptionDataset(
    images_root=cc12m_images,
    captions_path=cc12m_list,
    image_transform=image_transform,
    image_transform_type='torchvision',
    include_captions=False
)

In [8]:
len(dataset)
Out[8]: 8592141

## Encoding

In [9]:
def encode(model, batch):
#     print("jitting encode function")
    _, indices = model.encode(batch)
    return indices

In [10]:
def superbatch_generator(dataloader, num_tpus):
    iter_loader = iter(dataloader)
    for batch in iter_loader:
        superbatch = [batch.squeeze(1)]
        try:
            for b in range(num_tpus-1):
                batch = next(iter_loader)
                if batch is None:
                    break
                # Skip incomplete last batch
                if batch.shape[0] == dataloader.batch_size:
                    superbatch.append(batch.squeeze(1))
        except StopIteration:
            pass
        superbatch = torch.stack(superbatch, axis=0)
        yield superbatch
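`superbatch_generator` stacks up to `num_tpus` consecutive dataloader batches into one array of shape `(num_tpus, batch_size, 256, 256, 3)`, so `jax.pmap` can split the leading axis across devices, one batch per TPU core. A minimal, self-contained sketch of that dispatch pattern on random data:

```python
import jax
import jax.numpy as jnp
import numpy as np

n_dev = jax.local_device_count()  # 8 on a TPU v3-8 host, 1 on CPU
superbatch = np.random.rand(n_dev, 4, 256, 256, 3).astype(np.float32)

# pmap maps over axis 0: each device receives one (4, 256, 256, 3) slice
p_fn = jax.pmap(lambda b: jnp.mean(b, axis=(1, 2, 3)))
print(p_fn(superbatch).shape)  # (n_dev, 4)
```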
In [11]:
import os
import pandas as pd  # used below; missing from the original cell's imports

def encode_captioned_dataset(dataset, output_tsv, batch_size=32, num_workers=16):
    if os.path.isfile(output_tsv):
        print(f"Destination file {output_tsv} already exists, please move away.")
        return

    num_tpus = 8
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
    superbatches = superbatch_generator(dataloader, num_tpus=num_tpus)

    p_encoder = pmap(lambda batch: encode(model, batch))

    # We save each superbatch to avoid reallocation of buffers as we process them.
    # We keep the file open to prevent excessive file seeks.
    with open(output_tsv, "w") as file:
        iterations = len(dataset) // (batch_size * num_tpus)
        for n in tqdm(range(iterations)):
            superbatch = next(superbatches)
            encoded = p_encoder(superbatch.numpy())
            encoded = encoded.reshape(-1, encoded.shape[-1])

            # Extract fields from the dataset internal `captions` property, and save to disk
            start_index = n * batch_size * num_tpus
            end_index = (n+1) * batch_size * num_tpus
            paths = dataset.captions["image_file"][start_index:end_index].values
            captions = dataset.captions["caption"][start_index:end_index].values
            encoded_as_string = list(map(lambda item: np.array2string(item, separator=',', max_line_width=50000, formatter={'int':lambda x: str(x)}), encoded))
            batch_df = pd.DataFrame.from_dict({"image_file": paths, "caption": captions, "encoding": encoded_as_string})
            batch_df.to_csv(file, sep='\t', header=(n==0), index=None)

In [ ]:
encode_captioned_dataset(dataset, cc12m_output, batch_size=64, num_workers=16)

[output: progress bar at interruption -- 4%|██▋ | 621/16781 [07:09<3:02:46, 1.47it/s]]

----

[notebook metadata: kernel "Python 3 (ipykernel)", Python 3.8.10, nbformat 4.5]
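Each `encoding` cell in the output tsv is written with `np.array2string`, so it comes back from `read_csv` as a literal string such as `"[532,7,1023,...]"` holding 256 indices. A sketch for parsing them back into arrays (the parsing helper is ours, not part of the repo):

```python
import ast
import numpy as np
import pandas as pd

df = pd.read_csv("/data/CC12M/images-encoded.tsv", sep="\t")
# parse each bracketed, comma-separated string back into an int array
df["encoding"] = df["encoding"].apply(lambda s: np.array(ast.literal_eval(s), dtype=np.int32))
print(df["encoding"].iloc[0].shape)  # (256,)
```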
363 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
encoding/vqgan-jax-encoding-yfcc100m.ipynb
DELETED
@@ -1,1136 +0,0 @@
# vqgan-jax-encoding-yfcc100m

Same as `vqgan-jax-encoding-with-captions`, but for YFCC100M.

This dataset was prepared by @borisdayma in JSON Lines format.

In [92]:
import io

import requests
from PIL import Image
import numpy as np
from tqdm import tqdm

import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets.folder import default_loader
import os

import jax
from jax import pmap

## VQGAN-JAX model

`dalle_mini` is a local package that contains the VQGAN-JAX model and other utilities.

In [93]:
from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel

We'll use a VQGAN trained with Taming Transformers and converted to a JAX model.

In [167]:
model = VQModel.from_pretrained("flax-community/vqgan_f16_16384")

[output: "Working with z of shape (1, 256, 16, 16) = 65536 dimensions."]

## Dataset

In [94]:
import pandas as pd
from pathlib import Path

In [134]:
yfcc100m = Path('/home/khali/TPU-Test/YFCC100M_OpenAI_subset')
# Images are 'sharded' from the following directory
yfcc100m_images = yfcc100m/'data'/'data'/'images'
yfcc100m_metadata = yfcc100m/'metadata_YFCC100M.jsonl'
yfcc100m_output = yfcc100m/'metadata_encoded.tsv'

### Cleanup

We need to select entries whose images exist; otherwise we can't build batches, because `DataLoader` does not support `None` in batches. We use Hugging Face Datasets; I understand they support threaded reading of jsonl files, and I was running out of memory when using pandas.

In [96]:
import datasets
from datasets import Dataset, load_dataset

In [10]:
# The metadata is too big to load into memory at once, so we chop it into chunks
chunk_size=1000000
batch_no=1
for chunk in pd.read_json(yfcc100m_metadata, orient="records", lines=True, chunksize=chunk_size):
    chunk.to_csv('./chunks/chunk'+str(batch_no)+'.tsv', sep="\t", index=False)
    batch_no+=1

[stderr: repeated tcmalloc "large alloc" warnings of roughly 1.2-5 GB while the jsonl chunks are read]
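The markdown above mentions Hugging Face Datasets for memory-friendly jsonl reading, but this cell still chunks through pandas. For reference, a hedged sketch of the Datasets route, which keeps the parsed data memory-mapped through Arrow instead of holding it all in RAM (assuming the jsonl parses cleanly):

```python
from datasets import load_dataset

# Arrow-backed and memory-mapped, so RAM use stays modest even for very large files
metadata = load_dataset("json", data_files=str(yfcc100m_metadata), split="train")
print(len(metadata), metadata.column_names[:5])
```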
In [25]:
# looking at a chunk
pd.read_csv("./chunks/chunk1.tsv", sep="\t")

[output: a 1000000-row x 26-column DataFrame preview with columns photoid, uid, unickname, datetaken, dateuploaded, capturedevice, title, description, usertags, machinetags, ..., licenseurl, serverid, farmid, secret, secretoriginal, ext, marker, key, title_clean, description_clean; e.g. row 0 has key d29e7c6a3028418c64eb15e3cf577c2, title_clean "A Picture Share!", description_clean "Antenna"]

In [98]:
# Looking at a chunk with only the relevant columns that we need
df = pd.read_csv("./chunks/chunk1.tsv", sep="\t")[["key", "title_clean", "description_clean", "ext"]]
df.head()

[output: the first five rows restricted to key, title_clean, description_clean, ext]

### Grab each chunk from the folder, clean it up, keep only the entries whose image exists, and append it to the global df

In [ ]:
# helper that decides whether the image with a given id exists in storage; we only keep the ones we have images for
def image_exists(item):
    name, _, _, ext, _ = item
    root=str(yfcc100m_images)
    image_path = (Path(root)/name[0:3]/name[3:6]/name).with_suffix("."+ext)
    if image_path.exists():
        return True
    else:
        return None

In [86]:
# This cell does it all: grabs each chunk, cleans it up based on the image-existence condition, etc.
global_df = pd.DataFrame()
chunks_dir = "./chunks"
for filename in os.listdir(chunks_dir):
    df = pd.read_csv(f"./chunks/{str(filename)}", sep="\t")[["key", "title_clean", "description_clean", "ext"]]
    df['caption'] = df["title_clean"]+". "+df['description_clean']
    df['is_exist'] = df.apply(image_exists, axis=1)
    df = df.dropna()[["key", "caption"]]
    df.columns = ['image_file', 'caption']
    global_df = global_df.append(df, ignore_index=True)
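Two notes on the cell above. `image_exists` returns `None` rather than `False` for missing images precisely so that `dropna()` can filter those rows out. Also, `DataFrame.append` inside a loop copies the accumulated frame on every iteration (and has since been removed from pandas); an equivalent sketch that collects the chunks in a list and concatenates once:

```python
import os
import pandas as pd

frames = []
for filename in os.listdir("./chunks"):
    df = pd.read_csv(f"./chunks/{filename}", sep="\t")[["key", "title_clean", "description_clean", "ext"]]
    df["caption"] = df["title_clean"] + ". " + df["description_clean"]
    df["is_exist"] = df.apply(image_exists, axis=1)  # notebook helper defined above
    df = df.dropna()[["key", "caption"]]
    df.columns = ["image_file", "caption"]
    frames.append(df)
global_df = pd.concat(frames, ignore_index=True)  # one concat instead of repeated append
```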
In [89]:
# saving the tsv to disk
global_df.to_csv('./chunks/YFCC_subset_clean.tsv', sep="\t", index=False)

In [101]:
# loading the tsv back from disk (for explicitness; also, my electricity went out -- glad it happened after I had saved to disk)
dataset = pd.read_csv(f"./chunks/YFCC_subset_clean.tsv", sep="\t")

In [153]:
"""
Luke Melas-Kyriazi's dataset.py, modified for YFCC
"""
import warnings
from typing import Optional, Callable
from pathlib import Path
import numpy as np
import torch
import pandas as pd
from torch.utils.data import Dataset
from torchvision.datasets.folder import default_loader
from PIL import ImageFile
from PIL.Image import DecompressionBombWarning
ImageFile.LOAD_TRUNCATED_IMAGES = True
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DecompressionBombWarning)


class CaptionDataset(Dataset):
    """
    A PyTorch Dataset class for (image, texts) tasks. Note that this dataset
    returns the raw text rather than tokens. This is done on purpose, because
    it's easy to tokenize a batch of text after loading it from this dataset.
    """

    def __init__(self, *, images_root: str, captions_path: str, text_transform: Optional[Callable] = None,
                 image_transform: Optional[Callable] = None, image_transform_type: str = 'torchvision',
                 include_captions: bool = True):
        """
        :param images_root: folder where images are stored
        :param captions_path: path to csv that maps image filenames to captions
        :param image_transform: image transform pipeline
        :param text_transform: text transform pipeline
        :param image_transform_type: image transform type, either `torchvision` or `albumentations`
        :param include_captions: returns a dictionary with `image` and `text` if `True`; otherwise returns just the images
        """

        # Base path for images
        self.images_root = Path(images_root)

        # Load captions as DataFrame
        self.captions = pd.read_csv(f"./chunks/YFCC_subset_clean.tsv", sep="\t")
        self.captions['image_file'] = self.captions['image_file'].astype(str)

        # PyTorch transformation pipeline for the image (normalizing, etc.)
        self.text_transform = text_transform
        self.image_transform = image_transform
        self.image_transform_type = image_transform_type.lower()
        assert self.image_transform_type in ['torchvision', 'albumentations']

        # Total number of datapoints
        self.size = len(self.captions)

        # Return image+captions or just images
        self.include_captions = include_captions

    def image_exists(item):
        name, caption = item
        root=str(self.images_root)
        image_path = (Path(root)/name[0:3]/name[3:6]/name).with_suffix(".jpg")

        return image_path.exists()

    def verify_that_all_images_exist(self):
        for image_file in self.captions['image_file']:
            if not image_exists:
                print(f'file does not exist: {p}')

    def _get_raw_image(self, i):
        name = self.captions.iloc[i]['image_file']
        image_path = (Path(self.images_root)/name[0:3]/name[3:6]/name).with_suffix(".jpg")
        image = default_loader(image_path)
        return image

    def _get_raw_text(self, i):
        return self.captions.iloc[i]['caption']

    def __getitem__(self, i):
        image = self._get_raw_image(i)
        caption = self._get_raw_text(i)
        if self.image_transform is not None:
            if self.image_transform_type == 'torchvision':
                image = self.image_transform(image)
            elif self.image_transform_type == 'albumentations':
                image = self.image_transform(image=np.array(image))['image']
            else:
                raise NotImplementedError(f"{self.image_transform_type=}")
        return {'image': image, 'text': caption} if self.include_captions else image

    def __len__(self):
        return self.size


if __name__ == "__main__":
    import albumentations as A
    from albumentations.pytorch import ToTensorV2
    from transformers import AutoTokenizer

    images_root = "/home/khali/TPU-Test/YFCC100M_OpenAI_subset/data/data/images"
    captions_path = './YFCC_subset_clean.tsv'
    image_size = 256

    # Create transforms
    def image_transform(image):
        s = min(image.size)
        r = image_size / s
        s = (round(r * image.size[1]), round(r * image.size[0]))
        image = TF.resize(image, s, interpolation=InterpolationMode.LANCZOS)
        image = TF.center_crop(image, output_size = 2 * [image_size])
        image = torch.unsqueeze(T.ToTensor()(image), 0)
        image = image.permute(0, 2, 3, 1).numpy()
        return image

    # Create dataset
    dataset = CaptionDataset(
        images_root=images_root,
        captions_path=captions_path,
        image_transform=image_transform,
        image_transform_type='torchvision',
        include_captions=False
    )
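Two bugs in this deleted class are worth flagging: `image_exists` is defined without `self` (yet uses it), and `verify_that_all_images_exist` tests the function object itself, which is always truthy, then prints an undefined `p`. A corrected sketch of the apparent intent (the subclass name is ours, for illustration):

```python
from pathlib import Path

class CaptionDatasetChecked(CaptionDataset):  # hypothetical name
    def image_exists(self, name: str) -> bool:
        # same sharded layout: first 3 chars / next 3 chars / full name
        image_path = (self.images_root / name[0:3] / name[3:6] / name).with_suffix(".jpg")
        return image_path.exists()

    def verify_that_all_images_exist(self):
        for image_file in self.captions["image_file"]:
            if not self.image_exists(image_file):
                print(f"file does not exist: {image_file}")
```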
In [155]:
len(dataset)
Out[155]: 2483316

In [156]:
dataloader = DataLoader(dataset, batch_size=32, num_workers=4)

In [1]:
# looking at a batch
next(iter(dataloader))

In [ ]:
# import matplotlib.pyplot as plt
# for tensor_image, _ in dataloader:
#     print(tensor_image)
#     plt.imshow(tensor_image.permute(1, 2, 0))
#     break

## Encoding

In [158]:
def encode(model, batch):
#     print("jitting encode function")
    _, indices = model.encode(batch)
    return indices

In [160]:
def superbatch_generator(dataloader, num_tpus):
    iter_loader = iter(dataloader)
    for batch in iter_loader:
        superbatch = [batch.squeeze(1)]
        try:
            for b in range(num_tpus-1):
                batch = next(iter_loader)
                if batch is None:
                    break
                # Skip incomplete last batch
                if batch.shape[0] == dataloader.batch_size:
                    superbatch.append(batch.squeeze(1))
        except StopIteration:
            pass
        superbatch = torch.stack(superbatch, axis=0)
        yield superbatch

In [170]:
import os

def encode_captioned_dataset(dataset, output_tsv, batch_size=32, num_workers=16):
    if os.path.isfile(output_tsv):
        print(f"Destination file {output_tsv} already exists, please move away.")
        return

    num_tpus = 8
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
    superbatches = superbatch_generator(dataloader, num_tpus=num_tpus)

    p_encoder = pmap(lambda batch: encode(model, batch))

    # We save each superbatch to avoid reallocation of buffers as we process them.
    # We keep the file open to prevent excessive file seeks.
    with open(output_tsv, "w") as file:
        iterations = len(dataset) // (batch_size * num_tpus)
        for n in tqdm(range(iterations)):
            superbatch = next(superbatches)
            encoded = p_encoder(superbatch.numpy())
            encoded = encoded.reshape(-1, encoded.shape[-1])

            # Extract fields from the dataset internal `captions` property, and save to disk
            start_index = n * batch_size * num_tpus
            end_index = (n+1) * batch_size * num_tpus
            paths = dataset.captions["image_file"][start_index:end_index].values
            captions = dataset.captions["caption"][start_index:end_index].values
-
" encoded_as_string = list(map(lambda item: np.array2string(item, separator=',', max_line_width=50000, formatter={'int':lambda x: str(x)}), encoded))\n",
|
1082 |
-
" batch_df = pd.DataFrame.from_dict({\"image_file\": paths, \"caption\": captions, \"encoding\": encoded_as_string})\n",
|
1083 |
-
" batch_df.to_csv(file, sep='\\t', header=(n==0), index=None)"
|
1084 |
-
]
|
1085 |
-
},
|
1086 |
-
{
|
1087 |
-
"cell_type": "code",
|
1088 |
-
"execution_count": 171,
|
1089 |
-
"id": "7704863d",
|
1090 |
-
"metadata": {},
|
1091 |
-
"outputs": [
|
1092 |
-
{
|
1093 |
-
"name": "stderr",
|
1094 |
-
"output_type": "stream",
|
1095 |
-
"text": [
|
1096 |
-
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4850/4850 [2:27:51<00:00, 1.83s/it]\n"
|
1097 |
-
]
|
1098 |
-
}
|
1099 |
-
],
|
1100 |
-
"source": [
|
1101 |
-
"encode_captioned_dataset(dataset, yfcc100m_output, batch_size=64, num_workers=16)"
|
1102 |
-
]
|
1103 |
-
},
|
1104 |
-
{
|
1105 |
-
"cell_type": "markdown",
|
1106 |
-
"id": "8953dd84",
|
1107 |
-
"metadata": {},
|
1108 |
-
"source": [
|
1109 |
-
"----"
|
1110 |
-
]
|
1111 |
-
}
|
1112 |
-
],
|
1113 |
-
"metadata": {
|
1114 |
-
"kernelspec": {
|
1115 |
-
"name": "python3",
|
1116 |
-
"display_name": "Python 3.9.0 64-bit ('Python39')"
|
1117 |
-
},
|
1118 |
-
"language_info": {
|
1119 |
-
"codemirror_mode": {
|
1120 |
-
"name": "ipython",
|
1121 |
-
"version": 3
|
1122 |
-
},
|
1123 |
-
"file_extension": ".py",
|
1124 |
-
"mimetype": "text/x-python",
|
1125 |
-
"name": "python",
|
1126 |
-
"nbconvert_exporter": "python",
|
1127 |
-
"pygments_lexer": "ipython3",
|
1128 |
-
"version": "3.9.0"
|
1129 |
-
},
|
1130 |
-
"interpreter": {
|
1131 |
-
"hash": "db471c52d602b4f5f40ecaf278e88ccfef85c29d0a1a07185b0d51fc7acf4e26"
|
1132 |
-
}
|
1133 |
-
},
|
1134 |
-
"nbformat": 4,
|
1135 |
-
"nbformat_minor": 5
|
1136 |
-
}
|
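Note: the notebook deleted above parallelizes VQGAN encoding by stacking one DataLoader batch per TPU core into a "superbatch" and mapping the encode function over the leading axis with `pmap`. A minimal, self-contained sketch of that pattern (the `model` argument is assumed to expose `encode(pixels) -> (quantized, indices)`, as the Flax VQGAN wrapper used here does; `batches` and `encode_superbatch` are illustrative names, not part of the original code):

import jax
import numpy as np

def encode_superbatch(model, batches, n_devices):
    # pmap replicates the function across devices; the leading axis of the
    # input must equal the number of participating devices.
    p_encode = jax.pmap(lambda b: model.encode(b)[1])

    # One dataloader batch per device: (n_devices, batch_size, H, W, C).
    superbatch = np.stack([next(batches) for _ in range(n_devices)])

    # Runs on all devices in parallel -> (n_devices, batch_size, 256).
    indices = p_encode(superbatch)

    # Drop the device axis before saving: (n_devices * batch_size, 256).
    return np.asarray(indices).reshape(-1, indices.shape[-1])

The `superbatch_generator` above does the stacking incrementally and drops incomplete batches, which is why the TSV writer computes its row range from `batch_size * num_tpus`.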
encoding/vqgan-jax-encoding.ipynb
DELETED
The diff for this file is too large to render. See raw diff.
environment.yaml
DELETED
@@ -1,10 +0,0 @@
-name: dalle
-channels:
-  - defaults
-dependencies:
-  - python=3.9.5
-  - pip=21.1.3
-  - ipython=7.22.0
-  - cudatoolkit
-  - pip:
-    - -r requirements.txt
img/logo.png
ADDED
model/data-pipeline.ipynb
DELETED
@@ -1,385 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "bf8fb38a",
-   "metadata": {},
-   "source": [
-    "# Data Pipeline"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "9b83dcb9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dataclasses import dataclass, field\n",
-    "from pathlib import Path\n",
-    "\n",
-    "import datasets\n",
-    "from datasets import Dataset, load_dataset\n",
-    "import numpy as np\n",
-    "\n",
-    "from transformers import BartTokenizer\n",
-    "\n",
-    "from tqdm import tqdm\n",
-    "\n",
-    "import jax\n",
-    "import jax.numpy as jnp\n",
-    "\n",
-    "from flax.training.common_utils import shard"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a661a89e",
-   "metadata": {},
-   "source": [
-    "File containing image paths, captions and VQGAN-encoded indices."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "0e84e889",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "datafile = '/data/CC12M/images-encoded-10000.tsv'  # 9999 encoded images from CC12M"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7fdc640b",
-   "metadata": {},
-   "source": [
-    "TODO: generate train/test splits if necessary."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "cc6789b4",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration default-91833df78e844785\n",
-      "Reusing dataset csv (/home/pedro/.cache/huggingface/datasets/csv/default-91833df78e844785/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)\n"
-     ]
-    }
-   ],
-   "source": [
-    "dataset = load_dataset('csv', delimiter='\\t', data_files=[datafile])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "f3ed4919",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "DatasetDict({\n",
-       "    train: Dataset({\n",
-       "        features: ['image_file', 'caption', 'encoding'],\n",
-       "        num_rows: 9999\n",
-       "    })\n",
-       "})"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "a70c7354",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Dataset({\n",
-       "    features: ['image_file', 'caption', 'encoding'],\n",
-       "    num_rows: 9999\n",
-       "})"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset = dataset[\"train\"]\n",
-    "dataset"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a73454cf",
-   "metadata": {},
-   "source": [
-    "We don't really need the `image_file` field for training. We'll drop it during pre-processing because we won't be able to numericalize it to a `jnp.array`, which would be required in JAX."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7c0fa992",
-   "metadata": {},
-   "source": [
-    "## Preprocessing"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a0e36582",
-   "metadata": {},
-   "source": [
-    "The `encoding` field contains a string representation of the encoded indices. We'll convert them to numbers. We also need to tokenize the captions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "d46f6ac5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Setting padding=\"max_length\" as we need fixed length inputs for jitted functions\n",
-    "max_length = 256  # Read from data_args.max_source_length\n",
-    "tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
-    "image_bos = 16384  # Max token is 16383 in our VQGAN configuration"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "4cac6643",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def preprocess_function(examples):\n",
-    "    inputs = examples[\"caption\"]\n",
-    "#     inputs = [prefix + inp for inp in inputs]  # Do we need this?\n",
-    "    model_inputs = tokenizer(\n",
-    "        inputs, max_length=max_length, padding=\"max_length\", truncation=True, return_tensors=\"np\"\n",
-    "    )\n",
-    "\n",
-    "    model_inputs[\"labels\"] = [[image_bos] + eval(indices) for indices in examples['encoding']]\n",
-    "\n",
-    "    return model_inputs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "e6a4cb91",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "num_workers = 48  # We have 96 processors in the TPU\n",
-    "column_names = dataset.column_names\n",
-    "input_dataset = dataset.map(preprocess_function,\n",
-    "                            remove_columns=column_names,\n",
-    "                            batched=True,\n",
-    "                            num_proc=48\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "a9b1b467",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):\n",
-    "    \"\"\"\n",
-    "    Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.\n",
-    "    Shuffle batches if `shuffle` is `True`.\n",
-    "    \"\"\"\n",
-    "    steps_per_epoch = len(dataset) // batch_size\n",
-    "\n",
-    "    if shuffle:\n",
-    "        batch_idx = jax.random.permutation(rng, len(dataset))\n",
-    "    else:\n",
-    "        batch_idx = jnp.arange(len(dataset))\n",
-    "\n",
-    "    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.\n",
-    "    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))\n",
-    "\n",
-    "    for idx in batch_idx:\n",
-    "        batch = dataset[idx]\n",
-    "        batch = {k: jnp.array(v) for k, v in batch.items()}\n",
-    "        batch = shard(batch)\n",
-    "        yield batch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "0a628505",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:absl:Starting the local TPU driver.\n",
-      "INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://\n",
-      "INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: \"cuda\". Available platform names are: Host TPU Interpreter\n"
-     ]
-    }
-   ],
-   "source": [
-    "rng = jax.random.PRNGKey(23)  # Use training_args.seed\n",
-    "batch_size = 64  # Per device\n",
-    "super_batch_size = batch_size * jax.device_count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "b3a5ce7d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = data_loader(rng, input_dataset, batch_size=super_batch_size)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "67aa8f9c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "superbatch = next(iter(loader))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "7cd99402",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "dict_keys(['attention_mask', 'input_ids', 'labels'])"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "superbatch.keys()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "652a4a9e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "8"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(superbatch[\"labels\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "de7de4e8",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(8, 64, 257)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "superbatch[\"labels\"].shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6800153b",
-   "metadata": {},
-   "source": [
-    "Any image sequence should begin with `image_bos`:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "cfe23a71",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "assert superbatch[\"labels\"][1][5][0].item() == image_bos"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0fb899b4",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
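Note: `flax.training.common_utils.shard`, used in the `data_loader` above, simply reshapes every array in the batch from (batch, ...) to (local_devices, batch // local_devices, ...) so `pmap` can scatter one slice per device. A small shape-only illustration, assuming the batch size divides evenly across the local devices (array contents are dummies):

import jax
import jax.numpy as jnp
from flax.training.common_utils import shard

n_devices = jax.local_device_count()
batch_size = 64 * n_devices  # must be divisible by the device count

batch = {
    "input_ids": jnp.zeros((batch_size, 128), dtype=jnp.int32),
    "labels": jnp.zeros((batch_size, 257), dtype=jnp.int32),  # 256 image tokens + bos
}

sharded = shard(batch)
# Every leaf gains a leading device axis, matching the (8, 64, 257)
# shape printed in the notebook above:
print(jax.tree_util.tree_map(lambda x: x.shape, sharded))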
pyproject.toml
ADDED
@@ -0,0 +1,2 @@
+[tool.isort]
+profile = "black"
requirements.txt
DELETED
@@ -1,9 +0,0 @@
-# Note: install with the following command:
-# pip install -r requirements.txt -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
-# Otherwise it won't find the appropriate libtpu_nightly
-requests
-jax[tpu]>=0.2.16
--e git+https://github.com/huggingface/transformers.git@master#egg=transformers
--e git+https://github.com/huggingface/datasets.git@master#egg=datasets
-flax
-jupyter
seq2seq/do_big_run.sh
DELETED
@@ -1,16 +0,0 @@
-python run_seq2seq_flax.py \
-    --max_source_length 128 \
-    --train_file /data/CC12M/encoded-small-train.tsv \ # ignored for now in our script
-    --validation_file /data/CC12M/encoded-small-valid.tsv \ # ignored for now in our script
-    --output_dir output \
-    --per_device_train_batch_size 56 \
-    --per_device_eval_batch_size 56 \
-    --preprocessing_num_workers 80 \
-    --warmup_steps 125 \
-    --gradient_accumulation_steps 8 \
-    --do_train \
-    --do_eval \
-    --adafactor \
-    --num_train_epochs 10 \
-    --log_model \
-    --learning_rate 0.001
seq2seq/do_small_run.sh
DELETED
@@ -1,16 +0,0 @@
-python run_seq2seq_flax.py \
-    --max_source_length 128 \
-    --train_file /data/CC12M/encoded-small-train.tsv \ # ignored for now in our script
-    --validation_file /data/CC12M/encoded-small-valid.tsv \ # ignored for now in our script
-    --output_dir output \
-    --per_device_train_batch_size 56 \
-    --per_device_eval_batch_size 56 \
-    --preprocessing_num_workers 80 \
-    --warmup_steps 125 \
-    --gradient_accumulation_steps 8 \
-    --do_train \
-    --do_eval \
-    --adafactor \
-    --num_train_epochs 1 \
-    --max_train_samples 20000 \
-    --learning_rate 0.003
seq2seq/requirements.txt
DELETED
@@ -1,8 +0,0 @@
-datasets >= 1.1.3
-jax>=0.2.8
-jaxlib>=0.1.59
-flax>=0.3.4
-optax>=0.0.8
-tensorboard
-nltk
-wandb
seq2seq/run_seq2seq_flax.py
DELETED
@@ -1,897 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Team All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for seq2seq, text to image.
-Script adapted from run_summarization_flax.py
-"""
-# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
-
-import os
-# set a common huggingface cache folder (used with datasets and transformers) and wandb cache folder (used with artifacts)
-os.environ['HF_HOME'] = '/data/huggingface/'  # required before importing transformers & datasets
-os.environ['WANDB_CACHE_DIR'] = '/data/wandb/'  # required before importing wandb
-
-import logging as pylogging  # To avoid collision with transformers.utils.logging
-import sys
-import time
-from dataclasses import dataclass, field
-from functools import partial
-from pathlib import Path
-from typing import Callable, Optional
-
-import datasets
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import Dataset, load_dataset, load_metric
-from tqdm import tqdm
-
-import jax
-import jax.numpy as jnp
-import optax
-import transformers
-from filelock import FileLock
-from flax import jax_utils, traverse_util
-import flax.linen as nn
-from flax.jax_utils import unreplicate
-from flax.training import train_state
-from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
-from transformers import (
-    CONFIG_MAPPING,
-    FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-    AutoConfig,
-    AutoTokenizer,
-    FlaxAutoModelForSeq2SeqLM,
-    FlaxBartForConditionalGeneration,
-    HfArgumentParser,
-    TrainingArguments,
-)
-from transformers.models.bart.modeling_flax_bart import *
-from transformers.file_utils import is_offline_mode
-
-import wandb
-
-logger = pylogging.getLogger(__name__)
-
-try:
-    nltk.data.find("tokenizers/punkt")
-except (LookupError, OSError):
-    if is_offline_mode():
-        raise LookupError(
-            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
-        )
-    with FileLock(".lock") as lock:
-        nltk.download("punkt", quiet=True)
-
-
-MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-# Model hyperparameters, for convenience
-OUTPUT_VOCAB_SIZE = 16384 + 1  # encoded image token space + 1 for bos
-OUTPUT_LENGTH = 256 + 1  # number of encoded tokens + 1 for bos
-BOS_TOKEN_ID = 16384
-BASE_MODEL = 'facebook/bart-large-cnn'  # we currently have issues with bart-large
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
-    """
-
-    model_name_or_path: Optional[str] = field(
-        default=BASE_MODEL,
-        metadata={
-            "help": "The model checkpoint for weights initialization."
-            "Don't set if you want to train a model from scratch."
-        },
-    )
-    model_type: Optional[str] = field(
-        default=None,
-        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
-    )
-    use_fast_tokenizer: bool = field(
-        default=True,
-        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
-    )
-    dtype: Optional[str] = field(
-        default="float32",
-        metadata={
-            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
-        },
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    dataset_name: Optional[str] = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    text_column: Optional[str] = field(
-        default='caption',
-        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
-    )
-    encoding_column: Optional[str] = field(
-        default='encoding',
-        metadata={"help": "The name of the column in the datasets containing the image encodings."},
-    )
-    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
-    )
-    test_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input predict data file to do prediction on (a text file)."},
-    )
-    max_source_length: Optional[int] = field(
-        default=128,
-        metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    no_decay: bool = field(
-        default=False, metadata={"help": "Whether to use decay in the learning rate scheduler."}
-    )
-    max_target_length: Optional[int] = field(
-        default=OUTPUT_LENGTH,
-        metadata={
-            "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    val_max_target_length: Optional[int] = field(
-        default=OUTPUT_LENGTH,
-        metadata={
-            "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
-            "This argument is also used to override the `max_length` param of `model.generate`, which is used "
-            "during evaluation."
-        },
-    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_eval_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-            "value if set."
-        },
-    )
-    max_predict_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
-            "value if set."
-        },
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=80,  # ensure we have the same datasets cached data and avoid using too much space
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-    source_prefix: Optional[str] = field(
-        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
-    )
-    predict_with_generate: bool = field(
-        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
-    )
-    num_beams: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`, "
-            "which is used during evaluation."
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    log_interval: Optional[int] = field(
-        default=40,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    log_model: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    save_model_steps: Optional[int] = field(
-        default=3000,  # about once every hour in our experiments
-        metadata={
-            "help": "For logging the model more frequently. Used only when `log_model` is set."
-        },
-    )
-
-    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
-        else:
-            if self.train_file is not None:
-                extension = self.train_file.split(".")[-1]
-                assert extension in ["tsv", "csv", "json"], "`train_file` should be a tsv, csv or json file."
-            if self.validation_file is not None:
-                extension = self.validation_file.split(".")[-1]
-                assert extension in ["tsv", "csv", "json"], "`validation_file` should be a tsv, csv or json file."
-        if self.val_max_target_length is None:
-            self.val_max_target_length = self.max_target_length
-
-
-class TrainState(train_state.TrainState):
-    dropout_rng: jnp.ndarray
-    grad_accum: jnp.ndarray
-    optimizer_step: int
-
-    def replicate(self):
-        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
-
-
-class CustomFlaxBartModule(FlaxBartModule):
-    def setup(self):
-        # we keep shared to easily load pre-trained weights
-        self.shared = nn.Embed(
-            self.config.vocab_size,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
-            dtype=self.dtype,
-        )
-        # a separate embedding is used for the decoder
-        self.decoder_embed = nn.Embed(
-            OUTPUT_VOCAB_SIZE,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
-            dtype=self.dtype,
-        )
-        self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
-
-        # the decoder has a different config
-        decoder_config = BartConfig(self.config.to_dict())
-        decoder_config.max_position_embeddings = OUTPUT_LENGTH
-        decoder_config.min_length = OUTPUT_LENGTH
-        decoder_config.max_length = OUTPUT_LENGTH
-        decoder_config.vocab_size = OUTPUT_VOCAB_SIZE
-        self.decoder = FlaxBartDecoder(decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed)
-
-class CustomFlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationModule):
-    def setup(self):
-        self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
-        self.lm_head = nn.Dense(
-            OUTPUT_VOCAB_SIZE,
-            use_bias=False,
-            dtype=self.dtype,
-            kernel_init=jax.nn.initializers.normal(self.config.init_std, self.dtype),
-        )
-        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, OUTPUT_VOCAB_SIZE))
-
-class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
-    module_class = CustomFlaxBartForConditionalGenerationModule
-
-
-def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):
-    """
-    Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
-    Shuffle batches if `shuffle` is `True`.
-    """
-    steps_per_epoch = len(dataset) // batch_size
-
-    if shuffle:
-        batch_idx = jax.random.permutation(rng, len(dataset))
-    else:
-        batch_idx = jnp.arange(len(dataset))
-
-    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
-    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
-
-    for idx in batch_idx:
-        batch = dataset[idx]
-        batch = {k: jnp.array(v) for k, v in batch.items()}
-
-        batch = shard(batch)
-
-        yield batch
-
-
-def create_learning_rate_fn(
-    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float, no_decay: bool
-) -> Callable[[int], jnp.array]:
-    """Returns a linear warmup, linear_decay learning rate function."""
-    steps_per_epoch = train_ds_size // train_batch_size
-    num_train_steps = steps_per_epoch * num_train_epochs
-    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
-    if no_decay:
-        return warmup_fn
-    decay_fn = optax.linear_schedule(
-        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
-    )
-    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
-    return schedule_fn
-
-
-def wandb_log(metrics, step=None, prefix=None):
-    if jax.process_index() == 0:
-        log_metrics = {f'{prefix}/{k}' if prefix is not None else k: jax.device_get(v) for k,v in metrics.items()}
-        if step is not None:
-            log_metrics['train/step'] = step
-        wandb.log(log_metrics)
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    logger.warning(f"eval_steps has been manually hardcoded")  # TODO: remove it later, convenient for now
-    training_args.eval_steps = 400
-
-    if (
-        os.path.exists(training_args.output_dir)
-        and os.listdir(training_args.output_dir)
-        and training_args.do_train
-        and not training_args.overwrite_output_dir
-    ):
-        raise ValueError(
-            f"Output directory ({training_args.output_dir}) already exists and is not empty."
-            "Use --overwrite_output_dir to overcome."
-        )
-
-    # Set up wandb run
-    wandb.init(
-        entity='wandb',
-        project='hf-flax-dalle-mini',
-        job_type='Seq2SeqVQGAN',
-        config=parser.parse_args()
-    )
-
-    # set default x-axis as 'train/step'
-    wandb.define_metric('train/step')
-    wandb.define_metric('*', step_metric='train/step')
-
-    # Make one log on every process with the configuration for debugging.
-    pylogging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=pylogging.INFO,
-    )
-    # Setup logging, we only want one process per machine to log things on the screen.
-    logger.setLevel(pylogging.INFO if jax.process_index() == 0 else pylogging.ERROR)
-    if jax.process_index() == 0:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    logger.info(f"Training/evaluation parameters {training_args}")
-
-    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
-    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    data_files = {}
-    logger.warning(f"Datasets path have been manually hardcoded")  # TODO: remove it later, convenient for now
-    if data_args.train_file is not None:
-        data_files["train"] = ["/data/CC3M/training-encoded.tsv", "/data/CC12M/encoded-train.tsv"]
-    if data_args.validation_file is not None:
-        data_files["validation"] = ["/data/CC3M/validation-encoded.tsv"]
-    if data_args.test_file is not None:
-        data_files["test"] = data_args.test_file
-    dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir, delimiter="\t")
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model and tokenizer
-    base_model = FlaxAutoModelForSeq2SeqLM.from_pretrained(
-        model_args.model_name_or_path, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
-    )
-
-    # Set up our new model config
-    config = BartConfig.from_pretrained(model_args.model_name_or_path)
-    config.tie_word_embeddings = False
-    config.decoder_start_token_id = BOS_TOKEN_ID
-    config.bos_token_id = BOS_TOKEN_ID  # should not be used
-    config.pos_token_id = BOS_TOKEN_ID  # should not be needed (as we generate until max_length)
-    config.eos_token_id = BOS_TOKEN_ID + 1  # unreachable
-    config.forced_bos_token_id = None  # we don't need this token
-    config.forced_eos_token_id = None  # we don't need this token
-    #config.min_length = data_args.max_target_length  # Set only in decoder?
-    #config.max_length = data_args.max_target_length  # Set only in decoder?
-
-    print(f"TPUs: {jax.device_count()}")
-    assert jax.device_count() == 8, "TPUs in use, please check running processes"
-
-    # Create a custom model and initialize it randomly
-    model = CustomFlaxBartForConditionalGeneration(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
-
-    # Use pre-trained weights for encoder
-    model.params['model']['encoder'] = base_model.params['model']['encoder']
-    model.params['model']['shared'] = base_model.params['model']['shared']
-    del base_model
-
-    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
-
-    # Preprocessing the datasets.
-    # We need to tokenize inputs and targets.
-    if training_args.do_train:
-        column_names = dataset["train"].column_names
-    elif training_args.do_eval:
-        column_names = dataset["validation"].column_names
-    elif training_args.do_predict:
-        column_names = dataset["test"].column_names
-    else:
-        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
-        return
-
-    # Get the column names for input/target.
-    text_column = data_args.text_column
-    encoding_column = data_args.encoding_column
-
-    # Temporarily set max_target_length for training.
-    max_target_length = data_args.max_target_length
-
-    def shift_tokens_right(input_ids: np.array, decoder_start_token_id: int):
-        """
-        Shift input ids one token to the right.
-        """
-        shifted_input_ids = np.zeros(input_ids.shape)
-        shifted_input_ids[:, 1:] = input_ids[:, :-1]
-        shifted_input_ids[:, 0] = decoder_start_token_id
-        return shifted_input_ids
-
-    def preprocess_function(examples):
-        inputs = examples[text_column]
-        inputs = [prefix + inp for inp in inputs]
-        # Setting padding="max_length" as we need fixed length inputs for jitted functions
-        model_inputs = tokenizer(
-            inputs, max_length=data_args.max_source_length, padding="max_length", truncation=True, return_tensors="np"
-        )
-
-        # set up targets
-        # Note: labels correspond to our target indices
-        # decoder input ids are the same but shifted to the right with bos at the beginning (and without last token)
-        labels = [eval(indices) for indices in examples['encoding']]
-        labels = np.asarray(labels)
-
-        # We need the labels, in addition to the decoder_input_ids, for the compute_loss function
-        model_inputs["labels"] = labels
-
-        # In our case, this prepends the bos token and removes the last one
-        decoder_input_ids = shift_tokens_right(labels, config.decoder_start_token_id)
-        model_inputs["decoder_input_ids"] = decoder_input_ids
-
-        return model_inputs
-
-    if training_args.do_train:
-        if "train" not in dataset:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = dataset["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
-
-    if training_args.do_eval:
-        max_target_length = data_args.val_max_target_length
-        if "validation" not in dataset:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = dataset["validation"]
-        if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
-
-    if training_args.do_predict:
-        max_target_length = data_args.val_max_target_length
-        if "test" not in dataset:
-            raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = dataset["test"]
-        if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
-
-    # Metric
-    #metric = load_metric("rouge")
-
-    def postprocess_text(preds, labels):
-        preds = [pred.strip() for pred in preds]
-        labels = [label.strip() for label in labels]
-
-        # rougeLSum expects newline after each sentence
-        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
-        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
-
-        return preds, labels
-
-    def compute_metrics(preds, labels):
-        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-        # Some simple post-processing
-        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
-
-        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
-        # Extract a few results from ROUGE
-        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
-
-        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
-        result["gen_len"] = np.mean(prediction_lens)
-        result = {k: round(v, 4) for k, v in result.items()}
-        return result
-
-    # Initialize our training
-    rng = jax.random.PRNGKey(training_args.seed)
-    rng, dropout_rng = jax.random.split(rng)
-
-    # Store some constant
-    num_epochs = int(training_args.num_train_epochs)
-    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
-    total_batch_size = int(train_batch_size) * training_args.gradient_accumulation_steps
-    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
-    steps_per_epoch = len(train_dataset) // train_batch_size
-    total_steps = steps_per_epoch * num_epochs
-    total_optimization_steps = (len(train_dataset) // total_batch_size) * num_epochs
-
-    # Create learning rate schedule
-    linear_decay_lr_schedule_fn = create_learning_rate_fn(
-        len(train_dataset),
-        total_batch_size,
-        training_args.num_train_epochs,
-        training_args.warmup_steps,
-        training_args.learning_rate,
-        data_args.no_decay
-    )
-
-    # We use Optax's "masking" functionality to not apply weight decay
-    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
-    # mask boolean with the same structure as the parameters.
-    # The mask is True for parameters that should be decayed.
-    # Note that this mask is specifically adapted for FlaxBart.
-    # For FlaxT5, one should correct the layer norm parameter naming
-    # accordingly - see `run_t5_mlm_flax.py` e.g.
-    def decay_mask_fn(params):
-        flat_params = traverse_util.flatten_dict(params)
-        layer_norm_params = [
-            (name, "scale") for name in ["self_attn_layer_norm", "layernorm_embedding", "final_layer_norm"]
-        ]
-        flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_params) for path in flat_params}
-        return traverse_util.unflatten_dict(flat_mask)
-
-    # create adam optimizer
-    if training_args.adafactor:
-        # We use the default parameters here to initialize adafactor,
-        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
-        optimizer = optax.adafactor(
-            learning_rate=linear_decay_lr_schedule_fn,
-        )
-    else:
-        optimizer = optax.adamw(
-            learning_rate=linear_decay_lr_schedule_fn,
-            b1=training_args.adam_beta1,
-            b2=training_args.adam_beta2,
-            eps=training_args.adam_epsilon,
-            weight_decay=training_args.weight_decay,
-            mask=decay_mask_fn,
-        )
-
-    # Setup train state
-    state = TrainState.create(
-        apply_fn=model.__call__,
-        params=model.params,
-        tx=optimizer,
-        dropout_rng=dropout_rng,
-        grad_accum=jax.tree_map(jnp.zeros_like, model.params),
-        optimizer_step=0,
-    )
-
-    # label smoothed cross entropy
-    def loss_fn(logits, labels):
-        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
-        loss = loss.mean()
-        return loss
-
-    # Define gradient update step fn
-    def train_step(state, batch):
-        dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
-
-        def compute_loss(params):
-            labels = batch.pop("labels")
-            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
-            loss = loss_fn(logits, labels)
-            return loss
-
-        grad_fn = jax.value_and_grad(compute_loss)
-        loss, grads = grad_fn(state.params)
-        grad_accum = jax.tree_multimap(lambda x, y: x + y, grads, state.grad_accum)
-
-        def update_fn():
-            grads = jax.tree_map(lambda x: x / training_args.gradient_accumulation_steps, grad_accum)
-            grads = jax.lax.pmean(grads, "batch")
-            new_state = state.apply_gradients(
-                grads=grads, grad_accum=jax.tree_map(jnp.zeros_like, grads), optimizer_step=state.optimizer_step + 1
-            )
-            return new_state
-
-        new_state = jax.lax.cond(
-            (state.step + 1) % training_args.gradient_accumulation_steps == 0,
-            lambda _: update_fn(),
-            lambda _: state.replace(grad_accum=grad_accum, step=state.step + 1),
-            None,
-        )
-
-        metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.optimizer_step)}
-        metrics = jax.lax.pmean(metrics, axis_name="batch")
-
-        return new_state.replace(dropout_rng=new_dropout_rng), metrics
-
-    # Define eval fn
-    def eval_step(params, batch):
-        labels = batch.pop("labels")
-        logits = model(**batch, params=params, train=False)[0]
-        loss = loss_fn(logits, labels)
-
-        # summarize metrics
-        metrics = {"loss": loss}
-        metrics = jax.lax.pmean(metrics, axis_name="batch")
-        return metrics
-
-    # Define generation function
-    max_length = (
-        data_args.val_max_target_length if data_args.val_max_target_length is not None else model.config.max_length
-    )
-    num_beams = data_args.num_beams if data_args.num_beams is not None else model.config.num_beams
-    gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-
-    def generate_step(params, batch):
-        model.params = params
-        output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs)
-        return output_ids.sequences
-
-    # Create parallel version of the train and eval step
-    p_train_step = jax.pmap(
-        train_step, "batch", donate_argnums=(0,)
-    )
-    p_eval_step = jax.pmap(eval_step, "batch")
-    p_generate_step = jax.pmap(generate_step, "batch")
-
-    # Replicate the train state on each device
-    state = state.replicate()
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num Epochs = {num_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
-    logger.info(
-        f"  Total train batch size (w. parallel & distributed) = {train_batch_size * training_args.gradient_accumulation_steps}"
-    )
-    logger.info(f"  Total global steps = {total_steps}")
-    logger.info(f"  Total optimization steps = {total_optimization_steps}")
-
-    train_time = 0
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
-    global_step = 0
-
-    def run_evaluation():
-        # ======================== Evaluating ==============================
-        eval_metrics = []
-        if training_args.do_eval:
-            eval_preds = []
-            eval_labels = []
-
-            eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
|
746 |
-
eval_steps = len(eval_dataset) // eval_batch_size
|
747 |
-
for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
|
748 |
-
# Model forward
|
749 |
-
batch = next(eval_loader)
|
750 |
-
labels = batch["labels"]
|
751 |
-
|
752 |
-
metrics = p_eval_step(state.params, batch)
|
753 |
-
eval_metrics.append(metrics)
|
754 |
-
|
755 |
-
# generation
|
756 |
-
if data_args.predict_with_generate:
|
757 |
-
generated_ids = p_generate_step(state.params, batch)
|
758 |
-
eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
|
759 |
-
eval_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1])))
|
760 |
-
|
761 |
-
# normalize eval metrics
|
762 |
-
eval_metrics = get_metrics(eval_metrics)
|
763 |
-
eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
|
764 |
-
|
765 |
-
# log metrics
|
766 |
-
wandb_log(eval_metrics, step=global_step, prefix='eval')
|
767 |
-
|
768 |
-
# compute ROUGE metrics
|
769 |
-
rouge_desc = ""
|
770 |
-
# if data_args.predict_with_generate:
|
771 |
-
# rouge_metrics = compute_metrics(eval_preds, eval_labels)
|
772 |
-
# eval_metrics.update(rouge_metrics)
|
773 |
-
# rouge_desc = " ".join([f"Eval {key}: {value} |" for key, value in rouge_metrics.items()])
|
774 |
-
|
775 |
-
# Print metrics and update progress bar
|
776 |
-
desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | {rouge_desc})"
|
777 |
-
epochs.write(desc)
|
778 |
-
epochs.desc = desc
|
779 |
-
|
780 |
-
return eval_metrics
|
781 |
-
|
782 |
-
def run_save_model(step, epoch, eval_metrics=None):
|
783 |
-
if jax.process_index() == 0:
|
784 |
-
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
785 |
-
|
786 |
-
# save model locally
|
787 |
-
model.save_pretrained(
|
788 |
-
training_args.output_dir,
|
789 |
-
params=params,
|
790 |
-
)
|
791 |
-
|
792 |
-
# save to W&B
|
793 |
-
if data_args.log_model:
|
794 |
-
metadata = {'step': step, 'epoch': epoch}
|
795 |
-
if eval_metrics is not None:
|
796 |
-
metadata['eval/loss'] = eval_metrics['loss']
|
797 |
-
artifact = wandb.Artifact(
|
798 |
-
name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
|
799 |
-
)
|
800 |
-
artifact.add_file(str(Path(training_args.output_dir) / 'flax_model.msgpack'))
|
801 |
-
artifact.add_file(str(Path(training_args.output_dir) / 'config.json'))
|
802 |
-
wandb.run.log_artifact(artifact)
|
803 |
-
|
804 |
-
# save to the hub
|
805 |
-
if training_args.push_to_hub:
|
806 |
-
model.save_pretrained(
|
807 |
-
training_args.output_dir,
|
808 |
-
params=params,
|
809 |
-
push_to_hub=training_args.push_to_hub,
|
810 |
-
commit_message=f"Saving weights and logs of epoch {epoch+1}",
|
811 |
-
temp_dir=True # avoid issues with being in a repository
|
812 |
-
)
|
813 |
-
|
814 |
-
for epoch in epochs:
|
815 |
-
# ======================== Training ================================
|
816 |
-
train_start = time.time()
|
817 |
-
|
818 |
-
# Create sampling rng
|
819 |
-
rng, input_rng = jax.random.split(rng)
|
820 |
-
|
821 |
-
# Generate an epoch by shuffling sampling indices from the train dataset
|
822 |
-
train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True)
|
823 |
-
steps_per_epoch = len(train_dataset) // train_batch_size
|
824 |
-
# train
|
825 |
-
for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
|
826 |
-
global_step +=1
|
827 |
-
batch = next(train_loader)
|
828 |
-
state, train_metric = p_train_step(state, batch)
|
829 |
-
|
830 |
-
if global_step % data_args.log_interval == 0 and jax.process_index() == 0:
|
831 |
-
# log metrics
|
832 |
-
wandb_log(unreplicate(train_metric), step=global_step, prefix='train')
|
833 |
-
|
834 |
-
if global_step % training_args.eval_steps == 0:
|
835 |
-
run_evaluation()
|
836 |
-
|
837 |
-
if global_step % data_args.save_model_steps == 0:
|
838 |
-
run_save_model(global_step, epoch)
|
839 |
-
|
840 |
-
# log final train metrics
|
841 |
-
wandb_log(unreplicate(train_metric), step=global_step, prefix='train')
|
842 |
-
|
843 |
-
train_time += time.time() - train_start
|
844 |
-
train_metric = unreplicate(train_metric)
|
845 |
-
epochs.write(
|
846 |
-
f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
|
847 |
-
)
|
848 |
-
|
849 |
-
# Final evaluation
|
850 |
-
eval_metrics = run_evaluation()
|
851 |
-
|
852 |
-
# save checkpoint after each epoch and push checkpoint to the hub
|
853 |
-
run_save_model(global_step, epoch, eval_metrics)
|
854 |
-
|
855 |
-
|
856 |
-
# ======================== Prediction loop ==============================
|
857 |
-
if training_args.do_predict:
|
858 |
-
logger.info("*** Predict ***")
|
859 |
-
|
860 |
-
pred_metrics = []
|
861 |
-
pred_generations = []
|
862 |
-
pred_labels = []
|
863 |
-
|
864 |
-
pred_loader = data_loader(input_rng, predict_dataset, eval_batch_size)
|
865 |
-
pred_steps = len(predict_dataset) // eval_batch_size
|
866 |
-
for _ in tqdm(range(pred_steps), desc="Predicting...", position=2, leave=False):
|
867 |
-
# Model forward
|
868 |
-
batch = next(pred_loader)
|
869 |
-
labels = batch["labels"]
|
870 |
-
|
871 |
-
metrics = p_eval_step(state.params, batch)
|
872 |
-
pred_metrics.append(metrics)
|
873 |
-
|
874 |
-
# generation
|
875 |
-
if data_args.predict_with_generate:
|
876 |
-
generated_ids = p_generate_step(state.params, batch)
|
877 |
-
pred_generations.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
|
878 |
-
pred_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1])))
|
879 |
-
|
880 |
-
# normalize prediction metrics
|
881 |
-
pred_metrics = get_metrics(pred_metrics)
|
882 |
-
pred_metrics = jax.tree_map(jnp.mean, pred_metrics)
|
883 |
-
|
884 |
-
# compute ROUGE metrics
|
885 |
-
rouge_desc = ""
|
886 |
-
if data_args.predict_with_generate:
|
887 |
-
rouge_metrics = compute_metrics(pred_generations, pred_labels)
|
888 |
-
pred_metrics.update(rouge_metrics)
|
889 |
-
rouge_desc = " ".join([f"Predict {key}: {value} |" for key, value in rouge_metrics.items()])
|
890 |
-
|
891 |
-
# Print metrics
|
892 |
-
desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
|
893 |
-
logger.info(desc)
|
894 |
-
|
895 |
-
|
896 |
-
if __name__ == "__main__":
|
897 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
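The `train_step` removed above implements gradient accumulation by hand: per-step gradients are summed into the `grad_accum` field of the train state, and `jax.lax.cond` applies the averaged update only every `gradient_accumulation_steps` steps. A minimal, self-contained sketch of that pattern, with a plain SGD update standing in for the Optax optimizer (`ACCUM_STEPS` and `accum_update` are illustrative names, not from the script):

import jax
import jax.numpy as jnp

ACCUM_STEPS = 4  # stand-in for training_args.gradient_accumulation_steps

def accum_update(step, params, grad_accum, grads, lr=1e-3):
    # Sum this step's gradients into the running accumulator.
    grad_accum = jax.tree_map(lambda a, g: a + g, grad_accum, grads)

    def apply(_):
        # Average over the accumulation window, apply a plain SGD update,
        # then reset the accumulator to zeros.
        mean_grads = jax.tree_map(lambda a: a / ACCUM_STEPS, grad_accum)
        new_params = jax.tree_map(lambda p, g: p - lr * g, params, mean_grads)
        return new_params, jax.tree_map(jnp.zeros_like, grad_accum)

    def keep_accumulating(_):
        # Not at an optimizer step yet: keep parameters and accumulator as-is.
        return params, grad_accum

    # Both branches return (params, grad_accum) with identical tree structure,
    # which is what jax.lax.cond requires.
    return jax.lax.cond((step + 1) % ACCUM_STEPS == 0, apply, keep_accumulating, None)

The removed script additionally averages the accumulated gradients across devices with `jax.lax.pmean` inside `update_fn`, since the whole step runs under `jax.pmap`.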
setup.cfg
ADDED
@@ -0,0 +1,27 @@
+[metadata]
+name = dalle_mini
+version = attr: dalle_mini.__version__
+description = DALL·E mini - Generate images from a text prompt
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/borisdayma/dalle-mini
+project_urls =
+    Bug Tracker = https://github.com/borisdayma/dalle-mini/issues
+
+[options]
+packages = find:
+install_requires =
+    transformers
+    unidecode
+    ftfy
+    pillow
+    jax
+    flax
+
+[options.extras_require]
+dev =
+    tqdm
+    wandb
+    optax
+    black[jupyter]
+    isort
setup.py
ADDED
@@ -0,0 +1,4 @@
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
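With all metadata in `setup.cfg`, `setup.py` stays a stub. One consequence of `version = attr: dalle_mini.__version__` is that setuptools reads the version from the package itself; a minimal sketch of the attribute it expects (the version string here is a hypothetical placeholder, not the project's actual version):

# dalle_mini/__init__.py (sketch)
# Read by `version = attr: dalle_mini.__version__` in setup.cfg at build time.
__version__ = "0.0.1"  # hypothetical placeholder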
tools/dataset/encode_dataset.ipynb
ADDED
@@ -0,0 +1,371 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d0b72877",
+   "metadata": {},
+   "source": [
+    "# Pre-encoding a dataset for DALLE·mini"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba7b31e6",
+   "metadata": {},
+   "source": [
+    "This notebook shows how to pre-encode images to token sequences using JAX, VQGAN and a dataset in the [`webdataset` format](https://webdataset.github.io/webdataset/).\n",
+    "\n",
+    "Adapt it to your own dataset and image encoder.\n",
+    "\n",
+    "At the end you should have a dataset of pairs:\n",
+    "* a caption defined as a string\n",
+    "* an encoded image defined as a list of int."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b59489e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "import torchvision.transforms as T\n",
+    "\n",
+    "import webdataset as wds\n",
+    "\n",
+    "import jax\n",
+    "import braceexpand\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c7c4c1e6",
+   "metadata": {},
+   "source": [
+    "## Configuration Parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1265dbfe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shards = \"my_images/shard-{0000..0008}.tar\"  # defined using braceexpand format as used by webdataset\n",
+    "encoded_output = Path(\"encoded_data\")  # where we will save our encoded data\n",
+    "\n",
+    "VQGAN_REPO, VQGAN_COMMIT_ID = (\n",
+    "    \"dalle-mini/vqgan_imagenet_f16_16384\",\n",
+    "    \"85eb5d3b51a1c62a0cc8f4ccdee9882c0d0bd384\",\n",
+    ")\n",
+    "\n",
+    "# good defaults for a TPU v3-8\n",
+    "batch_size = 128  # Per device\n",
+    "num_workers = 8  # For parallel processing\n",
+    "total_bs = batch_size * jax.device_count()  # You can use a smaller size while testing\n",
+    "save_frequency = 128  # Number of batches to create a new file (180MB for f16 and 720MB for f8 per file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "cd956ec6-7d98-4d4d-a454-f80fe857eadd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['XXX/shard-0000.tar',\n",
+       " 'XXX/shard-0001.tar',\n",
+       " 'XXX/shard-0002.tar',\n",
+       " 'XXX/shard-0003.tar',\n",
+       " 'XXX/shard-0004.tar',\n",
+       " 'XXX/shard-0005.tar',\n",
+       " 'XXX/shard-0006.tar',\n",
+       " 'XXX/shard-0007.tar',\n",
+       " 'XXX/shard-0008.tar']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "shards = list(\n",
+    "    braceexpand.braceexpand(shards)\n",
+    ")  # better display for tqdm with known length"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75dba8e2",
+   "metadata": {},
+   "source": [
+    "## Load data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1e8fb95",
+   "metadata": {},
+   "source": [
+    "We load data using `webdataset`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ef5de9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = (\n",
+    "    wds.WebDataset(shards, handler=wds.warn_and_continue)\n",
+    "    .decode(\"rgb\", handler=wds.warn_and_continue)\n",
+    "    .to_tuple(\"jpg\", \"txt\")  # assumes image is in `jpg` and caption in `txt`\n",
+    "    .batched(total_bs)  # load in batch per worker (faster)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90981824",
+   "metadata": {},
+   "source": [
+    "Note:\n",
+    "* you can also shuffle shards and items using `shardshuffle` and `shuffle` if necessary.\n",
+    "* you may need to resize images in your pipeline (with `map_dict` for example), we assume they are already set to 256x256.\n",
+    "* you can also filter out some items using `select`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "129c377d",
+   "metadata": {},
+   "source": [
+    "We can now inspect our data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cac98cb",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "images, captions = next(iter(ds))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd268fbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "images.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5acfc4d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "captions[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c24693c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "T.ToPILImage()(images[0].permute(2, 0, 1))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3059ffb1",
+   "metadata": {},
+   "source": [
+    "Finally we create our dataloader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c227c551",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dl = (\n",
+    "    wds.WebLoader(ds, batch_size=None, num_workers=8).unbatched().batched(total_bs)\n",
+    ")  # avoid partial batch at the end of each worker"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a354472b",
+   "metadata": {},
+   "source": [
+    "## Image encoder\n",
+    "\n",
+    "We'll use a VQGAN trained with Taming Transformers and converted to a JAX model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47a8b818",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from vqgan_jax.modeling_flax_vqgan import VQModel\n",
+    "from flax.jax_utils import replicate\n",
+    "\n",
+    "vqgan = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")\n",
+    "vqgan_params = replicate(vqgan.params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62ad01c3",
+   "metadata": {},
+   "source": [
+    "## Encoding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20357f74",
+   "metadata": {},
+   "source": [
+    "Encoding is really simple using `shard` to automatically distribute batches across devices and `pmap`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "322a4619",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from flax.training.common_utils import shard\n",
+    "from functools import partial\n",
+    "\n",
+    "\n",
+    "@partial(jax.pmap, axis_name=\"batch\")\n",
+    "def p_encode(batch, params):\n",
+    "    # Not sure if we should `replicate` params, does not seem to have any effect\n",
+    "    _, indices = vqgan.encode(batch, params=params)\n",
+    "    return indices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff6c10d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "\n",
+    "def encode_dataset(dataloader, output_dir, save_frequency):\n",
+    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "    all_captions = []\n",
+    "    all_encoding = []\n",
+    "    n_file = 1\n",
+    "    for idx, (images, captions) in enumerate(tqdm(dataloader)):\n",
+    "        images = images.numpy()\n",
+    "        n = len(images) // 8 * 8\n",
+    "        if n != len(images):\n",
+    "            # get the max number of images we can (multiple of 8)\n",
+    "            print(f\"Different sizes {n} vs {len(images)}\")\n",
+    "            images = images[:n]\n",
+    "            captions = captions[:n]\n",
+    "        if not len(captions):\n",
+    "            print(f\"No images/captions in batch...\")\n",
+    "            continue\n",
+    "        images = shard(images)\n",
+    "        encoded = p_encode(images, vqgan_params)\n",
+    "        encoded = encoded.reshape(-1, encoded.shape[-1])\n",
+    "        all_captions.extend(captions)\n",
+    "        all_encoding.extend(encoded.tolist())\n",
+    "\n",
+    "        # save files\n",
+    "        if (idx + 1) % save_frequency == 0:\n",
+    "            print(f\"Saving file {n_file}\")\n",
+    "            batch_df = pd.DataFrame.from_dict(\n",
+    "                {\"caption\": all_captions, \"encoding\": all_encoding}\n",
+    "            )\n",
+    "            batch_df.to_parquet(f\"{output_dir}/{n_file:03d}.parquet\")\n",
+    "            all_captions = []\n",
+    "            all_encoding = []\n",
+    "            n_file += 1\n",
+    "\n",
+    "    if len(all_captions):\n",
+    "        print(f\"Saving final file {n_file}\")\n",
+    "        batch_df = pd.DataFrame.from_dict(\n",
+    "            {\"caption\": all_captions, \"encoding\": all_encoding}\n",
+    "        )\n",
+    "        batch_df.to_parquet(f\"{output_dir}/{n_file:03d}.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7704863d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encode_dataset(dl, output_dir=encoded_output, save_frequency=save_frequency)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8953dd84",
+   "metadata": {},
+   "source": [
+    "----"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "db471c52d602b4f5f40ecaf278e88ccfef85c29d0a1a07185b0d51fc7acf4e26"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
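Once the notebook above has run, each parquet shard holds the promised (caption, encoding) pairs and can be checked directly. A minimal sketch, assuming at least one shard was written to `encoded_data/`:

import pandas as pd

# Read back one shard written by encode_dataset() above.
df = pd.read_parquet("encoded_data/001.parquet")
print(df.columns.tolist())            # ['caption', 'encoding']
caption = df.iloc[0]["caption"]
encoding = df.iloc[0]["encoding"]
print(type(caption))                  # str
# With an f16 VQGAN on 256x256 images, each image maps to (256/16)**2 = 256 tokens.
print(len(encoding))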
tools/inference/inference_pipeline.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
tools/inference/log_inference_samples.ipynb
ADDED
@@ -0,0 +1,434 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ff2a984-b8b2-4a69-89cf-0d16da2393c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tempfile\n",
+    "from functools import partial\n",
+    "import random\n",
+    "import numpy as np\n",
+    "from PIL import Image\n",
+    "from tqdm.notebook import tqdm\n",
+    "import jax\n",
+    "import jax.numpy as jnp\n",
+    "from flax.training.common_utils import shard, shard_prng_key\n",
+    "from flax.jax_utils import replicate\n",
+    "import wandb\n",
+    "from dalle_mini.model import CustomFlaxBartForConditionalGeneration\n",
+    "from vqgan_jax.modeling_flax_vqgan import VQModel\n",
+    "from transformers import BartTokenizer, CLIPProcessor, FlaxCLIPModel\n",
+    "from dalle_mini.text import TextNormalizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92f4557c-fd7f-4edc-81c2-de0b0a10c270",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_ids = [\"63otg87g\"]\n",
+    "ENTITY, PROJECT = \"dalle-mini\", \"dalle-mini\"  # used only for training run\n",
+    "VQGAN_REPO, VQGAN_COMMIT_ID = (\n",
+    "    \"dalle-mini/vqgan_imagenet_f16_16384\",\n",
+    "    \"e93a26e7707683d349bf5d5c41c5b0ef69b677a9\",\n",
+    ")\n",
+    "latest_only = True  # log only latest or all versions\n",
+    "suffix = \"\"  # mainly for duplicate inference runs with a deleted version\n",
+    "add_clip_32 = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71f27b96-7e6c-4472-a2e4-e99a8fb67a72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model.generate parameters - Not used yet\n",
+    "gen_top_k = None\n",
+    "gen_top_p = None\n",
+    "temperature = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93b2e24b-f0e5-4abe-a3ec-0aa834cc3bf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 8\n",
+    "num_images = 128\n",
+    "top_k = 8\n",
+    "text_normalizer = TextNormalizer()\n",
+    "padding_item = \"NONE\"\n",
+    "seed = random.randint(0, 2 ** 32 - 1)\n",
+    "key = jax.random.PRNGKey(seed)\n",
+    "api = wandb.Api()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6a878fa-4bf5-4978-abb5-e235841d765b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
+    "vqgan_params = replicate(vqgan.params)\n",
+    "\n",
+    "clip16 = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
+    "processor16 = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")\n",
+    "clip16_params = replicate(clip16.params)\n",
+    "\n",
+    "if add_clip_32:\n",
+    "    clip32 = FlaxCLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
+    "    processor32 = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
+    "    clip32_params = replicate(clip32.params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a500dd07-dbc3-477d-80d4-2b73a3b83ef3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@partial(jax.pmap, axis_name=\"batch\")\n",
+    "def p_decode(indices, params):\n",
+    "    return vqgan.decode_code(indices, params=params)\n",
+    "\n",
+    "\n",
+    "@partial(jax.pmap, axis_name=\"batch\")\n",
+    "def p_clip16(inputs, params):\n",
+    "    logits = clip16(params=params, **inputs).logits_per_image\n",
+    "    return logits\n",
+    "\n",
+    "\n",
+    "if add_clip_32:\n",
+    "\n",
+    "    @partial(jax.pmap, axis_name=\"batch\")\n",
+    "    def p_clip32(inputs, params):\n",
+    "        logits = clip32(params=params, **inputs).logits_per_image\n",
+    "        return logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e57797ab-0b3a-4490-be58-03d8d1c23fe9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"samples.txt\", encoding=\"utf8\") as f:\n",
+    "    samples = [l.strip() for l in f.readlines()]\n",
+    "    # make list multiple of batch_size by adding elements\n",
+    "    samples_to_add = [padding_item] * (-len(samples) % batch_size)\n",
+    "    samples.extend(samples_to_add)\n",
+    "    # reshape\n",
+    "    samples = [samples[i : i + batch_size] for i in range(0, len(samples), batch_size)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3e02d9d-4ee1-49e7-a7bc-4d8b139e9614",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_artifact_versions(run_id, latest_only=False):\n",
+    "    try:\n",
+    "        if latest_only:\n",
+    "            return [\n",
+    "                api.artifact(\n",
+    "                    type=\"bart_model\", name=f\"{ENTITY}/{PROJECT}/model-{run_id}:latest\"\n",
+    "                )\n",
+    "            ]\n",
+    "        else:\n",
+    "            return api.artifact_versions(\n",
+    "                type_name=\"bart_model\",\n",
+    "                name=f\"{ENTITY}/{PROJECT}/model-{run_id}\",\n",
+    "                per_page=10000,\n",
+    "            )\n",
+    "    except:\n",
+    "        return []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0d7ed17-7abb-4a31-ab3c-a12b9039a570",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_training_config(run_id):\n",
+    "    training_run = api.run(f\"{ENTITY}/{PROJECT}/{run_id}\")\n",
+    "    config = training_run.config\n",
+    "    return config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e784a43-626d-4e8d-9e47-a23775b2f35f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# retrieve inference run details\n",
+    "def get_last_inference_version(run_id):\n",
+    "    try:\n",
+    "        inference_run = api.run(f\"dalle-mini/dalle-mini/{run_id}-clip16{suffix}\")\n",
+    "        return inference_run.summary.get(\"version\", None)\n",
+    "    except:\n",
+    "        return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1cc9993-1bfc-4ec6-a004-c056189c42ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compile functions - needed only once per run\n",
+    "def pmap_model_function(model):\n",
+    "    @partial(jax.pmap, axis_name=\"batch\")\n",
+    "    def _generate(tokenized_prompt, key, params):\n",
+    "        return model.generate(\n",
+    "            **tokenized_prompt,\n",
+    "            do_sample=True,\n",
+    "            num_beams=1,\n",
+    "            prng_key=key,\n",
+    "            params=params,\n",
+    "            top_k=gen_top_k,\n",
+    "            top_p=gen_top_p\n",
+    "        )\n",
+    "\n",
+    "    return _generate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23b2444c-67a9-44d7-abd1-187ed83a9431",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_id = run_ids[0]\n",
+    "# TODO: loop over runs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bba70f33-af8b-4eb3-9973-7be672301a0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "artifact_versions = get_artifact_versions(run_id, latest_only)\n",
+    "last_inference_version = get_last_inference_version(run_id)\n",
+    "training_config = get_training_config(run_id)\n",
+    "run = None\n",
+    "p_generate = None\n",
+    "model_files = [\n",
+    "    \"config.json\",\n",
+    "    \"flax_model.msgpack\",\n",
+    "    \"merges.txt\",\n",
+    "    \"special_tokens_map.json\",\n",
+    "    \"tokenizer.json\",\n",
+    "    \"tokenizer_config.json\",\n",
+    "    \"vocab.json\",\n",
+    "]\n",
+    "for artifact in artifact_versions:\n",
+    "    print(f\"Processing artifact: {artifact.name}\")\n",
+    "    version = int(artifact.version[1:])\n",
+    "    results16, results32 = [], []\n",
+    "    columns = [\"Caption\"] + [f\"Image {i+1}\" for i in range(top_k)]\n",
+    "\n",
+    "    if latest_only:\n",
+    "        assert last_inference_version is None or version > last_inference_version\n",
+    "    else:\n",
+    "        if last_inference_version is None:\n",
+    "            # we should start from v0\n",
+    "            assert version == 0\n",
+    "        elif version <= last_inference_version:\n",
+    "            print(\n",
+    "                f\"v{version} has already been logged (versions logged up to v{last_inference_version}\"\n",
+    "            )\n",
+    "        else:\n",
+    "            # check we are logging the correct version\n",
+    "            assert version == last_inference_version + 1\n",
+    "\n",
+    "    # start/resume corresponding run\n",
+    "    if run is None:\n",
+    "        run = wandb.init(\n",
+    "            job_type=\"inference\",\n",
+    "            entity=\"dalle-mini\",\n",
+    "            project=\"dalle-mini\",\n",
+    "            config=training_config,\n",
+    "            id=f\"{run_id}-clip16{suffix}\",\n",
+    "            resume=\"allow\",\n",
+    "        )\n",
+    "\n",
+    "    # work in temporary directory\n",
+    "    with tempfile.TemporaryDirectory() as tmp:\n",
+    "\n",
+    "        # download model files\n",
+    "        artifact = run.use_artifact(artifact)\n",
+    "        for f in model_files:\n",
+    "            artifact.get_path(f).download(tmp)\n",
+    "\n",
+    "        # load tokenizer and model\n",
+    "        tokenizer = BartTokenizer.from_pretrained(tmp)\n",
+    "        model = CustomFlaxBartForConditionalGeneration.from_pretrained(tmp)\n",
+    "        model_params = replicate(model.params)\n",
+    "\n",
+    "        # pmap model function needs to happen only once per model config\n",
+    "        if p_generate is None:\n",
+    "            p_generate = pmap_model_function(model)\n",
+    "\n",
+    "        # process one batch of captions\n",
+    "        for batch in tqdm(samples):\n",
+    "            processed_prompts = (\n",
+    "                [text_normalizer(x) for x in batch]\n",
+    "                if model.config.normalize_text\n",
+    "                else list(batch)\n",
+    "            )\n",
+    "\n",
+    "            # repeat the prompts to distribute over each device and tokenize\n",
+    "            processed_prompts = processed_prompts * jax.device_count()\n",
+    "            tokenized_prompt = tokenizer(\n",
+    "                processed_prompts,\n",
+    "                return_tensors=\"jax\",\n",
+    "                padding=\"max_length\",\n",
+    "                truncation=True,\n",
+    "                max_length=128,\n",
+    "            ).data\n",
+    "            tokenized_prompt = shard(tokenized_prompt)\n",
+    "\n",
+    "            # generate images\n",
+    "            images = []\n",
+    "            pbar = tqdm(\n",
+    "                range(num_images // jax.device_count()),\n",
+    "                desc=\"Generating Images\",\n",
+    "                leave=True,\n",
+    "            )\n",
+    "            for i in pbar:\n",
+    "                key, subkey = jax.random.split(key)\n",
+    "                encoded_images = p_generate(\n",
+    "                    tokenized_prompt, shard_prng_key(subkey), model_params\n",
+    "                )\n",
+    "                encoded_images = encoded_images.sequences[..., 1:]\n",
+    "                decoded_images = p_decode(encoded_images, vqgan_params)\n",
+    "                decoded_images = decoded_images.clip(0.0, 1.0).reshape(\n",
+    "                    (-1, 256, 256, 3)\n",
+    "                )\n",
+    "                for img in decoded_images:\n",
+    "                    images.append(\n",
+    "                        Image.fromarray(np.asarray(img * 255, dtype=np.uint8))\n",
+    "                    )\n",
+    "\n",
+    "            def add_clip_results(results, processor, p_clip, clip_params):\n",
+    "                clip_inputs = processor(\n",
+    "                    text=batch,\n",
+    "                    images=images,\n",
+    "                    return_tensors=\"np\",\n",
+    "                    padding=\"max_length\",\n",
+    "                    max_length=77,\n",
+    "                    truncation=True,\n",
+    "                ).data\n",
+    "                # each shard will have one prompt, images need to be reorganized to be associated to the correct shard\n",
+    "                images_per_prompt_indices = np.asarray(\n",
+    "                    range(0, len(images), batch_size)\n",
+    "                )\n",
+    "                clip_inputs[\"pixel_values\"] = jnp.concatenate(\n",
+    "                    list(\n",
+    "                        clip_inputs[\"pixel_values\"][images_per_prompt_indices + i]\n",
+    "                        for i in range(batch_size)\n",
+    "                    )\n",
+    "                )\n",
+    "                clip_inputs = shard(clip_inputs)\n",
+    "                logits = p_clip(clip_inputs, clip_params)\n",
+    "                logits = logits.reshape(-1, num_images)\n",
+    "                top_scores = logits.argsort()[:, -top_k:][..., ::-1]\n",
+    "                logits = jax.device_get(logits)\n",
+    "                # add to results table\n",
+    "                for i, (idx, scores, sample) in enumerate(\n",
+    "                    zip(top_scores, logits, batch)\n",
+    "                ):\n",
+    "                    if sample == padding_item:\n",
+    "                        continue\n",
+    "                    cur_images = [images[x] for x in images_per_prompt_indices + i]\n",
+    "                    top_images = [\n",
+    "                        wandb.Image(cur_images[x], caption=f\"Score: {scores[x]:.2f}\")\n",
+    "                        for x in idx\n",
+    "                    ]\n",
+    "                    results.append([sample] + top_images)\n",
+    "\n",
+    "            # get clip scores\n",
+    "            pbar.set_description(\"Calculating CLIP 16 scores\")\n",
+    "            add_clip_results(results16, processor16, p_clip16, clip16_params)\n",
+    "\n",
+    "            # get clip 32 scores\n",
+    "            if add_clip_32:\n",
+    "                pbar.set_description(\"Calculating CLIP 32 scores\")\n",
+    "                add_clip_results(results32, processor32, p_clip32, clip32_params)\n",
+    "\n",
+    "            pbar.close()\n",
+    "\n",
+    "    # log results\n",
+    "    table = wandb.Table(columns=columns, data=results16)\n",
+    "    run.log({\"Samples\": table, \"version\": version})\n",
+    "    wandb.finish()\n",
+    "\n",
+    "    if add_clip_32:\n",
+    "        run = wandb.init(\n",
+    "            job_type=\"inference\",\n",
+    "            entity=\"dalle-mini\",\n",
+    "            project=\"dalle-mini\",\n",
+    "            config=training_config,\n",
+    "            id=f\"{run_id}-clip32{suffix}\",\n",
+    "            resume=\"allow\",\n",
+    "        )\n",
+    "        table = wandb.Table(columns=columns, data=results32)\n",
+    "        run.log({\"Samples\": table, \"version\": version})\n",
+    "        wandb.finish()\n",
+    "    run = None  # ensure we don't log on this run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "415d3f54-7226-43de-9eea-4283a948dc93",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
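A detail the notebook above depends on: `shard_prng_key` splits one key into a per-device batch of keys, so every device samples different images for the same replicated prompts. A minimal sketch of that pattern, with a random-token draw standing in for `model.generate` (the 16384 vocabulary size follows the VQGAN used above):

import jax
from flax.training.common_utils import shard_prng_key

key = jax.random.PRNGKey(0)
key, subkey = jax.random.split(key)   # keep `key` for the next sampling round
device_keys = shard_prng_key(subkey)  # one PRNG key per local device

@jax.pmap
def p_sample(k):
    # stand-in for model.generate(..., do_sample=True, prng_key=k)
    return jax.random.randint(k, (4,), 0, 16384)

tokens = p_sample(device_keys)  # shape (num_devices, 4), distinct per device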
tools/inference/samples.txt
ADDED
@@ -0,0 +1,124 @@
+t-shirt, size M
+flower dress, size M
+white snow covered mountain under blue sky during daytime
+aerial view of the beach during daytime
+aerial view of the beach at night
+a beautiful sunset at a beach with a shell on the shore
+a farmhouse surrounded by beautiful flowers
+sunset over green mountains
+a photo of san francisco golden gate bridge
+painting of an oniric forest glade surrounded by tall trees
+a graphite sketch of a gothic cathedral
+a graphite sketch of Elon Musk
+still life in the style of Kandinsky
+still life in the style of Picasso
+a colorful stairway to heaven
+a background consisting of colors blue, green, and red
+Mohammed Ali and Mike Tyson in a match
+Pele and Maradona in a match
+view of Mars from space
+a picture of the Eiffel tower on the moon
+a picture of the Eiffel tower on the moon, Earth is in the background
+watercolor of the Eiffel tower on the moon
+the moon is a skull
+epic sword fight
+underwater cathedral
+a photo of a fantasy version of New York City
+a picture of fantasy kingdoms
+a volcano erupting next to San Francisco golden gate bridge
+Paris in a far future, futuristic Paris
+real painting of an alien from Monet
+the communist statue of liberty
+robots taking control over humans
+illustration of an astronaut in a space suit playing guitar
+a clown wearing a spacesuit floating in space
+a dog playing with a ball
+a cat sits on top of an alligator
+a very cute cat laying by a big bike
+a rat holding a red lightsaber in a white background
+a very cute giraffe making a funny face
+A unicorn is passing by a rainbow in a field of flowers
+an elephant made of carrots
+an elephant on a unicycle during a circus
+photography of a penguin watching television
+a penguin is walking on the Moon, Earth is in the background
+a penguin standing on a tower of books holds onto a rope from a helicopter
+rat wearing a crown
+looking into the sky, 10 airplanes are seen overhead
+shelves filled with books and alchemy potion bottles
+this is a detailed high-resolution scan of a human brain
+a restaurant menu
+a bottle of coca-cola on a table
+a peanut
+a cross-section view of a walnut
+a living room with two white armchairs and a painting of the collosseum. The painting is mounted above a modern fireplace.
+a long line of alternating green and red blocks
+a long line of green blocks on a beach at subset
+a long line of peaches on a beach at sunset
+a picture of a castle from minecraft
+a cute pikachu teapot
+an illustration of pikachu sitting on a bench eating an ice cream
+mario is jumping over a zebra
+famous anime hero
+star wars concept art
+Cartoon of a carrot with big eyes
+a cartoon of a superhero bear
+an illustration of a cute skeleton wearing a blue hoodie
+illustration of a baby shark swimming around corals
+an illustration of an avocado in a beanie riding a motorcycle
+logo of a robot wearing glasses and reading a book
+illustration of a cactus lifting weigths
+logo of a cactus lifting weights
+a photo of a camera from the future
+a skeleton with the shape of a spider
+a collection of glasses is sitting on a table
+a painting of a capybara sitting on a mountain during fall in surrealist style
+a pentagonal green clock
+a small red block sitting on a large green block
+a storefront that has the word 'openai' written on it
+a tatoo of a black broccoli
+a variety of clocks is sitting on a table
+a table has a train model on it with other cars and things
+a pixel art illustration of an eagle sitting in a field in the afternoon
+an emoji of a baby fox wearing a blue hat, green gloves, red shirt, and yellow pants
+an emoji of a baby penguin wearing a blue hat, blue gloves, red shirt, and green pants
+an extreme close-up view of a capybara sitting in a field
+an illustration of a baby cucumber with a mustache playing chess
+an illustration of a baby daikon radish in a tutu walking a dog
+an illustration of a baby hedgehog in a cape staring at its reflection in a mirror
+an illustration of a baby panda with headphones holding an umbrella in the rain
+urinals are lined up in a jungle
+a muscular banana sitting upright on a bench smoking watching a banana on television, high definition photography
+a human face
+a person is holding a phone and a waterbottle, running a marathon
+a child eating a birthday cake near some balloons
+Young woman riding her bike through the forest
+the best soccer team of the world
+the best football team of the world
+the best basketball team of the world
+happy, happiness
+sad, sadness
+the representation of infinity
+the end of the world
+the last sunrise on earth
+a portrait of a nightmare creature watching at you
+an avocado armchair
+an armchair in the shape of an avocado
+illustration of an avocado armchair
+illustration of an armchair in the shape of an avocado
+logo of an avocado armchair
+an avocado armchair flying into space
+a cute avocado armchair singing karaoke on stage in front of a crowd of strawberry shaped lamps
+an illustration of an avocado in a christmas sweater staring at its reflection in a mirror
+illustration of an avocado armchair getting married to a pineapple
+half human half cat
+half human half dog
+half human half pen
+half human half garbage
+half human half avocado
+half human half Eiffel tower
+a propaganda poster for transhumanism
+a propaganda poster for building a space elevator
+a beautiful epic fantasy painting of a space elevator
+a transformer architecture
+a transformer in real life
{seq2seq → tools/train}/sweep.yaml
RENAMED
@@ -1,6 +1,6 @@
-program:
-entity:
-project:
+program: train.py
+entity: dalle-mini
+project: dalle-mini
 method: random
 metric:
   name: eval/loss
@@ -8,36 +8,47 @@ metric:
 parameters:
   learning_rate:
     distribution: log_uniform
-    # from exp(min) to exp(max)
-    min: -
-    max: -5
+    # from exp(min) to exp(max)
+    min: -6.9
+    max: -3.5
   gradient_accumulation_steps:
     value: 8
   warmup_steps:
-
-
+    value: 4000
+#TODO: outdated command
 command:
 - python3
 - ${program}
-- "--
-- "/
-- "--
-- "/
-- "--
-- "
-- "--
-- "--
-- "--
--
-- "--
--
+- "--tokenizer_name"
+- "boris/dalle-mini-tokenizer"
+- "--config_name"
+- "facebook/bart-large-cnn"
+- "--dataset_repo_or_path"
+- "boris/gis_vqgan_f16_16384"
+- "--streaming"
+- "--use_auth_token"
+- "--image_vocab_size"
+- 16384
+- "--image_length"
+- 256
+- "--normalize_text"
+- True
 - "--per_device_train_batch_size"
 - 56
 - "--per_device_eval_batch_size"
 - 56
-- "--
-- 80
-- "--no_decay"
+- "--adafactor"
 - "--do_train"
 - "--do_eval"
+- "--num_train_epochs"
+- 1
+- "--logging_steps"
+- 40
+- "--eval_steps"
+- 800
+- "--output_dir"
+- "./output"
+- "--overwrite_output_dir"
+- "--max_train_samples"
+- 10000000
 - ${args}
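For context on how W&B consumes a file like this: the sweep server samples values from `parameters`, and each agent executes the `command`, appending the sampled values through `${args}` (the file's own TODO notes the command list is outdated). A minimal launch sketch using the wandb Python API, assuming you are logged in to W&B and running from `tools/train/`:

import yaml
import wandb

# Register the sweep defined in sweep.yaml above.
with open("sweep.yaml") as f:
    sweep_config = yaml.safe_load(f)

sweep_id = wandb.sweep(sweep_config, entity="dalle-mini", project="dalle-mini")

# Launch an agent on this machine; for a command-based sweep it runs
# `python3 train.py ... ${args}` with sampled hyperparameters each trial.
wandb.agent(sweep_id)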
tools/train/train.py
ADDED
@@ -0,0 +1,857 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for seq2seq, text to image.
Script adapted from run_summarization_flax.py
"""

import json
import logging
import os
import sys
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Callable, Optional

import datasets
import jax
import jax.numpy as jnp
import optax
import transformers
import wandb
from flax import jax_utils, traverse_util
from flax.jax_utils import unreplicate
from flax.serialization import from_bytes, to_bytes
from flax.training import train_state
from flax.training.common_utils import get_metrics, onehot, shard_prng_key
from tqdm import tqdm
from transformers import AutoTokenizer, HfArgumentParser
from transformers.models.bart.modeling_flax_bart import BartConfig

from dalle_mini.data import Dataset
from dalle_mini.model import CustomFlaxBartForConditionalGeneration

logger = logging.getLogger(__name__)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. "
            "Don't set if you want to train a model from scratch."
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    image_vocab_size: Optional[int] = field(
        default=None,
        metadata={"help": "Vocab size of image encoder"},
    )
    image_length: Optional[int] = field(
        default=None,
        metadata={"help": "Number of tokens per image"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
        },
    )
    normalize_text: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether to normalize text or not. By default, we use the base model's setting, or don't normalize for new models."
        },
    )
    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
        },
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    text_column: Optional[str] = field(
        default="caption",
        metadata={
            "help": "The name of the column in the datasets containing the image captions."
        },
    )
    encoding_column: Optional[str] = field(
        default="encoding",
        metadata={
            "help": "The name of the column in the datasets containing the image encodings."
        },
    )
    dataset_repo_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The dataset repository containing encoded files."},
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (glob acceptable)."},
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file (glob acceptable)."},
    )
    dataset_type: str = field(
        default="datasets",
        metadata={"help": "Either 🤗 'datasets' (default) or 'webdataset'."},
    )
    # data loading should not be a bottleneck so we use "streaming" mode by default
    streaming: bool = field(
        default=True,
        metadata={"help": "Whether to stream the dataset."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Whether to use the authentication token for private datasets."
        },
    )
    max_source_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={
            "help": "The number of processes to use for the preprocessing. Not used in streaming mode."
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={
            "help": "Overwrite the cached training and evaluation sets. Not used in streaming mode."
        },
    )
    # default seed of None ensures we don't repeat the same items if script was interrupted during an epoch
    seed_dataset: Optional[int] = field(
        default=None,
        metadata={
            "help": "Random seed for the dataset that will be set at the beginning of training."
        },
    )

    def __post_init__(self):
        if self.dataset_repo_or_path is None:
            raise ValueError("Need a dataset repository or path.")


@dataclass
class TrainingArguments:
    """
    Arguments pertaining to training parameters.
    """

    output_dir: str = field(
        metadata={
            "help": "The output directory where the model predictions and checkpoints will be written."
        },
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(
        default=False, metadata={"help": "Whether to run eval on the dev set."}
    )

    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={
            "help": "Number of update steps to accumulate before performing a backward/update pass."
        },
    )

    learning_rate: float = field(
        default=5e-5, metadata={"help": "The initial learning rate."}
    )
    adafactor: bool = field(
        default=False,
        metadata={"help": "Whether or not to replace AdamW by Adafactor."},
    )
    weight_decay: Optional[float] = field(
        default=None, metadata={"help": "Weight decay if we apply some."}
    )
    adam_beta1: float = field(
        default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}
    )
    adam_beta2: float = field(
        default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}
    )
    adam_epsilon: float = field(
        default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}
    )
    max_grad_norm: float = field(
        default=1.0, metadata={"help": "Max gradient norm for Adafactor."}
    )
    use_decay: bool = field(
        default=False,
        metadata={"help": "Whether to use decay in the learning rate scheduler."},
    )

    num_train_epochs: float = field(
        default=3.0, metadata={"help": "Total number of training epochs to perform."}
    )
    warmup_steps: int = field(
        default=0, metadata={"help": "Linear warmup over warmup_steps."}
    )

    logging_steps: int = field(
        default=40, metadata={"help": "Log every X update steps."}
    )
    eval_steps: int = field(
        default=400, metadata={"help": "Run an evaluation every X steps."}
    )
    save_steps: int = field(
        default=4000, metadata={"help": "Save checkpoint every X update steps."}
    )
    log_model: bool = field(
        default=False,
        metadata={"help": "Log model to wandb at `save_steps` frequency."},
    )

    seed_model: int = field(
        default=42,
        metadata={
            "help": "Random seed for the model that will be set at the beginning of training."
        },
    )

    push_to_hub: bool = field(
        default=False,
        metadata={
            "help": "Whether or not to upload the trained model to the model hub after training."
        },
    )

    resume_from_checkpoint: Optional[str] = field(
        default=None,
        metadata={"help": "Reference to a wandb artifact for resuming training."},
    )


class TrainState(train_state.TrainState):
    dropout_rng: jnp.ndarray = None
    epoch: int = 0
    train_time: float = 0.0  # total time the model trained
    train_samples: int = 0  # number of samples seen

    def replicate(self):
        return jax_utils.replicate(self).replace(
            dropout_rng=shard_prng_key(self.dropout_rng)
        )

    def restore_state(self, artifact_dir):
        # restore optimizer state
        with (Path(artifact_dir) / "opt_state.msgpack").open("rb") as f:
            new_opt_state = from_bytes(self.opt_state, f.read())

        # restore other parameters
        with (Path(artifact_dir) / "training_state.json").open("r") as f:
            training_state = json.load(f)

        # replace state
        return self.replace(
            opt_state=new_opt_state,
            step=training_state["step"],
            train_time=training_state["train_time"],
            train_samples=training_state["train_samples"],
        )

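# `replicate()` copies the full train state onto every local device and gives
# each device its own dropout RNG via `shard_prng_key`, which is what the
# `jax.pmap`-ed train/eval steps below expect; `flax.jax_utils.unreplicate`
# reads a single copy back for logging and checkpointing.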

def create_learning_rate_fn(
    num_warmup_steps: int,
    learning_rate: float,
    use_decay: bool,
    num_train_steps: int = None,  # used only with `use_decay`, typically train_size // batch_size * num_epochs
) -> Callable[[int], jnp.ndarray]:
    """Returns a linear warmup, linear decay learning rate function."""
    if use_decay:
        assert (
            num_train_steps is not None
        ), "Learning rate with decay requires number of training steps"
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
    )
    if not use_decay:
        return warmup_fn
    decay_fn = optax.linear_schedule(
        init_value=learning_rate,
        end_value=0,
        transition_steps=num_train_steps - num_warmup_steps,
    )
    schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps]
    )
    return schedule_fn

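# Illustrative schedule (hypothetical values): create_learning_rate_fn(4000, 1e-3, True, 100_000)
# rises linearly from 0 to 1e-3 over the first 4,000 steps, then decays
# linearly back to 0 by step 100,000; with use_decay=False the rate simply
# stays at the target value after warmup. The sweep above samples
# learning_rate from exp(-6.9) ≈ 1e-3 to exp(-3.5) ≈ 3e-2 and uses
# warmup_steps=4000.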

def wandb_log(metrics, step=None, prefix=None):
    if jax.process_index() == 0:
        log_metrics = {
            f"{prefix}/{k}" if prefix is not None else k: v for k, v in metrics.items()
        }
        if step is not None:
            log_metrics["train/step"] = step
        wandb.log(log_metrics)

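# Example: wandb_log({"loss": 0.5}, step=100, prefix="train") logs
# {"train/loss": 0.5, "train/step": 100}, and only on the main process.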

def main():
    # See all possible arguments by passing the --help flag to this script.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

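    # Illustrative invocations: either a single JSON file holding the
    # dataclass fields,
    #   python tools/train/train.py args.json
    # or explicit flags as in the `command` block of tools/train/sweep.yaml.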
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # Setup logging, we only want one process per machine to log things on the screen.
    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
    if jax.process_index() == 0:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Load dataset
    dataset = Dataset(
        **asdict(data_args),
        do_train=training_args.do_train,
        do_eval=training_args.do_eval,
    )

    # Set up wandb run
    wandb.init(
        entity="dalle-mini",
        project="dalle-mini",
        job_type="Seq2Seq",
        config=parser.parse_args(),
    )

    if training_args.resume_from_checkpoint is not None:
        artifact = wandb.run.use_artifact(training_args.resume_from_checkpoint)
        artifact_dir = artifact.download()

        # load model
        model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)
        # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
        print(model.params)

        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            artifact_dir,
            use_fast=True,
        )

    else:
        # Set up our new model config
        # TODO: simplify with custom config class
        if model_args.config_name:
            config = BartConfig.from_pretrained(model_args.config_name)
        else:
            config = BartConfig.from_pretrained(model_args.model_name_or_path)
        if model_args.image_vocab_size:
            config.image_vocab_size = model_args.image_vocab_size
        assert (
            getattr(config, "image_vocab_size") is not None
        ), "image_vocab_size must be specified when not present in base model/config"
        if model_args.image_length:
            config.image_length = model_args.image_length
        assert (
            getattr(config, "image_length") is not None
        ), "image_length must be specified when not present in base model/config"
        # we append decoder bos to image vocab
        config.decoder_start_token_id = config.image_vocab_size
        # ensure we don't generate bos (in addition to decoder start token)
        config.force_bos_token_to_be_generated = False
        config.forced_bos_token_id = None  # we don't need this token
        config.forced_eos_token_id = None  # we don't need this token

        config.tie_word_embeddings = False
        config.min_length = config.image_length + 1
        config.max_length = config.image_length + 1

        # below tokens need to be set to avoid error during generation (converted to jnp.array)
        # they are not expected to be used and are set to an unreachable token id
        config.bos_token_id = config.image_vocab_size + 1
        config.pad_token_id = config.image_vocab_size + 1
        config.eos_token_id = config.image_vocab_size + 1

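        # Worked example with the sweep's `--image_vocab_size 16384`: ids
        # 0..16383 are VQGAN codebook entries, 16384 is the decoder start
        # token appended above, and 16385 is the unreachable id assigned to
        # bos/pad/eos.
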
        # save whether we normalize the text
        if model_args.normalize_text is not None:
            config.normalize_text = model_args.normalize_text
        else:
            config.normalize_text = getattr(config, "normalize_text", False)

        # Load or create new model
        if model_args.model_name_or_path:
            model = CustomFlaxBartForConditionalGeneration.from_pretrained(
                model_args.model_name_or_path,
                config=config,
                seed=training_args.seed_model,
                dtype=getattr(jnp, model_args.dtype),
            )
            # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
            print(model.params)
        else:
            model = CustomFlaxBartForConditionalGeneration(
                config,
                seed=training_args.seed_model,
                dtype=getattr(jnp, model_args.dtype),
            )

        # Load tokenizer
        if model_args.tokenizer_name is not None:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name, use_fast=True
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.model_name_or_path,
                use_fast=True,
            )

    logger.info(f"TPUs: {jax.device_count()}")
    assert jax.device_count() == 8, "Expected 8 TPU cores; check for other running processes"

    # Preprocessing the datasets.
    # We need to normalize and tokenize inputs and targets.

    dataset.preprocess(
        tokenizer=tokenizer,
        decoder_start_token_id=model.config.decoder_start_token_id,
        normalize_text=model.config.normalize_text,
    )

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed_model)
    rng, dropout_rng = jax.random.split(rng)

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = (
        int(training_args.per_device_train_batch_size) * jax.device_count()
    )
    batch_size_per_update = train_batch_size * training_args.gradient_accumulation_steps
    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
    len_train_dataset, len_eval_dataset = dataset.length
    steps_per_epoch = (
        len_train_dataset // train_batch_size if len_train_dataset is not None else None
    )
    num_train_steps = (
        steps_per_epoch * num_epochs if steps_per_epoch is not None else None
    )

    # Create learning rate schedule
    learning_rate_fn = create_learning_rate_fn(
        training_args.warmup_steps,
        training_args.learning_rate,
        training_args.use_decay,
        num_train_steps,
    )

    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
    # Note that this mask is specifically adapted for FlaxBart.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        layer_norm_params = [
            (name, "scale")
            for name in [
                "self_attn_layer_norm",
                "layernorm_embedding",
                "final_layer_norm",
            ]
        ]
        flat_mask = {
            path: (path[-1] != "bias" and path[-2:] not in layer_norm_params)
            for path in flat_params
        }
        return traverse_util.unflatten_dict(flat_mask)

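    # For instance, a flattened path ending in ("self_attn_layer_norm", "scale")
    # or in "bias" maps to False (no weight decay), while one ending in
    # ("q_proj", "kernel") maps to True (decayed).
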
    # create optimizer
    if training_args.adafactor:
        # We use the default parameters here to initialize Adafactor.
        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
        optimizer = optax.adafactor(
            learning_rate=learning_rate_fn,
            weight_decay_rate=training_args.weight_decay,
            weight_decay_mask=decay_mask_fn,
            clipping_threshold=training_args.max_grad_norm,
        )
    else:
        optimizer = optax.adamw(
            learning_rate=learning_rate_fn,
            b1=training_args.adam_beta1,
            b2=training_args.adam_beta2,
            eps=training_args.adam_epsilon,
            weight_decay=training_args.weight_decay,
            mask=decay_mask_fn,
        )

    # add gradient accumulation
    if training_args.gradient_accumulation_steps > 1:
        optimizer = optax.chain(
            optax.apply_every(training_args.gradient_accumulation_steps), optimizer
        )

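    # optax.apply_every(k) accumulates incoming gradients and only lets an
    # update through every k steps (emitting zeros otherwise), so the
    # effective batch per update is train_batch_size * gradient_accumulation_steps.
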
    # Setup train state
    state = TrainState.create(
        apply_fn=model.__call__,
        params=model.params,
        tx=optimizer,
        dropout_rng=dropout_rng,
    )
    if training_args.resume_from_checkpoint is not None:
        # restore optimizer state and other parameters
        # we currently ignore partial epoch training: see https://github.com/borisdayma/dalle-mini/issues/105
        state = state.restore_state(artifact_dir)

    # cross entropy loss (no label smoothing)
    def loss_fn(logits, labels):
        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
        loss = loss.mean()
        return loss

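    # Shapes: logits is (batch, sequence, vocab) and labels is (batch, sequence);
    # onehot expands labels to the logits shape and the per-token cross entropy
    # is averaged over all positions in the batch.
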
    # Define gradient update step fn
    def train_step(state, batch, delta_time):
        dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)

        def compute_loss(params, batch):
            labels = batch.pop("labels")
            logits = state.apply_fn(
                **batch, params=params, dropout_rng=dropout_rng, train=True
            )[0]
            loss = loss_fn(logits, labels)
            return loss

        grad_fn = jax.value_and_grad(compute_loss)
        loss, grads = grad_fn(state.params, batch)
        grads = jax.lax.pmean(grads, "batch")
        state = state.apply_gradients(
            grads=grads,
            dropout_rng=new_dropout_rng,
            train_time=state.train_time + delta_time,
            train_samples=state.train_samples + train_batch_size,
        )

        metrics = {
            "loss": loss,
            "learning_rate": learning_rate_fn(state.step),
        }
        metrics = jax.lax.pmean(metrics, axis_name="batch")

        return state, metrics

    # Define eval fn
    def eval_step(params, batch):
        labels = batch.pop("labels")
        logits = model(**batch, params=params, train=False)[0]
        loss = loss_fn(logits, labels)

        # summarize metrics
        metrics = {"loss": loss}
        metrics = jax.lax.pmean(metrics, axis_name="batch")
        return metrics

    # Create parallel version of the train and eval step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
    p_eval_step = jax.pmap(eval_step, "batch")

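    # donate_argnums=(0,) tells XLA it may reuse the buffers of the incoming
    # state for the returned one, reducing peak memory; "batch" is the axis
    # name referenced by jax.lax.pmean inside both steps.
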
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len_train_dataset}")
    logger.info(f" Num Epochs = {num_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & gradient accumulation) = {batch_size_per_update}"
    )
    epochs = tqdm(
        range(state.epoch, num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0
    )

    # set default x-axis as 'train/step'
    wandb_log({}, step=state.step)
    wandb.define_metric("*", step_metric="train/step")

    # add interesting config parameters
    wandb.config.update(
        {
            "len_train_dataset": len_train_dataset,
            "len_eval_dataset": len_eval_dataset,
            "batch_size_per_update": batch_size_per_update,
        }
    )

    # replicate state on each device
    state = state.replicate()

    def run_evaluation():
        # ======================== Evaluating ==============================
        eval_metrics = []
        if training_args.do_eval:
            eval_loader = dataset.dataloader("eval", eval_batch_size)
            eval_steps = (
                len_eval_dataset // eval_batch_size
                if len_eval_dataset is not None
                else None
            )
            for batch in tqdm(
                eval_loader,
                desc="Evaluating...",
                position=2,
                leave=False,
                total=eval_steps,
            ):
                # Model forward
                metrics = p_eval_step(state.params, batch)
                eval_metrics.append(metrics)

            # normalize eval metrics
            eval_metrics = get_metrics(eval_metrics)
            eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

            # log metrics
            wandb_log(eval_metrics, step=unreplicate(state.step), prefix="eval")

            # Print metrics and update progress bar
            desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']})"
            epochs.write(desc)
            epochs.desc = desc

        return eval_metrics

    def run_save_model(state, eval_metrics=None):
        if jax.process_index() == 0:
            params = jax.device_get(unreplicate(state.params))
            # save model locally
            model.save_pretrained(
                training_args.output_dir,
                params=params,
            )

            # save tokenizer
            tokenizer.save_pretrained(training_args.output_dir)

            # save state
            opt_state = unreplicate(state.opt_state)
            with (Path(training_args.output_dir) / "opt_state.msgpack").open("wb") as f:
                f.write(to_bytes(opt_state))
            state_dict = {
                k: jax.device_get(unreplicate(getattr(state, k))).item()
                for k in ["step", "epoch", "train_time", "train_samples"]
            }
            with (Path(training_args.output_dir) / "training_state.json").open(
                "w"
            ) as f:
                json.dump(
                    state_dict,
                    f,
                )

            # save to W&B
            if training_args.log_model:
                # save some space
                c = wandb.wandb_sdk.wandb_artifacts.get_artifacts_cache()
                c.cleanup(wandb.util.from_human_size("10GB"))

                metadata = dict(state_dict)
                if eval_metrics is not None:
                    metadata["eval"] = eval_metrics
                artifact = wandb.Artifact(
                    name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
                )
                artifact.add_file(
                    str(Path(training_args.output_dir) / "flax_model.msgpack")
                )
                artifact.add_file(str(Path(training_args.output_dir) / "config.json"))
                artifact.add_file(
                    str(Path(training_args.output_dir) / "tokenizer.json")
                )
                artifact.add_file(
                    str(Path(training_args.output_dir) / "tokenizer_config.json")
                )
                artifact.add_file(str(Path(training_args.output_dir) / "vocab.json"))
                artifact.add_file(str(Path(training_args.output_dir) / "merges.txt"))
                artifact.add_file(
                    str(Path(training_args.output_dir) / "special_tokens_map.json")
                )
                artifact.add_file(
                    str(Path(training_args.output_dir) / "opt_state.msgpack")
                )
                artifact.add_file(
                    str(Path(training_args.output_dir) / "training_state.json")
                )

                wandb.run.log_artifact(artifact)

            # save to the hub
            if training_args.push_to_hub:
                model.save_pretrained(
                    training_args.output_dir,
                    params=params,
                    push_to_hub=training_args.push_to_hub,
                    commit_message=f"Saving weights and logs at step {unreplicate(state.step)+1}",
                    temp_dir=True,  # avoid issues with being in a repository
                )

    # init variables
    last_time = time.perf_counter()
    train_metrics = None

    for epoch in epochs:
        state = state.replace(epoch=jax_utils.replicate(epoch))
        # ======================== Training ================================
        wandb_log({"train/epoch": epoch}, step=unreplicate(state.step))

        # Generate an epoch by shuffling sampling indices from the train dataset
        train_loader = dataset.dataloader("train", train_batch_size)
        # train
        for batch in tqdm(
            train_loader,
            desc="Training...",
            position=1,
            leave=False,
            total=steps_per_epoch,
        ):

            # calculate delta time (we have a lag of one step but it's ok)
            new_time = time.perf_counter()
            delta_time = new_time - last_time
            last_time = new_time

            # train step
            state, train_metrics = p_train_step(
                state, batch, jax_utils.replicate(delta_time)
            )
            step = unreplicate(state.step)

            if step % training_args.logging_steps == 0 and jax.process_index() == 0:
                # log metrics
                metrics = unreplicate(train_metrics)
                # log state parameters
                state_dict = {
                    k.split("_")[-1]: unreplicate(getattr(state, k))
                    for k in ["epoch", "train_time", "train_samples"]
                }
                wandb_log({**metrics, **state_dict}, step=step, prefix="train")

            eval_metrics = None
            if training_args.eval_steps and step % training_args.eval_steps == 0:
                eval_metrics = run_evaluation()

            if step % training_args.save_steps == 0:
                run_save_model(state, eval_metrics)

        # log final train metrics
        if train_metrics is not None:
            train_metrics = unreplicate(train_metrics)
            wandb_log(train_metrics, step=step, prefix="train")

            epochs.write(
                f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metrics['loss']}, Learning Rate: {train_metrics['learning_rate']})"
            )

    # Final evaluation
    eval_metrics = run_evaluation()

    # save final checkpoint
    run_save_model(state, eval_metrics)


if __name__ == "__main__":
    main()