Spaces:

flax-community
/

dalle-mini

Running

App Files Files Community

boris committed on Jan 14, 2022

Commit

3a3d375

•

2 Parent(s): 26651dd af807f7

Merge pull request #115 from borisdayma/feat-shampoo

Browse files

Files changed (16) hide show

README.md +11 -0
dalle_mini/data.py +44 -16
dalle_mini/model.py +0 -64
dalle_mini/model/__init__.py +2 -0
dalle_mini/model/configuration.py +121 -0
dalle_mini/model/modeling.py +563 -0
dalle_mini/model/partitions.py +68 -0
setup.cfg +1 -0
tools/inference/inference_pipeline.ipynb +0 -0
tools/train/config/medium/config.json +33 -0
tools/train/config/mega/config.json +33 -0
tools/train/config/micro/config.json +33 -0
tools/train/config/mini/config.json +33 -0
tools/train/distributed_shampoo.py +1826 -0
tools/train/sweep.yaml +25 -30
tools/train/train.py +193 -157

README.md CHANGED Viewed

@@ -154,3 +154,14 @@ year = {2021}
       primaryClass={cs.CV}
 }
 ```

       primaryClass={cs.CV}
 }
 ```
+```
+@misc{anil2021scalable,
+      title={Scalable Second Order Optimization for Deep Learning},
+      author={Rohan Anil and Vineet Gupta and Tomer Koren and Kevin Regan and Yoram Singer},
+      year={2021},
+      eprint={2002.09018},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```

dalle_mini/data.py CHANGED Viewed

@@ -4,6 +4,7 @@ from functools import partial
 import jax
 import jax.numpy as jnp
 import numpy as np
 from datasets import Dataset, load_dataset
 from flax.training.common_utils import shard
@@ -15,12 +16,10 @@ class Dataset:
     dataset_repo_or_path: str
     train_file: str = None
     validation_file: str = None
-    dataset_type: str = "dataset"
     streaming: bool = True
     use_auth_token: bool = False
     text_column: str = "caption"
     encoding_column: str = "encoding"
-    max_source_length: int = 128
     max_train_samples: int = None
     max_eval_samples: int = None
     preprocessing_num_workers: int = None
@@ -28,13 +27,30 @@ class Dataset:
     do_train: bool = False
     do_eval: bool = True
     seed_dataset: int = None
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
     def __post_init__(self):
         # define data_files
         if self.train_file is not None or self.validation_file is not None:
             data_files = {
                 "train": self.train_file,
                 "validation": self.validation_file,
@@ -70,7 +86,7 @@ class Dataset:
                     else self.eval_dataset.select(range(self.max_eval_samples))
                 )
-    def preprocess(self, tokenizer, decoder_start_token_id, normalize_text):
         if self.streaming:
             # we need to shuffle early in streaming mode
             if hasattr(self, "train_dataset"):
@@ -112,7 +128,7 @@ class Dataset:
             tokenizer=tokenizer,
             text_column=self.text_column,
             encoding_column=self.encoding_column,
-            max_source_length=self.max_source_length,
             decoder_start_token_id=decoder_start_token_id,
         )
         for ds in ["train_dataset", "eval_dataset"]:
@@ -165,17 +181,29 @@ class Dataset:
                 batch = shard(batch)
                 yield batch
-        def _dataloader_datasets_streaming(dataset: Dataset, batch_size: int):
             keys = ["input_ids", "attention_mask", "labels", "decoder_input_ids"]
             batch = {k: [] for k in keys}
-            for item in dataset:
-                for k, v in item.items():
-                    batch[k].append(v)
-                if len(batch[keys[0]]) == batch_size:
-                    batch = {k: jnp.array(v) for k, v in batch.items()}
-                    batch = shard(batch)
-                    yield batch
-                    batch = {k: [] for k in keys}
         if split == "train":
             ds = self.train_dataset
@@ -187,7 +215,7 @@ class Dataset:
         if self.streaming:
             if split == "train":
                 ds.set_epoch(epoch)
-            return _dataloader_datasets_streaming(ds, batch_size)
         else:
             if split == "train":
                 self.rng_dataset, input_rng = jax.random.split(self.rng_dataset)
@@ -232,14 +260,14 @@ def preprocess_function(
     tokenizer,
     text_column,
     encoding_column,
-    max_source_length,
     decoder_start_token_id,
 ):
     inputs = examples[text_column]
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     model_inputs = tokenizer(
         inputs,
-        max_length=max_source_length,
         padding="max_length",
         truncation=True,
         return_tensors="np",

 import jax
 import jax.numpy as jnp
 import numpy as np
+from braceexpand import braceexpand
 from datasets import Dataset, load_dataset
 from flax.training.common_utils import shard
     dataset_repo_or_path: str
     train_file: str = None
     validation_file: str = None
     streaming: bool = True
     use_auth_token: bool = False
     text_column: str = "caption"
     encoding_column: str = "encoding"
     max_train_samples: int = None
     max_eval_samples: int = None
     preprocessing_num_workers: int = None
     do_train: bool = False
     do_eval: bool = True
     seed_dataset: int = None
+    shard_by_host: bool = False
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
+    multi_hosts: bool = field(init=False)
     def __post_init__(self):
+        self.multi_hosts = jax.process_count() > 1
         # define data_files
         if self.train_file is not None or self.validation_file is not None:
+            # accept braceexpand notation
+            for k in ["train_file", "validation_file"]:
+                f = getattr(self, k)
+                if isinstance(f, str):
+                    setattr(self, k, list(braceexpand(f)))
+            # for list of files, split training data shards by host
+            if (
+                isinstance(self.train_file, list)
+                and self.multi_hosts
+                and self.shard_by_host
+            ):
+                self.train_file = self.train_file[
+                    jax.process_index() :: jax.process_count()
+                ]
             data_files = {
                 "train": self.train_file,
                 "validation": self.validation_file,
                     else self.eval_dataset.select(range(self.max_eval_samples))
                 )
+    def preprocess(self, tokenizer, decoder_start_token_id, normalize_text, max_length):
         if self.streaming:
             # we need to shuffle early in streaming mode
             if hasattr(self, "train_dataset"):
             tokenizer=tokenizer,
             text_column=self.text_column,
             encoding_column=self.encoding_column,
+            max_length=max_length,
             decoder_start_token_id=decoder_start_token_id,
         )
         for ds in ["train_dataset", "eval_dataset"]:
                 batch = shard(batch)
                 yield batch
+        def _dataloader_datasets_streaming(
+            dataset: Dataset, batch_size: int, epoch: int
+        ):
+            """Yield sharded, fixed-size batches from a streaming dataset.
+
+            Items are accumulated per key until `batch_size` examples have been
+            collected; the lists are then converted to `jnp` arrays, passed
+            through `shard` and yielded. When `self.multi_hosts` is False the
+            dataset is traversed once and a trailing partial batch is never
+            yielded. When it is True the generator never terminates: after
+            each pass `epoch` is incremented and handed to `dataset.set_epoch`
+            before iterating again, and a partial batch left over from one
+            pass is completed with items from the next.
+            """
+            # epoch is only used for multi-host
             keys = ["input_ids", "attention_mask", "labels", "decoder_input_ids"]
             batch = {k: [] for k in keys}
+            first_loop = True
+            while self.multi_hosts or first_loop:
+                # in multi-host, we run forever (no epoch) as hosts need to stop
+                # at the same time and we don't know how much data is on each host
+                if not first_loop:
+                    # multi-host setting, we reshuffle shards
+                    epoch += 1
+                    dataset.set_epoch(epoch)
+                for item in dataset:
+                    for k, v in item.items():
+                        batch[k].append(v)
+                    # the first key is used as the reference for the current batch length
+                    if len(batch[keys[0]]) == batch_size:
+                        batch = {k: jnp.array(v) for k, v in batch.items()}
+                        batch = shard(batch)
+                        yield batch
+                        # start a fresh accumulator for the next batch
+                        batch = {k: [] for k in keys}
+                first_loop = False
         if split == "train":
             ds = self.train_dataset
         if self.streaming:
             if split == "train":
                 ds.set_epoch(epoch)
+            return _dataloader_datasets_streaming(ds, batch_size, epoch)
         else:
             if split == "train":
                 self.rng_dataset, input_rng = jax.random.split(self.rng_dataset)
     tokenizer,
     text_column,
     encoding_column,
+    max_length,
     decoder_start_token_id,
 ):
     inputs = examples[text_column]
     # Setting padding="max_length" as we need fixed length inputs for jitted functions
     model_inputs = tokenizer(
         inputs,
+        max_length=max_length,
         padding="max_length",
         truncation=True,
         return_tensors="np",

dalle_mini/model.py DELETED Viewed

@@ -1,64 +0,0 @@
-import flax.linen as nn
-import jax
-from transformers import BartConfig
-from transformers.models.bart.modeling_flax_bart import (
-    FlaxBartDecoder,
-    FlaxBartEncoder,
-    FlaxBartForConditionalGeneration,
-    FlaxBartForConditionalGenerationModule,
-    FlaxBartModule,
-)
-class CustomFlaxBartModule(FlaxBartModule):
-    def setup(self):
-        # we keep shared to easily load pre-trained weights
-        self.shared = nn.Embed(
-            self.config.vocab_size,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        # a separate embedding is used for the decoder
-        self.decoder_embed = nn.Embed(
-            self.config.image_vocab_size + 1,
-            self.config.d_model,
-            embedding_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        self.encoder = FlaxBartEncoder(
-            self.config, dtype=self.dtype, embed_tokens=self.shared
-        )
-        # the decoder has a different config
-        # TODO: should not be needed once we have custom config/module
-        decoder_config = BartConfig(self.config.to_dict())
-        decoder_config.max_position_embeddings = (
-            self.config.image_length + 1  # image tokens + BOS
-        )
-        decoder_config.vocab_size = self.config.image_vocab_size + 1
-        self.decoder = FlaxBartDecoder(
-            decoder_config, dtype=self.dtype, embed_tokens=self.decoder_embed
-        )
-class CustomFlaxBartForConditionalGenerationModule(
-    FlaxBartForConditionalGenerationModule
-):
-    def setup(self):
-        # set default config
-        self.config.normalize_text = getattr(self.config, "normalize_text", False)
-        self.config.image_length = getattr(self.config, "image_length", 256)
-        self.config.image_vocab_size = getattr(self.config, "image_vocab_size", 16384)
-        self.model = CustomFlaxBartModule(config=self.config, dtype=self.dtype)
-        self.lm_head = nn.Dense(
-            self.config.image_vocab_size + 1,  # encoded image token space + 1 for bos
-            use_bias=False,
-            kernel_init=jax.nn.initializers.normal(self.config.init_std),
-        )
-        self.final_logits_bias = self.param(
-            "final_logits_bias", self.bias_init, (1, self.config.image_vocab_size + 1)
-        )
-class CustomFlaxBartForConditionalGeneration(FlaxBartForConditionalGeneration):
-    module_class = CustomFlaxBartForConditionalGenerationModule

dalle_mini/model/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .configuration import DalleBartConfig
2	+ from .modeling import DalleBart

dalle_mini/model/configuration.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DalleBart model configuration """
+import warnings
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class DalleBartConfig(PretrainedConfig):
+    """
+    Configuration for the DalleBart model.
+
+    Stores the encoder/decoder architecture hyper-parameters together with the
+    text and image token settings. The special token ids and the generation
+    length limits are never taken from the caller: they are derived from
+    `image_vocab_size` and `image_length` and any such keys found in `kwargs`
+    are discarded.
+    """
+    model_type = "dallebart"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_attention_heads": "encoder_attention_heads",
+        "hidden_size": "d_model",
+    }
+    def __init__(
+        self,
+        normalize_text=False,
+        encoder_vocab_size=50264,
+        image_vocab_size=16384,  # encoded image token space
+        image_length=256,  # number of encoded tokens
+        max_text_length=64,  # max number of text tokens
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        gradient_checkpointing=False,
+        use_cache=True,
+        is_encoder_decoder=True,
+        forced_eos_token_id=None,
+        tie_word_embeddings=False,  # different modalities and sizes
+        **kwargs,
+    ):
+        """
+        Store the hyper-parameters and initialize the parent config.
+
+        All named arguments are stored as attributes of the same name or
+        forwarded to `PretrainedConfig.__init__`. Remaining `kwargs` are
+        forwarded as well, except for the inferred keys listed below.
+        """
+        self.normalize_text = normalize_text
+        self.encoder_vocab_size = encoder_vocab_size
+        self.image_vocab_size = image_vocab_size
+        self.image_length = image_length
+        self.max_text_length = max_text_length
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = (
+            scale_embedding  # scale factor will be sqrt(d_model) if True
+        )
+        # remove inferred keys to prevent errors when loading config (passed as kwargs)
+        for k in [
+            "pad_token_id",
+            "bos_token_id",
+            "eos_token_id",
+            "decoder_start_token_id",
+            "min_length",
+            "max_length",
+        ]:
+            kwargs.pop(k, None)
+        super().__init__(
+            # needed to avoid errors during generation (converted to jnp.array)
+            pad_token_id=image_vocab_size + 1,
+            bos_token_id=image_vocab_size + 1,  # set to unreachable values
+            eos_token_id=image_vocab_size + 1,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=image_vocab_size,  # BOS appended to vocab
+            forced_eos_token_id=forced_eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            # min_length == max_length: generation length is fixed to image_length + 1
+            min_length=image_length + 1,
+            max_length=image_length + 1,
+            **kwargs,
+        )
+        # ensure backward compatibility for BART CNN models
+        if self.forced_bos_token_id is None and kwargs.get(
+            "force_bos_token_to_be_generated", False
+        ):
+            self.forced_bos_token_id = self.bos_token_id
+            # the trailing space keeps the two sentences separated once the
+            # adjacent string literals are concatenated
+            warnings.warn(
+                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                "The config can simply be saved and uploaded again to be fixed."
+            )

dalle_mini/model/modeling.py ADDED Viewed

	@@ -0,0 +1,563 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team and the DalleBart team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DalleBart model. """
+import math
+from functools import partial
+from typing import Optional, Tuple
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import unfreeze
+from flax.linen import make_causal_mask
+from flax.traverse_util import flatten_dict
+from jax.random import PRNGKey
+from transformers.modeling_flax_outputs import (
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxSeq2SeqLMOutput,
+)
+from transformers.modeling_flax_utils import ACT2FN
+from transformers.models.bart.modeling_flax_bart import (
+    FlaxBartAttention,
+    FlaxBartDecoder,
+    FlaxBartDecoderLayer,
+    FlaxBartDecoderLayerCollection,
+    FlaxBartEncoder,
+    FlaxBartEncoderLayer,
+    FlaxBartEncoderLayerCollection,
+    FlaxBartForConditionalGeneration,
+    FlaxBartForConditionalGenerationModule,
+    FlaxBartModule,
+    FlaxBartPreTrainedModel,
+)
+from transformers.utils import logging
+from .configuration import DalleBartConfig
+logger = logging.get_logger(__name__)
+class FlaxBartAttention(FlaxBartAttention):
+    """
+    Attention layer built on the upstream `FlaxBartAttention`; only `setup` is redefined.
+
+    Edits:
+    - causal mask is used only in decoder and considers image_length + 1 (for BOS)
+    """
+    def setup(self) -> None:
+        embed_dim, num_heads = self.embed_dim, self.num_heads
+        self.head_dim = embed_dim // num_heads
+        # the integer division above must be exact
+        if self.head_dim * num_heads != embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        # all four projections share the same constructor arguments
+        make_projection = partial(
+            nn.Dense,
+            embed_dim,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.q_proj = make_projection()
+        self.k_proj = make_projection()
+        self.v_proj = make_projection()
+        self.out_proj = make_projection()
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+        if not self.causal:
+            return
+        # used only in decoder: mask covers image_length + 1 positions (BOS included)
+        mask_template = jnp.ones((1, self.config.image_length + 1), dtype="bool")
+        self.causal_mask = make_causal_mask(mask_template, dtype="bool")
+class FlaxBartEncoderLayer(FlaxBartEncoderLayer):
+    """
+    Encoder layer; only `setup` is redefined, everything else comes from the
+    upstream `FlaxBartEncoderLayer`.
+
+    Edits:
+    - no bias
+    - use custom FlaxBartAttention
+    """
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        # self-attention built from the custom FlaxBartAttention, without bias terms
+        self.self_attn = FlaxBartAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.encoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            bias=False,
+            dtype=self.dtype,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        # activation is looked up by name in the transformers ACT2FN table
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        # feed-forward pair: d_model -> encoder_ffn_dim -> d_model, both without bias
+        self.fc1 = nn.Dense(
+            self.config.encoder_ffn_dim,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fc2 = nn.Dense(
+            self.embed_dim,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+class FlaxBartEncoderLayerCollection(FlaxBartEncoderLayerCollection):
+    """
+    Stack of encoder layers; only `setup` is redefined.
+
+    Edits:
+    - use custom FlaxBartEncoderLayer
+    - allow Gradient Checkpointing (nn.remat)
+    """
+    def setup(self):
+        # wrap the layer class with nn.remat only when checkpointing is requested
+        if self.config.gradient_checkpointing:
+            layer_cls = nn.remat(FlaxBartEncoderLayer)
+        else:
+            layer_cls = FlaxBartEncoderLayer
+        num_layers = self.config.encoder_layers
+        # each layer is named after its index in the stack
+        self.layers = [
+            layer_cls(self.config, name=str(idx), dtype=self.dtype)
+            for idx in range(num_layers)
+        ]
+        self.layerdrop = self.config.encoder_layerdrop
+class FlaxBartDecoderLayer(FlaxBartDecoderLayer):
+    """
+    Decoder layer; only `setup` is redefined, everything else comes from the
+    upstream `FlaxBartDecoderLayer`.
+
+    Edits:
+    - no bias
+    - uses custom FlaxBartAttention
+    """
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        # causal self-attention over the decoder tokens, without bias terms
+        self.self_attn = FlaxBartAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            causal=True,
+            bias=False,
+            dtype=self.dtype,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        # activation is looked up by name in the transformers ACT2FN table
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        # second attention module (not causal), without bias terms
+        self.encoder_attn = FlaxBartAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            bias=False,
+            dtype=self.dtype,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        # feed-forward pair: d_model -> decoder_ffn_dim -> d_model, both without bias.
+        # The width is read from decoder_ffn_dim (not encoder_ffn_dim) so that the
+        # decoder setting is honoured; the two are equal under the
+        # DalleBartConfig defaults, so default-sized models are unaffected.
+        self.fc1 = nn.Dense(
+            self.config.decoder_ffn_dim,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fc2 = nn.Dense(
+            self.embed_dim,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+class FlaxBartDecoderLayerCollection(FlaxBartDecoderLayerCollection):
+    """
+    Stack of decoder layers; only `setup` is redefined.
+
+    Edits:
+    - use custom FlaxBartDecoderLayer
+    - allow Gradient Checkpointing (nn.remat)
+    """
+    def setup(self):
+        # wrap the layer class with nn.remat only when checkpointing is requested
+        if self.config.gradient_checkpointing:
+            layer_cls = nn.remat(FlaxBartDecoderLayer)
+        else:
+            layer_cls = FlaxBartDecoderLayer
+        num_layers = self.config.decoder_layers
+        # each layer is named after its index in the stack
+        self.layers = [
+            layer_cls(self.config, name=str(idx), dtype=self.dtype)
+            for idx in range(num_layers)
+        ]
+        self.layerdrop = self.config.decoder_layerdrop
+class FlaxBartEncoder(FlaxBartEncoder):
+    """
+    Encoder; only `setup` is redefined, everything else comes from the
+    upstream `FlaxBartEncoder`.
+
+    Edits:
+    - offset set to 0 (no padding token)
+    - use max_text_length instead of max_position_embeddings
+    - use custom FlaxBartEncoderLayerCollection
+    - embed_tokens cannot be None (issue at compile time)
+    """
+    def setup(self):
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        embed_dim = self.config.d_model
+        self.padding_idx = self.config.pad_token_id
+        # scale factor is sqrt(d_model) when scale_embedding is set, otherwise 1.0
+        self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0
+        # Upstream Bart offsets the embedding ids by 2 when padding_idx is specified
+        # and adjusts num_embeddings accordingly. No offset is applied here.
+        self.offset = 0
+        # one position embedding per text position, up to max_text_length
+        self.embed_positions = nn.Embed(
+            self.config.max_text_length + self.offset,
+            embed_dim,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.layers = FlaxBartEncoderLayerCollection(self.config, self.dtype)
+        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+class FlaxBartDecoder(FlaxBartDecoder):
+    """
+    Decoder; only `setup` is redefined, everything else comes from the
+    upstream `FlaxBartDecoder`.
+
+    Edits:
+    - offset set to 0 (no padding token)
+    - use image_length + 1 (for BOS) instead of max_position_embeddings
+    - use custom FlaxBartDecoderLayerCollection
+    - embed_tokens cannot be None (issue at compile time)
+    """
+    def setup(self):
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        embed_dim = self.config.d_model
+        self.padding_idx = self.config.pad_token_id
+        # scale factor is sqrt(d_model) when scale_embedding is set, otherwise 1.0
+        self.embed_scale = (
+            math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0
+        )
+        # Upstream Bart offsets the embedding ids by 2 when padding_idx is specified
+        # and adjusts num_embeddings accordingly. No offset is applied here.
+        self.offset = 0
+        self.embed_positions = nn.Embed(
+            self.config.image_length + 1 + self.offset,  # image length + 1 for BOS
+            embed_dim,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.layers = FlaxBartDecoderLayerCollection(self.config, self.dtype)
+        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+class FlaxBartModule(FlaxBartModule):
+    """
+    Encoder-decoder module; only `setup` is redefined.
+
+    Edits:
+    - use custom FlaxBartEncoder & FlaxBartDecoder
+    - use separate embeddings for Encoder & Decoder
+    """
+    def setup(self):
+        # encoder token embedding table, sized by encoder_vocab_size
+        encoder_embed_tokens = nn.Embed(
+            self.config.encoder_vocab_size,
+            self.config.d_model,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        # decoder token embedding table; it is a separate table from the encoder's
+        decoder_embed_tokens = nn.Embed(
+            self.config.image_vocab_size + 1,  # image vocab size + 1 for BOS
+            self.config.d_model,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.encoder = FlaxBartEncoder(
+            self.config, dtype=self.dtype, embed_tokens=encoder_embed_tokens
+        )
+        self.decoder = FlaxBartDecoder(
+            self.config, dtype=self.dtype, embed_tokens=decoder_embed_tokens
+        )
+class FlaxBartPreTrainedModel(FlaxBartPreTrainedModel):
+    """
+    Edits:
+    - added num_params property
+    - config_class replaced to DalleBartConfig
+    - __init__ accepts abstract_init, which initializes the model using only parameter shapes
+    """
+    config_class = DalleBartConfig
+    def __init__(
+        self,
+        config: DalleBartConfig,
+        input_shape: Tuple[int] = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        abstract_init: bool = False,
+        **kwargs,
+    ):
+        """
+        Build the module and initialize its parameters.
+
+        Args:
+            config: model configuration; must not be None.
+            input_shape: shape passed to `init_weights`.
+            seed: seed used to create the PRNG key stored in `self.key`.
+            dtype: dtype forwarded to the module and stored in `self.dtype`.
+            abstract_init: when True, `init_weights` is only evaluated through
+                `jax.eval_shape`, so `self.params` holds shape information
+                without actual values and weights need to be loaded later.
+            **kwargs: forwarded to `module_class`.
+
+        Raises:
+            ValueError: if `config` is None, or if the constructed module is None.
+        """
+        # adapted from HuggingFace FlaxPreTrainedModel
+        # validate the config before it is handed to the module constructor
+        if config is None:
+            raise ValueError("config cannot be None")
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        if module is None:
+            raise ValueError("module cannot be None")
+        # Those are private to be exposed as typed property on derived classes.
+        self._config = config
+        self._module = module
+        # Those are public as their type is generic to every derived classes.
+        self.key = PRNGKey(seed)
+        self.dtype = dtype
+        # randomly initialized parameters
+        if abstract_init:
+            # init the model weights only abstractly, eval_shape will return a pytree
+            # with the structure as weights but without any actual values, this will just contain
+            # the shape information. Weights need to be loaded later.
+            init_fn = partial(self.init_weights, input_shape=input_shape)
+            random_params = jax.eval_shape(init_fn, self.key)
+        else:
+            random_params = self.init_weights(self.key, input_shape)
+        # save required_params as set
+        self._required_params = set(flatten_dict(unfreeze(random_params)).keys())
+        self.params = random_params
+    @property
+    def num_params(self):
+        """Return the total number of elements over all parameter leaves."""
+        # the flattened dict maps each parameter path to a single leaf, so a
+        # plain sum over its values is enough; no tree utilities are needed
+        flat_params = flatten_dict(unfreeze(self.params))
+        return sum(param.size for param in flat_params.values())
+class FlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationModule):
+    """
+    Encoder-decoder model followed by a bias-free head producing logits.
+
+    Edits:
+    - no bias
+    - lm_head set to image_vocab_size + 1 (for BOS)
+    - uses custom FlaxBartModule
+    """
+    def setup(self):
+        self.model = FlaxBartModule(config=self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.image_vocab_size + 1,  # image vocab size + 1 for BOS
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        decoder_input_ids,
+        decoder_attention_mask,
+        position_ids,
+        decoder_position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        """
+        Run `self.model` and project its first output through `lm_head`.
+
+        All arguments are forwarded unchanged to `self.model`. Returns a
+        `FlaxSeq2SeqLMOutput` when `return_dict` is true, otherwise a tuple
+        whose first element is the logits followed by the remaining model
+        outputs.
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            position_ids=position_ids,
+            decoder_position_ids=decoder_position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            # NOTE(review): this branch reads a "shared" embedding, but the custom
+            # FlaxBartModule in this file only creates separate encoder/decoder
+            # embeddings - looks like it would fail if tie_word_embeddings were
+            # enabled; confirm before relying on it.
+            shared_embedding = self.model.variables["params"]["shared"]["embedding"]
+            lm_logits = self.lm_head.apply(
+                {"params": {"kernel": shared_embedding.T}}, hidden_states
+            )
+        else:
+            lm_logits = self.lm_head(hidden_states)
+        if not return_dict:
+            # tuple output: logits replace the first element of the model outputs
+            output = (lm_logits,) + outputs[1:]
+            return output
+        return FlaxSeq2SeqLMOutput(
+            logits=lm_logits,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+class DalleBart(FlaxBartPreTrainedModel, FlaxBartForConditionalGeneration):
+    """
+    Conditional-generation model wrapping `FlaxBartForConditionalGenerationModule`.
+    Edits:
+    - renamed from FlaxBartForConditionalGeneration
+    - uses custom FlaxBartPreTrainedModel
+    - uses custom FlaxBartForConditionalGenerationModule
+    - no bias in decode method
+    """
+    module_class = FlaxBartForConditionalGenerationModule
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        """Run the decoder on `decoder_input_ids` and project to LM logits.
+        Args:
+          decoder_input_ids: 2D array of decoder token ids; its shape is unpacked
+            as (batch_size, sequence_length).
+          encoder_outputs: encoder result; element 0 is used as the encoder
+            hidden states.
+          encoder_attention_mask: defaults to all ones over the first two
+            dimensions of the encoder hidden states.
+          decoder_attention_mask: defaults to all ones shaped like
+            `decoder_input_ids`.
+          decoder_position_ids: defaults to `arange(sequence_length)` broadcast
+            over the batch; must be given when `past_key_values` is passed.
+          past_key_values: cache collection forwarded to the module as "cache".
+          output_attentions / output_hidden_states / return_dict: fall back to
+            the corresponding `self.config` values when None.
+          train: when True the module is applied with `deterministic=False`.
+          params: parameters to use instead of `self.params`.
+          dropout_rng: PRNG key registered under the "dropout" rng name.
+        Returns:
+          A `FlaxCausalLMOutputWithCrossAttentions` when `return_dict` is truthy,
+          otherwise a tuple starting with the logits. When `past_key_values` is
+          not None the updated cache is added to the output.
+        Raises:
+          ValueError: if `past_key_values` is given without
+            `decoder_position_ids`.
+        """
+        # Resolve the output flags against the model config defaults.
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.return_dict
+        )
+        encoder_hidden_states = encoder_outputs[0]
+        if encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        # From here on batch_size/sequence_length refer to the decoder inputs.
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError(
+                    "Make sure to provide `decoder_position_ids` when passing `past_key_values`."
+                )
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxBartAttention module
+        # NOTE(review): this branch tests truthiness while the unpacking below
+        # tests `is None`; an empty (falsy, non-None) cache would take the
+        # non-mutable path here but the mutable unpacking path later - confirm
+        # callers never pass an empty cache.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        def _decoder_forward(
+            module,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+            **kwargs,
+        ):
+            # Runs only the decoder sub-module, then applies the LM head
+            # (no bias term is added to the logits here).
+            decoder_module = module._get_decoder_module()
+            outputs = decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+            hidden_states = outputs[0]
+            if self.config.tie_word_embeddings:
+                # Tied weights: reuse the transposed shared embedding as the
+                # LM head kernel.
+                shared_embedding = module.model.variables["params"]["shared"][
+                    "embedding"
+                ]
+                lm_logits = module.lm_head.apply(
+                    {"params": {"kernel": shared_embedding.T}}, hidden_states
+                )
+            else:
+                lm_logits = module.lm_head(hidden_states)
+            return lm_logits, outputs
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        # With a cache, the result is unpacked as (method output, mutated
+        # variables); without one it is just the method output.
+        if past_key_values is None:
+            lm_logits, decoder_outputs = outputs
+        else:
+            (lm_logits, decoder_outputs), past = outputs
+        if return_dict:
+            outputs = FlaxCausalLMOutputWithCrossAttentions(
+                logits=lm_logits,
+                hidden_states=decoder_outputs.hidden_states,
+                attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+            )
+        else:
+            outputs = (lm_logits,) + decoder_outputs[1:]
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            # Tuple output: the cache is inserted right after the logits.
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+        return outputs

dalle_mini/model/partitions.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import re
+from flax.core.frozen_dict import freeze
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax.experimental import PartitionSpec as P
+# utils adapted from https://github.com/google-research/google-research/blob/master/flax_models/t5x/partitions.py
+# Sentinels
+# `_unmatched` is the placeholder value `set_partitions` gives every flattened
+# key before the rules are applied; a key still holding it matched no rule.
+_unmatched = object()
+# For specifying empty leaf dict `{}`
+empty_dict = object()
+def _match(qs, ks):
+    """Return True if regexes in qs match any window of strings in tuple ks."""
+    # compile regexes and force complete match
+    qts = tuple(map(lambda x: re.compile(x + "$"), qs))
+    for i in range(len(ks) - len(qs) + 1):
+        matches = [x.match(y) for x, y in zip(qts, ks[i:])]
+        if matches and all(matches):
+            return True
+    return False
+def _replacement_rules(rules):
+    def replace(key, val):
+        for rule, replacement in rules:
+            if _match(rule, key):
+                return replacement
+        return val
+    return replace
+def _get_partition_rules():
+    """Return the ordered (key-pattern tuple, partition spec) rules.
+    Each pattern tuple is matched against a window of a flattened parameter
+    path; the first matching rule supplies the spec. "mp" names a mesh axis
+    (presumably model-parallel - confirm against the mesh definition); a rule
+    value of `None` means no PartitionSpec is assigned to that parameter.
+    """
+    return [
+        # embeddings
+        ((r"embed_positions", "embedding"), P("mp", None)),
+        ((r"embed_tokens", "embedding"), P("mp", None)),
+        # self-attention
+        ((r"self_attn", "(q_proj|k_proj|v_proj)", "kernel"), P(None, "mp")),
+        ((r"self_attn", "out_proj", "kernel"), P("mp", None)),
+        # enc-dec attention
+        ((r"encoder_attn", "(q_proj|k_proj|v_proj)", "kernel"), P(None, "mp")),
+        ((r"encoder_attn", "out_proj", "kernel"), P("mp", None)),
+        # FFN
+        ((r"fc1", "kernel"), P(None, "mp")),
+        ((r"fc2", "kernel"), P("mp", None)),
+        # layer norms
+        ((r"layernorm_embedding", "(bias|scale)"), None),
+        ((r"self_attn_layer_norm", "(bias|scale)"), None),
+        ((r"encoder_attn_layer_norm", "(bias|scale)"), None),
+        ((r"final_layer_norm", "(bias|scale)"), None),
+        # LM head (output projection)
+        ((r"lm_head", "kernel"), P(None, "mp")),
+    ]
+def set_partitions(in_dict):
+    rules = _get_partition_rules()
+    replace = _replacement_rules(rules)
+    initd = {k: _unmatched for k in flatten_dict(in_dict)}
+    result = {k: replace(k, v) for k, v in initd.items()}
+    for k, v in result.items():
+        if v == _unmatched:
+            print(k)
+    assert _unmatched not in result.values(), "Incomplete partition spec."
+    return freeze(unflatten_dict(result))

setup.cfg CHANGED Viewed

@@ -23,5 +23,6 @@ dev =
     tqdm
     wandb
     optax
     black[jupyter]
     isort

     tqdm
     wandb
     optax
+    braceexpand
     black[jupyter]
     isort

tools/inference/inference_pipeline.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

tools/train/config/medium/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "attention_dropout": 0.0,
+  "bos_token_id": 16385,
+  "classifier_dropout": 0.0,
+  "d_model": 1536,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 18,
+  "decoder_start_token_id": 16384,
+  "dropout": 0.1,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 18,
+  "encoder_vocab_size": 50264,
+  "eos_token_id": 16385,
+  "gradient_checkpointing": false,
+  "image_length": 256,
+  "image_vocab_size": 16384,
+  "init_std": 0.01,
+  "is_encoder_decoder": true,
+  "max_text_length": 64,
+  "model_type": "dallebart",
+  "normalize_text": true,
+  "pad_token_id": 16385,
+  "scale_embedding": false,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.13.0.dev0",
+  "use_cache": true
+}

tools/train/config/mega/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "attention_dropout": 0.0,
+  "bos_token_id": 16385,
+  "classifier_dropout": 0.0,
+  "d_model": 2048,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 31,
+  "decoder_start_token_id": 16384,
+  "dropout": 0.1,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 31,
+  "encoder_vocab_size": 50264,
+  "eos_token_id": 16385,
+  "gradient_checkpointing": false,
+  "image_length": 256,
+  "image_vocab_size": 16384,
+  "init_std": 0.01,
+  "is_encoder_decoder": true,
+  "max_text_length": 64,
+  "model_type": "dallebart",
+  "normalize_text": true,
+  "pad_token_id": 16385,
+  "scale_embedding": false,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.13.0.dev0",
+  "use_cache": true
+}

tools/train/config/micro/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "attention_dropout": 0.0,
+  "bos_token_id": 16385,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 2048,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 16384,
+  "dropout": 0.1,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 2048,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "encoder_vocab_size": 50264,
+  "eos_token_id": 16385,
+  "gradient_checkpointing": false,
+  "image_length": 256,
+  "image_vocab_size": 16384,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_text_length": 64,
+  "model_type": "dallebart",
+  "normalize_text": true,
+  "pad_token_id": 16385,
+  "scale_embedding": false,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.13.0.dev0",
+  "use_cache": true
+}

tools/train/config/mini/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "attention_dropout": 0.0,
+  "bos_token_id": 16385,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 16384,
+  "dropout": 0.1,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "encoder_vocab_size": 50264,
+  "eos_token_id": 16385,
+  "gradient_checkpointing": false,
+  "image_length": 256,
+  "image_vocab_size": 16384,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_text_length": 64,
+  "model_type": "dallebart",
+  "normalize_text": true,
+  "pad_token_id": 16385,
+  "scale_embedding": false,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.13.0.dev0",
+  "use_cache": true
+}

tools/train/distributed_shampoo.py ADDED Viewed

	@@ -0,0 +1,1826 @@

+"""File copied from https://github.com/google-research/google-research/edit/master/scalable_shampoo/optax/distributed_shampoo.py"""
+# coding=utf-8
+# Copyright 2021 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# An implementation of distributed Shampoo optimizer from:
+#
+#  Scalable Second Order Optimization for Deep Learning
+#  Rohan Anil, Vineet Gupta, Tomer Koren, Kevin Regan, Yoram Singer
+#  Preprint Paper: https://arxiv.org/abs/2002.09018
+#
+# This implementation moves computation of inverse pth root back to the
+# accelerator (if higher precision is available).
+#
+# Authors: Rohan Anil (rohananil at google dot com)
+#    &     Vineet Gupta (vineet at google dot com)
+#
+"""Distributed Shampoo Implementation."""
+import enum
+import functools
+import itertools
+from typing import Any, List, NamedTuple
+import chex
+import jax
+import jax.experimental.pjit as pjit
+import jax.numpy as jnp
+import numpy as np
+import optax
+from flax import struct
+from jax import lax
+# pylint:disable=no-value-for-parameter
+@struct.dataclass
+class QuantizedValue:
+    """State associated with quantized value.
+    Holds a tensor either as-is (float32), cast to bfloat16, or quantized to
+    int8/int16 with one scale (`bucket_size`) per slice along axis 0's
+    complement (the max is reduced over axis 0). `to_float` reverses the
+    encoding.
+    """
+    quantized: chex.Array
+    diagonal: chex.Array  # Diagonal (if extract_diagonal is set)
+    bucket_size: chex.Array
+    quantized_dtype: jnp.dtype = struct.field(
+        pytree_node=False
+    )  # Dtype for the quantized value.
+    extract_diagonal: bool = struct.field(pytree_node=False)  # In case its centered.
+    shape: Any = struct.field(pytree_node=False)  # Shape of the tensor.
+    @classmethod
+    def from_float_value(cls, fvalue, quantized_dtype, extract_diagonal=False):
+        """Build a QuantizedValue from a float array (or an empty list).
+        An empty list is stored unchanged with empty diagonal/bucket/shape
+        placeholders; anything else goes through `quantize`.
+        """
+        if isinstance(fvalue, list) and not fvalue:
+            return QuantizedValue([], [], [], quantized_dtype, extract_diagonal, [])
+        quantized, diagonal_fvalue, bucket_size = QuantizedValue.quantize(
+            fvalue, quantized_dtype, extract_diagonal
+        )
+        return QuantizedValue(
+            quantized,
+            diagonal_fvalue,
+            bucket_size,
+            quantized_dtype,
+            extract_diagonal,
+            list(quantized.shape),
+        )
+    # Quantization is from Lingvo JAX optimizers.
+    # We extend it for int16 quantization of PSD matrices.
+    @classmethod
+    def quantize(cls, fvalue, quantized_dtype, extract_diagonal=False):
+        """Returns quantized value and the bucket.
+        Args:
+          fvalue: float array to encode.
+          quantized_dtype: jnp.float32, jnp.bfloat16, jnp.int8 or jnp.int16.
+          extract_diagonal: if True (integer dtypes only), the diagonal of the
+            2D input is stored separately and zeroed before quantization.
+        Returns:
+          (quantized, diagonal, bucket_size); the last two are empty lists when
+          not applicable.
+        Raises:
+          ValueError: for an unsupported dtype, a non-2D input with
+            `extract_diagonal`, or a 0-dimensional input.
+        """
+        # Float dtypes are stored directly; diagonal extraction and bucket
+        # scaling are skipped for them.
+        if quantized_dtype == jnp.float32:
+            return fvalue, [], []
+        elif quantized_dtype == jnp.bfloat16:
+            return fvalue.astype(jnp.bfloat16), [], []
+        float_dtype = fvalue.dtype
+        if quantized_dtype == jnp.int8:
+            # value -128 is not used.
+            num_buckets = jnp.array(127.0, dtype=float_dtype)
+        elif quantized_dtype == jnp.int16:
+            # value -32768 is not used.
+            num_buckets = jnp.array(32767.0, dtype=float_dtype)
+        else:
+            raise ValueError(f"Quantized dtype {quantized_dtype} not supported.")
+        # max value is mapped to num_buckets
+        if extract_diagonal and fvalue.ndim != 2:
+            raise ValueError(
+                f"Input array {fvalue} must be 2D to work with extract_diagonal."
+            )
+        diagonal_fvalue = []
+        if extract_diagonal:
+            diagonal_fvalue = jnp.diag(fvalue)
+            # Remove the diagonal entries.
+            fvalue = fvalue - jnp.diag(diagonal_fvalue)
+        # TODO(rohananil): Extend this by making use of information about the blocks
+        # SM3 style which will be useful for diagonal statistics
+        # We first decide the scale.
+        if fvalue.ndim < 1:
+            raise ValueError(
+                f"Input array {fvalue} must have a strictly positive number of "
+                "dimensions."
+            )
+        # One scale per position along the trailing axes: the max is taken
+        # over axis 0 only.
+        max_abs = jnp.max(jnp.abs(fvalue), axis=0)
+        bucket_size = max_abs / num_buckets
+        bs_expanded = bucket_size[jnp.newaxis, Ellipsis]
+        # To avoid divide by 0.0
+        bs_nonzero = jnp.where(
+            bs_expanded > 0.0, bs_expanded, jnp.ones_like(bs_expanded)
+        )
+        ratio = fvalue / bs_nonzero
+        # We use rounding to remove bias.
+        quantized = jnp.round(ratio)
+        return quantized.astype(quantized_dtype), diagonal_fvalue, bucket_size
+    def to_float(self):
+        """Returns the float value.
+        Empty-list and float32 payloads are returned unchanged, bfloat16 is cast
+        to float32, and integer payloads are rescaled by `bucket_size` (with the
+        stored diagonal added back when `extract_diagonal` is set).
+        """
+        if isinstance(self.quantized, list) and not self.quantized:
+            return self.quantized
+        if self.quantized_dtype == jnp.float32:
+            return self.quantized
+        if self.quantized_dtype == jnp.bfloat16:
+            return self.quantized.astype(jnp.float32)
+        float_dtype = self.bucket_size.dtype
+        # Re-add the leading axis that was reduced away when the scale was
+        # computed so it broadcasts against the quantized values.
+        bucket_size = self.bucket_size[jnp.newaxis, Ellipsis]
+        val = self.quantized.astype(float_dtype) * bucket_size
+        if self.extract_diagonal:
+            val += jnp.diag(self.diagonal)
+        return val
+# Per parameter optimizer state used in data-parallel training.
+class ParameterStats(NamedTuple):
+    """State associated to each parameter of the model being trained."""
+    diagonal_statistics: QuantizedValue  # Accumulator for diagonal preconditioner
+    # `statistics` and `preconditioners` hold one entry per preconditioned
+    # block/axis of the parameter.
+    statistics: List[Any]  # Statistics (QuantizedValue, chex.Array)
+    preconditioners: List[Any]  # Preconditioners (QuantizedValue, chex.Array)
+    diagonal_momentum: QuantizedValue  # Momentum for the diagonal preconditioner
+    momentum: QuantizedValue  # Momentum for the shampoo preconditioner
+# For training extremely large model; We keep a global state with a concatenated
+# statistics and preconditioner states for all vars. This is so that we can
+# annotate the leading axis to be sharded to save memory at the cost of
+# communication.
+@struct.dataclass
+class GlobalShardedParameterStats:
+    """Concatenated statistics and preconditioners for all parameters."""
+    # Both arrays are indexed as [entry, row, col] by
+    # `_convert_to_parameter_stats`.
+    statistics: chex.Array  # Statistics
+    preconditioners: chex.Array  # Preconditioners
+# These are per-parameter local states; All statistics here mirror the parameter
+# Thus the sharding is copied over from the param specification.
+@struct.dataclass
+class LocalShardedParameterStats:
+    """State associated to each parameter of the model being trained."""
+    diagonal_statistics: QuantizedValue  # Accumulator for diagonal preconditioner
+    diagonal_momentum: QuantizedValue  # Momentum for the diagonal preconditioner
+    momentum: QuantizedValue  # Momentum for the shampoo preconditioner
+    # `index_start` and `sizes` are static metadata (not pytree leaves): this
+    # parameter owns `len(sizes)` consecutive entries of the global arrays
+    # starting at `index_start`.
+    index_start: np.int32 = struct.field(
+        pytree_node=False
+    )  # Index into global statistics array
+    sizes: Any = struct.field(pytree_node=False)  # Sizes of the statistics.
+class ShardedShampooStats(NamedTuple):
+    """Shampoo state in sharded mode."""
+    # presumably a GlobalShardedParameterStats and a per-parameter collection of
+    # LocalShardedParameterStats - TODO confirm against the optimizer init code.
+    global_stats: Any
+    local_stats: Any
+class ShampooState(NamedTuple):
+    """Top-level optimizer state: a counter plus the per-parameter stats."""
+    count: chex.Array  # presumably the number of update steps taken - confirm
+    stats: Any
+class GraftingType(enum.IntEnum):
+    """Identifiers for the available grafting variants.
+    NOTE(review): where these values are consumed is outside this view; the
+    names suggest the first-order method Shampoo is grafted onto - confirm.
+    """
+    SGD = 1
+    ADAGRAD = 2
+    RMSPROP = 3
+    RMSPROP_NORMALIZED = 4
+def power_iteration(
+    matrix, num_iters=100, error_tolerance=1e-6, precision=lax.Precision.HIGHEST
+):
+    r"""Power iteration algorithm.
+    The power iteration algorithm takes a symmetric PSD matrix `A`, and produces
+    a scalar `\lambda` , which is the greatest (in absolute value) eigenvalue
+    of `A`, and a vector v, which is the corresponding eigenvector of `A`.
+    References:
+      [Wikipedia, 2021](https://en.wikipedia.org/wiki/Power_iteration)
+    Args:
+      matrix: the symmetric PSD matrix.
+      num_iters: Number of iterations.
+      error_tolerance: Iterative exit condition.
+      precision: precision XLA related flag, the available options are:
+        a) lax.Precision.DEFAULT (better step time, but not precise)
+        b) lax.Precision.HIGH (increased precision, slower)
+        c) lax.Precision.HIGHEST (best possible precision, slowest)
+    Returns:
+      eigen vector, eigen value
+    """
+    matrix_size = matrix.shape[-1]
+    def _iter_condition(state):
+        i, unused_v, unused_s, unused_s_v, run_step = state
+        return jnp.logical_and(i < num_iters, run_step)
+    def _iter_body(state):
+        """One step of power iteration."""
+        i, new_v, s, s_v, unused_run_step = state
+        new_v = new_v / jnp.linalg.norm(new_v)
+        s_v = jnp.einsum("ij,j->i", matrix, new_v, precision=precision)
+        s_new = jnp.einsum("i,i->", new_v, s_v, precision=precision)
+        return (
+            i + 1,
+            s_v,
+            s_new,
+            s_v,
+            jnp.greater(jnp.abs(s_new - s), error_tolerance),
+        )
+    # Figure out how to use step as seed for random.
+    v_0 = (
+        np.random.RandomState(1729).uniform(-1.0, 1.0, matrix_size).astype(matrix.dtype)
+    )
+    init_state = tuple([0, v_0, jnp.zeros([], dtype=matrix.dtype), v_0, True])
+    _, v_out, s_out, _, _ = lax.while_loop(_iter_condition, _iter_body, init_state)
+    v_out = v_out / jnp.linalg.norm(v_out)
+    return v_out, s_out
+def matrix_inverse_pth_root(
+    matrix,
+    p,
+    num_iters=100,
+    ridge_epsilon=1e-6,
+    error_tolerance=1e-6,
+    precision=lax.Precision.HIGHEST,
+):
+    """Computes `matrix^(-1/p)`, where `p` is a positive integer.
+    This function uses the Coupled newton iterations algorithm for
+    the computation of a matrix's inverse pth root.
+    References:
+      [Functions of Matrices, Theory and Computation,
+       Nicholas J Higham, Pg 184, Eq 7.18](
+       https://epubs.siam.org/doi/book/10.1137/1.9780898717778)
+    Args:
+      matrix: the symmetric PSD matrix whose power it to be computed
+      p: exponent, for p a positive integer.
+      num_iters: Maximum number of iterations.
+      ridge_epsilon: Ridge epsilon added to make the matrix positive definite.
+      error_tolerance: Error indicator, useful for early termination.
+      precision: precision XLA related flag, the available options are:
+        a) lax.Precision.DEFAULT (better step time, but not precise)
+        b) lax.Precision.HIGH (increased precision, slower)
+        c) lax.Precision.HIGHEST (best possible precision, slowest)
+    Returns:
+      matrix^(-1/p)
+    """
+    # We use float32 for the matrix inverse pth root.
+    # Switch to f64 if you have hardware that supports it.
+    matrix_size = matrix.shape[0]
+    alpha = jnp.asarray(-1.0 / p, jnp.float32)
+    identity = jnp.eye(matrix_size, dtype=jnp.float32)
+    _, max_ev = power_iteration(
+        matrix=matrix, num_iters=100, error_tolerance=1e-6, precision=precision
+    )
+    ridge_epsilon = ridge_epsilon * jnp.maximum(max_ev, 1e-16)
+    def _unrolled_mat_pow_1(mat_m):
+        """Computes mat_m^1."""
+        return mat_m
+    def _unrolled_mat_pow_2(mat_m):
+        """Computes mat_m^2."""
+        return jnp.matmul(mat_m, mat_m, precision=precision)
+    def _unrolled_mat_pow_4(mat_m):
+        """Computes mat_m^4."""
+        mat_pow_2 = _unrolled_mat_pow_2(mat_m)
+        return jnp.matmul(mat_pow_2, mat_pow_2, precision=precision)
+    def _unrolled_mat_pow_8(mat_m):
+        """Computes mat_m^4."""
+        mat_pow_4 = _unrolled_mat_pow_4(mat_m)
+        return jnp.matmul(mat_pow_4, mat_pow_4, precision=precision)
+    def mat_power(mat_m, p):
+        """Computes mat_m^p, for p == 1, 2, 4 or 8.
+        Args:
+          mat_m: a square matrix
+          p: a positive integer
+        Returns:
+          mat_m^p
+        """
+        # We unrolled the loop for performance reasons.
+        exponent = jnp.round(jnp.log2(p))
+        return lax.switch(
+            jnp.asarray(exponent, jnp.int32),
+            [
+                _unrolled_mat_pow_1,
+                _unrolled_mat_pow_2,
+                _unrolled_mat_pow_4,
+                _unrolled_mat_pow_8,
+            ],
+            (mat_m),
+        )
+    def _iter_condition(state):
+        (i, unused_mat_m, unused_mat_h, unused_old_mat_h, error, run_step) = state
+        error_above_threshold = jnp.logical_and(error > error_tolerance, run_step)
+        return jnp.logical_and(i < num_iters, error_above_threshold)
+    def _iter_body(state):
+        (i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step) = state
+        mat_m_i = (1 - alpha) * identity + alpha * mat_m
+        new_mat_m = jnp.matmul(mat_power(mat_m_i, p), mat_m, precision=precision)
+        new_mat_h = jnp.matmul(mat_h, mat_m_i, precision=precision)
+        new_error = jnp.max(jnp.abs(new_mat_m - identity))
+        # sometimes error increases after an iteration before decreasing and
+        # converging. 1.2 factor is used to bound the maximal allowed increase.
+        return (i + 1, new_mat_m, new_mat_h, mat_h, new_error, new_error < error * 1.2)
+    if matrix_size == 1:
+        resultant_mat_h = (matrix + ridge_epsilon) ** alpha
+        error = 0
+    else:
+        damped_matrix = matrix + ridge_epsilon * identity
+        z = (1 + p) / (2 * jnp.linalg.norm(damped_matrix))
+        new_mat_m_0 = damped_matrix * z
+        new_error = jnp.max(jnp.abs(new_mat_m_0 - identity))
+        new_mat_h_0 = identity * jnp.power(z, 1.0 / p)
+        init_state = tuple([0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
+        _, mat_m, mat_h, old_mat_h, error, convergence = lax.while_loop(
+            _iter_condition, _iter_body, init_state
+        )
+        error = jnp.max(jnp.abs(mat_m - identity))
+        is_converged = jnp.asarray(convergence, old_mat_h.dtype)
+        resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
+        resultant_mat_h = jnp.asarray(resultant_mat_h, matrix.dtype)
+    return resultant_mat_h, error
+def merge_small_dims(shape_to_merge, max_dim):
+    """Merge small dimensions.
+    If there are some small dimensions, we collapse them:
+    e.g. [1, 2, 512, 1, 2048, 1, 3, 4] --> [1024, 2048, 12] if max_dim = 1024
+         [1, 2, 768, 1, 2048] --> [2, 768, 2048]
+    Args:
+      shape_to_merge: Shape to merge small dimensions.
+      max_dim: Maximal dimension of output shape used in merging.
+    Returns:
+      Merged shape.
+    """
+    resulting_shape = []
+    product = 1
+    for d in shape_to_merge:
+        if product * d <= max_dim:
+            product *= d
+        else:
+            if product > 1:
+                resulting_shape.append(product)
+            product = d
+    if product > 1:
+        resulting_shape.append(product)
+    return resulting_shape
+def pad_matrix(mat, max_size):
+    """Pad a matrix to a max_size.
+    Args:
+      mat: a matrix to pad.
+      max_size: matrix size requested.
+    Returns:
+      Given M returns [[M, 0], [0, I]]
+    """
+    size = mat.shape[0]
+    assert size <= max_size
+    if size == max_size:
+        return mat
+    pad_size = max_size - size
+    zs1 = jnp.zeros([size, pad_size], dtype=mat.dtype)
+    zs2 = jnp.zeros([pad_size, size], dtype=mat.dtype)
+    eye = jnp.eye(pad_size, dtype=mat.dtype)
+    mat = jnp.concatenate([mat, zs1], 1)
+    mat = jnp.concatenate([mat, jnp.concatenate([zs2, eye], 1)], 0)
+    return mat
+def pad_vector(vec, max_size):
+    """Pad a vector to a max_size.
+    Args:
+      vec: a vector to pad.
+      max_size: matrix size requested.
+    Returns:
+      Given V returns [V, 0]
+    """
+    size = vec.shape[0]
+    assert size <= max_size
+    if size == max_size:
+        return vec
+    pad_size = max_size - size
+    zs1 = jnp.zeros([pad_size], dtype=vec.dtype)
+    return jnp.concatenate([vec, zs1], 0)
+def efficient_cond(predicate, compute_fn, init_state, *args, **kwargs):
+    """Avoids wasteful buffer allocation with XLA."""
+    def _iter_body(unused_state):
+        results = compute_fn(*args, **kwargs)
+        return tuple([False] + list(results))
+    def _iter_condition(state):
+        return state[0]
+    results = jax.lax.while_loop(
+        _iter_condition, _iter_body, tuple([predicate] + init_state)
+    )
+    return tuple(results[1:])
+class BlockPartitioner:
+    """Partitions a tensor into smaller tensors."""
+    def __init__(self, param, block_size):
+        self._shape = param.shape
+        self._splits = []
+        split_sizes = []
+        # We split params into smaller blocks. Here we store the metadata to make
+        # that split.
+        for i, d in enumerate(param.shape):
+            if 0 < block_size < d:
+                # d-1, otherwise split appends a 0-size array.
+                nsplit = (d - 1) // block_size
+                indices = (np.arange(nsplit, dtype=np.int32) + 1) * block_size
+                sizes = np.ones(nsplit + 1, dtype=np.int32) * block_size
+                sizes[-1] = d - indices[-1]
+                self._splits.append((i, indices))
+                split_sizes.append(sizes)
+            else:
+                split_sizes.append(np.array([d], dtype=np.int32))
+        self._num_splits = len(split_sizes)
+        self._preconditioner_shapes = []
+        for t in itertools.product(*split_sizes):
+            self._preconditioner_shapes.extend([[d, d] for d in t])
+    def shapes_for_preconditioners(self):
+        return self._preconditioner_shapes
+    def num_splits(self):
+        return self._num_splits
+    def partition(self, tensor):
+        """Partition tensor into blocks."""
+        assert tensor.shape == self._shape
+        tensors = [tensor]
+        for (i, indices) in self._splits:
+            tensors_local = []
+            for t in tensors:
+                tensors_local.extend(jnp.split(t, indices_or_sections=indices, axis=i))
+            tensors = tensors_local
+        return tensors
+    def merge_partitions(self, partitions):
+        """Merge partitions back to original shape."""
+        for (i, indices) in reversed(self._splits):
+            n = len(indices) + 1
+            partial_merged_tensors = []
+            ind = 0
+            while ind < len(partitions):
+                partial_merged_tensors.append(
+                    jnp.concatenate(partitions[ind : ind + n], axis=i)
+                )
+                ind += n
+            partitions = partial_merged_tensors
+        assert len(partitions) == 1
+        return partitions[0]
+class Preconditioner:
+    """Derives per-block statistics and applies preconditioners for one parameter."""
+    def __init__(self, param, block_size, best_effort_shape_interpretation):
+        original = param.shape
+        # Optionally collapse small dimensions before the tensor is blocked.
+        transformed = (
+            merge_small_dims(original, block_size)
+            if best_effort_shape_interpretation
+            else original
+        )
+        self._original_shape = original
+        self._transformed_shape = transformed
+        self._partitioner = BlockPartitioner(
+            jnp.reshape(param, transformed), block_size
+        )
+    def statistics_from_grad(self, grad):
+        """Build the gradient statistics for every block of `grad`.
+        Args:
+          grad: Gradient tensor; it is reshaped to the transformed shape and
+            then partitioned into blocks.
+        Returns:
+          A flat list holding, block after block, one statistic per block axis.
+        """
+        blocks = self._partitioner.partition(
+            jnp.reshape(grad, self._transformed_shape)
+        )
+        collected = []
+        for block in blocks:
+            ndim = len(block.shape)
+            for axis in range(ndim):
+                # Contract over every axis except `axis`.
+                others = [a for a in range(ndim) if a != axis]
+                collected.append(jnp.tensordot(block, block, axes=(others, others)))
+        return collected
+    def shapes_for_preconditioners(self):
+        """Returns the preconditioner shapes reported by the partitioner."""
+        partitioner = self._partitioner
+        return partitioner.shapes_for_preconditioners()
+    def exponent_for_preconditioner(self):
+        """Returns the exponent p of M^{-1/p}: twice the transformed rank."""
+        transformed_rank = len(self._transformed_shape)
+        return 2 * transformed_rank
+    def preconditioned_grad(self, grad, preconditioners):
+        """Apply the preconditioners to the gradient, block by block.
+        Args:
+          grad: A gradient tensor to precondition.
+          preconditioners: Flat list of preconditioners; each block consumes a
+            consecutive slice of `num_splits()` entries.
+        Returns:
+          The preconditioned gradient, reshaped to the original parameter shape.
+        """
+        blocks = self._partitioner.partition(
+            jnp.reshape(grad, self._transformed_shape)
+        )
+        per_block = self._partitioner.num_splits()
+        results = []
+        for index, block in enumerate(blocks):
+            start = index * per_block
+            block_preconditioners = preconditioners[start : start + per_block]
+            result = block
+            # Each contraction over axis 0 moves that axis to the end, so one
+            # pass per axis restores the original axis order.
+            for axis in range(len(block.shape)):
+                result = jnp.tensordot(
+                    result, block_preconditioners[axis], axes=[[0], [0]]
+                )
+            results.append(result)
+        merged = self._partitioner.merge_partitions(results)
+        return jnp.reshape(merged, self._original_shape)
+def _convert_to_parameter_stats(global_stats, local_stat):
+    """Builds the ParameterStats of one parameter out of the sharded stats."""
+    sizes = local_stat.sizes
+    first = int(local_stat.index_start)
+    last = first + int(len(sizes))
+    # Rows of the global stacks that belong to this parameter.
+    stacked_statistics = global_stats.statistics[first:last, :, :]
+    stacked_preconditioners = global_stats.preconditioners[first:last, :, :]
+    # Crop every padded matrix back to its own (size x size) extent.
+    cropped_statistics = [
+        stacked_statistics[k][:size, :size] for k, size in enumerate(sizes)
+    ]
+    cropped_preconditioners = [
+        stacked_preconditioners[k][:size, :size] for k, size in enumerate(sizes)
+    ]
+    return ParameterStats(
+        local_stat.diagonal_statistics,
+        cropped_statistics,
+        cropped_preconditioners,
+        local_stat.diagonal_momentum,
+        local_stat.momentum,
+    )
+def _convert_from_parameter_stats(parameter_stats, local_stats):
+    """Creates sharded (local) stats from parameter stats."""
+    # The index bookkeeping is carried over unchanged from the previous stats.
+    index_start, sizes = local_stats.index_start, local_stats.sizes
+    return LocalShardedParameterStats(
+        parameter_stats.diagonal_statistics,
+        parameter_stats.diagonal_momentum,
+        parameter_stats.momentum,
+        index_start,
+        sizes,
+    )
+def batch(x, num_devices):
+    """Stack `x` in consecutive groups of `len(x) / num_devices` elements.
+    When `len(x)` is a multiple of `num_devices`, the leading axis of the
+    result has `num_devices` entries.
+    """
+    total = len(x)
+    per_device = int(total / num_devices)
+    groups = []
+    for start in range(0, total, per_device):
+        groups.append(jnp.stack(x[start : start + per_device]))
+    return jnp.stack(groups)
+def unbatch(batched_values):
+    """Unbatch values across the two leading axes and return a list of elements.
+    Args:
+      batched_values: Array of shape [b1, b2, ...]; b1 indexes the leading
+        (device) axis and b2 the elements batched within each entry.
+    Returns:
+      A list of b1 * b2 arrays, each of shape `batched_values.shape[2:]`,
+      ordered by the leading axis first.
+    """
+    b1, b2 = batched_values.shape[0], batched_values.shape[1]
+    results = []
+    for v_array in jnp.split(batched_values, indices_or_sections=b1, axis=0):
+        # Drop only the axis that was just split. A bare squeeze would also
+        # remove genuine size-1 dimensions (e.g. a 1x1 statistic).
+        v_array = jnp.squeeze(v_array, axis=0)
+        # b2 = batches (number of preconditioner computation) per core.
+        for v in jnp.split(v_array, indices_or_sections=b2, axis=0):
+            results.append(jnp.squeeze(v, axis=0))
+    return results
+def distributed_shampoo(
+    learning_rate,
+    block_size,
+    beta1=0.9,
+    beta2=0.999,
+    diagonal_epsilon=1e-10,
+    matrix_epsilon=1e-6,
+    weight_decay=0.0,
+    start_preconditioning_step=5,
+    preconditioning_compute_steps=1,
+    statistics_compute_steps=1,
+    best_effort_shape_interpretation=True,
+    graft_type=GraftingType.SGD,
+    nesterov=True,
+    exponent_override=0,
+    # Pass pmap 'batch axis name' in pmap mode.
+    batch_axis_name=None,
+    ### Only set following 3 params in pjit/spmd mode.
+    ### WARNING: Experimental
+    mesh_axis_names=None,
+    num_devices_for_pjit=None,
+    shard_optimizer_states=False,
+    ###
+    ### Experimental memory reduction mode
+    best_effort_memory_usage_reduction=False,
+    ###
+    inverse_failure_threshold=0.1,
+    moving_average_for_momentum=False,
+    skip_preconditioning_dim_size_gt=4096,
+    clip_by_scaled_gradient_norm=None,
+    precision=lax.Precision.HIGHEST,
+):
+    """Distributed Shampoo optimizer.
+    Distributed Shampoo is a second-order preconditioned method (concretely, a
+    variant of full-matrix Adagrad), that provides significant convergence and
+    wall-clock time improvements compared to conventional first-order methods,
+    and that has been shown to scale to large state-of-the-art deep learning
+    models.
+    References:
+      Scalable Second Order Optimization for Deep Learning,
+      Rohan Anil, Vineet Gupta, Tomer Koren, Kevin Regan, Yoram Singer
+      Preprint: https://arxiv.org/abs/2002.09018
+    Args:
+      learning_rate: the step size used to update the parameters.
+      block_size: Block size for large layers (if > 0). Preconditioning compute
+        operation is cubic in the dimension of the tensor. Block size allows us to
+        chunk the layers into sub-layers of maximal dimension dictated by this
+        value. Use 128 as default (increase if you have compute budget).
+      beta1: momentum parameter.
+      beta2: second moment averaging parameter.
+      diagonal_epsilon: epsilon for diagonal adagrad (only if layerwise grafting
+        to AdaGrad is enabled).
+      matrix_epsilon: epsilon to add to statistics before computing inverse pth
+        root. If you are running in f32 precision for inverse pth root
+        (recommended today) this can go upto 1e-6. If you have latest hardware
+        with native f64 precision, set this upto 1e-12.
+      weight_decay: Weight decay for regularization.
+      start_preconditioning_step: When to start Shampoo update before which
+        diagonal update is used. This is because we dont have enough information
+        to do stable inverse.
+      preconditioning_compute_steps: How often to compute preconditioner.
+        Performance tuning params for controlling memory and compute requirements.
+        Ideally set this and statistics_compute_steps params to 1.
+      statistics_compute_steps: How often to compute statistics.
+      best_effort_shape_interpretation: If there are some small dimensions,
+        collapse them e.g. [1, 2, 512, 1, 2048, 1, 3, 4] --> [1024, 2048, 12] if
+        block = 1024, [1, 2, 768, 1, 2048] --> [2, 768, 2048]
+      graft_type: Grafting is a technique to fix the layerwise scale of Shampoo
+        optimizer. This allows us to plugin the Shampoo optimizer into settings
+        where SGD/AdaGrad is already well tuned. Available options are:
+          GraftingType.SGD and GraftingType.ADAGRAD.
+      nesterov: Nesterov momentum.
+      exponent_override: Override the exponent used in matrix inverse.
+      batch_axis_name: labeled axis over pmap for data-parallel training the
+        optimizer used for.
+      mesh_axis_names: Axis names for the mesh (used in pjit).
+      num_devices_for_pjit: Number of devices to parallelize over when using pjit.
+      shard_optimizer_states: Shard optimizer states to save memory in model
+        parallel training.
+      best_effort_memory_usage_reduction: Best effort memory usage reduction.
+        diagonal_statistics -> jnp.bfloat16
+        momentum buffers (2x) -> jnp.int8
+        statistics, preconditioners -> jnp.int16 + diagonals
+      inverse_failure_threshold: numerics are hard and inverses fail sometimes; we
+        determine that using this threshold.
+      moving_average_for_momentum: Whether to use moving average for momentum
+        instead of exponential moving average.
+      skip_preconditioning_dim_size_gt: Skip if preconditioning dim size is
+          greater than this value.
+      clip_by_scaled_gradient_norm: Clip by scaled gradient norm (only useful
+        when using RMSProp Grafting).
+      precision: precision XLA related flag, the available options are: a)
+        lax.Precision.DEFAULT (better step time, but not precise) b)
+        lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
+        (best possible precision, slowest)
+    Returns:
+      a GradientTransformation.
+    """
+    def quantized_dtype_for_momentum_buffers():
+        """Return int8 in memory-reduction mode, float32 otherwise."""
+        if best_effort_memory_usage_reduction:
+            return jnp.int8
+        return jnp.float32
+    # TODO(rohananil): Explore int8-16 quantization with non-linear bucket sizes.
+    def quantized_dtype_for_diagonal_statistics_buffers():
+        """Return bfloat16 in memory-reduction mode, float32 otherwise."""
+        if best_effort_memory_usage_reduction:
+            return jnp.bfloat16
+        return jnp.float32
+    # Preconditioner and statistics are both stored as int16 in this mode.
+    # We take out the diagonal to make quantization easier.
+    def quantized_dtype_for_second_moment_statistics_buffers():
+        """Return int16 only when memory reduction is on and a pmap axis is set."""
+        use_int16 = best_effort_memory_usage_reduction and batch_axis_name
+        return jnp.int16 if use_int16 else jnp.float32
+    # Preconditioner and statistics are both stored as int16 in this mode.
+    # We take out the diagonal to make quantization easier.
+    def quantized_dtype_for_second_moment_preconditioner_buffers():
+        """Return int16 only when memory reduction is on and a pmap axis is set."""
+        use_int16 = best_effort_memory_usage_reduction and batch_axis_name
+        return jnp.int16 if use_int16 else jnp.float32
+    def _to_float(maybe_quantized):
+        """Return the float form of a QuantizedValue; pass anything else through."""
+        if not isinstance(maybe_quantized, QuantizedValue):
+            return maybe_quantized
+        return maybe_quantized.to_float()
+    def _maybe_quantize_statistics(statistics_list):
+        """Quantize statistics to the dtype chosen for statistics buffers."""
+        target_dtype = quantized_dtype_for_second_moment_statistics_buffers()
+        return _maybe_quantize_matrices_with_dtype(statistics_list, target_dtype)
+    def _maybe_quantize_preconditioners(statistics_list):
+        """Quantize matrices to the dtype chosen for preconditioner buffers."""
+        target_dtype = quantized_dtype_for_second_moment_preconditioner_buffers()
+        return _maybe_quantize_matrices_with_dtype(statistics_list, target_dtype)
+    def _maybe_quantize_matrices_with_dtype(statistics_list, quantized_dtype):
+        """Quantize each matrix unless the target dtype is float32.
+        With a float32 target the input list is returned as is; otherwise each
+        matrix becomes a QuantizedValue with its diagonal extracted.
+        """
+        if quantized_dtype == jnp.float32:
+            return statistics_list
+        quantized = []
+        for matrix in statistics_list:
+            quantized.append(
+                QuantizedValue.from_float_value(
+                    matrix, quantized_dtype, extract_diagonal=True
+                )
+            )
+        return quantized
+    def _maybe_dequantize_preconditioners(preconditioner_list):
+        """Convert preconditioners back to float if they are stored quantized."""
+        stored_dtype = quantized_dtype_for_second_moment_preconditioner_buffers()
+        return _maybe_dequantize_matrices_with_dtype(preconditioner_list, stored_dtype)
+    def _maybe_dequantize_matrices_with_dtype(statistics_list, quantized_dtype):
+        """Call `to_float()` on every entry unless the storage dtype is float32."""
+        if quantized_dtype == jnp.float32:
+            return statistics_list
+        floats = []
+        for quantized in statistics_list:
+            floats.append(quantized.to_float())
+        return floats
+    def _quantize_diagonal_statistics(diagonal_statistics):
+        """Wrap diagonal statistics in a QuantizedValue of the configured dtype."""
+        target_dtype = quantized_dtype_for_diagonal_statistics_buffers()
+        return QuantizedValue.from_float_value(diagonal_statistics, target_dtype)
+    def _quantize_momentum(momentum_statistics):
+        """Wrap a momentum buffer in a QuantizedValue of the configured dtype."""
+        target_dtype = quantized_dtype_for_momentum_buffers()
+        return QuantizedValue.from_float_value(momentum_statistics, target_dtype)
+    def sharded_init_fn(params):
+        """Initialise the optimizer state for the sharded (pjit) mode.
+        Statistics and preconditioners of all parameters are created at one
+        common square size and stacked into a single global array each; every
+        parameter keeps only its offset into that stack and its true sizes.
+        Args:
+          params: the parameters to create optimizer state for.
+        Returns:
+          A ShampooState whose stats are ShardedShampooStats(global, local).
+        """
+        params_flat, treedef = jax.tree_flatten(params)
+        # Find max size to pad to.
+        max_size = 0
+        for param in params_flat:
+            preconditioner = Preconditioner(
+                param, block_size, best_effort_shape_interpretation
+            )
+            if not _skip_preconditioning(param):
+                shapes = preconditioner.shapes_for_preconditioners()
+                sizes = [s[0] for s in shapes]
+                max_size = max(max(sizes), max_size)
+        padded_statistics = []
+        padded_preconditioners = []
+        local_stats_flat = []
+        for param in params_flat:
+            preconditioner = Preconditioner(
+                param, block_size, best_effort_shape_interpretation
+            )
+            shapes = preconditioner.shapes_for_preconditioners()
+            sizes = []
+            statistics = []
+            preconditioners = []
+            # Offset of this parameter's first matrix in the global stack; it is
+            # recorded even for skipped parameters, whose `sizes` stay empty.
+            index_start = len(padded_statistics)
+            if not _skip_preconditioning(param):
+                sizes = [s[0] for s in shapes]
+                # NOTE(review): `shapes` was already computed above; this second
+                # call looks redundant.
+                shapes = preconditioner.shapes_for_preconditioners()
+                # All matrices are created at max_size so they can be stacked.
+                statistics = [matrix_epsilon * jnp.eye(max_size) for s in shapes]
+                preconditioners = [jnp.eye(max_size) for s in shapes]
+                padded_statistics.extend(statistics)
+                padded_preconditioners.extend(preconditioners)
+            diagonal_statistics = []
+            if graft_type != GraftingType.SGD:
+                diagonal_statistics = jnp.zeros_like(param)
+            local_stats_flat.append(
+                LocalShardedParameterStats(
+                    _quantize_diagonal_statistics(diagonal_statistics),
+                    _quantize_momentum(jnp.zeros_like(param)),
+                    _quantize_momentum(jnp.zeros_like(param)),
+                    index_start,
+                    sizes,
+                )
+            )
+        local_stats = jax.tree_unflatten(treedef, local_stats_flat)
+        # Pad the statistics and preconditioner matrices to be a multiple of
+        # num devices.
+        # TODO(rohananil): Relax to only the size of the mesh axis where the dim
+        # is split on.
+        to_pad = -len(padded_statistics) % num_devices_for_pjit
+        # The filler entries are identity matrices.
+        padded_statistics.extend(
+            [jnp.eye(max_size, dtype=padded_statistics[0].dtype) for _ in range(to_pad)]
+        )
+        padded_preconditioners.extend(
+            [jnp.eye(max_size, dtype=padded_statistics[0].dtype) for _ in range(to_pad)]
+        )
+        global_stats = GlobalShardedParameterStats(
+            jnp.stack(padded_statistics), jnp.stack(padded_preconditioners)
+        )
+        return ShampooState(
+            count=jnp.zeros([], jnp.int32),
+            stats=ShardedShampooStats(global_stats, local_stats),
+        )
+    def sharded_update_fn(grads, state, params):
+        """Transform the input gradient and update all statistics in sharded mode.
+        Args:
+          grads: the gradient tensors for the parameters.
+          state: a named tuple containing the state of the optimizer
+          params: the parameters that should be updated.
+        Returns:
+          A tuple containing the updates and the new optimizer state.
+        """
+        params_flat, treedef = jax.tree_flatten(params)
+        grads_flat = treedef.flatten_up_to(grads)
+        global_stats = state.stats.global_stats
+        local_stats_flat = treedef.flatten_up_to(state.stats.local_stats)
+        # Rebuild per-parameter stats by slicing and cropping the global stacks.
+        stats_flat = [
+            _convert_to_parameter_stats(global_stats, local_stat)
+            for local_stat in local_stats_flat
+        ]
+        new_stats_flat = jax.tree_multimap(
+            lambda g, s, p: _compute_stats(g, s, p, state.count),
+            grads_flat,
+            stats_flat,
+            params_flat,
+        )
+        # One exponent per statistic, in the same order as the stacked statistics.
+        exponents = []
+        for stat, param in zip(new_stats_flat, params_flat):
+            num_statistics = len(stat.statistics)
+            if num_statistics > 0:
+                preconditioner = Preconditioner(
+                    param, block_size, best_effort_shape_interpretation
+                )
+                exponent = (
+                    preconditioner.exponent_for_preconditioner()
+                    if exponent_override == 0
+                    else exponent_override
+                )
+                exponents.extend([exponent] * num_statistics)
+        outputs = jax.tree_multimap(
+            lambda g, s, p: _transform_grad(g, s, p, state.count),
+            grads_flat,
+            new_stats_flat,
+            params_flat,
+        )
+        updates_flat, new_stats_flat = list(zip(*outputs)) if outputs else ((), ())
+        updates = jax.tree_unflatten(treedef, updates_flat)
+        # Create new local_stats
+        new_local_stats_flat = [
+            _convert_from_parameter_stats(new_stat, local_stat)
+            for new_stat, local_stat in zip(new_stats_flat, local_stats_flat)
+        ]
+        new_local_stats = jax.tree_unflatten(treedef, new_local_stats_flat)
+        # Axis 1 of the stacked statistics is the common padded matrix size.
+        max_size = global_stats.statistics.shape[1]
+        new_padded_statistics = []
+        for stat in new_stats_flat:
+            # The comprehension's own `stat` shadows the loop variable; its
+            # iterable `stat.statistics` is still read from the outer `stat`.
+            new_padded_statistics.extend(
+                [pad_matrix(stat, max_size) for stat in stat.statistics]
+            )
+        # Create global stats
+        # TODO(rohananil): Preconditioner is not updated every step, so cost of
+        # stack/pad can be obviated away.
+        # Pad the statistics and preconditioner matrices to be a multiple of
+        # num devices.
+        # TODO(rohananil): Relax to only the size of the mesh axis where the dim
+        # is split on.
+        to_pad = -len(new_padded_statistics) % num_devices_for_pjit
+        new_padded_statistics.extend(
+            [
+                jnp.eye(max_size, dtype=new_padded_statistics[0].dtype)
+                for _ in range(to_pad)
+            ]
+        )
+        # The identity filler matrices get exponent 1.
+        exponents.extend([1 for _ in range(to_pad)])
+        new_stacked_padded_statistics = jnp.stack(new_padded_statistics)
+        new_stacked_exponents = jnp.stack(exponents)
+        # Local helper; it has the same name as a sibling closure defined later
+        # in the enclosing function and takes precedence inside this function.
+        def _matrix_inverse_pth_root_vmap(xs, ps):
+            mi_pth_root = functools.partial(
+                matrix_inverse_pth_root,
+                ridge_epsilon=matrix_epsilon,
+                precision=precision,
+            )
+            preconditioners, errors = jax.vmap(mi_pth_root)(xs, ps)
+            return preconditioners, errors
+        def _internal_inverse_pth_root_all():
+            preconditioners, errors = _matrix_inverse_pth_root_vmap(
+                new_stacked_padded_statistics, new_stacked_exponents
+            )
+            return preconditioners, errors
+        if preconditioning_compute_steps == 1:
+            new_preconditioners, errors = _internal_inverse_pth_root_all()
+        else:
+            # Passing statistics instead of preconditioners as they are similarly
+            # shaped tensors. Note statistics will be ignored as we are passing in
+            # a large init value for error.
+            preconditioners_init = new_stacked_padded_statistics
+            errors_init = np.stack([inverse_failure_threshold] * len(exponents))
+            init_state = [preconditioners_init, errors_init]
+            perform_step = state.count % preconditioning_compute_steps == 0
+            new_preconditioners, errors = efficient_cond(
+                perform_step, _internal_inverse_pth_root_all, init_state
+            )
+        # Reshape so each error broadcasts against its [max_size, max_size] matrix.
+        errors = errors.reshape((-1, 1, 1))
+        # predicate == 1 where the inverse failed (NaN or error at/above the
+        # threshold); those entries keep the previous preconditioner.
+        predicate = jnp.logical_or(
+            jnp.isnan(errors), errors >= inverse_failure_threshold
+        ).astype(new_preconditioners.dtype)
+        # TODO(rohananil): Check for numerical instabilities.
+        new_conditional_preconditioners = (
+            predicate * global_stats.preconditioners
+            + (1.0 - predicate) * new_preconditioners
+        )
+        new_global_stats = GlobalShardedParameterStats(
+            new_stacked_padded_statistics, new_conditional_preconditioners
+        )
+        new_shampoo_state = ShampooState(
+            count=state.count + 1,
+            stats=ShardedShampooStats(new_global_stats, new_local_stats),
+        )
+        return updates, new_shampoo_state
+    def init_fn(params):
+        """Create the initial optimizer state for `params`."""
+        def _init(param):
+            """Build the ParameterStats of a single parameter."""
+            preconditioner = Preconditioner(
+                param, block_size, best_effort_shape_interpretation
+            )
+            if _skip_preconditioning(param):
+                statistics, preconditioners = [], []
+            else:
+                dims = [s[0] for s in preconditioner.shapes_for_preconditioners()]
+                # Statistics start at epsilon * I, preconditioners at I.
+                statistics = [matrix_epsilon * jnp.eye(d) for d in dims]
+                preconditioners = [jnp.eye(d) for d in dims]
+            # Diagonal statistics are only allocated when grafting is not SGD.
+            if graft_type != GraftingType.SGD:
+                diagonal_statistics = jnp.zeros_like(param)
+            else:
+                diagonal_statistics = []
+            return ParameterStats(
+                _quantize_diagonal_statistics(diagonal_statistics),
+                _maybe_quantize_statistics(statistics),
+                _maybe_quantize_preconditioners(preconditioners),
+                _quantize_momentum(jnp.zeros_like(param)),
+                _quantize_momentum(jnp.zeros_like(param)),
+            )
+        step_count = jnp.zeros([], jnp.int32)
+        initial_stats = jax.tree_map(_init, params)
+        return ShampooState(count=step_count, stats=initial_stats)
+    def _skip_preconditioning(param):
+        """True for rank-0 params and params with any dim above the size limit."""
+        shape = param.shape
+        if len(shape) < 1:
+            return True
+        return any(dim > skip_preconditioning_dim_size_gt for dim in shape)
+    def _compute_stats(grad, state, param, step):
+        """Update the accumulated statistics of a single parameter.
+        Only the `statistics` field changes; every other field of `state` is
+        copied into the returned ParameterStats.
+        """
+        preconditioner = Preconditioner(
+            param, block_size, best_effort_shape_interpretation
+        )
+        # With beta2 == 1.0 both weights are 1 (plain sum); otherwise the new
+        # statistic is weighted by (1 - beta2).
+        old_weight = beta2
+        new_weight = beta2 if beta2 == 1.0 else (1.0 - beta2)
+        def compute_updated_statistics():
+            fresh = preconditioner.statistics_from_grad(grad)
+            blended = [
+                old_weight * _to_float(accumulated) + new_weight * current
+                for current, accumulated in zip(fresh, state.statistics)
+            ]
+            return _maybe_quantize_statistics(blended)
+        if _skip_preconditioning(param):
+            new_statistics = [[]] * len(state.statistics)
+        elif statistics_compute_steps > 1:
+            # Recompute only every `statistics_compute_steps` steps.
+            perform_step = step % statistics_compute_steps == 0
+            new_statistics = list(
+                efficient_cond(
+                    perform_step, compute_updated_statistics, state.statistics
+                )
+            )
+        else:
+            new_statistics = compute_updated_statistics()
+        return ParameterStats(
+            state.diagonal_statistics,
+            new_statistics,
+            state.preconditioners,
+            state.diagonal_momentum,
+            state.momentum,
+        )
+    def _matrix_inverse_pth_root_vmap(xs, ps):
+        """Run matrix_inverse_pth_root over the leading axis of `xs` and `ps`."""
+        def _inverse_pth_root(x, p):
+            return matrix_inverse_pth_root(
+                x, p, ridge_epsilon=matrix_epsilon, precision=precision
+            )
+        batched = jax.vmap(_inverse_pth_root)
+        return batched(xs, ps)
+    def _quantized_matrix_inverse_pth_root_vmap(qxs, qds, qbs, ps):
+        """Batched inverse pth root over quantized statistics.
+        Each entry is dequantized, passed to matrix_inverse_pth_root, and the
+        result is quantized again with the dtype of the quantized input.
+        Returns:
+          Batched (quantized, diagonal, bucket_size, error).
+        """
+        def _single(qx, qd, qb, p):
+            dequantized = QuantizedValue(
+                qx, qd, qb, qx.dtype, True, list(qx.shape)
+            ).to_float()
+            root, error = matrix_inverse_pth_root(
+                dequantized, p, ridge_epsilon=matrix_epsilon, precision=precision
+            )
+            requantized = QuantizedValue.from_float_value(root, qx.dtype, True)
+            return (
+                requantized.quantized,
+                requantized.diagonal,
+                requantized.bucket_size,
+                error,
+            )
+        return jax.vmap(_single)(qxs, qds, qbs, ps)
+    def _matrix_inverse_pth_root_pjit(xs, ps):
+        """Inverse pth roots with inputs partitioned over the pjit mesh axes."""
+        sharded_spec = pjit.PartitionSpec(tuple(mesh_axis_names))
+        # Partition the concatenated statistics matrix across all cores.
+        scatter = pjit.pjit(
+            lambda x, y: (x, y),
+            in_axis_resources=None,
+            out_axis_resources=sharded_spec,
+        )
+        partitioned_xs, partitioned_ps = scatter(xs, ps)
+        # Run matrix inverse pth root on each shard.
+        partitioned_results = _matrix_inverse_pth_root_vmap(
+            partitioned_xs, partitioned_ps
+        )
+        partitioned_preconditioners, partitioned_errors = partitioned_results
+        # Recombine the outputs at each core.
+        gather = pjit.pjit(
+            lambda x, y: (x, y),
+            in_axis_resources=(sharded_spec, sharded_spec),
+            out_axis_resources=(None, None),
+        )
+        preconditioners, errors = gather(
+            partitioned_preconditioners, partitioned_errors
+        )
+        return preconditioners, errors
+    def _pmap_compute_preconditioners(
+        states,
+        step,
+        statistics,
+        num_statistics_per_state,
+        original_shapes,
+        exponents,
+        max_size,
+        prev_preconditioners,
+    ):
+        """Computes preconditioners for given statistics in states in PMAP mode.
+        Args:
+          states: A list of optimizer states.
+          step: Current step number
+          statistics: A list of statistics for all variables (for every dim)
+          num_statistics_per_state: Number of statistics per state to reconstruct
+            output states.
+          original_shapes: A list of shapes of the statistics.
+          exponents: Exponent power to use for inverse-pth roots. NOTE: this list
+            is extended in place with padding entries.
+          max_size: Maximum dim of the statistics to pad.
+          prev_preconditioners: Previously available preconditioner.
+        Returns:
+          New optimizer states after computing the preconditioner.
+        """
+        num_devices = lax.psum(1, batch_axis_name)
+        num_statistics = len(statistics)
+        # Pad statistics and exponents to next multiple of num_devices.
+        packed_statistics = [pad_matrix(stat, max_size) for stat in statistics]
+        to_pad = -num_statistics % num_devices
+        packed_statistics.extend(
+            [jnp.eye(max_size, dtype=packed_statistics[0].dtype) for _ in range(to_pad)]
+        )
+        # Mutates the caller's list: identity filler matrices get exponent 1.
+        exponents.extend([1 for _ in range(to_pad)])
+        # Nothing to precondition: hand the states back untouched.
+        if not packed_statistics:
+            return states
+        all_statistics = batch(packed_statistics, num_devices)
+        all_exponents = batch(exponents, num_devices)
+        def _internal_inverse_pth_root_all():
+            # Each replica computes only its own slice, then all slices are
+            # gathered so every replica ends up with the full list.
+            current_replica = lax.axis_index(batch_axis_name)
+            preconditioners, errors = _matrix_inverse_pth_root_vmap(
+                all_statistics[current_replica], all_exponents[current_replica]
+            )
+            preconditioners = jax.lax.all_gather(preconditioners, batch_axis_name)
+            errors = jax.lax.all_gather(errors, batch_axis_name)
+            preconditioners_flat = unbatch(preconditioners)
+            errors_flat = unbatch(errors)
+            return preconditioners_flat, errors_flat
+        if preconditioning_compute_steps == 1:
+            preconditioners_flat, errors_flat = _internal_inverse_pth_root_all()
+        else:
+            # Passing statistics instead of preconditioners as they are similarly
+            # shaped tensors. Note statistics will be ignored as we are passing in
+            # a large init value for error.
+            preconditioners_init = packed_statistics
+            errors_init = [inverse_failure_threshold] * len(packed_statistics)
+            init_state = [preconditioners_init, errors_init]
+            perform_step = step % preconditioning_compute_steps == 0
+            preconditioners_flat, errors_flat = efficient_cond(
+                perform_step, _internal_inverse_pth_root_all, init_state
+            )
+        def _skip(error):
+            # Non-zero when the inverse failed: NaN error or error at/above the
+            # failure threshold.
+            condition = jnp.logical_or(
+                jnp.isnan(error), error >= inverse_failure_threshold
+            )
+            return condition.astype(error.dtype)
+        def _select_preconditioner(error, new_p, old_p):
+            # Keep the previous preconditioner whenever the new one is rejected.
+            return lax.cond(
+                _skip(error), lambda _: old_p, lambda _: new_p, operand=None
+            )
+        new_preconditioners_flat = []
+        # zip stops at the shortest input, which drops the padding entries.
+        for p, shape, prev_p, error in zip(
+            preconditioners_flat, original_shapes, prev_preconditioners, errors_flat
+        ):
+            # Crop the padded result back to the statistic's original shape.
+            new_preconditioners_flat.append(
+                _select_preconditioner(error, p[: shape[0], : shape[1]], prev_p)
+            )
+        assert len(states) == len(num_statistics_per_state)
+        assert len(new_preconditioners_flat) == num_statistics
+        # Add back empty preconditioners so that we can set the optimizer state.
+        preconditioners_for_states = []
+        idx = 0
+        # NOTE: the loop variable below rebinds `num_statistics` from the outer
+        # scope; the total count is not needed after the assert above.
+        for num_statistics, state in zip(num_statistics_per_state, states):
+            if num_statistics == 0:
+                preconditioners_for_states.append([])
+            else:
+                preconditioners_for_state = new_preconditioners_flat[
+                    idx : idx + num_statistics
+                ]
+                assert len(state.statistics) == len(preconditioners_for_state)
+                preconditioners_for_states.append(preconditioners_for_state)
+                idx += num_statistics
+        new_states = []
+        # Only the preconditioners field is replaced; the rest is carried over.
+        for state, new_preconditioners in zip(states, preconditioners_for_states):
+            new_states.append(
+                ParameterStats(
+                    state.diagonal_statistics,
+                    state.statistics,
+                    new_preconditioners,
+                    state.diagonal_momentum,
+                    state.momentum,
+                )
+            )
+        return new_states
+    def _pmap_quantized_compute_preconditioners(
+        states,
+        step,
+        statistics,
+        num_statistics_per_state,
+        original_shapes,
+        exponents,
+        max_size,
+        prev_preconditioners,
+    ):
+        """Computes preconditioners for given statistics in states in PMAP mode.
+        For quantization, each statistic is represented by three values:
+          quantized matrix, diagonal, and bucket sizes, we run inverse pth-roots
+          without ever recreating the original matrix in f32.
+        Args:
+          states: A list of optimizer states.
+          step: Current step number
+          statistics: A list of statistics for all variables (for every dim)
+          num_statistics_per_state: Number of statistis per state to reconstruct
+            output states.
+          original_shapes: A list of shapes of the statistics.
+          exponents: Exponent power to use for inverse-pth roots.
+          max_size: Maximum dim of the statistics to pad.
+          prev_preconditioners: Previously available preconditioner.
+        Returns:
+          New optimizer states after computing the preconditioner.
+        """
+        num_devices = lax.psum(1, batch_axis_name)
+        num_statistics = len(statistics)
+        quantized_dtype = quantized_dtype_for_second_moment_statistics_buffers()
+        # Complexity here is around: shapes needing be statically shaped,
+        # our custom quantization type requires a different type of packing.
+        # Parallel tensors:
+        # quantized [dxd]
+        # diagonals [d] f32
+        # bucket_sizes [d] f32
+        packed_quantized_statistics = [
+            pad_matrix(stat.quantized, max_size) for stat in statistics
+        ]
+        packed_quantized_diagonals = [
+            pad_vector(stat.diagonal, max_size) for stat in statistics
+        ]
+        packed_quantized_bucket_sizes = [
+            pad_vector(stat.bucket_size, max_size) for stat in statistics
+        ]
+        to_pad = -num_statistics % num_devices
+        padded_eye = jnp.eye(max_size, dtype=jnp.float32)
+        quantized_eye = QuantizedValue.from_float_value(
+            padded_eye, quantized_dtype, True
+        )
+        packed_quantized_statistics.extend(
+            [quantized_eye.quantized for _ in range(to_pad)]
+        )
+        packed_quantized_diagonals.extend(
+            [quantized_eye.diagonal for _ in range(to_pad)]
+        )
+        packed_quantized_bucket_sizes.extend(
+            [quantized_eye.bucket_size for _ in range(to_pad)]
+        )
+        exponents.extend([1 for _ in range(to_pad)])
+        if not packed_quantized_statistics:
+            return states
+        all_quantized_statistics = batch(packed_quantized_statistics, num_devices)
+        all_quantized_diagonals = batch(packed_quantized_diagonals, num_devices)
+        all_quantized_bucket_sizes = batch(packed_quantized_bucket_sizes, num_devices)
+        all_exponents = batch(exponents, num_devices)
+        def _internal_inverse_pth_root_all():
+            current_replica = lax.axis_index(batch_axis_name)
+            (
+                quantized_preconditioners,
+                quantized_diagonals,
+                quantized_bucket_sizes,
+                errors,
+            ) = _quantized_matrix_inverse_pth_root_vmap(
+                all_quantized_statistics[current_replica],
+                all_quantized_diagonals[current_replica],
+                all_quantized_bucket_sizes[current_replica],
+                all_exponents[current_replica],
+            )
+            quantized_preconditioners = jax.lax.all_gather(
+                quantized_preconditioners, batch_axis_name
+            )
+            quantized_diagonals = jax.lax.all_gather(
+                quantized_diagonals, batch_axis_name
+            )
+            quantized_bucket_sizes = jax.lax.all_gather(
+                quantized_bucket_sizes, batch_axis_name
+            )
+            errors = jax.lax.all_gather(errors, batch_axis_name)
+            quantized_preconditioners_flat = unbatch(quantized_preconditioners)
+            quantized_diagonals_flat = unbatch(quantized_diagonals)
+            quantized_bucket_sizes_flat = unbatch(quantized_bucket_sizes)
+            errors_flat = unbatch(errors)
+            return (
+                quantized_preconditioners_flat,
+                quantized_diagonals_flat,
+                quantized_bucket_sizes_flat,
+                errors_flat,
+            )
+        if preconditioning_compute_steps == 1:
+            (
+                quantized_preconditioners_flat,
+                quantized_diagonals_flat,
+                quantized_bucket_sizes_flat,
+                errors_flat,
+            ) = _internal_inverse_pth_root_all()
+        else:
+            # Passing statistics instead of preconditioners as they are similarly
+            # shaped tensors. Note statistics will be ignored as we are passing in
+            # a large init value for error.
+            quantized_preconditioners_init = packed_quantized_statistics
+            quantized_diagonals_init = packed_quantized_diagonals
+            quantized_bucket_sizes_init = packed_quantized_bucket_sizes
+            errors_init = [inverse_failure_threshold] * len(
+                quantized_preconditioners_init
+            )
+            init_state = [
+                quantized_preconditioners_init,
+                quantized_diagonals_init,
+                quantized_bucket_sizes_init,
+                errors_init,
+            ]
+            perform_step = step % preconditioning_compute_steps == 0
+            (
+                quantized_preconditioners_flat,
+                quantized_diagonals_flat,
+                quantized_bucket_sizes_flat,
+                errors_flat,
+            ) = efficient_cond(perform_step, _internal_inverse_pth_root_all, init_state)
+        def _skip(error):
+            condition = jnp.logical_or(
+                jnp.isnan(error), error >= inverse_failure_threshold
+            )
+            return condition.astype(error.dtype)
+        def _select_preconditioner(error, new_p, old_p):
+            return lax.cond(
+                _skip(error), lambda _: old_p, lambda _: new_p, operand=None
+            )
+        new_quantized_preconditioners_flat = []
+        new_quantized_diagonals_flat = []
+        new_quantized_bucket_sizes_flat = []
+        for p, d, b, shape, prev_p, error in zip(
+            quantized_preconditioners_flat,
+            quantized_diagonals_flat,
+            quantized_bucket_sizes_flat,
+            original_shapes,
+            prev_preconditioners,
+            errors_flat,
+        ):
+            new_quantized_preconditioners_flat.append(
+                _select_preconditioner(
+                    error, p[: shape[0], : shape[1]], prev_p.quantized
+                )
+            )
+            new_quantized_diagonals_flat.append(
+                _select_preconditioner(error, d[: shape[0]], prev_p.diagonal)
+            )
+            new_quantized_bucket_sizes_flat.append(
+                _select_preconditioner(error, b[: shape[0]], prev_p.bucket_size)
+            )
+        assert len(states) == len(num_statistics_per_state)
+        assert len(new_quantized_preconditioners_flat) == num_statistics
+        assert len(new_quantized_diagonals_flat) == num_statistics
+        assert len(new_quantized_bucket_sizes_flat) == num_statistics
+        # Add back empty preconditioners so that we can set the optimizer state.
+        preconditioners_for_states = []
+        idx = 0
+        for num_statistics, state in zip(num_statistics_per_state, states):
+            if num_statistics == 0:
+                preconditioners_for_states.append([])
+            else:
+                quantized_preconditioners_for_state = (
+                    new_quantized_preconditioners_flat[idx : idx + num_statistics]
+                )
+                quantized_diagonals_for_state = new_quantized_diagonals_flat[
+                    idx : idx + num_statistics
+                ]
+                quantized_bucket_sizes_for_state = new_quantized_bucket_sizes_flat[
+                    idx : idx + num_statistics
+                ]
+                assert len(state.statistics) == len(quantized_preconditioners_for_state)
+                assert len(state.statistics) == len(quantized_diagonals_for_state)
+                assert len(state.statistics) == len(quantized_bucket_sizes_for_state)
+                quantized_preconditioners = []
+                for qv, qd, qb in zip(
+                    quantized_preconditioners_for_state,
+                    quantized_diagonals_for_state,
+                    quantized_bucket_sizes_for_state,
+                ):
+                    quantized_preconditioners.append(
+                        QuantizedValue(qv, qd, qb, qv.dtype, True, list(qv.shape))
+                    )
+                preconditioners_for_states.append(quantized_preconditioners)
+                idx += num_statistics
+        new_states = []
+        for state, new_preconditioners in zip(states, preconditioners_for_states):
+            new_states.append(
+                ParameterStats(
+                    state.diagonal_statistics,
+                    state.statistics,
+                    new_preconditioners,
+                    state.diagonal_momentum,
+                    state.momentum,
+                )
+            )
+        return new_states
+    def _pjit_compute_preconditioners(
+        states,
+        step,
+        statistics,
+        num_statistics_per_state,
+        original_shapes,
+        exponents,
+        max_size,
+        prev_preconditioners,
+    ):
+        """Computes preconditioners for given statistics in states in PJIT mode.
+        Args:
+          states: A list of optimizer states.
+          step: Current step number
+          statistics: A list of statistics for all variables (for every dim)
+          num_statistics_per_state: Number of statistics per state to reconstruct
+            output states.
+          original_shapes: A list of shapes of the statistics.
+          exponents: Exponent power to use for inverse-pth roots.
+          max_size: Maximum dim of the statistics to pad.
+          prev_preconditioners: Previously available preconditioner.
+        Returns:
+          New optimizer states after computing the preconditioner.
+        """
+        num_statistics = len(statistics)
+        to_pad = -num_statistics % num_devices_for_pjit
+        padded_statistics = [pad_matrix(stat, max_size) for stat in statistics]
+        padded_statistics.extend(
+            [jnp.eye(max_size, dtype=padded_statistics[0].dtype) for _ in range(to_pad)]
+        )
+        exponents.extend([1 for _ in range(to_pad)])
+        all_statistics = jnp.stack(padded_statistics)
+        all_exponents = jnp.stack(exponents)
+        def _internal_inverse_pth_root_all():
+            preconditioners, errors = _matrix_inverse_pth_root_pjit(
+                all_statistics, all_exponents
+            )
+            b1 = preconditioners.shape[0]
+            def split(batched_values):
+                return [
+                    jnp.squeeze(v)
+                    for v in jnp.split(batched_values, indices_or_sections=b1, axis=0)
+                ]
+            return split(preconditioners), split(errors)
+        if preconditioning_compute_steps == 1:
+            preconditioners_flat, errors_flat = _internal_inverse_pth_root_all()
+        else:
+            # Passing statistics instead of preconditioners as they are similarly
+            # shaped tensors. Note statistics will be ignored as we are passing in
+            # a large init value for error.
+            preconditioners_init = padded_statistics
+            errors_init = [inverse_failure_threshold] * len(padded_statistics)
+            init_state = [preconditioners_init, errors_init]
+            perform_step = step % preconditioning_compute_steps == 0
+            preconditioners_flat, errors_flat = efficient_cond(
+                perform_step, _internal_inverse_pth_root_all, init_state
+            )
+        def _skip(error):
+            condition = jnp.logical_or(
+                jnp.isnan(error), error >= inverse_failure_threshold
+            )
+            return condition.astype(error.dtype)
+        def _select_preconditioner(error, new_p, old_p):
+            return lax.cond(
+                _skip(error), lambda _: old_p, lambda _: new_p, operand=None
+            )
+        new_preconditioners_flat = []
+        for p, shape, prev_p, error in zip(
+            preconditioners_flat, original_shapes, prev_preconditioners, errors_flat
+        ):
+            new_preconditioners_flat.append(
+                _select_preconditioner(error, p[: shape[0], : shape[1]], prev_p)
+            )
+        assert len(states) == len(num_statistics_per_state)
+        assert len(new_preconditioners_flat) == num_statistics
+        # Add back empty preconditioners so that we can set the optimizer state.
+        preconditioners_for_states = []
+        idx = 0
+        for num_statistics, state in zip(num_statistics_per_state, states):
+            if num_statistics == 0:
+                preconditioners_for_states.append([])
+            else:
+                preconditioners_for_state = new_preconditioners_flat[
+                    idx : idx + num_statistics
+                ]
+                assert len(state.statistics) == len(preconditioners_for_state)
+                preconditioners_for_states.append(preconditioners_for_state)
+                idx += num_statistics
+        new_states = []
+        for state, new_preconditioners in zip(states, preconditioners_for_states):
+            new_states.append(
+                ParameterStats(
+                    state.diagonal_statistics,
+                    state.statistics,
+                    new_preconditioners,
+                    state.diagonal_momentum,
+                    state.momentum,
+                )
+            )
+        return new_states
+    def _compute_preconditioners(states, params, step):
+        """Computes preconditioners for given statistics in states.
+        Args:
+          states: A list of optimizer states.
+          params: A list of params.
+          step: Current step number
+        Returns:
+          New optimizer states after computing the preconditioner.
+        """
+        statistics = []
+        num_statistics_per_state = []
+        original_shapes = []
+        exponents = []
+        max_size = 0
+        prev_preconditioners = []
+        for state, param in zip(states, params):
+            num_statistics = len(state.statistics)
+            num_statistics_per_state.append(num_statistics)
+            original_shapes_for_state = []
+            if num_statistics > 0:
+                preconditioner = Preconditioner(
+                    param, block_size, best_effort_shape_interpretation
+                )
+                for statistic in state.statistics:
+                    exponents.append(
+                        preconditioner.exponent_for_preconditioner()
+                        if exponent_override == 0
+                        else exponent_override
+                    )
+                    original_shapes_for_state.append(statistic.shape)
+                    max_size = max(max_size, statistic.shape[0])
+                statistics.extend(state.statistics)
+                prev_preconditioners.extend(state.preconditioners)
+                original_shapes.extend(original_shapes_for_state)
+        if batch_axis_name:
+            # Quantization is only available when batch_axis_name is set (pmap mode).
+            quantized_dtype = quantized_dtype_for_second_moment_statistics_buffers()
+            if quantized_dtype == jnp.float32:
+                return _pmap_compute_preconditioners(
+                    states,
+                    step,
+                    statistics,
+                    num_statistics_per_state,
+                    original_shapes,
+                    exponents,
+                    max_size,
+                    prev_preconditioners,
+                )
+            else:
+                return _pmap_quantized_compute_preconditioners(
+                    states,
+                    step,
+                    statistics,
+                    num_statistics_per_state,
+                    original_shapes,
+                    exponents,
+                    max_size,
+                    prev_preconditioners,
+                )
+        else:
+            return _pjit_compute_preconditioners(
+                states,
+                step,
+                statistics,
+                num_statistics_per_state,
+                original_shapes,
+                exponents,
+                max_size,
+                prev_preconditioners,
+            )
+    def _transform_grad(grad, state, param, step):
+        """Transform per-parameter gradients."""
+        preconditioner = Preconditioner(
+            param, block_size, best_effort_shape_interpretation
+        )
+        sgd_update = grad
+        new_diagonal_statistics = state.diagonal_statistics.to_float()
+        if graft_type == GraftingType.ADAGRAD:
+            new_diagonal_statistics = state.diagonal_statistics.to_float() + jnp.square(
+                grad
+            )
+            adagrad_update = grad / (
+                jnp.sqrt(new_diagonal_statistics) + diagonal_epsilon
+            )
+            grafting_update = adagrad_update
+        elif (
+            graft_type == GraftingType.RMSPROP
+            or graft_type == GraftingType.RMSPROP_NORMALIZED
+        ):
+            scaled_grad = grad
+            if graft_type == GraftingType.RMSPROP_NORMALIZED:
+                scaled_grad = grad / jnp.linalg.norm(grad)
+            w1 = beta2
+            w2 = beta2 if beta2 == 1.0 else (1.0 - beta2)
+            new_diagonal_statistics = (
+                w1 * state.diagonal_statistics.to_float() + w2 * jnp.square(scaled_grad)
+            )
+            rmsprop_update = scaled_grad / (
+                jnp.sqrt(new_diagonal_statistics) + diagonal_epsilon
+            )
+            if clip_by_scaled_gradient_norm:
+                scaled_grad_norm = jnp.linalg.norm(rmsprop_update) / (
+                    jnp.sqrt(float(rmsprop_update.size))
+                )
+                clipping_denom = jnp.maximum(
+                    1.0, scaled_grad_norm / clip_by_scaled_gradient_norm
+                )
+                rmsprop_update /= clipping_denom
+            grafting_update = rmsprop_update
+        else:
+            grafting_update = sgd_update
+        precond_grad = grad
+        if not _skip_preconditioning(param):
+            precond_grad = preconditioner.preconditioned_grad(
+                precond_grad, _maybe_dequantize_preconditioners(state.preconditioners)
+            )
+        else:
+            precond_grad = grafting_update
+        grafting_update_norm = jnp.linalg.norm(grafting_update)
+        precond_grad_norm = jnp.linalg.norm(precond_grad)
+        multiplier = grafting_update_norm / (precond_grad_norm + 1e-16)
+        shampoo_update = precond_grad * multiplier
+        shampoo_update_with_wd = shampoo_update
+        grafting_update_with_wd = grafting_update
+        if weight_decay != 0:
+            shampoo_update_with_wd = shampoo_update + weight_decay * param
+            grafting_update_with_wd = grafting_update + weight_decay * param
+        w = (1.0 - beta1) if moving_average_for_momentum else 1.0
+        shampoo_update_with_wd_momentum = (
+            state.momentum.to_float() * beta1 + w * shampoo_update_with_wd
+        )
+        grafting_update_with_wd_momentum = (
+            state.diagonal_momentum.to_float() * beta1 + w * grafting_update_with_wd
+        )
+        run_shampoo = (step >= start_preconditioning_step).astype(
+            grafting_update_with_wd_momentum.dtype
+        )
+        momentum_update = (
+            run_shampoo * shampoo_update_with_wd_momentum
+            + (1.0 - run_shampoo) * grafting_update_with_wd_momentum
+        )
+        wd_update = (
+            run_shampoo * shampoo_update_with_wd
+            + (1.0 - run_shampoo) * grafting_update_with_wd
+        )
+        if nesterov:
+            momentum_update = w * wd_update + beta1 * momentum_update
+        lr = learning_rate
+        if callable(learning_rate):
+            lr = learning_rate(step)
+        transformed_update = -1.0 * lr * momentum_update
+        param_stats = ParameterStats(
+            _quantize_diagonal_statistics(new_diagonal_statistics),
+            state.statistics,
+            state.preconditioners,
+            _quantize_momentum(grafting_update_with_wd_momentum),
+            _quantize_momentum(shampoo_update_with_wd_momentum),
+        )
+        return transformed_update, param_stats
+    def update_fn(grads, state, params):
+        """Transform the input gradient and update all statistics.
+        Args:
+          grads: the gradient tensors for the parameters.
+          state: a named tuple containing the state of the optimizer
+          params: the parameters that should be updated.
+        Returns:
+          A tuple containing the parameter updates and the new optimizer state.
+        """
+        params_flat, treedef = jax.tree_flatten(params)
+        stats_flat = treedef.flatten_up_to(state.stats)
+        grads_flat = treedef.flatten_up_to(grads)
+        new_stats_flat = jax.tree_multimap(
+            lambda g, s, p: _compute_stats(g, s, p, state.count),
+            grads_flat,
+            stats_flat,
+            params_flat,
+        )
+        new_stats_flat = _compute_preconditioners(
+            new_stats_flat, params_flat, state.count
+        )
+        outputs = jax.tree_multimap(
+            lambda g, s, p: _transform_grad(g, s, p, state.count),
+            grads_flat,
+            new_stats_flat,
+            params_flat,
+        )
+        updates_flat, new_stats_flat = list(zip(*outputs)) if outputs else ((), ())
+        updates = jax.tree_unflatten(treedef, updates_flat)
+        new_stats = jax.tree_unflatten(treedef, new_stats_flat)
+        new_state = ShampooState(count=state.count + 1, stats=new_stats)
+        return updates, new_state
+    if shard_optimizer_states:
+        return optax.GradientTransformation(sharded_init_fn, sharded_update_fn)
+    else:
+        return optax.GradientTransformation(init_fn, update_fn)

tools/train/sweep.yaml CHANGED Viewed

@@ -11,44 +11,39 @@ parameters:
     # from exp(min) to exp(max)
     min: -6.9
     max: -3.5
   gradient_accumulation_steps:
-    value: 8
   warmup_steps:
     value: 4000
-#TODO: outdated command
 command:
   - python3
   - ${program}
-  - "--tokenizer_name"
-  - "boris/dalle-mini-tokenizer"
-  - "--config_name"
-  - "facebook/bart-large-cnn"
-  - "--dataset_repo_or_path"
-  - "boris/gis_vqgan_f16_16384"
   - "--streaming"
-  - "--use_auth_token"
-  - "--image_vocab_size"
-  - 16384
-  - "--image_length"
-  - 256
-  - "--normalize_text"
-  - True
-  - "--per_device_train_batch_size"
-  - 56
-  - "--per_device_eval_batch_size"
-  - 56
-  - "--adafactor"
-  - "--do_train"
-  - "--do_eval"
-  - "--num_train_epochs"
-  - 1
-  - "--logging_steps"
-  - 40
-  - "--eval_steps"
-  - 800
   - "--output_dir"
   - "./output"
   - "--overwrite_output_dir"
-  - "--max_train_samples"
-  - 10000000
   - ${args}

     # from exp(min) to exp(max)
     min: -6.9
     max: -3.5
+  tokenizer_name:
+    value: boris/dalle-mini-tokenizer
+  config_name:
+    value: ./config/mini
+  dtype:
+    value: bfloat16
+  dataset_repo_or_path:
+    value: ./data
+  per_device_train_batch_size:
+    value: 64
+  per_device_eval_batch_size:
+    value: 64
   gradient_accumulation_steps:
+    value: 1
   warmup_steps:
     value: 4000
+  num_train_epochs:
+    value: 1
+  logging_steps:
+    value: 32
+  eval_steps:
+    value: 800
+  max_train_samples:
+    value: 1000000
 command:
   - python3
   - ${program}
   - "--streaming"
   - "--output_dir"
   - "./output"
   - "--overwrite_output_dir"
+  - "--adafactor"
+  - "--do_train"
+  - "--do_eval"
   - ${args}

tools/train/train.py CHANGED Viewed

@@ -34,6 +34,7 @@ import optax
 import transformers
 import wandb
 from datasets import Dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.serialization import from_bytes, to_bytes
@@ -41,10 +42,9 @@ from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard_prng_key
 from tqdm import tqdm
 from transformers import AutoTokenizer, HfArgumentParser
-from transformers.models.bart.modeling_flax_bart import BartConfig
 from dalle_mini.data import Dataset
-from dalle_mini.model import CustomFlaxBartForConditionalGeneration
 logger = logging.getLogger(__name__)
@@ -68,26 +68,12 @@ class ModelArguments:
             "help": "Pretrained config name or path if not the same as model_name"
         },
     )
-    image_vocab_size: Optional[int] = field(
-        default=None,
-        metadata={"help": "Vocab size of image encoder"},
-    )
-    image_length: Optional[int] = field(
-        default=None,
-        metadata={"help": "Number of tokens per image"},
-    )
     tokenizer_name: Optional[str] = field(
         default=None,
         metadata={
             "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
         },
     )
-    normalize_text: Optional[bool] = field(
-        default=None,
-        metadata={
-            "help": "Whether to normalize text or not. By default, we refer to base model or don't normalize for new models."
-        },
-    )
     dtype: Optional[str] = field(
         default="float32",
         metadata={
@@ -126,26 +112,21 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
-    dataset_type: str = field(
-        default="datasets",
-        metadata={"help": "Either 🤗 'dataset' (default) or 'webdataset'."},
-    )
     # data loading should not be a bottleneck so we use "streaming" mode by default
-    streaming: bool = field(
         default=True,
         metadata={"help": "Whether to stream the dataset."},
     )
-    use_auth_token: bool = field(
         default=False,
         metadata={
             "help": "Whether to use the authentication token for private datasets."
         },
     )
-    max_source_length: Optional[int] = field(
-        default=128,
         metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
         },
     )
     max_train_samples: Optional[int] = field(
@@ -232,7 +213,11 @@ class TrainingArguments:
     )
     adafactor: bool = field(
         default=False,
-        metadata={"help": "Whether or not to replace AdamW by Adafactor."},
     )
     weight_decay: float = field(
         default=None, metadata={"help": "Weight decay if we apply some."}
@@ -351,14 +336,39 @@ def create_learning_rate_fn(
     return schedule_fn
-def wandb_log(metrics, step=None, prefix=None):
-    if jax.process_index() == 0:
-        log_metrics = {
-            f"{prefix}/{k}" if prefix is not None else k: v for k, v in metrics.items()
         }
-        if step is not None:
-            log_metrics["train/step"] = step
-        wandb.log(log_metrics)
 def main():
@@ -411,20 +421,29 @@ def main():
         do_eval=training_args.do_eval,
     )
     # Set up wandb run
-    wandb.init(
-        entity="dalle-mini",
-        project="dalle-mini",
-        job_type="Seq2Seq",
-        config=parser.parse_args(),
-    )
     if training_args.resume_from_checkpoint is not None:
-        artifact = wandb.run.use_artifact(training_args.resume_from_checkpoint)
         artifact_dir = artifact.download()
         # load model
-        model = CustomFlaxBartForConditionalGeneration.from_pretrained(artifact_dir)
         # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
         print(model.params)
@@ -436,56 +455,24 @@ def main():
     else:
         # Set up our new model config
-        # TODO: simplify with custom config class
         if model_args.config_name:
-            config = BartConfig.from_pretrained(model_args.config_name)
-        else:
-            config = BartConfig.from_pretrained(model_args.model_name_or_path)
-        if model_args.image_vocab_size:
-            config.image_vocab_size = model_args.image_vocab_size
-        assert (
-            getattr(config, "image_vocab_size") is not None
-        ), "image_vocab_size must be specified when not present in base model/config"
-        if model_args.image_length:
-            config.image_length = model_args.image_length
-        assert (
-            getattr(config, "image_length") is not None
-        ), "image_length must be specified when not present in base model/config"
-        # we append decoder bos to image vocab
-        config.decoder_start_token_id = config.image_vocab_size
-        # ensure we don't generate bos (in addition to decoder start token)
-        config.force_bos_token_to_be_generated = False
-        config.forced_bos_token_id = None  # we don't need this token
-        config.forced_eos_token_id = None  # we don't need this token
-        config.tie_word_embeddings = False
-        config.min_length = config.image_length + 1
-        config.max_length = config.image_length + 1
-        # below tokens need to be set to avoid error during generation (converted to jnp.array)
-        # they are not expected to be used and are set to unreachable token id
-        config.bos_token_id = config.image_vocab_size + 1
-        config.pos_token_id = config.image_vocab_size + 1
-        config.eos_token_id = config.image_vocab_size + 1
-        # save whether we normalize the text
-        if model_args.normalize_text is not None:
-            config.normalize_text = model_args.normalize_text
         else:
-            config.normalize_text = getattr(config, "normalize_text", False)
         # Load or create new model
         if model_args.model_name_or_path:
-            model = CustomFlaxBartForConditionalGeneration.from_pretrained(
                 model_args.model_name_or_path,
                 config=config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
             )
             # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
             print(model.params)
         else:
-            model = CustomFlaxBartForConditionalGeneration(
                 config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
@@ -502,9 +489,6 @@ def main():
                 use_fast=True,
             )
-    logger.info(f"TPUs: {jax.device_count()}")
-    assert jax.device_count() == 8, "TPUs in use, please check running processes"
     # Preprocessing the datasets.
     # We need to normalize and tokenize inputs and targets.
@@ -512,6 +496,7 @@ def main():
         tokenizer=tokenizer,
         decoder_start_token_id=model.config.decoder_start_token_id,
         normalize_text=model.config.normalize_text,
     )
     # Initialize our training
@@ -520,18 +505,28 @@ def main():
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
     train_batch_size = (
-        int(training_args.per_device_train_batch_size) * jax.device_count()
     )
-    batch_size_per_update = train_batch_size * training_args.gradient_accumulation_steps
-    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
     len_train_dataset, len_eval_dataset = dataset.length
     steps_per_epoch = (
-        len_train_dataset // train_batch_size if len_train_dataset is not None else None
     )
     num_train_steps = (
         steps_per_epoch * num_epochs if steps_per_epoch is not None else None
     )
     # Create learning rate schedule
     learning_rate_fn = create_learning_rate_fn(
@@ -572,13 +567,43 @@ def main():
             weight_decay_mask=decay_mask_fn,
             clipping_threshold=training_args.max_grad_norm,
         )
     else:
         optimizer = optax.adamw(
             learning_rate=learning_rate_fn,
             b1=training_args.adam_beta1,
             b2=training_args.adam_beta2,
             eps=training_args.adam_epsilon,
-            weight_decay=training_args.weight_decay,
             mask=decay_mask_fn,
         )
@@ -625,7 +650,7 @@ def main():
             grads=grads,
             dropout_rng=new_dropout_rng,
             train_time=state.train_time + delta_time,
-            train_samples=state.train_samples + train_batch_size,
         )
         metrics = {
@@ -657,25 +682,30 @@ def main():
     logger.info(
         f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
     )
     logger.info(
         f"  Total train batch size (w. parallel, distributed & gradient accumulation) = {batch_size_per_update}"
     )
     epochs = tqdm(
         range(state.epoch, num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0
     )
-    # set default x-axis as 'train/step'
-    wandb_log({}, step=state.step)
-    wandb.define_metric("*", step_metric="train/step")
-    # add interesting config parameters
-    wandb.config.update(
-        {
-            "len_train_dataset": len_train_dataset,
-            "len_eval_dataset": len_eval_dataset,
-            "batch_size_per_update": batch_size_per_update,
-        }
-    )
     # replicate state on each device
     state = state.replicate()
@@ -706,7 +736,9 @@ def main():
             eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
             # log metrics
-            wandb_log(eval_metrics, step=unreplicate(state.step), prefix="eval")
             # Print metrics and update progress bar
             desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']})"
@@ -743,51 +775,61 @@ def main():
                     f,
                 )
-            # save to W&B
-            if training_args.log_model:
-                # save some space
-                c = wandb.wandb_sdk.wandb_artifacts.get_artifacts_cache()
-                c.cleanup(wandb.util.from_human_size("10GB"))
-                metadata = dict(state_dict)
-                if eval_metrics is not None:
-                    metadata["eval"] = eval_metrics
-                artifact = wandb.Artifact(
-                    name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
-                )
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "flax_model.msgpack")
-                )
-                artifact.add_file(str(Path(training_args.output_dir) / "config.json"))
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "tokenizer.json")
-                )
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "tokenizer_config.json")
-                )
-                artifact.add_file(str(Path(training_args.output_dir) / "vocab.json"))
-                artifact.add_file(str(Path(training_args.output_dir) / "merges.txt"))
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "special_tokens_map.json")
-                )
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "opt_state.msgpack")
-                )
-                artifact.add_file(
-                    str(Path(training_args.output_dir) / "training_state.json")
-                )
-                wandb.run.log_artifact(artifact)
-            # save to the hub
-            if training_args.push_to_hub:
-                model.save_pretrained(
-                    training_args.output_dir,
-                    params=params,
-                    push_to_hub=training_args.push_to_hub,
-                    commit_message=f"Saving weights and logs at step {unreplicate(state.step)+1}",
-                    temp_dir=True,  # avoid issues with being in a repository
-                )
     # init variables
     last_time = time.perf_counter()
@@ -796,7 +838,7 @@ def main():
     for epoch in epochs:
         state.replace(epoch=jax_utils.replicate(epoch))
         # ======================== Training ================================
-        wandb_log({"train/epoch": epoch}, step=unreplicate(state.step))
         # Generate an epoch by shuffling sampling indices from the train dataset
         train_loader = dataset.dataloader("train", train_batch_size)
@@ -821,14 +863,8 @@ def main():
             step = unreplicate(state.step)
             if step % training_args.logging_steps == 0 and jax.process_index() == 0:
-                # log metrics
-                metrics = unreplicate(train_metrics)
-                # log state parameters
-                state_dict = {
-                    k.split("_")[-1]: unreplicate(getattr(state, k))
-                    for k in ["epoch", "train_time", "train_samples"]
-                }
-                wandb_log({**metrics, **state_dict}, step=step, prefix="train")
             eval_metrics = None
             if training_args.eval_steps and step % training_args.eval_steps == 0:
@@ -839,8 +875,8 @@ def main():
         # log final train metrics
         if train_metrics is not None:
-            train_metrics = unreplicate(train_metrics)
-            wandb_log(train_metrics, step=step, prefix="train")
             epochs.write(
                 f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metrics['loss']}, Learning Rate: {train_metrics['learning_rate']})"

 import transformers
 import wandb
 from datasets import Dataset
+from distributed_shampoo import GraftingType, distributed_shampoo
 from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.serialization import from_bytes, to_bytes
 from flax.training.common_utils import get_metrics, onehot, shard_prng_key
 from tqdm import tqdm
 from transformers import AutoTokenizer, HfArgumentParser
 from dalle_mini.data import Dataset
+from dalle_mini.model import DalleBart, DalleBartConfig
 logger = logging.getLogger(__name__)
             "help": "Pretrained config name or path if not the same as model_name"
         },
     )
     tokenizer_name: Optional[str] = field(
         default=None,
         metadata={
             "help": "Pretrained tokenizer name or path if not the same as model_name_or_path"
         },
     )
     dtype: Optional[str] = field(
         default="float32",
         metadata={
         default=None,
         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
     # data loading should not be a bottleneck so we use "streaming" mode by default
+    streaming: Optional[bool] = field(
         default=True,
         metadata={"help": "Whether to stream the dataset."},
     )
+    use_auth_token: Optional[bool] = field(
         default=False,
         metadata={
             "help": "Whether to use the authentication token for private datasets."
         },
     )
+    shard_by_host: Optional[bool] = field(
+        default=False,
         metadata={
+            "help": "Whether to shard data files by host in multi-host environments."
         },
     )
     max_train_samples: Optional[int] = field(
     )
     adafactor: bool = field(
         default=False,
+        metadata={"help": "Use Adafactor instead of AdamW."},
+    )
+    distributed_shampoo: bool = field(
+        default=False,
+        metadata={"help": "Use Distributed Shampoo optimizer instead of AdamW."},
     )
     weight_decay: float = field(
         default=None, metadata={"help": "Weight decay if we apply some."}
     return schedule_fn
+class MetricsLogger:
     # Helper used by the training loop to (1) build the dict of training
     # metrics, including an average time-per-step estimate, and (2) send
     # metric dicts to Weights & Biases through wandb.log.
+    def __init__(self, state):
         # Baseline for the time-per-step estimate: the step taken from the
         # given training state and the wall-clock time at construction.
         # NOTE(review): state.step is stored as-is here (no int()/unreplicate),
         # whereas get_all_train_metrics later overwrites it with a plain int;
         # presumably the state is not yet replicated at this point - confirm.
+        self.step = state.step
+        self.time = time.perf_counter()
+    def get_all_train_metrics(self, train_metrics, state):
+        """Make a dict of training metrics to be logged"""
         # Returns the unreplicated train_metrics merged with selected state
         # fields and, when the step has advanced, a "time_per_step" entry.
+        metrics = unreplicate(train_metrics)
+        # get state parameters
         # Keys are the last "_"-separated piece of each attribute name, so
         # the resulting keys are "epoch", "time" and "samples".
+        state_dict = {
+            k.split("_")[-1]: unreplicate(getattr(state, k))
+            for k in ["epoch", "train_time", "train_samples"]
         }
+        # timing metrics
+        new_step = int(unreplicate(state.step))
+        new_time = time.perf_counter()
         # Only report timing when at least one step was taken since the last
         # measurement; this also avoids a division by zero below.
+        if new_step > self.step:
             # Average wall-clock seconds per step since the last measurement.
+            time_per_step = (new_time - self.time) / (new_step - self.step)
             # Move the baseline forward so the next call measures a new window.
+            self.step = new_step
+            self.time = new_time
+            state_dict["time_per_step"] = time_per_step
         # On a key collision the state_dict value overrides the metrics value.
+        return {**metrics, **state_dict}
+    @staticmethod
+    def log(metrics, step=None, prefix=None):
         # Log a dict of metrics with wandb.log; does nothing on processes
         # whose jax.process_index() is not 0.
         #   metrics: mapping of metric name -> value.
         #   step:    optional step value, added under the key "train/step".
         #   prefix:  optional namespace; each key becomes "<prefix>/<key>".
+        if jax.process_index() == 0:
+            log_metrics = {
+                f"{prefix}/{k}" if prefix is not None else k: v
+                for k, v in metrics.items()
+            }
             # The step key is always the literal "train/step" and is never
             # prefixed, so e.g. "eval" metrics carry the same step key.
+            if step is not None:
+                log_metrics["train/step"] = step
+            wandb.log(log_metrics)
 def main():
         do_eval=training_args.do_eval,
     )
+    logger.info(f"Local TPUs: {jax.local_device_count()}")
+    assert jax.local_device_count() == 8, "TPUs in use, please check running processes"
     # Set up wandb run
+    if jax.process_index() == 0:
+        wandb.init(
+            entity="dalle-mini",
+            project="dalle-mini",
+            job_type="Seq2Seq",
+            config=parser.parse_args(),
+        )
     if training_args.resume_from_checkpoint is not None:
+        if jax.process_index() == 0:
+            artifact = wandb.run.use_artifact(training_args.resume_from_checkpoint)
+        else:
+            artifact = wandb.Api().artifact(training_args.resume_from_checkpoint)
         artifact_dir = artifact.download()
         # load model
+        model = DalleBart.from_pretrained(
+            artifact_dir, dtype=getattr(jnp, model_args.dtype), abstract_init=True
+        )
         # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
         print(model.params)
     else:
         # Set up our new model config
         if model_args.config_name:
+            config = DalleBartConfig.from_pretrained(model_args.config_name)
         else:
+            config = DalleBartConfig.from_pretrained(model_args.model_name_or_path)
         # Load or create new model
         if model_args.model_name_or_path:
+            model = DalleBart.from_pretrained(
                 model_args.model_name_or_path,
                 config=config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
+                abstract_init=True,
             )
             # avoid OOM on TPU: see https://github.com/google/flax/issues/1658
             print(model.params)
         else:
+            model = DalleBart(
                 config,
                 seed=training_args.seed_model,
                 dtype=getattr(jnp, model_args.dtype),
                 use_fast=True,
             )
     # Preprocessing the datasets.
     # We need to normalize and tokenize inputs and targets.
         tokenizer=tokenizer,
         decoder_start_token_id=model.config.decoder_start_token_id,
         normalize_text=model.config.normalize_text,
+        max_length=model.config.max_text_length,
     )
     # Initialize our training
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
+    # batch size per node
     train_batch_size = (
+        int(training_args.per_device_train_batch_size) * jax.local_device_count()
+    )
+    batch_size_per_update = (
+        train_batch_size
+        * training_args.gradient_accumulation_steps
+        * jax.process_count()
+    )
+    eval_batch_size = (
+        int(training_args.per_device_eval_batch_size) * jax.local_device_count()
     )
     len_train_dataset, len_eval_dataset = dataset.length
     steps_per_epoch = (
+        len_train_dataset // (train_batch_size * jax.process_count())
+        if len_train_dataset is not None
+        else None
     )
     num_train_steps = (
         steps_per_epoch * num_epochs if steps_per_epoch is not None else None
     )
+    num_params = model.num_params
     # Create learning rate schedule
     learning_rate_fn = create_learning_rate_fn(
             weight_decay_mask=decay_mask_fn,
             clipping_threshold=training_args.max_grad_norm,
         )
+    elif training_args.distributed_shampoo:
+        # parameters from https://github.com/tensorflow/lingvo/blob/03ee9d7cd50764b0424c7c863733c91fc0b053ec/lingvo/jax/optimizers.py#L729
+        # Notes:
+        # - mask for weight decay is not implemented but we don't use it anyway
+        optimizer = distributed_shampoo(
+            learning_rate_fn,
+            block_size=1024,  # recommended default for large LM is 1536
+            beta1=0.9,
+            beta2=0.999,
+            diagonal_epsilon=1e-10,
+            matrix_epsilon=1e-8,
+            weight_decay=0.0,
+            start_preconditioning_step=1001,
+            preconditioning_compute_steps=10,
+            statistics_compute_steps=1,
+            best_effort_shape_interpretation=True,
+            graft_type=GraftingType.RMSPROP_NORMALIZED,
+            nesterov=False,
+            exponent_override=0,
+            batch_axis_name="batch",
+            inverse_failure_threshold=0.1,
+            moving_average_for_momentum=True,
+            skip_preconditioning_dim_size_gt=4096,
+            clip_by_scaled_gradient_norm=None,
+            precision=jax.lax.Precision.HIGHEST,
+            best_effort_memory_usage_reduction=False,
+        )
     else:
         optimizer = optax.adamw(
             learning_rate=learning_rate_fn,
             b1=training_args.adam_beta1,
             b2=training_args.adam_beta2,
             eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay
+            if training_args.weight_decay is not None
+            else 0.0,
             mask=decay_mask_fn,
         )
             grads=grads,
             dropout_rng=new_dropout_rng,
             train_time=state.train_time + delta_time,
+            train_samples=state.train_samples + train_batch_size * jax.process_count(),
         )
         metrics = {
     logger.info(
         f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
     )
+    logger.info(f"  Number of devices = {jax.device_count()}")
     logger.info(
         f"  Total train batch size (w. parallel, distributed & gradient accumulation) = {batch_size_per_update}"
     )
+    logger.info(f"  Model parameters = {num_params:,}")
     epochs = tqdm(
         range(state.epoch, num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0
     )
+    metrics_logger = MetricsLogger(state)
+    if jax.process_index() == 0:
+        # set default x-axis as 'train/step'
+        metrics_logger.log({}, step=state.step)
+        wandb.define_metric("*", step_metric="train/step")
+        # add interesting config parameters
+        wandb.config.update(
+            {
+                "len_train_dataset": len_train_dataset,
+                "len_eval_dataset": len_eval_dataset,
+                "batch_size_per_update": batch_size_per_update,
+                "num_params": num_params,
+            }
+        )
     # replicate state on each device
     state = state.replicate()
             eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
             # log metrics
+            metrics_logger.log(
+                eval_metrics, step=unreplicate(state.step), prefix="eval"
+            )
             # Print metrics and update progress bar
             desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']})"
                     f,
                 )
+            if jax.process_index() == 0:
+                # save to W&B
+                if training_args.log_model:
+                    # save some space
+                    c = wandb.wandb_sdk.wandb_artifacts.get_artifacts_cache()
+                    c.cleanup(wandb.util.from_human_size("10GB"))
+                    metadata = dict(state_dict)
+                    metadata["num_params"] = num_params
+                    if eval_metrics is not None:
+                        metadata["eval"] = eval_metrics
+                    artifact = wandb.Artifact(
+                        name=f"model-{wandb.run.id}",
+                        type="bart_model",
+                        metadata=metadata,
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "flax_model.msgpack")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "config.json")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "tokenizer.json")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "tokenizer_config.json")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "vocab.json")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "merges.txt")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "special_tokens_map.json")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "opt_state.msgpack")
+                    )
+                    artifact.add_file(
+                        str(Path(training_args.output_dir) / "training_state.json")
+                    )
+                    wandb.run.log_artifact(artifact)
+                # save to the hub
+                if training_args.push_to_hub:
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs at step {unreplicate(state.step)+1}",
+                        temp_dir=True,  # avoid issues with being in a repository
+                    )
     # init variables
     last_time = time.perf_counter()
     for epoch in epochs:
         state.replace(epoch=jax_utils.replicate(epoch))
         # ======================== Training ================================
+        metrics_logger.log({"train/epoch": epoch}, step=unreplicate(state.step))
         # Generate an epoch by shuffling sampling indices from the train dataset
         train_loader = dataset.dataloader("train", train_batch_size)
             step = unreplicate(state.step)
             if step % training_args.logging_steps == 0 and jax.process_index() == 0:
+                all_metrics = metrics_logger.get_all_train_metrics(train_metrics, state)
+                metrics_logger.log(all_metrics, step=step, prefix="train")
             eval_metrics = None
             if training_args.eval_steps and step % training_args.eval_steps == 0:
         # log final train metrics
         if train_metrics is not None:
+            all_metrics = metrics_logger.get_all_train_metrics(train_metrics, state)
+            metrics_logger.log(all_metrics, step=step, prefix="train")
             epochs.write(
                 f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metrics['loss']}, Learning Rate: {train_metrics['learning_rate']})"