HugoVoxx committed on
Commit 15bcbe6 · verified · 1 Parent(s): f18cde5

Upload 20 files

aglib/meliad/transformer/__init__.py ADDED
@@ -0,0 +1,15 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
aglib/meliad/transformer/attention.py ADDED
@@ -0,0 +1,443 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Transformer attention functions."""
+
+ import typing
+ from typing import Any, Callable, Mapping, NewType, Optional, Sequence, Tuple, Union
+
+ from absl import logging
+ from flax import linen as nn
+ import jax
+ import jax.numpy as jnp
+
+ from transformer import nn_components
+ from transformer import position
+
+
+ Array = jnp.ndarray
+ ArrayTree = Union[Array, Tuple["ArrayTree", ...]]
+ DecoderState = NewType("DecoderState", Mapping[str, Array])
+
+ # Tuple of keys, values, importance.
+ KVITuple = Tuple[Array, Array, Optional[Array]]
+
+ # Tuple of keys, values, queries, queries2, importance.
+ KVQITuple = Tuple[Array, Array, Array, Optional[Array], Optional[Array]]
+
+ # Tuple of scale factors. See TransformerBase.attention_scale_factors().
+ AttnScaleTuple = Tuple[Optional[Array], Optional[Array]]
+
+
+ def initial_kvi(shape: Sequence[int], use_importance: bool, dtype: Any):
+   """Returns initial (zero) keys/values/i that can be passed to prev_kvi."""
+   z = jnp.zeros(shape, dtype=dtype)
+   if use_importance:
+     i = jnp.zeros((shape[0], shape[1]), dtype=dtype)  # (bsize, window_length)
+   else:
+     i = None
+   return (z, z, i)
+
+
+ def concat_kvqi(kvqi: KVQITuple, prev_kvi: Optional[KVITuple]) -> (
+     Tuple[KVQITuple, Optional[KVITuple]]):
+   """Concatenate previous keys,values with current keys,values.
+
+   Args:
+     kvqi: Current keys, values, queries, queries2, importance.
+     prev_kvi: Previous keys, values, importance.
+
+   Returns:
+     (kvqi: Concatenated (keys, values, queries, importance),
+      next_kvi: Next (keys, values, importance))  (from kvqi)
+   """
+
+   (keys, values, queries, queries2, importance) = kvqi
+   # The current keys,values,importance will be passed to the next window.
+   next_kvi = (keys, values, importance)
+   (batch_size, _, num_heads, head_dim) = keys.shape  # (b, _, h, d)
+
+   if prev_kvi is None:
+     return (kvqi, None)  # If prev_kvi is None, next_kvi should be None.
+
+   # Unpack prev_kvi and check shapes.
+   (pkeys, pvalues, pimportance) = prev_kvi
+   num_pkeys = pkeys.shape[1]
+   assert pkeys.shape == (batch_size, num_pkeys, num_heads, head_dim)
+   assert pkeys.shape == pvalues.shape
+   if pimportance is not None:
+     assert pimportance.shape == (batch_size, num_pkeys)
+
+   # Concatenate keys and values.
+   keys = jnp.concatenate([pkeys, keys], axis=1)  # (b, k, h, d)
+   values = jnp.concatenate([pvalues, values], axis=1)  # (b, k, h, d)
+   if importance is not None:
+     assert pimportance is not None
+     importance = jnp.concatenate([pimportance, importance], axis=1)  # (b, k)
+     logging.info("attn: importance = %r", importance)
+
+   return ((keys, values, queries, queries2, importance), next_kvi)
+
+
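As a quick illustration of how `initial_kvi` and `concat_kvqi` fit together for a sliding window (a minimal sketch, not part of the commit; it assumes the meliad `transformer` package is importable and the shapes are illustrative only):

    import jax.numpy as jnp
    from transformer import attention

    (b, n, h, d) = (1, 4, 2, 8)  # batch, window, heads, head_dim
    kvq = jnp.ones((b, n, h, d))
    # Zero-filled previous window, as produced by initial_kvi.
    prev_kvi = attention.initial_kvi((b, n, h, d), use_importance=False,
                                     dtype=jnp.float32)
    ((keys, values, queries, _, _), next_kvi) = attention.concat_kvqi(
        (kvq, kvq, kvq, None, None), prev_kvi)
    assert keys.shape == (b, 2 * n, h, d)      # previous keys are prepended
    assert next_kvi[0].shape == (b, n, h, d)   # current keys, for the next window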
+ def simple_attention(keys: Array,
+                      values: Array,
+                      queries: Array,
+                      importance: Optional[Array],
+                      *,
+                      relative_position_bias: Optional[Array] = None,
+                      scale_factor: Optional[Array] = None,
+                      causal_mask: Optional[Array] = None,
+                      dropout_multiplier: Optional[Array] = None,
+                      dtype: Any = jnp.float32) -> Array:
+   """Simple attention from a set of queries to a set of keys,values.
+
+   Args:
+     keys: of shape [batch_size, num_keys, num_heads, head_dim].
+     values: of shape [batch_size, num_keys, num_heads, head_dim].
+     queries: of shape [batch_size, num_queries, num_heads, head_dim].
+     importance: of shape [batch_size, num_keys].
+
+     *: ---- the following arguments are passed by keyword only ----
+     relative_position_bias: A positional attention matrix of shape
+         [num_heads, num_queries, num_keys]
+     scale_factor: Learned scale factor for use with normalized keys,queries
+         of shape [num_heads]
+     causal_mask: A boolean array of shape [num_heads, num_queries, num_keys]
+     dropout_multiplier: A random mask of either 0.0 or 1.0/keep_prob,
+         of shape [num_heads, num_queries, num_keys]
+     dtype: data type to perform attention at.
+
+   Returns:
+     Attention outputs of shape [batch_size, num_queries, num_heads, head_size]
+   """
+
+   # (batch_size, num_keys, num_heads, head_dim)
+   (batch_size, num_keys, num_heads, head_dim) = keys.shape  # (b, k, h, d)
+   num_queries = queries.shape[1]
+   assert keys.shape == values.shape
+   assert queries.shape == (batch_size, num_queries, num_heads, head_dim)
+   if importance is not None:
+     assert importance.shape == (batch_size, num_keys)
+
+   logging.info("attn: keys = %r", keys)
+   logging.info("attn: queries = %r", queries)
+
+   # Compute attention matrix.
+   attn = jnp.einsum("...qhd,...khd->...hqk", queries, keys)  # (b, h, q, k)
+
+   logging.info("attn: content attn = %r", attn)
+
+   # Apply relative position bias.
+   if relative_position_bias is not None:
+     logging.info("attn: pbias = %r", relative_position_bias)
+     relative_position_bias = jnp.asarray(relative_position_bias, dtype=dtype)
+     pbias = position.broadcast_mask(relative_position_bias, attn)
+     attn = attn + pbias
+
+   # Apply learned attention scale.
+   if scale_factor is not None:
+     logging.info("attn: learned attention scale: %s", scale_factor)
+     # Broadcast scale over batch/keys/queries.
+     scale_factor = jnp.asarray(scale_factor, dtype=dtype)
+     scale_factor = scale_factor.reshape((1, num_heads, 1, 1))
+     attn = attn * scale_factor
+
+   # Apply causal mask.
+   if causal_mask is not None:
+     causal_mask = position.broadcast_mask(causal_mask, attn)
+     attn = jnp.where(causal_mask, attn, jnp.asarray(-1_000_000.0, dtype=dtype))
+
+   logging.info("attn: pre-softmax attn = %r", attn)
+
+   # Normalize attention matrix with softmax.
+   # min_x should be much smaller than minimum expected values in attn, but
+   # much larger than the masked_out values created by the causal mask. That
+   # way, if all tokens are masked out, then softmax will attend to nothing,
+   # rather than attend to everything equally.
+   min_x = jnp.asarray(-1000.0, dtype=dtype)
+   attn = nn_components.safe_softmax(attn, axis=-1, min_x=min_x)  # (b, h, q, k)
+
+   # Apply dropout to attention matrix.
+   if dropout_multiplier is not None:
+     logging.debug("attn: drop = %r", dropout_multiplier)
+     dropout_multiplier = jnp.asarray(dropout_multiplier, dtype=dtype)
+     attn = attn * dropout_multiplier
+
+   logging.info("attn: final attn = %r", attn)
+
+   # Compute output -- values weighted by attention matrix.
+   y = jnp.einsum("...hqk,...khd->...qhd", attn, values)  # (b, q, h, d)
+
+   logging.info("attn: y = %r", y)
+   return y
+
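A minimal sketch of calling `simple_attention` with a causal mask (not part of the commit; the mask is built in the documented `[num_heads, num_queries, num_keys]` shape, which `position.broadcast_mask` then broadcasts over the batch):

    import jax
    import jax.numpy as jnp
    from transformer import attention

    (b, q, k, h, d) = (2, 8, 8, 4, 16)
    (kr, vr, qr) = jax.random.split(jax.random.PRNGKey(0), 3)
    keys = jax.random.normal(kr, (b, k, h, d))
    values = jax.random.normal(vr, (b, k, h, d))
    queries = jax.random.normal(qr, (b, q, h, d))
    # Lower-triangular causal mask: query i may only attend to keys j <= i.
    causal = jnp.broadcast_to(jnp.tril(jnp.ones((q, k), dtype=jnp.bool_)),
                              (h, q, k))
    y = attention.simple_attention(keys, values, queries, None,
                                   causal_mask=causal)
    assert y.shape == (b, q, h, d)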
+
+ def external_attention(external_keys: Array,
+                        external_values: Array,
+                        queries: Array,
+                        *,
+                        scale_factor: Optional[Array] = None,
+                        dtype: Any = jnp.float32) -> Array:
+   """Attention over (keys, values) retrieved from external memory.
+
+   Args:
+     external_keys: per-query keys from external memory, of shape
+         [batch_size, num_queries, num_heads, num_neighbors, head_size]
+     external_values: per-query values from external memory, of shape
+         [batch_size, num_queries, num_heads, num_neighbors, head_size]
+     queries: current queries, of shape:
+         [batch_size, num_queries, num_heads, head_size]
+
+     *: ---- the following arguments are passed by keyword only. ---
+     scale_factor: Learned scale factor for use with normalized keys,queries
+         of shape [num_heads]
+     dtype: data type to perform attention at.
+
+   Returns:
+     Attention outputs of shape [batch_size, num_queries, num_heads, head_size]
+   """
+
+   (batch_size, num_queries, num_heads, _, head_dim) = external_keys.shape
+   assert queries.shape == (batch_size, num_queries, num_heads, head_dim)
+   assert external_values.shape == external_keys.shape
+
+   # Build attention matrix.
+   logging.info("ext_attn: external keys = %r", external_keys)
+   ext_attn = jnp.einsum("...qhd,...qhid->...hqi", queries, external_keys)
+
+   logging.info("ext_attn: external_mem_attn: %s", ext_attn)
+   if scale_factor is not None:
+     scale_factor = jnp.asarray(scale_factor, dtype=dtype)
+     scale_factor = scale_factor.reshape((1, num_heads, 1, 1))
+     logging.info("ext_attn: scaling external_mem_attn by %s", scale_factor)
+     ext_attn = ext_attn * scale_factor
+
+   ext_attn = nn.softmax(ext_attn, axis=-1)
+
+   # Compute weighted sum of values.
+   ext_y = jnp.einsum("...hqi,...qhid->...qhd", ext_attn, external_values)
+   logging.info("ext_attn: ext_y = %r", ext_y)
+   return ext_y
+
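A shape-level sketch of `external_attention` (not part of the commit; the `i` axis holds neighbors assumed to have been retrieved per query from external memory, and since the values are all ones, the softmax-weighted average is exactly one):

    import jax
    import jax.numpy as jnp
    from transformer import attention

    (b, q, h, i, d) = (1, 4, 2, 3, 8)  # i = retrieved neighbors per query
    ext_keys = jax.random.normal(jax.random.PRNGKey(1), (b, q, h, i, d))
    ext_values = jnp.ones((b, q, h, i, d))
    queries = jax.random.normal(jax.random.PRNGKey(2), (b, q, h, d))
    y = attention.external_attention(ext_keys, ext_values, queries)
    assert y.shape == (b, q, h, d)  # one output per query, as in local attention
    assert jnp.allclose(y, 1.0)     # softmax weights sum to 1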
+
+ def sliding_attention_window_shape(kvi: KVITuple,
+                                    prev_kvi: Optional[KVITuple],
+                                    queries: Array,
+                                    window_length: int) -> Tuple[int, int]:
+   """Return (num_queries, num_keys) for the sliding attention window."""
+
+   # Do error checking here.
+   (keys, values, importance) = kvi
+   assert keys.shape == queries.shape
+   assert values.shape == queries.shape
+
+   # Get sizes...
+   (batch_size, sequence_length, _, _) = queries.shape
+
+   if importance is not None:
+     assert importance.ndim == 2
+     assert importance.shape == (batch_size, sequence_length)
+
+   assert window_length > 0
+   if window_length >= sequence_length:
+     # No sliding window.
+     num_queries = sequence_length
+     num_keys = sequence_length
+     if prev_kvi is not None:
+       num_keys += prev_kvi[0].shape[1]
+   else:
+     # Sliding window.
+     if prev_kvi is not None:
+       assert prev_kvi[0].shape[1] == window_length
+     num_queries = window_length
+     num_keys = window_length * 2
+
+   return (num_queries, num_keys)
+
+
+ def split_tree(tree: ArrayTree, sections: int, axis: int = 0) -> (
+     Sequence[ArrayTree]):
+   """Recursively splits a possibly nested tuple of arrays along the given axis.
+
+   Args:
+     tree: A nested tree of tuples and arrays.
+     sections: The number of sections to split the tree into.
+     axis: The axis to do the split on arrays.
+
+   Returns:
+     A list of trees, of length sections, where each has the same shape as the
+     original, but with arrays of size 1/sections.
+   """
+
+   if tree is None:
+     return [None] * sections
+   elif isinstance(tree, jnp.ndarray):
+     return jnp.split(tree, sections, axis=axis)
+   elif isinstance(tree, tuple):
+     # Recursively split each element of the tuple into a list.
+     branch_lists = [split_tree(tree_i, sections, axis=axis) for tree_i in tree]
+     # Rearrange the tuple of lists into a list of tuples.
+     return [tuple([brs[i] for brs in branch_lists]) for i in range(sections)]
+   else:
+     raise ValueError("Argument %r must be an ndarray or tuple." % tree)
+
+
+ def concat_trees(tree_list: Sequence[ArrayTree], axis: int = 0) -> ArrayTree:
+   """Merges a list of trees into a single tree by concatenating their elements.
+
+   Args:
+     tree_list: A list of trees, all of the same shape.
+     axis: The axis to concatenate arrays on.
+
+   Returns:
+     A single tree, with the same shape as the trees in tree_list.
+   """
+
+   # All trees in the list are required to have the same shape.
+   # We return a tree with the same shape as each of the trees in the list.
+   first_tree = tree_list[0]
+   if first_tree is None:
+     # Merge a list of None into a single None.
+     for tree_i in tree_list:
+       assert tree_i is None
+     return None
+   elif isinstance(first_tree, jnp.ndarray):
+     # Concatenate a list of arrays.
+     for tree_i in tree_list:
+       assert isinstance(tree_i, jnp.ndarray)
+     return jnp.concatenate(tree_list, axis=axis)
+   elif isinstance(first_tree, tuple):
+     # Reshape a list of tuples into a tuple of concatenated lists.
+     for tree_i in tree_list:
+       assert isinstance(tree_i, tuple) and len(tree_i) == len(first_tree)
+     num_branches = len(first_tree)
+     return tuple([concat_trees([tree[b] for tree in tree_list], axis=axis)
+                   for b in range(num_branches)])
+   else:
+     raise ValueError("Argument %r must be an ndarray or tuple." % first_tree)
+
+
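A round-trip sketch for `split_tree` / `concat_trees` (not part of the commit):

    import jax.numpy as jnp
    from transformer import attention

    x = jnp.arange(12).reshape(6, 2)
    tree = (x, (x + 100, None))  # None leaves are allowed and preserved
    parts = attention.split_tree(tree, sections=3, axis=0)
    assert len(parts) == 3 and parts[0][0].shape == (2, 2)
    merged = attention.concat_trees(parts, axis=0)
    assert (merged[0] == x).all() and merged[1][1] is None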
+ def reshape_transpose_tree(tree: ArrayTree, sections: int, axis: int = 0) -> (
+     ArrayTree):
+   """Reshape and transpose arrays so that the window is dimension 0."""
+
+   # We could use jax tree utils for this, but we do it the hard way so the
+   # implementation can be compared with split_tree.
+   if tree is None:
+     return None
+   elif isinstance(tree, jnp.ndarray):
+     tree = typing.cast(Array, tree)  # Tell type-checker about isinstance
+     ndim = tree.ndim
+     wlen = tree.shape[axis] // sections
+     assert sections * wlen == tree.shape[axis]  # Must be evenly divisible.
+
+     # Break the axis dimension into sections * window_size
+     arr = tree
+     sh = list(arr.shape)
+     nshape = sh[0:axis] + [sections, wlen] + sh[axis + 1:]
+     arr = jnp.reshape(arr, nshape)
+
+     # Transpose sections to be dimension 0.
+     tdims = [axis] + list(range(0, axis)) + list(range(axis + 1, ndim + 1))
+     arr = jnp.transpose(arr, tdims)
+     return arr
+   elif isinstance(tree, tuple):
+     return tuple([reshape_transpose_tree(b, sections, axis) for b in tree])
+   else:
+     raise ValueError("Argument %r must be an ndarray or tuple." % tree)
+
+
+ def transpose_reshape_tree(tree: ArrayTree, sections: int, axis: int = 0) -> (
+     ArrayTree):
+   """Inverse of reshape_transpose_tree: folds dimension 0 back into the axis."""
+
+   # We could use jax tree utils for this, but we do it the hard way so the
+   # implementation can be compared with split_tree.
+   if tree is None:
+     return None
+   elif isinstance(tree, jnp.ndarray):
+     tree = typing.cast(Array, tree)  # Tell type-checker about isinstance
+     ndim = tree.ndim - 1  # Input tree has 1 extra dimension on front.
+     assert axis < ndim
+     wlen = tree.shape[axis + 1]  # Window length.
+
+     # Transpose dimension 0 back to its proper place.
+     arr = tree
+     tdims = list(range(1, axis + 1)) + [0] + list(range(axis + 1, ndim + 1))
+     arr = jnp.transpose(arr, tdims)
+
+     # Combine the sections and window_size dimensions.
+     sh = list(arr.shape)
+     nshape = sh[0:axis] + [sections * wlen] + sh[axis + 2:]
+     arr = jnp.reshape(arr, nshape)
+     return arr
+   elif isinstance(tree, tuple):
+     return tuple([transpose_reshape_tree(b, sections, axis) for b in tree])
+   else:
+     raise ValueError("Argument %r must be an ndarray or tuple." % tree)
+
+
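The two functions above are inverses; a small sketch demonstrating the round trip (not part of the commit):

    import jax.numpy as jnp
    from transformer import attention

    x = jnp.arange(24).reshape(2, 6, 2)  # (batch, sequence, features)
    w = attention.reshape_transpose_tree(x, sections=3, axis=1)
    assert w.shape == (3, 2, 2, 2)       # (sections, batch, window, features)
    back = attention.transpose_reshape_tree(w, sections=3, axis=1)
    assert (back == x).all()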
+ def split_and_scan(fn: Callable[[ArrayTree, ArrayTree],
+                                 Tuple[ArrayTree, ArrayTree]],
+                    carry: ArrayTree, input_arrays: ArrayTree,
+                    sections: int, axis: int = 0,
+                    max_unrolled_windows: int = -1) -> (
+                        Tuple[ArrayTree, ArrayTree]):
+   """Scan over a set of input arrays in chunks.
+
+   Splits each array in 'input_arrays' into the number of chunks given by
+   'sections', and then loops over the chunks using a scan operation.
+   Returns a concatenation of the results.
+
+   Args:
+     fn: A function from (carry, input_i) -> (carry, output_i).
+     carry: The initial state for the scan, that will be passed from one
+         iteration to the next.
+     input_arrays: A nested tree of tuples of arrays.
+     sections: The number of sections or chunks for the split.
+     axis: The axis to split each array along.
+     max_unrolled_windows: If 0 <= max_unrolled_windows < sections,
+         use jax.lax.scan rather than unrolling the windows with a python loop.
+
+   Returns:
+     (carry, output)
+   """
+
+   if sections == 1:
+     logging.info("Single window, no scan.")
+     return fn(carry, input_arrays)
+
+   if axis < 0:
+     raise ValueError(f"Axis must be non-negative. Got {axis}")
+
+   logging.info("Scanning over %d windows", sections)
+
+   if 0 <= max_unrolled_windows < sections:
+     logging.info("Using jax.lax.scan.")
+     in_arrs = reshape_transpose_tree(input_arrays, sections, axis)
+     (carry, out_arrs) = jax.lax.scan(fn, carry, in_arrs)
+     output_arrays = transpose_reshape_tree(out_arrs, sections, axis)
+     return (carry, output_arrays)
+
+   logging.info("Using unrolled for-loop.")
+   in_list = split_tree(input_arrays, sections, axis=axis)
+   out_list = []
+
+   for (k, in_chunk) in enumerate(in_list):
+     logging.info("Processing window %d", k)
+     (carry, out_chunk) = fn(carry, in_chunk)
+     out_list.append(out_chunk)
+
+   output_arrays = concat_trees(out_list, axis=axis)
+   return (carry, output_arrays)
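A sketch of `split_and_scan` with a scalar carry (not part of the commit; with the default `max_unrolled_windows=-1` this takes the unrolled for-loop path):

    import jax.numpy as jnp
    from transformer import attention

    xs = jnp.arange(8.0).reshape(1, 8)  # (batch, sequence)

    def shift_window(carry, x_w):
      # Shift each window by the running sum of all previous windows.
      return (carry + x_w.sum(), x_w + carry)

    (total, ys) = attention.split_and_scan(shift_window, jnp.float32(0.0), xs,
                                           sections=4, axis=1)
    assert ys.shape == xs.shape and total == xs.sum()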
aglib/meliad/transformer/decoder_stack.py ADDED
@@ -0,0 +1,426 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Hierarchical transformer."""
+
+ import functools
+ from typing import Any, Callable, Optional, Sequence, Tuple
+
+ from absl import logging
+
+ from flax import linen as nn
+ from flax import struct
+ import gin
+ import jax.numpy as jnp
+ from transformer import attention
+ from transformer import metric_utils
+ from transformer import nn_components
+ from transformer import position
+ from transformer import transformer_layer
+
+
+ Array = Any
+
+
+ # Basic task options are shared among multiple classes.
+ @gin.configurable
+ @struct.dataclass
+ class TransformerTaskConfig:
+   """Configuration hyperparameters for sequence-to-sequence tasks."""
+
+   dataset_name: str = "synthetic"
+   train_split: str = "train"
+   test_split: str = "test"
+   sequential_chunks: bool = True  # Process chunks of text in sequential order.
+
+   sequence_length: int = 4096
+   batch_size: int = 1  # per device batch size
+   vocab_size: int = 256
+
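A sketch of configuring this dataclass through Gin (not part of the commit; it assumes the unscoped class name is unambiguous in the Gin registry):

    import gin
    from transformer import decoder_stack

    gin.parse_config("""
    TransformerTaskConfig.dataset_name = "synthetic"
    TransformerTaskConfig.sequence_length = 1024
    TransformerTaskConfig.batch_size = 2
    """)
    task_config = decoder_stack.TransformerTaskConfig()
    assert task_config.sequence_length == 1024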
+
+ DStackDecoderState = Tuple[transformer_layer.DecoderState, ...]
+ DStackWindowState = Tuple[transformer_layer.WindowState, ...]
+
+
+ @gin.configurable
+ class DecoderStack(nn.Module):
+   """Stack of transformer decoder layers."""
+
+   mode: str
+   task_config: TransformerTaskConfig = gin.REQUIRED
+
+   # Configurable hyperparameters.
+   num_layers: int = gin.REQUIRED
+   embedding_size: int = gin.REQUIRED
+   embedding_stddev: float = 1.0
+
+   # The class to use for an individual transformer layer.
+   layer_factory: Any = gin.REQUIRED
+
+   # Window length to use for the decoder stack.
+   # If nonzero, use this instead of TransformerLayer.window_length.
+   dstack_window_length: int = 0
+   use_absolute_positions: bool = False
+   use_final_layernorm: bool = True
+   final_dropout_rate: float = 0.0
+   final_mlp_factory: Optional[Callable[[int], nn.Module]] = None
+
+   # Enable recurrence on particular layers.
+   recurrent_layer_indices: Sequence[int] = ()
+   feedback_recurrence: bool = True
+
+   # The factory function which creates a MemoryManager, or None.
+   memory_factory: Any = None
+   # Layers to equip with external memory.
+   memory_layer_indices: Sequence[int] = ()
+
+   dtype: Any = jnp.float32
+
+   def is_training(self):
+     return self.mode == "train"
+
+   def supports_generate(self) -> bool:
+     return all([lyr.supports_generate() for lyr in self.transformer_layers])
+
+   def setup(self):
+     task_config = self.task_config
+
+     embed_init = nn.initializers.normal(stddev=self.embedding_stddev,
+                                         dtype=jnp.float32)
+     self.embed = nn.Embed(num_embeddings=task_config.vocab_size,
+                           features=self.embedding_size,
+                           embedding_init=embed_init)
+
+     # Create a memory_factory.MemoryManager object, which is shared among
+     # all transformer layers. Each layer will use the MemoryManager object
+     # to instantiate a block of memory for that layer.
+     memory = None
+     if self.memory_factory is not None:
+       if self.memory_layer_indices:
+         memory = self.memory_factory(batch_size=task_config.batch_size,
+                                      mode=self.mode)
+       else:
+         logging.warning(
+             "Memory factory specified, but memory_layer_indices is empty.")
+
+     # Allow negative numbers in memory_layer_indices.
+     # Negative numbers refer to layers at the top of the stack.
+     for k in self.memory_layer_indices:
+       if k < -self.num_layers or k >= self.num_layers:
+         raise ValueError(f"Invalid memory layer index {k}")
+     # The % operator will convert negative k to self.num_layers + k.
+     mem_layer_indices = [
+         idx % self.num_layers for idx in self.memory_layer_indices
+     ]
+
+     # Allow negative numbers in recurrent_layer_indices.
+     for k in self.recurrent_layer_indices:
+       if k < -self.num_layers or k >= self.num_layers:
+         raise ValueError(f"Invalid recurrent layer index {k}")
+     recurrent_layer_indices = [
+         idx % self.num_layers for idx in self.recurrent_layer_indices
+     ]
+     # Turn on cross attention if there are recurrent layers with feedback.
+     enable_cross_attn = (self.feedback_recurrence and
+                          self.recurrent_layer_indices and
+                          self.dstack_window_length > 0)
+
+     layers = []
+     for i in range(0, self.num_layers):
+       mem = memory if (i in mem_layer_indices) else None
+       rec_i = i in recurrent_layer_indices
+       layer_fn = functools.partial(
+           self.layer_factory,
+           mode=self.mode,
+           batch_size=self.task_config.batch_size,
+           embedding_size=self.embedding_size,
+           name=f"transformer{i}",
+           recurrent_attention=rec_i,
+           cross_attention=enable_cross_attn and not rec_i)
+       if mem:
+         logging.info("Using external memory with transformer layer %d.", i)
+         layer_fn = functools.partial(
+             layer_fn,
+             memory=mem,
+             # We use partial function applications here only to avoid
+             # overwriting the head size unless memory is involved.
+             head_size=mem.key_size,
+             num_heads=mem.num_heads)
+       layers.append(layer_fn())
+     self.transformer_layers = layers
+
+     if self.use_final_layernorm:
+       self.final_layernorm = nn_components.LayerNorm()
+
+     if self.final_mlp_factory is not None:
+       self.final_mlp = self.final_mlp_factory(self.embedding_size)
+
+   def init_decoder_state(self, sequence_length: int,
+                          start_of_sequence: Array) -> DStackDecoderState:
+     """Return initial state for autoregressive generation."""
+     return tuple([
+         layer.init_decoder_state(sequence_length, start_of_sequence)
+         for layer in self.transformer_layers
+     ])
+
+   def load_window_state(self, start_of_sequence: Array) -> DStackWindowState:
+     """Load cached state that is passed from one window to the next."""
+     return tuple([
+         layer.load_window_state(start_of_sequence)
+         for layer in self.transformer_layers
+     ])
+
+   def store_window_state(self, window_state: DStackWindowState):
+     """Write window state to the cache."""
+     for (layer, wstate) in zip(self.transformer_layers, window_state):
+       layer.store_window_state(wstate)
+
+   def _eval_layer_stack(self, xs: Array, start_of_sequence: Array,
+                         window_state: Optional[DStackWindowState],
+                         decoder_state: Optional[DStackDecoderState]) -> (
+                             Tuple[Array, Optional[DStackWindowState],
+                                   Optional[DStackDecoderState], Any]):
+     """Evaluate a stack of transformer layers on an input."""
+
+     ys = xs  # (batch_size, seq_len, num_hidden)
+     importance = None  # (batch_size, sequence_length)
+     next_window_states = []
+     next_decoder_states = []
+     attn_viz_dicts = []
+
+     # If we have a recurrent layer, grab the keys and values from it.
+     # All other layers can then cross-attend to the recurrent keys and values.
+     recurrent_kv = None
+     enable_cross_attn = (self.feedback_recurrence and
+                          self.recurrent_layer_indices and
+                          self.dstack_window_length > 0)
+     if enable_cross_attn and window_state is not None:
+       # TODO(delesley): fix this so it works with the autoregressive decoder.
+       assert decoder_state is None
+       logging.info("dstack: using recurrent cross attention on all layers.")
+       for (layer, wstate_i) in zip(self.transformer_layers, window_state):
+         rkv = layer.get_recurrent_kv(wstate_i)
+         if rkv is not None:
+           recurrent_kv = rkv
+
+     # Apply transformer layers.
+     for (i, layer) in enumerate(self.transformer_layers):
+       if layer.recurrent_attention:
+         cross_kv = None  # The recurrent layer handles rkv internally.
+       else:
+         cross_kv = recurrent_kv  # Other layers cross-attend to recurrent one.
+
+       logging.info("dstack: ---- Layer %d ----", i)
+       wstate_i = None if window_state is None else window_state[i]
+       dstate_i = None if decoder_state is None else decoder_state[i]
+       (ys, importance, n_wstate_i, n_dstate_i, viz_dict) = layer(
+           ys, start_of_sequence,
+           importance=importance,
+           cross_attention_kv=cross_kv,  # cross-attend to recurrent_kv.
+           window_state=wstate_i,
+           decoder_state=dstate_i)
+       next_window_states.append(n_wstate_i)
+       next_decoder_states.append(n_dstate_i)
+       attn_viz_dicts.append(viz_dict)
+
+     window_state = tuple(next_window_states)
+     decoder_state = tuple(next_decoder_states)
+     return (ys, window_state, decoder_state, attn_viz_dicts)
+
+   def __call__(self,
+                input_tokens: Array,
+                target_tokens: Array,
+                start_of_sequence: Array,
+                decoder_state: Optional[DStackDecoderState] = None) -> (
+                    Tuple[Array, Optional[DStackDecoderState], Any]):
+     """Call the decoder stack.
+
+     This function will embed tokens, run the embeddings through a stack of
+     decoder layers, and then compute logits for the target tokens using the
+     transpose of the embeddings. It returns un-normalized (pre-softmax)
+     logits.
+
+     Args:
+       input_tokens: Integer array of shape [batch_size, sequence_length]
+       target_tokens: For compatibility. Ignored by this class.
+       start_of_sequence: Boolean array of shape [batch_size],
+           which indicates whether a sequence is at the start of sequence.
+       decoder_state: State object for autoregressive decoding,
+           created from init_decoder_state.
+
+     Returns:
+       (logits, of shape [batch_size, sequence_length, vocab_size],
+        next_decoder_state: for autoregressive decoding,
+        viz_dict: dictionary of visualizations,
+       )
+     """
+     del target_tokens
+     task_config = self.task_config
+
+     # Embed tokens.
+     embeddings = self.embed(input_tokens)  # (batch_size, seq_len, num_hidden)
+     embeddings = embeddings.astype(self.dtype)
+     sequence_length = embeddings.shape[1]
+     logging.info("dstack: embeddings = %r", embeddings)
+
+     # Add absolute position encodings if necessary.
+     if self.use_absolute_positions:
+       # Use a large max_wavelength so that only part of the input vector
+       # is used for positions.
+       positions = position.position_encoding(
+           num_positions=task_config.sequence_length,
+           input_dim=self.embedding_size,
+           max_wavelength=10_000)
+       positions = jnp.asarray(positions, dtype=self.dtype)
+       positions = jnp.expand_dims(positions, 0)  # Add batch dimension.
+       logging.info("dstack: absolute positions = %r", positions)
+       embeddings = embeddings + positions
+
+     # Function to run the whole transformer stack on a single window.
+     # ---------------------------------------------------------------
+     def single_window_stack(carry, inputs_w):
+       (window_state_w, start_of_seq_w) = carry
+       (outputs_w, window_state_w, _, _) = self._eval_layer_stack(
+           inputs_w, start_of_seq_w,
+           window_state=window_state_w, decoder_state=None)
+
+       # start_of_sequence is false after the first window.
+       bsize = self.task_config.batch_size
+       next_start_of_seq = jnp.asarray([False] * bsize, dtype=jnp.bool_)
+       return ((window_state_w, next_start_of_seq), outputs_w)
+
+     # Find the number of windows. A sequence may be split into multiple
+     # windows here, or alternatively, it may be split (or further split) within
+     # TransformerLayer, depending on configuration.
+     if (self.dstack_window_length == 0 or
+         self.dstack_window_length >= sequence_length):
+       num_windows = 1
+     else:
+       num_windows = sequence_length // self.dstack_window_length
+       assert (num_windows * self.dstack_window_length) == sequence_length
+
+     # Evaluate the stack of layers, scanning over windows if configured.
+     # ------------------------------------------------------------------
+     if decoder_state is None:
+       logging.info("dstack: scanning over %d windows.", num_windows)
+       # Load cached state from the previous training step, for truncated BPTT.
+       window_state = self.load_window_state(start_of_sequence)
+
+       # Scan single_window_stack over the sequence.
+       cstate = (window_state, start_of_sequence)
+       (cstate, ys) = attention.split_and_scan(single_window_stack,
+                                               cstate,
+                                               embeddings,
+                                               sections=num_windows,
+                                               axis=1)
+       (window_state, _) = cstate
+
+       # Cache state for the next training step, for truncated BPTT.
+       self.store_window_state(window_state)
+       attn_viz_dicts = {}  # Temporarily disabled.
+     else:
+       logging.info("dstack: autoregressive generator.")
+       # Run as an autoregressive decoder: evaluate the whole stack on a token.
+       # Do not load or store window_state; decoder_state is used instead.
+       (ys, _, decoder_state, _) = self._eval_layer_stack(
+           embeddings, start_of_sequence,
+           window_state=None, decoder_state=decoder_state)
+       attn_viz_dicts = {}
+
+     # Apply layernorm to the final output, before calculating logits.
+     # With a pre-layernorm architecture, this has to be done here.
+     if self.use_final_layernorm:
+       logging.info("dstack: Final layernorm.")
+       ys = self.final_layernorm(ys)
+
+     # Final dropout before token prediction.
+     drop_tile_shape = (1, 128, self.embedding_size)
+     get_dropout_rng = lambda: self.make_rng("dropout")
+     ys = nn_components.tiled_dropout(ys, drop_tile_shape,
+                                      self.final_dropout_rate,
+                                      rng_function=get_dropout_rng,
+                                      deterministic=not self.is_training())
+
+     # Apply an MLP at the very end to convert the output of the transformer
+     # into a vector to look up target tokens in the embedding table.
+     # This final layer allows the NN to distinguish between the "input context",
+     # which is returned by the transformer resnet, and the "predicted target".
+     if self.final_mlp_factory is not None:
+       logging.info("dstack: Final MLP layer.")
+       ys = self.final_mlp(ys)
+
+     # Reverse embedding to generate logits which predict the output tokens.
+     logits = self.embed.attend(ys)  # (..., seq_len, vocab_size)
+     logging.info("dstack: logits = %r", logits)
+
+     # Normalize so that the range of logits is reasonable.
+     logits = logits / jnp.sqrt(logits.shape[-1]).astype(self.dtype)
+
+     # Produce various visualizations in generate mode.
+     # TODO(delesley): Too many visualizations crashes the summary writer.
+     if self.mode == "generate":
+       img_dict = self._make_images(attn_viz_dicts, [])
+       hist_dict = {}  # metric_utils.make_histograms(attn_viz_dicts)
+       info_dict = {**img_dict, **hist_dict}
+     else:
+       info_dict = {}  # Don't output any visualizations.
+
+     return (logits, decoder_state, info_dict)
+
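The tied input/output embedding used in `__call__` above can be seen in isolation with Flax's `nn.Embed.attend`, which multiplies activations by the transpose of the embedding table (a standalone sketch, not part of the commit):

    import jax
    import jax.numpy as jnp
    from flax import linen as nn

    embed = nn.Embed(num_embeddings=256, features=64)
    tokens = jnp.array([[1, 2, 3]], dtype=jnp.int32)
    params = embed.init(jax.random.PRNGKey(0), tokens)
    ys = embed.apply(params, tokens)                          # (1, 3, 64)
    logits = embed.apply(params, ys, method=nn.Embed.attend)  # (1, 3, 256)
    assert logits.shape == (1, 3, 256)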
+   def _make_importance_image(self, importance_list, scaled=True) -> Array:
+     rows = []
+     for imp in importance_list:
+       rows += [imp] * 8  # Rows are 8 pixels high for better visibility.
+     image = jnp.stack(rows)
+     if scaled:
+       image = jnp.exp(image)
+     image = metric_utils.normalize_image(image, True)
+     return metric_utils.reshape_image(image)
+
+   def _make_images(self, viz_dicts, importance_list):
+     image_dict = {}
+     for (i, viz_dict) in enumerate(viz_dicts):
+       if "attn_importance_gate" in viz_dict:
+         imp_gate = viz_dict["attn_importance_gate"][0]  # First item in batch.
+         imp_strip = metric_utils.normalize_image(imp_gate[:, 0:8, :], True)
+       else:
+         imp_strip = None
+
+       for (k, attn_images) in viz_dict.items():
+         if k not in {"attn_content",
+                      "attn_pre_softmax",
+                      "attn_log",
+                      "attn",
+                      "attn_position_bias",
+                      "attn_importance_bias",
+                      "attn_importance_gate"}:
+           continue
+
+         attn_img = attn_images[0]  # Grab the first item in the batch.
+         attn_img = metric_utils.normalize_image(attn_img,
+                                                 as_group=(k != "attn"))
+         if imp_strip is not None and k in {"attn_log", "attn"}:
+           # Show importance bias in a strip at the bottom of the image.
+           attn_img = metric_utils.overlay_images(attn_img, imp_strip)
+         attn_img = metric_utils.reshape_image(attn_img)  # Returns None on fail.
+         if attn_img is not None:
+           image_dict[k + "_" + str(i)] = attn_img
+
+     if importance_list:
+       # Create an image out of the importance for each layer.
+       image_dict["importance_gate"] = self._make_importance_image(
+           importance_list, scaled=True)
+       image_dict["importance_raw"] = self._make_importance_image(
+           importance_list, scaled=False)
+     return image_dict
aglib/meliad/transformer/ht_main.py ADDED
@@ -0,0 +1,56 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ r"""Main program to train htransformer models.
+
+ """
+
+ from typing import Sequence
+
+ from absl import app
+ from absl import flags
+ from clu import platform
+ import jax
+ from transformer import launcher
+ import tensorflow.compat.v2 as tf
+
+
+ FLAGS = flags.FLAGS
+
+
+ def main(argv: Sequence[str]) -> None:
+   if len(argv) > 1:
+     raise app.UsageError("Too many command-line arguments.")
+
+   launcher.parse_gin_configuration()
+
+   # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
+   # it unavailable to JAX.
+   tf.config.experimental.set_visible_devices([], "GPU")
+
+   # Set global seed for datasets.
+   # tf.random.set_seed(1234)
+
+   # Add a note so that we can tell which task is which JAX host.
+   # (Depending on the platform task 0 is not guaranteed to be host 0)
+   platform.work_unit().set_task_status(f"process_index: {jax.process_index()}, "
+                                        f"process_count: {jax.process_count()}")
+   platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
+                                        FLAGS.workdir, "workdir")
+
+   launcher.run_training_loop(testing=False)
+
+
+ if __name__ == "__main__":
+   app.run(main)
aglib/meliad/transformer/ht_main_inference.py ADDED
@@ -0,0 +1,76 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ r"""Program to run a transformer model over a single article.
+
+ """
+
+ # This program is currently a template, which can be expanded to do more
+ # sophisticated analysis.
+
+ from typing import Sequence
+
+ from absl import app
+ from absl import flags
+ from clu import platform
+ import jax
+ from transformer import inference_utils
+ from transformer import tasks  # pylint: disable=unused-import
+ import tensorflow.compat.v2 as tf
+
+
+ flags.DEFINE_string("workdir", "", "Directory to save model checkpoints.")
+ flags.DEFINE_string("load_dir", "", "Directory to load pre-trained model.")
+ flags.DEFINE_integer("num_steps", 110, "Number of steps.")
+
+ flags.DEFINE_list(
+     "gin_search_paths",
+     ["transformer/configs"],
+     "List of paths where the Gin config files are located.")
+ flags.DEFINE_multi_string(
+     "gin_file", ["base_htrans.gin"], "List of Gin config files.")
+ flags.DEFINE_multi_string(
+     "gin_param", None, "Newline separated list of Gin parameter bindings.")
+
+ FLAGS = flags.FLAGS
+
+
+ def main(argv: Sequence[str]) -> None:
+   if len(argv) > 1:
+     raise app.UsageError("Too many command-line arguments.")
+
+   # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
+   # it unavailable to JAX.
+   tf.config.experimental.set_visible_devices([], "GPU")
+
+   # Add a note so that we can tell which task is which JAX host.
+   # (Depending on the platform task 0 is not guaranteed to be host 0)
+   platform.work_unit().set_task_status(f"process_index: {jax.process_index()}, "
+                                        f"process_count: {jax.process_count()}")
+   platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
+                                        FLAGS.workdir, "workdir")
+
+   inference_utils.parse_gin_configuration(FLAGS.gin_file, FLAGS.gin_param,
+                                           gin_paths=FLAGS.gin_search_paths)
+
+   article_data = inference_utils.read_article(verbose=True)
+   (_, vocab) = article_data
+   (task, task_state, _) = inference_utils.create_model_and_task(
+       vocab, load_dir=FLAGS.load_dir)
+   outs = inference_utils.run_model(task, task_state, article_data,
+                                    verbose=True)
+   inference_utils.get_token_losses(outs)
+
+ if __name__ == "__main__":
+   app.run(main)
aglib/meliad/transformer/inference_utils.py ADDED
@@ -0,0 +1,271 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ r"""Various utility functions for doing inference on data.
+
+ This file provides a simple procedural API for loading a model, loading data,
+ and running the model over data. It is intended for use in, e.g., colabs.
+ """
+
+ from typing import Any, Dict, Optional, Sequence, Tuple
+
+ from absl import logging
+ import gin
+ import jax
+ import training_loop
+ from transformer import decoder_stack
+ from transformer import models
+ from transformer import text_dataset
+ import numpy as np
+ import seqio
+
+
+ Trainer = training_loop.Trainer
+ TrainState = training_loop.TrainState
+ TrainingTask = training_loop.TrainingTask
+ PRNGKeys = training_loop.PRNGKeys
+
+ ModelInput = Dict[str, Any]      # Input to model.
+ MetricsOutput = Dict[str, Any]   # Metrics output by model.
+ ArticleData = Tuple[Sequence[ModelInput], seqio.Vocabulary]
+ TaskState = Tuple[TrainState, int]
+
+
+ DEFAULT_GIN_PATHS = [
+     "transformer/configs"
+ ]
+
+
+ def parse_gin_configuration(gin_files: Optional[Sequence[str]],
+                             gin_params: Optional[Sequence[str]],
+                             gin_paths: Optional[Sequence[str]] = None):
+   """Load gin configuration options.
+
+   Args:
+     gin_files: A list of gin file names with the configuration to load.
+     gin_params: A list of additional parameter overrides.
+     gin_paths: A list of paths to search for gin_files.
+   """
+
+   # We allow None values to more easily handle command-line flags.
+   if gin_files is None:
+     gin_files = []
+   if gin_params is None:
+     gin_params = []
+   if gin_paths is None:
+     gin_paths = DEFAULT_GIN_PATHS
+
+   logging.info("Parsing gin configuration.")
+   for path in gin_paths:
+     logging.info("Added Gin search path %s", path)
+     gin.add_config_file_search_path(path)
+   for file_name in gin_files:
+     logging.info("Loading Gin config file %s", file_name)
+   for param in gin_params:
+     logging.info("Overriding Gin param %s", param)
+   gin.parse_config_files_and_bindings(gin_files, gin_params)
+
+
+ def read_article(split: Optional[str] = None,
+                  verbose: bool = False) -> ArticleData:
+   """Read a single article from the dataset and save it as a list of blocks.
+
+   This routine will return blocks for a single article; so the tokens will
+   have a batch size of 1. The blocks can be fed to the model directly as input.
+
+   Args:
+     split: The dataset split to load from. Defaults to the test split.
+     verbose: If True, will dump the contents of the article to the log.
+
+   Returns:
+     A pair of (list_of_blocks, vocabulary)
+   """
+
+   logging.info("Reading article.")
+
+   text_dataset.set_default_data_directory()
+   task_config = decoder_stack.TransformerTaskConfig()
+   batch_size = 1
+
+   if split is None:
+     split = task_config.test_split
+
+   (test_ds, vocab) = text_dataset.load_text_dataset(
+       name=task_config.dataset_name,
+       split=split,
+       sequence_length=task_config.sequence_length,
+       batch_size=batch_size,
+       sequential=task_config.sequential_chunks,
+       shard_dataset=False)
+
+   logging.info("Configured vocab_size = %d", task_config.vocab_size)
+   logging.info("Task vocabulary size = %d", vocab.vocab_size)
+   if task_config.vocab_size < vocab.vocab_size:
+     raise ValueError(
+         "Task vocabulary size does not match configured vocab_size: " +
+         f"{task_config.vocab_size} < {vocab.vocab_size}")
+
+   article_segments = []
+   ds_iter = test_ds.as_numpy_iterator()
+   vocab_map = {"targets": vocab}
+
+   segment_num = 0
+   while True:
+     try:
+       x = next(ds_iter)
+     except StopIteration:
+       logging.info("End of epoch? Something went wrong.")
+       break
+
+     # Make sure we've started reading, otherwise it immediately quits...
+     if article_segments:
+       if x["start_of_sequence"][0]:
+         break
+
+     if verbose:
+       logging.info("Segment %d = %s", segment_num,
+                    text_dataset.pretty_print_article(x, vocab_map,
+                                                      max_length=10_000))
+     article_segments.append(x)
+     segment_num += 1
+
+   logging.info("Done reading article: %d segments.", segment_num)
+   logging.info("Num tokens = %d", segment_num * task_config.sequence_length)
+   return (article_segments, vocab)
+
+
+ def create_model_and_task(vocab: seqio.Vocabulary,
+                           load_dir: Optional[str] = None) -> (
+                               Tuple[TrainingTask, TaskState, Trainer]):
+   """Initialize the model and get a task for inference.
+
+   The task will be configured to take test (inference) steps with the model.
+   The task will also be configured to run on a single replica, at batch size 1.
+
+   Args:
+     vocab: The vocabulary for the training data, used for logging and decoding.
+     load_dir: A directory which contains a pre-trained model.
+
+   Returns:
+     (task -- has a run_step method to take individual steps with the model,
+      state -- contains trainable parameters and other state,
+      trainer -- a Trainer object (see training_loop.py))
+   """
+
+   logging.info("JAX process: %d / %d", jax.process_index(), jax.process_count())
+   logging.info("JAX local devices: %r", jax.local_devices())
+
+   # This task won't be pulling from a dataset.
+   def null_iter_fn():
+     return None
+
+   trainer = training_loop.Trainer(
+       get_training_dataset_iterator=null_iter_fn,
+       get_test_dataset_iterator=None,
+       pretty_print_input_function=None,
+       process_summaries_function=models.process_summaries_function(vocab),
+       load_dir=load_dir,
+       workdir="",  # Don't log or save checkpoints.
+       replicate_mode=False)  # Run on a single device at batch size 1.
+
+   # Create and initialize the model.
+   (tstate, start_step, imodel, prngs) = trainer.initialize_model()
+
+   # Create an inference task.
+   writers = {}
+   task = trainer.create_training_task("test", imodel, prngs, writers)
+
+   # Register any additional actions.
+   # Actions are cleared first for use with colab.
+   training_loop.clear_interstep_callbacks()
+   training_loop.register_interstep_callbacks()
+
+   task_state = (tstate, start_step)
+   return (task, task_state, trainer)
+
+
+ def run_model(task: TrainingTask, task_state: TaskState,
+               article_data: ArticleData, verbose: bool = False) -> (
+                   Sequence[MetricsOutput]):
+   """Run the model on an article, and return the outputs for each segment.
+
+   Args:
+     task: The task to run, from create_model_and_task.
+     task_state: The state of the model, from create_model_and_task.
+     article_data: The article and vocabulary, from read_article.
+     verbose: If True, will send input and output to the log.
+
+   Returns:
+     A sequence of model outputs for each block.
+   """
+
+   logging.info("Running the model.")
+
+   (article_segments, vocab) = article_data
+   (tstate, start_step) = task_state
+   vocab_map = {"targets": vocab}
+
+   # Ignore the iterator for the test task, and loop over the article.
+   step = start_step
+   segment_num = 0
+
+   # Loop over the article, and run the model on each segment.
+   segment_outputs = []
+   for x in article_segments:
+     if verbose:
+       logging.info("Segment [%d] = %s", segment_num,
+                    text_dataset.pretty_print_article(x, vocab_map,
+                                                      max_length=10_000))
+     else:
+       logging.info("Segment %d, step %d.", segment_num, step)
+
+     (tstate, metrics_np) = task.run_step(tstate, x, step)
+     training_loop.run_interstep_callbacks("test", step)
+     segment_outputs.append(metrics_np)
+
+     if verbose:
+       logging.info("Output [%d] = %s", segment_num, metrics_np)
+
+     del x
+     segment_num += 1
+     step += 1
+
+   logging.info("Done running the model: %d segments.", segment_num)
+   return segment_outputs
+
+
+ def get_token_losses(segment_outputs: Sequence[Any]) -> np.ndarray:
+   """Return the loss for each token in a sequence.
+
+   Given a list of model outputs, extract the token losses from each output
+   and concatenate them together.
+
+   Args:
+     segment_outputs: the outputs from run_model().
+
+   Returns:
+     An array of shape (batch_size, sequence_length), of float.
+   """
+
+   block_token_losses = []
+   for seg in segment_outputs:
+     if "token_losses" in seg:
+       block_token_losses.append(seg["token_losses"])
+     else:
+       raise ValueError("Token losses were not recorded.")
+
+   logging.info("Got token losses for %d segments", len(block_token_losses))
+   token_losses = np.concatenate(block_token_losses, axis=-1)
+   logging.info("token_losses.shape = %r", token_losses.shape)
+   return token_losses
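Putting the procedural API together, a colab-style sketch (not part of the commit; the checkpoint directory is hypothetical, and a dataset must be configured via Gin first):

    from transformer import inference_utils

    inference_utils.parse_gin_configuration(["base_htrans.gin"], [])
    article_data = inference_utils.read_article(verbose=False)
    (_, vocab) = article_data
    (task, task_state, _) = inference_utils.create_model_and_task(
        vocab, load_dir="/path/to/checkpoint")  # hypothetical checkpoint dir
    outs = inference_utils.run_model(task, task_state, article_data)
    token_losses = inference_utils.get_token_losses(outs)
    print(token_losses.shape)  # (1, num_segments * sequence_length)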
aglib/meliad/transformer/launcher.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Setup the data pipeline and launch the main training loop."""
16
+
17
+ from absl import flags
18
+ from absl import logging
19
+
20
+ import gin
21
+ import jax
22
+
23
+ import training_loop
24
+ from transformer import decoder_stack
25
+ from transformer import models
26
+ from transformer import tasks # pylint: disable=unused-import
27
+ from transformer import text_dataset
28
+
29
+
30
+ flags.DEFINE_string("workdir", "", "Directory to save model checkpoints.")
31
+ flags.DEFINE_string("load_dir", "", "Directory to load pre-trained model.")
32
+ flags.DEFINE_integer("num_steps", 110, "Number of steps.")
33
+
34
+ flags.DEFINE_list(
35
+ "gin_search_paths",
36
+ ["transformer/configs"],
37
+ "List of paths where the Gin config files are located.")
38
+ flags.DEFINE_multi_string(
39
+ "gin_file", ["base_htrans.gin"], "List of Gin config files.")
40
+ flags.DEFINE_multi_string(
41
+ "gin_param", None, "Newline separated list of Gin parameter bindings.")
42
+
43
+ FLAGS = flags.FLAGS
44
+
45
+
46
+ def parse_gin_configuration():
47
+ """Load and parse Gin configuration from command-line flags."""
48
+ for gin_file_path in FLAGS.gin_search_paths:
49
+ logging.info("Added Gin search path %s", gin_file_path)
50
+ gin.add_config_file_search_path(gin_file_path)
51
+ for gin_file in FLAGS.gin_file:
52
+ logging.info("Loading Gin config file %s", gin_file)
53
+ if FLAGS.gin_param:
54
+ for gin_param in FLAGS.gin_param:
55
+ logging.info("Overriding Gin param %s", gin_param)
56
+ gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
57
+
58
+
59
+ def run_training_loop(testing: bool = False):
60
+ """Setup data pipeline and launch the main training loop."""
61
+
62
+ logging.info("JAX process: %d / %d", jax.process_index(), jax.process_count())
63
+ logging.info("JAX local devices: %r", jax.local_devices())
64
+
65
+ text_dataset.set_default_data_directory()
66
+ task_config = decoder_stack.TransformerTaskConfig()
67
+ batch_size = task_config.batch_size * jax.local_device_count()
68
+
69
+ (train_ds, vocab) = text_dataset.load_text_dataset(
70
+ name=task_config.dataset_name,
71
+ split=task_config.train_split, # train
72
+ sequence_length=task_config.sequence_length,
73
+ batch_size=batch_size,
74
+ sequential=task_config.sequential_chunks,
75
+ shard_dataset=True)
76
+
77
+ (test_ds, test_vocab) = text_dataset.load_text_dataset(
78
+ name=task_config.dataset_name,
79
+ split=task_config.test_split, # test
80
+ sequence_length=task_config.sequence_length,
81
+ batch_size=batch_size,
82
+ sequential=task_config.sequential_chunks,
83
+ shard_dataset=False)
84
+
85
+ logging.info("Configured vocab_size = %d", task_config.vocab_size)
86
+ logging.info("Task vocabulary size = %d", vocab.vocab_size)
87
+ assert vocab.vocab_size == test_vocab.vocab_size # Sanity check.
88
+ if task_config.vocab_size < vocab.vocab_size:
89
+ raise ValueError(
90
+ "Task vocabulary size does not match configured vocab_size: " +
91
+ f"{task_config.vocab_size} < {vocab.vocab_size}")
92
+
93
+ # Pretty printing depends on the vocabulary object.
94
+ def pretty_print_article_fn(article) -> str:
95
+ return text_dataset.pretty_print_article(article, {"targets": vocab}, 32768)
96
+
97
+ train_ds_iter_fn = text_dataset.get_iterator_function(train_ds)
98
+ test_ds_iter_fn = text_dataset.get_iterator_function(test_ds)
99
+
100
+ if testing:
101
+ # Build trainer, which is configurable by Gin, and run training loop.
102
+ trainer = training_loop.Trainer(
103
+ get_training_dataset_iterator=train_ds_iter_fn,
104
+ get_test_dataset_iterator=test_ds_iter_fn,
105
+ pretty_print_input_function=pretty_print_article_fn,
106
+ process_summaries_function=models.process_summaries_function(vocab),
107
+ num_steps=FLAGS.num_steps, # Ignore Gin config for these options.
108
+ load_dir=FLAGS.load_dir,
109
+ workdir=FLAGS.workdir)
110
+ else:
111
+ trainer = training_loop.Trainer(
112
+ get_training_dataset_iterator=train_ds_iter_fn,
113
+ get_test_dataset_iterator=test_ds_iter_fn,
114
+ pretty_print_input_function=pretty_print_article_fn,
115
+ process_summaries_function=models.process_summaries_function(vocab),
116
+ load_dir=FLAGS.load_dir,
117
+ workdir=FLAGS.workdir)
118
+
119
+ trainer.train()
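
The entry-point module that turns these functions into a binary is not part of this upload, so the wrapper below is a hypothetical sketch, assuming the usual absl conventions:

from absl import app

def main(argv):
  del argv  # Flags (including --gin_file / --gin_param) are parsed by absl.
  parse_gin_configuration()
  run_training_loop()

if __name__ == "__main__":
  app.run(main)

app.run parses the flags defined above before main executes, so FLAGS.gin_file and FLAGS.gin_param are already populated when parse_gin_configuration runs.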
aglib/meliad/transformer/memory_factory.py ADDED
@@ -0,0 +1,120 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Flax modules and functions for using external memory."""
16
+
17
+ from typing import Any, Optional, Tuple
18
+
19
+ from absl import logging
20
+ from flax import linen
21
+ import gin
22
+ import jax
23
+ from transformer import memory_layer
24
+
25
+
26
+
27
+ PRNGKey = Any
28
+ Shape = Tuple[int]
29
+ Dtype = Any
30
+ Array = Any
31
+ MemoryResource = Any
32
+
33
+
34
+ class MemoryManager:
35
+ """Manages any external resources that may be required by external memory.
36
+
37
+ MemoryManager also functions as a factory, to create Flax modules that will
38
+ read and write to whatever external memory has been configured.
39
+ """
40
+
41
+ def __init__(self,
42
+ batch_size: int,
43
+ mode: str,
44
+ num_heads: int,
45
+ key_size: int,
46
+ value_size: int,
47
+ database_size: Optional[int] = None,
48
+ dtype: Dtype = "float32",
49
+ off_device_memory: Optional[MemoryResource] = None):
50
+ """Create a MemoryManager object.
51
+
52
+ A MemoryManager configures external memory, and is used as a factory to
53
+ construct flax modules that read or write to the memory.
54
+
55
+ Args:
56
+ batch_size: The number of separate documents in a batch.
57
+ mode: The mode, e.g. "train" or "test".
58
+ num_heads: The number of transformer heads.
59
+ key_size: The length of the key vectors.
60
+ value_size: The length of the value vectors.
61
+ database_size: The total number of tokens in the database.
62
+ dtype: The datatype used for keys and values.
63
+ off_device_memory: An object which manages underlying SCAM memory.
64
+ If None, then the model will use on-device memory.
65
+ """
66
+ self.batch_size = batch_size
67
+ self.mode = mode
68
+ self.num_heads = num_heads
69
+ self.key_size = key_size
70
+ self.value_size = value_size
71
+ self.database_size = database_size
72
+ self.dtype = dtype
73
+ self.off_device_memory = off_device_memory
74
+
75
+ def create_memory_layer(self) -> linen.Module:
76
+ """Create a flax Module that implements external memory."""
77
+
78
+ num_datasets = (
79
+ self.batch_size * self.num_heads #
80
+ if self.off_device_memory is None #
81
+ else self.num_heads)
82
+ if self.off_device_memory is not None:
83
+ mem_layer = None
84
+ if mem_layer is None:
85
+ raise ValueError("Off-device memory is not supported at this time.")
86
+ return memory_layer.BatchedMemory(
87
+ mem_layer,
88
+ split_dimensions=(-2,),
89
+ )
90
+ else:
91
+ assert self.database_size is not None
92
+ mem_layer = memory_layer.MemoryOnTpu(num_datasets=num_datasets,
93
+ key_features=self.key_size,
94
+ value_features=self.value_size,
95
+ database_size=self.database_size,
96
+ dtype=self.dtype)
97
+ # Handle queries of shape [batch_size, seq_len, num_heads, kv_features]
98
+ return memory_layer.BatchedMemory(mem_layer,
99
+ split_dimensions=(0, -2))
100
+
101
+
102
+ @gin.configurable
103
+ def memory_on_tpu_factory(batch_size: int,
104
+ mode: str,
105
+ num_heads: int = gin.REQUIRED,
106
+ key_size: int = gin.REQUIRED,
107
+ value_size: int = gin.REQUIRED,
108
+ database_size: int = gin.REQUIRED,
109
+ dtype: Dtype = gin.REQUIRED) -> MemoryManager:
110
+ """Implement SCAM memory on device."""
111
+ return MemoryManager(batch_size=batch_size,
112
+ mode=mode,
113
+ num_heads=num_heads,
114
+ key_size=key_size,
115
+ value_size=value_size,
116
+ database_size=database_size,
117
+ dtype=dtype,
118
+ off_device_memory=None)
119
+
120
+
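
A minimal usage sketch for this factory; the sizes below are illustrative only and are not taken from any config in this upload:

manager = MemoryManager(
    batch_size=2,
    mode="train",
    num_heads=8,
    key_size=64,
    value_size=64,
    database_size=8192,
    off_device_memory=None)
mem = manager.create_memory_layer()  # BatchedMemory wrapping MemoryOnTpu

With off_device_memory=None, create_memory_layer builds num_datasets = batch_size * num_heads independent databases and splits incoming tensors by both the batch and head dimensions (split_dimensions=(0, -2)).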
aglib/meliad/transformer/memory_layer.py ADDED
@@ -0,0 +1,431 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """FLAX layers for on-TPU memory."""
16
+
17
+ import abc
18
+ import functools
19
+ from typing import Callable, Sequence, Tuple, TypeVar, Union
20
+
21
+ from absl import logging
22
+ from flax import linen
23
+ import gin
24
+ import jax
25
+ from jax import lax
26
+ import jax.numpy as jnp
27
+ import numpy as np # use with care!
28
+
29
+ Shape = Sequence[int]
30
+ Dtype = jnp.dtype
31
+ Array = jnp.ndarray
32
+
33
+ Axes = Union[int, Tuple[int, ...]]
34
+ F = TypeVar('F', bound=Callable)
35
+
36
+
37
+ class MemoryLayer(linen.Module, metaclass=abc.ABCMeta):
38
+ """Internal interface for memory layers without batch dim.
39
+
40
+ See BatchedMemory for a layer that can be used in Flax models.
41
+ """
42
+ num_datasets: int
43
+
44
+ @abc.abstractmethod
45
+ def update(self, key: Array, value: Array) -> int:
46
+ """Adds key/value pairs to memory.
47
+
48
+ Args:
49
+ key: of shape (num_kv, num_datasets, k_features)
50
+ value: of shape (num_kv, num_datasets, v_features)
51
+
52
+ Returns:
53
+ Dummy value so that TPU operations can wait for the update to finish if
54
+ desired.
55
+ """
56
+ raise NotImplementedError()
57
+
58
+ @abc.abstractmethod
59
+ def topk_retrieval(self, query: Array,
60
+ num_neighbors: int) -> Tuple[Array, Array]:
61
+ """Retrieves the nearest neighbors for each query.
62
+
63
+ Args:
64
+ query: of shape (num_queries, num_datasets, k_features)
65
+ num_neighbors: int indicating the number of neighbors to retrieve
66
+
67
+ Returns:
68
+ Tuple of selected keys and selected values of shapes
69
+ (num_queries, num_datasets, num_neighbors, k_features), and
70
+ (num_queries, num_datasets, num_neighbors, v_features)
71
+ """
72
+ raise NotImplementedError()
73
+
74
+ @abc.abstractmethod
75
+ def reset(self, datasets: Array) -> int:
76
+ """Reset some or all of the datasets in the memory.
77
+
78
+ Args:
79
+ datasets: A vector of shape (num_datasets) of type bool. Each position
80
+ indicates whether the dataset with the same index should be reset.
81
+
82
+ Returns:
83
+ Dummy value so that TPU operations can wait for the update to finish if
84
+ desired.
85
+ """
86
+ raise NotImplementedError()
87
+
88
+ def __call__(self, query, num_neighbors):
89
+ return self.topk_retrieval(query, num_neighbors)
90
+
91
+
92
+ def _target_dimensions(shape: Shape,
93
+ source_dimensions: Sequence[int]) -> Sequence[int]:
94
+ target_dimensions = range(-2, -2 - len(source_dimensions), -1)
95
+ assert len(source_dimensions) == len(target_dimensions)
96
+ return sorted(d % len(shape) for d in target_dimensions)
97
+
98
+
99
+ def _rearrange_dimensions_shapes(
100
+ shape: Shape, split_dimensions: Sequence[int]) -> Tuple[Shape, Shape]:
101
+ split_shape = tuple(shape[d] for d in split_dimensions)
102
+ remaining_shape = tuple(
103
+ shape[d] for d in range(len(shape)) if d not in split_dimensions)
104
+ batch_shape = remaining_shape[:-1]
105
+ return split_shape, batch_shape
106
+
107
+
108
+ def _rearrange_dimensions(x: Array, split_dimensions: Sequence[int]) -> Array:
109
+ """Rearrange array so that we can split by a single dimension.
110
+
111
+ Turns an array of shape [d1, ..., dn, features] and a list of dimensions to
112
+ split by into [prod(remaining_dimensions), prod(split_dimensions),
113
+ features]
114
+
115
+ Args:
116
+ x: array of shape [d1, ..., dn, features]
117
+ split_dimensions: list of dimensions that should end up in dimension -2.
118
+
119
+ Returns:
120
+ Rearranged array as described above.
121
+ """
122
+ split_dimensions = [d % len(x.shape) for d in split_dimensions]
123
+ split_dimensions = sorted(split_dimensions)
124
+ split_shape, batch_shape = _rearrange_dimensions_shapes(
125
+ x.shape, split_dimensions)
126
+
127
+ target_dimensions = _target_dimensions(x.shape, split_dimensions)
128
+ x = jnp.moveaxis(x, split_dimensions, target_dimensions)
129
+ assert len(x.shape) > len(split_dimensions)
130
+ assert all(isinstance(d, int) and d >= 0 for d in batch_shape)
131
+ assert all(isinstance(d, int) and d >= 0 for d in split_shape)
132
+ new_shape = [
133
+ # The use of numpy is okay here, since shapes are concrete at jit time.
134
+ np.prod(batch_shape),
135
+ np.prod(split_shape),
136
+ x.shape[-1] # features dimension
137
+ ]
138
+ res = x.reshape(new_shape)
139
+ assert res.ndim == 3
140
+ return res
141
+
142
+
143
+ def _restore_dimensions(x: Array, original_shape: Shape,
144
+ split_dimensions: Sequence[int]) -> Array:
145
+ """Restores arrays encoded with _rearrange_dimensions.
146
+
147
+ Args:
148
+ x: Array of shape [prod(batch_shape), prod(split_shape), feature...]
149
+ original_shape: Shape of the array to restore to.
150
+ split_dimensions: Dimensions that were multiplied into dimension 2.
151
+
152
+ Returns:
153
+ Array of the original shape and axis order for all dimensions in batch_shape
154
+ and split_shape. Feature dimensions may have changed (can include additional
155
+ dimensions for neighbors, for example).
156
+ """
157
+ split_dimensions = [d % len(original_shape) for d in split_dimensions]
158
+ split_dimensions = sorted(split_dimensions)
159
+ split_shape, batch_shape = _rearrange_dimensions_shapes(
160
+ original_shape, split_dimensions)
161
+
162
+ features_shape = x.shape[2:]
163
+ x = x.reshape((*batch_shape, *split_shape, *features_shape))
164
+
165
+ # rearrange
166
+ target_dimensions = _target_dimensions(original_shape, split_dimensions)
167
+ x = jnp.moveaxis(x, target_dimensions, split_dimensions)
168
+ return x
169
+
170
+
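
A quick round trip through the two helpers above, on a toy tensor (shapes chosen for illustration; assumes the definitions in this file are in scope):

import jax.numpy as jnp

x = jnp.arange(2 * 3 * 4 * 5, dtype=jnp.float32).reshape(2, 3, 4, 5)
flat = _rearrange_dimensions(x, split_dimensions=(0, -2))
assert flat.shape == (3, 8, 5)  # (prod(batch_shape), prod(split_shape), features)
back = _restore_dimensions(flat, x.shape, split_dimensions=(0, -2))
assert back.shape == x.shape and bool(jnp.all(back == x))

Dimensions 0 and -2 (sizes 2 and 4) are multiplied into the middle "datasets" axis; the remaining dimensions supply the flattened batch axis and the feature axis.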
171
+ @gin.configurable
172
+ class BatchedMemory(linen.Module):
173
+ """Equips a memory module with a batch dimension."""
174
+
175
+ # We wrap this linen.Module:
176
+ wrapped: MemoryLayer
177
+
178
+ # `split_dimensions` indicates the dimensions of the query and update tensors
179
+ # that will go to separate databases. By default, we use a separate database
180
+ # for each head.
181
+ # Note that some implementations of the memory share memory across all hosts
182
+ # and devices (memory_on_borg, unless configured otherwise) or just across
183
+ # devices of each host (memory_on_host).
184
+ # Default is (-2,) to split by head only; use (0, -2) to also split by the
185
+ # batch dimension.
186
+ split_dimensions: Tuple[int, ...] = (-2,)
187
+
188
+ query_stride: int = 1
189
+ update_stride: int = 1
190
+
191
+ def update(self, key: Array, value: Array):
192
+ """Adds key/value pairs to memory.
193
+
194
+ Args:
195
+ key: typically of shape (batch, kv_len, num_heads, k_features). This
196
+ tensor is split up into datasets according to `split_dimensions`.
197
+ value: typically of shape (batch, kv_len, num_heads, v_features). This
198
+ tensor is split up into datasets according to `split_dimensions`.
199
+
200
+ Returns:
201
+ A dummy value 0, once the operation has completed.
202
+ """
203
+ if key.ndim != 4 or value.ndim != 4:
204
+ raise ValueError('Expected batched inputs; got shapes: %s and %s.' %
205
+ (key.shape, value.shape))
206
+ key = _rearrange_dimensions(key, self.split_dimensions)
207
+ value = _rearrange_dimensions(value, self.split_dimensions)
208
+ update_stride = self.update_stride
209
+ if update_stride == 1:
210
+ return self.wrapped.update(key, value)
211
+ return self.wrapped.update(key[update_stride - 1::update_stride, ...],
212
+ value[update_stride - 1::update_stride, ...])
213
+
214
+ def topk_retrieval(self, query: Array, num_neighbors: int):
215
+ """Retrieves the nearest neighbors for each query.
216
+
217
+ Args:
218
+ query: typically of shape (batch, q_len, num_heads, k_features). This
219
+ tensor is split up into datasets according to `split_dimensions`.
220
+ num_neighbors: number of neighbors to retrieve
221
+
222
+ Returns:
223
+ Tuple of tensors with the retrieved keys and values of the same shape as
224
+ query, but with an extra dimension of length num_neighbors - typically:
225
+ (batch, q_len, num_heads, num_neighbors, k_features)
226
+ """
227
+ if query.ndim != 4:
228
+ raise ValueError('Expected batched inputs; got shape: %s.' % query.shape)
229
+ query_stride = self.query_stride
230
+ original_shape = query.shape
231
+ query = _rearrange_dimensions(query, self.split_dimensions)
232
+ if query_stride == 1:
233
+ key, value = self.wrapped.topk_retrieval(query, num_neighbors)
234
+ else:
235
+ num_queries, num_heads, k_features = query.shape
236
+ throttled_query = query[0::query_stride, ...]
237
+ key = jnp.zeros(
238
+ shape=(num_queries, num_heads, num_neighbors, k_features),
239
+ dtype=query.dtype)
240
+ throttled_key, throttled_value = (
241
+ self.wrapped.topk_retrieval(throttled_query, num_neighbors))
242
+ _, _, _, v_features = throttled_value.shape
243
+ value = jnp.zeros(
244
+ shape=(num_queries, num_heads, num_neighbors, v_features),
245
+ dtype=query.dtype)
246
+ key = key.at[0::query_stride, ...].set(throttled_key)
247
+ value = value.at[0::query_stride, ...].set(throttled_value)
248
+ key = _restore_dimensions(key, original_shape, self.split_dimensions)
249
+ # Note that `original_shape` here may have the wrong feature dimension (if
250
+ # k_features != v_features). But `_restore_dimensions` does not depend on
251
+ # that dimension and the tests cover this case.
252
+ value = _restore_dimensions(value, original_shape, self.split_dimensions)
253
+ assert key.ndim == len(original_shape) + 1
254
+ return key, value
255
+
256
+ def reset(self, datasets: Array) -> int:
257
+ """Resets the memory.
258
+
259
+ Args:
260
+ datasets: of shape (num_datasets,), typically the same as (num_heads,).
261
+
262
+ Returns:
263
+ A dummy value 0, once the operation has completed.
264
+ """
265
+ return self.wrapped.reset(datasets)
266
+
267
+
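
A sketch of driving BatchedMemory through the standard Flax init/apply calls. The sizes are illustrative, MemoryOnTpu is defined later in this file, and the "database" collection must be threaded through apply as a mutable collection:

import jax
import jax.numpy as jnp

mem = BatchedMemory(
    MemoryOnTpu(num_datasets=2, key_features=8, value_features=8,
                database_size=128))
k = jnp.ones((1, 16, 2, 8))  # (batch, kv_len, num_heads, k_features)
v = jnp.ones((1, 16, 2, 8))
variables = mem.init(jax.random.PRNGKey(0), k, v, method=mem.update)
# Write the key/value pairs, keeping the updated database variables.
_, variables = mem.apply(variables, k, v, method=mem.update,
                         mutable=["database"])
# Query: each retrieved tensor gains a num_neighbors axis.
(sel_k, sel_v), _ = mem.apply(variables, k, num_neighbors=4,
                              method=mem.topk_retrieval, mutable=["database"])
assert sel_k.shape == (1, 16, 2, 4, 8)

With the default split_dimensions=(-2,), num_datasets must equal num_heads (here 2).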
268
+ @functools.partial(jax.jit, static_argnames=('num_buckets', 'bucket_size'))
269
+ def _chunking_sparsify(query: Array, key: Array, num_buckets: int,
270
+ bucket_size: int) -> Tuple[Array, Array, Array]:
271
+ """Approximate top k operation for a single head."""
272
+ # q = q_length, f = qk features, d = database_size
273
+ scores = jnp.einsum('qf,df->qd', query, key)
274
+ mask = (key.sum(-1) == 0).astype(jnp.bfloat16) * -1e6
275
+ scores += mask
276
+
277
+ num_queries, _ = scores.shape
278
+ reshaped_scores = jnp.reshape(scores, (num_queries, bucket_size, num_buckets))
279
+
280
+ sparse_scores = linen.softmax(reshaped_scores * 1e6, axis=1)
281
+
282
+ # topk_scores and topk_indices will only be computed if we depend on their
283
+ # results.
284
+ topk_scores = jnp.max(reshaped_scores, axis=1)
285
+ local_indices = jnp.argmax(reshaped_scores, axis=1)
286
+ topk_indices = (
287
+ local_indices * num_buckets + jnp.arange(num_buckets).reshape(
288
+ (1, num_buckets)))
289
+ return sparse_scores, topk_scores, topk_indices
290
+
291
+
292
+ def _retrieve_topk_gatherless(
293
+ query: Array, key: Array, value: Array,
294
+ num_neighbors: int) -> Tuple[Array, Array, Array, Array]:
295
+ """Retrieves for a single head - used to simplify array accesses."""
296
+ num_kv, query_features = query.shape
297
+ database_size, key_features = key.shape
298
+ _, value_features = value.shape
299
+ assert query_features == key_features
300
+ num_buckets = num_neighbors
301
+ if num_buckets > database_size:
302
+ raise ValueError('More buckets than items in database. %s > %s' %
303
+ (num_buckets, database_size))
304
+ if database_size % num_buckets:
305
+ raise ValueError('Buckets must divide database: %s %% %s.' %
306
+ (database_size, num_buckets))
307
+ bucket_size = database_size // num_buckets
308
+
309
+ sparse_scores, topk_scores, topk_indices = _chunking_sparsify(
310
+ query, key, num_buckets, bucket_size)
311
+ key = key.reshape(bucket_size, num_buckets, key_features)
312
+ value = value.reshape(bucket_size, num_buckets, value_features)
313
+ selected_keys = jnp.einsum('qbn,bnd->qnd', sparse_scores, key)
314
+ selected_values = jnp.einsum('qbn,bnd->qnd', sparse_scores, value)
315
+
316
+ assert selected_keys.shape == (num_kv, num_neighbors, key_features)
317
+ assert selected_values.shape == (num_kv, num_neighbors, value_features)
318
+ return selected_keys, selected_values, topk_scores, topk_indices
319
+
320
+
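
A toy run of the bucketed retrieval above: 4 orthogonal keys split into 2 buckets of 2, with arbitrary values (sizes are illustrative):

import jax.numpy as jnp

q = jnp.eye(4, dtype=jnp.float32)   # 4 queries that match the 4 keys exactly
db_k = jnp.eye(4, dtype=jnp.float32)
db_v = jnp.arange(8, dtype=jnp.float32).reshape(4, 2)
sel_k, sel_v, scores, idx = _retrieve_topk_gatherless(
    q, db_k, db_v, num_neighbors=2)
assert sel_k.shape == (4, 2, 4)  # (num_queries, num_neighbors, key_features)
assert sel_v.shape == (4, 2, 2)

Rather than gathering rows by index, each "neighbor" is the softmax-weighted sum of one bucket; the sharp softmax (scores * 1e6) makes this close to a hard per-bucket argmax while staying TPU-friendly.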
321
+ class MemoryOnTpu(MemoryLayer):
322
+ """Approximate top K search on TPU."""
323
+ # database_size must be integer multiple of prod(batch_dims) * num_neighbors.
324
+ database_size: int
325
+ dtype: Dtype = jnp.float32 # pylint: disable=g-bare-generic
326
+ key_features: int = 64
327
+ value_features: int = 64
328
+ report_scores_and_indices: bool = False
329
+
330
+ def setup(self):
331
+ self.db_index = self.variable('database', 'database_index',
332
+ functools.partial(jnp.zeros, dtype=jnp.int32),
333
+ (self.num_datasets,))
334
+ self.key_db = self.variable(
335
+ 'database', 'key_db', functools.partial(jnp.zeros, dtype=self.dtype),
336
+ (self.num_datasets, self.database_size, self.key_features))
337
+ self.value_db = self.variable(
338
+ 'database', 'value_db', functools.partial(jnp.zeros, dtype=self.dtype),
339
+ (self.num_datasets, self.database_size, self.value_features))
340
+
341
+ self.retrieved_indices = self.variable(
342
+ 'database', 'retrieved_indices',
343
+ functools.partial(jnp.zeros, dtype=jnp.int32), (0, 0, 0))
344
+ self.retrieved_indices_scores = self.variable(
345
+ 'database', 'retrieved_indices_scores',
346
+ functools.partial(jnp.zeros, dtype=jnp.float32), (0, 0, 0))
347
+
348
+ def _update_kv_database(self, database, new_values, start_index):
349
+ num_datasets, database_size, _ = database.shape
350
+ assert database_size == self.database_size, f'{database_size} vs {self.database_size}'
351
+ assert num_datasets == self.num_datasets
352
+ assert new_values.ndim == 3
353
+ assert start_index.shape == (self.num_datasets,)
354
+
355
+ def _update(database, new_values, start_index):
356
+ return lax.dynamic_update_slice(
357
+ database, new_values, start_indices=(start_index, 0))
358
+
359
+ return jax.vmap(
360
+ _update, in_axes=(0, 0, 0), out_axes=0)(database, new_values,
361
+ start_index)
362
+
363
+ def update(self, key: Array, value: Array) -> int:
364
+ """Add keys and values to the memory; overwrite oldest if memory is full."""
365
+ key = lax.stop_gradient(key)
366
+ value = lax.stop_gradient(value)
367
+ assert len(key.shape) == len(value.shape)
368
+ assert key.shape[:-1] == value.shape[:-1]
369
+ num_kv, num_datasets, key_features = key.shape
370
+ assert num_datasets == self.num_datasets
371
+ assert key_features == self.key_features
372
+ assert value.shape[-1] == self.value_features
373
+ assert self.database_size % num_kv == 0, (
374
+ 'Database size must be integer multiple of num_kv.')
375
+ key = jnp.moveaxis(key, source=1, destination=0) # split by dataset
376
+ value = jnp.moveaxis(value, source=1, destination=0) # split by dataset
377
+
378
+ # start_index can be larger than DB - we use that to detect which entries
379
+ # are not written to yet
380
+ start_index = self.db_index.value % self.database_size
381
+ self.key_db.value = self._update_kv_database(self.key_db.value, key,
382
+ start_index)
383
+ self.value_db.value = self._update_kv_database(self.value_db.value, value,
384
+ start_index)
385
+ self.db_index.value = self.db_index.value + num_kv
386
+ return 0
387
+
388
+ def topk_retrieval(self, query: Array,
389
+ num_neighbors: int) -> Tuple[Array, Array]:
390
+ """Nearest neighbors by full multiplication and approximate top k on TPU."""
391
+ query = lax.stop_gradient(query)
392
+ unused_num_kv, num_datasets, query_features = query.shape
393
+ assert num_datasets == self.num_datasets
394
+ assert query_features == self.key_features
395
+ query = jnp.moveaxis(query, source=1, destination=0)
396
+
397
+ # Process different heads sequentially
398
+ selected_keys, selected_values, topk_scores, topk_indices = lax.map(
399
+ lambda x: _retrieve_topk_gatherless(*x, num_neighbors),
400
+ (query, self.key_db.value, self.value_db.value))
401
+
402
+ if self.report_scores_and_indices:
403
+ # TODO(mrabe): These variable updates may not work perfectly yet. Find out
404
+ # why Flax does not like them.
405
+ self.retrieved_indices.value = topk_indices
406
+ self.retrieved_indices_scores.value = topk_scores
407
+
408
+ assert selected_keys.ndim == selected_values.ndim == 4
409
+ selected_keys = jnp.moveaxis(selected_keys, source=0, destination=1)
410
+ selected_values = jnp.moveaxis(selected_values, source=0, destination=1)
411
+ return selected_keys, selected_values
412
+
413
+ def reset(self, datasets: Array) -> int:
414
+ """Resets specified datasets."""
415
+ datasets = lax.stop_gradient(datasets)
416
+ assert datasets.shape == (self.num_datasets,)
417
+ assert datasets.dtype == jnp.bool_
418
+
419
+ def _reset_single_dataset(input_tuple):
420
+ """Resets a single head; reset is a single bool."""
421
+ database, reset = input_tuple
422
+ assert reset.shape == tuple(), reset.shape
423
+ assert reset.dtype == jnp.bool_
424
+ return database * (1 - reset)
425
+
426
+ self.db_index.value = self.db_index.value * (1 - datasets)
427
+ self.key_db.value = lax.map(
428
+ _reset_single_dataset, xs=(self.key_db.value, datasets))
429
+ self.value_db.value = lax.map(
430
+ _reset_single_dataset, xs=(self.value_db.value, datasets))
431
+ return 0
aglib/meliad/transformer/metric_utils.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Helper routines for recording various training metrics."""
16
+
17
+ from typing import Any
18
+ import jax.numpy as jnp
19
+
20
+
21
+ Array = Any
22
+
23
+
24
+ def compute_accuracy_sum(logits, targets, valid_loss_mask=None):
25
+ """Compute accuracy for logits and targets.
26
+
27
+ Args:
28
+ logits: [batch, length, num_classes] float array.
29
+ targets: categorical targets [batch, length] int array.
30
+ valid_loss_mask: None or array of shape bool[batch, length]
31
+
32
+ Returns:
33
+ The number of correct tokens in the output.
34
+ """
35
+ if logits.shape[:-1] != targets.shape:
36
+ raise ValueError("Incorrect shapes. Got shape %s logits and %s targets" %
37
+ (logits.shape, targets.shape))
38
+ if valid_loss_mask is not None and valid_loss_mask.shape != targets.shape:
39
+ raise ValueError("Incorrect shapes. Got shape %s targets and %s mask" %
40
+ (targets.shape, valid_loss_mask.shape))
41
+
42
+ accuracy = jnp.equal(jnp.argmax(logits, axis=-1), targets)
43
+ if valid_loss_mask is not None:
44
+ accuracy = jnp.logical_and(accuracy, valid_loss_mask)
45
+ return jnp.sum(accuracy) # Sum of the number of True values.
46
+
47
+
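
For example, with three positions where two predictions are correct and the third is masked out:

import jax.numpy as jnp

logits = jnp.array([[[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]])  # [1, 3, 2]
targets = jnp.array([[1, 0, 0]])                            # [1, 3]
mask = jnp.array([[True, True, False]])
assert int(compute_accuracy_sum(logits, targets, mask)) == 2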
48
+ def reshape_image(image):
49
+ """Reshape image to something that tensorboard recognizes.
50
+
51
+ Args:
52
+ image: Array of shape [xsize, ysize] or [num_images, xsize, ysize]
53
+
54
+ Returns:
55
+ Array of shape [num_images, xsize, ysize, 1]
56
+ """
57
+
58
+ # Reshape to [num_images, xdim, ydim, rgb] for tensorboard.
59
+ sh = image.shape
60
+ if image.ndim == 2:
61
+ return jnp.reshape(image, [1, sh[0], sh[1], 1]).astype(jnp.float32)
62
+ elif image.ndim == 3:
63
+ return jnp.reshape(image, [sh[0], sh[1], sh[2], 1]).astype(jnp.float32)
64
+ else:
65
+ return None # Not an image.
66
+
67
+
68
+ def normalize_image(images: Array, as_group: bool = False) -> Array:
69
+ """Rescale the values in images to between 0.0 and 1.0.
70
+
71
+ Args:
72
+ images: Array of size [batch_size, xsize, ysize]
73
+ as_group: Scale all images in the batch by the same amount if True.
74
+
75
+ Returns:
76
+ A rescaled image of the same shape.
77
+ """
78
+
79
+ images = images.astype(jnp.float32) # Return images as float32.
80
+ if as_group:
81
+ # Normalize the batch of images as a group.
82
+ min_img = jnp.min(images)
83
+ max_img = jnp.max(images)
84
+ else:
85
+ # Normalize each image in the batch individually.
86
+ min_img = jnp.min(images, axis=(-2, -1), keepdims=True)
87
+ max_img = jnp.max(images, axis=(-2, -1), keepdims=True)
88
+ norm_image = (images - min_img) / (max_img - min_img + 1e-6)
89
+ return jnp.where(jnp.isfinite(norm_image), norm_image, 0.0)
90
+
91
+
92
+ def overlay_images(image1: Array, image2: Array) -> Array:
93
+ """Place image1 on top of image2, broadcasting image2 if necessary.
94
+
95
+ Args:
96
+ image1: array of shape [num_images, xsize, ysize]
97
+ image2: array of shape [num_images, xsize, ysize]
98
+
99
+ Returns:
100
+ A combined image.
101
+ """
102
+
103
+ assert image1.ndim == 3 # (num_images, xsize, ysize)
104
+ assert image2.ndim == 3
105
+ image2 = jnp.broadcast_to(image2, image1.shape)
106
+ return jnp.concatenate([image1, image2], axis=1)
107
+
108
+
109
+ def make_histograms(viz_dicts):
110
+ """Generate image histograms."""
111
+ hist_dict = {}
112
+ for (i, viz_dict) in enumerate(viz_dicts):
113
+ for (k, images) in viz_dict.items():
114
+ hist_dict["h_" + k + "_" + str(i)] = images
115
+ return hist_dict
aglib/meliad/transformer/models.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Sequence to sequence model."""
16
+
17
+ from typing import Any, Callable, Dict, Tuple
18
+
19
+ from absl import logging
20
+ from flax import linen as nn
21
+ from flax.training import common_utils
22
+ import gin
23
+ import jax
24
+ import jax.numpy as jnp
25
+ import metrics_summary
26
+ from transformer import decoder_stack
27
+ from transformer import metric_utils
28
+ from transformer import text_dataset
29
+ import numpy as np
30
+ import seqio
31
+
32
+
33
+ Array = jnp.ndarray
34
+ MetricsSummary = metrics_summary.MetricsSummary
35
+
36
+
37
+ # TODO(mrabe): Remove this function and find a better way to turn text metrics
38
+ # into text on tensorboard.
39
+ def process_summaries(vocab: seqio.Vocabulary,
40
+ met_summary: MetricsSummary,
41
+ mode: str) -> MetricsSummary:
42
+ """Compute some additional summaries, and convert tokens to text.
43
+
44
+ Args:
45
+ vocab: The vocabulary to detokenize generated text.
46
+ met_summary: The summary object to process.
47
+ mode: The mode of the summary (e.g. "test", "train")
48
+
49
+ Returns:
50
+ The modified summary dictionary.
51
+ """
52
+
53
+ mdict = met_summary.current_metric_dict()
54
+
55
+ # Calculate perplexity from the average nats_per_token over all replicas.
56
+ # This has to be done here, because the perplexities themselves can't be
57
+ # averaged in the usual way.
58
+ if "nats_per_token" in mdict:
59
+ nats_per_token = mdict["nats_per_token"].to_value()
60
+ met_summary.add({"perplexity": np.exp(nats_per_token)})
61
+
62
+ if mode == "generate" and "gen_tokens" in mdict:
63
+ # Convert output tokens to example output text.
64
+ # Write text to both the summary, and pretty-print to the log file.
65
+ gen_toks = mdict["gen_tokens"].to_value()
66
+ if np.ndim(gen_toks) != 2:
67
+ raise ValueError("Unsupported shape for gen_tokens: %s" % gen_toks.shape)
68
+
69
+ ntoks = gen_toks.shape[-1]
70
+ gen_text = text_dataset.decode_tokens(gen_toks, vocab, max_length=ntoks)
71
+ logging.info("Generated text = %s", gen_text)
72
+ met_summary.add_text({"gen_text": gen_text})
73
+ del mdict["gen_tokens"] # Otherwise it will turn into a histogram.
74
+
75
+ return met_summary
76
+
77
+
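
The exponentiation above is the standard nats-to-perplexity conversion; on a toy value (nats per token equal to ln 2):

import numpy as np

nats_per_token = float(np.log(2.0))
perplexity = np.exp(nats_per_token)         # == 2.0
bits_per_token = nats_per_token * 1.442695  # log2(e) factor; == 1.0

Averaging nats across replicas and exponentiating once is not the same as averaging per-replica perplexities, which is why the conversion happens here rather than inside the model.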
78
+ @gin.configurable
79
+ def process_summaries_function(vocab: seqio.Vocabulary) -> Callable[
80
+ [MetricsSummary, str], MetricsSummary]:
81
+ """Return a function that processes summaries with the given vocabulary."""
82
+ # For use with training_loop.process_summaries_function
83
+ def process_fn(met_summary: MetricsSummary, mode: str):
84
+ return process_summaries(vocab, met_summary, mode)
85
+ return process_fn
86
+
87
+
88
+ @gin.configurable
89
+ class DecoderOnlyLanguageModel(nn.Module):
90
+ """Decoder only language modeling."""
91
+
92
+ mode: str
93
+ task_config: decoder_stack.TransformerTaskConfig = gin.REQUIRED
94
+ decoder_factory: Callable[[], Any] = gin.REQUIRED
95
+
96
+ sample_method: str = "sample" # Can be {"sample", "greedy"}
97
+ output_token_losses: bool = False
98
+
99
+ def get_fake_input(self):
100
+ """Returns a fake input for initialization of the appropriate shape."""
101
+ b = self.task_config.batch_size
102
+ fake_input_dict = {
103
+ "targets": jnp.ones([b, self.task_config.sequence_length],
104
+ dtype=jnp.int32),
105
+ "start_of_sequence": jnp.ones([b], dtype=jnp.bool_),
106
+ "epoch": jnp.ones([b], dtype=jnp.int32),
107
+ }
108
+ if text_dataset.get_loss_mask_tokens(split=self.mode) != (None, None):
109
+ # We are not adding the loss mask to the dummy input by default as it can
110
+ # cause a slowdown during evaluation and perhaps inference.
111
+ fake_input_dict["loss_mask"] = jnp.ones(
112
+ [b, self.task_config.sequence_length], dtype=jnp.bool_)
113
+ return fake_input_dict
114
+
115
+ def metrics_summary_operations(self, aggregate_over: str) -> Dict[str, str]:
116
+ """Summary operation to use for recorded metrics."""
117
+ metric_ops = {
118
+ "loss": "mean",
119
+ "nats_per_token": "mean",
120
+ "bits_per_token": "mean",
121
+ "bits_per_char": "mean",
122
+ "accuracy": "mean",
123
+ "num_tokens": "mean",
124
+ "num_chars_per_device": "mean",
125
+ "num_chars_per_batch": "mean",
126
+ "nonzero_tokens": "mean",
127
+ "num_tokens_per_device": "mean",
128
+ "num_tokens_per_batch": "mean",
129
+ "epoch": "mean",
130
+ }
131
+ if aggregate_over == "steps":
132
+ return metric_ops
133
+ elif aggregate_over == "devices":
134
+ # Ensure that statistics that refer to the total batch size stay constant
135
+ # as TPU topologies change. For those we have to sum over devices, but
136
+ # compute the mean over steps.
137
+ metric_ops.update({
138
+ "num_tokens_per_batch": "sum",
139
+ "num_chars_per_batch": "sum",
140
+ "loss": "sum"})
141
+ return metric_ops
142
+ else:
143
+ raise ValueError("Don't know how to aggregate over: %s" % aggregate_over)
144
+
145
+ def setup(self):
146
+ self.decoder = self.decoder_factory(mode=self.mode,
147
+ task_config=self.task_config) # pytype: disable=wrong-keyword-args # trace-all-classes
148
+
149
+ def __call__(self, inputs: ...):
150
+ task_config = self.task_config
151
+
152
+ input_tokens = inputs["targets"] # [b, seq_len]
153
+ start_of_sequence = inputs["start_of_sequence"] # [b]
154
+ epochs = inputs["epoch"] # [b]
155
+ if "loss_mask" in inputs:
156
+ loss_mask = inputs["loss_mask"] # [b, seq_len]
157
+ else:
158
+ loss_mask = jnp.ones((1, 1), dtype=jnp.bool_)
159
+
160
+ input_tokens = jnp.asarray(input_tokens)
161
+ assert input_tokens.ndim == 2
162
+ assert input_tokens.shape[0] == task_config.batch_size
163
+ assert input_tokens.shape[1] == task_config.sequence_length
164
+ assert start_of_sequence.shape[0] == task_config.batch_size
165
+
166
+ # Sanity check to avoid out-of-bounds on token lookup.
167
+ input_tokens = input_tokens % task_config.vocab_size
168
+
169
+ logging.info("langmodel: Compiling model for mode %s", self.mode)
170
+ logging.info("langmodel: input_tokens = %r", input_tokens)
171
+ logging.info("langmodel: start_of_sequece = %r", start_of_sequence)
172
+ logging.info("langmodel: epochs = %r", epochs)
173
+
174
+ # The target outputs are the next character in each sequence.
175
+ # Shift tokens left and pad with a zero at the end.
176
+ # TODO(delesley): We don't predict the first token of each sequence.
177
+ target_tokens = jnp.pad(input_tokens[:, 1:], [(0, 0), (0, 1)])
178
+ logging.info("langmodel: target_tokens = %r", target_tokens)
179
+
180
+ # Invoke the decoder stack.
181
+ # The decoder will return pre-softmax logits for the predicted targets.
182
+ (logits, _, d_metrics) = self.decoder(input_tokens=input_tokens,
183
+ target_tokens=target_tokens,
184
+ start_of_sequence=start_of_sequence)
185
+
186
+ # Softmax cross-entropy loss on target tokens.
187
+ logits = nn.log_softmax(logits, axis=-1) # (b, seq_len, vocab_size)
188
+ logging.info("langmodel: logits = %r", logits)
189
+ soft_targets = common_utils.onehot(target_tokens, task_config.vocab_size)
190
+ logging.info("langmodel: soft_targets = %r", soft_targets)
191
+
192
+ losses = -jnp.sum(soft_targets * logits, axis=-1) # (b, seq_len)
193
+ logging.info("langmodel: losses = %r", losses)
194
+
195
+ # Don't predict null tokens which are past the end-of-sequence.
196
+ # Also don't predict the 0 at the end of the sequence.
197
+ # TODO(delesley): Predict the final end-of-sequence marker.
198
+ loss_mask = jnp.logical_and(
199
+ loss_mask,
200
+ input_tokens > 0)
201
+ loss_mask = jnp.logical_and(
202
+ loss_mask,
203
+ target_tokens > 0)
204
+ logging.info("langmodel: loss_mask = %r", loss_mask)
205
+
206
+ losses = jnp.where(loss_mask, losses, 0.0) # (batch_size, seq_len)
207
+ loss = jnp.sum(losses) # total loss on device
208
+
209
+ token_count = jnp.sum(loss_mask) # tokens on device
210
+ token_count_nz = token_count + 1.0e-6
211
+ loss_per_token = loss / token_count_nz
212
+ bits_per_token = loss_per_token * 1.442695 # log(e)/log(2)
213
+ accuracy = metric_utils.compute_accuracy_sum(logits, target_tokens,
214
+ loss_mask)
215
+ accuracy = accuracy / token_count_nz # Percent correct.
216
+ epoch = jnp.mean(epochs)
217
+
218
+ if self.mode == "generate" and self.decoder.supports_generate():
219
+ # Generate example text.
220
+ logging.info("lang_model: text inference.")
221
+ gen_tokens = self.generate(inputs, task_config.sequence_length)
222
+
223
+ # Return generated text, along with visualizations and histograms.
224
+ metrics = {"gen_tokens": gen_tokens, **d_metrics}
225
+ return (loss, metrics)
226
+
227
+ # Just return metrics related to the loss.
228
+ metrics = {
229
+ "loss": loss, # will be summed over devices
230
+ "nats_per_token": (loss_per_token, token_count),
231
+ "bits_per_token": (bits_per_token, token_count),
232
+ "accuracy": (accuracy, token_count),
233
+ "num_tokens_per_device": token_count,
234
+ "num_tokens_per_batch": token_count, # will be summed over devices
235
+ "epoch": epoch,
236
+ }
237
+
238
+ # Compute bits per character if we have the number of characters.
239
+ if "num_chars" in inputs:
240
+ num_chars = jnp.sum(inputs["num_chars"])
241
+ bits_per_char = loss / (num_chars + 1e-6) * 1.442695
242
+ metrics["num_chars_per_device"] = num_chars
243
+ metrics["num_chars_per_batch"] = num_chars # will be summed over devices
244
+ metrics["bits_per_char"] = (bits_per_char, num_chars)
245
+
246
+ # Provided to make sure that the data pipeline and the model agree
247
+ # on the number of tokens with a loss.
248
+ if "nonzero_tokens" in inputs:
249
+ nonzero_tokens = jnp.sum(inputs["nonzero_tokens"])
250
+ metrics["nonzero_tokens"] = nonzero_tokens
251
+
252
+ if self.output_token_losses:
253
+ metrics["token_losses"] = losses
254
+
255
+ return (loss, metrics)
256
+
257
+ def generate(self, inputs: ..., sequence_length: int) -> Array:
258
+ """Generate an output sequence.
259
+
260
+ Args:
261
+ inputs: the same as the argument to __call__.
262
+ sequence_length: the length of sequence to generate.
263
+
264
+ Returns:
265
+ An array of generated tokens of shape (batch_size, sequence_length).
266
+ """
267
+ # TODO(delesley): Add support for passing the prefix as an argument.
268
+ # TODO(delesley): Add support for temperature, gumbel softmax, beam search.
269
+
270
+ batch_size = self.task_config.batch_size
271
+ input_tokens = inputs["targets"] # [b,seq_len]
272
+ start_of_sequence = inputs["start_of_sequence"] # [b]
273
+
274
+ # Initialize decoder.
275
+ dstate = self.decoder.init_decoder_state(sequence_length,
276
+ start_of_sequence)
277
+
278
+ # TODO(delesley): Handle start-of-sequence in a better way.
279
+ # There is no special token for start of sequence, so we grab the first
280
+ # one from the ground-truth input data.
281
+ first_token = input_tokens[:, 0:1]
282
+ no_start_of_seq = jnp.array([False] * batch_size, dtype=jnp.bool_)
283
+ sample_method = self.sample_method
284
+ sample_prng = self.make_rng("sample")
285
+
286
+ # Greedy autoregressive decoder function.
287
+ def loop_fn(scan_state: Any, i: Array) -> Tuple[Any, Array]:
288
+ prng = jax.random.fold_in(sample_prng, i)
289
+ (dstate, input_token) = scan_state
291
+ (logits, dstate, _) = self.decoder(input_tokens=input_token,
292
+ target_tokens=None,
293
+ start_of_sequence=no_start_of_seq,
294
+ decoder_state=dstate)
295
+ if sample_method == "sample":
296
+ logging.info("Using categorical sampling.")
297
+ output_token = jax.random.categorical(prng, logits, axis=-1)
298
+ elif sample_method == "greedy":
299
+ logging.info("Using greedy sampling.")
300
+ output_token = jnp.argmax(logits, axis=-1)
301
+ else:
302
+ raise ValueError(f"Invalid sampling method: {sample_method}")
303
+ logging.info("generate_loop_fn: output_token = %r", output_token)
304
+ return ((dstate, output_token), output_token)
305
+
306
+ # Scan over the sequence length.
307
+ iterations = jnp.arange(sequence_length)
308
+ initial_scan_state = (dstate, first_token)
309
+ (_, output_tokens) = jax.lax.scan(loop_fn, initial_scan_state, iterations)
310
+ logging.info("generate: output_tokens = %r", output_tokens)
311
+
312
+ # Output_tokens has shape (sequence_length, batch_size, 1)
313
+ assert output_tokens.shape == (sequence_length, batch_size, 1)
314
+ output_tokens = jnp.reshape(
315
+ output_tokens, (sequence_length, self.task_config.batch_size))
316
+ output_tokens = output_tokens.transpose([1, 0])
317
+ return output_tokens
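
A shape-level sketch of the scan-driven decoding loop above, with a stub standing in for self.decoder (the stub simply predicts token + 1; everything here is illustrative):

import jax
import jax.numpy as jnp

batch, vocab, seq_len = 2, 11, 5

def stub_decoder(dstate, token):                       # token: [batch, 1]
  logits = jax.nn.one_hot((token + 1) % vocab, vocab)  # [batch, 1, vocab]
  return logits, dstate

def loop_fn(carry, i):
  dstate, token = carry
  logits, dstate = stub_decoder(dstate, token)
  next_token = jnp.argmax(logits, axis=-1)             # greedy sampling
  return (dstate, next_token), next_token

first = jnp.zeros((batch, 1), dtype=jnp.int32)
_, out = jax.lax.scan(loop_fn, (0, first), jnp.arange(seq_len))
out = out.reshape(seq_len, batch).transpose([1, 0])    # [batch, seq_len]
assert bool(jnp.all(out[0] == jnp.arange(1, seq_len + 1)))

As in generate(), the carry holds the decoder state plus the previously sampled token, and the stacked scan outputs are reshaped from time-major to batch-major at the end.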
aglib/meliad/transformer/nn_components.py ADDED
@@ -0,0 +1,437 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Core NN components used in models.
16
+ """
17
+
18
+ from typing import Any, Callable, Optional, Tuple, Union
19
+
20
+ from absl import logging
21
+ from flax import linen as nn
22
+ import gin
23
+ import jax
24
+ from jax import lax
25
+ from jax.nn import initializers
26
+ import jax.numpy as jnp
27
+
28
+
29
+ PRNGKey = Any
30
+ Array = jnp.ndarray
31
+ Shape = Tuple[int, ...]
32
+ Dtype = Union[jnp.dtype, str]
33
+
34
+
35
+ def scalar_initializer(x):
36
+ """Like linen.zeros, but initializes a parameter to a scalar value."""
37
+ def init_fun(key, shape, dtype):
38
+ del key
39
+ return jnp.broadcast_to(jnp.array(x, dtype=dtype), shape)
40
+ return init_fun
41
+
42
+
43
+ def swish(x: Array) -> Array:
44
+ """Swish function, which is very similar to gelu."""
45
+ return x * nn.sigmoid(x)
46
+
47
+
48
+ def soft_abs(x: Array) -> Array:
49
+ """Soft version of absolute value, that is smoothly differentiable."""
50
+ return jnp.sqrt(jnp.square(x) + 1) - 1
51
+
52
+
53
+ def get_activation_function(fname: Optional[str]) -> Callable[[Array], Array]:
54
+ """Get activation function from the specified string."""
55
+ if fname is None:
56
+ return lambda x: x
57
+ elif fname == "relu":
58
+ return nn.relu
59
+ elif fname == "swish":
60
+ return swish
61
+ elif fname == "sigmoid":
62
+ return nn.sigmoid
63
+ elif fname == "tanh":
64
+ return nn.tanh
65
+ else:
66
+ raise ValueError("Unknown activation function %s" % fname)
67
+
68
+
69
+ # Adapted from flax.linen.softmax.
70
+ def safe_softmax(x: Array,
71
+ axis: Optional[Union[int, Tuple[int, ...]]] = -1,
72
+ min_x: Optional[Array] = None) -> Array:
73
+ r"""Softmax function.
74
+
75
+ Computes the function which rescales elements to the range :math:`[0, 1]`
76
+ such that the elements along :code:`axis` sum to :math:`1`.
77
+
78
+ This version of softmax is intended for use with causal attention masks, and
79
+ safely covers the situation where all elements are masked out. If min_x is
80
+ not None, then probability will be distributed between the values in x and
81
+ min_x. If x >> min_x, then the probability allocated to min_x will be zero,
82
+ and this function will be the same as the usual softmax. However, if
83
+ x << min_x, (because all the values in x are masked out) then probability
84
+ will be allocated to min_x instead, and the probability allocated to x will
85
+ be 0. I.e., attention will attend to nothing if everything is masked out.
86
+
87
+ .. math::
88
+ \mathrm{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
89
+
90
+ Args:
91
+ x: input array
92
+ axis: the axis or axes along which the softmax should be computed. The
93
+ softmax output summed across these dimensions should sum to :math:`1`.
94
+ Either an integer or a tuple of integers.
95
+ min_x: the value of a minimum element which will be included in the
96
+ softmax sum. The value of min_x should be small when compared to the
97
+ expected values in x. If all of the values in x are smaller than
98
+ min_x, then probability will be allocated to the minimum element
99
+ instead, and the result of softmax will sum to less than 1.
100
+
101
+ Returns:
102
+ An array of the same shape as x.
103
+ """
104
+ # Subtract maximum value in x for numerical stability, so that the exponent
105
+ # never exceeds numerical precision.
106
+ x_max = lax.stop_gradient(jnp.max(x, axis, initial=min_x, keepdims=True))
107
+ if min_x is not None:
108
+ min_x = jnp.asarray(min_x, dtype=x.dtype)
109
+ x_max = jnp.maximum(x_max, min_x)
110
+ unnormalized = jnp.exp(x - x_max)
111
+ x_sum = jnp.sum(unnormalized, axis=axis, keepdims=True)
112
+ if min_x is not None:
113
+ x_sum = x_sum + jnp.exp(min_x - x_max)
114
+ return unnormalized / x_sum
115
+
116
+
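
The min_x behavior is easiest to see when every logit is masked out; probability then flows to the phantom min_x element and the result sums to (nearly) zero instead of going uniform:

import jax.numpy as jnp

x = jnp.array([-1e9, -1e9, -1e9])  # all positions masked out
p = safe_softmax(x, min_x=jnp.array(-1e6))
assert float(p.sum()) < 1e-6       # attends to nothing

An ordinary softmax on the same input would return a uniform distribution over the masked positions.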
117
+ def dropout_multiplier_mask(rng, dropout_rate: float, shape: Shape,
118
+ dtype: Dtype):
119
+ """Returns an array which can be multiplied by an input to perform dropout.
120
+
121
+ Args:
122
+ rng: A random number generator.
123
+ dropout_rate: The rate at which to drop.
124
+ shape: The shape of the output array.
125
+ dtype: The type of the output array.
126
+
127
+ Returns:
128
+ An array of the given shape, where values are {0.0, 1.0/keep_probability}.
129
+ """
130
+ if dropout_rate <= 0.0:
131
+ return jnp.ones(shape, dtype=dtype)
132
+
133
+ logging.info("dropout mask: %s", shape)
134
+ keep_prob = 1.0 - dropout_rate
135
+ keep = jax.random.bernoulli(rng, keep_prob, shape)
136
+ dropout_multiplier = (keep.astype(dtype) / jnp.asarray(keep_prob, dtype))
137
+ return dropout_multiplier
138
+
139
+
140
+ def tiled_dropout(x: Array, shape: Shape, dropout_rate: float,
141
+ rng_function: Callable[[], jax.random.KeyArray],
142
+ deterministic: bool) -> Array:
143
+ """Tiles a dropout mask over a larger array.
144
+
145
+ This will generate a smaller dropout mask of the given shape, and tile it
146
+ over a larger array, which reduces the computational cost and memory
147
+ associated with generating a large dropout mask.
148
+
149
+ Args:
150
+ x: The input array.
151
+ shape: The shape of the dropout mask to tile.
152
+ dropout_rate: The rate at which to drop.
153
+ rng_function: A function which returns a random number generator, e.g.
154
+ lambda: self.make_rng("dropout"). The function will not
155
+ be called if dropout is not enabled.
156
+ deterministic: If True, don't do dropout.
157
+
158
+ Returns:
159
+ An array of the same shape as x, with some values dropped out.
160
+ """
161
+ if deterministic or dropout_rate <= 0.0:
162
+ return x
163
+
164
+ if x.ndim != len(shape):
165
+ raise ValueError("Shapes must have same number of dimensions %r, %r." %
166
+ (x.shape, shape))
167
+ for (xd, sd) in zip(x.shape, shape):
168
+ if (xd % sd) != 0:
169
+ raise ValueError("Incompatible shapes %r, %r" % (x.shape, shape))
170
+
171
+ # Get random number generator for dropout.
172
+ rng = rng_function()
173
+
174
+ repeats = [(1 if sd == 1 else xd // sd) for (xd, sd) in zip(x.shape, shape)]
175
+ logging.info("tiled dropout %r, tile: %r", x.shape, shape)
176
+
177
+ dtype = x.dtype
178
+ keep_prob = 1.0 - dropout_rate
179
+ keep = jax.random.bernoulli(rng, keep_prob, shape)
180
+ keep = jnp.tile(keep, repeats)
181
+ keep = jnp.broadcast_to(keep, x.shape)
182
+ x_scaled = x / jnp.asarray(keep_prob, dtype=dtype)
183
+ return lax.select(keep, x_scaled, jnp.zeros_like(x, dtype=dtype))
184
+
185
+
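
For instance, a (1, 128) tile over a (4, 512) input draws a single mask of 128 Bernoulli samples and reuses it across the rest of the array:

import jax
import jax.numpy as jnp

x = jnp.ones((4, 512))
y = tiled_dropout(x, shape=(1, 128), dropout_rate=0.1,
                  rng_function=lambda: jax.random.PRNGKey(0),
                  deterministic=False)
assert y.shape == x.shape  # entries are either 0.0 or x / keep_prob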
186
+ @gin.configurable
187
+ class MLP(nn.Module):
188
+ """Implements a multi-layer perceptron, with optional resnet or gate."""
189
+
190
+ # Arguments to module.
191
+ num_output_features: int # Length of output vectors.
192
+
193
+ # Gin configurable parameters.
194
+ num_layers: int = gin.REQUIRED # Number of layers in the MLP.
195
+ num_hidden_units: int = gin.REQUIRED # Length of hidden unit vectors.
196
+ hidden_activation: Optional[str] = "relu" # Hidden layer activation fn.
197
+ final_activation: Optional[str] = None # Final layer activation fn.
198
+ use_bias: bool = True # Use a bias in each dense layer.
199
+ gate_type: Optional[str] = None  # { "residual", "bias", "full", "lstm" }
200
+ initializer_scale: float = 1.0 # Scale of initial values.
201
+ dtype: Any = jnp.float32
202
+
203
+ def setup(self):
204
+ kernel_init = jax.nn.initializers.variance_scaling(
205
+ scale=self.initializer_scale, mode="fan_in",
206
+ distribution="truncated_normal")
207
+
208
+ assert self.num_layers > 0
209
+ hlayers = []
210
+ for i in range(0, self.num_layers - 1):
211
+ assert self.num_hidden_units > 0
212
+ hlayer = nn.Dense(self.num_hidden_units,
213
+ use_bias=self.use_bias,
214
+ kernel_init=kernel_init,
215
+ dtype=self.dtype,
216
+ name=f"hidden{i}")
217
+ hlayers.append(hlayer)
218
+ self.hidden_layers = hlayers
219
+ self.output_layer = nn.Dense(self.num_output_features,
220
+ use_bias=self.use_bias,
221
+ kernel_init=kernel_init,
222
+ dtype=self.dtype)
223
+
224
+ if self.gate_type is None or self.gate_type == "residual":
225
+ return
226
+
227
+ # We use a low but non-zero bias so that adafactor knows how to scale it.
228
+ gate_bias_init = jax.nn.initializers.normal(stddev=0.1)
229
+ # Also use a lower-than-normal kernel initialization scale.
230
+ gate_kernel_init = jax.nn.initializers.variance_scaling(
231
+ scale=0.1, mode="fan_in", distribution="truncated_normal")
232
+
233
+ if self.gate_type == "bias":
234
+ self.gate_bias = self.param("gate_bias", gate_bias_init,
235
+ (self.num_output_features,), jnp.float32)
236
+ elif self.gate_type == "full":
237
+ self.gate_layer = nn.Dense(self.num_output_features,
238
+ use_bias=True,
239
+ bias_init=gate_bias_init,
240
+ kernel_init=gate_kernel_init,
241
+ dtype=self.dtype)
242
+ elif self.gate_type == "lstm":
243
+ self.input_gate = nn.Dense(self.num_output_features,
244
+ use_bias=True,
245
+ bias_init=gate_bias_init,
246
+ kernel_init=gate_kernel_init,
247
+ dtype=self.dtype)
248
+ self.forget_gate = nn.Dense(self.num_output_features,
249
+ use_bias=True,
250
+ bias_init=gate_bias_init,
251
+ kernel_init=gate_kernel_init,
252
+ dtype=self.dtype)
253
+ else:
254
+ raise ValueError("Unsupported gate_type: %s" % self.gate_type)
255
+
256
+ def _gate(self, y_hidden: Array, state: Array, y_out: Array) -> Array:
257
+ """Compute the value to use for the gate."""
258
+
259
+ if self.gate_type == "residual":
260
+       # Residual connection: just add y_out to the state.
+       logging.info("mlp: residual")
+       return state + y_out
+
+     elif self.gate_type == "bias":
+       # Simple gate: use a gru_style gate with a learned bias (no kernel).
+       bias = jnp.asarray(self.gate_bias, dtype=self.dtype)
+       bias = jnp.reshape(bias, (1,) * (y_out.ndim - 1) + (-1,))  # batch dims.
+       g = jax.nn.sigmoid(bias)
+       logging.info("mlp: gate bias = %r", g)
+       return (state * g) + (y_out * (1 - g))
+
+     elif self.gate_type == "full":
+       # Normal GRU style gate -- compute g using both a kernel and bias.
+       g = jax.nn.sigmoid(self.gate_layer(y_hidden) + 1)  # biased to remember
+       logging.info("mlp: gate full = %r", g)
+       return (state * g) + (y_out * (1 - g))
+
+     elif self.gate_type == "lstm":
+       # LSTM style gate with input and forget gates.
+       fg = jax.nn.sigmoid(self.forget_gate(y_hidden) + 1)  # biased to remember
+       ig = jax.nn.sigmoid(self.input_gate(y_hidden) - 1)
+       logging.info("mlp: gate lstm = %r, %r", ig, fg)
+       return (state * fg) + (y_out * ig)
+
+     else:
+       raise ValueError("Unsupported gate type %s" % self.gate_type)
+
+   def __call__(self, x: Array, state: Optional[Array],
+                apply_dropout: bool = False,
+                dropout_rate: float = 0.0,
+                drop_tile_shape: Optional[Shape] = None,
+                rng_function: Optional[Callable[[], Any]] = None) -> Array:
+     """Apply the multi-layer perceptron to the input x.
+
+     For simple MLPs, returns f(x), where f is the MLP function.
+     For resnets and gated architectures, it returns
+       state + f(x)           -- for resnet.
+       g*state + (1-g)*f(x)   -- for gated architecture, where g is the gate.
+
+     Args:
+       x: The input to the MLP.
+       state: The prior value, if this MLP is used as part of a resnet or
+         gated architecture.
+       apply_dropout: If true, applies dropout to the result.
+       dropout_rate: The dropout rate to use.
+       drop_tile_shape: The dropout tile shape.
+       rng_function: Gets a random number seed for dropout.
+
+     Returns:
+       The combination of f(x) and the (optional) prior state.
+     """
+
+     x = jnp.asarray(x, self.dtype)
+     hidden_act_fun = get_activation_function(self.hidden_activation)
+     final_act_fun = get_activation_function(self.final_activation)
+     if self.hidden_layers:
+       # Apply some number of hidden layers.
+       y = x
+       for layer in self.hidden_layers:
+         logging.info("mlp: hidden %d, %s", self.num_hidden_units,
+                      self.hidden_activation)
+         y = hidden_act_fun(layer(y))
+     else:
+       # Apply the hidden activation function to the input.
+       logging.info("mlp: activation = %s", self.hidden_activation)
+       y = hidden_act_fun(x)
+
+     y_hidden = y  # The hidden layer right before the output.
+     logging.info("mlp: final activation = %s", self.final_activation)
+     y_out = self.output_layer(y_hidden)  # The MLP final output.
+     y_out = final_act_fun(y_out)  # Apply final activation function.
+     logging.info("mlp: final = %r", y_out)
+
+     # Optionally apply dropout to the output.
+     if apply_dropout:
+       if drop_tile_shape is None:
+         raise ValueError("drop_tile_shape must be specified for dropout.")
+       if rng_function is None:
+         raise ValueError("rng_function must be specified for dropout.")
+       logging.info("mlp: dropout rate = %s", dropout_rate)
+       y_out = tiled_dropout(
+           y_out, shape=drop_tile_shape, dropout_rate=dropout_rate,
+           rng_function=rng_function, deterministic=False)
+
+     if state is None:
+       # Simple MLP. No gate to combine y_out with the state.
+       assert self.gate_type is None
+       logging.info("mlp: gate type = None.")
+       return y_out
+
+     # When using state, gate_type must be specified.
+     assert self.gate_type is not None
+     return self._gate(y_hidden, state, y_out)
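+
+
+ # Illustrative sketch (hypothetical helper, not used elsewhere): how the
+ # "bias" gate above mixes the previous state with the new MLP output.
+ def _example_bias_gate(state: Array, y_out: Array, gate_bias: float) -> Array:
+   g = jax.nn.sigmoid(jnp.asarray(gate_bias))  # e.g. sigmoid(2.2) ~= 0.9
+   # Keep ~90% of the old state and blend in ~10% of the new output.
+   return (state * g) + (y_out * (1 - g))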
+
+
+ # Modified slightly from the flax implementation.
+ @gin.configurable
+ class LayerNorm(nn.Module):
+   """Layer normalization (https://arxiv.org/abs/1607.06450).
+
+   Operates on the last axis of the input data.
+
+   It normalizes the activations of the layer for each given example in a
+   batch independently, rather than across a batch like Batch Normalization.
+   i.e. applies a transformation that maintains the mean activation within
+   each example close to 0 and the activation standard deviation close to 1.
+
+   Attributes:
+     epsilon: A small float added to variance to avoid dividing by zero.
+     dtype: the dtype of the computation (default: float32).
+     use_bias: If True, bias (beta) is added.
+     use_scale: If True, multiply by scale (gamma).
+     use_mean: If True, compute and adjust for the mean.
+       Note that the T5X layernorm does not use the mean.
+       Empirically, ignoring the mean can stabilize learning in transformers.
+     use_scalar_scale_bias: If True, use a single scalar for scale & bias.
+     enable_layernorm: If False, does not perform layernorm.
+     bias_init: Initializer for bias, by default, zero.
+     scale_init: Initializer for scale, by default, one.
+   """
+   epsilon: float = 1e-6
+   dtype: Any = jnp.float32
+   use_scale: bool = True  # Apply a learned scale.
+   use_bias: bool = False  # Apply a learned bias.
+   use_mean: bool = False  # Calculate and adjust for the mean.
+   use_scalar_scale_bias: bool = False  # Learn a single scalar scale & bias.
+   enable_layernorm: bool = True  # Turn off layernorm if false.
+   bias_init: Callable[[PRNGKey, Shape, Dtype], Array] = initializers.zeros
+   scale_init: Callable[[PRNGKey, Shape, Dtype], Array] = initializers.ones
+
+   @nn.compact
+   def __call__(self, x):
+     """Applies layer normalization on the input.
+
+     Args:
+       x: the inputs
+
+     Returns:
+       Normalized inputs (the same shape as inputs).
+     """
+     if not self.enable_layernorm:
+       return x
+     x = jnp.asarray(x)
+
+     # Calculate mean and variance at higher precision.
+     xf = jnp.asarray(x, jnp.float32)
+     if self.use_mean:
+       mean = jnp.mean(xf, axis=-1, keepdims=True)
+       xf = xf - mean
+     var = jnp.mean(lax.square(xf), axis=-1, keepdims=True)
+     mul = lax.rsqrt(var + self.epsilon)
+
+     # Rescale x.
+     # If not use_mean, then rescale around zero instead. (A simplification.)
+     if self.use_mean:
+       y = (x - mean) * mul
+     else:
+       y = x * mul
+
+     if self.use_scalar_scale_bias:
+       # Learn a single scalar value for bias and scale.
+       # (Which mirrors the single value for mean and stddev above.)
+       num_scale_bias_features = 1
+     else:
+       # Learn a different value per neuron/feature for bias and scale.
+       num_scale_bias_features = x.shape[-1]
+
+     # Apply learned scale and bias.
+     if self.use_scale:
+       y = y * jnp.asarray(
+           self.param("scale", self.scale_init, (num_scale_bias_features,)),
+           dtype=self.dtype)
+     if self.use_bias:
+       y = y + jnp.asarray(
+           self.param("bias", self.bias_init, (num_scale_bias_features,)),
+           dtype=self.dtype)
+     return y.astype(self.dtype)
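+
+
+ # Reference sketch (hypothetical helper): with the default flags above
+ # (use_mean=False, use_scale=True, use_bias=False), LayerNorm reduces to an
+ # RMS-style normalization, y = x * rsqrt(mean(x**2) + eps) * scale.
+ def _example_rms_norm(x: Array, scale: Array, epsilon: float = 1e-6) -> Array:
+   xf = jnp.asarray(x, jnp.float32)
+   var = jnp.mean(lax.square(xf), axis=-1, keepdims=True)
+   return (x * lax.rsqrt(var + epsilon)) * scale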
aglib/meliad/transformer/position.py ADDED
@@ -0,0 +1,242 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Functions for dealing with relative and absolute positions, and masks."""
+
+ from typing import Optional, Tuple, Union
+
+ import jax.numpy as jnp
+ import numpy as np
+
+
+ Array = jnp.ndarray
+ NpArray = np.ndarray
+ Dtype = Union[jnp.dtype, str]
+
+
+ def relative_positions(num_queries: int, num_keys: int,
+                        offset: Optional[int] = None):
+   """Returns a jax array of relative positions between query and key.
+
+   If num_keys >= num_queries, e.g. for transformer XL or sliding window,
+   then offset should be (num_keys - num_queries) to make the last N queries
+   line up with the last N keys. This is the default if offset is None.
+
+   Args:
+     num_queries: Number of queries.
+     num_keys: Number of keys.
+     offset: Offset of the first query wrt. the first key.
+
+   Returns:
+     A /jax/ array of shape [num_queries, num_keys] with the signed distance
+     from each query to each key.
+   """
+
+   # Get the offset of each query wrt. each key.
+   # If not specified, assume the last N queries line up with the last N keys.
+   if offset is None:
+     if num_keys < num_queries:
+       raise ValueError("Number of keys %d must be >= number of queries %d" %
+                        (num_keys, num_queries))
+     offset = num_keys - num_queries
+   qidx = jnp.arange(0, num_queries, dtype=jnp.int32).reshape(num_queries, 1)
+   kidx = jnp.arange(0, num_keys, dtype=jnp.int32).reshape(1, num_keys)
+   return kidx - (qidx + offset)
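+
+
+ # Worked example (hypothetical helper): relative_positions(2, 4) defaults to
+ # offset = 2, so the 2 queries line up with the last 2 keys:
+ #   [[-2, -1,  0,  1],
+ #    [-3, -2, -1,  0]]
+ def _example_relative_positions() -> Array:
+   return relative_positions(num_queries=2, num_keys=4)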
+
+
+ def relative_positions_np(num_queries: int, num_keys: int,
+                           offset: Optional[int] = None):
+   """Returns a numpy array of relative positions between query and key.
+
+   If num_keys >= num_queries, e.g. for transformer XL or sliding window,
+   then offset should be (num_keys - num_queries) to make the last N queries
+   line up with the last N keys. This is the default if offset is None.
+
+   Args:
+     num_queries: Number of queries.
+     num_keys: Number of keys.
+     offset: Offset of the first query wrt. the first key.
+
+   Returns:
+     A /numpy/ array of shape [num_queries, num_keys] with the signed distance
+     from each query to each key.
+   """
+
+   # Get the offset of each query wrt. each key.
+   # If not specified, assume the last N queries line up with the last N keys.
+   if offset is None:
+     if num_keys < num_queries:
+       raise ValueError("Number of keys %d must be >= number of queries %d" %
+                        (num_keys, num_queries))
+     offset = num_keys - num_queries
+   qidx = np.arange(0, num_queries, dtype=np.int32).reshape(num_queries, 1)
+   kidx = np.arange(0, num_keys, dtype=np.int32).reshape(1, num_keys)
+   return kidx - (qidx + offset)
+
+
+ def broadcast_mask(mask: Array, attn: Array):
+   """Broadcast a mask or bias over all the dimensions of attn."""
+
+   # Add leading dimensions for batch_size, num_heads if necessary.
+   if mask.ndim < attn.ndim:
+     mask = jnp.expand_dims(mask, axis=tuple(range(0, attn.ndim - mask.ndim)))
+   return mask
+
+
+ def causal_mask(num_queries: int, num_keys: int, window_length: int = 0):
+   """Returns a causal mask of shape (num_queries, num_keys)."""
+
+   # The mask ranges over the window_length positions prior to each query.
+   if window_length == 0:
+     window_length = num_queries
+
+   kqpos = relative_positions(num_queries, num_keys)  # 2D mask
+
+   # The causal mask includes only those tokens *before* the current token.
+   # This slightly improves perplexity in practice, and simplifies generation.
+   # Each token attends to exactly window_length prior tokens.
+   mask = (kqpos < 0) & (kqpos >= -window_length)
+   return mask
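+
+
+ # Worked example (hypothetical helper): causal_mask(3, 3) admits only
+ # strictly-earlier tokens, so the diagonal itself is masked out:
+ #   [[False, False, False],
+ #    [ True, False, False],
+ #    [ True,  True, False]]
+ def _example_causal_mask() -> Array:
+   return causal_mask(num_queries=3, num_keys=3)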
+
+
+ def position_encoding(num_positions: int,
+                       input_dim: int,
+                       *,
+                       offset: int = 0,
+                       max_wavelength: float = 0) -> NpArray:
+   """Returns a position encoding of shape (num_positions, input_dim).
+
+   Positions are encoded as sin/cos pairs at geometrically increasing
+   wavelengths.
+
+   The length of a half-wave (peak to trough) increases geometrically from 1
+   to max_wavelength. (Technically, it's slightly less; the last sin/cos pair
+   has a wavelength of max_wavelength**((d-1)/d), where d = input_dim/2.)
+
+   NOTE: unlike prior published position encodings, we multiply the position
+   of each token by pi to convert from fractions of a wave
+   (position/wavelength) to radians. Thus, the highest frequency wave
+   alternates between -1 and 1 on every token, whereas in prior published work
+   the highest frequency alternates between -1 and 1 every pi tokens. The
+   max_wavelength is also effectively 1/pi times as long, so a prior published
+   factor of 10,000 (e.g. https://arxiv.org/abs/1706.03762) would equate to a
+   max_wavelength of 31,416.
+
+   This encoding also does not alternate between sin/cos values, but puts all
+   of the cos values on one side, and the sin values on the other. That makes
+   it easier to split the sin,cos values to construct or apply a rotation
+   matrix.
+
+   The default value for max_wavelength is 2 * num_positions.
+
+   Args:
+     num_positions: The number of positions.
+     input_dim: The dimension of the position vector.
+     *: --- The following are keyword arguments only. ---
+     offset: Positions count from offset to (offset + num_positions).
+     max_wavelength: The maximum length of a half-wave (peak to trough).
+
+   Returns:
+     Numpy matrix of shape (num_positions, input_dim) containing the encodings.
+     Position encodings are packed as concat(cos_values, sin_values, axis=1).
+   """
+
+   if max_wavelength == 0:
+     max_wavelength = 2 * num_positions
+   assert max_wavelength > 1
+
+   assert (input_dim % 2) == 0
+   idim2 = input_dim // 2
+
+   # t ranges from 0 <= t < 1
+   t = np.arange(0, idim2, dtype=np.float32) / idim2
+
+   # wavelength (columns)
+   # The length of a half-wave (trough to peak) increases geometrically
+   # 1 <= wavelength < max_wavelength
+   wavelength = float(max_wavelength)**t
+   wavelength = np.reshape(wavelength, (1, idim2))  # broadcast over rows
+
+   # k is the position in the sequence (rows)
+   k = np.arange(offset, num_positions + offset, dtype=np.float32)
+   k = np.reshape(k, (num_positions, 1))  # broadcast over columns
+
+   # For each position (row) compute an angle (column) at various wavelengths.
+   # NOTE: unlike prior published work, we multiply by pi to convert to
+   # radians.
+   pi_f = np.array(np.pi, dtype=np.float32)
+   angles = pi_f * k / wavelength  # shape (num_positions, idim2)
+   posx = np.cos(angles, dtype=np.float32)
+   posy = np.sin(angles, dtype=np.float32)
+   return np.concatenate([posx, posy], axis=1)  # shape (num_positions, idim)
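+
+
+ # Shape check (hypothetical helper): 4 positions encoded in 8 dimensions pack
+ # 4 cosine columns followed by 4 sine columns; max_wavelength defaults to 8.
+ def _example_position_encoding() -> NpArray:
+   enc = position_encoding(4, 8)
+   assert enc.shape == (4, 8)
+   return enc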
+
+
+ def rotate_kq(keys: Array, queries: Array,
+               *,  # the following args must be passed by keyword.
+               max_wavelength: float,
+               offset: Optional[int] = None,
+               dtype: Optional[Dtype] = None) -> Tuple[Array, Array]:
+   """Rotate keys and queries by the relative distance between query and key.
+
+   Implements rotary position embeddings (RoPE)
+   https://arxiv.org/abs/2104.09864.
+
+   Args:
+     keys: array of shape (batch_size, num_keys, num_heads, head_size)
+     queries: array of shape (batch_size, num_queries, num_heads, head_size)
+     max_wavelength: The maximum length of a half-wave (peak to trough).
+     offset: The relative positional offset from keys[i] to queries[i].
+       Defaults to num_keys - num_queries if not specified.
+     dtype: The precision to perform the rotation at.
+       Defaults to keys.dtype.
+
+   Returns:
+     (keys, queries) after rotation.
+   """
+
+   (batch_size, num_keys, num_heads, head_size) = keys.shape
+   (_, num_queries, _, _) = queries.shape
+   assert queries.shape == (batch_size, num_queries, num_heads, head_size)
+
+   if offset is None:
+     assert num_keys >= num_queries
+     offset = num_keys - num_queries
+
+   if dtype is None:
+     dtype = keys.dtype
+
+   def rotate_k_or_q(kq: Array, num_kq: int, kq_offset: int) -> Array:
+     nonlocal max_wavelength
+     nonlocal dtype
+
+     # Get position encodings, which can be used to do a rotation.
+     kq_pos = position_encoding(num_kq, head_size, offset=kq_offset,
+                                max_wavelength=max_wavelength)
+     # Broadcast over batch_size and num_heads.
+     kq_pos = np.reshape(kq_pos, (1, num_kq, 1, head_size))
+     # Split position encoding into separate sin/cos values in order to
+     # construct a rotation matrix.
+     (cosa, sina) = np.split(kq_pos, 2, axis=-1)
+     cosa = jnp.asarray(cosa, dtype=dtype)  # convert from numpy -> jax
+     sina = jnp.asarray(sina, dtype=dtype)  # convert from numpy -> jax
+
+     # Split keys/queries into real & imaginary (i.e. x & y) parts.
+     (kqx, kqy) = jnp.split(kq, 2, axis=-1)
+     # Apply rotation matrix.
+     kqx_rot = (kqx * cosa) - (kqy * sina)
+     kqy_rot = (kqx * sina) + (kqy * cosa)
+     # Concatenate back into keys/queries.
+     return jnp.concatenate([kqx_rot, kqy_rot], axis=-1)
+
+   keys = rotate_k_or_q(keys, num_keys, -offset)  # pylint: disable=invalid-unary-operand-type
+   queries = rotate_k_or_q(queries, num_queries, 0)
+   return (keys, queries)
+
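+
+ # Usage sketch (hypothetical helper): apply RoPE to dummy keys and queries.
+ # Shapes follow the docstring above; offset defaults to 6 - 4 = 2.
+ def _example_rotate_kq() -> Tuple[Array, Array]:
+   keys = jnp.ones((1, 6, 2, 8))     # (batch, num_keys, heads, head_size)
+   queries = jnp.ones((1, 4, 2, 8))  # (batch, num_queries, heads, head_size)
+   return rotate_kq(keys, queries, max_wavelength=32)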
aglib/meliad/transformer/position_fourier.py ADDED
@@ -0,0 +1,218 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Class for Fourier relative position biases.
+
+ This implementation uses the same Fourier position encodings that are used
+ in the absolute position encoding. However, instead of adding the positions
+ to the input, where the position vector and content vectors become entangled,
+ the relative encoding computes a relative position bias matrix, which is then
+ added to the content-based attention matrix before applying softmax.
+
+ The bias matrix is computed as follows. First, a learned transformation is
+ applied to each query position, which transforms it so that it matches a set
+ of key positions. The relative position bias between query 'i' and key 'j' is
+ the dot product between the transformed position 'i', and position 'j'.
+
+ The learned transformation is designed so that the match between query and key
+ is a function of the relative distance between the two. Although absolute
+ positions are fed as inputs, the rest of the network can't "see" the absolute
+ positions; it can only transform them by some relative amount.
+
+ A position vector consists of a sequence of (sin, cos) pairs, which have
+ geometrically increasing wavelengths that span from 2 (for the first pair
+ in each vector) to twice the length of the token sequence (for the last pair).
+ Each sin/cos pair encodes the (x, y) value of a 2D unit vector at a particular
+ angle. For each sin/cos pair in the query position vector, we apply a learned
+ 2x2 rotation matrix, which will rotate and scale the pair by some amount.
+
+ The dot product of two (sin, cos) pairs is the cosine of the angle between
+ them. The dot product of the query position and key position vectors is thus
+ the sum of such cosines. By rotating and scaling the query position, it is
+ possible to approximate any function over relative position as a Fourier
+ series: a sum of cosine waves at different wavelengths. The rotation provides
+ phase, and the scale provides magnitude.
+
+ Put another way, rotating the (sin, cos) pairs of a query position will
+ compute a relative offset from the /query/ position to some target /key/
+ position.
+ """
+
+ from typing import Any, Optional
+
+ from flax import linen as nn
+ import gin
+ import jax.numpy as jnp
+ from transformer import position
+ import numpy as np
+
+
+ Array = jnp.ndarray
+
+
+ def _initialize_frel_rotation_matrix(rng, num_heads, vec_size):
+   """Initialize the rotation matrices."""
+   # Initialize each rotation matrix to the identity * scale.
+   #
+   # Initially scale by 1 / number of sine waves = 1/2 the position vector
+   # size. With this initialization, the initial position bias terms should be
+   # between -1.0 and 1.0 after the rotation matrix has been applied.
+   del rng  # required for init function but unused
+   scale = float(2.0 / vec_size)
+   tmat_a = jnp.ones([num_heads, vec_size // 2], dtype=jnp.float32) * scale
+   tmat_b = jnp.zeros([num_heads, vec_size // 2], dtype=jnp.float32)
+   return jnp.concatenate([tmat_a, tmat_b], axis=1)
+
+
+ @gin.configurable
+ class RelativeFourierPositions(nn.Module):
+   """An implementation of Fourier relative positions."""
+
+   # The number of attention heads.
+   num_heads: int = 8
+
+   # The maximum number of keys to attend to.
+   # The sin/cos wavelengths of the position vectors will be tuned to this max.
+   max_number_of_keys: int = 1024
+
+   # Size of the position vector. Needs to be large enough to address the keys.
+   position_vector_size: int = 128
+
+   # Data type to use for the rotation matrices.
+   dtype: Any = jnp.float32
+
+   @nn.compact
+   def __call__(self, num_queries: int, num_keys: int,
+                offset: Optional[int] = None,
+                bidirectional: bool = True) -> Array:
+     """Returns relative positional attention matrix.
+
+     If num_keys >= num_queries, e.g. for transformer XL or sliding window,
+     then offset should be (num_keys - num_queries) to make the last N queries
+     line up with the last N keys. This is the default if offset is None.
+
+     Args:
+       num_queries: Number of queries.
+       num_keys: Number of keys.
+       offset: Offset of the first query with respect to the first key.
+         (See position.relative_positions() for more info.)
+       bidirectional: Unused, included for compatibility.
+         Relative positions are always bidirectional.
+
+     Returns:
+       Attention matrix of shape (num_heads, num_queries, num_keys)
+     """
+
+     # Get the offset of each query with respect to each key.
+     # If not specified, the last N queries line up with the last N keys.
+     if offset is None:
+       assert num_keys >= num_queries
+       offset = num_keys - num_queries
+     max_wavelength = 2 * self.max_number_of_keys
+
+     # Compute absolute position vectors for keys.
+     # Use numpy to compute these arrays statically.
+     # ks : (num_keys, pvec_size)
+     ks = position.position_encoding(num_keys,
+                                     self.position_vector_size,
+                                     offset=0,  # offset of queries wrt. keys
+                                     max_wavelength=max_wavelength)
+
+     # Compute absolute position vectors for queries.
+     # qs : (num_queries, pvec_size)
+     if offset >= 0 and offset + num_queries <= num_keys:
+       # Query positions are a subset of the key positions.
+       qs = ks[offset:offset + num_queries]
+     else:
+       # Query positions must be computed separately.
+       qs = position.position_encoding(num_queries,
+                                       self.position_vector_size,
+                                       offset=offset,
+                                       max_wavelength=max_wavelength)
+
+     # Split qs into x and y coordinates for rotation.
+     (qx, qy) = np.split(qs, 2, axis=-1)
+     qs_xs = np.concatenate([qx, qx], axis=-1)
+     qs_ys = np.concatenate([qy, qy], axis=-1)
+     del qs
+
+     # Convert from numpy to jax.
+     ks = jnp.asarray(ks, dtype=self.dtype)
+     qs_xs = jnp.asarray(qs_xs, dtype=self.dtype)
+     qs_ys = jnp.asarray(qs_ys, dtype=self.dtype)
+
+     # Initialize the rotation matrices to the identity.
+     rotation_matrix = self.param("rotation_matrix",
+                                  _initialize_frel_rotation_matrix,
+                                  self.num_heads,
+                                  self.position_vector_size)
+
+     rotation_matrix = jnp.asarray(rotation_matrix, dtype=self.dtype)
+
+     # Unpack rotation_matrix to a set of 2x2 matrices.
+     rmat1 = rotation_matrix  # [rm_a, rm_b]
+     (rm_a, rm_b) = jnp.split(rotation_matrix, 2, axis=-1)
+     rmat2 = jnp.concatenate([-rm_b, rm_a], axis=-1)
+
+     # Vectors in qs consist of a set of (x,y) (e.g. sin,cos) pairs.
+     # We transform each (x,y) pair with a 2D rotation matrix:
+     #
+     #   x' = a*x + -b*y
+     #   y' = b*x + a*y
+     #
+     # or equivalently, x' + y'i = (a + bi)(x + yi) where i = sqrt(-1).
+     #
+     # For an angle theta, and scale s, a = cos(theta)*s, b = sin(theta)*s,
+     # and a + bi = s*exp(i*theta). We avoid computing sin,cos by training a,b
+     # directly.
+     #
+     # qs_xs = [x0 .. xn; x0 .. xn]      -- layout of qs_xs
+     # qs_ys = [y0 .. yn; y0 .. yn]
+     # rmat1 = [a0 .. an; b0 .. bn]      -- layout of (a,b) values in rmat1
+     # rmat2 = [-b0 .. -bn; a0 .. an]
+     #
+     # rot_qs: (num_heads, num_queries, pvec_size)
+
+     # Broadcast qs over the number of heads.
+     # Broadcast rmat over the number of queries.
+     qs_xs = qs_xs[jnp.newaxis, ...]     # (1, num_queries, pvec_size)
+     qs_ys = qs_ys[jnp.newaxis, ...]
+     rmat1 = rmat1[:, jnp.newaxis, ...]  # (num_heads, 1, pvec_size)
+     rmat2 = rmat2[:, jnp.newaxis, ...]
+     rot_qs = ((rmat1 * qs_xs) + (rmat2 * qs_ys))
+
+     # Compute the dot product of each position vector in ks by the rotated qs.
+     #
+     # The dot product of each (x, y) pair in ks, and each (x', y') in rot_qs,
+     # is equal to the cosine of the angle between them, times the length
+     # of (x', y').
+     #
+     # The angle of the cosine for each pair depends on:
+     #   - The distance between the key and the query, divided by the
+     #     wavelength. (From the initial position encoding for ks and qs).
+     #   - The rotation performed by (a,b).
+     #
+     # The length of (x', y') is equal to the scale of (a, b).
+     #
+     # The dot product of two complete position vectors is the sum of the
+     # cosines for all pairs. The cosines form a progression of geometrically
+     # increasing wavelengths, and each wave has a scale and phase provided by
+     # the rotation matrix. The sum of such waves can thus approximate any
+     # function of position.
+     #
+     # pbias: (num_heads, num_queries, num_keys)
+     pbias = jnp.einsum("hqd,kd->hqk", rot_qs, ks)
+
+     # Add batch dimension; --> shape (1, num_heads, num_queries, num_keys)
+     pbias = jnp.expand_dims(pbias, 0)
+     return pbias.astype(self.dtype)
+
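+
+ # Usage sketch (hypothetical helper): initialize the module and produce a
+ # bias of shape (1, num_heads, num_queries, num_keys). `import jax` is local
+ # because only jax.numpy is imported at module level above.
+ def _example_relative_fourier_bias() -> Array:
+   import jax
+   module = RelativeFourierPositions(num_heads=2, max_number_of_keys=16,
+                                     position_vector_size=8)
+   variables = module.init(jax.random.PRNGKey(0), 4, 8)
+   return module.apply(variables, 4, 8)  # shape (1, 2, 4, 8)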
aglib/meliad/transformer/position_t5.py ADDED
@@ -0,0 +1,155 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Class for T5 relative position biases.
+
+ Adapted from flaxformer.components.relative_position_biases.py
+ """
+
+ from typing import Any, Callable, Optional
+
+ from flax import linen as nn
+ import gin
+ from jax import lax
+ import jax.numpy as jnp
+ from transformer import position
+ import numpy as np
+
+
+ Array = Any
+
+
+ @gin.configurable
+ class T5RelativePositionBiases(nn.Module):
+   """Adds T5-style relative positional embeddings to the attention logits.
+
+   Attributes:
+     num_buckets: Number of buckets to bucket distances between key and query
+       positions into.
+     max_distance: Maximum distance before everything is lumped into the last
+       distance bucket.
+     num_heads: Number of heads in the attention layer. Each head will get a
+       different relative position weighting.
+     dtype: Type of arrays through this module.
+     embedding_init: initializer for relative embedding table.
+   """
+   num_buckets: int
+   max_distance: int
+   num_heads: int
+   dtype: Any
+   embedding_init: Callable[..., Array] = nn.linear.default_embed_init
+
+   @staticmethod
+   def _relative_position_bucket(relative_position,
+                                 bidirectional=True,
+                                 num_buckets=32,
+                                 max_distance=128):
+     """Translate relative position to a bucket number for relative attention.
+
+     The relative position is defined as memory_position - query_position,
+     i.e. the distance in tokens from the attending position to the
+     attended-to position. If bidirectional=False, then positive relative
+     positions are invalid.
+     We use smaller buckets for small absolute relative_position and larger
+     buckets for larger absolute relative_positions. All relative
+     positions >= max_distance map to the same bucket. All relative
+     positions <= -max_distance map to the same bucket. This should allow for
+     more graceful generalization to longer sequences than the model has been
+     trained on.
+
+     Args:
+       relative_position: an int32 array
+       bidirectional: a boolean - whether the attention is bidirectional
+       num_buckets: an integer
+       max_distance: an integer
+
+     Returns:
+       a Tensor with the same shape as relative_position, containing int32
+       values in the range [0, num_buckets)
+     """
+     ret = 0
+     n = -relative_position
+     if bidirectional:
+       num_buckets //= 2
+       ret += (n < 0).astype(np.int32) * num_buckets
+       n = np.abs(n)
+     else:
+       n = np.maximum(n, 0)
+     # Now n is in the range [0, inf).
+     max_exact = num_buckets // 2
+     is_small = (n < max_exact)
+     val_if_large = max_exact + (
+         np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps) /
+         np.log(max_distance / max_exact) *
+         (num_buckets - max_exact)).astype(np.int32)
+     val_if_large = np.minimum(val_if_large, num_buckets - 1)
+     ret += np.where(is_small, n, val_if_large)
+     return ret
+
+   @nn.compact
+   def __call__(self, num_queries, num_keys, offset: Optional[int] = None,
+                bidirectional=True):
+     """Produce relative position embedding attention biases.
+
+     Args:
+       num_queries: Number of queries.
+       num_keys: Number of keys.
+       offset: Offset of the first query with respect to the first key.
+         (See position.relative_positions() for more info.)
+       bidirectional: whether to allow positive memory-query relative position
+         embeddings.
+
+     Returns:
+       output: `(1, num_heads, num_queries, num_keys)` attention bias
+     """
+
+     # Find the distance between each query and each key.
+     # This is where this implementation differs from the T5 implementation;
+     # this version lines the /last/ N queries up with the /last/ N keys
+     # (which is appropriate for XL/sliding window), while the T5 version
+     # lines up the /first/ N queries with the first N keys, in cases where
+     # the number of keys and queries differ.
+     relative_position = position.relative_positions_np(
+         num_queries=num_queries, num_keys=num_keys, offset=offset)
+
+     rp_bucket = self._relative_position_bucket(
+         relative_position,
+         bidirectional=bidirectional,
+         num_buckets=self.num_buckets,
+         max_distance=self.max_distance)
+     relative_attention_bias = self.param('rel_embedding', self.embedding_init,
+                                          (self.num_heads, self.num_buckets),
+                                          jnp.float32)
+
+     relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
+     # Instead of using a slow gather, we create a leading-dimension one-hot
+     # array from rp_bucket and use it to perform the gather-equivalent via a
+     # contraction, i.e.:
+     # (num_head, num_buckets) x (num_buckets one-hot, num_queries, num_keys).
+     # This is equivalent to relative_attention_bias[:, rp_bucket].
+     bcast_iota = lax.broadcasted_iota(jnp.int32, (self.num_buckets, 1, 1), 0)
+     rp_bucket_one_hot = jnp.array(
+         rp_bucket[jnp.newaxis, ...] == bcast_iota, dtype=self.dtype)
+     # --> shape (num_heads, num_queries, num_keys)
+     values = lax.dot_general(
+         relative_attention_bias,
+         rp_bucket_one_hot,
+         (
+             ((1,), (0,)),  # rhs, lhs contracting dims
+             ((), ())))  # no batched dims
+     # Add a singleton batch dimension.
+     # --> shape (1, num_heads, num_queries, num_keys)
+     out = values[jnp.newaxis, ...]
+
+     return out
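+
+
+ # Worked example (hypothetical helper): bucket a few signed relative
+ # positions with 8 bidirectional buckets and max_distance=16. Distances with
+ # |n| < 2 get exact buckets, larger distances fall into log-spaced buckets,
+ # and "future" positions (positive relative_position) are offset into the
+ # upper half of the bucket range:
+ #   [-4, -1, 0, 1, 4]  ->  [2, 1, 0, 5, 6]
+ def _example_buckets() -> np.ndarray:
+   rel = np.array([-4, -1, 0, 1, 4], dtype=np.int32)
+   return T5RelativePositionBiases._relative_position_bucket(
+       rel, bidirectional=True, num_buckets=8, max_distance=16)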
aglib/meliad/transformer/synthetic_text_data.py ADDED
The diff for this file is too large to render. See raw diff
 
aglib/meliad/transformer/tasks.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Add Tasks to registry."""
+
+ import functools
+
+ from transformer import text_dataset
+ import seqio
+ import t5.data
+ from t5.data import preprocessors
+ import tensorflow as tf
+
+
+ TaskRegistry = seqio.TaskRegistry
+
+
+ def define_pg19_task(name: str, vocab: seqio.Vocabulary):
+   seqio.TaskRegistry.add(
+       name,
+       seqio.TfdsDataSource(
+           tfds_name="pg19:0.1.1"
+       ),
+       preprocessors=[
+           functools.partial(text_dataset.rekey_articles,
+                             rekey={"book_text": "targets"},
+                             keep={"book_title", "book_id",
+                                   "publication_date"}),
+           seqio.preprocessors.tokenize,
+       ],
+       output_features={
+           "targets": seqio.Feature(vocab,
+                                    add_eos=False, dtype=tf.int32),
+       }
+   )
+
+
+ T5_DEFAULT_VOCABULARY = t5.data.get_default_vocabulary()
+ define_pg19_task("pg19_bytes", seqio.ByteVocabulary())
+ define_pg19_task("pg19_tokens", T5_DEFAULT_VOCABULARY)
+
+
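+ # Usage sketch (hypothetical helper): once this module is imported, the
+ # registered tasks can be fetched from the seqio registry by name.
+ def _example_get_pg19_task() -> seqio.Task:
+   return seqio.get_mixture_or_task("pg19_tokens")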
aglib/meliad/transformer/text_dataset.py ADDED
@@ -0,0 +1,775 @@
+ # Copyright 2022 Google.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Load text datasets for long-range transformer models."""
+
+ import os
+ import re
+ from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Sequence, Set, Tuple, Union
+
+ from absl import flags
+ from absl import logging
+ import gin
+ import jax
+ from transformer import synthetic_text_data
+ import numpy as np
+ import seqio
+ import tensorflow.compat.v2 as tf
+
+
+
+ flags.DEFINE_string("default_data_dir", None,
+                     "Default directory where data is stored.")
+ FLAGS = flags.FLAGS
+
+
+ _DEFAULT_DATA_DIRECTORY = None
+
+
+ @gin.configurable
+ def set_default_data_directory(directory_name=None):
+   """Set the default directory where training data is located."""
+   global _DEFAULT_DATA_DIRECTORY
+   # If the data directory has been overridden with a command-line flag,
+   # use it. If not, then see if directory_name has been configured by Gin.
+   # Otherwise, use the default tfds directory.
+   if FLAGS.default_data_dir:
+     directory_name = FLAGS.default_data_dir
+   if directory_name is not None:
+     seqio.set_tfds_data_dir_override(directory_name)
+     _DEFAULT_DATA_DIRECTORY = directory_name
+
+
+ def get_iterator_function(dataset: Optional[tf.data.Dataset]):
+   """Returns a function which gets an iterator over the given dataset."""
+   if dataset is None:
+     return None
+   else:
+     return dataset.as_numpy_iterator
+
+
+ @gin.configurable
+ def get_loss_mask_tokens(
+     split: str,
+     loss_mask_start_tokens: Sequence[int] = (),
+     loss_mask_end_tokens: Sequence[int] = (),
+     splits: Sequence[str] = ("all",)
+ ) -> Tuple[Sequence[int], Sequence[int]]:
+   """Returns two token sequences to indicate start and end of the loss.
+
+   Please configure loss_mask_start_tokens, loss_mask_end_tokens, and
+   splits via gin. Example gin config to only apply loss between tokens 2
+   and 1 for the test set (and everywhere for any other data split):
+
+   ```
+   text_dataset.get_loss_mask_tokens:
+     loss_mask_start_tokens=(2,)
+     loss_mask_end_tokens=(1,)
+     splits=("test",)
+   ```
+
+   Args:
+     split: The mode ("test", "train", ...)
+     loss_mask_start_tokens: token sequence that starts the loss
+     loss_mask_end_tokens: token sequence that stops the loss
+     splits: Only compute the loss mask for splits in this list.
+       By default it is 'all', which is a reserved split string that applies
+       to all splits.
+   """
+   if "all" in splits or split in splits:
+     return loss_mask_start_tokens, loss_mask_end_tokens
+   return (), ()
+
+
+ @gin.configurable
+ def load_text_dataset(name: str,
+                       split: str,
+                       sequence_length: int,
+                       batch_size: int,
+                       sequential: bool = True,
+                       shard_dataset: bool = True,
+                       verbose: bool = False,
+                       ) -> Tuple[tf.data.Dataset, seqio.Vocabulary]:
+   """Load a text dataset of long articles or books, and split_and_batch them.
+
+   The input dataset must produce complete books or articles, where each
+   article is a dictionary containing a "tokens" field.
+   See split_and_batch for more information on the output dataset.
+
+   Args:
+     name: The name of the seqio task which produces the dataset.
+     split: The name of the split to use, e.g. "train" or "test".
+     sequence_length: Split text into sequences of this length.
+     batch_size: Draw from batch_size articles in each batch.
+     sequential: If True, return the chunks of each article in sequence.
+     shard_dataset: If True, split data set into shards.
+     verbose: Log (an excerpt of) every text example loaded from disk.
+       If False, will only print 1 excerpt every 60 seconds.
+
+   Returns:
+     (dataset, vocabulary)
+     where vocabulary is the seqio.Vocabulary which is used to encode
+     "targets".
+   """
+
+   logging.info("Loading text data set %s, split=%s, shape=(%d, %d)",
+                name, split, batch_size, sequence_length)
+
+   if name == "synthetic":
+     ds = synthetic_data_long(split, sequence_length, batch_size)
+     return (ds, seqio.PassThroughVocabulary(256, 0))
+   elif name == "synthetic_short":
+     ds = synthetic_data_short(split, sequence_length, batch_size)
+     return (ds, seqio.PassThroughVocabulary(256, 0))
+   elif name == "enwik8":
+     # TODO(delesley): Encapsulate enwik8 into a Task.
+     ds = load_enwik8(split, sequence_length, batch_size,
+                      data_dir=_DEFAULT_DATA_DIRECTORY)
+     return (ds, seqio.PassThroughVocabulary(256, 0))
+
+   # Bypass the seqio "feature converter", and get the task directly.
+   task = seqio.get_mixture_or_task(name)
+   vocab = task.output_features["targets"].vocabulary
+
+   # Create the task input pipeline.
+   if shard_dataset:
+     logging.info("Shards: %d of %d", jax.process_index(), jax.process_count())
+     shard_info = seqio.ShardInfo(index=jax.process_index(),
+                                  num_shards=jax.process_count())
+   else:
+     shard_info = None
+
+   if sequential:
+     task_seqlen = None  # We do our own splitting.
+     shuffle_buffer_size = 1000  # Number of full-length books.
+   else:
+     task_seqlen = {"targets": sequence_length}  # Ask the task to split.
+     shuffle_buffer_size = 10_000  # Number of chunks.
+
+   ds = task.get_dataset(
+       sequence_length=task_seqlen,
+       split=split,
+       use_cached=False,
+       shuffle=True,
+       shuffle_buffer_size=shuffle_buffer_size,
+       seed=None,
+       shard_info=shard_info,
+       num_epochs=1)
+
+   if sequence_length == 0:
+     return (ds, vocab)  # Don't chop into subsequences.
+
+   def extract_fn(article):
+     return article["targets"]
+
+   include_loss_mask = bool(get_loss_mask_tokens(split)[0])
+   ds = split_and_batch(ds,
+                        split=split,
+                        extract_fn=extract_fn,
+                        sequence_length=sequence_length,
+                        batch_size=batch_size,
+                        auto_rewind=True,
+                        vocab=vocab,
+                        include_loss_mask=include_loss_mask,
+                        verbose=verbose)
+   return (ds, vocab)
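+
+
+ # Usage sketch (hypothetical values; assumes the pg19 task from tasks.py has
+ # been registered and its data is available): load PG19 as 8 parallel
+ # streams of 4096-token segments, then pull one batch.
+ def _example_load_pg19():
+   ds, vocab = load_text_dataset("pg19_tokens", split="train",
+                                 sequence_length=4096, batch_size=8)
+   batch = next(ds.as_numpy_iterator())
+   return batch["targets"].shape, vocab  # ((8, 4096), ...)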
+
+
+ def rekey_articles(ds: tf.data.Dataset,
+                    rekey: Mapping[str, str],
+                    keep: Optional[Set[str]] = None) -> tf.data.Dataset:
+   """Rekey the articles in ds.
+
+   Fields in rekey will be renamed, fields in keep will be kept, and others
+   will be discarded. E.g., for PG19:
+
+   rekey_articles(ds,
+                  rekey={"book_text": "targets"},
+                  keep={"book_title", "book_id"})
+
+   Args:
+     ds: The dataset to rekey.
+     rekey: Dictionary which contains fields to rename.
+     keep: Set of fields to keep.
+
+   Returns:
+     A rekeyed dataset.
+   """
+
+   def rekey_fn(article):
+     result_dict = {}
+     for (k, v) in article.items():
+       if k in rekey:
+         result_dict[rekey[k]] = v
+       elif k in keep:
+         result_dict[k] = v
+     return result_dict
+
+   return ds.map(rekey_fn)
+
+
+ def pretty_print_article(article,
+                          vocab_map: Mapping[str, Optional[seqio.Vocabulary]],
+                          max_length: int = 60) -> str:
+   """Convert the contents of a long article to a short string."""
+   if not hasattr(article, "items"):
+     return pretty_print_value(article, max_length)  # Not a dictionary.
+   dstr = "{"
+   for (k, v) in article.items():
+     if vocab_map and k in vocab_map:
+       vstr = decode_tokens(v, vocab_map[k], max_length)
+     else:
+       vstr = pretty_print_value(v, max_length)
+     dstr += "\n  " + k + ": " + vstr
+   return dstr + "\n}"
+
+
+ def pretty_print_value(value, max_length: int) -> str:
+   """Convert a possibly large value to a short string."""
+   if isinstance(value, bytes):
+     if len(value) <= max_length:
+       return str(value)
+     else:
+       return f"bytes[{len(value)}] " + str(value[:max_length]) + "..."
+   elif isinstance(value, str):
+     if len(value) <= max_length:
+       return value
+     else:
+       return f"str[{len(value)}] " + value[:max_length] + "..."
+   elif isinstance(value, np.ndarray):
+     vstr = f"ndarray({value.shape}, {value.dtype.str})"
+     if value.size <= (max_length / 4):
+       vstr += " = " + str(value)
+     return vstr
+   elif np.ndim(value) == 0:
+     return str(value)  # Scalar data.
+   else:
+     return str(type(value))
+
+
+ def decode_tokens(tokens: Any, vocab: seqio.Vocabulary,
+                   max_length: int) -> str:
+   """Convert tokens to a human-readable string."""
+   if isinstance(tokens, np.ndarray):
+     tstr = f"ndarray({tokens.shape}, {tokens.dtype.str}) = "
+   else:
+     tstr = f"{str(type(tokens))} = "
+
+   if np.ndim(tokens) == 1:
+     tstr += decode_tokens_1d(tokens, vocab, max_length)
+   elif np.ndim(tokens) == 2:
+     jtstr = ",\n  ".join([decode_tokens_1d(s, vocab, max_length)
+                           for s in tokens])
+     tstr += f"[\n  {jtstr}\n  ]"
+   else:
+     tstr = pretty_print_value(tokens, max_length)
+   return tstr
+
+
+ def decode_tokens_1d(tokens: Any, vocab: Any, max_length: int,
+                      raw_string: bool = False) -> Union[str, bytes]:
+   """Convert a 1D array of tokens to a human-readable string.
+
+   Args:
+     tokens: 1-dimensional array of integers.
+     vocab: The vocabulary to detokenize the array.
+     max_length: The maximum number of tokens to detokenize.
+     raw_string: If True, return the string as bytes.
+       If False, pretty-print it (e.g. with "\n").
+
+   Returns:
+     The detokenized string.
+   """
+
+   assert np.ndim(tokens) == 1
+   # The type of tokens is np.ndarray((sequence_length,), "int32").
+   # We have to convert this to an actual list of python integers, NOT numpy
+   # integers, or decode will blow up, and fail to marshall the data to C++.
+   dtoks = [int(i) for i in tokens[:max_length]]
+   tstr = vocab.decode(dtoks)
+
+   # Convert the decoded string to a byte string.
+   # PassThroughVocabulary returns a list, not a string.
+   if isinstance(tstr, str):
+     tstr = bytes(tstr.encode("utf-8"))
+   else:
+     tstr = bytes(tstr)
+
+   # If raw_string, return immediately.
+   if raw_string:
+     return tstr
+
+   # Otherwise format it for pretty-printing.
+   # Converting bytes to str will render, e.g., newlines as "\n".
+   tstr = str(tstr)
+   if len(tokens) > max_length:
+     tstr += "..."
+   return tstr
+
+
+ def bytes_to_tokens(s: str):
+   """Convert a byte string to an array of integers."""
+   return np.fromiter((char for char in s), count=len(s), dtype=np.int32)
+
+
+ def pad_chunk(s: Optional[np.ndarray], sequence_length: int):
+   """Pad an array s out to the given sequence_length."""
+   if s is None:
+     return np.zeros(sequence_length, dtype=np.int32)
+   assert np.ndim(s) == 1
+   chunk_len = len(s)
+   assert chunk_len <= sequence_length
+   if chunk_len == sequence_length:
+     return s
+   else:
+     return np.pad(s, (0, sequence_length - chunk_len),
+                   mode="constant", constant_values=0)
+
+
+ def split_article(tokens: np.ndarray, sequence_length: int, split: str,
+                   include_loss_mask: bool) -> (
+                       Iterable[Tuple[np.ndarray, np.ndarray]]):
+   """Split an array into segments of length sequence_length."""
+   assert np.ndim(tokens) == 1
+   if include_loss_mask:
+     loss_mask = loss_mask_from_tokens(tokens, split)
+
+   for k in range(0, len(tokens), sequence_length):
+     segment = pad_chunk(tokens[k:k + sequence_length], sequence_length)
+     if include_loss_mask:
+       segment_loss_mask = pad_chunk(
+           loss_mask[k:k + sequence_length], sequence_length).astype(bool)
+     else:
+       segment_loss_mask = np.array(True, dtype=bool)  # Dummy mask.
+     yield (segment, segment_loss_mask)
+
+
+ def nonzero_tokens(tokens: np.ndarray,
+                    loss_mask: Optional[np.ndarray]) -> list[int]:
+   """Removes tokens that are not predicted by the model."""
+   # TODO(delesley): Fix the model so that it predicts the first token.
+   # The language model doesn't predict the first token.
+   toks = [int(tokens[i]) for i in range(1, len(tokens))
+           if (tokens[i] != 0 and (loss_mask is None or loss_mask[i]))]
+   return toks
+
+
+ def _find_subsequence_idxs(sequence: np.ndarray, subsequence: Sequence[int]):
+   """Returns the indices where `subsequence` occurs in `sequence`."""
+   subsequence = np.asarray(subsequence, dtype=np.int32)
+   # Use np.where as an efficient way to iterate over the whole array; but we
+   # can only test for a single token this way, unfortunately.
+   potential_matches = np.where(sequence == subsequence[0])[0]
+   match_indices = []
+   for start_index in potential_matches:
+     if np.array_equal(sequence[start_index:start_index + len(subsequence)],
+                       subsequence):
+       match_indices.append(start_index)
+   return match_indices
+
+
+ def loss_mask_from_tokens(tokens: np.ndarray, split: str) -> np.ndarray:
+   """Compute a mask for language modelling loss using start and end tokens."""
+   assert np.ndim(tokens) == 1
+   tokens = tokens.astype(np.int32)
+
+   # Position offset of loss mask and target positions. Typically -1, which
+   # indicates that targets are shifted 1 position left compared to inputs.
+   offset = -1
+
+   start_tokens, end_tokens = get_loss_mask_tokens(split=split)
+   if not start_tokens:
+     # Default to not masking out any loss.
+     return np.ones_like(tokens, dtype=bool)
+
+   start = 0
+   end = len(tokens)  # Include end_tokens.
+   start_indices = _find_subsequence_idxs(tokens, start_tokens)
+   if start_indices:
+     if end_tokens:
+       end_indices = _find_subsequence_idxs(tokens, end_tokens)
+     else:
+       end_indices = []
+     if len(start_indices) > 1 or len(end_indices) > 1:
+       logging.error("Multiple start or end tokens for loss mask: %s, %s",
+                     start_indices, end_indices)
+     start = start_indices[0]
+     if end_indices and end_indices[0] >= start:
+       end = end_indices[0]
+
+   # We include the start_tokens and the end_tokens, which represents that the
+   # model must predict the location, the content, and the end of the
+   # subsequence.
+   start += offset
+   start = max(0, start)  # To prevent offset creating negative indices.
+   end += len(end_tokens) + offset
+
+   # Create the actual mask. Roughly equivalent to:
+   #   mask = np.array([start <= i < end for i in range(len(tokens))])
+   mask = np.concatenate([
+       np.zeros((start,), dtype=bool),
+       np.ones((end - start,), dtype=bool),
+       np.zeros((len(tokens) - end,), dtype=bool)
+   ])
+   return mask
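+
+
+ # Worked example (hypothetical helper; assumes the gin config shown in
+ # get_loss_mask_tokens, i.e. start_tokens=(2,) and end_tokens=(1,)).
+ def _example_loss_mask() -> np.ndarray:
+   toks = np.array([5, 5, 2, 7, 7, 1, 5], dtype=np.int32)
+   # Start marker at index 2, end marker at index 5; with offset = -1 the
+   # result is [F, T, T, T, T, F, F], which covers *predicting* tokens 2..5
+   # (start marker, span, and end marker) under the one-position target shift.
+   return loss_mask_from_tokens(toks, split="test")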
423
+
424
+
425
+ def _batched_interleave_generator(
426
+ ds: tf.data.Dataset,
427
+ flat_map_func: Callable[[str], Iterable[Tuple[np.ndarray, np.ndarray]]],
428
+ post_map_func,
429
+ batch_size: int,
430
+ vocab: Optional[seqio.Vocabulary] = None,
431
+ include_loss_mask: bool = False,
432
+ auto_rewind: bool = False) -> Iterable[Dict[str, np.ndarray]]:
433
+ """Generator which combines the interleave and batch dataset operations.
434
+
435
+ Given a set of articles from ds, flat_map_func is mapped over the articles
436
+ to break each article up into an iterable of chunks and their loss masks.
437
+ The generator will return the examples from each article in sequential order,
438
+ for transformer-XL style models that process long articles over multiple
439
+ training steps.
440
+
441
+ Articles are combined into batches of size batch_size, where each example in
442
+ the batch is pulled from a different article. When one article ends, the
443
+ generator will start pulling examples from the next article. The overall
444
+ result is similar to tf.Data.Dataset.interleave, except that interleave does
445
+ not always maintain the same order of articles. If this generator starts
446
+ pulling from article "foo" as the 3rd item in the batch, then consecutive
447
+ examples from "foo" will remain as the 3rd item until the article ends. This
448
+ guarantee is necessary to pass state from one training step to the next.
449
+
450
+ If auto_rewind, then the generator will automatically grab a new iterator
451
+ from ds at the end of the epoch, and increment the epoch counter. Otherwise,
452
+ it will yield empty datasets until all articles in the batch have been
453
+ completed.
454
+
455
+ Args:
456
+ ds: A dataset of articles.
457
+ flat_map_func: A function which returns an iterator over chunks of tokens
458
+ and the loss masks associated with those tokens.
459
+ post_map_func: A function which post-processes each item to fixed size.
460
+ batch_size: The number of articles in a batch.
461
+ vocab: The vocabulary to detokenize strings and count characters.
462
+ include_loss_mask: If true, will return a loss mask with the tokens.
463
+ auto_rewind: Automatically rewind ds at end of epoch.
464
+
465
+ Yields:
466
+ Batches of consecutive examples from articles.
467
+ Each example has type: {
468
+ "targets": int32[batch_size, sequence_length],
469
+ "start_of_sequence": bool[batch_size],
470
+ "epoch": int32[batch_size],
471
+ "loss_mask": bool[batch_size, sequence_length],
472
+ }
473
+ """
474
+
475
+ ds_iter = ds.as_numpy_iterator()
476
+
477
+ document_start = [True] * batch_size # At start of each article.
478
+ readers = [None] * batch_size # Iterator for each article
479
+ still_reading = [True] * batch_size # End of current article?
480
+ item_epochs = [0] * batch_size # Epoch of the given item.
481
+ epoch = 0
482
+
483
+ # Main generator loop
484
+ while any(still_reading):
485
+ targets = [None] * batch_size
486
+ loss_mask = [None] * batch_size
487
+ for i in range(0, batch_size):
488
+ targets_i = None
489
+ loss_mask_i = None
490
+ while targets_i is None and still_reading[i]:
491
+ if readers[i] is not None:
492
+ try:
493
+ # Grab the next item from the article.
494
+ targets_i, loss_mask_i = next(readers[i])
495
+ except StopIteration:
496
+ # Article has ended; continue the while loop to grab a new one.
497
+ readers[i] = None
498
+ else:
499
+ # Grab the next article from ds if the current one has ended.
500
+ dsi = None
501
+ try:
502
+ dsi = iter(flat_map_func(next(ds_iter)))
503
+ except StopIteration:
504
+ logging.info("End of epoch %d.", epoch)
505
+ if auto_rewind:
506
+ epoch = epoch + 1
507
+ logging.info("Starting epoch %d.", epoch)
508
+ ds_iter = ds.as_numpy_iterator()
509
+ dsi = iter(flat_map_func(next(ds_iter)))
510
+ else:
511
+ still_reading[i] = False # No more articles on i
512
+ if dsi is not None:
513
+ # Start reading the new article.
514
+ # Continue while loop to grab the first chunk.
515
+ readers[i] = dsi
516
+ document_start[i] = True
517
+ item_epochs[i] = epoch
518
+
519
+ # post_map_func must handle None values, and return stackable np.arrays.
520
+ targets[i] = post_map_func(targets_i) # handles None
521
+ if include_loss_mask:
522
+ loss_mask[i] = post_map_func(loss_mask_i).astype(bool) # handles None
523
+
524
+ # If we've reached the end of all articles, stop immediately.
525
+ if not any(still_reading):
526
+ break
527
+
528
+ doc_start_orig = document_start.copy() # Return doc_start_orig.
529
+ for i in range(0, batch_size):
530
+ # Now that we've read an item, set /start/ to false for each reader.
531
+ document_start[i] = False
532
+
533
+ # Decode the tokenized segement back to characters, to count the number
534
+ # of characters for the bits-per-character computation.
535
+ num_chars = [0] * batch_size
536
+ nz_toks = [0] * batch_size
537
+ for i in range(0, batch_size):
538
+ lmask = loss_mask[i] if include_loss_mask else None
539
+ toks = nonzero_tokens(targets[i], lmask)
540
+ if vocab is not None:
541
+ bchars = decode_tokens_1d(toks, vocab, max_length=len(targets[i]),
542
+ raw_string=True)
543
+ num_chars[i] = len(bchars)
544
+ else:
545
+ num_chars[i] = len(toks)
546
+ nz_toks[i] = len(toks)
547
+
548
+ item = {
549
+ "targets": np.stack(targets),
550
+ "start_of_sequence": np.array(doc_start_orig),
551
+ "epoch": np.array(item_epochs),
552
+ "num_chars": np.stack(num_chars),
553
+ "nonzero_tokens": np.stack(nz_toks),
554
+ }
555
+ if include_loss_mask:
556
+ item["loss_mask"] = np.stack(loss_mask)
557
+ yield item
558
+
559
+
560
+ def split_and_batch(ds: tf.data.Dataset,
561
+ split: str,
562
+ extract_fn: Callable[[Any], Any],
563
+ sequence_length: int,
564
+ batch_size: int,
565
+ auto_rewind: bool = False,
566
+ vocab: Optional[seqio.Vocabulary] = None,
567
+ include_loss_mask: bool = False,
568
+ verbose: bool = False) -> tf.data.Dataset:
569
+ """Converts articles to tokens and chops and batches them.
570
+
571
+ See batched_interleave_generator for more details.
572
+
573
+ Args:
574
+ ds: A dataset of articles.
575
+ split: Which dataset split is to be computed, e.g. 'train'.
576
+ extract_fn: Return a sequence of tokens from article.
577
+ sequence_length: The number of tokens in each sequence.
578
+ batch_size: The number of examples in each batch.
579
+ auto_rewind: If True, will automatically rewind at end of epoch.
580
+ vocab: Vocabulary, used to count characters.
581
+ include_loss_mask: Return a loss mask for each batch.
582
+ verbose: Write article info to log as they are read.
583
+
584
+ Returns:
585
+ A dataset which yields examples of shape {
586
+ "targets": int32[batch_size, sequence_length],
587
+ "start_of_sequence": bool[batch_size],
588
+ "epoch": int32[batch_size],
589
+ "loss_mask": bool[batch_size, sequence_length],
590
+ "num_chars": A count of the number of detokenized characters.
591
+ "nonzero_tokens": A count of the number of nonzero predicted tokens.
592
+ }
593
+ """
594
+
595
+ # Tokenize article, compute loss mask, split into multiple chunks.
596
+ # The entire article must fit into memory.
597
+ def wrap_split_article(article):
598
+ if verbose:
599
+ logging.info("Reading article: %s", pretty_print_article(article, {}))
600
+ else:
601
+ logging.log_every_n_seconds(logging.INFO, "Reading article: %s", 60,
602
+ pretty_print_article(article, {}))
603
+ tokens = extract_fn(article)
604
+ if isinstance(tokens, (str, bytes)):
605
+ tokens = bytes_to_tokens(tokens)
606
+ elif isinstance(tokens, np.ndarray):
607
+ tokens = tokens.astype(np.int32)
608
+ else:
609
+ raise TypeError("Unusupported sequence type: %s" % str(type(tokens)))
610
+ return split_article(tokens, sequence_length, split=split,
611
+ include_loss_mask=include_loss_mask)
612
+
613
+ # Handle None values.
614
+ def wrap_pad_chunk(s):
615
+ return pad_chunk(s, sequence_length)
616
+
617
+ def wrap_batched_interleave_generator():
618
+ return _batched_interleave_generator(ds,
619
+ flat_map_func=wrap_split_article,
620
+ post_map_func=wrap_pad_chunk,
621
+ batch_size=batch_size,
622
+ vocab=vocab,
623
+ include_loss_mask=include_loss_mask,
624
+ auto_rewind=auto_rewind)
625
+
626
+ out_sig = {
627
+ "targets": tf.TensorSpec(shape=(batch_size, sequence_length),
628
+ dtype=tf.int32),
629
+ "start_of_sequence": tf.TensorSpec(shape=(batch_size,), dtype=tf.bool),
630
+ "epoch": tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
631
+ "num_chars": tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
632
+ "nonzero_tokens": tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
633
+ }
634
+ if include_loss_mask:
635
+ out_sig["loss_mask"] = tf.TensorSpec(shape=(batch_size, sequence_length),
636
+ dtype=tf.bool)
637
+
638
+ cds = tf.data.Dataset.from_generator(wrap_batched_interleave_generator,
639
+ output_signature=out_sig)
640
+ return cds
641
+
642
+
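+ # Example usage (a minimal sketch; `articles_ds` is a hypothetical
+ # tf.data.Dataset of {"targets": bytes} articles, as produced below):
+ #
+ #   batched_ds = split_and_batch(articles_ds,
+ #                                split="train",
+ #                                extract_fn=_targets_to_tokens,
+ #                                sequence_length=4096,
+ #                                batch_size=8,
+ #                                auto_rewind=True)
+ #   for batch in batched_ds.as_numpy_iterator():
+ #     print(batch["targets"].shape)  # (8, 4096)
+ #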
643
+ def merge_articles(article_starts_ends, sequence_length):
644
+ """Merge consecutive articles if their combined length < sequence_length."""
645
+ cs = 0
646
+ ce = 0
647
+ for (s, e) in article_starts_ends:
648
+ if ce == 0:
649
+ ce = s
650
+ if (e - cs) > sequence_length:
651
+ if ce > cs:
652
+ # print("Yield: ", cs, " to ", ce)
653
+ yield (cs, ce) # Yield prior merged articles
654
+ cs = s # Reset to start of current article
655
+ ce = e
656
+ else:
657
+ ce = e # Merge article with current set.
658
+ # print("Article: ", s, " to ", e)
659
+ if ce > 0:
660
+ # print("Yield: ", cs, " to ", ce)
661
+ yield (cs, ce) # Yield final merged set.
662
+
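+ # Worked example (sketch): with sequence_length=10 and article spans
+ # [(0, 4), (4, 8), (8, 20)], the first two articles merge into (0, 8)
+ # because 8 - 0 <= 10, but (8, 20) is too long to join them, so the
+ # generator yields (0, 8) and then (8, 20):
+ #
+ #   list(merge_articles([(0, 4), (4, 8), (8, 20)], 10))
+ #   # => [(0, 8), (8, 20)]
+ #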
663
+
664
+ def _targets_to_tokens(article):
665
+ return bytes_to_tokens(article["targets"])
666
+
667
+
668
+ def _wrap_text_in_dict(text):
669
+ return {"targets": text}
670
+
671
+
672
+ # ---------------------
673
+
674
+ def load_enwik8(split: str,
675
+ sequence_length: int,
676
+ batch_size: int,
677
+ data_dir: str) -> tf.data.Dataset:
678
+ """Load the enwik8 dataset, partitioning into articles."""
679
+
680
+ if data_dir is None:
681
+ raise ValueError("Must specify a data directory for enwik8")
682
+
683
+ filename = os.path.join(data_dir, "enwik8")
684
+ filename = os.path.join(filename, "enwik8_" + split)
685
+
686
+ # Don't attempt to split the data, just shuffle it differently for
687
+ # each worker.
688
+ local_seed = 42 + jax.process_index()
689
+
690
+ logging.info("Enwik8: reading %s", filename)
691
+ with gfile.Open(filename, "r") as f:
692
+ text_data = f.read()
693
+
694
+ logging.info("Enwik8: parsing %s", filename)
695
+ article_starts = [m.start(0) for m in re.finditer("<page>", text_data)]
696
+ article_ends = article_starts[1:] + [len(text_data)]
697
+ logging.info("Enwik8: found %d articles.", len(article_starts))
698
+
699
+ merged_se = merge_articles(zip(article_starts, article_ends),
700
+ sequence_length)
701
+ articles = [text_data[s:e] for (s, e) in merged_se]
702
+ num_articles = len(articles)
703
+ logging.info("Enwik8: merged into %d articles.", num_articles)
704
+
705
+ logging.info("Building dataset.")
706
+ ds = tf.data.Dataset.from_tensor_slices(articles)
707
+ ds = ds.map(_wrap_text_in_dict)
708
+ ds = ds.shuffle(num_articles, reshuffle_each_iteration=True, seed=local_seed)
709
+ if sequence_length == 0:
710
+ return ds # Don't split and batch
711
+
712
+ return split_and_batch(ds,
713
+ split=split,
714
+ extract_fn=_targets_to_tokens,
715
+ sequence_length=sequence_length,
716
+ batch_size=batch_size,
717
+ auto_rewind=True,
718
+ verbose=False)
719
+
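+ # Example (sketch; the directory is hypothetical and must contain the
+ # raw "enwik8/enwik8_train" file):
+ #
+ #   ds = load_enwik8(split="train", sequence_length=4096, batch_size=8,
+ #                    data_dir="/path/to/data")
+ #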
720
+ # ---------------------
721
+
722
+
723
+ def synthetic_data_short(split: str,
724
+ sequence_length: int,
725
+ batch_size: int,
726
+ auto_rewind: bool = True) -> tf.data.Dataset:
727
+ """Return a synthetic data set of sequences."""
728
+
729
+ strings = [
730
+ b"The quick brown fox jumped over the lazy dog.",
731
+ b"Humpty dumpty sat on a wall and had a great fall and went splat.",
732
+ b"She sells sea shells by the sea shore.",
733
+ b"Peter piper picked a peck of pickled peppercorns."
734
+ ]
735
+ logging.info("Building synthetic dataset (short).")
736
+ ds = tf.data.Dataset.from_tensor_slices(strings)
737
+ ds = ds.map(_wrap_text_in_dict)
738
+ ds = ds.shuffle(4, reshuffle_each_iteration=True, seed=42)
739
+ if sequence_length == 0:
740
+ return ds # Don't split and batch
741
+
742
+ return split_and_batch(ds,
743
+ split=split,
744
+ extract_fn=_targets_to_tokens,
745
+ sequence_length=sequence_length,
746
+ batch_size=batch_size,
747
+ auto_rewind=auto_rewind,
748
+ verbose=False)
749
+
750
+
751
+ def synthetic_data_long(split: str,
752
+ sequence_length: int,
753
+ batch_size: int,
754
+ auto_rewind: bool = True) -> tf.data.Dataset:
755
+ """Returns a synthetic data set with several long articles."""
756
+ articles = [
757
+ synthetic_text_data.text1_illiad_book1,
758
+ synthetic_text_data.text2_huckleberry_finn,
759
+ synthetic_text_data.text3_call_of_the_wild,
760
+ synthetic_text_data.text4_the_prince
761
+ ]
762
+ logging.info("Building synthetic dataset (long).")
763
+ ds = tf.data.Dataset.from_tensor_slices(articles)
764
+ ds = ds.map(_wrap_text_in_dict)
765
+ ds = ds.shuffle(4, reshuffle_each_iteration=True, seed=42)
766
+ if sequence_length == 0:
767
+ return ds # Don't split and batch
768
+
769
+ return split_and_batch(ds,
770
+ split=split,
771
+ extract_fn=_targets_to_tokens,
772
+ sequence_length=sequence_length,
773
+ batch_size=batch_size,
774
+ auto_rewind=auto_rewind,
775
+ verbose=False)
aglib/meliad/transformer/transformer_base.py ADDED
@@ -0,0 +1,451 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Base class for transformer layers."""
16
+
17
+ from typing import Any, Callable, Optional, Tuple
18
+
19
+ from absl import logging
20
+
21
+ from flax import linen as nn
22
+ import gin
23
+ import jax
24
+ import jax.numpy as jnp
25
+
26
+
27
+ from transformer import nn_components
28
+
29
+
30
+ Array = Any
31
+
32
+ # Tuple of scale factors
33
+ AttnScaleTuple = Tuple[Optional[Array], Optional[Array]]
34
+
35
+ # Tuple of keys,values,queries
36
+ KVQTuple = Tuple[Array, Array, Optional[Array], Optional[Array]]
37
+
38
+
39
+ @gin.configurable
40
+ class KVQLayer(nn.Module):
41
+ """Generate keys, values, and queries for attention."""
42
+
43
+ embedding_size: int
44
+ num_heads: int
45
+ head_size: int
46
+ has_queries: bool = True
47
+ has_queries2: bool = False # For cross-attention, e.g. decoder or recurrence.
48
+
49
+ normalize_keys: bool = True # Normalize keys and queries.
50
+ num_position_embeddings: int = 0 # Learned absolute position embeddings.
51
+ pre_attn_dropout: bool = True
52
+ dropout_rate: float = 0.0
53
+ dtype: Any = jnp.float32
54
+
55
+ def setup(self):
56
+ kernel_init = nn.initializers.variance_scaling(
57
+ scale=1.0, mode="fan_in", distribution="truncated_normal")
58
+
59
+ # Project to keys,values,queries
60
+ # Disable bias. This prevents a failure mode whereby the attention matrix
61
+ # can become filled with very large uniform values, due to high bias.
62
+ self.keys_layer = nn.Dense(
63
+ features=self.num_heads * self.head_size,
64
+ use_bias=False, # No bias for keys.
65
+ kernel_init=kernel_init,
66
+ dtype=self.dtype)
67
+ self.values_layer = nn.Dense(
68
+ features=self.num_heads * self.head_size,
69
+ use_bias=False, # No bias for values.
70
+ kernel_init=kernel_init,
71
+ dtype=self.dtype)
72
+ if self.has_queries:
73
+ self.queries_layer = nn.Dense(
74
+ features=self.num_heads * self.head_size,
75
+ use_bias=False, # No bias for queries.
76
+ kernel_init=kernel_init,
77
+ dtype=self.dtype)
78
+ if self.has_queries2:
79
+ self.queries2_layer = nn.Dense(
80
+ features=self.num_heads * self.head_size,
81
+ use_bias=False, # No bias for queries.
82
+ kernel_init=kernel_init,
83
+ dtype=self.dtype)
84
+
85
+ # When normalizing keys and queries, attention must be scaled with
86
+ # learned parameters.
87
+ if self.normalize_keys:
88
+ self.attention_scale = self.param("attention_scale",
89
+ jax.nn.initializers.ones,
90
+ (self.num_heads,), jnp.float32)
91
+
92
+ # Learned position embeddings for absolute positions.
93
+ if self.num_position_embeddings > 0:
94
+ # Embeddings for query elements.
95
+ self.position_embeddings = self.param(
96
+ "position_embeddings",
97
+ jax.nn.initializers.normal(stddev=1.0),
98
+ (self.num_position_embeddings, self.embedding_size),
99
+ jnp.float32)
100
+
101
+ # Layernorm
102
+ self.pre_attn_layernorm = nn_components.LayerNorm()
103
+
104
+ def attention_scale_factor(self) -> Optional[Array]:
105
+ """Returns the attention scale, when keys and queries are normalized."""
106
+ if self.normalize_keys:
107
+ return jnp.asarray(self.attention_scale, dtype=self.dtype)
108
+ else:
109
+ return None
110
+
111
+ def _get_dropout_rng(self):
112
+ return self.make_rng("dropout")
113
+
114
+ def _normalize_kq(self, kq: Array) -> Array:
115
+ """Normalize function for keys and queries."""
116
+ epsilon = jnp.array(1.0e-6, dtype=self.dtype)
117
+ kq_sum_sqr = jnp.sum(jnp.square(kq), axis=-1, keepdims=True)
118
+ norm_kq = kq * jax.lax.rsqrt(kq_sum_sqr + epsilon)
119
+ return jnp.asarray(norm_kq, dtype=self.dtype)
120
+
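+ # Normalization sketch: each key/query vector is scaled to (approximately)
+ # unit L2 norm along the head dimension, so attention logits become cosine
+ # similarities scaled by the learned attention_scale. For example
+ # (assuming float32 inputs):
+ #
+ #   kq = jnp.array([[3.0, 4.0]])  # norm 5
+ #   kq * jax.lax.rsqrt(jnp.sum(kq**2, axis=-1, keepdims=True) + 1e-6)
+ #   # => approx [[0.6, 0.8]]
+ #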
121
+ def __call__(self, xs: Array, deterministic: bool = False) -> KVQTuple:
122
+ """Takes a sequence of embeddings as input, and returns keys,values,queries.
123
+
124
+ First applies pre_attn layernorm, then adds learned positional embeddings
126
+ (if any), and finally applies pre_attn dropout.
126
+ Return (keys, values, queries, queries2).
127
+
128
+ Args:
129
+ xs: input sequence of shape (batch_size, sequence_length, embedding_size)
130
+ deterministic: if False, apply dropout.
131
+
132
+ Returns:
133
+ (keys, values, queries, queries2) of shape
134
+ (batch_size, sequence_length, num_heads, head_size)
135
+ """
136
+
137
+ # Project inputs to (keys, values, queries).
138
+ (batch_size, num_keys, _) = xs.shape
139
+ drop_tile_shape = (1, 128, self.embedding_size)
140
+
141
+ # Apply layernorm to input, rather than the output.
142
+ # This provides better gradients through the resnet, and also avoids
143
+ # the need for a prolonged warmup phase (https://arxiv.org/abs/2002.04745)
144
+
145
+ # Layernorm for self-attention.
146
+ logging.info("kvq: pre_attn xs = %r", xs)
147
+ xs = jnp.asarray(xs, dtype=self.dtype)
148
+ xs = self.pre_attn_layernorm(xs)
149
+
150
+ # Add (optional) learned position embeddings.
151
+ if self.num_position_embeddings > 0:
152
+ assert xs.ndim == 3 # (b, sequence_length, embedding_size)
153
+ assert xs.shape[-2] == self.num_position_embeddings
154
+ logging.info("kvq: learned positions.")
155
+ xs_pos = jnp.asarray(self.position_embeddings, dtype=self.dtype)
156
+ xs_pos = jnp.expand_dims(xs_pos, 0) # Add batch dimension.
157
+ xs = xs + xs_pos
158
+
159
+ # Pre-attention dropout.
160
+ if self.pre_attn_dropout:
161
+ logging.info("kvq: pre_attn dropout.")
162
+ xs = nn_components.tiled_dropout(xs, drop_tile_shape, self.dropout_rate,
163
+ rng_function=self._get_dropout_rng,
164
+ deterministic=deterministic)
165
+
166
+ # Compute keys and values.
167
+ keys = self.keys_layer(xs) # (b, num_keys, num_heads * head_size)
168
+ values = self.values_layer(xs)
169
+
170
+ # Compute queries and cross-attention queries if necessary.
171
+ if self.has_queries:
172
+ queries = self.queries_layer(xs) # (b, num_keys, n_heads * head_size)
173
+ logging.info("kvq: queries = %r", queries)
174
+ else:
175
+ queries = None
176
+ if self.has_queries2:
177
+ queries2 = self.queries2_layer(xs) # (b, num_keys, n_heads * head_size)
178
+ logging.info("kvq: queries2 = %r", queries2)
179
+ else:
180
+ queries2 = None
181
+
182
+ # Reshape to split num_heads, head_size into separate dimensions.
183
+ kv_shape = (batch_size, num_keys, self.num_heads, self.head_size)
184
+ keys = jnp.reshape(keys, kv_shape)
185
+ values = jnp.reshape(values, kv_shape)
186
+ if queries is not None:
187
+ queries = jnp.reshape(queries, kv_shape)
188
+ if queries2 is not None:
189
+ queries2 = jnp.reshape(queries2, kv_shape)
190
+
191
+ if self.normalize_keys:
192
+ # Normalize both keys and queries.
193
+ # The learned attention_scale_factors() will return non-None.
194
+ logging.info("kvq: normalize keys, queries.")
195
+ keys = self._normalize_kq(keys)
196
+ if queries is not None:
197
+ queries = self._normalize_kq(queries)
198
+ if queries2 is not None:
199
+ queries2 = self._normalize_kq(queries2)
200
+ else:
201
+ # Scale queries by 1 / sqrt(d) when using unnormalized keys,queries.
202
+ d_scale = jax.lax.rsqrt(float(self.head_size)).astype(self.dtype)
203
+ logging.info("kvq: scale queries by 1/sqrt(d).")
204
+ if queries is not None:
205
+ queries = queries * d_scale
206
+ if queries2 is not None:
207
+ queries2 = queries2 * d_scale
208
+
209
+ # Return keys, values, and queries.
210
+ return (keys, values, queries, queries2)
211
+
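+ # Example shape check (a sketch with toy sizes; `rng` is assumed to be a
+ # jax.random.PRNGKey):
+ #
+ #   layer = KVQLayer(embedding_size=64, num_heads=4, head_size=16)
+ #   xs = jnp.zeros((2, 8, 64))  # (batch, sequence_length, embedding_size)
+ #   params = layer.init({"params": rng}, xs, deterministic=True)
+ #   (k, v, q, q2) = layer.apply(params, xs, deterministic=True)
+ #   # k.shape == v.shape == q.shape == (2, 8, 4, 16); q2 is None.
+ #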
212
+
213
+ @gin.configurable
214
+ class TransformerBase(nn.Module):
215
+ """TransformerBase implements everything except attention.
216
+
217
+ It handles:
218
+ - Projection to (keys, values, queries) before attention.
219
+ - Projection MLP back to embedding_size after attention.
220
+ - Final FFN layer.
221
+ - layernorm, dropout, and normalization of keys and queries.
222
+
223
+ This functionality is encapsulated here so that it can be reused with more
224
+ complicated attention mechanisms.
225
+ """
226
+
227
+ # Options set by parent module.
228
+ mode: str
229
+ embedding_size: int
230
+ num_heads: int
231
+ head_size: int
232
+
233
+ cross_attention_q: bool = False # Additional q for cross-attention.
234
+ cross_attention_kv: bool = False # Additional kv for cross-attention.
235
+ num_position_embeddings: int = 0 # Learned position embeddings.
236
+ num_cross_position_embeddings: int = 0 # Learned position embeddings.
237
+
238
+ # Configurable hyperparameters.
239
+ attn_mlp_factory: Callable[[int], nn.Module] = gin.REQUIRED
240
+ ffn_factory: Callable[[int], nn.Module] = gin.REQUIRED
241
+ gate_type: str = "residual"
242
+ single_gate: bool = False
243
+ skip_ffn: bool = False
244
+
245
+ normalize_keys: bool = True
246
+ dropout_rate: float = 0.0
247
+ pre_attn_dropout: bool = True
248
+ post_attn_dropout: bool = False
249
+ pre_ffn_dropout: bool = False
250
+ post_ffn_dropout: bool = True
251
+
252
+ dtype: Any = jnp.float32
253
+
254
+ def is_training(self) -> bool:
255
+ return self.mode == "train"
256
+
257
+ def _get_dropout_rng(self):
258
+ return self.make_rng("dropout")
259
+
260
+ def _normalize_kq(self, kq: Array) -> Array:
261
+ """Normalize function for keys and queries."""
262
+ epsilon = jnp.array(1.0e-6, dtype=self.dtype)
263
+ kq_sum_sqr = jnp.sum(jnp.square(kq), axis=-1, keepdims=True)
264
+ norm_kq = kq * jax.lax.rsqrt(kq_sum_sqr + epsilon)
265
+ return jnp.asarray(norm_kq, dtype=self.dtype)
266
+
267
+ def setup(self):
268
+ # Keys,values,queries for self-attention; queries for cross-attention.
269
+ self._kvq = KVQLayer(self.embedding_size, self.num_heads, self.head_size,
270
+ has_queries=True,
271
+ has_queries2=self.cross_attention_q,
272
+ num_position_embeddings=self.num_position_embeddings,
273
+ normalize_keys=self.normalize_keys,
274
+ pre_attn_dropout=self.pre_attn_dropout,
275
+ dropout_rate=self.dropout_rate,
276
+ dtype=self.dtype)
277
+
278
+ # Keys,values, attention_scale for cross-attention.
279
+ if self.cross_attention_kv:
280
+ # Use a full kvq layer, with layernorm and attention scale.
281
+ self._cross_kv = KVQLayer(
282
+ self.embedding_size, self.num_heads, self.head_size,
283
+ has_queries=False,
284
+ has_queries2=False,
285
+ num_position_embeddings=self.num_cross_position_embeddings,
286
+ normalize_keys=self.normalize_keys,
287
+ pre_attn_dropout=self.pre_attn_dropout,
288
+ dropout_rate=self.dropout_rate,
289
+ dtype=self.dtype)
290
+ elif self.cross_attention_q:
291
+ # No separate keys,values for cross-attention, but we may still need
292
+ # cross-attention-scale, so we create our own.
293
+ assert self.num_cross_position_embeddings == 0
294
+ if self.normalize_keys:
295
+ self.attention_scale2 = self.param("attention_scale2",
296
+ jax.nn.initializers.ones,
297
+ (self.num_heads,), jnp.float32)
298
+
299
+ # Post-attention linear projection.
300
+ if not self.single_gate:
301
+ self.post_attn_mlp = self.attn_mlp_factory(
302
+ self.embedding_size,
303
+ gate_type=self.gate_type,
304
+ final_activation=None,
305
+ dtype=self.dtype) # pytype: disable=wrong-keyword-args # trace-all-classes
306
+
307
+ # Final FFN.
308
+ if not self.skip_ffn:
309
+ self.ffn = self.ffn_factory(
310
+ self.embedding_size,
311
+ gate_type=self.gate_type,
312
+ final_activation=("tanh" if self.single_gate else None),
313
+ dtype=self.dtype) # pytype: disable=wrong-keyword-args # trace-all-classes
314
+
315
+ # Layernorm.
316
+ self.pre_ffn_layernorm = nn_components.LayerNorm()
317
+
318
+ def force_init(self, xs: Array):
319
+ """Force flax initialization of self, prior to use with lax.scan.
320
+
321
+ Args:
322
+ xs: The input sequence that the module will be called with.
323
+ """
324
+ logging.info("tbase: Begin forced initialization.")
325
+ _ = self.kvq(xs)
326
+ batch_size = xs.shape[0]
327
+ seq_len = xs.shape[1]
328
+ attn_ys_shape = (batch_size, seq_len, self.num_heads, self.head_size)
329
+ dummy_attn_ys = jnp.zeros(attn_ys_shape, dtype=self.dtype)
330
+ if self.cross_attention_kv or self.cross_attention_q:
331
+ dummy_cross_attn_ys = dummy_attn_ys
332
+ else:
333
+ dummy_cross_attn_ys = None
334
+ _ = self.post_attn_ffn(xs, dummy_attn_ys, dummy_cross_attn_ys)
335
+ logging.info("tbase: End forced initialization.")
336
+
337
+ def attention_scale_factors(self) -> AttnScaleTuple:
338
+ """Returns the attention scales, when keys and queries are normalized.
339
+
340
+ Returns: (scale for kv (i.e. queries), scale for cross_kv (i.e. queries2))
341
+ """
342
+ sfactor = self._kvq.attention_scale_factor()
343
+ if self.cross_attention_kv:
344
+ cross_sfactor = self._cross_kv.attention_scale_factor()
345
+ elif self.cross_attention_q and self.normalize_keys:
346
+ cross_sfactor = jnp.asarray(self.attention_scale2, dtype=self.dtype)
347
+ else:
348
+ cross_sfactor = None
349
+ return (sfactor, cross_sfactor)
350
+
351
+ def kvq(self, xs: Array) -> KVQTuple:
352
+ enable_dropout = self.pre_attn_dropout and self.is_training()
353
+ return self._kvq(xs, deterministic=not enable_dropout)
354
+
355
+ def cross_kv(self, xs: Array) -> Tuple[Array, Array]:
356
+ assert self.cross_attention_kv
357
+ enable_dropout = self.pre_attn_dropout and self.is_training()
358
+ (k, v, _, _) = self._cross_kv(xs, deterministic=not enable_dropout)
359
+ return (k, v)
360
+
361
+ def post_attn_ffn(self, xs: Array, attn_ys: Array,
362
+ cross_attn_ys: Optional[Array]) -> Array:
363
+ """Combines the output of attention with the original input sequence.
364
+
365
+ Post-attn MLP on attn_ys, followed by resnet/gate.
366
+ Pre-FFN layernorm and dropout, then the FFN layer, followed by resnet/gate.
367
+
368
+ Args:
369
+ xs: Original input sequence of shape
370
+ (batch_size, sequence_length, embedding_size)
371
+ attn_ys: Output of the self-attention module, of shape
372
+ (batch_size, sequence_length, num_heads, head_size)
373
+ cross_attn_ys: Output of the cross-attention module, of shape
374
+ (batch_size, sequence_length, num_heads, head_size)
375
+
376
+ Returns:
377
+ Array of shape (batch_size, sequence_length, embedding_size)
378
+ """
379
+
380
+ (batch_size, sequence_length, _) = xs.shape
381
+ assert attn_ys.shape == (batch_size, sequence_length,
382
+ self.num_heads, self.head_size)
383
+ no_dropout = not self.is_training()
384
+ drop_tile_shape = (1, 128, self.embedding_size)
385
+
386
+ # Concatenate cross-attention and self-attention results.
387
+ if cross_attn_ys is not None:
388
+ # Concatenate self-attention and cross-attention results, before
389
+ # applying the projection layer.
390
+ logging.info("tbase: using cross-attention.")
391
+ assert attn_ys.shape == (batch_size, sequence_length,
392
+ self.num_heads, self.head_size)
393
+ attn_ys = jnp.concatenate([attn_ys, cross_attn_ys], axis=2)
394
+ att_ys_num_heads = self.num_heads * 2
395
+ else:
396
+ # Only use self-attention.
397
+ att_ys_num_heads = self.num_heads
398
+
399
+ logging.info("tbase: attn_ys = %r", attn_ys)
400
+ attn_ys = attn_ys.reshape(
401
+ (batch_size, sequence_length, att_ys_num_heads * self.head_size))
402
+
403
+ if self.single_gate:
404
+ logging.info("tbase: single gate.")
405
+ assert not self.skip_ffn
406
+ # Skip post-attention linear projection and residual connection.
407
+ ys_hidden = xs # The FFN (below) will be gated onto xs (the input).
408
+ ffn_in = attn_ys # The input to the FFN is the output of attention.
409
+ else:
410
+ logging.info("tbase: post-attention MLP.")
411
+ # Standard transformer architecture.
412
+ # The post-attention MLP applies a linear projection to project attn_ys
413
+ # to embedding space. It then uses a residual connection or gate to
414
+ # combine the projection with xs. Post-attention dropout is applied
415
+ # before the residual/gate.
416
+ post_attn_ys = self.post_attn_mlp(
417
+ attn_ys, xs,
418
+ apply_dropout=self.post_attn_dropout and not no_dropout,
419
+ dropout_rate=self.dropout_rate,
420
+ drop_tile_shape=drop_tile_shape,
421
+ rng_function=self._get_dropout_rng)
422
+
423
+ # The FFN (below) will be gated onto post_attn_ys (which gates onto xs).
424
+ ys_hidden = post_attn_ys
425
+ if self.skip_ffn:
426
+ logging.info("tbase: skip final FFN. ys = %r", ys_hidden)
427
+ return ys_hidden
428
+
429
+ # The input to the FFN; Layernorm is applied before the FFN.
430
+ ffn_in = self.pre_ffn_layernorm(ys_hidden)
431
+ logging.info("tbase: pre-FFN layernorm = %r", ffn_in)
432
+
433
+ # Pre-FFN dropout.
434
+ if self.pre_ffn_dropout:
435
+ logging.info("tbase: pre-FFN dropout.")
436
+ ffn_in = nn_components.tiled_dropout(
437
+ ffn_in, drop_tile_shape, self.dropout_rate,
438
+ rng_function=self._get_dropout_rng, deterministic=no_dropout)
439
+
440
+ # FFN layer.
441
+ # Large MLP with hidden layers followed by residual connection or gate.
442
+ # The MLP will apply post-ffn dropout before the gate.
443
+ logging.info("tbase: final FFN")
444
+ ys = self.ffn(ffn_in, ys_hidden,
445
+ apply_dropout=self.post_ffn_dropout and not no_dropout,
446
+ dropout_rate=self.dropout_rate,
447
+ drop_tile_shape=drop_tile_shape,
448
+ rng_function=self._get_dropout_rng)
449
+
450
+ logging.info("tbase: ys = %r", ys)
451
+ return ys
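+ # Typical call pattern (sketch): an attention layer built on TransformerBase
+ # projects the inputs, runs attention (not provided here), then projects
+ # back to embedding space; `my_attention_fn` is a hypothetical stand-in:
+ #
+ #   (keys, values, queries, queries2) = tbase.kvq(xs)
+ #   attn_ys = my_attention_fn(keys, values, queries)
+ #   ys = tbase.post_attn_ffn(xs, attn_ys, None)
+ #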
aglib/meliad/transformer/transformer_layer.py ADDED
@@ -0,0 +1,817 @@
1
+ # Copyright 2022 Google.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A single transformer layer."""
16
+
17
+ from typing import Any, Mapping, NewType, Optional, Sequence, Tuple
18
+
19
+ from absl import logging
20
+
21
+ from flax import linen as nn
22
+ import gin
23
+
24
+ import jax
25
+ import jax.numpy as jnp
26
+
27
+ from transformer import attention
28
+ from transformer import memory_factory
29
+ from transformer import nn_components
30
+ from transformer import position
31
+ from transformer import position_fourier
32
+ from transformer import position_t5
33
+ from transformer import transformer_base
34
+
35
+
36
+ Array = jnp.ndarray
37
+ DecoderState = NewType("DecoderState", Mapping[str, Array])
38
+ WindowState = Optional[Tuple[attention.KVITuple, Array]]
39
+ KVITuple = attention.KVITuple
40
+
41
+
42
+ @gin.configurable
43
+ class TransformerLayer(nn.Module):
44
+ """Full transformer layer, with attention."""
45
+
46
+ # Set by DecoderStack
47
+ mode: str
48
+ batch_size: int
49
+ embedding_size: int
50
+ cross_attention: bool = False
51
+ recurrent_attention: bool = False
52
+ memory: Optional[memory_factory.MemoryManager] = None
53
+
54
+ # Configurable hyper-parameters
55
+ num_heads: int = gin.REQUIRED
56
+ head_size: int = gin.REQUIRED
57
+
58
+ window_length: int = gin.REQUIRED
59
+ use_long_xl_architecture: bool = True
60
+ max_unrolled_windows: int = -1 # Always unroll.
61
+ relative_position_type: Optional[str] = "fourier"  # {None, "fourier", "t5", "rotary"}
62
+ use_causal_mask: bool = True
63
+ attn_dropout_rate: float = 0.0
64
+
65
+ recurrent_num_states: int = 0
66
+ recurrent_gate_type: str = "bias"
67
+ recurrent_single_gate: bool = False
68
+ recurrent_skip_ffn: bool = False
69
+
70
+ compute_importance: bool = False
71
+ memory_num_neighbors: int = 0
72
+ memory_reset_on_new_doc: bool = True
73
+
74
+ dtype: Any = jnp.float32
75
+
76
+ # Modes which support caching of previous keys and values.
77
+ supported_modes_for_cache: Sequence[str] = ("train", "test")
78
+ update_memory_modes: Sequence[str] = ("train", "test")
79
+
80
+ def supports_generate(self) -> bool:
81
+ return self.use_long_xl_architecture
82
+
83
+ def _get_cache_name_from_mode(self, mode: str) -> Tuple[str, bool, bool]:
84
+ """Get the name of the cache, and whether to update the cache, from mode."""
85
+ # This is a hack to ensure that "generate" steps generate text as a
86
+ # continuation of the text that is stored in the "test" cache,
87
+ # but it does not update the "test" cache.
88
+ if mode == "generate":
89
+ assert "test" in self.supported_modes_for_cache
90
+ return ("test", False, False) # Use test cache, but don't update it.
91
+ elif mode == "init":
92
+ return ("train", False, False) # Use training cache for initialization.
93
+ else:
94
+ return (mode, True, mode in self.update_memory_modes)
95
+
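+ # Mode mapping example, with the default settings above:
+ #   "generate" -> ("test",  update_cache=False, update_memory=False)
+ #   "init"     -> ("train", update_cache=False, update_memory=False)
+ #   "train"    -> ("train", update_cache=True,  update_memory=True)
+ #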
96
+ def _allocate_cached_kvi(self, mode: str) -> KVITuple:
97
+ """Allocate (keys, values, importance) which can be cached between steps."""
98
+
99
+ kv_shape = [self.batch_size, self.window_length,
100
+ self.num_heads, self.head_size]
101
+ imp_shape = [self.batch_size, self.window_length]
102
+
103
+ def kv_initializer(shape):
104
+ return jnp.zeros(shape, dtype=self.dtype)
105
+
106
+ def imp_initializer(shape):
107
+ return jnp.zeros(shape, dtype=self.dtype)
108
+
109
+ pkeys = self.variable("state", "previous_keys_" + mode,
110
+ kv_initializer, kv_shape)
111
+ pvals = self.variable("state", "previous_values_" + mode,
112
+ kv_initializer, kv_shape)
113
+ if self.compute_importance:
114
+ pimportance = self.variable("state", "previous_importance_" + mode,
115
+ imp_initializer, imp_shape)
116
+ else:
117
+ pimportance = None
118
+ return (pkeys, pvals, pimportance)
119
+
120
+ def _allocate_cached_recurrent_state(self, mode: str):
121
+ rec_num_states = self.recurrent_num_states
122
+ st_shape = [self.batch_size, rec_num_states, self.embedding_size]
123
+
124
+ def st_initializer(shape):
125
+ return jnp.zeros(shape, dtype=self.dtype)
126
+
127
+ return self.variable("state", "recurrent_state_" + mode,
128
+ st_initializer, st_shape)
129
+
130
+ def setup(self):
131
+ # Basic transformer functionality: everything except attention.
132
+
133
+ self.tbase = transformer_base.TransformerBase(
134
+ mode=self.mode,
135
+ embedding_size=self.embedding_size,
136
+ num_heads=self.num_heads,
137
+ head_size=self.head_size,
138
+ cross_attention_q=self.recurrent_attention or self.cross_attention,
139
+ cross_attention_kv=False, # or True to use separate k,v.
140
+ num_position_embeddings=0,
141
+ num_cross_position_embeddings=0, # or self.recurrent_num_states w/ k,v.
142
+ dtype=self.dtype)
143
+
144
+ # Recurrent transformer functionality.
145
+ self.recurrent_tbase = None
146
+ if self.recurrent_attention:
147
+ # Recurrent transformer layer.
148
+ # We use a learned position embedding so that each element of the state
149
+ # can learn to query and compute different summaries.
150
+ self.recurrent_tbase = transformer_base.TransformerBase(
151
+ mode="pure", # Disable dropout, which breaks jax.lax.scan.
152
+ embedding_size=self.embedding_size,
153
+ num_heads=self.num_heads,
154
+ head_size=self.head_size,
155
+ cross_attention_q=True,
156
+ cross_attention_kv=False, # or True to use separate k,v.
157
+ num_position_embeddings=self.recurrent_num_states,
158
+ num_cross_position_embeddings=0, # or self.window_length w/ k,v.
159
+ gate_type=self.recurrent_gate_type,
160
+ single_gate=self.recurrent_single_gate,
161
+ skip_ffn=self.recurrent_skip_ffn,
162
+ dtype=self.dtype)
163
+
164
+ # Initial state at start of document.
165
+ # We want this to be initially small, but large enough that adafactor
166
+ # will scale updates to a reasonable value.
167
+ self.recurrent_initial_state = self.param(
168
+ "recurrent_initial_state",
169
+ jax.nn.initializers.normal(stddev=0.1),
170
+ (self.recurrent_num_states, self.embedding_size), jnp.float32)
171
+
172
+ # Cached state from previous step for BPTT.
173
+ rec_state = {}
174
+ for mkey in self.supported_modes_for_cache:
175
+ rec_state[mkey] = self._allocate_cached_recurrent_state(mkey)
176
+ self.cached_recurrent_state = rec_state
177
+
178
+ # Set up relative position encoding.
179
+ if self.relative_position_type == "fourier":
180
+ self.relative_positions = position_fourier.RelativeFourierPositions(
181
+ num_heads=self.num_heads,
182
+ max_number_of_keys=self.window_length,
183
+ dtype=self.dtype)
184
+ elif self.relative_position_type == "t5":
185
+ self.relative_positions = position_t5.T5RelativePositionBiases(
186
+ num_buckets=32, # TODO(delesley): Let Gin configure these.
187
+ max_distance=128,
188
+ num_heads=self.num_heads,
189
+ dtype=self.dtype)
190
+ elif self.relative_position_type == "rotary":
191
+ # Rotary position encodings (RoPE). No learned bias parameters.
192
+ self.relative_positions = None
193
+ else:
194
+ assert self.relative_position_type is None
195
+ self.relative_positions = None
196
+
197
+ # Set up cache for Transformer-XL style architectures.
198
+ # A separate cache is created for each mode (e.g. train, test).
199
+ cached_kvi = {}
200
+ if self.use_long_xl_architecture:
201
+ for mkey in self.supported_modes_for_cache:
202
+ cached_kvi[mkey] = self._allocate_cached_kvi(mkey)
203
+ self.cached_kvi = cached_kvi
204
+
205
+ # Set up external memory.
206
+ # A separate memory will be created for each mode (e.g. train, test)
207
+ mem_layers = {}
208
+ if self.memory is not None:
209
+ self.memory_bias = self.param("external_memory_bias", nn.zeros,
210
+ (self.num_heads,), "float32")
211
+ for mkey in self.supported_modes_for_cache:
212
+ mlayer = self.memory.create_memory_layer()
213
+ # Use setattr to set up the name and module containership hierarchy.
214
+ setattr(self, "mem_layer_" + mkey, mlayer)
215
+ mem_layers[mkey] = mlayer
216
+ self.mem_layers = mem_layers
217
+
218
+ def _get_cached_kvi(self, start_of_sequence: Array,
219
+ mode: str) -> Optional[KVITuple]:
220
+ """Returns cached (keys, values, importance) from the previous step."""
221
+ if not self.use_long_xl_architecture:
222
+ return None
223
+ if mode not in self.cached_kvi:
224
+ # No cache, but we're using XL / sliding window, so return zeros.
225
+ logging.info("tlayer: using zero as initial XL cache value.")
226
+ kvi_shape = (self.batch_size, self.window_length,
227
+ self.num_heads, self.head_size)
228
+ return attention.initial_kvi(kvi_shape,
229
+ self.compute_importance, dtype=self.dtype)
230
+
231
+ # New documents start with zero_kv.
232
+ # Continuing the same document will attend to previous keys/vals.
233
+ (pkeys, pvals, pimportance) = self.cached_kvi[mode]
234
+ (zkeys, zvals, zimportance) = attention.initial_kvi(
235
+ pkeys.value.shape, self.compute_importance, dtype=self.dtype)
236
+
237
+ # Broadcast start_of_sequence over non-batch dims.
238
+ b = self.batch_size
239
+ start_of_sequence_kv = jnp.reshape(start_of_sequence, [b, 1, 1, 1])
240
+ prev_keys = jnp.where(start_of_sequence_kv, zkeys, pkeys.value)
241
+ prev_vals = jnp.where(start_of_sequence_kv, zvals, pvals.value)
242
+ if self.compute_importance:
243
+ start_of_sequence_imp = jnp.reshape(start_of_sequence, [b, 1])
244
+ prev_importance = jnp.where(start_of_sequence_imp, zimportance,
245
+ pimportance.value)
246
+ else:
247
+ prev_importance = None
248
+ logging.debug("tlayer: start_of_sequence = %r", start_of_sequence)
249
+ logging.info("tlayer: prev_keys[%r] = %r", mode, prev_keys)
250
+ logging.debug("tlayer: prev_importance[%r] = %r", mode, prev_importance)
251
+ return (prev_keys, prev_vals, prev_importance)
252
+
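+ # Broadcast sketch: reshaping start_of_sequence from (batch,) to
+ # (batch, 1, 1, 1) lets jnp.where select, per batch element, either the
+ # zero cache (new document) or the previous window's keys/values:
+ #
+ #   sos = jnp.reshape(jnp.array([True, False]), (2, 1, 1, 1))
+ #   prev = jnp.where(sos, zkeys, pkeys)  # zeros for row 0, cache for row 1
+ #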
253
+ def _set_cached_kvi(self, next_kvi: KVITuple, mode: str):
254
+ """Caches the last (keys, values, importance) from the current step."""
255
+ if not self.use_long_xl_architecture:
256
+ return
257
+ if mode not in self.cached_kvi:
258
+ return
259
+
260
+ (pkeys, pvals, pimportance) = self.cached_kvi[mode]
261
+ (nkeys, nvals, nimportance) = next_kvi # From last window
262
+ logging.info("tlayer: next_keys[%r] = %r", mode, nkeys)
263
+ pkeys.value = nkeys
264
+ pvals.value = nvals
265
+ if self.compute_importance:
266
+ logging.info("tlayer: next_importance[%r] = %r", mode, nimportance)
267
+ pimportance.value = nimportance
268
+
269
+ def _get_cached_recurrent_state(self, start_of_sequence: Array,
270
+ mode: str) -> Optional[Array]:
271
+ """Returns cached recurrent state from the previous step."""
272
+ if not self.recurrent_attention:
273
+ return None
274
+ if mode not in self.cached_recurrent_state:
275
+ return None
276
+
277
+ b = self.batch_size
278
+ rstate = self.cached_recurrent_state[mode].value
279
+ istate = jnp.asarray(self.recurrent_initial_state, dtype=self.dtype)
280
+ istate = istate[jnp.newaxis, :, :] # Add batch dimension for broadcast.
281
+ logging.info("tlayer: get_cached_recurrent_state %r, %r", istate, rstate)
282
+
283
+ start_of_sequence_st = jnp.reshape(start_of_sequence, (b, 1, 1))
284
+ return jnp.where(start_of_sequence_st, istate, rstate)
285
+
286
+ def _set_cached_recurrent_state(self, next_state: Array, mode: str):
287
+ """Store the next recurrent state in the cache."""
288
+ if not self.recurrent_attention:
289
+ return
290
+ if mode not in self.cached_recurrent_state:
291
+ return
292
+
293
+ logging.info("tlayer: set_cached_recurrent_state %r", next_state)
294
+ rstate = self.cached_recurrent_state[mode]
295
+ rstate.value = next_state
296
+
297
+ def _query_external_memory(self, keys: Array, values: Array, queries: Array,
298
+ start_of_sequence: Array,
299
+ mode: str, update_memory: bool):
300
+ """Query and update external memory."""
301
+ if self.memory is None:
302
+ return None
303
+
304
+ # Make sure we initialize (allocate) the external memories for all modes.
305
+ # Per the flax lazy module initialization scheme, setup() will not be
306
+ # invoked on a submodule until that module is actually used.
307
+ if mode == "init":
308
+ for (_, mlayer) in self.mem_layers.items():
309
+ (_, _) = mlayer.topk_retrieval(queries, self.memory_num_neighbors)
310
+ mode = "train" # Pretend we're in training mode during initialization.
311
+
312
+ if mode not in self.mem_layers:
313
+ return None
314
+ if self.memory_num_neighbors == 0:
315
+ raise ValueError("Using memory, but num_neighbors == 0")
316
+
317
+ # Grab the appropriate memory layer for the current mode.
318
+ memory_layer = self.mem_layers[mode]
319
+
320
+ # Clear the relevant memories at the start of each new document.
321
+ if update_memory and self.memory_reset_on_new_doc:
322
+ # The number of "datasets" is batch_dim * num_heads.
323
+ # jnp.repeat will "broadcast" start_of_sequence over num_heads.
324
+ # E.g. if start_of_sequence = [True, False] and 4 heads,
325
+ # jnp.repeat will yield [T, T, T, T, F, F, F, F]
326
+ memory_layer.reset(jnp.repeat(start_of_sequence, self.num_heads))
327
+
328
+ # Query external memory, with queries.
329
+ (rkeys, rvals) = memory_layer.topk_retrieval(queries,
330
+ self.memory_num_neighbors)
331
+ logging.info("tlayer: query external memory (%r): rvals = %r", mode, rvals)
332
+
333
+ # Sanity check all dimensions are as expected.
334
+ assert rkeys.ndim == 5 # (b, seq_len, num_heads, num_neigh, head_dim)
335
+ assert rvals.ndim == 5
336
+ assert rkeys.shape == rvals.shape
337
+ assert rkeys.shape[0] == queries.shape[0] # batch size
338
+ assert rkeys.shape[1] == queries.shape[1] # sequence length
339
+ assert rkeys.shape[2] == self.num_heads
340
+ assert rkeys.shape[3] == self.memory_num_neighbors
341
+ assert rkeys.shape[4] == self.head_size
342
+
343
+ # Update external memory, with (keys, values).
344
+ if update_memory:
345
+ memory_layer.update(keys, values)
346
+ return (rkeys, rvals)
347
+
348
+ def __call__(self, xs: Array, start_of_sequence: Array,
349
+ *,
350
+ importance: Optional[Array] = None,
351
+ cross_attention_kv: Optional[Tuple[Array, Array]] = None,
352
+ window_state: Optional[WindowState] = None,
353
+ decoder_state: Optional[DecoderState] = None) -> (
354
+ Tuple[Array, Optional[Array], Optional[WindowState],
355
+ Optional[DecoderState], Any]):
356
+ """Computes attention over a sequence of inputs.
357
+
358
+ Args:
359
+ xs: input sequence of shape (batch_size, sequence_length, num_hidden)
360
+ start_of_sequence: An input array of shape (batch_size)
361
+
362
+ --- The following must be passed by keyword only. ---
363
+ importance: Array of shape (batch_size, sequence_length).
364
+ An importance bias for attention.
365
+ cross_attention_kv: Keys and values from encoder for cross-attention.
366
+ window_state: State object which contains context from the prior
367
+ window when using a transformer-XL or sliding window.
368
+ Initially created with load_window_state().
369
+ decoder_state: State object for autoregressive decoding, initially
370
+ created with init_decoder_state().
371
+
372
+ Returns:
373
+ (ys: outputs of shape (batch_size, sequence_length, num_hidden),
374
+ importance: importance values for the next layer,
375
+ next_window_state: state to pass to the next window,
376
+ next_decoder_state: next decoder state for autoregressive decoding,
377
+ viz_dict: dictionary of visualizations
378
+ )
379
+ """
380
+
381
+ xs = jnp.asarray(xs, dtype=self.dtype)
382
+ logging.info("tlayer: xs = %r", xs)
383
+ logging.info("tlayer: recurrent = %r", self.recurrent_attention)
384
+ logging.info("tlayer: cross-attention = %r", cross_attention_kv is not None)
385
+
386
+ is_training = (self.mode == "train")
387
+
388
+ # Compute keys, values and queries.
389
+ # ---------------------------------
390
+ logging.info("tlayer: compute keys,values,queries.")
391
+ (keys, values, queries, queries2) = self.tbase.kvq(xs)
392
+ attention_scale_factors = self.tbase.attention_scale_factors()
393
+ (_, sequence_length, num_heads, _) = queries.shape # (b, k, h, d)
394
+
395
+ # Get biases and masks that are shared across windows.
396
+ # ----------------------------------------------------
397
+ if decoder_state is not None:
398
+ logging.info("tlayer: using autoregressive decoder.")
399
+ # When decoding, prior keys,values are loaded from the decoder state.
400
+ # Other values are precomputed, and loaded from the decoder state.
401
+ # The decoder state will be updated with the current token.
402
+ assert window_state is None
403
+
404
+ prev_kvi = None
405
+ recurrent_state = None # Use precomputed recurrent_kvq.
406
+ cross_attention_kv = None
407
+ rel_position_bias = decoder_state["relative_position_bias"]
408
+ causal_mask = None
409
+ dropout_multiplier = None
410
+
411
+ # Reuse cached recurrent keys,values for each token.
412
+ cached_recurrent_kvq = decoder_state["recurrent_kvq"]
413
+ if cached_recurrent_kvq is not None:
414
+ assert cross_attention_kv is None
415
+ cross_attention_kv = (cached_recurrent_kvq[0], cached_recurrent_kvq[1])
416
+ del cached_recurrent_kvq
417
+
418
+ # Get a full window of keys,values and update decoder state.
419
+ (decoder_state, keys, values) = self._next_decoder_state(
420
+ decoder_state, keys, values)
421
+
422
+ # Each query attends to window_length prior keys.
423
+ assert keys.shape[1] == self.window_length
424
+ kq_relative_offset = self.window_length
425
+ else:
426
+ logging.info("tlayer: windowed attention.")
427
+ # When training, attention is done using windows or chunks, and prior
428
+ # context (e.g. keys,values from the previous window) is stored in the
429
+ # window_state object.
430
+ (prev_kvi, recurrent_state) = window_state # pytype: disable=attribute-error
431
+
432
+ # Get the size of the sliding window for pos bias, dropout, & causal mask.
433
+ (num_queries, num_keys) = attention.sliding_attention_window_shape(
434
+ (keys, values, importance), prev_kvi, queries,
435
+ window_length=self.window_length)
436
+ kq_relative_offset = num_keys - num_queries
437
+
438
+ # Get the relative position bias.
439
+ # The bias doesn't depend on the query content, and so can be precomputed.
440
+ if self.relative_positions is not None:
441
+ rel_position_bias = self.relative_positions(num_queries, num_keys,
442
+ bidirectional=False)
443
+ logging.info("tlayer: %s relative bias = %r",
444
+ self.relative_position_type, rel_position_bias)
445
+ else:
446
+ rel_position_bias = None
447
+
448
+ # Get causal mask.
449
+ if self.use_causal_mask:
450
+ causal_mask = position.causal_mask(num_queries, num_keys,
451
+ window_length=self.window_length)
452
+ logging.info("tlayer: causal mask = %r", causal_mask)
453
+ else:
454
+ causal_mask = None
455
+
456
+ # Apply dropout to the attention matrix.
457
+ # The mask will be broadcast across batches and windows.
458
+ if self.attn_dropout_rate > 0.0 and is_training:
459
+ dropout_rng = self.make_rng("dropout")
460
+ attn_shape = (self.num_heads, num_queries, num_keys)
461
+ dropout_multiplier = nn_components.dropout_multiplier_mask(
462
+ dropout_rng, self.attn_dropout_rate, attn_shape, self.dtype)
463
+ logging.info("tlayer: attn_dropout = %r", dropout_multiplier)
464
+ else:
465
+ dropout_multiplier = None
466
+
467
+ # Load and store values into external memory, if memory is not None.
468
+ # ------------------------------------------------------------------
469
+ (mode, _, update_memory) = self._get_cache_name_from_mode(self.mode)
470
+ external_kv = self._query_external_memory(
471
+ keys, values, queries,
472
+ start_of_sequence=start_of_sequence, mode=mode,
473
+ update_memory=decoder_state is None and update_memory)
474
+
475
+ if self.memory is not None:
476
+ external_memory_bias = jnp.asarray(self.memory_bias, dtype=self.dtype)
477
+ external_memory_bias = jnp.reshape(external_memory_bias,
478
+ (1, 1, num_heads, 1))
479
+ external_memory_bias = jax.nn.sigmoid(external_memory_bias)
480
+ else:
481
+ external_memory_bias = None
482
+
483
+ # Compute the number of windows.
484
+ # ------------------------------
485
+ if sequence_length < self.window_length:
486
+ num_windows = 1 # Happens with autoregressive decoding.
487
+ elif sequence_length == self.window_length:
488
+ num_windows = 1
489
+ if self.use_long_xl_architecture:
490
+ assert prev_kvi is not None
491
+ else:
492
+ if not self.use_long_xl_architecture:
493
+ raise ValueError("Can only use sliding window with Transformer XL.")
494
+ num_windows = sequence_length // self.window_length
495
+ if (num_windows * self.window_length) != sequence_length:
496
+ raise ValueError(f"Window length {self.window_length} must be a " +
497
+ f"multiple of sequence length {sequence_length}")
498
+ logging.info("tlayer: num_windows = %d.", num_windows)
499
+
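+ # For example, with sequence_length=512 and window_length=128, attention
+ # runs over num_windows = 4 chunks; each chunk of 128 queries attends to
+ # the 128 keys carried over from the previous window plus its own keys.
+ #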
500
+ # Define the function to do attention within a single window.
501
+ # ---------------------------------------------------------
502
+ def single_window_attention(carry, inputs_w):
503
+ # This function uses the following variables from the outer scope.
504
+ # They are listed here for clarity.
505
+ nonlocal rel_position_bias
506
+ nonlocal causal_mask
507
+ nonlocal kq_relative_offset
508
+ nonlocal dropout_multiplier
509
+ nonlocal attention_scale_factors
510
+ nonlocal external_memory_bias
511
+ nonlocal cross_attention_kv # externally supplied.
512
+
513
+ # keys,values,queries over the whole sequence will be split into chunks.
514
+ # xs_w, kvqi_w, etc. are the chunk for the current window.
515
+ (prev_kvi_w, rec_state) = carry # carried from one window to the next.
516
+ (kvqi_w, external_kv_w) = inputs_w # inputs to the current window.
517
+ # (keys_curr_w, values_curr_w, _, _, importance_curr_w) = kvqi_w
518
+
519
+ # Concatenate keys,values from the previous window with the current
520
+ # window to implement sliding window attention.
521
+ (kvqi_w, next_kvi_w) = attention.concat_kvqi(kvqi_w, prev_kvi_w)
522
+ (keys_w, values_w, queries_w, queries2_w, importance_w) = kvqi_w
523
+
524
+ # Perform recurrent attention within the current window to get the next
525
+ # recurrent state, and set up cross attention.
526
+ if rec_state is not None:
527
+ logging.info("tlayer: recurrent attention.")
528
+
529
+ # NOTE -- recurrent states and input tokens are handled separately,
530
+ # because they have separate learned positional embeddings. Due to
531
+ # the way TransformerBase does cross-attention, this means that we use
532
+ # separate key,value layers for rec_state and tokens_w.
533
+
534
+ # Keys, values, queries from recurrent state.
535
+ logging.info("tlayer: recurrent kvq.")
536
+ rec_kvq = self.recurrent_tbase.kvq(rec_state)
537
+ r_scale_factors = self.recurrent_tbase.attention_scale_factors()
538
+ (r_keys, r_values, r_queries, r_queries2) = rec_kvq
539
+
540
+ # Joint attention over both recurrent states and input tokens.
541
+ logging.info("tlayer: recurrent self-attention.")
542
+ r_attn_ys = attention.simple_attention(
543
+ r_keys, r_values, r_queries, None,
544
+ scale_factor=r_scale_factors[0],
545
+ dtype=self.dtype)
546
+
547
+ logging.info("tlayer: recurrent cross-attention.")
548
+ r_cross_attn_ys = attention.simple_attention(
549
+ keys_w, values_w, r_queries2, importance_w,
550
+ scale_factor=r_scale_factors[1],
551
+ dtype=self.dtype)
552
+
553
+ # Recurrent post-attention FFN.
554
+ logging.info("tlayer: recurrent ffn.")
555
+ next_rec_state = self.recurrent_tbase.post_attn_ffn(
556
+ rec_state, r_attn_ys, r_cross_attn_ys)
557
+
558
+ # Get keys and values for cross-attention from recurrent state.
559
+ assert cross_attention_kv is None
560
+ local_cross_attention_kv = (r_keys, r_values)
561
+ else:
562
+ # Get keys and values for cross-attention from external argument.
563
+ next_rec_state = None
564
+ local_cross_attention_kv = cross_attention_kv
565
+
566
+ # If using RoPE, keys and queries are rotated before self-attention.
567
+ if self.relative_position_type == "rotary":
568
+ logging.info("Using rotary position encodings (RoPE), offset = %d",
569
+ kq_relative_offset)
570
+ (keys_w, queries_w) = position.rotate_kq(keys_w, queries_w,
571
+ max_wavelength=10_000,
572
+ offset=kq_relative_offset)
573
+
574
+ # Self-attention over input tokens.
575
+ logging.info("tlayer: self-attention.")
576
+ attn_ys_w = attention.simple_attention(
577
+ keys_w, values_w, queries_w, importance_w,
578
+ relative_position_bias=rel_position_bias,
579
+ scale_factor=attention_scale_factors[0],
580
+ causal_mask=causal_mask,
581
+ dropout_multiplier=dropout_multiplier,
582
+ dtype=self.dtype)
583
+
584
+ # Attention over external memory.
585
+ if external_kv_w is not None:
586
+ (external_keys_w, external_values_w) = external_kv_w
587
+ y_ext = attention.external_attention(
588
+ external_keys_w, external_values_w, queries_w,
589
+ scale_factor=attention_scale_factors[0])
590
+ if external_memory_bias is not None:
591
+ ebias = external_memory_bias
592
+ logging.info("tlayer: using external memory bias = %r", ebias)
593
+ attn_ys_w = (attn_ys_w * (1 - ebias)) + (y_ext * ebias)
594
+ else:
595
+ attn_ys_w += y_ext
596
+
597
+ # Cross attention from input tokens to encoder or recurrent state.
598
+ if local_cross_attention_kv is not None:
599
+ logging.info("tlayer: cross-attention.")
600
+ (c_keys, c_values) = local_cross_attention_kv
601
+
602
+ # Cross-attention using queries2.
603
+ cross_attn_ys_w = attention.simple_attention(
604
+ c_keys, c_values, queries2_w, None,
605
+ scale_factor=attention_scale_factors[1],
606
+ dtype=self.dtype)
607
+ else:
608
+ cross_attn_ys_w = None
609
+
610
+ # End function single_window_attention(...)
611
+ return ((next_kvi_w, next_rec_state),
612
+ (attn_ys_w, cross_attn_ys_w))
613
+
614
+ # Initialize recurrent_tbase before calling jax.lax.scan.
615
+ # Otherwise flax will throw a tantrum.
616
+ if (self.recurrent_attention and 0 <= self.max_unrolled_windows and
617
+ self.max_unrolled_windows < num_windows):
618
+ logging.info("tlayer: force initialization of recurrent_tbase.")
619
+ self.recurrent_tbase.force_init(recurrent_state)
620
+
621
+ # Perform sliding window attention over all keys,values,queries.
622
+ # --------------------------------------------------------------
623
+ initial_carry = (prev_kvi, recurrent_state) # window state.
624
+ kvqi = (keys, values, queries, queries2, importance)
625
+ attn_inputs = (kvqi, external_kv)
626
+ (next_carry, attn_outputs) = attention.split_and_scan(
627
+ single_window_attention,
628
+ initial_carry,
629
+ attn_inputs,
630
+ sections=num_windows,
631
+ axis=1,
632
+ max_unrolled_windows=self.max_unrolled_windows)
633
+ (attn_ys, cross_attn_ys) = attn_outputs
634
+
635
+ logging.info("tlayer: End windows.")
636
+
637
+ # Post-attention MLP, resnet, and FFN.
638
+ # ------------------------------------
639
+ logging.info("tlayer: final FFN.")
640
+ ys = self.tbase.post_attn_ffn(xs, attn_ys, cross_attn_ys)
641
+
642
+ importance_output = None
643
+ next_window_state = next_carry if window_state is not None else None
644
+ viz_dict = {} # Visualizations, not currently enabled.
645
+ return (ys, importance_output, next_window_state, decoder_state, viz_dict)
646
+
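+ # Example step loop (sketch; `tlayer` and `batch` are hypothetical):
+ #
+ #   window_state = tlayer.load_window_state(batch["start_of_sequence"])
+ #   (ys, _, window_state, _, _) = tlayer(
+ #       batch["xs"], batch["start_of_sequence"], window_state=window_state)
+ #   tlayer.store_window_state(window_state)
+ #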
647
+ def load_window_state(self, start_of_sequence: Array) -> WindowState:
648
+ """Load cached state that is passed from one window to the next."""
649
+
650
+ (mode, _, _) = self._get_cache_name_from_mode(self.mode)
651
+ prev_kvi = self._get_cached_kvi(start_of_sequence, mode)
652
+ rec_state = self._get_cached_recurrent_state(start_of_sequence, mode)
653
+ if prev_kvi is not None:
654
+ logging.info("tlayer: Loaded keys,values for mode %s from cache %s",
655
+ self.mode, mode)
656
+ else:
657
+ logging.info("tlayer: Skipping XL cache for mode %s.", self.mode)
658
+ if rec_state is not None:
659
+ logging.info("tlayer: Loaded recurrent state for mode %s from cache %s.",
660
+ self.mode, mode)
661
+ return (prev_kvi, rec_state)
662
+
663
+ def store_window_state(self, window_state: WindowState):
664
+ """Write window state to the cache."""
665
+
666
+ (mode, update_cache, _) = self._get_cache_name_from_mode(self.mode)
667
+ (next_kvi, next_rec_state) = window_state # pytype: disable=attribute-error
668
+ if update_cache and next_kvi is not None:
669
+ logging.info("tlayer: Storing keys,values for mode %s in cache %s.",
670
+ self.mode, mode)
671
+ self._set_cached_kvi(next_kvi, mode)
672
+ else:
673
+ logging.info("tlayer: Skipping XL cache update for mode %s.", self.mode)
674
+ if update_cache and next_rec_state is not None:
675
+ logging.info("tlayer: Storing recurrent state for mode %s in cache %s.",
676
+ self.mode, mode)
677
+ self._set_cached_recurrent_state(next_rec_state, mode)
678
+
679
+ def get_recurrent_kv(self, window_state: WindowState):
680
+ """Get the recurrent keys,values from window_state."""
681
+
682
+ # TODO(delesley): optimize.
683
+ # This isn't ideal, because we wind up computing the recurrent keys,values
684
+ # twice -- once within the sliding window above, and again in the
685
+ # DecoderStack, so they can be passed to other layers. However, the
686
+ # plumbing is a lot simpler this way.
687
+ if window_state is None:
688
+ return None
689
+ (_, rec_state) = window_state
690
+ if rec_state is None:
691
+ return None
692
+ logging.info("tlayer: get_recurrent_kv.")
693
+ (r_keys, r_values, _, _) = self.recurrent_tbase.kvq(rec_state)
694
+ return (r_keys, r_values)
695
+
696
+   def init_decoder_state(self, sequence_length: int,
+                          start_of_sequence: Array) -> DecoderState:
+     """Initialize decoder state for autoregressive generation.
+
+     Args:
+       sequence_length: The maximum length of the sequence to generate.
+       start_of_sequence: Boolean array of shape (batch_size,);
+         True if starting a new sequence (with no prefix).
+
+     Returns:
+       A state object that can be passed to __call__.
+     """
+
+     # Note that generate always uses a local context of size window_length.
+     # Training should be set up appropriately.
+     if not self.use_long_xl_architecture:
+       raise ValueError("Generation is only supported for transformer XL.")
+     if not self.use_causal_mask:
+       raise ValueError("Generator must have been trained with a causal mask.")
+
+     (mode, _, _) = self._get_cache_name_from_mode(self.mode)
+
+     # Get relative position bias.
+     if self.relative_positions is not None:
+       # Relative positions for all tokens *prior* to the current token.
+       # The causal mask prevents each token from attending to itself.
+       rel_position_bias = self.relative_positions(1, self.window_length,
+                                                   offset=self.window_length,
+                                                   bidirectional=False)
+     else:
+       rel_position_bias = None
+
+     # Initialize autoregressive storage for (key, value) pairs.
+     # Include space for a prefix of window_length tokens.
+     num_keys = sequence_length + self.window_length
+     stored_shape = (self.batch_size, num_keys, self.num_heads, self.head_size)
+     stored_keys = jnp.zeros(stored_shape, dtype=self.dtype)
+     stored_values = jnp.zeros(stored_shape, dtype=self.dtype)
+     start_index = self.window_length
+
+     # Copy keys,values from cache into storage, for use as a prefix.
+     prev_kvi = self._get_cached_kvi(start_of_sequence, mode)
+     if prev_kvi is not None:
+       (pkeys, pvals, prev_imps) = prev_kvi
+       assert prev_imps is None  # Not yet supported.
+       assert pkeys.ndim == 4
+       assert pkeys.shape[1] == self.window_length  # (b, wlen, num_heads, d)
+
+       stored_keys = jax.lax.dynamic_update_slice_in_dim(
+           stored_keys, pkeys, 0, axis=1)
+       stored_values = jax.lax.dynamic_update_slice_in_dim(
+           stored_values, pvals, 0, axis=1)
+
+     # Grab the current recurrent_state, and precompute keys,values,queries.
+     rstate = self._get_cached_recurrent_state(start_of_sequence, mode)
+     if rstate is not None:
+       recurrent_kvq = self.recurrent_tbase.kvq(rstate)
+     else:
+       recurrent_kvq = None
+
+     decoder_state_dict = {
+         "keys": stored_keys,
+         "values": stored_values,
+         "current_index": start_index,
+         "relative_position_bias": rel_position_bias,
+         "recurrent_kvq": recurrent_kvq
+     }
+     return DecoderState(decoder_state_dict)
+
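The prefix copy above works because the key/value buffers are preallocated with window_length extra slots at the front, and jax.lax.dynamic_update_slice_in_dim writes the cached window into those slots. A toy example of the semantics:

import jax
import jax.numpy as jnp

buf = jnp.zeros((2, 10, 4))    # (batch, num_keys, dim), all slots empty
prefix = jnp.ones((2, 3, 4))   # a cached prefix window of length 3
buf = jax.lax.dynamic_update_slice_in_dim(buf, prefix, 0, axis=1)
# buf[:, :3] now holds the prefix, and generation would begin at index 3,
# mirroring start_index = self.window_length above.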
+   def _next_decoder_state(self, decoder_state: DecoderState,
+                           keys: Array, values: Array) -> Tuple[
+                               DecoderState, Array, Array]:
+     """Compute the next decoder state, and return keys,values to attend to.
+
+     The keys,values returned from this function are drawn from the prior
+     decoding state, and comprise a full window of local context.
+
+     Args:
+       decoder_state: The current decoder state, initially created using
+         init_decoder_state().
+       keys: The key for the current token, of shape (batch_size, 1, dim).
+       values: The value for the current token, of shape (batch_size, 1, dim).
+
+     Returns:
+       (next_decoder_state,
+        window of keys of shape (batch_size, window_length, dim),
+        window of values of shape (batch_size, window_length, dim))
+     """
+
+     assert keys.shape[1] == 1  # single-token autoregressive decoding.
+
+     logging.info("attn_layer: next decoder state; key = %r", keys)
+
+     # Unpack decoder_state.
+     stored_keys = decoder_state["keys"]
+     stored_values = decoder_state["values"]
+     curr_index = decoder_state["current_index"]
+
+     # Slice to get window_length-sized chunk of previous keys,values.
+     out_decoder_state = {}
+     curr_win_index = curr_index - self.window_length
+     out_keys = jax.lax.dynamic_slice_in_dim(
+         stored_keys, curr_win_index, self.window_length, axis=1)
+     out_values = jax.lax.dynamic_slice_in_dim(
+         stored_values, curr_win_index, self.window_length, axis=1)
+
+     # Write current keys,values to stored keys, values.
+     stored_keys = jax.lax.dynamic_update_slice_in_dim(
+         stored_keys, keys, curr_index, axis=1)
+     stored_values = jax.lax.dynamic_update_slice_in_dim(
+         stored_values, values, curr_index, axis=1)
+     curr_index = curr_index + 1
+
+     # Pack a new decoder_state object.
+     out_decoder_state["keys"] = stored_keys
+     out_decoder_state["values"] = stored_values
+     out_decoder_state["current_index"] = curr_index
+     out_decoder_state["relative_position_bias"] = (
+         decoder_state["relative_position_bias"])
+     out_decoder_state["recurrent_kvq"] = decoder_state["recurrent_kvq"]
+
+     return (DecoderState(out_decoder_state), out_keys, out_values)
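Note the ordering: the window of keys/values is sliced out *before* the current token's key/value are written, so each generated token attends only to the window_length positions preceding it, consistent with the causal mask and the offset=window_length relative-position bias set up in init_decoder_state. A toy illustration:

import jax
import jax.numpy as jnp

wlen = 4
stored = jnp.arange(12.0).reshape(1, 12, 1)  # toy (batch, num_keys, dim)
curr_index = 6
window = jax.lax.dynamic_slice_in_dim(stored, curr_index - wlen, wlen, axis=1)
# window covers positions [2, 3, 4, 5]; the entry for position 6 is written
# only after this read, so the new token never attends to itself.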