# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
"""
LFADS - Latent Factor Analysis via Dynamical Systems.

LFADS is an unsupervised method to decompose time series data into various
factors, such as an initial condition, a generative dynamical system, control
inputs to that generator, and a low-dimensional description of the observed
data, called the factors. Additionally, the observations have a noise model
(in this case Poisson), so a denoised version of the observations is also
created (e.g. the underlying rates of a Poisson distribution given the
observed event counts).

The main data structure being passed around is a dataset: a dictionary of data
dictionaries.

DATASET: The top-level dictionary simply maps name (string) -> data dictionary.

The nested dictionary is the DATA DICTIONARY, which has the following keys:
  'train_data' and 'valid_data', whose values are the corresponding training
    and validation data with shape ExTxD,
    E - # examples, T - # time steps, D - # dimensions in data.
The data dictionary also has a few more keys:
  'train_ext_input' and 'valid_ext_input' - if there are known external inputs
    to the system being modeled, these take on dimensions ExTxI,
    E - # examples, T - # time steps, I - # dimensions in input.
  'alignment_matrix_cxf' - if you are using data from multiple days, it is
    possible to align the channels (see manuscript). If so, each dataset will
    contain this matrix, which is used for both the input adapter and the
    output adapter of that dataset. These matrices, if provided, must be of
    size [data_dim x factors], where data_dim is the number of neurons
    recorded on that day and factors is chosen and set through the
    '--factors' flag.
  'alignment_bias_c' - see alignment_matrix_cxf. This bias is used as the
    offset for the alignment transformation: it is *subtracted* from the data,
    so that PCA-style initializations can align factors across sessions.

If one runs LFADS on data where the true rates are known for some trials
(say simulated, testing data, as in the example shipped with the paper), then
one can add three more fields for plotting purposes: 'train_truth',
'valid_truth', and 'conversion_factor'. The first two have the same dimensions
as 'train_data' and 'valid_data' but represent the underlying rates of the
observations. Finally, if one needs to convert scale for plotting the true
underlying firing rates, there is the 'conversion_factor' key.
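
For illustration only, a minimal single-session dataset could be assembled as
follows (the session name, array sizes, and use of random data are
hypothetical; the keys follow the description above):

  import numpy as np
  E_train, E_valid, T, D = 800, 200, 100, 50   # examples, time steps, data dims
  data_dict = {
      'train_data': np.random.poisson(1.0, size=(E_train, T, D)).astype(np.int32),
      'valid_data': np.random.poisson(1.0, size=(E_valid, T, D)).astype(np.int32),
      'train_ext_input': None,  # or an E x T x I float array of known inputs
      'valid_ext_input': None,
  }
  datasets = {'my_session': data_dict}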
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import os import tensorflow as tf from distributions import LearnableDiagonalGaussian, DiagonalGaussianFromInput from distributions import diag_gaussian_log_likelihood from distributions import KLCost_GaussianGaussian, Poisson from distributions import LearnableAutoRegressive1Prior from distributions import KLCost_GaussianGaussianProcessSampled from utils import init_linear, linear, list_t_bxn_to_tensor_bxtxn, write_data from utils import log_sum_exp, flatten from plot_lfads import plot_lfads class GRU(object): """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). """ def __init__(self, num_units, forget_bias=1.0, weight_scale=1.0, clip_value=np.inf, collections=None): """Create a GRU object. Args: num_units: Number of units in the GRU forget_bias (optional): Hack to help learning. weight_scale (optional): weights are scaled by ws/sqrt(#inputs), with ws being the weight scale. clip_value (optional): if the recurrent values grow above this value, clip them. collections (optional): List of additonal collections variables should belong to. """ self._num_units = num_units self._forget_bias = forget_bias self._weight_scale = weight_scale self._clip_value = clip_value self._collections = collections @property def state_size(self): return self._num_units @property def output_size(self): return self._num_units @property def state_multiplier(self): return 1 def output_from_state(self, state): """Return the output portion of the state.""" return state def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) function. Args: inputs: A 2D batch x input_dim tensor of inputs. state: The previous state from the last time step. scope (optional): TF variable scope for defined GRU variables. Returns: A tuple (state, state), where state is the newly computed state at time t. It is returned twice to respect an interface that works for LSTMs. """ x = inputs h = state if inputs is not None: xh = tf.concat(axis=1, values=[x, h]) else: xh = h with tf.variable_scope(scope or type(self).__name__): # "GRU" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. r, u = tf.split(axis=1, num_or_size_splits=2, value=linear(xh, 2 * self._num_units, alpha=self._weight_scale, name="xh_2_ru", collections=self._collections)) r, u = tf.sigmoid(r), tf.sigmoid(u + self._forget_bias) with tf.variable_scope("Candidate"): xrh = tf.concat(axis=1, values=[x, r * h]) c = tf.tanh(linear(xrh, self._num_units, name="xrh_2_c", collections=self._collections)) new_h = u * h + (1 - u) * c new_h = tf.clip_by_value(new_h, -self._clip_value, self._clip_value) return new_h, new_h class GenGRU(object): """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). This version is specialized for the generator, but isn't as fast, so we have two. Note this allows for l2 regularization on the recurrent weights, but also implicitly rescales the inputs via the 1/sqrt(input) scaling in the linear helper routine to be large magnitude, if there are fewer inputs than recurrent state. """ def __init__(self, num_units, forget_bias=1.0, input_weight_scale=1.0, rec_weight_scale=1.0, clip_value=np.inf, input_collections=None, recurrent_collections=None): """Create a GRU object. Args: num_units: Number of units in the GRU forget_bias (optional): Hack to help learning. 
input_weight_scale (optional): weights are scaled ws/sqrt(#inputs), with ws being the weight scale. rec_weight_scale (optional): weights are scaled ws/sqrt(#inputs), with ws being the weight scale. clip_value (optional): if the recurrent values grow above this value, clip them. input_collections (optional): List of additonal collections variables that input->rec weights should belong to. recurrent_collections (optional): List of additonal collections variables that rec->rec weights should belong to. """ self._num_units = num_units self._forget_bias = forget_bias self._input_weight_scale = input_weight_scale self._rec_weight_scale = rec_weight_scale self._clip_value = clip_value self._input_collections = input_collections self._rec_collections = recurrent_collections @property def state_size(self): return self._num_units @property def output_size(self): return self._num_units @property def state_multiplier(self): return 1 def output_from_state(self, state): """Return the output portion of the state.""" return state def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) function. Args: inputs: A 2D batch x input_dim tensor of inputs. state: The previous state from the last time step. scope (optional): TF variable scope for defined GRU variables. Returns: A tuple (state, state), where state is the newly computed state at time t. It is returned twice to respect an interface that works for LSTMs. """ x = inputs h = state with tf.variable_scope(scope or type(self).__name__): # "GRU" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. r_x = u_x = 0.0 if x is not None: r_x, u_x = tf.split(axis=1, num_or_size_splits=2, value=linear(x, 2 * self._num_units, alpha=self._input_weight_scale, do_bias=False, name="x_2_ru", normalized=False, collections=self._input_collections)) r_h, u_h = tf.split(axis=1, num_or_size_splits=2, value=linear(h, 2 * self._num_units, do_bias=True, alpha=self._rec_weight_scale, name="h_2_ru", collections=self._rec_collections)) r = r_x + r_h u = u_x + u_h r, u = tf.sigmoid(r), tf.sigmoid(u + self._forget_bias) with tf.variable_scope("Candidate"): c_x = 0.0 if x is not None: c_x = linear(x, self._num_units, name="x_2_c", do_bias=False, alpha=self._input_weight_scale, normalized=False, collections=self._input_collections) c_rh = linear(r*h, self._num_units, name="rh_2_c", do_bias=True, alpha=self._rec_weight_scale, collections=self._rec_collections) c = tf.tanh(c_x + c_rh) new_h = u * h + (1 - u) * c new_h = tf.clip_by_value(new_h, -self._clip_value, self._clip_value) return new_h, new_h class LFADS(object): """LFADS - Latent Factor Analysis via Dynamical Systems. LFADS is an unsupervised method to decompose time series data into various factors, such as an initial condition, a generative dynamical system, inferred inputs to that generator, and a low dimensional description of the observed data, called the factors. Additoinally, the observations have a noise model (in this case Poisson), so a denoised version of the observations is also created (e.g. underlying rates of a Poisson distribution given the observed event counts). """ def __init__(self, hps, kind="train", datasets=None): """Create an LFADS model. train - a model for training, sampling of posteriors is used posterior_sample_and_average - sample from the posterior, this is used for evaluating the expected value of the outputs of LFADS, given a specific input, by averaging over multiple samples from the approx posterior. 
Also used for the lower bound on the negative log-likelihood using IWAE error (Importance Weighed Auto-encoder). This is the denoising operation. prior_sample - a model for generation - sampling from priors is used Args: hps: The dictionary of hyper parameters. kind: the type of model to build (see above). datasets: a dictionary of named data_dictionaries, see top of lfads.py """ print("Building graph...") all_kinds = ['train', 'posterior_sample_and_average', 'posterior_push_mean', 'prior_sample'] assert kind in all_kinds, 'Wrong kind' if hps.feedback_factors_or_rates == "rates": assert len(hps.dataset_names) == 1, \ "Multiple datasets not supported for rate feedback." num_steps = hps.num_steps ic_dim = hps.ic_dim co_dim = hps.co_dim ext_input_dim = hps.ext_input_dim cell_class = GRU gen_cell_class = GenGRU def makelambda(v): # Used with tf.case return lambda: v # Define the data placeholder, and deal with all parts of the graph # that are dataset dependent. self.dataName = tf.placeholder(tf.string, shape=()) # The batch_size to be inferred from data, as normal. # Additionally, the data_dim will be inferred as well, allowing for a # single placeholder for all datasets, regardless of data dimension. if hps.output_dist == 'poisson': # Enforce correct dtype assert np.issubdtype( datasets[hps.dataset_names[0]]['train_data'].dtype, int), \ "Data dtype must be int for poisson output distribution" data_dtype = tf.int32 elif hps.output_dist == 'gaussian': assert np.issubdtype( datasets[hps.dataset_names[0]]['train_data'].dtype, float), \ "Data dtype must be float for gaussian output dsitribution" data_dtype = tf.float32 else: assert False, "NIY" self.dataset_ph = dataset_ph = tf.placeholder(data_dtype, [None, num_steps, None], name="data") self.train_step = tf.get_variable("global_step", [], tf.int64, tf.zeros_initializer(), trainable=False) self.hps = hps ndatasets = hps.ndatasets factors_dim = hps.factors_dim self.preds = preds = [None] * ndatasets self.fns_in_fac_Ws = fns_in_fac_Ws = [None] * ndatasets self.fns_in_fatcor_bs = fns_in_fac_bs = [None] * ndatasets self.fns_out_fac_Ws = fns_out_fac_Ws = [None] * ndatasets self.fns_out_fac_bs = fns_out_fac_bs = [None] * ndatasets self.datasetNames = dataset_names = hps.dataset_names self.ext_inputs = ext_inputs = None if len(dataset_names) == 1: # single session if 'alignment_matrix_cxf' in datasets[dataset_names[0]].keys(): used_in_factors_dim = factors_dim in_identity_if_poss = False else: used_in_factors_dim = hps.dataset_dims[dataset_names[0]] in_identity_if_poss = True else: # multisession used_in_factors_dim = factors_dim in_identity_if_poss = False for d, name in enumerate(dataset_names): data_dim = hps.dataset_dims[name] in_mat_cxf = None in_bias_1xf = None align_bias_1xc = None if datasets and 'alignment_matrix_cxf' in datasets[name].keys(): dataset = datasets[name] if hps.do_train_readin: print("Initializing trainable readin matrix with alignment matrix" \ " provided for dataset:", name) else: print("Setting non-trainable readin matrix to alignment matrix" \ " provided for dataset:", name) in_mat_cxf = dataset['alignment_matrix_cxf'].astype(np.float32) if in_mat_cxf.shape != (data_dim, factors_dim): raise ValueError("""Alignment matrix must have dimensions %d x %d (data_dim x factors_dim), but currently has %d x %d."""% (data_dim, factors_dim, in_mat_cxf.shape[0], in_mat_cxf.shape[1])) if datasets and 'alignment_bias_c' in datasets[name].keys(): dataset = datasets[name] if hps.do_train_readin: print("Initializing trainable readin bias with 
alignment bias " \ "provided for dataset:", name) else: print("Setting non-trainable readin bias to alignment bias " \ "provided for dataset:", name) align_bias_c = dataset['alignment_bias_c'].astype(np.float32) align_bias_1xc = np.expand_dims(align_bias_c, axis=0) if align_bias_1xc.shape[1] != data_dim: raise ValueError("""Alignment bias must have dimensions %d (data_dim), but currently has %d."""% (data_dim, in_mat_cxf.shape[0])) if in_mat_cxf is not None and align_bias_1xc is not None: # (data - alignment_bias) * W_in # data * W_in - alignment_bias * W_in # So b = -alignment_bias * W_in to accommodate PCA style offset. in_bias_1xf = -np.dot(align_bias_1xc, in_mat_cxf) if hps.do_train_readin: # only add to IO transformations collection only if we want it to be # learnable, because IO_transformations collection will be trained # when do_train_io_only collections_readin=['IO_transformations'] else: collections_readin=None in_fac_lin = init_linear(data_dim, used_in_factors_dim, do_bias=True, mat_init_value=in_mat_cxf, bias_init_value=in_bias_1xf, identity_if_possible=in_identity_if_poss, normalized=False, name="x_2_infac_"+name, collections=collections_readin, trainable=hps.do_train_readin) in_fac_W, in_fac_b = in_fac_lin fns_in_fac_Ws[d] = makelambda(in_fac_W) fns_in_fac_bs[d] = makelambda(in_fac_b) with tf.variable_scope("glm"): out_identity_if_poss = False if len(dataset_names) == 1 and \ factors_dim == hps.dataset_dims[dataset_names[0]]: out_identity_if_poss = True for d, name in enumerate(dataset_names): data_dim = hps.dataset_dims[name] in_mat_cxf = None if datasets and 'alignment_matrix_cxf' in datasets[name].keys(): dataset = datasets[name] in_mat_cxf = dataset['alignment_matrix_cxf'].astype(np.float32) if datasets and 'alignment_bias_c' in datasets[name].keys(): dataset = datasets[name] align_bias_c = dataset['alignment_bias_c'].astype(np.float32) align_bias_1xc = np.expand_dims(align_bias_c, axis=0) out_mat_fxc = None out_bias_1xc = None if in_mat_cxf is not None: out_mat_fxc = in_mat_cxf.T if align_bias_1xc is not None: out_bias_1xc = align_bias_1xc if hps.output_dist == 'poisson': out_fac_lin = init_linear(factors_dim, data_dim, do_bias=True, mat_init_value=out_mat_fxc, bias_init_value=out_bias_1xc, identity_if_possible=out_identity_if_poss, normalized=False, name="fac_2_logrates_"+name, collections=['IO_transformations']) out_fac_W, out_fac_b = out_fac_lin elif hps.output_dist == 'gaussian': out_fac_lin_mean = \ init_linear(factors_dim, data_dim, do_bias=True, mat_init_value=out_mat_fxc, bias_init_value=out_bias_1xc, normalized=False, name="fac_2_means_"+name, collections=['IO_transformations']) out_fac_W_mean, out_fac_b_mean = out_fac_lin_mean mat_init_value = np.zeros([factors_dim, data_dim]).astype(np.float32) bias_init_value = np.ones([1, data_dim]).astype(np.float32) out_fac_lin_logvar = \ init_linear(factors_dim, data_dim, do_bias=True, mat_init_value=mat_init_value, bias_init_value=bias_init_value, normalized=False, name="fac_2_logvars_"+name, collections=['IO_transformations']) out_fac_W_mean, out_fac_b_mean = out_fac_lin_mean out_fac_W_logvar, out_fac_b_logvar = out_fac_lin_logvar out_fac_W = tf.concat( axis=1, values=[out_fac_W_mean, out_fac_W_logvar]) out_fac_b = tf.concat( axis=1, values=[out_fac_b_mean, out_fac_b_logvar]) else: assert False, "NIY" preds[d] = tf.equal(tf.constant(name), self.dataName) data_dim = hps.dataset_dims[name] fns_out_fac_Ws[d] = makelambda(out_fac_W) fns_out_fac_bs[d] = makelambda(out_fac_b) pf_pairs_in_fac_Ws = zip(preds, fns_in_fac_Ws) 
pf_pairs_in_fac_bs = zip(preds, fns_in_fac_bs) pf_pairs_out_fac_Ws = zip(preds, fns_out_fac_Ws) pf_pairs_out_fac_bs = zip(preds, fns_out_fac_bs) this_in_fac_W = tf.case(pf_pairs_in_fac_Ws, exclusive=True) this_in_fac_b = tf.case(pf_pairs_in_fac_bs, exclusive=True) this_out_fac_W = tf.case(pf_pairs_out_fac_Ws, exclusive=True) this_out_fac_b = tf.case(pf_pairs_out_fac_bs, exclusive=True) # External inputs (not changing by dataset, by definition). if hps.ext_input_dim > 0: self.ext_input = tf.placeholder(tf.float32, [None, num_steps, ext_input_dim], name="ext_input") else: self.ext_input = None ext_input_bxtxi = self.ext_input self.keep_prob = keep_prob = tf.placeholder(tf.float32, [], "keep_prob") self.batch_size = batch_size = int(hps.batch_size) self.learning_rate = tf.Variable(float(hps.learning_rate_init), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * hps.learning_rate_decay_factor) # Dropout the data. dataset_do_bxtxd = tf.nn.dropout(tf.to_float(dataset_ph), keep_prob) if hps.ext_input_dim > 0: ext_input_do_bxtxi = tf.nn.dropout(ext_input_bxtxi, keep_prob) else: ext_input_do_bxtxi = None # ENCODERS def encode_data(dataset_bxtxd, enc_cell, name, forward_or_reverse, num_steps_to_encode): """Encode data for LFADS Args: dataset_bxtxd - the data to encode, as a 3 tensor, with dims time x batch x data dims. enc_cell: encoder cell name: name of encoder forward_or_reverse: string, encode in forward or reverse direction num_steps_to_encode: number of steps to encode, 0:num_steps_to_encode Returns: encoded data as a list with num_steps_to_encode items, in order """ if forward_or_reverse == "forward": dstr = "_fwd" time_fwd_or_rev = range(num_steps_to_encode) else: dstr = "_rev" time_fwd_or_rev = reversed(range(num_steps_to_encode)) with tf.variable_scope(name+"_enc"+dstr, reuse=False): enc_state = tf.tile( tf.Variable(tf.zeros([1, enc_cell.state_size]), name=name+"_enc_t0"+dstr), tf.stack([batch_size, 1])) enc_state.set_shape([None, enc_cell.state_size]) # tile loses shape enc_outs = [None] * num_steps_to_encode for i, t in enumerate(time_fwd_or_rev): with tf.variable_scope(name+"_enc"+dstr, reuse=True if i > 0 else None): dataset_t_bxd = dataset_bxtxd[:,t,:] in_fac_t_bxf = tf.matmul(dataset_t_bxd, this_in_fac_W) + this_in_fac_b in_fac_t_bxf.set_shape([None, used_in_factors_dim]) if ext_input_dim > 0 and not hps.inject_ext_input_to_gen: ext_input_t_bxi = ext_input_do_bxtxi[:,t,:] enc_input_t_bxfpe = tf.concat( axis=1, values=[in_fac_t_bxf, ext_input_t_bxi]) else: enc_input_t_bxfpe = in_fac_t_bxf enc_out, enc_state = enc_cell(enc_input_t_bxfpe, enc_state) enc_outs[t] = enc_out return enc_outs # Encode initial condition means and variances # ([x_T, x_T-1, ... x_0] and [x_0, x_1, ... x_T] -> g0/c0) self.ic_enc_fwd = [None] * num_steps self.ic_enc_rev = [None] * num_steps if ic_dim > 0: enc_ic_cell = cell_class(hps.ic_enc_dim, weight_scale=hps.cell_weight_scale, clip_value=hps.cell_clip_value) ic_enc_fwd = encode_data(dataset_do_bxtxd, enc_ic_cell, "ic", "forward", hps.num_steps_for_gen_ic) ic_enc_rev = encode_data(dataset_do_bxtxd, enc_ic_cell, "ic", "reverse", hps.num_steps_for_gen_ic) self.ic_enc_fwd = ic_enc_fwd self.ic_enc_rev = ic_enc_rev # Encoder control input means and variances, bi-directional encoding so: # ([x_T, x_T-1, ..., x_0] and [x_0, x_1 ... 
x_T] -> u_t) self.ci_enc_fwd = [None] * num_steps self.ci_enc_rev = [None] * num_steps if co_dim > 0: enc_ci_cell = cell_class(hps.ci_enc_dim, weight_scale=hps.cell_weight_scale, clip_value=hps.cell_clip_value) ci_enc_fwd = encode_data(dataset_do_bxtxd, enc_ci_cell, "ci", "forward", hps.num_steps) if hps.do_causal_controller: ci_enc_rev = None else: ci_enc_rev = encode_data(dataset_do_bxtxd, enc_ci_cell, "ci", "reverse", hps.num_steps) self.ci_enc_fwd = ci_enc_fwd self.ci_enc_rev = ci_enc_rev # STOCHASTIC LATENT VARIABLES, priors and posteriors # (initial conditions g0, and control inputs, u_t) # Note that zs represent all the stochastic latent variables. with tf.variable_scope("z", reuse=False): self.prior_zs_g0 = None self.posterior_zs_g0 = None self.g0s_val = None if ic_dim > 0: self.prior_zs_g0 = \ LearnableDiagonalGaussian(batch_size, ic_dim, name="prior_g0", mean_init=0.0, var_min=hps.ic_prior_var_min, var_init=hps.ic_prior_var_scale, var_max=hps.ic_prior_var_max) ic_enc = tf.concat(axis=1, values=[ic_enc_fwd[-1], ic_enc_rev[0]]) ic_enc = tf.nn.dropout(ic_enc, keep_prob) self.posterior_zs_g0 = \ DiagonalGaussianFromInput(ic_enc, ic_dim, "ic_enc_2_post_g0", var_min=hps.ic_post_var_min) if kind in ["train", "posterior_sample_and_average", "posterior_push_mean"]: zs_g0 = self.posterior_zs_g0 else: zs_g0 = self.prior_zs_g0 if kind in ["train", "posterior_sample_and_average", "prior_sample"]: self.g0s_val = zs_g0.sample else: self.g0s_val = zs_g0.mean # Priors for controller, 'co' for controller output self.prior_zs_co = prior_zs_co = [None] * num_steps self.posterior_zs_co = posterior_zs_co = [None] * num_steps self.zs_co = zs_co = [None] * num_steps self.prior_zs_ar_con = None if co_dim > 0: # Controller outputs autocorrelation_taus = [hps.prior_ar_atau for x in range(hps.co_dim)] noise_variances = [hps.prior_ar_nvar for x in range(hps.co_dim)] self.prior_zs_ar_con = prior_zs_ar_con = \ LearnableAutoRegressive1Prior(batch_size, hps.co_dim, autocorrelation_taus, noise_variances, hps.do_train_prior_ar_atau, hps.do_train_prior_ar_nvar, num_steps, "u_prior_ar1") # CONTROLLER -> GENERATOR -> RATES # (u(t) -> gen(t) -> factors(t) -> rates(t) -> p(x_t|z_t) ) self.controller_outputs = u_t = [None] * num_steps self.con_ics = con_state = None self.con_states = con_states = [None] * num_steps self.con_outs = con_outs = [None] * num_steps self.gen_inputs = gen_inputs = [None] * num_steps if co_dim > 0: # gen_cell_class here for l2 penalty recurrent weights # didn't split the cell_weight scale here, because I doubt it matters con_cell = gen_cell_class(hps.con_dim, input_weight_scale=hps.cell_weight_scale, rec_weight_scale=hps.cell_weight_scale, clip_value=hps.cell_clip_value, recurrent_collections=['l2_con_reg']) with tf.variable_scope("con", reuse=False): self.con_ics = tf.tile( tf.Variable(tf.zeros([1, hps.con_dim*con_cell.state_multiplier]), name="c0"), tf.stack([batch_size, 1])) self.con_ics.set_shape([None, con_cell.state_size]) # tile loses shape con_states[-1] = self.con_ics gen_cell = gen_cell_class(hps.gen_dim, input_weight_scale=hps.gen_cell_input_weight_scale, rec_weight_scale=hps.gen_cell_rec_weight_scale, clip_value=hps.cell_clip_value, recurrent_collections=['l2_gen_reg']) with tf.variable_scope("gen", reuse=False): if ic_dim == 0: self.gen_ics = tf.tile( tf.Variable(tf.zeros([1, gen_cell.state_size]), name="g0"), tf.stack([batch_size, 1])) else: self.gen_ics = linear(self.g0s_val, gen_cell.state_size, identity_if_possible=True, name="g0_2_gen_ic") self.gen_states = gen_states = [None] 
* num_steps self.gen_outs = gen_outs = [None] * num_steps gen_states[-1] = self.gen_ics gen_outs[-1] = gen_cell.output_from_state(gen_states[-1]) self.factors = factors = [None] * num_steps factors[-1] = linear(gen_outs[-1], factors_dim, do_bias=False, normalized=True, name="gen_2_fac") self.rates = rates = [None] * num_steps # rates[-1] is collected to potentially feed back to controller with tf.variable_scope("glm", reuse=False): if hps.output_dist == 'poisson': log_rates_t0 = tf.matmul(factors[-1], this_out_fac_W) + this_out_fac_b log_rates_t0.set_shape([None, None]) rates[-1] = tf.exp(log_rates_t0) # rate rates[-1].set_shape([None, hps.dataset_dims[hps.dataset_names[0]]]) elif hps.output_dist == 'gaussian': mean_n_logvars = tf.matmul(factors[-1],this_out_fac_W) + this_out_fac_b mean_n_logvars.set_shape([None, None]) means_t_bxd, logvars_t_bxd = tf.split(axis=1, num_or_size_splits=2, value=mean_n_logvars) rates[-1] = means_t_bxd else: assert False, "NIY" # We support multiple output distributions, for example Poisson, and also # Gaussian. In these two cases respectively, there are one and two # parameters (rates vs. mean and variance). So the output_dist_params # tensor will variable sizes via tf.concat and tf.split, along the 1st # dimension. So in the case of gaussian, for example, it'll be # batch x (D+D), where each D dims is the mean, and then variances, # respectively. For a distribution with 3 parameters, it would be # batch x (D+D+D). self.output_dist_params = dist_params = [None] * num_steps self.log_p_xgz_b = log_p_xgz_b = 0.0 # log P(x|z) for t in range(num_steps): # Controller if co_dim > 0: # Build inputs for controller tlag = t - hps.controller_input_lag if tlag < 0: con_in_f_t = tf.zeros_like(ci_enc_fwd[0]) else: con_in_f_t = ci_enc_fwd[tlag] if hps.do_causal_controller: # If controller is causal (wrt to data generation process), then it # cannot see future data. Thus, excluding ci_enc_rev[t] is obvious. # Less obvious is the need to exclude factors[t-1]. This arises # because information flows from g0 through factors to the controller # input. The g0 encoding is backwards, so we must necessarily exclude # the factors in order to keep the controller input purely from a # forward encoding (however unlikely it is that # g0->factors->controller channel might actually be used in this way). 
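              # (Note that in this causal case the reverse encoder is never
              # built at all: ci_enc_rev is set to None above, so only the
              # lagged forward encoding con_in_f_t is available here.)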
con_in_list_t = [con_in_f_t] else: tlag_rev = t + hps.controller_input_lag if tlag_rev >= num_steps: # better than zeros con_in_r_t = tf.zeros_like(ci_enc_rev[0]) else: con_in_r_t = ci_enc_rev[tlag_rev] con_in_list_t = [con_in_f_t, con_in_r_t] if hps.do_feed_factors_to_controller: if hps.feedback_factors_or_rates == "factors": con_in_list_t.append(factors[t-1]) elif hps.feedback_factors_or_rates == "rates": con_in_list_t.append(rates[t-1]) else: assert False, "NIY" con_in_t = tf.concat(axis=1, values=con_in_list_t) con_in_t = tf.nn.dropout(con_in_t, keep_prob) with tf.variable_scope("con", reuse=True if t > 0 else None): con_outs[t], con_states[t] = con_cell(con_in_t, con_states[t-1]) posterior_zs_co[t] = \ DiagonalGaussianFromInput(con_outs[t], co_dim, name="con_to_post_co") if kind == "train": u_t[t] = posterior_zs_co[t].sample elif kind == "posterior_sample_and_average": u_t[t] = posterior_zs_co[t].sample elif kind == "posterior_push_mean": u_t[t] = posterior_zs_co[t].mean else: u_t[t] = prior_zs_ar_con.samples_t[t] # Inputs to the generator (controller output + external input) if ext_input_dim > 0 and hps.inject_ext_input_to_gen: ext_input_t_bxi = ext_input_do_bxtxi[:,t,:] if co_dim > 0: gen_inputs[t] = tf.concat(axis=1, values=[u_t[t], ext_input_t_bxi]) else: gen_inputs[t] = ext_input_t_bxi else: gen_inputs[t] = u_t[t] # Generator data_t_bxd = dataset_ph[:,t,:] with tf.variable_scope("gen", reuse=True if t > 0 else None): gen_outs[t], gen_states[t] = gen_cell(gen_inputs[t], gen_states[t-1]) gen_outs[t] = tf.nn.dropout(gen_outs[t], keep_prob) with tf.variable_scope("gen", reuse=True): # ic defined it above factors[t] = linear(gen_outs[t], factors_dim, do_bias=False, normalized=True, name="gen_2_fac") with tf.variable_scope("glm", reuse=True if t > 0 else None): if hps.output_dist == 'poisson': log_rates_t = tf.matmul(factors[t], this_out_fac_W) + this_out_fac_b log_rates_t.set_shape([None, None]) rates[t] = dist_params[t] = tf.exp(tf.clip_by_value(log_rates_t, -hps._clip_value, hps._clip_value)) # rates feed back rates[t].set_shape([None, hps.dataset_dims[hps.dataset_names[0]]]) loglikelihood_t = Poisson(log_rates_t).logp(data_t_bxd) elif hps.output_dist == 'gaussian': mean_n_logvars = tf.matmul(factors[t],this_out_fac_W) + this_out_fac_b mean_n_logvars.set_shape([None, None]) means_t_bxd, logvars_t_bxd = tf.split(axis=1, num_or_size_splits=2, value=mean_n_logvars) rates[t] = means_t_bxd # rates feed back to controller dist_params[t] = tf.concat( axis=1, values=[means_t_bxd, tf.exp(tf.clip_by_value(logvars_t_bxd, -hps._clip_value, hps._clip_value))]) loglikelihood_t = \ diag_gaussian_log_likelihood(data_t_bxd, means_t_bxd, logvars_t_bxd) else: assert False, "NIY" log_p_xgz_b += tf.reduce_sum(loglikelihood_t, [1]) # Correlation of inferred inputs cost. self.corr_cost = tf.constant(0.0) if hps.co_mean_corr_scale > 0.0: all_sum_corr = [] for i in range(hps.co_dim): for j in range(i+1, hps.co_dim): sum_corr_ij = tf.constant(0.0) for t in range(num_steps): u_mean_t = posterior_zs_co[t].mean sum_corr_ij += u_mean_t[:,i]*u_mean_t[:,j] all_sum_corr.append(0.5 * tf.square(sum_corr_ij)) self.corr_cost = tf.reduce_mean(all_sum_corr) # div by batch and by n*(n-1)/2 pairs # Variational Lower Bound on posterior, p(z|x), plus reconstruction cost. # KL and reconstruction costs are normalized only by batch size, not by # dimension, or by time steps. 
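    # As a sketch of the quantities assembled below (no new computation here),
    # the per-example lower bound on the log likelihood is
    #   L(x) = E_q(z|x)[ log p(x|z) ]
    #          - kl_ic_weight * KL(q(g0|x) || p(g0))
    #          - kl_co_weight * KL(q(u_t|x) || p(u_t)),
    # the reported VAE bound on the NLL is -mean_b L(x_b), and the IWAE-style
    # bound averages inside the log: -(log_sum_exp_b L(x_b) - log B).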
kl_cost_g0_b = tf.zeros_like(batch_size, dtype=tf.float32) kl_cost_co_b = tf.zeros_like(batch_size, dtype=tf.float32) self.kl_cost = tf.constant(0.0) # VAE KL cost self.recon_cost = tf.constant(0.0) # VAE reconstruction cost self.nll_bound_vae = tf.constant(0.0) self.nll_bound_iwae = tf.constant(0.0) # for eval with IWAE cost. if kind in ["train", "posterior_sample_and_average", "posterior_push_mean"]: kl_cost_g0_b = 0.0 kl_cost_co_b = 0.0 if ic_dim > 0: g0_priors = [self.prior_zs_g0] g0_posts = [self.posterior_zs_g0] kl_cost_g0_b = KLCost_GaussianGaussian(g0_posts, g0_priors).kl_cost_b kl_cost_g0_b = hps.kl_ic_weight * kl_cost_g0_b if co_dim > 0: kl_cost_co_b = \ KLCost_GaussianGaussianProcessSampled( posterior_zs_co, prior_zs_ar_con).kl_cost_b kl_cost_co_b = hps.kl_co_weight * kl_cost_co_b # L = -KL + log p(x|z), to maximize bound on likelihood # -L = KL - log p(x|z), to minimize bound on NLL # so 'reconstruction cost' is negative log likelihood self.recon_cost = - tf.reduce_mean(log_p_xgz_b) self.kl_cost = tf.reduce_mean(kl_cost_g0_b + kl_cost_co_b) lb_on_ll_b = log_p_xgz_b - kl_cost_g0_b - kl_cost_co_b # VAE error averages outside the log self.nll_bound_vae = -tf.reduce_mean(lb_on_ll_b) # IWAE error averages inside the log k = tf.cast(tf.shape(log_p_xgz_b)[0], tf.float32) iwae_lb_on_ll = -tf.log(k) + log_sum_exp(lb_on_ll_b) self.nll_bound_iwae = -iwae_lb_on_ll # L2 regularization on the generator, normalized by number of parameters. self.l2_cost = tf.constant(0.0) if self.hps.l2_gen_scale > 0.0 or self.hps.l2_con_scale > 0.0: l2_costs = [] l2_numels = [] l2_reg_var_lists = [tf.get_collection('l2_gen_reg'), tf.get_collection('l2_con_reg')] l2_reg_scales = [self.hps.l2_gen_scale, self.hps.l2_con_scale] for l2_reg_vars, l2_scale in zip(l2_reg_var_lists, l2_reg_scales): for v in l2_reg_vars: numel = tf.reduce_prod(tf.concat(axis=0, values=tf.shape(v))) numel_f = tf.cast(numel, tf.float32) l2_numels.append(numel_f) v_l2 = tf.reduce_sum(v*v) l2_costs.append(0.5 * l2_scale * v_l2) self.l2_cost = tf.add_n(l2_costs) / tf.add_n(l2_numels) # Compute the cost for training, part of the graph regardless. # The KL cost can be problematic at the beginning of optimization, # so we allow an exponential increase in weighting the KL from 0 # to 1. 
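    # Concretely, the ramp implemented below is linear in the step count: with
    # hypothetical values kl_start_step=0 and kl_increase_steps=2000, the KL
    # weight is min(step / 2000, 1.0), i.e. 0.25 at step 500 and clamped at
    # 1.0 from step 2000 onward.  The L2 weight follows the same schedule
    # using l2_start_step and l2_increase_steps.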
self.kl_decay_step = tf.maximum(self.train_step - hps.kl_start_step, 0) self.l2_decay_step = tf.maximum(self.train_step - hps.l2_start_step, 0) kl_decay_step_f = tf.cast(self.kl_decay_step, tf.float32) l2_decay_step_f = tf.cast(self.l2_decay_step, tf.float32) kl_increase_steps_f = tf.cast(hps.kl_increase_steps, tf.float32) l2_increase_steps_f = tf.cast(hps.l2_increase_steps, tf.float32) self.kl_weight = kl_weight = \ tf.minimum(kl_decay_step_f / kl_increase_steps_f, 1.0) self.l2_weight = l2_weight = \ tf.minimum(l2_decay_step_f / l2_increase_steps_f, 1.0) self.timed_kl_cost = kl_weight * self.kl_cost self.timed_l2_cost = l2_weight * self.l2_cost self.weight_corr_cost = hps.co_mean_corr_scale * self.corr_cost self.cost = self.recon_cost + self.timed_kl_cost + \ self.timed_l2_cost + self.weight_corr_cost if kind != "train": # save every so often self.seso_saver = tf.train.Saver(tf.global_variables(), max_to_keep=hps.max_ckpt_to_keep) # lowest validation error self.lve_saver = tf.train.Saver(tf.global_variables(), max_to_keep=hps.max_ckpt_to_keep_lve) return # OPTIMIZATION # train the io matrices only if self.hps.do_train_io_only: self.train_vars = tvars = \ tf.get_collection('IO_transformations', scope=tf.get_variable_scope().name) # train the encoder only elif self.hps.do_train_encoder_only: tvars1 = \ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='LFADS/ic_enc_*') tvars2 = \ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='LFADS/z/ic_enc_*') self.train_vars = tvars = tvars1 + tvars2 # train all variables else: self.train_vars = tvars = \ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=tf.get_variable_scope().name) print("done.") print("Model Variables (to be optimized): ") total_params = 0 for i in range(len(tvars)): shape = tvars[i].get_shape().as_list() print(" ", i, tvars[i].name, shape) total_params += np.prod(shape) print("Total model parameters: ", total_params) grads = tf.gradients(self.cost, tvars) grads, grad_global_norm = tf.clip_by_global_norm(grads, hps.max_grad_norm) opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-01) self.grads = grads self.grad_global_norm = grad_global_norm self.train_op = opt.apply_gradients( zip(grads, tvars), global_step=self.train_step) self.seso_saver = tf.train.Saver(tf.global_variables(), max_to_keep=hps.max_ckpt_to_keep) # lowest validation error self.lve_saver = tf.train.Saver(tf.global_variables(), max_to_keep=hps.max_ckpt_to_keep) # SUMMARIES, used only during training. 
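    # These summaries are written under hps.lfads_save_dir by self.writer (a
    # tf.summary.FileWriter created at the end of this section); the per-epoch
    # cost summaries are fed via placeholders in summarize_all(), and the
    # example image summary in plot_single_example().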
# example summary self.example_image = tf.placeholder(tf.float32, shape=[1,None,None,3], name='image_tensor') self.example_summ = tf.summary.image("LFADS example", self.example_image, collections=["example_summaries"]) # general training summaries self.lr_summ = tf.summary.scalar("Learning rate", self.learning_rate) self.kl_weight_summ = tf.summary.scalar("KL weight", self.kl_weight) self.l2_weight_summ = tf.summary.scalar("L2 weight", self.l2_weight) self.corr_cost_summ = tf.summary.scalar("Corr cost", self.weight_corr_cost) self.grad_global_norm_summ = tf.summary.scalar("Gradient global norm", self.grad_global_norm) if hps.co_dim > 0: self.atau_summ = [None] * hps.co_dim self.pvar_summ = [None] * hps.co_dim for c in range(hps.co_dim): self.atau_summ[c] = \ tf.summary.scalar("AR Autocorrelation taus " + str(c), tf.exp(self.prior_zs_ar_con.logataus_1xu[0,c])) self.pvar_summ[c] = \ tf.summary.scalar("AR Variances " + str(c), tf.exp(self.prior_zs_ar_con.logpvars_1xu[0,c])) # cost summaries, separated into different collections for # training vs validation. We make placeholders for these, because # even though the graph computes these costs on a per-batch basis, # we want to report the more reliable metric of per-epoch cost. kl_cost_ph = tf.placeholder(tf.float32, shape=[], name='kl_cost_ph') self.kl_t_cost_summ = tf.summary.scalar("KL cost (train)", kl_cost_ph, collections=["train_summaries"]) self.kl_v_cost_summ = tf.summary.scalar("KL cost (valid)", kl_cost_ph, collections=["valid_summaries"]) l2_cost_ph = tf.placeholder(tf.float32, shape=[], name='l2_cost_ph') self.l2_cost_summ = tf.summary.scalar("L2 cost", l2_cost_ph, collections=["train_summaries"]) recon_cost_ph = tf.placeholder(tf.float32, shape=[], name='recon_cost_ph') self.recon_t_cost_summ = tf.summary.scalar("Reconstruction cost (train)", recon_cost_ph, collections=["train_summaries"]) self.recon_v_cost_summ = tf.summary.scalar("Reconstruction cost (valid)", recon_cost_ph, collections=["valid_summaries"]) total_cost_ph = tf.placeholder(tf.float32, shape=[], name='total_cost_ph') self.cost_t_summ = tf.summary.scalar("Total cost (train)", total_cost_ph, collections=["train_summaries"]) self.cost_v_summ = tf.summary.scalar("Total cost (valid)", total_cost_ph, collections=["valid_summaries"]) self.kl_cost_ph = kl_cost_ph self.l2_cost_ph = l2_cost_ph self.recon_cost_ph = recon_cost_ph self.total_cost_ph = total_cost_ph # Merged summaries, for easy coding later. self.merged_examples = tf.summary.merge_all(key="example_summaries") self.merged_generic = tf.summary.merge_all() # default key is 'summaries' self.merged_train = tf.summary.merge_all(key="train_summaries") self.merged_valid = tf.summary.merge_all(key="valid_summaries") session = tf.get_default_session() self.logfile = os.path.join(hps.lfads_save_dir, "lfads_log") self.writer = tf.summary.FileWriter(self.logfile) def build_feed_dict(self, train_name, data_bxtxd, ext_input_bxtxi=None, keep_prob=None): """Build the feed dictionary, handles cases where there is no value defined. Args: train_name: The key into the datasets, to set the tf.case statement for the proper readin / readout matrices. data_bxtxd: The data tensor ext_input_bxtxi (optional): The external input tensor keep_prob: The drop out keep probability. 
Returns: The feed dictionary with TF tensors as keys and data as values, for use with tf.Session.run() """ feed_dict = {} B, T, _ = data_bxtxd.shape feed_dict[self.dataName] = train_name feed_dict[self.dataset_ph] = data_bxtxd if self.ext_input is not None and ext_input_bxtxi is not None: feed_dict[self.ext_input] = ext_input_bxtxi if keep_prob is None: feed_dict[self.keep_prob] = self.hps.keep_prob else: feed_dict[self.keep_prob] = keep_prob return feed_dict @staticmethod def get_batch(data_extxd, ext_input_extxi=None, batch_size=None, example_idxs=None): """Get a batch of data, either randomly chosen, or specified directly. Args: data_extxd: The data to model, numpy tensors with shape: # examples x # time steps x # dimensions ext_input_extxi (optional): The external inputs, numpy tensor with shape: # examples x # time steps x # external input dimensions batch_size: The size of the batch to return example_idxs (optional): The example indices used to select examples. Returns: A tuple with two parts: 1. Batched data numpy tensor with shape: batch_size x # time steps x # dimensions 2. Batched external input numpy tensor with shape: batch_size x # time steps x # external input dims """ assert batch_size is not None or example_idxs is not None, "Problems" E, T, D = data_extxd.shape if example_idxs is None: example_idxs = np.random.choice(E, batch_size) ext_input_bxtxi = None if ext_input_extxi is not None: ext_input_bxtxi = ext_input_extxi[example_idxs,:,:] return data_extxd[example_idxs,:,:], ext_input_bxtxi @staticmethod def example_idxs_mod_batch_size(nexamples, batch_size): """Given a number of examples, E, and a batch_size, B, generate indices [0, 1, 2, ... B-1; [B, B+1, ... 2*B-1; ... ] returning those indices as a 2-dim tensor shaped like E/B x B. Note that shape is only correct if E % B == 0. If not, then an extra row is generated so that the remainder of examples is included. The extra examples are explicitly to to the zero index (see randomize_example_idxs_mod_batch_size) for randomized behavior. Args: nexamples: The number of examples to batch up. batch_size: The size of the batch. Returns: 2-dim tensor as described above. """ bmrem = batch_size - (nexamples % batch_size) bmrem_examples = [] if bmrem < batch_size: #bmrem_examples = np.zeros(bmrem, dtype=np.int32) ridxs = np.random.permutation(nexamples)[0:bmrem].astype(np.int32) bmrem_examples = np.sort(ridxs) example_idxs = range(nexamples) + list(bmrem_examples) example_idxs_e_x_edivb = np.reshape(example_idxs, [-1, batch_size]) return example_idxs_e_x_edivb, bmrem @staticmethod def randomize_example_idxs_mod_batch_size(nexamples, batch_size): """Indices 1:nexamples, randomized, in 2D form of shape = (nexamples / batch_size) x batch_size. The remainder is managed by drawing randomly from 1:nexamples. Args: nexamples: number of examples to randomize batch_size: number of elements in batch Returns: The randomized, properly shaped indicies. """ assert nexamples > batch_size, "Problems" bmrem = batch_size - nexamples % batch_size bmrem_examples = [] if bmrem < batch_size: bmrem_examples = np.random.choice(range(nexamples), size=bmrem, replace=False) example_idxs = range(nexamples) + list(bmrem_examples) mixed_example_idxs = np.random.permutation(example_idxs) example_idxs_e_x_edivb = np.reshape(mixed_example_idxs, [-1, batch_size]) return example_idxs_e_x_edivb, bmrem def shuffle_spikes_in_time(self, data_bxtxd): """Shuffle the spikes in the temporal dimension. 
This is useful to help the LFADS system avoid overfitting to individual spikes or fast oscillations found in the data that are irrelevant to behavior. A pure 'tabula rasa' approach would avoid this, but LFADS is sensitive enough to pick up dynamics that you may not want. Args: data_bxtxd: numpy array of spike count data to be shuffled. Returns: S_bxtxd, a numpy array with the same dimensions and contents as data_bxtxd, but shuffled appropriately. """ B, T, N = data_bxtxd.shape w = self.hps.temporal_spike_jitter_width if w == 0: return data_bxtxd max_counts = np.max(data_bxtxd) S_bxtxd = np.zeros([B,T,N]) # Intuitively, shuffle spike occurances, 0 or 1, but since we have counts, # Do it over and over again up to the max count. for mc in range(1,max_counts+1): idxs = np.nonzero(data_bxtxd >= mc) data_ones = np.zeros_like(data_bxtxd) data_ones[data_bxtxd >= mc] = 1 nfound = len(idxs[0]) shuffles_incrs_in_time = np.random.randint(-w, w, size=nfound) shuffle_tidxs = idxs[1].copy() shuffle_tidxs += shuffles_incrs_in_time # Reflect on the boundaries to not lose mass. shuffle_tidxs[shuffle_tidxs < 0] = -shuffle_tidxs[shuffle_tidxs < 0] shuffle_tidxs[shuffle_tidxs > T-1] = \ (T-1)-(shuffle_tidxs[shuffle_tidxs > T-1] -(T-1)) for iii in zip(idxs[0], shuffle_tidxs, idxs[2]): S_bxtxd[iii] += 1 return S_bxtxd def shuffle_and_flatten_datasets(self, datasets, kind='train'): """Since LFADS supports multiple datasets in the same dynamical model, we have to be careful to use all the data in a single training epoch. But since the datasets my have different data dimensionality, we cannot batch examples from data dictionaries together. Instead, we generate random batches within each data dictionary, and then randomize these batches while holding onto the dataname, so that when it's time to feed the graph, the correct in/out matrices can be selected, per batch. Args: datasets: A dict of data dicts. The dataset dict is simply a name(string)-> data dictionary mapping (See top of lfads.py). kind: 'train' or 'valid' Returns: A flat list, in which each element is a pair ('name', indices). """ batch_size = self.hps.batch_size ndatasets = len(datasets) random_example_idxs = {} epoch_idxs = {} all_name_example_idx_pairs = [] kind_data = kind + '_data' for name, data_dict in datasets.items(): nexamples, ntime, data_dim = data_dict[kind_data].shape epoch_idxs[name] = 0 random_example_idxs, _ = \ self.randomize_example_idxs_mod_batch_size(nexamples, batch_size) epoch_size = random_example_idxs.shape[0] names = [name] * epoch_size all_name_example_idx_pairs += zip(names, random_example_idxs) np.random.shuffle(all_name_example_idx_pairs) # shuffle in place return all_name_example_idx_pairs def train_epoch(self, datasets, batch_size=None, do_save_ckpt=True): """Train the model through the entire dataset once. Args: datasets: A dict of data dicts. The dataset dict is simply a name(string)-> data dictionary mapping (See top of lfads.py). batch_size (optional): The batch_size to use do_save_ckpt (optional): Should the routine save a checkpoint on this training epoch? Returns: A tuple with 6 float values: (total cost of the epoch, epoch reconstruction cost, epoch kl cost, KL weight used this training epoch, total l2 cost on generator, and the corresponding weight). 
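
    For example, train_model() unpacks this return value as:

      tr_total_cost, tr_recon_cost, tr_kl_cost, kl_weight, l2_cost, l2_weight = \
          self.train_epoch(datasets, do_save_ckpt=do_save_ckpt)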
""" ops_to_eval = [self.cost, self.recon_cost, self.kl_cost, self.kl_weight, self.l2_cost, self.l2_weight, self.train_op] collected_op_values = self.run_epoch(datasets, ops_to_eval, kind="train") total_cost = total_recon_cost = total_kl_cost = 0.0 # normalizing by batch done in distributions.py epoch_size = len(collected_op_values) for op_values in collected_op_values: total_cost += op_values[0] total_recon_cost += op_values[1] total_kl_cost += op_values[2] kl_weight = collected_op_values[-1][3] l2_cost = collected_op_values[-1][4] l2_weight = collected_op_values[-1][5] epoch_total_cost = total_cost / epoch_size epoch_recon_cost = total_recon_cost / epoch_size epoch_kl_cost = total_kl_cost / epoch_size if do_save_ckpt: session = tf.get_default_session() checkpoint_path = os.path.join(self.hps.lfads_save_dir, self.hps.checkpoint_name + '.ckpt') self.seso_saver.save(session, checkpoint_path, global_step=self.train_step) return epoch_total_cost, epoch_recon_cost, epoch_kl_cost, \ kl_weight, l2_cost, l2_weight def run_epoch(self, datasets, ops_to_eval, kind="train", batch_size=None, do_collect=True, keep_prob=None): """Run the model through the entire dataset once. Args: datasets: A dict of data dicts. The dataset dict is simply a name(string)-> data dictionary mapping (See top of lfads.py). ops_to_eval: A list of tensorflow operations that will be evaluated in the tf.session.run() call. batch_size (optional): The batch_size to use do_collect (optional): Should the routine collect all session.run output as a list, and return it? keep_prob (optional): The dropout keep probability. Returns: A list of lists, the internal list is the return for the ops for each session.run() call. The outer list collects over the epoch. """ hps = self.hps all_name_example_idx_pairs = \ self.shuffle_and_flatten_datasets(datasets, kind) kind_data = kind + '_data' kind_ext_input = kind + '_ext_input' total_cost = total_recon_cost = total_kl_cost = 0.0 session = tf.get_default_session() epoch_size = len(all_name_example_idx_pairs) evaled_ops_list = [] for name, example_idxs in all_name_example_idx_pairs: data_dict = datasets[name] data_extxd = data_dict[kind_data] if hps.output_dist == 'poisson' and hps.temporal_spike_jitter_width > 0: data_extxd = self.shuffle_spikes_in_time(data_extxd) ext_input_extxi = data_dict[kind_ext_input] data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd, ext_input_extxi, example_idxs=example_idxs) feed_dict = self.build_feed_dict(name, data_bxtxd, ext_input_bxtxi, keep_prob=keep_prob) evaled_ops_np = session.run(ops_to_eval, feed_dict=feed_dict) if do_collect: evaled_ops_list.append(evaled_ops_np) return evaled_ops_list def summarize_all(self, datasets, summary_values): """Plot and summarize stuff in tensorboard. Note that everything done in the current function is otherwise done on a single, randomly selected dataset (except for summary_values, which are passed in.) Args: datasets, the dictionary of datasets used in the study. summary_values: These summary values are created from the training loop, and so summarize the entire set of datasets. 
""" hps = self.hps tr_kl_cost = summary_values['tr_kl_cost'] tr_recon_cost = summary_values['tr_recon_cost'] tr_total_cost = summary_values['tr_total_cost'] kl_weight = summary_values['kl_weight'] l2_weight = summary_values['l2_weight'] l2_cost = summary_values['l2_cost'] has_any_valid_set = summary_values['has_any_valid_set'] i = summary_values['nepochs'] session = tf.get_default_session() train_summ, train_step = session.run([self.merged_train, self.train_step], feed_dict={self.l2_cost_ph:l2_cost, self.kl_cost_ph:tr_kl_cost, self.recon_cost_ph:tr_recon_cost, self.total_cost_ph:tr_total_cost}) self.writer.add_summary(train_summ, train_step) if has_any_valid_set: ev_kl_cost = summary_values['ev_kl_cost'] ev_recon_cost = summary_values['ev_recon_cost'] ev_total_cost = summary_values['ev_total_cost'] eval_summ = session.run(self.merged_valid, feed_dict={self.kl_cost_ph:ev_kl_cost, self.recon_cost_ph:ev_recon_cost, self.total_cost_ph:ev_total_cost}) self.writer.add_summary(eval_summ, train_step) print("Epoch:%d, step:%d (TRAIN, VALID): total: %.2f, %.2f\ recon: %.2f, %.2f, kl: %.2f, %.2f, l2: %.5f,\ kl weight: %.2f, l2 weight: %.2f" % \ (i, train_step, tr_total_cost, ev_total_cost, tr_recon_cost, ev_recon_cost, tr_kl_cost, ev_kl_cost, l2_cost, kl_weight, l2_weight)) csv_outstr = "epoch,%d, step,%d, total,%.2f,%.2f, \ recon,%.2f,%.2f, kl,%.2f,%.2f, l2,%.5f, \ klweight,%.2f, l2weight,%.2f\n"% \ (i, train_step, tr_total_cost, ev_total_cost, tr_recon_cost, ev_recon_cost, tr_kl_cost, ev_kl_cost, l2_cost, kl_weight, l2_weight) else: print("Epoch:%d, step:%d TRAIN: total: %.2f recon: %.2f, kl: %.2f,\ l2: %.5f, kl weight: %.2f, l2 weight: %.2f" % \ (i, train_step, tr_total_cost, tr_recon_cost, tr_kl_cost, l2_cost, kl_weight, l2_weight)) csv_outstr = "epoch,%d, step,%d, total,%.2f, recon,%.2f, kl,%.2f, \ l2,%.5f, klweight,%.2f, l2weight,%.2f\n"% \ (i, train_step, tr_total_cost, tr_recon_cost, tr_kl_cost, l2_cost, kl_weight, l2_weight) if self.hps.csv_log: csv_file = os.path.join(self.hps.lfads_save_dir, self.hps.csv_log+'.csv') with open(csv_file, "a") as myfile: myfile.write(csv_outstr) def plot_single_example(self, datasets): """Plot an image relating to a randomly chosen, specific example. We use posterior sample and average by taking one example, and filling a whole batch with that example, sample from the posterior, and then average the quantities. 
""" hps = self.hps all_data_names = datasets.keys() data_name = np.random.permutation(all_data_names)[0] data_dict = datasets[data_name] has_valid_set = True if data_dict['valid_data'] is not None else False cf = 1.0 # plotting concern # posterior sample and average here E, _, _ = data_dict['train_data'].shape eidx = np.random.choice(E) example_idxs = eidx * np.ones(hps.batch_size, dtype=np.int32) train_data_bxtxd, train_ext_input_bxtxi = \ self.get_batch(data_dict['train_data'], data_dict['train_ext_input'], example_idxs=example_idxs) truth_train_data_bxtxd = None if 'train_truth' in data_dict and data_dict['train_truth'] is not None: truth_train_data_bxtxd, _ = self.get_batch(data_dict['train_truth'], example_idxs=example_idxs) cf = data_dict['conversion_factor'] # plotter does averaging train_model_values = self.eval_model_runs_batch(data_name, train_data_bxtxd, train_ext_input_bxtxi, do_average_batch=False) train_step = train_model_values['train_steps'] feed_dict = self.build_feed_dict(data_name, train_data_bxtxd, train_ext_input_bxtxi, keep_prob=1.0) session = tf.get_default_session() generic_summ = session.run(self.merged_generic, feed_dict=feed_dict) self.writer.add_summary(generic_summ, train_step) valid_data_bxtxd = valid_model_values = valid_ext_input_bxtxi = None truth_valid_data_bxtxd = None if has_valid_set: E, _, _ = data_dict['valid_data'].shape eidx = np.random.choice(E) example_idxs = eidx * np.ones(hps.batch_size, dtype=np.int32) valid_data_bxtxd, valid_ext_input_bxtxi = \ self.get_batch(data_dict['valid_data'], data_dict['valid_ext_input'], example_idxs=example_idxs) if 'valid_truth' in data_dict and data_dict['valid_truth'] is not None: truth_valid_data_bxtxd, _ = self.get_batch(data_dict['valid_truth'], example_idxs=example_idxs) else: truth_valid_data_bxtxd = None # plotter does averaging valid_model_values = self.eval_model_runs_batch(data_name, valid_data_bxtxd, valid_ext_input_bxtxi, do_average_batch=False) example_image = plot_lfads(train_bxtxd=train_data_bxtxd, train_model_vals=train_model_values, train_ext_input_bxtxi=train_ext_input_bxtxi, train_truth_bxtxd=truth_train_data_bxtxd, valid_bxtxd=valid_data_bxtxd, valid_model_vals=valid_model_values, valid_ext_input_bxtxi=valid_ext_input_bxtxi, valid_truth_bxtxd=truth_valid_data_bxtxd, bidx=None, cf=cf, output_dist=hps.output_dist) example_image = np.expand_dims(example_image, axis=0) example_summ = session.run(self.merged_examples, feed_dict={self.example_image : example_image}) self.writer.add_summary(example_summ) def train_model(self, datasets): """Train the model, print per-epoch information, and save checkpoints. Loop over training epochs. The function that actually does the training is train_epoch. This function iterates over the training data, one epoch at a time. The learning rate schedule is such that it will stay the same until the cost goes up in comparison to the last few values, then it will drop. Args: datasets: A dict of data dicts. The dataset dict is simply a name(string)-> data dictionary mapping (See top of lfads.py). 
""" hps = self.hps has_any_valid_set = False for data_dict in datasets.values(): if data_dict['valid_data'] is not None: has_any_valid_set = True break session = tf.get_default_session() lr = session.run(self.learning_rate) lr_stop = hps.learning_rate_stop i = -1 train_costs = [] valid_costs = [] ev_total_cost = ev_recon_cost = ev_kl_cost = 0.0 lowest_ev_cost = np.Inf while True: i += 1 do_save_ckpt = True if i % 10 ==0 else False tr_total_cost, tr_recon_cost, tr_kl_cost, kl_weight, l2_cost, l2_weight = \ self.train_epoch(datasets, do_save_ckpt=do_save_ckpt) # Evaluate the validation cost, and potentially save. Note that this # routine will not save a validation checkpoint until the kl weight and # l2 weights are equal to 1.0. if has_any_valid_set: ev_total_cost, ev_recon_cost, ev_kl_cost = \ self.eval_cost_epoch(datasets, kind='valid') valid_costs.append(ev_total_cost) # > 1 may give more consistent results, but not the actual lowest vae. # == 1 gives the lowest vae seen so far. n_lve = 1 run_avg_lve = np.mean(valid_costs[-n_lve:]) # conditions for saving checkpoints: # KL weight must have finished stepping (>=1.0), AND # L2 weight must have finished stepping OR L2 is not being used, AND # the current run has a lower LVE than previous runs AND # len(valid_costs > n_lve) (not sure what that does) if kl_weight >= 1.0 and \ (l2_weight >= 1.0 or \ (self.hps.l2_gen_scale == 0.0 and self.hps.l2_con_scale == 0.0)) \ and (len(valid_costs) > n_lve and run_avg_lve < lowest_ev_cost): lowest_ev_cost = run_avg_lve checkpoint_path = os.path.join(self.hps.lfads_save_dir, self.hps.checkpoint_name + '_lve.ckpt') self.lve_saver.save(session, checkpoint_path, global_step=self.train_step, latest_filename='checkpoint_lve') # Plot and summarize. values = {'nepochs':i, 'has_any_valid_set': has_any_valid_set, 'tr_total_cost':tr_total_cost, 'ev_total_cost':ev_total_cost, 'tr_recon_cost':tr_recon_cost, 'ev_recon_cost':ev_recon_cost, 'tr_kl_cost':tr_kl_cost, 'ev_kl_cost':ev_kl_cost, 'l2_weight':l2_weight, 'kl_weight':kl_weight, 'l2_cost':l2_cost} self.summarize_all(datasets, values) self.plot_single_example(datasets) # Manage learning rate. train_res = tr_total_cost n_lr = hps.learning_rate_n_to_compare if len(train_costs) > n_lr and train_res > np.max(train_costs[-n_lr:]): _ = session.run(self.learning_rate_decay_op) lr = session.run(self.learning_rate) print(" Decreasing learning rate to %f." % lr) # Force the system to run n_lr times while at this lr. train_costs.append(np.inf) else: train_costs.append(train_res) if lr < lr_stop: print("Stopping optimization based on learning rate criteria.") break def eval_cost_epoch(self, datasets, kind='train', ext_input_extxi=None, batch_size=None): """Evaluate the cost of the epoch. Args: data_dict: The dictionary of data (training and validation) used for training and evaluation of the model, respectively. 
Returns: a 3 tuple of costs: (epoch total cost, epoch reconstruction cost, epoch KL cost) """ ops_to_eval = [self.cost, self.recon_cost, self.kl_cost] collected_op_values = self.run_epoch(datasets, ops_to_eval, kind=kind, keep_prob=1.0) total_cost = total_recon_cost = total_kl_cost = 0.0 # normalizing by batch done in distributions.py epoch_size = len(collected_op_values) for op_values in collected_op_values: total_cost += op_values[0] total_recon_cost += op_values[1] total_kl_cost += op_values[2] epoch_total_cost = total_cost / epoch_size epoch_recon_cost = total_recon_cost / epoch_size epoch_kl_cost = total_kl_cost / epoch_size return epoch_total_cost, epoch_recon_cost, epoch_kl_cost def eval_model_runs_batch(self, data_name, data_bxtxd, ext_input_bxtxi=None, do_eval_cost=False, do_average_batch=False): """Returns all the goodies for the entire model, per batch. If data_bxtxd and ext_input_bxtxi can have fewer than batch_size along dim 1 in which case this handles the padding and truncating automatically Args: data_name: The name of the data dict, to select which in/out matrices to use. data_bxtxd: Numpy array training data with shape: batch_size x # time steps x # dimensions ext_input_bxtxi: Numpy array training external input with shape: batch_size x # time steps x # external input dims do_eval_cost (optional): If true, the IWAE (Importance Weighted Autoencoder) log likeihood bound, instead of the VAE version. do_average_batch (optional): average over the batch, useful for getting good IWAE costs, and model outputs for a single data point. Returns: A dictionary with the outputs of the model decoder, namely: prior g0 mean, prior g0 variance, approx. posterior mean, approx posterior mean, the generator initial conditions, the control inputs (if enabled), the state of the generator, the factors, and the rates. """ session = tf.get_default_session() # if fewer than batch_size provided, pad to batch_size hps = self.hps batch_size = hps.batch_size E, _, _ = data_bxtxd.shape if E < hps.batch_size: data_bxtxd = np.pad(data_bxtxd, ((0, hps.batch_size-E), (0, 0), (0, 0)), mode='constant', constant_values=0) if ext_input_bxtxi is not None: ext_input_bxtxi = np.pad(ext_input_bxtxi, ((0, hps.batch_size-E), (0, 0), (0, 0)), mode='constant', constant_values=0) feed_dict = self.build_feed_dict(data_name, data_bxtxd, ext_input_bxtxi, keep_prob=1.0) # Non-temporal signals will be batch x dim. # Temporal signals are list length T with elements batch x dim. tf_vals = [self.gen_ics, self.gen_states, self.factors, self.output_dist_params] tf_vals.append(self.cost) tf_vals.append(self.nll_bound_vae) tf_vals.append(self.nll_bound_iwae) tf_vals.append(self.train_step) # not train_op! 
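    # Note: evaluation order matters -- the fidxs bookkeeping after the
    # session.run() call below unpacks np_vals_flat in exactly the same order
    # that tf_vals is assembled here.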
if self.hps.ic_dim > 0: tf_vals += [self.prior_zs_g0.mean, self.prior_zs_g0.logvar, self.posterior_zs_g0.mean, self.posterior_zs_g0.logvar] if self.hps.co_dim > 0: tf_vals.append(self.controller_outputs) tf_vals_flat, fidxs = flatten(tf_vals) np_vals_flat = session.run(tf_vals_flat, feed_dict=feed_dict) ff = 0 gen_ics = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 gen_states = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 factors = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 out_dist_params = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 costs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 nll_bound_vaes = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 nll_bound_iwaes = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1 train_steps = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1 if self.hps.ic_dim > 0: prior_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff +=1 prior_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 post_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 post_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 if self.hps.co_dim > 0: controller_outputs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1 # [0] are to take out the non-temporal items from lists gen_ics = gen_ics[0] costs = costs[0] nll_bound_vaes = nll_bound_vaes[0] nll_bound_iwaes = nll_bound_iwaes[0] train_steps = train_steps[0] # Convert to full tensors, not lists of tensors in time dim. gen_states = list_t_bxn_to_tensor_bxtxn(gen_states) factors = list_t_bxn_to_tensor_bxtxn(factors) out_dist_params = list_t_bxn_to_tensor_bxtxn(out_dist_params) if self.hps.ic_dim > 0: # select first time point prior_g0_mean = prior_g0_mean[0] prior_g0_logvar = prior_g0_logvar[0] post_g0_mean = post_g0_mean[0] post_g0_logvar = post_g0_logvar[0] if self.hps.co_dim > 0: controller_outputs = list_t_bxn_to_tensor_bxtxn(controller_outputs) # slice out the trials in case < batch_size provided if E < hps.batch_size: idx = np.arange(E) gen_ics = gen_ics[idx, :] gen_states = gen_states[idx, :] factors = factors[idx, :, :] out_dist_params = out_dist_params[idx, :, :] if self.hps.ic_dim > 0: prior_g0_mean = prior_g0_mean[idx, :] prior_g0_logvar = prior_g0_logvar[idx, :] post_g0_mean = post_g0_mean[idx, :] post_g0_logvar = post_g0_logvar[idx, :] if self.hps.co_dim > 0: controller_outputs = controller_outputs[idx, :, :] if do_average_batch: gen_ics = np.mean(gen_ics, axis=0) gen_states = np.mean(gen_states, axis=0) factors = np.mean(factors, axis=0) out_dist_params = np.mean(out_dist_params, axis=0) if self.hps.ic_dim > 0: prior_g0_mean = np.mean(prior_g0_mean, axis=0) prior_g0_logvar = np.mean(prior_g0_logvar, axis=0) post_g0_mean = np.mean(post_g0_mean, axis=0) post_g0_logvar = np.mean(post_g0_logvar, axis=0) if self.hps.co_dim > 0: controller_outputs = np.mean(controller_outputs, axis=0) model_vals = {} model_vals['gen_ics'] = gen_ics model_vals['gen_states'] = gen_states model_vals['factors'] = factors model_vals['output_dist_params'] = out_dist_params model_vals['costs'] = costs model_vals['nll_bound_vaes'] = nll_bound_vaes model_vals['nll_bound_iwaes'] = nll_bound_iwaes model_vals['train_steps'] = train_steps if self.hps.ic_dim > 0: model_vals['prior_g0_mean'] = prior_g0_mean model_vals['prior_g0_logvar'] = prior_g0_logvar model_vals['post_g0_mean'] = post_g0_mean model_vals['post_g0_logvar'] = post_g0_logvar if self.hps.co_dim > 0: model_vals['controller_outputs'] = controller_outputs return model_vals def eval_model_runs_avg_epoch(self, data_name, data_extxd, ext_input_extxi=None): """Returns all the expected value 
    of the goodies for the entire model.

    The expected value is taken over hidden (z) variables, namely the initial
    conditions and the control inputs.  The expected value is approximate: it
    is computed by drawing batch_size samples for each example.

    Args:
      data_name: The name of the data dict, to select which in/out matrices
        to use.
      data_extxd: Numpy array training data with shape:
        # examples x # time steps x # dimensions
      ext_input_extxi (optional): Numpy array training external input with
        shape: # examples x # time steps x # external input dims

    Returns:
      A dictionary with the averaged outputs of the model decoder, namely:
        prior g0 mean, prior g0 variance, approx. posterior mean, approx.
        posterior variance, the generator initial conditions, the control
        inputs (if enabled), the state of the generator, the factors, and the
        output distribution parameters, e.g. (rates or mean and variances).
    """
    hps = self.hps
    batch_size = hps.batch_size
    E, T, D = data_extxd.shape
    E_to_process = hps.ps_nexamples_to_process
    if E_to_process > E:
      E_to_process = E

    if hps.ic_dim > 0:
      prior_g0_mean = np.zeros([E_to_process, hps.ic_dim])
      prior_g0_logvar = np.zeros([E_to_process, hps.ic_dim])
      post_g0_mean = np.zeros([E_to_process, hps.ic_dim])
      post_g0_logvar = np.zeros([E_to_process, hps.ic_dim])

    if hps.co_dim > 0:
      controller_outputs = np.zeros([E_to_process, T, hps.co_dim])
    gen_ics = np.zeros([E_to_process, hps.gen_dim])
    gen_states = np.zeros([E_to_process, T, hps.gen_dim])
    factors = np.zeros([E_to_process, T, hps.factors_dim])

    if hps.output_dist == 'poisson':
      out_dist_params = np.zeros([E_to_process, T, D])
    elif hps.output_dist == 'gaussian':
      out_dist_params = np.zeros([E_to_process, T, D+D])
    else:
      assert False, "NIY"

    costs = np.zeros(E_to_process)
    nll_bound_vaes = np.zeros(E_to_process)
    nll_bound_iwaes = np.zeros(E_to_process)
    train_steps = np.zeros(E_to_process)
    for es_idx in range(E_to_process):
      print("Running %d of %d." % (es_idx+1, E_to_process))
      example_idxs = es_idx * np.ones(batch_size, dtype=np.int32)
      data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd,
                                                   ext_input_extxi,
                                                   batch_size=batch_size,
                                                   example_idxs=example_idxs)
      model_values = self.eval_model_runs_batch(data_name, data_bxtxd,
                                                ext_input_bxtxi,
                                                do_eval_cost=True,
                                                do_average_batch=True)

      if self.hps.ic_dim > 0:
        prior_g0_mean[es_idx,:] = model_values['prior_g0_mean']
        prior_g0_logvar[es_idx,:] = model_values['prior_g0_logvar']
        post_g0_mean[es_idx,:] = model_values['post_g0_mean']
        post_g0_logvar[es_idx,:] = model_values['post_g0_logvar']
      gen_ics[es_idx,:] = model_values['gen_ics']

      if self.hps.co_dim > 0:
        controller_outputs[es_idx,:,:] = model_values['controller_outputs']
      gen_states[es_idx,:,:] = model_values['gen_states']
      factors[es_idx,:,:] = model_values['factors']
      out_dist_params[es_idx,:,:] = model_values['output_dist_params']
      costs[es_idx] = model_values['costs']
      nll_bound_vaes[es_idx] = model_values['nll_bound_vaes']
      nll_bound_iwaes[es_idx] = model_values['nll_bound_iwaes']
      train_steps[es_idx] = model_values['train_steps']
      print('bound nll(vae): %.3f, bound nll(iwae): %.3f' \
            % (nll_bound_vaes[es_idx], nll_bound_iwaes[es_idx]))

    model_runs = {}
    if self.hps.ic_dim > 0:
      model_runs['prior_g0_mean'] = prior_g0_mean
      model_runs['prior_g0_logvar'] = prior_g0_logvar
      model_runs['post_g0_mean'] = post_g0_mean
      model_runs['post_g0_logvar'] = post_g0_logvar
    model_runs['gen_ics'] = gen_ics

    if self.hps.co_dim > 0:
      model_runs['controller_outputs'] = controller_outputs
    model_runs['gen_states'] = gen_states
    model_runs['factors'] = factors
    model_runs['output_dist_params'] = out_dist_params
    model_runs['costs'] = costs
    model_runs['nll_bound_vaes'] = nll_bound_vaes
    model_runs['nll_bound_iwaes'] = nll_bound_iwaes
    model_runs['train_steps'] = train_steps
    return model_runs

  def eval_model_runs_push_mean(self, data_name, data_extxd,
                                ext_input_extxi=None):
    """Returns values of interest for the model by pushing the means through
    the model.

    The mean values for both initial conditions and the control inputs are
    pushed through the model instead of sampling (as is done in
    eval_model_runs_avg_epoch).  This is a quick and approximate way of
    estimating these values, compared with sampling from the posterior many
    times and then averaging those values of interest.

    Internally, a total of batch_size trials are run through the model at once.

    Args:
      data_name: The name of the data dict, to select which in/out matrices
        to use.
      data_extxd: Numpy array training data with shape:
        # examples x # time steps x # dimensions
      ext_input_extxi (optional): Numpy array training external input with
        shape: # examples x # time steps x # external input dims

    Returns:
      A dictionary with the estimated outputs of the model decoder, namely:
        prior g0 mean, prior g0 variance, approx. posterior mean, approx.
        posterior variance, the generator initial conditions, the control
        inputs (if enabled), the state of the generator, the factors, and the
        output distribution parameters, e.g. (rates or mean and variances).
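
    Example (illustrative only; the model instance, dataset name, and data
    array below are hypothetical and assumed to already exist):
      model_runs = model.eval_model_runs_push_mean('my_dataset', data_extxd)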
""" hps = self.hps batch_size = hps.batch_size E, T, D = data_extxd.shape E_to_process = hps.ps_nexamples_to_process if E_to_process > E: print("Setting number of posterior samples to process to : ", E) E_to_process = E if hps.ic_dim > 0: prior_g0_mean = np.zeros([E_to_process, hps.ic_dim]) prior_g0_logvar = np.zeros([E_to_process, hps.ic_dim]) post_g0_mean = np.zeros([E_to_process, hps.ic_dim]) post_g0_logvar = np.zeros([E_to_process, hps.ic_dim]) if hps.co_dim > 0: controller_outputs = np.zeros([E_to_process, T, hps.co_dim]) gen_ics = np.zeros([E_to_process, hps.gen_dim]) gen_states = np.zeros([E_to_process, T, hps.gen_dim]) factors = np.zeros([E_to_process, T, hps.factors_dim]) if hps.output_dist == 'poisson': out_dist_params = np.zeros([E_to_process, T, D]) elif hps.output_dist == 'gaussian': out_dist_params = np.zeros([E_to_process, T, D+D]) else: assert False, "NIY" costs = np.zeros(E_to_process) nll_bound_vaes = np.zeros(E_to_process) nll_bound_iwaes = np.zeros(E_to_process) train_steps = np.zeros(E_to_process) # generator that will yield 0:N in groups of per items, e.g. # (0:per-1), (per:2*per-1), ..., with the last group containing <= per items # this will be used to feed per=batch_size trials into the model at a time def trial_batches(N, per): for i in range(0, N, per): yield np.arange(i, min(i+per, N), dtype=np.int32) for batch_idx, es_idx in enumerate(trial_batches(E_to_process, hps.batch_size)): print("Running trial batch %d with %d trials" % (batch_idx+1, len(es_idx))) data_bxtxd, ext_input_bxtxi = self.get_batch(data_extxd, ext_input_extxi, batch_size=batch_size, example_idxs=es_idx) model_values = self.eval_model_runs_batch(data_name, data_bxtxd, ext_input_bxtxi, do_eval_cost=True, do_average_batch=False) if self.hps.ic_dim > 0: prior_g0_mean[es_idx,:] = model_values['prior_g0_mean'] prior_g0_logvar[es_idx,:] = model_values['prior_g0_logvar'] post_g0_mean[es_idx,:] = model_values['post_g0_mean'] post_g0_logvar[es_idx,:] = model_values['post_g0_logvar'] gen_ics[es_idx,:] = model_values['gen_ics'] if self.hps.co_dim > 0: controller_outputs[es_idx,:,:] = model_values['controller_outputs'] gen_states[es_idx,:,:] = model_values['gen_states'] factors[es_idx,:,:] = model_values['factors'] out_dist_params[es_idx,:,:] = model_values['output_dist_params'] # TODO # model_values['costs'] and other costs come out as scalars, summed over # all the trials in the batch. what we want is the per-trial costs costs[es_idx] = model_values['costs'] nll_bound_vaes[es_idx] = model_values['nll_bound_vaes'] nll_bound_iwaes[es_idx] = model_values['nll_bound_iwaes'] train_steps[es_idx] = model_values['train_steps'] model_runs = {} if self.hps.ic_dim > 0: model_runs['prior_g0_mean'] = prior_g0_mean model_runs['prior_g0_logvar'] = prior_g0_logvar model_runs['post_g0_mean'] = post_g0_mean model_runs['post_g0_logvar'] = post_g0_logvar model_runs['gen_ics'] = gen_ics if self.hps.co_dim > 0: model_runs['controller_outputs'] = controller_outputs model_runs['gen_states'] = gen_states model_runs['factors'] = factors model_runs['output_dist_params'] = out_dist_params # You probably do not want the LL associated values when pushing the mean # instead of sampling. model_runs['costs'] = costs model_runs['nll_bound_vaes'] = nll_bound_vaes model_runs['nll_bound_iwaes'] = nll_bound_iwaes model_runs['train_steps'] = train_steps return model_runs def write_model_runs(self, datasets, output_fname=None, push_mean=False): """Run the model on the data in data_dict, and save the computed values. 

    LFADS generates a number of outputs for each example, and these are all
    saved.  They are:
      The mean and variance of the prior of g0.
      The mean and variance of the approximate posterior of g0.
      The control inputs (if enabled)
      The initial conditions, g0, for all examples.
      The generator states for all time.
      The factors for all time.
      The output distribution parameters (e.g. rates) for all time.

    Args:
      datasets: a dictionary of named data_dictionaries, see top of lfads.py
      output_fname: a file name stem for the output files.
      push_mean: If False (default), generates batch_size samples for each
        trial and averages the results.  If True, runs each trial once without
        noise, pushing the posterior mean initial conditions and control
        inputs through the trained model.  False is used for
        posterior_sample_and_average, True is used for posterior_push_mean.
    """
    hps = self.hps
    kind = hps.kind

    for data_name, data_dict in datasets.items():
      data_tuple = [('train', data_dict['train_data'],
                     data_dict['train_ext_input']),
                    ('valid', data_dict['valid_data'],
                     data_dict['valid_ext_input'])]
      for data_kind, data_extxd, ext_input_extxi in data_tuple:
        if not output_fname:
          fname = "model_runs_" + data_name + '_' + data_kind + '_' + kind
        else:
          fname = output_fname + data_name + '_' + data_kind + '_' + kind

        print("Writing data for %s data and kind %s." % (data_name, data_kind))
        if push_mean:
          model_runs = self.eval_model_runs_push_mean(data_name, data_extxd,
                                                      ext_input_extxi)
        else:
          model_runs = self.eval_model_runs_avg_epoch(data_name, data_extxd,
                                                      ext_input_extxi)
        full_fname = os.path.join(hps.lfads_save_dir, fname)
        write_data(full_fname, model_runs, compression='gzip')
        print("Done.")

  def write_model_samples(self, dataset_name, output_fname=None):
    """Use the prior distribution to generate batch_size samples from the
    model.

    LFADS generates a number of outputs for each sample, and these are all
    saved.  They are:
      The mean and variance of the prior of g0.
      The control inputs (if enabled)
      The initial conditions, g0, for all examples.
      The generator states for all time.
      The factors for all time.
      The output distribution parameters (e.g. rates) for all time.

    Args:
      dataset_name: The name of the dataset to grab the factors -> rates
        alignment matrices from.
      output_fname: The name of the file in which to save the generated
        samples.
    """
    hps = self.hps
    batch_size = hps.batch_size

    print("Generating %d samples" % (batch_size))
    tf_vals = [self.factors, self.gen_states, self.gen_ics,
               self.cost, self.output_dist_params]
    if hps.ic_dim > 0:
      tf_vals += [self.prior_zs_g0.mean, self.prior_zs_g0.logvar]
    if hps.co_dim > 0:
      tf_vals += [self.prior_zs_ar_con.samples_t]
    tf_vals_flat, fidxs = flatten(tf_vals)

    session = tf.get_default_session()
    feed_dict = {}
    feed_dict[self.dataName] = dataset_name
    feed_dict[self.keep_prob] = 1.0

    np_vals_flat = session.run(tf_vals_flat, feed_dict=feed_dict)

    ff = 0
    factors = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    gen_states = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    gen_ics = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    costs = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    output_dist_params = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    if hps.ic_dim > 0:
      prior_g0_mean = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
      prior_g0_logvar = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1
    if hps.co_dim > 0:
      prior_zs_ar_con = [np_vals_flat[f] for f in fidxs[ff]]; ff += 1

    # [0] is to take the non-temporal items out of their lists.
    gen_ics = gen_ics[0]
    costs = costs[0]

    # Convert to full tensors, not lists of tensors in time dim.
    gen_states = list_t_bxn_to_tensor_bxtxn(gen_states)
    factors = list_t_bxn_to_tensor_bxtxn(factors)
    output_dist_params = list_t_bxn_to_tensor_bxtxn(output_dist_params)
    if hps.ic_dim > 0:
      prior_g0_mean = prior_g0_mean[0]
      prior_g0_logvar = prior_g0_logvar[0]
    if hps.co_dim > 0:
      prior_zs_ar_con = list_t_bxn_to_tensor_bxtxn(prior_zs_ar_con)

    model_vals = {}
    model_vals['gen_ics'] = gen_ics
    model_vals['gen_states'] = gen_states
    model_vals['factors'] = factors
    model_vals['output_dist_params'] = output_dist_params
    model_vals['costs'] = costs.reshape(1)
    if hps.ic_dim > 0:
      model_vals['prior_g0_mean'] = prior_g0_mean
      model_vals['prior_g0_logvar'] = prior_g0_logvar
    if hps.co_dim > 0:
      model_vals['prior_zs_ar_con'] = prior_zs_ar_con

    full_fname = os.path.join(hps.lfads_save_dir, output_fname)
    write_data(full_fname, model_vals, compression='gzip')
    print("Done.")

  @staticmethod
  def eval_model_parameters(use_nested=True, include_strs=None):
    """Evaluate and return all of the TF variables in the model.

    Args:
      use_nested (optional): For returning values, use a nested dictionary,
        based on variable scoping, or return all variables in a flat
        dictionary.
      include_strs (optional): A list of strings to use as a filter, to reduce
        the number of variables returned.  A variable name must contain at
        least one string in include_strs as a sub-string in order to be
        returned.

    Returns:
      The parameters of the model.  This can be in a flat dictionary, or a
      nested dictionary, where the nesting is by variable scope.
    """
    all_tf_vars = tf.global_variables()
    session = tf.get_default_session()
    all_tf_vars_eval = session.run(all_tf_vars)
    vars_dict = {}
    strs = ["LFADS"]
    if include_strs:
      strs += include_strs

    for var, var_eval in zip(all_tf_vars, all_tf_vars_eval):
      if any(s in var.name for s in strs):
        if not isinstance(var_eval, np.ndarray):  # for H5PY
          print(var.name, "is not a numpy array, saving as a numpy array",
                "with value:", var_eval, type(var_eval))
          e = np.array(var_eval)
          print(e, type(e))
        else:
          e = var_eval
        vars_dict[var.name] = e

    if not use_nested:
      return vars_dict

    var_names = vars_dict.keys()
    nested_vars_dict = {}
    current_dict = nested_vars_dict
    for var_name in var_names:
      var_split_name_list = var_name.split('/')
      split_name_list_len = len(var_split_name_list)
      current_dict = nested_vars_dict
      for p, part in enumerate(var_split_name_list):
        if p < split_name_list_len - 1:
          if part in current_dict:
            current_dict = current_dict[part]
          else:
            current_dict[part] = {}
            current_dict = current_dict[part]
        else:
          current_dict[part] = vars_dict[var_name]

    return nested_vars_dict

  @staticmethod
  def spikify_rates(rates_bxtxd):
    """Randomly spikify underlying rates according to a Poisson distribution.

    Args:
      rates_bxtxd: a numpy array of underlying rates with shape:
        # batch x # time steps x # dimensions

    Returns:
      A numpy array with the same shape as rates_bxtxd, but with the event
      counts.
    """
    B, T, N = rates_bxtxd.shape
    assert all([B > 0, N > 0]), "rates_bxtxd must have nonzero batch and data dims"

    # The rate can change at every (b, t, n), so each count is sampled
    # individually.
    spikes_bxtxd = np.zeros([B, T, N], dtype=np.int32)
    for b in range(B):
      for t in range(T):
        for n in range(N):
          rate = rates_bxtxd[b, t, n]
          count = np.random.poisson(rate)
          spikes_bxtxd[b, t, n] = count

    return spikes_bxtxd
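
# An illustrative usage sketch for spikify_rates (kept as a comment so nothing
# runs at import time).  It assumes the enclosing model class is named LFADS,
# as suggested by this module's docstring; the rates below are made-up data:
#
#   import numpy as np
#   rates_bxtxd = 0.05 * np.ones([2, 100, 30])        # batch x time x units
#   spikes_bxtxd = LFADS.spikify_rates(rates_bxtxd)    # Poisson counts
#   assert spikes_bxtxd.shape == rates_bxtxd.shape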