/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
#define LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
#include <cstdint>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>
#include "absl/memory/memory.h"
#include "glog/logging.h"
#include "sparse_matmul/layers/csr_blocksparse_matrix.h"
#include "sparse_matmul/layers/masked_sparse_matrix.h"
#include "sparse_matmul/numerics/type_utils.h"
#include "sparse_matmul/os/coop_threads.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"

namespace csrblocksparse {
template <typename WeightType, typename RhsType,
typename BiasType = typename TypeOfProduct<WeightType, RhsType>::type,
typename DeltaType = int16_t>
class SparseLinearLayer {
public:
SparseLinearLayer() {}
SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
CacheAlignedVector<BiasType>&& bias)
: sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
// Some kernels expect that the bias is divided by 4, so we store a second
// copy of a quarter of the bias.
// TODO(b/189958858): Remove the quartered bias if it can be done without
// loss of speed, and rename the |full_bias_| member back to |bias_|.
bias_ = full_bias_;
for (int i = 0; i < bias_.size(); ++i) {
bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
}
}
SparseLinearLayer(
const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
*this = src;
}
SparseLinearLayer& operator=(
const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
sparse_matrix_ = src.sparse_matrix_;
bias_ = src.bias_;
full_bias_ = src.full_bias_;
mid_output_ = src.mid_output_;
thread_layers_ = src.thread_layers_;
num_threads_ = src.num_threads_;
if (src.split_pc_) {
split_pc_ = absl::make_unique<ProducerConsumer>(
src.split_pc_->num_producers(), src.split_pc_->num_consumers());
}
return *this;
}
// Does Ax + b where A is a block sparse compressed sparse row matrix and
// x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
// broadcast if rhs has more than one column.
template <typename RhsClassType, typename OutType>
void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
int tid = 0, SpinBarrier* barrier = nullptr) const {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
}
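  // Example (illustrative sketch; |x|, |y|, and the float types are
  // assumptions, not part of the API): a single-threaded y = Ax + b using
  // the CreateRandomLayer factory defined at the bottom of this file:
  //   auto layer = csrblocksparse::CreateRandomLayer<float, float>(
  //       /*rows=*/256, /*cols=*/256, /*sparsity=*/0.9f);
  //   csrblocksparse::CacheAlignedVector<float> x(layer.cols());
  //   x.FillRandom();
  //   csrblocksparse::CacheAlignedVector<float> y(layer.rows());
  //   layer.SpMM_bias(x, &y, /*relu=*/true);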
// Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
// and then samples from the output (softmax distribution) layer.
template <typename RhsClassType, typename OutType>
int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out, float temperature,
int tid, SpinBarrier* barrier, std::minstd_rand* gen,
CacheAlignedVector<float>* scratch) const {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
barrier, gen, scratch);
}
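  // Example (illustrative sketch; reuses the hypothetical |layer| and |x|
  // from the example above, and |logits|/|scratch| are hypothetical names):
  // sampling an output index from the softmax of the logits, single-threaded:
  //   std::minstd_rand gen;
  //   csrblocksparse::CacheAlignedVector<float> logits(layer.rows());
  //   csrblocksparse::CacheAlignedVector<float> scratch(layer.rows());
  //   int sample = layer.SpMM_bias_Sample(x, &logits, /*temperature=*/1.f,
  //                                       /*tid=*/0, /*barrier=*/nullptr,
  //                                       &gen, &scratch);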
template <typename RhsClassType, typename OutType>
void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
int output_stride, OutType* output,
SpinBarrier* barrier = nullptr) {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
#ifdef __AVX2__
if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
!IsCustomFloatType<WeightType>::value) {
if (!IsSplit()) {
sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
tid, replicas, output_stride, output->data());
if (barrier != nullptr) barrier->barrier();
return;
}
// NOTE: Until the quartered bias is removed it is a bad idea to split
// for ARM in the same way, as we would have to quarter the output of
// the first part of the split before running the second part.
// Signal completion of the previous MatVec.
split_pc_->produce();
PartLinearLayer& thread_part = thread_layers_[tid];
auto offset_output =
sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
auto mid_output =
sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
mid_output_.cast_data(), tid);
// We can continue to consume the data that this thread produced and
// compute just the |self_matrix| part.
// No |relu| or |replicas|, as this is only a partial matmul.
// |tid| is always zero because the matrix has been split by tid.
thread_part.self_matrix.MatVec(
rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
/*tid=*/0, /*replicas=*/1, output_stride, mid_output);
// We have to wait for the other threads to finish working on the previous
// MatMul before consuming the rest of |rhs|.
split_pc_->consume();
thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
/*tid=*/0, replicas, output_stride,
offset_output);
return;
}
#endif
DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
if (IsSplit()) {
      // Generics aren't set up to use a split matrix. This will be
      // inefficient.
split_pc_->produce();
split_pc_->consume();
}
if (block_height() == 8) {
// We are currently forced to use MatVec generics for this case.
LOG(WARNING) << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
replicas, output_stride, output->data());
if (barrier != nullptr) barrier->barrier();
} else {
sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
}
}
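  // Example (illustrative sketch; reuses the hypothetical |layer|, |x|, |y|
  // from above): a single-threaded MatVec with one replica, writing rows()
  // contiguous outputs. Note that the non-AVX2 fallback requires
  // |replicas| == 1:
  //   layer.PrepareForThreads(1);
  //   layer.MatVec(x, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
  //                /*output_stride=*/layer.rows(), &y);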
int rows() const { return sparse_matrix_.rows(); }
int cols() const { return sparse_matrix_.cols(); }
float sparsity() const { return sparse_matrix_.sparsity(); }
int block_width() const { return sparse_matrix_.block_width(); }
int block_height() const { return sparse_matrix_.block_height(); }
int num_threads() const { return sparse_matrix_.num_threads(); }
const CacheAlignedVector<BiasType>& bias() const { return bias_; }
const std::vector<int>& split_points() const {
return sparse_matrix_.split_points();
}
bool IsSplit() const {
return !thread_layers_.empty() && split_pc_ != nullptr;
}
std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
void Print() const {
printf("Matrix\n");
sparse_matrix_.Print();
printf("Bias\n");
bias_.Print();
}
// Combines adjacent row blocks, doubling the block height.
// This necessarily involves adding zero weights where the blocks don't align
  // across adjacent pairs of rows, so use with caution: the resulting matmul
  // is likely to run slower if the matrix was very sparse to begin with.
// In the few cases where the blocks do mostly align, the resulting matmul
// could be much faster, as the number of reads of the rhs will be halved.
void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }
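  // For example, a matrix stored as 4x4 blocks becomes one stored as 8x4
  // blocks: each block row then covers 8 output rows, so the rhs is read
  // half as many times, at the cost of whatever zero padding is needed to
  // merge misaligned blocks.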
  // |cache_line_size| is provided only for testing; normally the value for
  // the current architecture is used.
int PrepareForThreads(int num_threads, int cache_line_size = -1) {
num_threads_ = num_threads;
if (num_threads_ > 1) {
split_pc_ =
absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
} else {
split_pc_.reset(nullptr);
}
return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
}
  // Partitions the matrix into pieces by thread.
  // Each thread can immediately compute the part of its output that depends
  // only on rhs inputs it generated itself in the previous matvec, with no
  // thread synchronization; only after that must it wait for the other
  // threads to finish the previous matvec.
  // We therefore split the matrix, using the |split_points| of the previous
  // matrix, into 2 * |num_threads_| pieces: a |self_matrix| and an
  // |other_matrix| part per thread, holding the columns that can be computed
  // before and after the other threads have finished the previous matvec,
  // respectively.
  // A ProducerConsumer lock, rather than a SpinBarrier, is then used to
  // synchronize on the data produced by the other threads.
void SliceForThreads(const std::vector<int>& split_points) {
thread_layers_.clear();
thread_layers_.reserve(num_threads_);
LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
<< num_threads_ << " threads";
for (int tid = 0; tid < num_threads_; ++tid) {
thread_layers_.emplace_back(
sparse_matrix_, full_bias_, bias_, tid,
split_points[tid] * sparse_matrix_.block_height(),
split_points[tid + 1] * sparse_matrix_.block_height());
}
    mid_output_ = csrblocksparse::CacheAlignedVector<BiasType>(rows());
    mid_output_.FillZero();
}
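  // Example (illustrative sketch; |prev_layer| and kNumThreads are
  // hypothetical): typical setup when this layer's rhs is produced by a
  // preceding layer running on the same threads:
  //   layer.PrepareForThreads(kNumThreads);
  //   layer.SliceForThreads(prev_layer.split_points());
  // Afterwards the AVX2 MatVec path computes each thread's |self_matrix|
  // part first, then waits on |split_pc_| before computing |other_matrix|.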
// Splits the layer by inputs into 2 equal pieces. Each of the resulting
// layers should be computed independently on the first and second halves of
// the inputs respectively and the results added to achieve the same effect
// as the original layer.
void SplitInputs(
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
sparse_matrix_.cols()));
    *part1 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix1), CacheAlignedVector<BiasType>(full_bias_));
CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
bias2.FillZero();
    *part2 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix2), std::move(bias2));
}
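  // Example (illustrative sketch; hypothetical names): the two halves are
  // applied to the two halves of the input and their outputs summed:
  //   SparseLinearLayer<float, float> first, second;
  //   layer.SplitInputs(&first, &second);
  //   first.SpMM_bias(rhs_first_half, &out);     // Carries the full bias.
  //   second.SpMM_bias(rhs_second_half, &part);  // Zero bias.
  //   // Elementwise out[i] += part[i] reproduces layer.SpMM_bias(rhs, &out).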
// Splits the layer by outputs into 2 equal pieces. Each of the resulting
// layers should be computed independently on the full inputs and the results
// concatenated to achieve the same effect as the original layer.
void SplitOutputs(
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
LOG(INFO) << "input rows=" << sparse_matrix_.rows()
<< ", cols=" << sparse_matrix_.cols();
CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
CsrBlockSparseMatrix<WeightType, RhsType> matrix2(sparse_matrix_.SplitByRow(
sparse_matrix_.rows() / 2, sparse_matrix_.rows()));
CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
    *part1 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix1), std::move(bias1));
CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
full_bias_.size());
    *part2 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix2), std::move(bias2));
}
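  // Example (illustrative sketch; hypothetical names): both halves see the
  // full input and their outputs are concatenated:
  //   SparseLinearLayer<float, float> top, bottom;
  //   layer.SplitOutputs(&top, &bottom);
  //   top.SpMM_bias(rhs, &out_top);        // First rows()/2 outputs.
  //   bottom.SpMM_bias(rhs, &out_bottom);  // Last rows()/2 outputs.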

 private:
// Simple struct to hold a partitioned layer.
struct PartLinearLayer {
    // The original matrix is first split by row to keep only the outputs for
    // the given tid. That |row_sub_matrix| is then split by column into two
    // partitions:
    // |self_matrix| covers the columns [|start_col|, |end_col|) whose rhs
    // elements were generated by this thread in some previous matmul.
    // |other_matrix| covers the remaining columns, whose rhs elements come
    // from other threads.
    // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
const CacheAlignedVector<BiasType>& bias,
const CacheAlignedVector<BiasType>& bias_4, int tid,
int start_col, int end_col) {
int block_height = matrix.block_height();
// Split the input matrix by row, selecting only the rows relevant to
// thread tid.
int start_row = matrix.split_points()[tid] * block_height;
int end_row = matrix.split_points()[tid + 1] * block_height;
LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
<< start_row << "," << end_row << ")";
CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
matrix.SplitByRow(start_row, end_row);
// Partition into the columns that use rhs elements that thread tid
// produced in a previous matmul, and the other rhs elements.
      // NOTE that we set |keep_rhs_size|=true so that both matrices can
      // operate on the same rhs input vector; |self_matrix| simply guarantees
      // not to access any elements that are generated by another thread.
      self_matrix = row_sub_matrix.SplitByColumn(start_col, end_col,
                                                 /*keep_rhs_size=*/true);
      self_matrix.PrepareForThreads(1);
      // The reversed start and end slice out the complement of [start, end).
      other_matrix = row_sub_matrix.SplitByColumn(end_col, start_col,
                                                  /*keep_rhs_size=*/true);
      other_matrix.PrepareForThreads(1);
      full_bias = CacheAlignedVector<BiasType>(bias, start_row, end_row);
      // TODO(b/189958858): Eliminate the quarter bias from all the code.
      quarter_bias = CacheAlignedVector<BiasType>(bias_4, start_row, end_row);
}
// The part of the matrix that only depends on this thread for rhs inputs.
CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
CacheAlignedVector<BiasType> full_bias;
CacheAlignedVector<BiasType> quarter_bias;
// The part of the matrix that uses rhs inputs from other threads.
CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
};
CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
CacheAlignedVector<BiasType> bias_;
CacheAlignedVector<BiasType> full_bias_;
  // Output from |self_matrix| that will be given to |other_matrix| as bias.
CacheAlignedVector<BiasType> mid_output_;
// One partitioned pair of matrices for each thread.
std::vector<PartLinearLayer> thread_layers_;
// Producer-consumer lock used to wait between computing |self_matrix| and
// |other_matrix| for the other threads to finish the *previous* matvec.
std::unique_ptr<ProducerConsumer> split_pc_;
int num_threads_ = 0;
};

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
float sparsity,
int block_height = 1,
int block_width = 1) {
typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
CacheAlignedVector<BiasType> bias(rows);
bias.FillRandom();
auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
block_height, block_width);
auto sparse_matrix = CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
std::move(bias));
}

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
int rows, int cols, float sparsity, float constant = 1.f) {
typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
CacheAlignedVector<BiasType> bias(rows);
bias.FillOnes();
MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
/*block_height=*/1, /*block_width=*/1,
constant, /*random=*/false);
CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
std::move(bias));
}
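// Example (illustrative sketch): constant layers are convenient in tests,
// since every surviving weight equals |constant| and the bias is all ones:
//   auto layer = csrblocksparse::CreateConstantLayer<float, float>(
//       /*rows=*/16, /*cols=*/16, /*sparsity=*/0.5f, /*constant=*/2.f);
//   layer.Print();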
} // namespace csrblocksparse
#endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_