/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
#define LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
#include <cstddef>
#include <cstdint>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "absl/memory/memory.h"
#include "glog/logging.h"
#include "sparse_matmul/layers/csr_blocksparse_matrix.h"
#include "sparse_matmul/layers/masked_sparse_matrix.h"
#include "sparse_matmul/numerics/type_utils.h"
#include "sparse_matmul/os/coop_threads.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"
namespace csrblocksparse {
template <typename WeightType, typename RhsType,
typename BiasType = typename TypeOfProduct<WeightType, RhsType>::type,
typename DeltaType = int16_t>
class SparseLinearLayer {
public:
SparseLinearLayer() {}
SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
CacheAlignedVector<BiasType>&& bias)
: sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
// Some kernels expect that the bias is divided by 4, so we store a second
// copy of a quarter of the bias.
// TODO(b/189958858): Remove the quartered bias if it can be done without
// loss of speed, and rename the |full_bias_| member back to |bias_|.
bias_ = full_bias_;
for (int i = 0; i < bias_.size(); ++i) {
bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
}
}
SparseLinearLayer(
const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
*this = src;
}
SparseLinearLayer& operator=(
const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
sparse_matrix_ = src.sparse_matrix_;
bias_ = src.bias_;
full_bias_ = src.full_bias_;
mid_output_ = src.mid_output_;
thread_layers_ = src.thread_layers_;
num_threads_ = src.num_threads_;
if (src.split_pc_) {
split_pc_ = absl::make_unique<ProducerConsumer>(
src.split_pc_->num_producers(), src.split_pc_->num_consumers());
}
return *this;
}
// Does Ax + b where A is a block sparse compressed sparse row matrix and
// x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
// broadcast if rhs has more than one column.
template <typename RhsClassType, typename OutType>
void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
int tid = 0, SpinBarrier* barrier = nullptr) const {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
}
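  // Example (a minimal usage sketch, not part of the library): build a small
  // random layer with CreateRandomLayer (defined below) and run a
  // single-threaded Ax + b. We assume here that CacheAlignedVector<float> is
  // an acceptable rhs/output type, i.e. that its value_type matches RhsType;
  // this header does not guarantee that for every instantiation.
  //
  //   auto layer = csrblocksparse::CreateRandomLayer<float, float>(
  //       /*rows=*/64, /*cols=*/64, /*sparsity=*/0.9f);
  //   csrblocksparse::CacheAlignedVector<float> rhs(layer.cols());
  //   rhs.FillRandom();
  //   csrblocksparse::CacheAlignedVector<float> out(layer.rows());
  //   layer.SpMM_bias(rhs, &out, /*relu=*/true);  // tid=0, no barrier.
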
// Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
// and then samples from the output (softmax distribution) layer.
template <typename RhsClassType, typename OutType>
int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out, float temperature,
int tid, SpinBarrier* barrier, std::minstd_rand* gen,
CacheAlignedVector<float>* scratch) const {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
barrier, gen, scratch);
}
template <typename RhsClassType, typename OutType>
void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
int output_stride, OutType* output,
SpinBarrier* barrier = nullptr) {
static_assert(
std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
#ifdef __AVX2__
if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
!IsCustomFloatType<WeightType>::value) {
if (!IsSplit()) {
sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
tid, replicas, output_stride, output->data());
if (barrier != nullptr) barrier->barrier();
return;
}
// NOTE: Until the quartered bias is removed it is a bad idea to split
// for ARM in the same way, as we would have to quarter the output of
// the first part of the split before running the second part.
// Signal completion of the previous MatVec.
split_pc_->produce();
PartLinearLayer& thread_part = thread_layers_[tid];
auto offset_output =
sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
auto mid_output =
sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
mid_output_.cast_data(), tid);
// We can continue to consume the data that this thread produced and
// compute just the |self_matrix| part.
// No |relu| or |replicas|, as this is only a partial matmul.
// |tid| is always zero because the matrix has been split by tid.
thread_part.self_matrix.MatVec(
rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
/*tid=*/0, /*replicas=*/1, output_stride, mid_output);
// We have to wait for the other threads to finish working on the previous
// MatMul before consuming the rest of |rhs|.
split_pc_->consume();
thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
/*tid=*/0, replicas, output_stride,
offset_output);
return;
}
#endif
DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
if (IsSplit()) {
      // Generics aren't set up to use a split matrix. This will be inefficient.
split_pc_->produce();
split_pc_->consume();
}
if (block_height() == 8) {
// We are currently forced to use MatVec generics for this case.
LOG(WARNING) << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
replicas, output_stride, output->data());
if (barrier != nullptr) barrier->barrier();
} else {
sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
}
}
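  // Example (a sketch under our own assumptions, not a guaranteed contract):
  // both the AVX2 fast path and the generic fallback are reached through the
  // same call. We assume that with |replicas| == 1 the |output_stride| value
  // only matters for replicated outputs, and we reuse |layer|, |rhs| and
  // |out| from the SpMM_bias sketch above.
  //
  //   layer.PrepareForThreads(1);
  //   layer.MatVec(rhs, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
  //                /*output_stride=*/layer.rows(), &out);
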
int rows() const { return sparse_matrix_.rows(); }
int cols() const { return sparse_matrix_.cols(); }
float sparsity() const { return sparse_matrix_.sparsity(); }
int block_width() const { return sparse_matrix_.block_width(); }
int block_height() const { return sparse_matrix_.block_height(); }
int num_threads() const { return sparse_matrix_.num_threads(); }
const CacheAlignedVector<BiasType>& bias() const { return bias_; }
const std::vector<int>& split_points() const {
return sparse_matrix_.split_points();
}
bool IsSplit() const {
return !thread_layers_.empty() && split_pc_ != nullptr;
}
std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
void Print() const {
printf("Matrix\n");
sparse_matrix_.Print();
printf("Bias\n");
bias_.Print();
}
// Combines adjacent row blocks, doubling the block height.
// This necessarily involves adding zero weights where the blocks don't align
  // across adjacent pairs of rows, so use with caution: the resulting matrix
  // will most likely run slower if it was very sparse to begin with.
// In the few cases where the blocks do mostly align, the resulting matmul
// could be much faster, as the number of reads of the rhs will be halved.
void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }
  // |cache_line_size| is provided only for testing; normally a value
  // appropriate to the current architecture is used.
int PrepareForThreads(int num_threads, int cache_line_size = -1) {
num_threads_ = num_threads;
if (num_threads_ > 1) {
split_pc_ =
absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
} else {
split_pc_.reset(nullptr);
}
return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
}
// Partitions the matrix into pieces by thread.
// In this matrix, we can go ahead and calculate the part that only depends
// on rhs inputs that were generated by this thread in the previous matvec,
// without having to use any thread synchronization, and only after that do we
// have to wait for the other threads to finish the previous matvec.
// So we split the matrix using the |split_points| from the previous matrix
// into 2 * |num_threads_| pieces: self and other for each thread, being the
// parts that can be calculated before and after the other threads have
// completed their calculation of the previous matvec.
// We then have to use a ProducerConsumer lock instead of a SpinBarrier to
// synchronize the data produced by the other threads.
void SliceForThreads(const std::vector<int>& split_points) {
thread_layers_.clear();
thread_layers_.reserve(num_threads_);
LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
<< num_threads_ << " threads";
for (int tid = 0; tid < num_threads_; ++tid) {
thread_layers_.emplace_back(
sparse_matrix_, full_bias_, bias_, tid,
split_points[tid] * sparse_matrix_.block_height(),
split_points[tid + 1] * sparse_matrix_.block_height());
}
mid_output_ =
std::move(csrblocksparse::CacheAlignedVector<BiasType>(rows()));
mid_output_.FillZero();
}
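  // Example (a sketch; the layer pairing and kNumThreads are hypothetical):
  // |prev_layer| is the layer whose output becomes this layer's rhs, so its
  // split points define which rhs elements each thread produced. Both layers
  // must be prepared for the same number of threads before slicing.
  //
  //   prev_layer.PrepareForThreads(kNumThreads);
  //   layer.PrepareForThreads(kNumThreads);
  //   layer.SliceForThreads(prev_layer.split_points());
  //   // Per-thread layer.MatVec(...) calls now use the self/other split and
  //   // the internal ProducerConsumer instead of a plain SpinBarrier.
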
// Splits the layer by inputs into 2 equal pieces. Each of the resulting
// layers should be computed independently on the first and second halves of
// the inputs respectively and the results added to achieve the same effect
// as the original layer.
void SplitInputs(
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
sparse_matrix_.cols()));
*part1 =
std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
std::move(matrix1),
std::move(CacheAlignedVector<BiasType>(full_bias_))));
CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
bias2.FillZero();
*part2 =
std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
std::move(matrix2), std::move(bias2)));
}
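  // Example (a minimal sketch of the contract above; |rhs_lo| and |rhs_hi|
  // are hypothetical half-size vectors holding the first and second halves
  // of the original rhs, and the accumulation loop assumes the output type
  // exposes data() and size()):
  //
  //   SparseLinearLayer<float, float> part1, part2;
  //   layer.SplitInputs(&part1, &part2);
  //   csrblocksparse::CacheAlignedVector<float> out1(layer.rows());
  //   csrblocksparse::CacheAlignedVector<float> out2(layer.rows());
  //   part1.SpMM_bias(rhs_lo, &out1);
  //   part2.SpMM_bias(rhs_hi, &out2);
  //   for (int i = 0; i < out1.size(); ++i) out1.data()[i] += out2.data()[i];
  //   // out1 now matches the original layer's output on the full rhs.
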
// Splits the layer by outputs into 2 equal pieces. Each of the resulting
// layers should be computed independently on the full inputs and the results
// concatenated to achieve the same effect as the original layer.
void SplitOutputs(
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
LOG(INFO) << "input rows=" << sparse_matrix_.rows()
<< ", cols=" << sparse_matrix_.cols();
CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
CsrBlockSparseMatrix<WeightType, RhsType> matrix2(sparse_matrix_.SplitByRow(
sparse_matrix_.rows() / 2, sparse_matrix_.rows()));
CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
*part1 =
std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
std::move(matrix1), std::move(bias1)));
CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
full_bias_.size());
*part2 =
std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
std::move(matrix2), std::move(bias2)));
}
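  // Example (a sketch of the contract above, with hypothetical names; |rhs|
  // is the full-size input vector):
  //
  //   SparseLinearLayer<float, float> top, bottom;
  //   layer.SplitOutputs(&top, &bottom);
  //   csrblocksparse::CacheAlignedVector<float> out_top(top.rows());
  //   csrblocksparse::CacheAlignedVector<float> out_bottom(bottom.rows());
  //   top.SpMM_bias(rhs, &out_top);        // First rows() / 2 outputs.
  //   bottom.SpMM_bias(rhs, &out_bottom);  // Last rows() / 2 outputs.
  //   // Concatenating out_top and out_bottom reproduces the original
  //   // layer's output on |rhs|.
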
private:
// Simple struct to hold a partitioned layer.
struct PartLinearLayer {
// The original matrix is first split by row to generate only the outputs
// for the given tid. The |row_sub_matrix| is then split by column into two
// partitions:
    // |self_matrix| is the part for which the rhs elements in
    // [|start_col|, |end_col|) were generated by this thread in some
    // previous matmul.
    // |other_matrix| is the rest of the columns, which require rhs elements
    // from other threads.
    // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
const CacheAlignedVector<BiasType>& bias,
const CacheAlignedVector<BiasType>& bias_4, int tid,
int start_col, int end_col) {
int block_height = matrix.block_height();
// Split the input matrix by row, selecting only the rows relevant to
// thread tid.
int start_row = matrix.split_points()[tid] * block_height;
int end_row = matrix.split_points()[tid + 1] * block_height;
LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
<< start_row << "," << end_row << ")";
CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
matrix.SplitByRow(start_row, end_row);
// Partition into the columns that use rhs elements that thread tid
// produced in a previous matmul, and the other rhs elements.
// NOTE that we |keep_rhs_size|=true so that each matrix can operate on
// the same rhs input vector. The self matrix just guarantees not to
// access any of the elements that are generated by another thread.
self_matrix = std::move(row_sub_matrix.SplitByColumn(
start_col, end_col, /*keep_rhs_size=*/true));
self_matrix.PrepareForThreads(1);
// The reversed start and end slice out the complement of [start, end).
other_matrix = std::move(row_sub_matrix.SplitByColumn(
end_col, start_col, /*keep_rhs_size=*/true));
other_matrix.PrepareForThreads(1);
full_bias =
std::move(CacheAlignedVector<BiasType>(bias, start_row, end_row));
// TODO(b/189958858): Eliminate the quarter bias from all the code.
quarter_bias =
std::move(CacheAlignedVector<BiasType>(bias_4, start_row, end_row));
}
// The part of the matrix that only depends on this thread for rhs inputs.
CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
CacheAlignedVector<BiasType> full_bias;
CacheAlignedVector<BiasType> quarter_bias;
// The part of the matrix that uses rhs inputs from other threads.
CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
};
CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
CacheAlignedVector<BiasType> bias_;
CacheAlignedVector<BiasType> full_bias_;
// Output from the self_matrix that will be given to |other_matrix| as bias.
CacheAlignedVector<BiasType> mid_output_;
// One partitioned pair of matrices for each thread.
std::vector<PartLinearLayer> thread_layers_;
// Producer-consumer lock used to wait between computing |self_matrix| and
// |other_matrix| for the other threads to finish the *previous* matvec.
std::unique_ptr<ProducerConsumer> split_pc_;
int num_threads_ = 0;
};
template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
float sparsity,
int block_height = 1,
int block_width = 1) {
typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
CacheAlignedVector<BiasType> bias(rows);
bias.FillRandom();
auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
block_height, block_width);
auto sparse_matrix = CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
std::move(bias));
}
template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
int rows, int cols, float sparsity, float constant = 1.f) {
typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
CacheAlignedVector<BiasType> bias(rows);
bias.FillOnes();
MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
/*block_height=*/1, /*block_width=*/1,
constant, /*random=*/false);
CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
std::move(bias));
}
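// Example (a sketch, assuming |sparsity| is the fraction of zero weights):
// CreateConstantLayer is convenient in tests where the expected output can be
// computed by hand, e.g. a fully dense all-ones matrix with an all-ones bias.
//
//   auto test_layer = csrblocksparse::CreateConstantLayer<float, float>(
//       /*rows=*/8, /*cols=*/16, /*sparsity=*/0.f, /*constant=*/1.f);
//   // With an all-ones rhs of size 16, every output is 16 * 1 + 1 = 17.
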
} // namespace csrblocksparse
#endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_