/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace csrblocksparse {

template <typename WeightType, typename RhsType,
          typename BiasType =
              typename TypeOfProduct<WeightType, RhsType>::type,
          typename DeltaType = int16_t>
class SparseLinearLayer {
 public:
  SparseLinearLayer() {}
  SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
                    CacheAlignedVector<BiasType>&& bias)
      : sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
    CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
    // Some kernels expect that the bias is divided by 4, so we store a second
    // copy of a quarter of the bias.
    // TODO(b/189958858): Remove the quartered bias if it can be done without
    // loss of speed, and rename the |full_bias_| member back to |bias_|.
    bias_ = full_bias_;
    for (int i = 0; i < bias_.size(); ++i) {
      bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
    }
  }
  SparseLinearLayer(
      const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
    *this = src;
  }
  SparseLinearLayer& operator=(
      const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
    sparse_matrix_ = src.sparse_matrix_;
    bias_ = src.bias_;
    full_bias_ = src.full_bias_;
    mid_output_ = src.mid_output_;
    thread_layers_ = src.thread_layers_;
    num_threads_ = src.num_threads_;
    if (src.split_pc_) {
      split_pc_ = absl::make_unique<ProducerConsumer>(
          src.split_pc_->num_producers(), src.split_pc_->num_consumers());
    }
    return *this;
  }

  // Does Ax + b where A is a block sparse compressed sparse row matrix and
  // x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
  // broadcast if rhs has more than one column.
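  //
  // An illustrative single-threaded call (a sketch only; it assumes float
  // weights and uses the CreateRandomLayer helper declared at the bottom of
  // this file):
  //
  //   auto layer = CreateRandomLayer<float, float>(/*rows=*/256, /*cols=*/256,
  //                                                /*sparsity=*/0.9f);
  //   layer.PrepareForThreads(1);
  //   CacheAlignedVector<float> rhs(layer.cols());
  //   rhs.FillRandom();
  //   CacheAlignedVector<float> out(layer.rows());
  //   layer.SpMM_bias(rhs, &out, /*relu=*/true);  // out = max(0, A*rhs + b).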
  template <typename RhsClassType, typename OutType>
  void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
                 int tid = 0, SpinBarrier* barrier = nullptr) const {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
  }

  // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
  // and then samples from the output (softmax distribution) layer.
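  //
  // E.g. drawing one sample from a logit layer (a sketch; assumes |layer| and
  // |rhs| are set up as in the SpMM_bias example above, with |logits| an
  // output vector of size rows()):
  //
  //   std::minstd_rand gen(/*seed=*/42);
  //   CacheAlignedVector<float> scratch(layer.rows());
  //   int sample = layer.SpMM_bias_Sample(rhs, &logits, /*temperature=*/1.f,
  //                                       /*tid=*/0, /*barrier=*/nullptr,
  //                                       &gen, &scratch);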
  template <typename RhsClassType, typename OutType>
  int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out,
                       float temperature, int tid, SpinBarrier* barrier,
                       std::minstd_rand* gen,
                       CacheAlignedVector<float>* scratch) const {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
                                           barrier, gen, scratch);
  }

  template <typename RhsClassType, typename OutType>
  void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
              int output_stride, OutType* output,
              SpinBarrier* barrier = nullptr) {
    static_assert(
        std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
    if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
        !IsCustomFloatType<WeightType>::value) {
      if (!IsSplit()) {
        sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
                              tid, replicas, output_stride, output->data());
        if (barrier != nullptr) barrier->barrier();
        return;
      }
      // NOTE: Until the quartered bias is removed, it is a bad idea to split
      // for ARM in the same way, as we would have to quarter the output of
      // the first part of the split before running the second part.
      // Signal completion of the previous MatVec.
      split_pc_->produce();
      PartLinearLayer& thread_part = thread_layers_[tid];
      auto offset_output =
          sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
      auto mid_output =
          sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
      auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
          mid_output_.cast_data(), tid);
      // We can continue consuming the data that this thread produced and
      // compute just the |self_matrix| part.
      // No |relu| or |replicas|, as this is only a partial matmul.
      // |tid| is always zero because the matrix has been split by tid.
      thread_part.self_matrix.MatVec(
          rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
          /*tid=*/0, /*replicas=*/1, output_stride, mid_output);
      // We have to wait for the other threads to finish working on the
      // previous MatMul before consuming the rest of |rhs|.
      split_pc_->consume();
      thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
                                      /*tid=*/0, replicas, output_stride,
                                      offset_output);
      return;
    }
    DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
    if (IsSplit()) {
      // The generic kernels aren't set up to use a split matrix; this path
      // will be inefficient.
      split_pc_->produce();
      split_pc_->consume();
    }
    if (block_height() == 8) {
      // We are currently forced to use the generic MatVec for this case.
      LOG(WARNING) << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
      sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
                            replicas, output_stride, output->data());
      if (barrier != nullptr) barrier->barrier();
    } else {
      sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
    }
  }

  int rows() const { return sparse_matrix_.rows(); }
  int cols() const { return sparse_matrix_.cols(); }
  float sparsity() const { return sparse_matrix_.sparsity(); }
  int block_width() const { return sparse_matrix_.block_width(); }
  int block_height() const { return sparse_matrix_.block_height(); }
  int num_threads() const { return sparse_matrix_.num_threads(); }
  const CacheAlignedVector<BiasType>& bias() const { return bias_; }
  const std::vector<int>& split_points() const {
    return sparse_matrix_.split_points();
  }
  bool IsSplit() const {
    return !thread_layers_.empty() && split_pc_ != nullptr;
  }
  std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
  void Print() const {
    printf("Matrix\n");
    sparse_matrix_.Print();
    printf("Bias\n");
    bias_.Print();
  }

  // Combines adjacent row blocks, doubling the block height.
  // This necessarily involves adding zero weights where the blocks don't
  // align across adjacent pairs of rows, so use with caution: the resulting
  // matrix is likely to run slower if it was very sparse to begin with.
  // In the few cases where the blocks do mostly align, the resulting matmul
  // could be much faster, as the number of reads of the rhs will be halved.
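  // E.g. with 4x4 blocks, if one block-row has blocks in block-columns {0, 2}
  // and the next in {2, 3}, the merged 8x4 block-row must cover {0, 2, 3},
  // padding the misaligned halves with zeros: 6 blocks' worth of storage
  // where there were 4, with no change in the true nonzeros.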
  void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }

  // |cache_line_size| is provided only for testing; normally the value for
  // the current architecture is used.
  int PrepareForThreads(int num_threads, int cache_line_size = -1) {
    num_threads_ = num_threads;
    if (num_threads_ > 1) {
      split_pc_ =
          absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
    } else {
      split_pc_.reset(nullptr);
    }
    return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
  }

  // Partitions the matrix into pieces by thread.
  // Each thread can go ahead and compute the part of its output that depends
  // only on rhs inputs that it generated itself in the previous matvec,
  // without any thread synchronization, and only after that does it have to
  // wait for the other threads to finish the previous matvec.
  // So we split the matrix, using the |split_points| from the previous
  // matrix, into 2 * |num_threads_| pieces: a self and an other part for each
  // thread, being the parts that can be computed before and after the other
  // threads have completed their share of the previous matvec.
  // We then use a ProducerConsumer lock instead of a SpinBarrier to
  // synchronize on the data produced by the other threads.
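  // E.g. with |num_threads_| = 2 and |split_points| = {0, p, P} (in blocks),
  // thread 0 produced rhs elements [0, p * block_height) in the previous
  // matvec, so its |self_matrix| covers those columns of its own output rows
  // and its |other_matrix| covers the remaining columns; thread 1 is the
  // mirror image.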
  void SliceForThreads(const std::vector<int>& split_points) {
    thread_layers_.clear();
    thread_layers_.reserve(num_threads_);
    LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
              << num_threads_ << " threads";
    for (int tid = 0; tid < num_threads_; ++tid) {
      thread_layers_.emplace_back(
          sparse_matrix_, full_bias_, bias_, tid,
          split_points[tid] * sparse_matrix_.block_height(),
          split_points[tid + 1] * sparse_matrix_.block_height());
    }
    mid_output_ = CacheAlignedVector<BiasType>(rows());
    mid_output_.FillZero();
  }

  // Splits the layer by inputs into 2 equal pieces. Each of the resulting
  // layers should be computed independently on the first and second halves of
  // the inputs respectively, and the results added to achieve the same effect
  // as the original layer.
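  // In matrix terms: with A = [A1 | A2] and x = [x1; x2] split to match,
  // y = Ax + b = (A1 x1 + b) + (A2 x2 + 0), which is why |part1| below keeps
  // the full bias while |part2| gets a zero bias: adding the two partial
  // outputs reproduces the original layer exactly.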
  void SplitInputs(
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
    CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
        sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
    CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
        sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
                                     sparse_matrix_.cols()));
    *part1 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix1), CacheAlignedVector<BiasType>(full_bias_));
    CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
    bias2.FillZero();
    *part2 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix2), std::move(bias2));
  }

  // Splits the layer by outputs into 2 equal pieces. Each of the resulting
  // layers should be computed independently on the full inputs and the results
  // concatenated to achieve the same effect as the original layer.
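  // In matrix terms: with A = [A1; A2] and b = [b1; b2] split at row
  // rows() / 2, y = Ax + b = [A1 x + b1; A2 x + b2], so concatenating the two
  // partial outputs reproduces the original layer exactly.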
  void SplitOutputs(
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
      SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
    LOG(INFO) << "input rows=" << sparse_matrix_.rows()
              << ", cols=" << sparse_matrix_.cols();
    CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
        sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
    CsrBlockSparseMatrix<WeightType, RhsType> matrix2(sparse_matrix_.SplitByRow(
        sparse_matrix_.rows() / 2, sparse_matrix_.rows()));
    CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
    *part1 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix1), std::move(bias1));
    CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
                                       full_bias_.size());
    *part2 = SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
        std::move(matrix2), std::move(bias2));
  }

 private:
  // Simple struct to hold a partitioned layer.
  struct PartLinearLayer {
    // The original matrix is first split by row to generate only the outputs
    // for the given tid. The |row_sub_matrix| is then split by column into two
    // partitions:
    // |self| is the part for which the rhs elements in [|start_col|,
    // |end_col|) were generated by this thread in some previous matmul.
    // |other| is the rest of the columns that require rhs elements from other
    // threads.
    // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
    PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
                    const CacheAlignedVector<BiasType>& bias,
                    const CacheAlignedVector<BiasType>& bias_4, int tid,
                    int start_col, int end_col) {
      int block_height = matrix.block_height();
      // Split the input matrix by row, selecting only the rows relevant to
      // thread |tid|.
      int start_row = matrix.split_points()[tid] * block_height;
      int end_row = matrix.split_points()[tid + 1] * block_height;
      LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
                << start_row << "," << end_row << ")";
      CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
          matrix.SplitByRow(start_row, end_row);
      // Partition into the columns that use rhs elements that thread |tid|
      // produced in a previous matmul, and the other rhs elements.
      // NOTE that we set |keep_rhs_size|=true so that each matrix can operate
      // on the same rhs input vector. The self matrix just guarantees not to
      // access any of the elements that are generated by another thread.
      self_matrix = row_sub_matrix.SplitByColumn(start_col, end_col,
                                                 /*keep_rhs_size=*/true);
      self_matrix.PrepareForThreads(1);
      // The reversed start and end slice out the complement of [start, end).
      other_matrix = row_sub_matrix.SplitByColumn(end_col, start_col,
                                                  /*keep_rhs_size=*/true);
      other_matrix.PrepareForThreads(1);
      full_bias = CacheAlignedVector<BiasType>(bias, start_row, end_row);
      // TODO(b/189958858): Eliminate the quartered bias from all the code.
      quarter_bias = CacheAlignedVector<BiasType>(bias_4, start_row, end_row);
    }
    // The part of the matrix that only depends on this thread for rhs inputs.
    CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
    CacheAlignedVector<BiasType> full_bias;
    CacheAlignedVector<BiasType> quarter_bias;
    // The part of the matrix that uses rhs inputs from other threads.
    CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
  };

  CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
  CacheAlignedVector<BiasType> bias_;
  CacheAlignedVector<BiasType> full_bias_;
  // Output from |self_matrix| that is given to |other_matrix| as bias.
  CacheAlignedVector<BiasType> mid_output_;
  // One partitioned pair of matrices for each thread.
  std::vector<PartLinearLayer> thread_layers_;
  // Producer-consumer lock used to wait, between computing |self_matrix| and
  // |other_matrix|, for the other threads to finish the *previous* matvec.
  std::unique_ptr<ProducerConsumer> split_pc_;
  int num_threads_ = 0;
};

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
                                                         float sparsity,
                                                         int block_height = 1,
                                                         int block_width = 1) {
  typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
  CacheAlignedVector<BiasType> bias(rows);
  bias.FillRandom();
  auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
                                                 block_height, block_width);
  auto sparse_matrix = CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
  return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
                                                std::move(bias));
}

template <typename WeightType, typename RhsType>
SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
    int rows, int cols, float sparsity, float constant = 1.f) {
  typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
  CacheAlignedVector<BiasType> bias(rows);
  bias.FillOnes();
  MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
                                          /*block_height=*/1,
                                          /*block_width=*/1, constant,
                                          /*random=*/false);
  CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
  return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
                                                std::move(bias));
}
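
// A constant layer gives an easily verified output, handy in tests: every
// kept weight equals |constant| and the bias is all ones, so for a rhs of
// ones, out[i] = constant * (number of nonzeros in row i) + 1. A sketch:
//
//   auto layer = CreateConstantLayer<float, float>(/*rows=*/16, /*cols=*/16,
//                                                  /*sparsity=*/0.5f);
//   layer.PrepareForThreads(1);
//   CacheAlignedVector<float> rhs(layer.cols());
//   rhs.FillOnes();
//   CacheAlignedVector<float> out(layer.rows());
//   layer.SpMM_bias(rhs, &out);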

}  // namespace csrblocksparse